From 0e24cb7db1c0fb3baef524175d2a1f90f8a78ec2 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Mon, 12 Jun 2023 18:09:01 +0200 Subject: [PATCH 01/80] some first movement on dask-task-graph style runner/executor --- src/coffea/processor/executor.py | 129 +++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) diff --git a/src/coffea/processor/executor.py b/src/coffea/processor/executor.py index 618b1c741..3bf6a735b 100644 --- a/src/coffea/processor/executor.py +++ b/src/coffea/processor/executor.py @@ -362,6 +362,107 @@ def copy(self, **kwargs): return type(self)(**tmp) +@dataclass +class DaskExecutorBase(ExecutorBase): + """This base class for dak-based processors + synthesizes all analysis inputs into one + task graph that's then executed by derived + classes. + """ + + def prepare_dataset_graph(self, items, function, accumulator): + accumulator = None + for dset, info in items.items(): + if isinstance(items, dict) and "object_path" not in list(items.values()): + raise ValueError( + "items should be normalized to uproot spec in prepare_dataset_graph" + ) + + metadata = info["metadata"].copy() + metadata["dataset"] = dset + + temp = function(info["files"], metadata=metadata) + if accumulator is None: + accumulator = temp + else: + accumulator = accumulate((accumulator, temp)) + + return accumulator + + +@dataclass +class DaskSyncExecutor(DaskExecutorBase): + """Execute dask task graph in one thread + + Parameters + ---------- + items : list + List of input arguments + function : callable + A function to be called on each input, which returns an accumulator instance + accumulator : Accumulatable + An accumulator to collect the output of the function + status : bool + If true (default), enable progress bar + unit : str + Label of progress bar unit + desc : str + Label of progress bar description + compression : int, optional + Ignored for iterative executor + """ + + def __call__( + self, + items: Iterable, + function: Callable, + accumulator: Accumulatable, + ): + import dask + + to_compute = self.prepare_dataset_graph(items, function, None) + computed = dask.compute(to_compute, scheduler="sync") + return computed[0] if len(computed) == 1 else computed + + +@dataclass +class DaskProcessesExecutor(DaskExecutorBase): + """Execute dask task graph in a multiprocessing pool + + Parameters + ---------- + items : list + List of input arguments + function : callable + A function to be called on each input, which returns an accumulator instance + accumulator : Accumulatable + An accumulator to collect the output of the function + status : bool + If true (default), enable progress bar + unit : str + Label of progress bar unit + desc : str + Label of progress bar description + compression : int, optional + Ignored for iterative executor + """ + + workers = 1 + + def __call__( + self, + items: Iterable, + function: Callable, + accumulator: Accumulatable, + ): + import dask + + to_compute = self.prepare_dataset_graph(items, function, None) + with dask.config.set(num_workers=self.workers): + computed = dask.compute(to_compute, scheduler="processes") + return computed[0] if len(computed) == 1 else computed + + def _watcher( FH: _FuturesHolder, executor: ExecutorBase, @@ -1764,6 +1865,8 @@ def __call__( processor_instance : ProcessorABC An instance of a class deriving from ProcessorABC """ + if isinstance(self.executor, DaskExecutorBase): + return self.run_dask(fileset, processor_instance, treename) wrapped_out = self.run(fileset, processor_instance, treename) if self.use_dataframes: @@ -1819,6 
+1922,32 @@ def preprocess( return self._chunk_generator(fileset, treename) + def run_dask( + self, + fileset: Union[Dict, str, List[WorkItem], Generator], + processor_instance: ProcessorABC, + treename: str = None, + ) -> Accumulatable: + """Run the processor_instance on a given fileset + + Parameters + ---------- + fileset : dict | str | List[WorkItem] | Generator + - A dictionary ``{dataset: [file, file], }`` + Optionally, if some files' tree name differ, the dictionary can be specified: + ``{dataset: {'treename': 'name', 'files': [file, file]}, }`` + - A single file name + - File chunks for self.preprocess() + - Chunk generator + treename : str, optional + name of tree inside each root file, can be ``None``; + treename can also be defined in fileset, which will override the passed treename + Not needed if processing premade chunks + processor_instance : ProcessorABC + An instance of a class deriving from ProcessorABC + """ + pass + def run( self, fileset: Union[Dict, str, List[WorkItem], Generator], From dcb6f742f2071331c1e2c3de00870de56f5e238b Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 27 Jun 2023 11:35:55 -0500 Subject: [PATCH 02/80] changes for preprocessing prototype --- src/coffea/nanoevents/factory.py | 11 ++++++++++- src/coffea/processor/accumulator.py | 6 +++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/coffea/nanoevents/factory.py b/src/coffea/nanoevents/factory.py index 38e06d601..c12485249 100644 --- a/src/coffea/nanoevents/factory.py +++ b/src/coffea/nanoevents/factory.py @@ -232,7 +232,7 @@ def from_root( treepath="/Events", entry_start=None, entry_stop=None, - chunks_per_file=1, + chunks_per_file=None, runtime_cache=None, persistent_cache=None, schemaclass=NanoAODSchema, @@ -327,6 +327,15 @@ def from_root( filter_branch=_remove_not_interpretable, steps_per_file=chunks_per_file, ) + elif chunks_per_file is None: + opener = partial( + uproot.dask, + file, + full_paths=True, + open_files=False, + ak_add_doc=True, + filter_branch=_remove_not_interpretable, + ) else: opener = partial( uproot.dask, diff --git a/src/coffea/processor/accumulator.py b/src/coffea/processor/accumulator.py index 8ad12dab1..282214a6c 100644 --- a/src/coffea/processor/accumulator.py +++ b/src/coffea/processor/accumulator.py @@ -57,14 +57,14 @@ def add(a: Accumulatable, b: Accumulatable) -> Accumulatable: out[key] = ( copy.deepcopy(a[key]) if not isinstance(a[key], DaskMethodsMixin) - else copy.copy(a[key]) + else a[key] ) for key in b: if key not in lhs: out[key] = ( copy.deepcopy(b[key]) if not isinstance(b[key], DaskMethodsMixin) - else copy.copy(b[key]) + else b[key] ) return out raise ValueError( @@ -93,7 +93,7 @@ def iadd(a: Accumulatable, b: Accumulatable) -> Accumulatable: a[key] = ( copy.deepcopy(b[key]) if not isinstance(b[key], DaskMethodsMixin) - else copy.copy(b[key]) + else b[key] ) return a raise ValueError( From a03beb9be03ce10eff9b42649c1ae581ac88a829 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 22 Aug 2023 17:21:43 -0500 Subject: [PATCH 03/80] new dask-based dataset pre-processor --- src/coffea/dataset_tools/__init__.py | 3 + src/coffea/dataset_tools/preprocess.py | 159 +++++++++++++++++++++++++ 2 files changed, 162 insertions(+) create mode 100644 src/coffea/dataset_tools/__init__.py create mode 100644 src/coffea/dataset_tools/preprocess.py diff --git a/src/coffea/dataset_tools/__init__.py b/src/coffea/dataset_tools/__init__.py new file mode 100644 index 000000000..c02a96e48 --- /dev/null +++ b/src/coffea/dataset_tools/__init__.py @@ -0,0 
+1,3 @@ +from coffea.dataset_tools.preprocess import preprocess + +__all__ = ["preprocess"] diff --git a/src/coffea/dataset_tools/preprocess.py b/src/coffea/dataset_tools/preprocess.py new file mode 100644 index 000000000..7fdc46857 --- /dev/null +++ b/src/coffea/dataset_tools/preprocess.py @@ -0,0 +1,159 @@ +import math + +import awkward +import dask +import dask_awkward +import numpy +import uproot + + +def _get_steps( + normed_files, + maybe_step_size=None, + align_clusters=False, + recalculate_seen_steps=False, +): + nf_backend = awkward.backend(normed_files) + lz_or_nf = awkward.typetracer.length_zero_if_typetracer(normed_files) + + array = [] if nf_backend != "typetracer" else lz_or_nf + for arg in lz_or_nf: + try: + the_file = uproot.open({arg.file: None}) + except FileNotFoundError: + array.append(None) + continue + + tree = the_file[arg.object_path] + num_entries = tree.num_entries + + target_step_size = num_entries if maybe_step_size is None else maybe_step_size + + file_uuid = str(the_file.file.uuid) + + out_uuid = arg.uuid + out_steps = arg.steps + + if out_uuid != file_uuid or recalculate_seen_steps: + if align_clusters: + clusters = tree.common_entry_offsets() + out = [0] + for c in clusters: + if c >= out[-1] + target_step_size: + out.append(c) + if clusters[-1] != out[-1]: + out.append(clusters[-1]) + out = numpy.array(out, dtype="int64") + out = numpy.stack((out[:-1], out[1:]), axis=1) + else: + n_steps = num_entries // target_step_size + out = numpy.array( + [ + [ + i * target_step_size, + min((i + 1) * target_step_size, num_entries), + ] + for i in range(n_steps) + ], + dtype="int64", + ) + + out_uuid = file_uuid + out_steps = out.tolist() + + array.append( + { + "file": arg.file, + "object_path": arg.object_path, + "steps": out_steps, + "uuid": out_uuid, + } + ) + + if len(array) == 0: + array = awkward.Array( + [ + {"file": "junk", "object_path": "junk", "steps": [[]], "uuid": "junk"}, + None, + ] + ) + array = awkward.Array(array.layout.form.length_zero_array(highlevel=False)) + else: + array = awkward.Array(array) + + if nf_backend == "typetracer": + array = awkward.Array( + array.layout.to_typetracer(forget_length=True), + ) + + return array + + +def preprocess( + fileset, + maybe_step_size=None, + align_clusters=False, + recalculate_seen_steps=False, + files_per_batch=1, +): + out_updated = fileset.copy() + out_available = fileset.copy() + all_ak_norm_files = {} + files_to_preprocess = {} + for name, info in fileset.items(): + norm_files = uproot._util.regularize_files(info["files"], steps_allowed=True) + for ifile in range(len(norm_files)): + the_file_info = norm_files[ifile] + maybe_finfo = info["files"].get(the_file_info[0], None) + maybe_uuid = ( + None + if not isinstance(maybe_finfo, dict) + else maybe_finfo.get("uuid", None) + ) + norm_files[ifile] += (3 - len(norm_files[ifile])) * (None,) + (maybe_uuid,) + fields = ["file", "object_path", "steps", "uuid"] + ak_norm_files = awkward.from_iter(norm_files) + ak_norm_files = awkward.Array( + {field: ak_norm_files[str(ifield)] for ifield, field in enumerate(fields)} + ) + all_ak_norm_files[name] = ak_norm_files + + dak_norm_files = dask_awkward.from_awkward( + ak_norm_files, math.ceil(len(ak_norm_files) / files_per_batch) + ) + + files_to_preprocess[name] = dask_awkward.map_partitions( + _get_steps, + dak_norm_files, + maybe_step_size=maybe_step_size, + align_clusters=align_clusters, + recalculate_seen_steps=recalculate_seen_steps, + ) + + all_processed_files = dask.compute(files_to_preprocess)[0] + + for name, 
processed_files in all_processed_files.items(): + files_available = { + item["file"]: { + "object_path": item["object_path"], + "steps": item["steps"], + "uuid": item["uuid"], + } + for item in awkward.drop_none(processed_files).to_list() + } + + files_out = {} + for proc_item, orig_item in zip( + processed_files.to_list(), all_ak_norm_files[name].to_list() + ): + item = orig_item if proc_item is None else proc_item + files_out[item["file"]] = { + "object_path": item["object_path"], + "steps": item["steps"], + "uuid": item["uuid"], + } + + out_updated[name]["files"] = files_out + out_available[name]["files"] = files_available + + return out_available, out_updated From 95c5fc80a79195e354f80a072f3ebebd631aa677 Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Wed, 23 Aug 2023 11:41:29 +0200 Subject: [PATCH 04/80] Added the rucio utils functions from pocketcoffea --- pyproject.toml | 1 + src/coffea/dataset_tools/rucio_utils.py | 266 ++++++++++++++++++++++++ 2 files changed, 267 insertions(+) create mode 100644 src/coffea/dataset_tools/rucio_utils.py diff --git a/pyproject.toml b/pyproject.toml index 454ed3319..1ddcd9fe6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,6 +58,7 @@ dependencies = [ "pandas", "hist>=2", "cachetools", + "rucio>=32.2.0" ] dynamic = ["version"] diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py new file mode 100644 index 000000000..545884962 --- /dev/null +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -0,0 +1,266 @@ +import os +import getpass +import re +import json +from rucio.client import Client +from collections import defaultdict +import subprocess + +# Rucio needs the default configuration --> taken from CMS cvmfs defaults +os.environ["RUCIO_HOME"] = "/cvmfs/cms.cern.ch/rucio/current" + + +def get_proxy_path() -> str: + """ + Checks if the VOMS proxy exists and if it is valid + for at least 1 hour. + If it exists, returns the path of it""" + try: + subprocess.run("voms-proxy-info -exists -hours 1", shell=True, check=True) + except subprocess.CalledProcessError: + raise Exception( + "VOMS proxy expirend or non-existing: please run `voms-proxy-init -voms cms -rfc --valid 168:0`" + ) + + # Now get the path of the certificate + proxy = subprocess.check_output( + "voms-proxy-info -path", shell=True, text=True + ).strip() + return proxy + + +def get_rucio_client(proxy=None) -> Client: + """ + Open a client to the CMS rucio server using x509 proxy. + + Parameters + ---------- + proxy : str, optional + Use the provided proxy file if given, if not use `voms-proxy-info` to get the current active one. + + Returns + ------- + nativeClient: rucio.Client + Rucio client + """ + try: + if not proxy: + proxy = get_proxy_path() + + nativeClient = Client( + rucio_host="https://cms-rucio.cern.ch", + auth_host="https://cms-rucio-auth.cern.ch", + account=getpass.getuser(), + creds={"client_cert": proxy, "client_key": proxy}, + auth_type="x509", + ) + return nativeClient + + except Exception as e: + print("Wrong Rucio configuration, impossible to create client") + raise e + + +def get_xrootd_sites_map(): + """ + The mapping beetween RSE (sites) and the xrootd prefix rules is read + from `/cvmfs/cms/cern.ch/SITECONF/*site*/storage.json`. + + This function returns the list of xrootd prefix rules for each site. + """ + sites_xrootd_access = defaultdict(dict) + # TODO Do not rely on local sites_map cache. Just reload it? 
+ if not os.path.exists(".sites_map.json"): + print("Loading SITECONF info") + sites = [ + (s, "/cvmfs/cms.cern.ch/SITECONF/" + s + "/storage.json") + for s in os.listdir("/cvmfs/cms.cern.ch/SITECONF/") + if s.startswith("T") + ] + for site_name, conf in sites: + if not os.path.exists(conf): + continue + try: + data = json.load(open(conf)) + except: + continue + for site in data: + if site["type"] != "DISK": + continue + if site["rse"] == None: + continue + for proc in site["protocols"]: + if proc["protocol"] == "XRootD": + if proc["access"] not in ["global-ro", "global-rw"]: + continue + if "prefix" not in proc: + if "rules" in proc: + for rule in proc["rules"]: + sites_xrootd_access[site["rse"]][ + rule["lfn"] + ] = rule["pfn"] + else: + sites_xrootd_access[site["rse"]] = proc["prefix"] + json.dump(sites_xrootd_access, open(".sites_map.json", "w")) + + return json.load(open(".sites_map.json")) + + +def _get_pfn_for_site(path, rules): + """ + Utility function that converts the file path to a valid pfn matching + the file path with the site rules (regexes). + """ + if isinstance(rules, dict): + for rule, pfn in rules.items(): + if m := re.match(rule, path): + grs = m.groups() + for i in range(len(grs)): + pfn = pfn.replace(f"${i+1}", grs[i]) + return pfn + else: + return rules + "/" + path + + +def get_dataset_files_replicas( + dataset, whitelist_sites=None, blacklist_sites=None, regex_sites=None, mode="full" +): + """ + This function queries the Rucio server to get information about the location + of all the replicas of the files in a CMS dataset. + + The sites can be filtered in 3 different ways: + - `whilist_sites`: list of sites to select from. If the file is not found there, raise an Expection. + - `blacklist_sites`: list of sites to avoid. If the file has no left site, raise an Exception + - `regex_sites`: regex expression to restrict the list of sites. + + The fileset returned by the function is controlled by the `mode` parameter: + - "full": returns the full set of replicas and sites (passing the filtering parameters) + - "first": returns the first replica found for each file + - "best": to be implemented (ServiceX..) + - "roundrobin": try to distribute the replicas over different sites + + Parameters + ---------- + + dataset: str + whilelist_sites: list + blacklist_sites: list + regex_sites: list + mode: str, default "full" + + Returns + ------- + files: list + depending on the `mode` option. + - If `mode=="full"`, returns the complete list of replicas for each file in the dataset + - If `mode=="first"`, returns only the first replica for each file. + + sites: list + depending on the `mode` option. + - If `mode=="full"`, returns the list of sites where the file replica is available for each file in the dataset + - If `mode=="first"`, returns a list of sites for the first replica of each file. 
+ + """ + sites_xrootd_prefix = get_xrootd_sites_map() + client = get_rucio_client() + outsites = [] + outfiles = [] + for filedata in client.list_replicas([{"scope": "cms", "name": dataset}]): + outfile = [] + outsite = [] + rses = filedata["rses"] + found = False + if whitelist_sites: + for site in whitelist_sites: + if site in rses: + # Check actual availability + meta = filedata["pfns"][rses[site][0]] + if ( + meta["type"] != "DISK" + or meta["volatile"] == True + or filedata["states"][site] != "AVAILABLE" + or site not in sites_xrootd_prefix + ): + continue + outfile.append( + _get_pfn_for_site(filedata["name"], sites_xrootd_prefix[site]) + ) + outsite.append(site) + found = True + + if not found: + raise Exception( + f"No SITE available in the whitelist for file {filedata['name']}" + ) + else: + possible_sites = list(rses.keys()) + if blacklist_sites: + possible_sites = list( + filter(lambda key: key not in blacklist_sites, possible_sites) + ) + + if len(possible_sites) == 0: + raise Exception(f"No SITE available for file {filedata['name']}") + + # now check for regex + for site in possible_sites: + if regex_sites: + if re.search(regex_sites, site): + # Check actual availability + meta = filedata["pfns"][rses[site][0]] + if ( + meta["type"] != "DISK" + or meta["volatile"] == True + or filedata["states"][site] != "AVAILABLE" + or site not in sites_xrootd_prefix + ): + continue + outfile.append( + _get_pfn_for_site( + filedata["name"], sites_xrootd_prefix[site] + ) + ) + outsite.append(site) + found = True + else: + # Just take the first one + # Check actual availability + meta = filedata["pfns"][rses[site][0]] + if ( + meta["type"] != "DISK" + or meta["volatile"] == True + or filedata["states"][site] != "AVAILABLE" + or site not in sites_xrootd_prefix + ): + continue + outfile.append( + _get_pfn_for_site(filedata["name"], sites_xrootd_prefix[site]) + ) + outsite.append(site) + found = True + + if not found: + raise Exception(f"No SITE available for file {filedata['name']}") + else: + if mode == "full": + outfiles.append(outfile) + outsites.append(outsite) + elif mode == "first": + outfiles.append(outfile[0]) + outsites.append(outsite[0]) + else: + raise NotImplemented(f"Mode {mode} not yet implemented!") + + # Computing replicas by site: + totfiles = len(outfiles) + sites_counts = defaultdict(float) + if mode == "full": + for sites_by_file in outsites: + for site in sites_by_file: + sites_counts[site] += 1 / totfiles + elif mode == "first": + for site_by_file in outsites: + sites_counts[site] += 1 / totfiles + + return outfiles, outsites, sites_counts From 1f870c2dfdb013d0c9cd6c4118a23bb51ac71dfd Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Wed, 23 Aug 2023 18:14:43 +0200 Subject: [PATCH 05/80] Added dataset querying function --- src/coffea/dataset_tools/rucio_utils.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index 545884962..36954ea6e 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -123,7 +123,12 @@ def _get_pfn_for_site(path, rules): def get_dataset_files_replicas( - dataset, whitelist_sites=None, blacklist_sites=None, regex_sites=None, mode="full" + dataset, + whitelist_sites=None, + blacklist_sites=None, + regex_sites=None, + mode="full", + client=None, ): """ This function queries the Rucio server to get information about the location @@ -148,6 +153,7 @@ def get_dataset_files_replicas( 
blacklist_sites: list regex_sites: list mode: str, default "full" + client: rucio Client, optional Returns ------- @@ -163,7 +169,7 @@ def get_dataset_files_replicas( """ sites_xrootd_prefix = get_xrootd_sites_map() - client = get_rucio_client() + client = client if client else get_rucio_client() outsites = [] outfiles = [] for filedata in client.list_replicas([{"scope": "cms", "name": dataset}]): @@ -264,3 +270,9 @@ def get_dataset_files_replicas( sites_counts[site] += 1 / totfiles return outfiles, outsites, sites_counts + + +def query_dataset(query, client=None): + client = client if client else get_rucio_client() + return list(client.list_dids(scope="cms", filters={"name": query, "type":"container"},long=False)) + From c5d2d579287537f3017c3bd3dd45a5b35fc5e2cd Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Mon, 28 Aug 2023 10:09:36 +0200 Subject: [PATCH 06/80] Working on interface for datasets query --- src/coffea/dataset_tools/dataset_query.py | 109 ++++++++++++++++++++++ src/coffea/dataset_tools/rucio_utils.py | 17 +++- 2 files changed, 123 insertions(+), 3 deletions(-) create mode 100644 src/coffea/dataset_tools/dataset_query.py diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py new file mode 100644 index 000000000..b2e782095 --- /dev/null +++ b/src/coffea/dataset_tools/dataset_query.py @@ -0,0 +1,109 @@ +from cmd2 import Cmd +import cmd2 +from rich import print +from rich.pretty import pprint +from rich.console import Console +from rich.table import Table +from rich.tree import Tree +import rucio_utils + + +def print_dataset_query(query, dataset_list, selected, console): + table = Table(title=f"Query: [bold red]{query}") + table.add_column("name", justify="left", style="cyan", no_wrap=True) + table.add_column("tag", style="magenta", no_wrap=True) + table.add_column("selected", justify="center") + table.row_styles = ["dim", "none"] + j = 1 + for name, conds in dataset_list.items(): + ic = 0 + ncond = len(conds) + for c, tiers in conds.items(): + dataset = f"/{name}/{c}/{tiers[0]}" + sel = dataset in selected + if ic ==0: + table.add_row(name, f"[bold]({j})[/bold] {c}/{'-'.join(tiers)}", + f"[green]Y" if sel else f"[red]N", + end_section = ic==ncond-1) + else: + table.add_row("", f"[bold]({j})[/bold] {c}/{'-'.join(tiers)}", + f"[green]Y" if sel else f"[red]N", + end_section = ic==ncond-1) + ic+=1 + j+=1 + + console.print(table) + + +class MyCmdApp(cmd2.Cmd): + + prompt = "\033[1;34m" + "cms-datasets" + "\033[0m > " + + def __init__(self): + shortcuts = cmd2.DEFAULT_SHORTCUTS + shortcuts.update({ 'L': 'login', 'Q': 'query', 'R': 'replicas', + 'S': 'select', + 'lr': 'list_results'}) + self.console = Console() + self.rucio_client = None + self.selected_datasets = [ ] + self.last_query = "" + self.last_query_results = None + super().__init__(shortcuts=shortcuts) + + def do_login(self, args): + '''Login to the rucio client. Optionally a specific proxy file can be passed to the command. 
+ If the proxy file is not specified, `voms-proxy-info` is used''' + if args: + self.rucio_client = rucio_utils.get_rucio_client(args[0]) + else: + self.rucio_client = rucio_utils.get_rucio_client() + + print(self.rucio_client) + #pprint(self.rucio_client.whoami()) + + def do_whoami(self, args): + # Your code here + if not self.rucio_client: + print("First [bold]login (L)[/] to the rucio server") + return + print(self.rucio_client.whoami()) + + def do_query(self, args): + # Your code here + with self.console.status(f"Querying rucio for: [bold red]{args}[/]"): + out = rucio_utils.query_dataset(args.arg_list[0], + client=self.rucio_client, + tree=True) + # Now let's print the results as a tree + print_dataset_query(args, out, + self.selected_datasets, + self.console) + self.last_query = args + self.last_query_results = out + + def do_list_results(self, args): + if self.last_query_results: + print_dataset_query(self.last_query, self.last_query_results, + self.selected_datasets, self.console) + else: + print("First [bold red]query (Q)[/] for a dataset") + + def do_select(self, args): + if not self.last_query_results: + print("First [bold red]query (Q)[/] for a dataset") + return + + for s in map(int, args.arg_list): + print(s) + + + + + def do_replicas(self, args): + # Your code here + self.poutput("Replicas command executed") + +if __name__ == "__main__": + app = MyCmdApp() + app.cmdloop() diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index 36954ea6e..70f301a5b 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -272,7 +272,18 @@ def get_dataset_files_replicas( return outfiles, outsites, sites_counts -def query_dataset(query, client=None): +def query_dataset(query, client=None, tree=False): client = client if client else get_rucio_client() - return list(client.list_dids(scope="cms", filters={"name": query, "type":"container"},long=False)) - + out = list(client.list_dids( + scope="cms", filters={"name": query, "type":"container"}, + long=False)) + if tree: + outdict = {} + for dataset in out: + split = dataset[1:].split("/") + if split[0] not in outdict: + outdict[split[0]] = defaultdict(list) + outdict[split[0]][split[1]].append(split[2]) + return outdict + else: + return out From d4d371c094a0e5f980f6366d6c237405fd9936a5 Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Mon, 28 Aug 2023 10:34:17 +0200 Subject: [PATCH 07/80] Querying and listing implemented: selected of results --- pyproject.toml | 3 +- src/coffea/dataset_tools/dataset_query.py | 45 +++++++++++++++-------- src/coffea/dataset_tools/rucio_utils.py | 2 +- 3 files changed, 33 insertions(+), 17 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1ddcd9fe6..82fa7e7e1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,7 +58,8 @@ dependencies = [ "pandas", "hist>=2", "cachetools", - "rucio>=32.2.0" + "rucio>=32.2.0", + "cmd2" ] dynamic = ["version"] diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index b2e782095..299f03c72 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -23,11 +23,11 @@ def print_dataset_query(query, dataset_list, selected, console): sel = dataset in selected if ic ==0: table.add_row(name, f"[bold]({j})[/bold] {c}/{'-'.join(tiers)}", - f"[green]Y" if sel else f"[red]N", + f"[green bold]Y" if sel else f"[red]N", end_section = ic==ncond-1) else: table.add_row("", f"[bold]({j})[/bold] 
{c}/{'-'.join(tiers)}", - f"[green]Y" if sel else f"[red]N", + f"[green bold]Y" if sel else f"[red]N", end_section = ic==ncond-1) ic+=1 j+=1 @@ -42,13 +42,14 @@ class MyCmdApp(cmd2.Cmd): def __init__(self): shortcuts = cmd2.DEFAULT_SHORTCUTS shortcuts.update({ 'L': 'login', 'Q': 'query', 'R': 'replicas', - 'S': 'select', + 'S': 'select', "LS": 'list_selected', 'lr': 'list_results'}) self.console = Console() self.rucio_client = None self.selected_datasets = [ ] self.last_query = "" - self.last_query_results = None + self.last_query_tree = None + self.last_query_list = None super().__init__(shortcuts=shortcuts) def do_login(self, args): @@ -72,37 +73,51 @@ def do_whoami(self, args): def do_query(self, args): # Your code here with self.console.status(f"Querying rucio for: [bold red]{args}[/]"): - out = rucio_utils.query_dataset(args.arg_list[0], + outlist, outtree = rucio_utils.query_dataset(args.arg_list[0], client=self.rucio_client, tree=True) # Now let's print the results as a tree - print_dataset_query(args, out, + print_dataset_query(args, outtree, self.selected_datasets, self.console) self.last_query = args - self.last_query_results = out + self.last_query_list = outlist + self.last_query_tree = outtree + print("Use the command [bold red]select (S)[/] to selected the datasets") def do_list_results(self, args): - if self.last_query_results: - print_dataset_query(self.last_query, self.last_query_results, + if self.last_query_list: + print_dataset_query(self.last_query, self.last_query_tree, self.selected_datasets, self.console) else: print("First [bold red]query (Q)[/] for a dataset") def do_select(self, args): - if not self.last_query_results: + if not self.last_query_list: print("First [bold red]query (Q)[/] for a dataset") return + Nresults = len(self.last_query_list) + print("[cyan]Selected datasets:") for s in map(int, args.arg_list): - print(s) - - + if s <= Nresults: + self.selected_datasets.append(self.last_query_list[s-1]) + print(f"- ({s}) {self.last_query_list[s-1]}") + else: + print(f"[red]The requested dataset is not in the list. 
Please insert a position <={Nresults}") + def do_list_selected(self, args): + print("[cyan]Selected datasets:") + for i, ds in enumerate(self.selected_datasets): + print(f"- [{i}] [blue]{ds}") def do_replicas(self, args): - # Your code here - self.poutput("Replicas command executed") + if len(args.arg_list)==0: + print("[red] Please provide the index of the [bold]selected[/bold] dataset to analyze") + return + + + if __name__ == "__main__": app = MyCmdApp() diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index 70f301a5b..884b25039 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -284,6 +284,6 @@ def query_dataset(query, client=None, tree=False): if split[0] not in outdict: outdict[split[0]] = defaultdict(list) outdict[split[0]][split[1]].append(split[2]) - return outdict + return out, outdict else: return out From e1f11cfff29e983893aae1b1d1c7f6365fb5f499 Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Mon, 28 Aug 2023 11:07:41 +0200 Subject: [PATCH 08/80] Printing sites availability for replicas --- src/coffea/dataset_tools/dataset_query.py | 42 ++++++++++++++++++++--- src/coffea/dataset_tools/rucio_utils.py | 9 +++-- 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 299f03c72..60fab9973 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -6,13 +6,14 @@ from rich.table import Table from rich.tree import Tree import rucio_utils +from collections import defaultdict def print_dataset_query(query, dataset_list, selected, console): table = Table(title=f"Query: [bold red]{query}") - table.add_column("name", justify="left", style="cyan", no_wrap=True) - table.add_column("tag", style="magenta", no_wrap=True) - table.add_column("selected", justify="center") + table.add_column("Name", justify="left", style="cyan", no_wrap=True) + table.add_column("Tag", style="magenta", no_wrap=True) + table.add_column("Selected", justify="center") table.row_styles = ["dim", "none"] j = 1 for name, conds in dataset_list.items(): @@ -50,6 +51,10 @@ def __init__(self): self.last_query = "" self.last_query_tree = None self.last_query_list = None + self.sites_whitelist = None + self.sites_blacklist = None + self.sites_regex = None + self.replicas_results = defaultdict(list) super().__init__(shortcuts=shortcuts) def do_login(self, args): @@ -113,10 +118,37 @@ def do_list_selected(self, args): def do_replicas(self, args): if len(args.arg_list)==0: - print("[red] Please provide the index of the [bold]selected[/bold] dataset to analyze") - return + print("[red] Please provide the index of the [bold]selected[/bold] dataset to analyze or the [bold]full dataset name[/bold]") + + if args.isdigit(): + if int(args) <= len(self.selected_datasets): + dataset = self.selected_datasets[int(args)-1] + else: + print(f"[red]The requested dataset is not in the list. 
Please insert a position <={len(self.selected_datasets)}") + else: + dataset = args + with self.console.status(f"Querying rucio for replicas: [bold red]{dataset}[/]"): + outfiles, outsites, sites_counts = rucio_utils.get_dataset_files_replicas(dataset, + whitelist_sites=self.sites_whitelist, + blacklist_sites=self.sites_blacklist, + regex_sites=self.sites_regex, + mode="full", + client=self.rucio_client) + table = Table(title=f"[cyan]Sites availability for dataset: [red]{dataset}") + table.add_column("Site", justify="left", style="cyan", no_wrap=True) + table.add_column("Files", style="magenta", no_wrap=True) + table.add_column("Availability", justify="center") + table.row_styles = ["dim", "none"] + Nfiles = len(outfiles) + + sorted_sites = dict(sorted(sites_counts.items(), key=lambda x:x[1], reverse=True)) + for site, stat in sorted_sites.items(): + table.add_row(site, f"{stat} / {Nfiles}", f"{stat*100/Nfiles:.1f}%") + + self.console.print(table) + if __name__ == "__main__": diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index 884b25039..20d3f7955 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -167,6 +167,9 @@ def get_dataset_files_replicas( - If `mode=="full"`, returns the list of sites where the file replica is available for each file in the dataset - If `mode=="first"`, returns a list of sites for the first replica of each file. + sites_counts: dict + Metadata couting the coverage of the dataset by site + """ sites_xrootd_prefix = get_xrootd_sites_map() client = client if client else get_rucio_client() @@ -260,14 +263,14 @@ def get_dataset_files_replicas( # Computing replicas by site: totfiles = len(outfiles) - sites_counts = defaultdict(float) + sites_counts = defaultdict(int) if mode == "full": for sites_by_file in outsites: for site in sites_by_file: - sites_counts[site] += 1 / totfiles + sites_counts[site] += 1 elif mode == "first": for site_by_file in outsites: - sites_counts[site] += 1 / totfiles + sites_counts[site] += 1 return outfiles, outsites, sites_counts From 1e191bf002625798970ddfe1bc48e53d2488c7b9 Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Mon, 28 Aug 2023 14:23:00 +0200 Subject: [PATCH 09/80] Added replica site selection --- src/coffea/dataset_tools/dataset_query.py | 253 +++++++++++++++++----- src/coffea/dataset_tools/rucio_utils.py | 8 +- 2 files changed, 204 insertions(+), 57 deletions(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 60fab9973..5359c2c1c 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -5,8 +5,10 @@ from rich.console import Console from rich.table import Table from rich.tree import Tree +from rich.prompt import Prompt import rucio_utils from collections import defaultdict +import random def print_dataset_query(query, dataset_list, selected, console): @@ -22,51 +24,65 @@ def print_dataset_query(query, dataset_list, selected, console): for c, tiers in conds.items(): dataset = f"/{name}/{c}/{tiers[0]}" sel = dataset in selected - if ic ==0: - table.add_row(name, f"[bold]({j})[/bold] {c}/{'-'.join(tiers)}", - f"[green bold]Y" if sel else f"[red]N", - end_section = ic==ncond-1) + if ic == 0: + table.add_row( + name, + f"[bold]({j})[/bold] {c}/{'-'.join(tiers)}", + f"[green bold]Y" if sel else f"[red]N", + end_section=ic == ncond - 1, + ) else: - table.add_row("", f"[bold]({j})[/bold] {c}/{'-'.join(tiers)}", - f"[green bold]Y" if 
sel else f"[red]N", - end_section = ic==ncond-1) - ic+=1 - j+=1 + table.add_row( + "", + f"[bold]({j})[/bold] {c}/{'-'.join(tiers)}", + f"[green bold]Y" if sel else f"[red]N", + end_section=ic == ncond - 1, + ) + ic += 1 + j += 1 console.print(table) class MyCmdApp(cmd2.Cmd): - prompt = "\033[1;34m" + "cms-datasets" + "\033[0m > " - + def __init__(self): shortcuts = cmd2.DEFAULT_SHORTCUTS - shortcuts.update({ 'L': 'login', 'Q': 'query', 'R': 'replicas', - 'S': 'select', "LS": 'list_selected', - 'lr': 'list_results'}) + shortcuts.update( + { + "L": "login", + "Q": "query", + "R": "replicas", + "S": "select", + "LS": "list_selected", + "lr": "list_results", + } + ) self.console = Console() self.rucio_client = None - self.selected_datasets = [ ] + self.selected_datasets = [] self.last_query = "" self.last_query_tree = None self.last_query_list = None self.sites_whitelist = None self.sites_blacklist = None self.sites_regex = None - self.replicas_results = defaultdict(list) + self.last_replicas_results = None + + self.replica_results = defaultdict(list) super().__init__(shortcuts=shortcuts) def do_login(self, args): - '''Login to the rucio client. Optionally a specific proxy file can be passed to the command. - If the proxy file is not specified, `voms-proxy-info` is used''' + """Login to the rucio client. Optionally a specific proxy file can be passed to the command. + If the proxy file is not specified, `voms-proxy-info` is used""" if args: self.rucio_client = rucio_utils.get_rucio_client(args[0]) else: self.rucio_client = rucio_utils.get_rucio_client() - + print(self.rucio_client) - #pprint(self.rucio_client.whoami()) + # pprint(self.rucio_client.whoami()) def do_whoami(self, args): # Your code here @@ -74,17 +90,15 @@ def do_whoami(self, args): print("First [bold]login (L)[/] to the rucio server") return print(self.rucio_client.whoami()) - + def do_query(self, args): # Your code here - with self.console.status(f"Querying rucio for: [bold red]{args}[/]"): - outlist, outtree = rucio_utils.query_dataset(args.arg_list[0], - client=self.rucio_client, - tree=True) + with self.console.status(f"Querying rucio for: [bold red]{args}[/]"): + outlist, outtree = rucio_utils.query_dataset( + args.arg_list[0], client=self.rucio_client, tree=True + ) # Now let's print the results as a tree - print_dataset_query(args, outtree, - self.selected_datasets, - self.console) + print_dataset_query(args, outtree, self.selected_datasets, self.console) self.last_query = args self.last_query_list = outlist self.last_query_tree = outtree @@ -92,8 +106,12 @@ def do_query(self, args): def do_list_results(self, args): if self.last_query_list: - print_dataset_query(self.last_query, self.last_query_tree, - self.selected_datasets, self.console) + print_dataset_query( + self.last_query, + self.last_query_tree, + self.selected_datasets, + self.console, + ) else: print("First [bold red]query (Q)[/] for a dataset") @@ -106,50 +124,177 @@ def do_select(self, args): print("[cyan]Selected datasets:") for s in map(int, args.arg_list): if s <= Nresults: - self.selected_datasets.append(self.last_query_list[s-1]) + self.selected_datasets.append(self.last_query_list[s - 1]) print(f"- ({s}) {self.last_query_list[s-1]}") else: - print(f"[red]The requested dataset is not in the list. Please insert a position <={Nresults}") - + print( + f"[red]The requested dataset is not in the list. 
Please insert a position <={Nresults}" + ) + def do_list_selected(self, args): print("[cyan]Selected datasets:") for i, ds in enumerate(self.selected_datasets): print(f"- [{i}] [blue]{ds}") - + def do_replicas(self, args): - if len(args.arg_list)==0: - print("[red] Please provide the index of the [bold]selected[/bold] dataset to analyze or the [bold]full dataset name[/bold]") + if len(args.arg_list) == 0: + print( + "[red] Please provide the index of the [bold]selected[/bold] dataset to analyze or the [bold]full dataset name[/bold]" + ) + return if args.isdigit(): if int(args) <= len(self.selected_datasets): - dataset = self.selected_datasets[int(args)-1] + dataset = self.selected_datasets[int(args) - 1] else: - print(f"[red]The requested dataset is not in the list. Please insert a position <={len(self.selected_datasets)}") + print( + f"[red]The requested dataset is not in the list. Please insert a position <={len(self.selected_datasets)}" + ) else: - dataset = args - - with self.console.status(f"Querying rucio for replicas: [bold red]{dataset}[/]"): - outfiles, outsites, sites_counts = rucio_utils.get_dataset_files_replicas(dataset, - whitelist_sites=self.sites_whitelist, - blacklist_sites=self.sites_blacklist, - regex_sites=self.sites_regex, - mode="full", - client=self.rucio_client) - - table = Table(title=f"[cyan]Sites availability for dataset: [red]{dataset}") + dataset = args.arg_list[0] + + with self.console.status( + f"Querying rucio for replicas: [bold red]{dataset}[/]" + ): + outfiles, outsites, sites_counts = rucio_utils.get_dataset_files_replicas( + dataset, + whitelist_sites=self.sites_whitelist, + blacklist_sites=self.sites_blacklist, + regex_sites=self.sites_regex, + mode="full", + client=self.rucio_client, + ) + self.last_replicas_results = (outfiles, outsites, sites_counts) + print(f"[cyan]Sites availability for dataset: [red]{dataset}") + table = Table(title="Available replicas") + table.add_column("Index", justify="center") table.add_column("Site", justify="left", style="cyan", no_wrap=True) table.add_column("Files", style="magenta", no_wrap=True) table.add_column("Availability", justify="center") table.row_styles = ["dim", "none"] Nfiles = len(outfiles) - sorted_sites = dict(sorted(sites_counts.items(), key=lambda x:x[1], reverse=True)) - for site, stat in sorted_sites.items(): - table.add_row(site, f"{stat} / {Nfiles}", f"{stat*100/Nfiles:.1f}%") + sorted_sites = dict( + sorted(sites_counts.items(), key=lambda x: x[1], reverse=True) + ) + for i, (site, stat) in enumerate(sorted_sites.items()): + table.add_row(str(i), site, f"{stat} / {Nfiles}", f"{stat*100/Nfiles:.1f}%") self.console.print(table) - - + strategy = Prompt.ask( + "Select sites", + choices=["round-robin", "choice", "quit"], + default="round-robin", + ) + + files_by_site = defaultdict(list) + + if strategy == "choice": + ind = list( + map(int, Prompt.ask("Enter list of sites index to be used").split(" ")) + ) + sites_to_use = [list(sorted_sites.keys())[i] for i in ind] + print(f"Filtering replicas with [green]: {' '.join(sites_to_use)}") + + output = [] + for ifile, (files, sites) in enumerate(zip(outfiles, outsites)): + random.shuffle(sites_to_use) + found = False + # loop on shuffled selected sites until one is found + for site in sites_to_use: + try: + iS = sites.index(site) + output.append(files[iS]) + files_by_site[sites[iS]].append(files[iS]) + found = True + break # keep only one replica + except ValueError: + # if the index is not found just go to the next site + pass + + if not found: + print( + 
f"[bold red]No replica found compatible with sites selection for file #{ifile}. The available sites are:" + ) + for f, s in zip(files, sites): + print(f"\t- [green]{s} [cyan]{f}") + return + + self.replica_results[dataset] = output + + elif strategy == "round-robin": + output = [] + for ifile, (files, sites) in enumerate(zip(outfiles, outsites)): + # selecting randomly from the sites + iS = random.randint(0, len(sites) - 1) + output.append(files[iS]) + files_by_site[sites[iS]].append(files[iS]) + self.replica_results[dataset] = output + + elif strategy == "quit": + print("[orange]Doing nothing...") + return + + # Now let's print the results + tree = Tree(label=f"Replicas for [green]{dataset}") + for site, files in files_by_site.items(): + T = tree.add(f"[green]{site}") + for f in files: + T.add(f"[cyan]{f}") + + print("Final replicas selection") + self.console.print(tree) + + def do_whitelist_sites(self, args): + if self.sites_whitelist == None: + self.sites_whitelist = args.arg_list + else: + self.sites_whitelist += args.arg_list + print("[green]Whitelisted sites:") + for s in self.sites_whitelist: + print(f"- {s}") + + def do_blacklist_sites(self, args): + if self.sites_blacklist == None: + self.sites_blacklist = args.arg_list + else: + self.sites_blacklist += args.arg_list + print("[red]Blacklisted sites:") + for s in self.sites_blacklist: + print(f"- {s}") + + def do_regex_sites(self, args): + if args.startswith('"'): + args = args[1:] + if args.endswith('"'): + args = args[:-1] + self.sites_regex = r"{}".format(args) + print(f"New sites regex: [cyan]{self.sites_regex}") + + def do_sites_filters(self, args): + if args == "": + print("[green bold]Whitelisted sites:") + if self.sites_whitelist: + for s in self.sites_whitelist: + print(f"- {s}") + + print("[bold red]Blacklisted sites:") + if self.sites_blacklist: + for s in self.sites_blacklist: + print(f"- {s}") + + print(f"[bold cyan]Sites regex: [italics]{self.sites_regex}") + if args == "clear": + self.sites_whitelist = None + self.sites_blacklist = None + self.sites_regex = None + print("[bold green]Sites filters cleared") + + def do_list_replicas(self, args): + print("Datasets with selected replicas: ") + for dataset in self.replica_results: + print(f"\t-[cyan]{dataset}") + if __name__ == "__main__": app = MyCmdApp() diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index 20d3f7955..eb6a3ae0b 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -277,9 +277,11 @@ def get_dataset_files_replicas( def query_dataset(query, client=None, tree=False): client = client if client else get_rucio_client() - out = list(client.list_dids( - scope="cms", filters={"name": query, "type":"container"}, - long=False)) + out = list( + client.list_dids( + scope="cms", filters={"name": query, "type": "container"}, long=False + ) + ) if tree: outdict = {} for dataset in out: From 02cbbbece82dd61fe5319072eed26c2088eb9029 Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Mon, 28 Aug 2023 14:58:40 +0200 Subject: [PATCH 10/80] Added saving --- src/coffea/dataset_tools/dataset_query.py | 58 ++++++++++++++++++----- 1 file changed, 47 insertions(+), 11 deletions(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 5359c2c1c..6f77fcce4 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -9,6 +9,7 @@ import rucio_utils from collections import defaultdict import 
random +import yaml def print_dataset_query(query, dataset_list, selected, console): @@ -53,10 +54,11 @@ def __init__(self): { "L": "login", "Q": "query", + "QR": "query_results", "R": "replicas", "S": "select", "LS": "list_selected", - "lr": "list_results", + "LR": "list_replicas", } ) self.console = Console() @@ -71,6 +73,7 @@ def __init__(self): self.last_replicas_results = None self.replica_results = defaultdict(list) + self.replica_results_bysite = {} super().__init__(shortcuts=shortcuts) def do_login(self, args): @@ -104,7 +107,7 @@ def do_query(self, args): self.last_query_tree = outtree print("Use the command [bold red]select (S)[/] to selected the datasets") - def do_list_results(self, args): + def do_query_results(self, args): if self.last_query_list: print_dataset_query( self.last_query, @@ -133,9 +136,16 @@ def do_select(self, args): def do_list_selected(self, args): print("[cyan]Selected datasets:") + table = Table(title="Selected datasets") + table.add_column("Index", justify="left", style="cyan", no_wrap=True) + table.add_column("Dataset", style="magenta", no_wrap=True) + table.add_column("Replicas selected", justify="center") + table.add_column("N. of files", justify="center") for i, ds in enumerate(self.selected_datasets): - print(f"- [{i}] [blue]{ds}") - + table.add_row(str(i+1), ds, "[green bold]Y" if ds in self.replica_results else "[red]N", + str(len(self.replica_results[ds])) if ds in self.replica_results else "-") + self.console.print(table) + def do_replicas(self, args): if len(args.arg_list) == 0: print( @@ -152,6 +162,8 @@ def do_replicas(self, args): ) else: dataset = args.arg_list[0] + # adding it to the selected datasets + self.selected_datasets.append(dataset) with self.console.status( f"Querying rucio for replicas: [bold red]{dataset}[/]" @@ -235,14 +247,14 @@ def do_replicas(self, args): print("[orange]Doing nothing...") return + self.replica_results_bysite[dataset] = files_by_site + # Now let's print the results - tree = Tree(label=f"Replicas for [green]{dataset}") + tree = Tree(label=f"[bold orange]Replicas for [green]{dataset}") for site, files in files_by_site.items(): T = tree.add(f"[green]{site}") for f in files: T.add(f"[cyan]{f}") - - print("Final replicas selection") self.console.print(tree) def do_whitelist_sites(self, args): @@ -291,10 +303,34 @@ def do_sites_filters(self, args): print("[bold green]Sites filters cleared") def do_list_replicas(self, args): - print("Datasets with selected replicas: ") - for dataset in self.replica_results: - print(f"\t-[cyan]{dataset}") - + if len(args.arg_list)==0: + print("[red]Please call the command with the index of a selected dataset") + else: + if int(args) > len(self.selected_datasets): + print(f"[red] Select the replica with index < {len(self.selected_datasets)}") + return + else: + dataset = self.selected_datasets[int(args)-1] + if dataset not in self.replica_results: + print(f"[red bold]No replica info for dataset {dataset}. 
You need to selected the replicas with [cyan] replicas {args}") + tree = Tree(label=f"[bold orange]Replicas for [green]{dataset}") + + for site, files in self.replica_results_bysite[dataset].items(): + T = tree.add(f"[green]{site}") + for f in files: + T.add(f"[cyan]{f}") + + self.console.print(tree) + + def do_save(self, args): + '''Save the replica information in yaml format''' + if not len(args): + print("[red]Please provide an output filename") + else: + with open(args, "w") as file: + yaml.dump(dict(self.replica_results), file, + default_flow_style=False) + print(f"[green]File {args} saved!") if __name__ == "__main__": app = MyCmdApp() From daf5e52c3b8dd00aac3338a17b7d4d3ea503d076 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 28 Aug 2023 13:04:00 +0000 Subject: [PATCH 11/80] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/coffea/dataset_tools/dataset_query.py | 51 ++++++++++++++--------- src/coffea/dataset_tools/rucio_utils.py | 9 ++-- 2 files changed, 36 insertions(+), 24 deletions(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 6f77fcce4..3a0c8da59 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -1,15 +1,16 @@ -from cmd2 import Cmd +import random +from collections import defaultdict + import cmd2 +import rucio_utils +import yaml +from cmd2 import Cmd from rich import print -from rich.pretty import pprint from rich.console import Console +from rich.pretty import pprint +from rich.prompt import Prompt from rich.table import Table from rich.tree import Tree -from rich.prompt import Prompt -import rucio_utils -from collections import defaultdict -import random -import yaml def print_dataset_query(query, dataset_list, selected, console): @@ -142,10 +143,16 @@ def do_list_selected(self, args): table.add_column("Replicas selected", justify="center") table.add_column("N. 
of files", justify="center") for i, ds in enumerate(self.selected_datasets): - table.add_row(str(i+1), ds, "[green bold]Y" if ds in self.replica_results else "[red]N", - str(len(self.replica_results[ds])) if ds in self.replica_results else "-") + table.add_row( + str(i + 1), + ds, + "[green bold]Y" if ds in self.replica_results else "[red]N", + str(len(self.replica_results[ds])) + if ds in self.replica_results + else "-", + ) self.console.print(table) - + def do_replicas(self, args): if len(args.arg_list) == 0: print( @@ -248,7 +255,7 @@ def do_replicas(self, args): return self.replica_results_bysite[dataset] = files_by_site - + # Now let's print the results tree = Tree(label=f"[bold orange]Replicas for [green]{dataset}") for site, files in files_by_site.items(): @@ -280,7 +287,7 @@ def do_regex_sites(self, args): args = args[1:] if args.endswith('"'): args = args[:-1] - self.sites_regex = r"{}".format(args) + self.sites_regex = fr"{args}" print(f"New sites regex: [cyan]{self.sites_regex}") def do_sites_filters(self, args): @@ -303,18 +310,22 @@ def do_sites_filters(self, args): print("[bold green]Sites filters cleared") def do_list_replicas(self, args): - if len(args.arg_list)==0: + if len(args.arg_list) == 0: print("[red]Please call the command with the index of a selected dataset") else: if int(args) > len(self.selected_datasets): - print(f"[red] Select the replica with index < {len(self.selected_datasets)}") + print( + f"[red] Select the replica with index < {len(self.selected_datasets)}" + ) return else: - dataset = self.selected_datasets[int(args)-1] + dataset = self.selected_datasets[int(args) - 1] if dataset not in self.replica_results: - print(f"[red bold]No replica info for dataset {dataset}. You need to selected the replicas with [cyan] replicas {args}") + print( + f"[red bold]No replica info for dataset {dataset}. 
You need to selected the replicas with [cyan] replicas {args}" + ) tree = Tree(label=f"[bold orange]Replicas for [green]{dataset}") - + for site, files in self.replica_results_bysite[dataset].items(): T = tree.add(f"[green]{site}") for f in files: @@ -323,15 +334,15 @@ def do_list_replicas(self, args): self.console.print(tree) def do_save(self, args): - '''Save the replica information in yaml format''' + """Save the replica information in yaml format""" if not len(args): print("[red]Please provide an output filename") else: with open(args, "w") as file: - yaml.dump(dict(self.replica_results), file, - default_flow_style=False) + yaml.dump(dict(self.replica_results), file, default_flow_style=False) print(f"[green]File {args} saved!") + if __name__ == "__main__": app = MyCmdApp() app.cmdloop() diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index eb6a3ae0b..3ddb0fcf8 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -1,10 +1,11 @@ -import os import getpass -import re import json -from rucio.client import Client -from collections import defaultdict +import os +import re import subprocess +from collections import defaultdict + +from rucio.client import Client # Rucio needs the default configuration --> taken from CMS cvmfs defaults os.environ["RUCIO_HOME"] = "/cvmfs/cms.cern.ch/rucio/current" From c9101bb8f975b2f9131e809f831da055e3b7167d Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Mon, 28 Aug 2023 15:19:11 +0200 Subject: [PATCH 12/80] Formatting and flake8 --- src/coffea/dataset_tools/dataset_query.py | 46 +++++++++++++---------- src/coffea/dataset_tools/rucio_utils.py | 13 +++---- 2 files changed, 33 insertions(+), 26 deletions(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 6f77fcce4..ed88bebaf 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -1,12 +1,10 @@ -from cmd2 import Cmd import cmd2 from rich import print -from rich.pretty import pprint from rich.console import Console from rich.table import Table from rich.tree import Tree from rich.prompt import Prompt -import rucio_utils +from . import rucio_utils from collections import defaultdict import random import yaml @@ -29,14 +27,14 @@ def print_dataset_query(query, dataset_list, selected, console): table.add_row( name, f"[bold]({j})[/bold] {c}/{'-'.join(tiers)}", - f"[green bold]Y" if sel else f"[red]N", + "[green bold]Y" if sel else "[red]N", end_section=ic == ncond - 1, ) else: table.add_row( "", f"[bold]({j})[/bold] {c}/{'-'.join(tiers)}", - f"[green bold]Y" if sel else f"[red]N", + "[green bold]Y" if sel else "[red]N", end_section=ic == ncond - 1, ) ic += 1 @@ -142,10 +140,16 @@ def do_list_selected(self, args): table.add_column("Replicas selected", justify="center") table.add_column("N. 
of files", justify="center") for i, ds in enumerate(self.selected_datasets): - table.add_row(str(i+1), ds, "[green bold]Y" if ds in self.replica_results else "[red]N", - str(len(self.replica_results[ds])) if ds in self.replica_results else "-") + table.add_row( + str(i + 1), + ds, + "[green bold]Y" if ds in self.replica_results else "[red]N", + str(len(self.replica_results[ds])) + if ds in self.replica_results + else "-", + ) self.console.print(table) - + def do_replicas(self, args): if len(args.arg_list) == 0: print( @@ -248,7 +252,7 @@ def do_replicas(self, args): return self.replica_results_bysite[dataset] = files_by_site - + # Now let's print the results tree = Tree(label=f"[bold orange]Replicas for [green]{dataset}") for site, files in files_by_site.items(): @@ -258,7 +262,7 @@ def do_replicas(self, args): self.console.print(tree) def do_whitelist_sites(self, args): - if self.sites_whitelist == None: + if self.sites_whitelist is None: self.sites_whitelist = args.arg_list else: self.sites_whitelist += args.arg_list @@ -267,7 +271,7 @@ def do_whitelist_sites(self, args): print(f"- {s}") def do_blacklist_sites(self, args): - if self.sites_blacklist == None: + if self.sites_blacklist is None: self.sites_blacklist = args.arg_list else: self.sites_blacklist += args.arg_list @@ -303,18 +307,22 @@ def do_sites_filters(self, args): print("[bold green]Sites filters cleared") def do_list_replicas(self, args): - if len(args.arg_list)==0: + if len(args.arg_list) == 0: print("[red]Please call the command with the index of a selected dataset") else: if int(args) > len(self.selected_datasets): - print(f"[red] Select the replica with index < {len(self.selected_datasets)}") + print( + f"[red] Select the replica with index < {len(self.selected_datasets)}" + ) return else: - dataset = self.selected_datasets[int(args)-1] + dataset = self.selected_datasets[int(args) - 1] if dataset not in self.replica_results: - print(f"[red bold]No replica info for dataset {dataset}. You need to selected the replicas with [cyan] replicas {args}") + print( + f"[red bold]No replica info for dataset {dataset}. 
You need to selected the replicas with [cyan] replicas {args}" + ) tree = Tree(label=f"[bold orange]Replicas for [green]{dataset}") - + for site, files in self.replica_results_bysite[dataset].items(): T = tree.add(f"[green]{site}") for f in files: @@ -323,15 +331,15 @@ def do_list_replicas(self, args): self.console.print(tree) def do_save(self, args): - '''Save the replica information in yaml format''' + """Save the replica information in yaml format""" if not len(args): print("[red]Please provide an output filename") else: with open(args, "w") as file: - yaml.dump(dict(self.replica_results), file, - default_flow_style=False) + yaml.dump(dict(self.replica_results), file, default_flow_style=False) print(f"[green]File {args} saved!") + if __name__ == "__main__": app = MyCmdApp() app.cmdloop() diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index eb6a3ae0b..ce2825c0f 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -82,12 +82,12 @@ def get_xrootd_sites_map(): continue try: data = json.load(open(conf)) - except: + except Exception: continue for site in data: if site["type"] != "DISK": continue - if site["rse"] == None: + if site["rse"] is None: continue for proc in site["protocols"]: if proc["protocol"] == "XRootD": @@ -187,7 +187,7 @@ def get_dataset_files_replicas( meta = filedata["pfns"][rses[site][0]] if ( meta["type"] != "DISK" - or meta["volatile"] == True + or meta["volatile"] or filedata["states"][site] != "AVAILABLE" or site not in sites_xrootd_prefix ): @@ -220,7 +220,7 @@ def get_dataset_files_replicas( meta = filedata["pfns"][rses[site][0]] if ( meta["type"] != "DISK" - or meta["volatile"] == True + or meta["volatile"] or filedata["states"][site] != "AVAILABLE" or site not in sites_xrootd_prefix ): @@ -238,7 +238,7 @@ def get_dataset_files_replicas( meta = filedata["pfns"][rses[site][0]] if ( meta["type"] != "DISK" - or meta["volatile"] == True + or meta["volatile"] or filedata["states"][site] != "AVAILABLE" or site not in sites_xrootd_prefix ): @@ -259,10 +259,9 @@ def get_dataset_files_replicas( outfiles.append(outfile[0]) outsites.append(outsite[0]) else: - raise NotImplemented(f"Mode {mode} not yet implemented!") + raise NotImplementedError(f"Mode {mode} not yet implemented!") # Computing replicas by site: - totfiles = len(outfiles) sites_counts = defaultdict(int) if mode == "full": for sites_by_file in outsites: From b78f6403f1b05ad5197baa0c8efaf00d234480c1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 28 Aug 2023 13:27:01 +0000 Subject: [PATCH 13/80] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/coffea/dataset_tools/dataset_query.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 99c41579b..cab5c7ef8 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -1,14 +1,14 @@ import random from collections import defaultdict + import cmd2 +import rucio_utils import yaml from rich import print from rich.console import Console from rich.prompt import Prompt from rich.table import Table from rich.tree import Tree -from rich.prompt import Prompt -import rucio_utils def print_dataset_query(query, dataset_list, selected, console): From 21cd240117f351d933a4874ed5474e0835c6d992 Mon Sep 17 00:00:00 
2001 From: Davide Valsecchi Date: Mon, 28 Aug 2023 15:31:17 +0200 Subject: [PATCH 14/80] Fixed comments spelling --- src/coffea/dataset_tools/rucio_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index 8fa8dd906..96b17b454 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -64,7 +64,7 @@ def get_rucio_client(proxy=None) -> Client: def get_xrootd_sites_map(): """ - The mapping beetween RSE (sites) and the xrootd prefix rules is read + The mapping between RSE (sites) and the xrootd prefix rules is read from `/cvmfs/cms/cern.ch/SITECONF/*site*/storage.json`. This function returns the list of xrootd prefix rules for each site. @@ -136,7 +136,7 @@ def get_dataset_files_replicas( of all the replicas of the files in a CMS dataset. The sites can be filtered in 3 different ways: - - `whilist_sites`: list of sites to select from. If the file is not found there, raise an Expection. + - `whilist_sites`: list of sites to select from. If the file is not found there, raise an Exception. - `blacklist_sites`: list of sites to avoid. If the file has no left site, raise an Exception - `regex_sites`: regex expression to restrict the list of sites. @@ -169,7 +169,7 @@ def get_dataset_files_replicas( - If `mode=="first"`, returns a list of sites for the first replica of each file. sites_counts: dict - Metadata couting the coverage of the dataset by site + Metadata counting the coverage of the dataset by site """ sites_xrootd_prefix = get_xrootd_sites_map() From dbed7d4a5506233632b8b050084f3faf5d24c32f Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Mon, 28 Aug 2023 09:15:12 -0500 Subject: [PATCH 15/80] py 3.9 for cirrus --- .cirrus.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cirrus.yml b/.cirrus.yml index 42c13adb8..223b900c4 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -11,7 +11,7 @@ task: cpu: 2 memory: 7G matrix: - - image: python:3.8 + - image: python:3.9 - image: python:3.11 create_venv_script: | From 153c0ae1c7a9c394a2a61c20ad3c52fb4acdcf6d Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Wed, 30 Aug 2023 14:06:50 +0200 Subject: [PATCH 16/80] Switched to rucio-clients --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 82fa7e7e1..f6521fe64 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,7 +58,8 @@ dependencies = [ "pandas", "hist>=2", "cachetools", - "rucio>=32.2.0", + "rucio-clients>=32;python_version>'3.8'", + "rucio-clients<32;python_version<'3.9'", "cmd2" ] dynamic = ["version"] From 5a45e4a270cbd73c05adf9fe74fdf5d005823d98 Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Wed, 30 Aug 2023 14:42:14 +0200 Subject: [PATCH 17/80] Added some docs to the cli --- src/coffea/dataset_tools/dataset_query.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index cab5c7ef8..17a599096 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -2,7 +2,7 @@ from collections import defaultdict import cmd2 -import rucio_utils +from . 
import rucio_utils import yaml from rich import print from rich.console import Console @@ -342,5 +342,24 @@ def do_save(self, args): if __name__ == "__main__": + intro_msg = """[bold yellow]Welcome to the datasets discovery coffea CLI![/bold yellow] +Use this CLI tool to query the CMS datasets and to select interactively the grid sites to use for reading the files in your analysis. +Some basic commands: + - [bold cyan]query (Q)[/]: Look for datasets with * wildcards (like in DAS) + - [bold cyan]select (S)[/]: Select datasets to process further from query results + - [bold cyan]replicas (R)[/]: Query rucio to look for files replica and then select the preferred sites + - [bold cyan]list_selected (LS)[/]: Print a list of the selected datasets + - [bold cyan]list_replicas (LR) index[/]: Print the selected files replicas for the selected dataset + - [bold cyan]sites_filters[/]: show the active sites filters + - [bold cyan]sites_filters clear[/]: clear all the active sites filters + - [bold cyan]whitelist_sites[/]: Select sites to whitelist for replica queries + - [bold cyan]blacklist_sites[/]: Select sites to blacklist for replica queries + - [bold cyan]regex_sites[/]: Select sites with a regex for replica queries: please wrap the regex like "T[123]_(FR|IT|BE|CH|DE)_\w+" + - [bold cyan]save (S) file.yaml[/]: Save the replicas results to file for further processing + - [bold cyan]help[/]: get help! +""" + console = Console() + console.print(intro_msg, justify="left") + app = DatasetQueryApp() app.cmdloop() From 00fb57c2ea23e75046669ebeaf85c827bfe9fa59 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 30 Aug 2023 12:47:43 +0000 Subject: [PATCH 18/80] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pyproject.toml | 2 +- src/coffea/dataset_tools/dataset_query.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f6521fe64..4adec6b30 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,7 +59,7 @@ dependencies = [ "hist>=2", "cachetools", "rucio-clients>=32;python_version>'3.8'", - "rucio-clients<32;python_version<'3.9'", + "rucio-clients<32;python_version<'3.9'", "cmd2" ] dynamic = ["version"] diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 17a599096..91b3910c7 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -2,7 +2,6 @@ from collections import defaultdict import cmd2 -from . import rucio_utils import yaml from rich import print from rich.console import Console @@ -10,6 +9,8 @@ from rich.table import Table from rich.tree import Tree +from . import rucio_utils + def print_dataset_query(query, dataset_list, selected, console): table = Table(title=f"Query: [bold red]{query}") @@ -342,7 +343,7 @@ def do_save(self, args): if __name__ == "__main__": - intro_msg = """[bold yellow]Welcome to the datasets discovery coffea CLI![/bold yellow] + intro_msg = r"""[bold yellow]Welcome to the datasets discovery coffea CLI![/bold yellow] Use this CLI tool to query the CMS datasets and to select interactively the grid sites to use for reading the files in your analysis. 
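[Editor's note, not part of the patch] For readers who want the same replica lookup outside the interactive prompt, a rough, untested sketch of the underlying rucio_utils call is below. The keyword names follow the docstring shown earlier (whitelist/blacklist/regex site filters, mode "full" or "first"); the dataset name is a placeholder and the (files, sites, sites_counts) return layout is an assumption.

from coffea.dataset_tools import rucio_utils

dataset = "/MyPrimaryDataset/MyCampaign-v1/NANOAODSIM"  # placeholder DAS-style name

# assumed keyword names and return tuple, per the docstring above
files, sites, sites_counts = rucio_utils.get_dataset_files_replicas(
    dataset,
    regex_sites=r"T[123]_(FR|IT|BE|CH|DE)_\w+",  # regex form taken from the CLI help text
    mode="full",  # "first" would instead keep a single replica per file
)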
Some basic commands: - [bold cyan]query (Q)[/]: Look for datasets with * wildcards (like in DAS) From 6d5d8005506efbeccf78fe401204c934413c15c8 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Wed, 30 Aug 2023 12:43:17 -0500 Subject: [PATCH 19/80] roll back test to py3.8 --- .cirrus.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cirrus.yml b/.cirrus.yml index 223b900c4..42c13adb8 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -11,7 +11,7 @@ task: cpu: 2 memory: 7G matrix: - - image: python:3.9 + - image: python:3.8 - image: python:3.11 create_venv_script: | From e0f17266bfae5fdce7784465d3d7eeccf4d74cf6 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Wed, 30 Aug 2023 12:46:01 -0500 Subject: [PATCH 20/80] math.ceil instead of integer division --- src/coffea/dataset_tools/preprocess.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/coffea/dataset_tools/preprocess.py b/src/coffea/dataset_tools/preprocess.py index 7fdc46857..ca5400492 100644 --- a/src/coffea/dataset_tools/preprocess.py +++ b/src/coffea/dataset_tools/preprocess.py @@ -3,6 +3,7 @@ import awkward import dask import dask_awkward +import math import numpy import uproot @@ -46,7 +47,7 @@ def _get_steps( out = numpy.array(out, dtype="int64") out = numpy.stack((out[:-1], out[1:]), axis=1) else: - n_steps = num_entries // target_step_size + n_steps = math.ceil(num_entries // target_step_size) out = numpy.array( [ [ From 885bbf0aa153f639f7bf71f7c686cf9564f97e5d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 30 Aug 2023 17:46:31 +0000 Subject: [PATCH 21/80] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/coffea/dataset_tools/preprocess.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/coffea/dataset_tools/preprocess.py b/src/coffea/dataset_tools/preprocess.py index ca5400492..18f4e615c 100644 --- a/src/coffea/dataset_tools/preprocess.py +++ b/src/coffea/dataset_tools/preprocess.py @@ -3,7 +3,6 @@ import awkward import dask import dask_awkward -import math import numpy import uproot From 3283256e46c9144f8ad8b43197c715e8ded80fb3 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Wed, 30 Aug 2023 14:00:56 -0500 Subject: [PATCH 22/80] Forgot to remove the slash. 
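[Editor's note] A quick numeric check of why the slash matters (illustrative values only): with integer division inside ceil the truncation has already happened, so the step count comes out one short whenever the entries do not divide evenly.

import math

num_entries, target_step_size = 1_000, 300
math.ceil(num_entries // target_step_size)  # 3: too few steps to cover 1000 entries in chunks of <= 300
math.ceil(num_entries / target_step_size)   # 4: the intended number of steps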
--- src/coffea/dataset_tools/preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coffea/dataset_tools/preprocess.py b/src/coffea/dataset_tools/preprocess.py index 18f4e615c..f97153c92 100644 --- a/src/coffea/dataset_tools/preprocess.py +++ b/src/coffea/dataset_tools/preprocess.py @@ -46,7 +46,7 @@ def _get_steps( out = numpy.array(out, dtype="int64") out = numpy.stack((out[:-1], out[1:]), axis=1) else: - n_steps = math.ceil(num_entries // target_step_size) + n_steps = math.ceil(num_entries / target_step_size) out = numpy.array( [ [ From 4c8d917ab28307d844b650faa3da7ba9537f749f Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 17 Oct 2023 13:39:40 -0500 Subject: [PATCH 23/80] make rucio-clients an extra --- pyproject.toml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9bc350125..d832f945b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,9 +58,6 @@ dependencies = [ "pandas", "hist>=2", "cachetools", - "rucio-clients>=32;python_version>'3.8'", - "rucio-clients<32;python_version<'3.9'", - "cmd2" ] dynamic = ["version"] @@ -88,6 +85,11 @@ servicex = [ "servicex>=2.5.3", "func-adl_servicex", ] +rucio = [ + "rucio-clients>=32;python_version>'3.8'", + "rucio-clients<32;python_version<'3.9'", + "cmd2", +] dev = [ "pre-commit", "flake8", From 9c598bba509caec6a489b8fbf13aff51202e9849 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Fri, 10 Nov 2023 07:05:26 -0600 Subject: [PATCH 24/80] two wrappers to apply processor wrapped code to datasets --- src/coffea/dataset_tools/__init__.py | 3 ++- src/coffea/dataset_tools/apply_processor.py | 23 +++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 src/coffea/dataset_tools/apply_processor.py diff --git a/src/coffea/dataset_tools/__init__.py b/src/coffea/dataset_tools/__init__.py index c02a96e48..445481c3c 100644 --- a/src/coffea/dataset_tools/__init__.py +++ b/src/coffea/dataset_tools/__init__.py @@ -1,3 +1,4 @@ +from coffea.dataset_tools.apply_processor import apply_to_fileset, apply_to_one_dataset from coffea.dataset_tools.preprocess import preprocess -__all__ = ["preprocess"] +__all__ = ["preprocess", "apply_to_one_dataset", "apply_to_fileset"] diff --git a/src/coffea/dataset_tools/apply_processor.py b/src/coffea/dataset_tools/apply_processor.py new file mode 100644 index 000000000..1bcd7d18a --- /dev/null +++ b/src/coffea/dataset_tools/apply_processor.py @@ -0,0 +1,23 @@ +from coffea.nanoevents import NanoAODSchema, NanoEventsFactory +from coffea.processor import ProcessorABC + + +def apply_to_one_dataset( + proc: ProcessorABC, dataset, schemaclass=NanoAODSchema, metadata={} +): + files = dataset["files"] + events = NanoEventsFactory.from_root( + files, + metadata=metadata, + schemaclass=NanoAODSchema, + ) + return proc.process(events) + + +def apply_to_fileset(proc: ProcessorABC, fileset, schemaclass=NanoAODSchema): + out = {} + for name, dataset in fileset.items(): + metadata = dataset.get("metadata", {}) + metadata["dataset"] = name + out[name] = apply_to_one_dataset(proc, dataset, schemaclass, metadata) + return out From 20697862a40cf95132fa72730b7984dbc0396e58 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Fri, 10 Nov 2023 08:31:45 -0600 Subject: [PATCH 25/80] let preprocess deal with missing files in a configurable way, add in helper to replicate maxchunks functionality --- src/coffea/dataset_tools/__init__.py | 9 ++++++++- src/coffea/dataset_tools/manipulations.py | 14 ++++++++++++++ 
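[Editor's note, not part of the patch] A minimal usage sketch for the two wrappers introduced above, assuming a fileset in the "files"/"metadata" layout used in this series (dataset name, file path, tree name, and the toy processor are all placeholders) and a processor whose process method builds dask-awkward outputs.

import dask
import dask_awkward
from coffea import processor
from coffea.dataset_tools import apply_to_fileset


class CountEvents(processor.ProcessorABC):
    """Toy processor for illustration: counts events per dataset."""

    def process(self, events):
        return {"entries": dask_awkward.num(events, axis=0)}

    def postprocess(self, accumulator):
        return accumulator


fileset = {
    "ZJets": {  # hypothetical dataset entry
        "files": {"zjets.root": "Events"},
        "metadata": {"is_mc": True},
    },
}

to_compute = apply_to_fileset(CountEvents(), fileset)  # one delayed output per dataset
(out,) = dask.compute(to_compute)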
src/coffea/dataset_tools/preprocess.py | 15 ++++++++++++--- 3 files changed, 34 insertions(+), 4 deletions(-) create mode 100644 src/coffea/dataset_tools/manipulations.py diff --git a/src/coffea/dataset_tools/__init__.py b/src/coffea/dataset_tools/__init__.py index 445481c3c..888df7eaf 100644 --- a/src/coffea/dataset_tools/__init__.py +++ b/src/coffea/dataset_tools/__init__.py @@ -1,4 +1,11 @@ from coffea.dataset_tools.apply_processor import apply_to_fileset, apply_to_one_dataset +from coffea.dataset_tools.manipulations import max_chunks, slice_chunks from coffea.dataset_tools.preprocess import preprocess -__all__ = ["preprocess", "apply_to_one_dataset", "apply_to_fileset"] +__all__ = [ + "preprocess", + "apply_to_one_dataset", + "apply_to_fileset", + "max_chunks", + "slice_chunks", +] diff --git a/src/coffea/dataset_tools/manipulations.py b/src/coffea/dataset_tools/manipulations.py new file mode 100644 index 000000000..b7e40e785 --- /dev/null +++ b/src/coffea/dataset_tools/manipulations.py @@ -0,0 +1,14 @@ +def max_chunks(fileset, maxchunks=None): + return slice_chunks(fileset, slice(maxchunks)) + + +def slice_chunks(fileset, theslice=slice(None)): + if not isinstance(theslice, slice): + theslice = slice(theslice) + + out = fileset.copy() + for name, entry in fileset.items(): + for fname, finfo in entry["files"].items(): + out[name]["files"][fname]["steps"] = finfo["steps"][theslice] + + return out diff --git a/src/coffea/dataset_tools/preprocess.py b/src/coffea/dataset_tools/preprocess.py index f97153c92..eef96b549 100644 --- a/src/coffea/dataset_tools/preprocess.py +++ b/src/coffea/dataset_tools/preprocess.py @@ -12,6 +12,8 @@ def _get_steps( maybe_step_size=None, align_clusters=False, recalculate_seen_steps=False, + skip_bad_files=False, + file_exceptions=(FileNotFoundError, OSError), ): nf_backend = awkward.backend(normed_files) lz_or_nf = awkward.typetracer.length_zero_if_typetracer(normed_files) @@ -20,9 +22,12 @@ def _get_steps( for arg in lz_or_nf: try: the_file = uproot.open({arg.file: None}) - except FileNotFoundError: - array.append(None) - continue + except file_exceptions as e: + if skip_bad_files: + array.append(None) + continue + else: + raise e tree = the_file[arg.object_path] num_entries = tree.num_entries @@ -95,6 +100,8 @@ def preprocess( align_clusters=False, recalculate_seen_steps=False, files_per_batch=1, + skip_bad_files=False, + file_exceptions=(FileNotFoundError, OSError), ): out_updated = fileset.copy() out_available = fileset.copy() @@ -128,6 +135,8 @@ def preprocess( maybe_step_size=maybe_step_size, align_clusters=align_clusters, recalculate_seen_steps=recalculate_seen_steps, + skip_bad_files=skip_bad_files, + file_exceptions=file_exceptions, ) all_processed_files = dask.compute(files_to_preprocess)[0] From e3959f4252114ba8bbd477a179e4fbe64756f241 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 21 Nov 2023 16:46:11 -0600 Subject: [PATCH 26/80] remove old tests and switch to new tools in tests --- src/coffea/dataset_tools/apply_processor.py | 6 +- src/coffea/dataset_tools/manipulations.py | 5 +- src/coffea/dataset_tools/preprocess.py | 5 +- src/coffea/processor/__init__.py | 141 +- src/coffea/processor/accumulator.py | 6 +- src/coffea/processor/dask/__init__.py | 77 - src/coffea/processor/dataframe.py | 117 - src/coffea/processor/executor.py | 2220 ----------------- src/coffea/processor/helpers.py | 273 -- src/coffea/processor/parsl/__init__.py | 0 src/coffea/processor/parsl/condor_config.py | 77 - src/coffea/processor/parsl/detail.py | 89 - 
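[Editor's note, not part of the patch] To make the new helpers concrete, a small illustrative example: slice_chunks only trims each file's precomputed "steps" list, and max_chunks(fileset, n) is shorthand for slice_chunks(fileset, slice(n)). The fileset below is hypothetical.

from coffea.dataset_tools import max_chunks, slice_chunks

fileset = {
    "demo": {  # hypothetical preprocessed dataset entry
        "files": {
            "f.root": {
                "object_path": "Events",
                "steps": [[0, 100], [100, 200], [200, 300]],
            }
        }
    }
}

trimmed = slice_chunks(fileset, slice(2))
assert trimmed["demo"]["files"]["f.root"]["steps"] == [[0, 100], [100, 200]]
# max_chunks is the maxchunks-style shortcut for the same operation
assert max_chunks(fileset, 2) == trimmed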
src/coffea/processor/parsl/slurm_config.py | 67 - src/coffea/processor/parsl/timeout.py | 21 - src/coffea/processor/servicex/__init__.py | 39 - src/coffea/processor/servicex/analysis.py | 45 - .../processor/servicex/dask_executor.py | 89 - src/coffea/processor/servicex/data_source.py | 116 - src/coffea/processor/servicex/executor.py | 183 -- .../processor/servicex/local_executor.py | 69 - src/coffea/processor/spark/__init__.py | 0 src/coffea/processor/spark/detail.py | 133 - src/coffea/processor/spark/spark_executor.py | 195 -- src/coffea/processor/templates/__init__.py | 0 src/coffea/processor/templates/spark.py.tmpl | 24 - tests/test_local_executors.py | 125 - tests/test_workitem.py | 29 - tests/wq.py | 66 - 28 files changed, 16 insertions(+), 4201 deletions(-) delete mode 100644 src/coffea/processor/dask/__init__.py delete mode 100644 src/coffea/processor/dataframe.py delete mode 100644 src/coffea/processor/executor.py delete mode 100644 src/coffea/processor/helpers.py delete mode 100644 src/coffea/processor/parsl/__init__.py delete mode 100644 src/coffea/processor/parsl/condor_config.py delete mode 100644 src/coffea/processor/parsl/detail.py delete mode 100644 src/coffea/processor/parsl/slurm_config.py delete mode 100644 src/coffea/processor/parsl/timeout.py delete mode 100644 src/coffea/processor/servicex/__init__.py delete mode 100644 src/coffea/processor/servicex/analysis.py delete mode 100644 src/coffea/processor/servicex/dask_executor.py delete mode 100644 src/coffea/processor/servicex/data_source.py delete mode 100644 src/coffea/processor/servicex/executor.py delete mode 100644 src/coffea/processor/servicex/local_executor.py delete mode 100644 src/coffea/processor/spark/__init__.py delete mode 100644 src/coffea/processor/spark/detail.py delete mode 100644 src/coffea/processor/spark/spark_executor.py delete mode 100644 src/coffea/processor/templates/__init__.py delete mode 100644 src/coffea/processor/templates/spark.py.tmpl delete mode 100644 tests/test_local_executors.py delete mode 100644 tests/test_workitem.py delete mode 100755 tests/wq.py diff --git a/src/coffea/dataset_tools/apply_processor.py b/src/coffea/dataset_tools/apply_processor.py index 1bcd7d18a..610bae8fa 100644 --- a/src/coffea/dataset_tools/apply_processor.py +++ b/src/coffea/dataset_tools/apply_processor.py @@ -9,15 +9,15 @@ def apply_to_one_dataset( events = NanoEventsFactory.from_root( files, metadata=metadata, - schemaclass=NanoAODSchema, - ) + schemaclass=schemaclass, + ).events() return proc.process(events) def apply_to_fileset(proc: ProcessorABC, fileset, schemaclass=NanoAODSchema): out = {} for name, dataset in fileset.items(): - metadata = dataset.get("metadata", {}) + metadata = dataset.get("metadata", {}).copy() metadata["dataset"] = name out[name] = apply_to_one_dataset(proc, dataset, schemaclass, metadata) return out diff --git a/src/coffea/dataset_tools/manipulations.py b/src/coffea/dataset_tools/manipulations.py index b7e40e785..e963a067c 100644 --- a/src/coffea/dataset_tools/manipulations.py +++ b/src/coffea/dataset_tools/manipulations.py @@ -1,3 +1,6 @@ +import copy + + def max_chunks(fileset, maxchunks=None): return slice_chunks(fileset, slice(maxchunks)) @@ -6,7 +9,7 @@ def slice_chunks(fileset, theslice=slice(None)): if not isinstance(theslice, slice): theslice = slice(theslice) - out = fileset.copy() + out = copy.deepcopy(fileset) for name, entry in fileset.items(): for fname, finfo in entry["files"].items(): out[name]["files"][fname]["steps"] = finfo["steps"][theslice] diff --git 
a/src/coffea/dataset_tools/preprocess.py b/src/coffea/dataset_tools/preprocess.py index eef96b549..ad4d4bc6d 100644 --- a/src/coffea/dataset_tools/preprocess.py +++ b/src/coffea/dataset_tools/preprocess.py @@ -1,3 +1,4 @@ +import copy import math import awkward @@ -103,8 +104,8 @@ def preprocess( skip_bad_files=False, file_exceptions=(FileNotFoundError, OSError), ): - out_updated = fileset.copy() - out_available = fileset.copy() + out_updated = copy.deepcopy(fileset) + out_available = copy.deepcopy(fileset) all_ak_norm_files = {} files_to_preprocess = {} for name, info in fileset.items(): diff --git a/src/coffea/processor/__init__.py b/src/coffea/processor/__init__.py index 54aea102d..91b6f357d 100644 --- a/src/coffea/processor/__init__.py +++ b/src/coffea/processor/__init__.py @@ -2,146 +2,11 @@ """ -# deprecated run_uproot_job & executor usage: -from functools import partial - -from coffea.nanoevents.schemas import NanoAODSchema, TreeMakerSchema - -from .accumulator import ( - Accumulatable, - AccumulatorABC, - accumulate, - column_accumulator, - defaultdict_accumulator, - dict_accumulator, - list_accumulator, - set_accumulator, - value_accumulator, -) -from .dataframe import LazyDataFrame -from .executor import ( - DaskExecutor, - FuturesExecutor, - IterativeExecutor, - ParslExecutor, - Runner, - WorkQueueExecutor, - run_spark_job, -) -from .helpers import PackedSelection, Weights +from .accumulator import AccumulatorABC, dict_accumulator from .processor import ProcessorABC - -def _run_x_job( - fileset, - treename, - processor_instance, - executor, - executor_args={}, - pre_executor=None, - pre_args=None, - chunksize=100000, - maxchunks=None, - metadata_cache=None, - dynamic_chunksize=None, - format="root", -): - """ - Please use instead, e.g.: - - executor = IterativeExecutor() - run = processor.Runner( - executor=executor, - schema=processor.NanoAODSchema, - ) - hists = run(filelist, "Events", processor_instance=processor_instance) - """ - - # turn this deprecation warning on from coffea.__version__ >= 0.8 on - # from coffea.util import deprecate - # deprecate( - # RuntimeError(f"This method is deprecated, please use directly the new: {executor} and {Runner} classes.\n {_run_x_job.__doc__}"), # noqa: E501 - # 0.9, - # ) - - # extract executor kwargs - exe_args = {} - exe_fields = executor.__dataclass_fields__.keys() - exe_keys = list(executor_args.keys()) - for k in exe_keys: - if k in exe_fields: - exe_args[k] = executor_args.pop(k) - - executor = executor(**exe_args) - - # extract preexecutor kwargs - if pre_executor is not None and pre_args is not None: - pre_exe_args = {} - pre_exe_fields = pre_executor.__dataclass_fields__.keys() - pre_exe_keys = list(pre_args.keys()) - for k in pre_exe_keys: - if k in pre_exe_fields: - pre_exe_args[k] = pre_args.pop(k) - - pre_executor = pre_executor(**pre_exe_args) - - # make Runner instance, assume other args are for _work_function & co. 
- run = Runner( - executor=executor, - chunksize=chunksize, - maxchunks=maxchunks, - metadata_cache=metadata_cache, - dynamic_chunksize=dynamic_chunksize, - format=format, - **executor_args, - ) - - return run( - fileset, - treename, - processor_instance=processor_instance, - ) - - -run_uproot_job = partial(_run_x_job, format="root") -run_parquet_job = partial(_run_x_job, format="parquet") - -iterative_executor = IterativeExecutor -futures_executor = FuturesExecutor -dask_executor = DaskExecutor -parsl_executor = ParslExecutor -work_queue_executor = WorkQueueExecutor - - __all__ = [ - "ProcessorABC", - "LazyDataFrame", - "Weights", - "PackedSelection", - "IterativeExecutor", - "FuturesExecutor", - "DaskExecutor", - "ParslExecutor", - "WorkQueueExecutor", - "Runner", - "run_spark_job", - "accumulate", - "Accumulatable", - "AccumulatorABC", - "value_accumulator", - "list_accumulator", - "set_accumulator", "dict_accumulator", - "defaultdict_accumulator", - "column_accumulator", - "NanoAODSchema", - "TreeMakerSchema", - # following methods are deprecated - "run_uproot_job", - "run_parquet_job", - "iterative_executor", - "futures_executor", - "dask_executor", - "parsl_executor", - "work_queue_executor", + "AccumulatorABC", + "ProcessorABC", ] diff --git a/src/coffea/processor/accumulator.py b/src/coffea/processor/accumulator.py index 282214a6c..8ad12dab1 100644 --- a/src/coffea/processor/accumulator.py +++ b/src/coffea/processor/accumulator.py @@ -57,14 +57,14 @@ def add(a: Accumulatable, b: Accumulatable) -> Accumulatable: out[key] = ( copy.deepcopy(a[key]) if not isinstance(a[key], DaskMethodsMixin) - else a[key] + else copy.copy(a[key]) ) for key in b: if key not in lhs: out[key] = ( copy.deepcopy(b[key]) if not isinstance(b[key], DaskMethodsMixin) - else b[key] + else copy.copy(b[key]) ) return out raise ValueError( @@ -93,7 +93,7 @@ def iadd(a: Accumulatable, b: Accumulatable) -> Accumulatable: a[key] = ( copy.deepcopy(b[key]) if not isinstance(b[key], DaskMethodsMixin) - else b[key] + else copy.copy(b[key]) ) return a raise ValueError( diff --git a/src/coffea/processor/dask/__init__.py b/src/coffea/processor/dask/__init__.py deleted file mode 100644 index fc73f0d8e..000000000 --- a/src/coffea/processor/dask/__init__.py +++ /dev/null @@ -1,77 +0,0 @@ -import os -from collections.abc import MutableMapping -from threading import Lock - -import blosc -from distributed import WorkerPlugin, get_worker -from zict import LRU, Buffer, File, Func - - -class ColumnCache(WorkerPlugin, MutableMapping): - name = "columncache" - - def __init__(self, maxmem=5e8, maxcompressed=2e9, maxdisk=1e10): - self._maxmem = maxmem - self._maxcompressed = maxcompressed - self._maxdisk = maxdisk - - def setup(self, worker): - self.cache = Buffer( - fast={}, - slow=Func( - dump=blosc.pack_array, - load=blosc.unpack_array, - d=Buffer( - fast={}, - slow=LRU( - n=self._maxdisk, - d=File(os.path.join(worker.local_directory, "cache")), - weight=lambda k, v: len(v), - ), - n=self._maxcompressed, - weight=lambda k, v: len(v), - ), - ), - n=self._maxmem, - weight=lambda k, v: v.nbytes, - ) - self.lock = Lock() - self.hits = 0 - self.misses = 0 - - def teardown(self, worker): - pass - - def __getitem__(self, key): - with self.lock: - try: - out = self.cache[key] - self.hits += 1 - return out - except KeyError: - self.misses += 1 - raise - - def __setitem__(self, key, value): - with self.lock: - self.cache[key] = value - - def __delitem__(self, key): - with self.lock: - del self.cache[key] - - def __iter__(self): - with 
self.lock: - return iter(self.cache) - - def __len__(self): - with self.lock: - return len(self.cache) - - -def register_columncache(client): - plugins = set() - for p in client.run(lambda: set(get_worker().plugins)).values(): - plugins |= p - if ColumnCache.name not in plugins: - client.register_worker_plugin(ColumnCache()) diff --git a/src/coffea/processor/dataframe.py b/src/coffea/processor/dataframe.py deleted file mode 100644 index 20fa48445..000000000 --- a/src/coffea/processor/dataframe.py +++ /dev/null @@ -1,117 +0,0 @@ -from collections.abc import MutableMapping - -import uproot - - -class LazyDataFrame(MutableMapping): - """Simple delayed uproot reader (a la lazyarrays) - - One can access branches either through ``df["bname"]`` or ``df.bname``, although - the latter is restricted to branches that do not start with a leading underscore. - Keeps track of values accessed, in the `materialized` attribute. - - Parameters - ---------- - tree : uproot.TTree - Tree to read - entrystart : int, optional - First entry to read, default: 0 - entrystop : int, optional - Last entry to read, default None (read to end) - preload_items : iterable - Force preloading of a set of columns from the tree - metadata : Mapping - Additional metadata for the dataframe - """ - - def __init__( - self, tree, entrystart=None, entrystop=None, preload_items=None, metadata=None - ): - self._tree = tree - self._branchargs = { - "decompression_executor": uproot.source.futures.TrivialExecutor(), - "interpretation_executor": uproot.source.futures.TrivialExecutor(), - } - if entrystart is None or entrystart < 0: - entrystart = 0 - if entrystop is None or entrystop > tree.num_entries: - entrystop = tree.num_entries - self._branchargs["entry_start"] = entrystart - self._branchargs["entry_stop"] = entrystop - self._available = {k for k in self._tree.keys()} - self._dict = {} - self._materialized = set() - if preload_items: - self.preload(preload_items) - self._metadata = metadata - - def __delitem__(self, key): - del self._dict[key] - - def __getitem__(self, key): - if key in self._dict: - return self._dict[key] - elif key in self._tree: - self._materialized.add(key) - array = self._tree[key].array(**self._branchargs) - self._dict[key] = array - return self._dict[key] - else: - raise KeyError(key) - - def __getattr__(self, key): - if key.startswith("_"): - raise AttributeError(key) - try: - return self.__getitem__(key) - except KeyError: - raise AttributeError(key) - - def __iter__(self): - yield from self._available - - def __len__(self): - return len(self._dict) - - def __setitem__(self, key, value): - self._dict[key] = value - - def __contains__(self, key): - # by default, MutableMapping uses __getitem__ to test, but we want to avoid materialization - return key in self._dict or key in self._tree - - @property - def available(self): - """Set of available columns""" - return self._available - - @property - def columns(self): - """Set of available columns""" - return self._available - - @property - def materialized(self): - """Set of columns read from tree""" - return self._materialized - - @property - def size(self): - """Length of column vector""" - return self._branchargs["entry_stop"] - self._branchargs["entry_start"] - - @property - def metadata(self): - return self._metadata - - def preload(self, columns): - """Force loading of several columns - - Parameters - ---------- - columns : iterable - A list of columns to load - """ - for name in columns: - if name in self._tree: - _ = self[name] diff --git 
a/src/coffea/processor/executor.py b/src/coffea/processor/executor.py deleted file mode 100644 index d61735078..000000000 --- a/src/coffea/processor/executor.py +++ /dev/null @@ -1,2220 +0,0 @@ -import concurrent.futures -import json -import math -import os -import pickle -import shutil -import sys -import time -import traceback -import uuid -import warnings -from collections import defaultdict -from collections.abc import Mapping, MutableMapping -from contextlib import ExitStack -from dataclasses import asdict, dataclass, field -from functools import partial -from io import BytesIO -from itertools import repeat -from typing import ( - Awaitable, - Callable, - Dict, - Generator, - Iterable, - List, - Optional, - Set, - Tuple, - Union, -) - -import cloudpickle -import lz4.frame as lz4f -import toml -import uproot -from cachetools import LRUCache - -from ..nanoevents import NanoEventsFactory, schemas -from ..util import _exception_chain, _hash, rich_bar -from .accumulator import Accumulatable, accumulate, set_accumulator -from .dataframe import LazyDataFrame -from .processor import ProcessorABC - -try: - from typing import Literal -except ImportError: - from typing_extensions import Literal - - -try: - from functools import cached_property -except ImportError: - cached_property = property - - -_PICKLE_PROTOCOL = pickle.HIGHEST_PROTOCOL -DEFAULT_METADATA_CACHE: MutableMapping = LRUCache(100000) - -_PROTECTED_NAMES = { - "dataset", - "filename", - "treename", - "metadata", - "entrystart", - "entrystop", - "fileuuid", - "numentries", - "uuid", - "clusters", -} - - -class UprootMissTreeError(uproot.exceptions.KeyInFileError): - pass - - -class FileMeta: - __slots__ = ["dataset", "filename", "treename", "metadata"] - - def __init__(self, dataset, filename, treename, metadata=None): - self.dataset = dataset - self.filename = filename - self.treename = treename - self.metadata = metadata - - def __str__(self): - return f"FileMeta({self.filename}:{self.treename})" - - def __hash__(self): - # As used to lookup metadata, no need for dataset - return _hash((self.filename, self.treename)) - - def __eq__(self, other): - # In case of hash collisions - return self.filename == other.filename and self.treename == other.treename - - def maybe_populate(self, cache): - if cache and self in cache: - self.metadata = cache[self] - - def populated(self, clusters=False): - """Return true if metadata is populated - - By default, only require bare minimum metadata (numentries, uuid) - If clusters is True, then require cluster metadata to be populated - """ - if self.metadata is None: - return False - elif "numentries" not in self.metadata or "uuid" not in self.metadata: - return False - elif clusters and "clusters" not in self.metadata: - return False - return True - - def chunks(self, target_chunksize, align_clusters): - if not self.populated(clusters=align_clusters): - raise RuntimeError - user_keys = set(self.metadata.keys()) - _PROTECTED_NAMES - user_meta = {k: self.metadata[k] for k in user_keys} - if align_clusters: - chunks = [0] - for c in self.metadata["clusters"]: - if c >= chunks[-1] + target_chunksize: - chunks.append(c) - if self.metadata["clusters"][-1] != chunks[-1]: - chunks.append(self.metadata["clusters"][-1]) - for start, stop in zip(chunks[:-1], chunks[1:]): - yield WorkItem( - self.dataset, - self.filename, - self.treename, - start, - stop, - self.metadata["uuid"], - user_meta, - ) - return target_chunksize - else: - numentries = self.metadata["numentries"] - update = True - start = 0 - while 
start < numentries: - if update: - n = max(round((numentries - start) / target_chunksize), 1) - actual_chunksize = math.ceil((numentries - start) / n) - stop = min(numentries, start + actual_chunksize) - next_chunksize = yield WorkItem( - self.dataset, - self.filename, - self.treename, - start, - stop, - self.metadata["uuid"], - user_meta, - ) - start = stop - if next_chunksize and next_chunksize != target_chunksize: - target_chunksize = next_chunksize - update = True - else: - update = False - return target_chunksize - - -@dataclass(unsafe_hash=True, frozen=True) -class WorkItem: - dataset: str - filename: str - treename: str - entrystart: int - entrystop: int - fileuuid: str - usermeta: Optional[Dict] = field(default=None, compare=False) - - def __len__(self) -> int: - return self.entrystop - self.entrystart - - -def _compress(item, compression): - if item is None or compression is None: - return item - else: - with BytesIO() as bf: - with lz4f.open(bf, mode="wb", compression_level=compression) as f: - pickle.dump(item, f, protocol=_PICKLE_PROTOCOL) - result = bf.getvalue() - return result - - -def _decompress(item): - if isinstance(item, bytes): - # warning: if item is not exactly of type bytes, BytesIO(item) will - # make a copy of it, increasing the memory usage. - with BytesIO(item) as bf: - with lz4f.open(bf, mode="rb") as f: - return pickle.load(f) - else: - return item - - -class _compression_wrapper: - def __init__(self, level, function, name=None): - self.level = level - self.function = function - self.name = name - - def __str__(self): - if self.name is not None: - return self.name - try: - name = self.function.__name__ - if name == "": - return "lambda" - return name - except AttributeError: - return str(self.function) - - # no @wraps due to pickle - def __call__(self, *args, **kwargs): - out = self.function(*args, **kwargs) - return _compress(out, self.level) - - -class _reduce: - def __init__(self, compression): - self.compression = compression - - def __str__(self): - return "reduce" - - def __call__(self, items): - items = list(it for it in items if it is not None) - if len(items) == 0: - raise ValueError("Empty list provided to reduction") - if self.compression is not None: - out = _decompress(items.pop()) - out = accumulate(map(_decompress, items), out) - return _compress(out, self.compression) - return accumulate(items) - - -class _FuturesHolder: - def __init__(self, futures: Set[Awaitable], refresh=2): - self.futures = set(futures) - self.merges = set() - self.completed = set() - self.done = {"futures": 0, "merges": 0} - self.running = len(self.futures) - self.refresh = refresh - - def update(self, refresh: int = None): - if refresh is None: - refresh = self.refresh - if self.futures: - completed, self.futures = concurrent.futures.wait( - self.futures, - timeout=refresh, - return_when=concurrent.futures.FIRST_COMPLETED, - ) - self.completed.update(completed) - self.done["futures"] += len(completed) - - if self.merges: - completed, self.merges = concurrent.futures.wait( - self.merges, - timeout=refresh, - return_when=concurrent.futures.FIRST_COMPLETED, - ) - self.completed.update(completed) - self.done["merges"] += len(completed) - self.running = len(self.futures) + len(self.merges) - - def add_merge(self, merges: Awaitable[Accumulatable]): - self.merges.add(merges) - self.running = len(self.futures) + len(self.merges) - - def fetch(self, N: int) -> List[Accumulatable]: - _completed = [self.completed.pop() for _ in range(min(N, len(self.completed)))] - if 
all(_good_future(future) for future in _completed): - return [future.result() for future in _completed if _good_future(future)] - else: # Make recoverable - good_futures = [future for future in _completed if _good_future(future)] - bad_futures = [future for future in _completed if not _good_future(future)] - self.completed.update(good_futures) - raise bad_futures[0].exception() - - -def _good_future(future: Awaitable) -> bool: - return future.done() and not future.cancelled() and future.exception() is None - - -def _futures_handler(futures, timeout): - """Essentially the same as concurrent.futures.as_completed - but makes sure not to hold references to futures any longer than strictly necessary, - which is important if the future holds a large result. - """ - futures = set(futures) - try: - while futures: - try: - done, futures = concurrent.futures.wait( - futures, - timeout=timeout, - return_when=concurrent.futures.FIRST_COMPLETED, - ) - if len(done) == 0: - warnings.warn( - f"No finished jobs after {timeout}s, stopping remaining {len(futures)} jobs early" - ) - break - while done: - try: - yield done.pop().result() - except concurrent.futures.CancelledError: - pass - except KeyboardInterrupt as e: - for job in futures: - try: - job.cancel() - # this is not implemented with parsl AppFutures - except NotImplementedError: - raise e from None - running = sum(job.running() for job in futures) - warnings.warn( - f"Early stop: cancelled {len(futures) - running} jobs, will wait for {running} running jobs to complete" - ) - finally: - running = sum(job.running() for job in futures) - if running: - warnings.warn( - f"Cancelling {running} running jobs (likely due to an exception)" - ) - try: - while futures: - futures.pop().cancel() - except NotImplementedError: - pass - - -@dataclass -class ExecutorBase: - # shared by all executors - status: bool = True - unit: str = "items" - desc: str = "Processing" - compression: Optional[int] = 1 - function_name: Optional[str] = None - - def __call__( - self, - items: Iterable, - function: Callable, - accumulator: Accumulatable, - ): - raise NotImplementedError( - "This class serves as a base class for executors, do not instantiate it!" - ) - - def copy(self, **kwargs): - tmp = self.__dict__.copy() - tmp.update(kwargs) - return type(self)(**tmp) - - -@dataclass -class DaskExecutorBase(ExecutorBase): - """This base class for dak-based processors - synthesizes all analysis inputs into one - task graph that's then executed by derived - classes. 
- """ - - def prepare_dataset_graph(self, items, function, accumulator): - accumulator = None - for dset, info in items.items(): - if isinstance(items, dict) and "object_path" not in list(items.values()): - raise ValueError( - "items should be normalized to uproot spec in prepare_dataset_graph" - ) - - metadata = info["metadata"].copy() - metadata["dataset"] = dset - - temp = function(info["files"], metadata=metadata) - if accumulator is None: - accumulator = temp - else: - accumulator = accumulate((accumulator, temp)) - - return accumulator - - -@dataclass -class DaskSyncExecutor(DaskExecutorBase): - """Execute dask task graph in one thread - - Parameters - ---------- - items : list - List of input arguments - function : callable - A function to be called on each input, which returns an accumulator instance - accumulator : Accumulatable - An accumulator to collect the output of the function - status : bool - If true (default), enable progress bar - unit : str - Label of progress bar unit - desc : str - Label of progress bar description - compression : int, optional - Ignored for iterative executor - """ - - def __call__( - self, - items: Iterable, - function: Callable, - accumulator: Accumulatable, - ): - import dask - - to_compute = self.prepare_dataset_graph(items, function, None) - computed = dask.compute(to_compute, scheduler="sync") - return computed[0] if len(computed) == 1 else computed - - -@dataclass -class DaskProcessesExecutor(DaskExecutorBase): - """Execute dask task graph in a multiprocessing pool - - Parameters - ---------- - items : list - List of input arguments - function : callable - A function to be called on each input, which returns an accumulator instance - accumulator : Accumulatable - An accumulator to collect the output of the function - status : bool - If true (default), enable progress bar - unit : str - Label of progress bar unit - desc : str - Label of progress bar description - compression : int, optional - Ignored for iterative executor - """ - - workers = 1 - - def __call__( - self, - items: Iterable, - function: Callable, - accumulator: Accumulatable, - ): - import dask - - to_compute = self.prepare_dataset_graph(items, function, None) - with dask.config.set(num_workers=self.workers): - computed = dask.compute(to_compute, scheduler="processes") - return computed[0] if len(computed) == 1 else computed - - -def _watcher( - FH: _FuturesHolder, - executor: ExecutorBase, - merge_fcn: Callable, - pool: Optional[Callable] = None, -) -> Accumulatable: - with rich_bar() as progress: - p_id = progress.add_task(executor.desc, total=FH.running, unit=executor.unit) - desc_m = "Merging" if executor.merging else "Merging (local)" - p_idm = progress.add_task(desc_m, total=0, unit="merges") - - merged = None - while FH.running > 0: - FH.update() - progress.update(p_id, completed=FH.done["futures"], refresh=True) - - if executor.merging: # Merge jobs - merge_size = executor._merge_size(len(FH.completed)) - progress.update(p_idm, completed=FH.done["merges"]) - while len(FH.completed) > 1: - if FH.running > 0 and len(FH.completed) < executor.merging[1]: - break - batch = FH.fetch(merge_size) - # Add debug for batch mem size? TODO with logging? 
- if isinstance(executor, FuturesExecutor) and pool is not None: - FH.add_merge(pool.submit(merge_fcn, batch)) - elif isinstance(executor, ParslExecutor): - FH.add_merge(merge_fcn(batch)) - else: - raise RuntimeError("Invalid executor") - progress.update( - p_idm, - total=progress._tasks[p_idm].total + 1, - refresh=True, - ) - else: # Merge within process - batch = FH.fetch(len(FH.completed)) - merged = _compress( - accumulate( - progress.track( - map(_decompress, (c for c in batch)), - task_id=p_idm, - total=progress._tasks[p_idm].total + len(batch), - ), - _decompress(merged), - ), - executor.compression, - ) - # Add checkpointing - - if executor.merging: - progress.refresh() - merged = FH.completed.pop().result() - if len(FH.completed) > 0 or len(FH.futures) > 0 or len(FH.merges) > 0: - raise RuntimeError("Not all futures are added.") - return merged - - -def _wait_for_merges(FH: _FuturesHolder, executor: ExecutorBase) -> Accumulatable: - with rich_bar() as progress: - if executor.merging: - to_finish = len(FH.merges) - p_id_w = progress.add_task( - "Waiting for merge jobs", - total=to_finish, - unit=executor.unit, - ) - while len(FH.merges) > 0: - FH.update() - progress.update( - p_id_w, - completed=(to_finish - len(FH.merges)), - refresh=True, - ) - - FH.update() - recovered = [future.result() for future in FH.completed if _good_future(future)] - p_id_m = progress.add_task("Merging finished jobs", unit="merges") - return _compress( - accumulate( - progress.track( - map(_decompress, (c for c in recovered)), - task_id=p_id_m, - total=len(recovered), - ) - ), - executor.compression, - ) - - -@dataclass -class WorkQueueExecutor(ExecutorBase): - """Execute using Work Queue - - For more information, see :ref:`intro-coffea-wq` - - Parameters - ---------- - items : sequence or generator - Sequence of input arguments - function : callable - A function to be called on each input, which returns an accumulator instance - accumulator : Accumulatable - An accumulator to collect the output of the function - status : bool - If true (default), enable progress bar - unit : str - Label of progress bar unit - desc : str - Label of progress bar description - compression : int, optional - Compress accumulator outputs in flight with LZ4, at level specified (default 9) - `None`` sets level to 1 (minimal compression) - # work queue specific options: - cores : int - Maximum number of cores for work queue task. If unset, use a whole worker. - memory : int - Maximum amount of memory (in MB) for work queue task. If unset, use a whole worker. - disk : int - Maximum amount of disk space (in MB) for work queue task. If unset, use a whole worker. - gpus : int - Number of GPUs to allocate to each task. If unset, use zero. - resource_monitor : str - If given, one of 'off', 'measure', or 'watchdog'. Default is 'off'. - - 'off': turns off resource monitoring. Overridden to 'watchdog' if resources_mode - is not set to 'fixed'. - - 'measure': turns on resource monitoring for Work Queue. The - resources used per task are measured. - - 'watchdog': in addition to measuring resources, tasks are terminated if they - go above the cores, memory, or disk specified. - resources_mode : str - one of 'fixed', 'max-seen', or 'max-throughput'. Default is 'max-seen'. - Sets the strategy to automatically allocate resources to tasks. - - 'fixed': allocate cores, memory, and disk specified for each task. 
- - 'max-seen' or 'auto': use the cores, memory, and disk given as maximum values to allocate, - but first try each task by allocating the maximum values seen. Leads - to a good compromise between parallelism and number of retries. - - 'max-throughput': Like max-seen, but first tries the task with an - allocation that maximizes overall throughput. - If resources_mode is other than 'fixed', preprocessing and - accumulation tasks always use the 'max-seen' strategy, as the - former tasks always use the same resources, the latter has a - distribution of resources that increases over time. - split_on_exhaustion: bool - Whether to split a processing task in half according to its chunksize when it exhausts its - the cores, memory, or disk allocated to it. If False, a task that exhausts resources - permanently fails. Default is True. - fast_terminate_workers: int - Terminate workers on which tasks have been running longer than average. - The time limit is computed by multiplying the average runtime of tasks - by the value of 'fast_terminate_workers'. Since there are - legitimately slow tasks, no task may trigger fast termination in - two distinct workers. Less than 1 disables it. - - manager_name : str - Name to refer to this work queue manager. - Sets port to 0 (any available port) if port not given. - port : int or tuple(int, int) - Port number or range (inclusive of ports )for work queue manager program. - Defaults to 9123 if manager_name not given. - password_file: str - Location of a file containing a password used to authenticate workers. - ssl: bool or tuple(str, str) - Enable ssl encryption between manager and workers. If a tuple, then it - should be of the form (key, cert), where key and cert are paths to the files - containing the key and certificate in pem format. If True, auto-signed temporary - key and cert are generated for the session. - - extra_input_files: list - A list of files in the current working directory to send along with each task. - Useful for small custom libraries and configuration files needed by the processor. - x509_proxy : str - Path to the X509 user proxy. If None (the default), use the value of the - environment variable X509_USER_PROXY, or fallback to the file /tmp/x509up_u${UID} if - exists. If False, disables the default behavior and no proxy is sent. - - environment_file : optional, str - Conda python environment tarball to use. If not given, assume that - the python environment is already setup at the execution site. - wrapper : str - Wrapper script to run/open python environment tarball. Defaults to python_package_run found in PATH. - - treereduction : int - Number of processed chunks per accumulation task. Defaults is 20. - - verbose : bool - If true, emit a message on each task submission and completion. - Default is false. - print_stdout : bool - If true (default), print the standard output of work queue task on completion. - - debug_log : str - Filename for debug output - stats_log : str - Filename for tasks statistics output - transactions_log : str - Filename for tasks lifetime reports output - tasks_accum_log : str - Filename for the log of tasks that have been processed and accumulated. - - filepath: str - Path to the parent directory where to create the staging directory. - Default is "." (current working directory). - - custom_init : function, optional - A function that takes as an argument the queue's WorkQueue object. - The function is called just before the first work unit is submitted - to the queue. 
- """ - - # Standard executor options: - compression: Optional[int] = 9 # as recommended by lz4 - retries: int = 2 # task executes at most 3 times - # wq executor options: - manager_name: Optional[str] = None - port: Optional[Union[int, Tuple[int, int]]] = None - filepath: str = "." - events_total: Optional[int] = None - x509_proxy: Optional[str] = None - verbose: bool = False - print_stdout: bool = False - status_display_interval: Optional[int] = 10 - debug_log: Optional[str] = None - stats_log: Optional[str] = None - transactions_log: Optional[str] = None - tasks_accum_log: Optional[str] = None - password_file: Optional[str] = None - ssl: Union[bool, Tuple[str, str]] = False - environment_file: Optional[str] = None - extra_input_files: List = field(default_factory=list) - wrapper: Optional[str] = shutil.which("poncho_package_run") - resource_monitor: Optional[str] = "off" - resources_mode: Optional[str] = "max-seen" - split_on_exhaustion: Optional[bool] = True - fast_terminate_workers: Optional[int] = None - cores: Optional[int] = None - memory: Optional[int] = None - disk: Optional[int] = None - gpus: Optional[int] = None - treereduction: int = 20 - chunksize: int = 100000 - dynamic_chunksize: Optional[Dict] = None - custom_init: Optional[Callable] = None - - # deprecated - bar_format: Optional[str] = None - chunks_accum_in_mem: Optional[int] = None - master_name: Optional[str] = None - chunks_per_accum: Optional[int] = None - - def __call__( - self, - items: Iterable, - function: Callable, - accumulator: Accumulatable, - ): - from .work_queue_tools import run - - return ( - run( - self, - items, - function, - accumulator, - ), - 0, - ) - - -@dataclass -class IterativeExecutor(ExecutorBase): - """Execute in one thread iteratively - - Parameters - ---------- - items : list - List of input arguments - function : callable - A function to be called on each input, which returns an accumulator instance - accumulator : Accumulatable - An accumulator to collect the output of the function - status : bool - If true (default), enable progress bar - unit : str - Label of progress bar unit - desc : str - Label of progress bar description - compression : int, optional - Ignored for iterative executor - """ - - workers: int = 1 - - def __call__( - self, - items: Iterable, - function: Callable, - accumulator: Accumulatable, - ): - if len(items) == 0: - return accumulator - with rich_bar() as progress: - p_id = progress.add_task( - self.desc, total=len(items), unit=self.unit, disable=not self.status - ) - return ( - accumulate( - progress.track( - map(function, (c for c in items)), - total=len(items), - task_id=p_id, - ), - accumulator, - ), - 0, - ) - - -@dataclass -class FuturesExecutor(ExecutorBase): - """Execute using multiple local cores using python futures - - Parameters - ---------- - items : list - List of input arguments - function : callable - A function to be called on each input, which returns an accumulator instance - accumulator : Accumulatable - An accumulator to collect the output of the function - pool : concurrent.futures.Executor class or instance, optional - The type of futures executor to use, defaults to ProcessPoolExecutor. 
- You can pass an instance instead of a class to reuse an executor - workers : int, optional - Number of parallel processes for futures (default 1) - status : bool, optional - If true (default), enable progress bar - desc : str, optional - Label of progress description (default: 'Processing') - unit : str, optional - Label of progress bar bar unit (default: 'items') - compression : int, optional - Compress accumulator outputs in flight with LZ4, at level specified (default 1) - Set to ``None`` for no compression. - recoverable : bool, optional - Instead of raising Exception right away, the exception is captured and returned - up for custom parsing. Already completed items will be returned as well. - checkpoints : bool - To do - merging : bool | tuple(int, int, int), optional - Enables submitting intermediate merge jobs to the executor. Format is - (n_batches, min_batch_size, max_batch_size). Passing ``True`` will use default: (5, 4, 100), - aka as they are returned try to split completed jobs into 5 batches, but of at least 4 and at most 100 items. - Default is ``False`` - results get merged as they finish in the main process. - nparts : int, optional - Number of merge jobs to create at a time. Also pass via ``merging(X, ..., ...)'' - minred : int, optional - Minimum number of items to merge in one job. Also pass via ``merging(..., X, ...)'' - maxred : int, optional - maximum number of items to merge in one job. Also pass via ``merging(..., ..., X)'' - mergepool : concurrent.futures.Executor class or instance | int, optional - Supply an additional executor to process merge jobs independently. - An ``int`` will be interpreted as ``ProcessPoolExecutor(max_workers=int)``. - tailtimeout : int, optional - Timeout requirement on job tails. Cancel all remaining jobs if none have finished - in the timeout window. - """ - - pool: Union[ - Callable[..., concurrent.futures.Executor], concurrent.futures.Executor - ] = concurrent.futures.ProcessPoolExecutor # fmt: skip - mergepool: Optional[ - Union[ - Callable[..., concurrent.futures.Executor], - concurrent.futures.Executor, - bool, - ] - ] = None - recoverable: bool = False - merging: Union[bool, Tuple[int, int, int]] = False - workers: int = 1 - tailtimeout: Optional[int] = None - - def __post_init__(self): - if not ( - isinstance(self.merging, bool) - or (isinstance(self.merging, tuple) and len(self.merging) == 3) - ): - raise ValueError( - f"merging={self.merging} not understood. 
Required format is " - "(n_batches, min_batch_size, max_batch_size)" - ) - elif self.merging is True: - self.merging = (5, 4, 100) - - def _merge_size(self, size: int): - return min(self.merging[2], max(size // self.merging[0] + 1, self.merging[1])) - - def __getstate__(self): - return dict(self.__dict__, pool=None) - - def __call__( - self, - items: Iterable, - function: Callable, - accumulator: Accumulatable, - ): - if len(items) == 0: - return accumulator - if self.compression is not None: - function = _compression_wrapper(self.compression, function) - reducer = _reduce(self.compression) - - def _processwith(pool, mergepool): - FH = _FuturesHolder( - {pool.submit(function, item) for item in items}, refresh=2 - ) - - try: - if mergepool is None: - merged = _watcher(FH, self, reducer, pool) - else: - merged = _watcher(FH, self, reducer, mergepool) - return accumulate([_decompress(merged), accumulator]), 0 - - except Exception as e: - traceback.print_exc() - if self.recoverable: - print("Exception occurred, recovering progress...") - for job in FH.futures: - job.cancel() - - merged = _wait_for_merges(FH, self) - return accumulate([_decompress(merged), accumulator]), e - else: - raise e from None - - if isinstance(self.pool, concurrent.futures.Executor): - return _processwith(pool=self.pool, mergepool=self.mergepool) - else: - # assume its a class then - with ExitStack() as stack: - poolinstance = stack.enter_context(self.pool(max_workers=self.workers)) - if self.mergepool is not None: - if isinstance(self.mergepool, int): - self.mergepool = concurrent.futures.ProcessPoolExecutor( - max_workers=self.mergepool - ) - mergepoolinstance = stack.enter_context(self.mergepool) - else: - mergepoolinstance = None - return _processwith(pool=poolinstance, mergepool=mergepoolinstance) - - -@dataclass -class DaskExecutor(ExecutorBase): - """Execute using dask futures - - Parameters - ---------- - items : list - List of input arguments - function : callable - A function to be called on each input, which returns an accumulator instance - accumulator : Accumulatable - An accumulator to collect the output of the function - client : distributed.client.Client - A dask distributed client instance - treereduction : int, optional - Tree reduction factor for output accumulators (default: 20) - status : bool, optional - If true (default), enable progress bar - compression : int, optional - Compress accumulator outputs in flight with LZ4, at level specified (default 1) - Set to ``None`` for no compression. - priority : int, optional - Task priority, default 0 - retries : int, optional - Number of retries for failed tasks (default: 3) - heavy_input : serializable, optional - Any value placed here will be broadcast to workers and joined to input - items in a tuple (item, heavy_input) that is passed to function. - function_name : str, optional - Name of the function being passed - use_dataframes: bool, optional - Retrieve output as a distributed Dask DataFrame (default: False). - The outputs of individual tasks must be Pandas DataFrames. - - .. note:: If ``heavy_input`` is set, ``function`` is assumed to be pure. 
- """ - - client: Optional["dask.distributed.Client"] = None # noqa - treereduction: int = 20 - priority: int = 0 - retries: int = 3 - heavy_input: Optional[bytes] = None - use_dataframes: bool = False - # secret options - worker_affinity: bool = False - - def __getstate__(self): - return dict(self.__dict__, client=None) - - def __call__( - self, - items: Iterable, - function: Callable, - accumulator: Accumulatable, - ): - if len(items) == 0: - return accumulator - - import dask.dataframe as dd - from dask.distributed import Client - from distributed.scheduler import KilledWorker - - if self.client is None: - self.client = Client(threads_per_worker=1) - - if self.use_dataframes: - self.compression = None - - reducer = _reduce(self.compression) - if self.compression is not None: - function = _compression_wrapper( - self.compression, function, name=self.function_name - ) - - if self.heavy_input is not None: - # client.scatter is not robust against adaptive clusters - # https://github.com/CoffeaTeam/coffea/issues/465 - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "Large object of size") - items = list( - zip( - items, repeat(self.client.submit(lambda x: x, self.heavy_input)) - ) - ) - - work = [] - key_to_item = {} - if self.worker_affinity: - workers = list(self.client.run(lambda: 0)) - - def belongsto(heavy_input, workerindex, item): - if heavy_input is not None: - item = item[0] - hashed = _hash( - (item.fileuuid, item.treename, item.entrystart, item.entrystop) - ) - return hashed % len(workers) == workerindex - - for workerindex, worker in enumerate(workers): - items_worker = [ - item - for item in items - if belongsto(self.heavy_input, workerindex, item) - ] - work_worker = self.client.map( - function, - items_worker, - pure=(self.heavy_input is not None), - priority=self.priority, - retries=self.retries, - workers={worker}, - allow_other_workers=False, - ) - work.extend(work_worker) - key_to_item.update( - { - future.key: item - for future, item in zip(work_worker, items_worker) - } - ) - else: - work = self.client.map( - function, - items, - pure=(self.heavy_input is not None), - priority=self.priority, - retries=self.retries, - ) - key_to_item.update({future.key: item for future, item in zip(work, items)}) - if (self.function_name == "get_metadata") or not self.use_dataframes: - while len(work) > 1: - work = self.client.map( - reducer, - [ - work[i : i + self.treereduction] - for i in range(0, len(work), self.treereduction) - ], - pure=True, - priority=self.priority, - retries=self.retries, - ) - key_to_item.update({future.key: "(output reducer)" for future in work}) - work = work[0] - try: - if self.status: - from distributed import progress - - # FIXME: fancy widget doesn't appear, have to live with boring pbar - progress(work, multi=True, notebook=False) - return ( - accumulate( - [ - work.result() - if self.compression is None - else _decompress(work.result()) - ], - accumulator, - ), - 0, - ) - except KilledWorker as ex: - baditem = key_to_item[ex.task] - if self.heavy_input is not None and isinstance(baditem, tuple): - baditem = baditem[0] - raise RuntimeError( - f"Work item {baditem} caused a KilledWorker exception (likely a segfault or out-of-memory issue)" - ) - else: - if self.status: - from distributed import progress - - progress(work, multi=True, notebook=False) - return {"out": dd.from_delayed(work)}, 0 - - -@dataclass -class ParslExecutor(ExecutorBase): - """Execute using parsl pyapp wrapper - - Parameters - ---------- - items : list - List of 
input arguments - function : callable - A function to be called on each input, which returns an accumulator instance - accumulator : Accumulatable - An accumulator to collect the output of the function - config : parsl.config.Config, optional - A parsl DataFlow configuration object. Necessary if there is no active kernel - - .. note:: In general, it is safer to construct the DFK with ``parsl.load(config)`` prior to calling this function - status : bool - If true (default), enable progress bar - unit : str - Label of progress bar unit - desc : str - Label of progress bar description - compression : int, optional - Compress accumulator outputs in flight with LZ4, at level specified (default 1) - Set to ``None`` for no compression. - recoverable : bool, optional - Instead of raising Exception right away, the exception is captured and returned - up for custom parsing. Already completed items will be returned as well. - merging : bool | tuple(int, int, int), optional - Enables submitting intermediate merge jobs to the executor. Format is - (n_batches, min_batch_size, max_batch_size). Passing ``True`` will use default: (5, 4, 100), - aka as they are returned try to split completed jobs into 5 batches, but of at least 4 and at most 100 items. - Default is ``False`` - results get merged as they finish in the main process. - jobs_executors : list | "all" optional - Labels of the executors (from dfk.config.executors) that will process main jobs. - Default is 'all'. Recommended is ``['jobs']``, while passing ``label='jobs'`` to the primary executor. - merges_executors : list | "all" optional - Labels of the executors (from dfk.config.executors) that will process main jobs. - Default is 'all'. Recommended is ``['merges']``, while passing ``label='merges'`` to the executor dedicated towards merge jobs. - tailtimeout : int, optional - Timeout requirement on job tails. Cancel all remaining jobs if none have finished - in the timeout window. - """ - - tailtimeout: Optional[int] = None - config: Optional["parsl.config.Config"] = None # noqa - recoverable: bool = False - merging: Optional[Union[bool, Tuple[int, int, int]]] = False - jobs_executors: Union[str, List] = "all" - merges_executors: Union[str, List] = "all" - - def __post_init__(self): - if not ( - isinstance(self.merging, bool) - or (isinstance(self.merging, tuple) and len(self.merging) == 3) - ): - raise ValueError( - f"merging={self.merging} not understood. 
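[editor's note] The ``jobs_executors``/``merges_executors`` options above assume the parsl config defines executors with matching labels. A hedged sketch of that setup, following the docstring's recommendation to load the DataFlowKernel first (the HighThroughputExecutor settings are placeholders):

import parsl
from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from coffea.processor import ParslExecutor

config = Config(
    executors=[
        HighThroughputExecutor(label="jobs", max_workers=8),
        HighThroughputExecutor(label="merges", max_workers=2),
    ]
)
parsl.load(config)  # construct the DFK before invoking the executor

executor = ParslExecutor(jobs_executors=["jobs"], merges_executors=["merges"])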
Required format is " - "(n_batches, min_batch_size, max_batch_size)" - ) - elif self.merging is True: - self.merging = (5, 4, 100) - - def _merge_size(self, size: int): - return min(self.merging[2], max(size // self.merging[0] + 1, self.merging[1])) - - def __call__( - self, - items: Iterable, - function: Callable, - accumulator: Accumulatable, - ): - if len(items) == 0: - return accumulator - import parsl - from parsl.app.app import python_app - - from .parsl.timeout import timeout - - if self.compression is not None: - function = _compression_wrapper(self.compression, function) - - # Parse config if passed - cleanup = False - try: - parsl.dfk() - except RuntimeError: - cleanup = True - pass - if cleanup and self.config is None: - raise RuntimeError( - "No active parsl DataFlowKernel, must specify a config to construct one" - ) - elif not cleanup and self.config is not None: - raise RuntimeError("An active parsl DataFlowKernel already exists") - elif self.config is not None: - parsl.clear() - parsl.load(self.config) - - # Check config/executors - _exec_avail = [exe.label for exe in parsl.dfk().config.executors] - _execs_tried = ( - [] if self.jobs_executors == "all" else [e for e in self.jobs_executors] - ) - _execs_tried += ( - [] if self.merges_executors == "all" else [e for e in self.merges_executors] - ) - if not all([_e in _exec_avail for _e in _execs_tried]): - raise RuntimeError( - f"Executors: [{','.join(_e for _e in _execs_tried if _e not in _exec_avail)}] not available in the config." - ) - - # Apps - app = timeout(python_app(function, executors=self.jobs_executors)) - reducer = timeout( - python_app(_reduce(self.compression), executors=self.merges_executors) - ) - - FH = _FuturesHolder(set(map(app, items)), refresh=2) - try: - merged = _watcher(FH, self, reducer) - return accumulate([_decompress(merged), accumulator]), 0 - - except Exception as e: - traceback.print_exc() - if self.recoverable: - print("Exception occurred, recovering progress...") - # for job in FH.futures: # NotImplemented in parsl - # job.cancel() - - merged = _wait_for_merges(FH, self) - return accumulate([_decompress(merged), accumulator]), e - else: - raise e from None - finally: - if cleanup: - parsl.dfk().cleanup() - parsl.clear() - - -class ParquetFileUprootShim: - def __init__(self, table, name): - self.table = table - self.name = name - - def array(self, **kwargs): - import awkward - - return awkward.Array(self.table[self.name]) - - -class ParquetFileContext: - def __init__(self, filename): - self.filename = filename - self.filehandle = None - self.branchnames = None - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, exc_traceback): - pass - - def _get_handle(self): - import pyarrow.parquet as pq - - if self.filehandle is None: - self.filehandle = pq.ParquetFile(self.filename) - self.branchnames = { - item.path.split(".")[0] for item in self.filehandle.schema - } - - @property - def num_entries(self): - self._get_handle() - return self.filehandle.metadata.num_rows - - def keys(self): - self._get_handle() - return self.branchnames - - def __iter__(self): - self._get_handle() - return iter(self.branchnames) - - def __getitem__(self, name): - self._get_handle() - if name in self.branchnames: - return ParquetFileUprootShim( - self.filehandle.read([name], use_threads=False), name - ) - else: - return KeyError(name) - - def __contains__(self, name): - self._get_handle() - return name in self.branchnames - - -@dataclass -class Runner: - """A tool to run a processor using uproot 
for data delivery - - A convenience wrapper to submit jobs for a file set, which is a - dictionary of dataset: [file list] entries. Supports only uproot TTree - reading, via NanoEvents or LazyDataFrame. For more customized processing, - e.g. to read other objects from the files and pass them into data frames, - one can write a similar function in their user code. - - Parameters - ---------- - executor : ExecutorBase instance - Executor, which implements a callable with inputs: items, function, accumulator - and performs some action equivalent to: - ``for item in items: accumulator += function(item)`` - pre_executor : ExecutorBase instance - Executor, used to calculate fileset metadata - Defaults to executor - chunksize : int, optional - Maximum number of entries to process at a time in the data frame, default: 100k - maxchunks : int, optional - Maximum number of chunks to process per dataset - Defaults to processing the whole dataset - metadata_cache : mapping, optional - A dict-like object to use as a cache for (file, tree) metadata that is used to - determine chunking. Defaults to a in-memory LRU cache that holds 100k entries - (about 1MB depending on the length of filenames, etc.) If you edit an input file - (please don't) during a session, the session can be restarted to clear the cache. - dynamic_chunksize : dict, optional - Whether to adapt the chunksize for units of work to run in the targets given. - Currently supported are 'wall_time' (in seconds), and 'memory' (in MB). - E.g., with {"wall_time": 120, "memory": 2048}, the chunksize will - be dynamically adapted so that processing jobs each run in about - two minutes, using two GB of memory. (Currently only for the WorkQueueExecutor.) - """ - - executor: ExecutorBase - pre_executor: Optional[ExecutorBase] = None - chunksize: int = 100000 - maxchunks: Optional[int] = None - metadata_cache: Optional[MutableMapping] = None - dynamic_chunksize: Optional[Dict] = None - skipbadfiles: bool = False - xrootdtimeout: Optional[int] = 60 - align_clusters: bool = False - savemetrics: bool = False - mmap: bool = False - schema: Optional[schemas.BaseSchema] = schemas.BaseSchema - cachestrategy: Optional[ - Union[Literal["dask-worker"], Callable[..., MutableMapping]] - ] = None # fmt: skip - processor_compression: int = 1 - use_skyhook: Optional[bool] = False - skyhook_options: Optional[Dict] = field(default_factory=dict) - format: str = "root" - - @staticmethod - def read_coffea_config(): - config_path = None - if "HOME" in os.environ: - config_path = os.path.join(os.environ["HOME"], ".coffea.toml") - elif "_CONDOR_SCRATCH_DIR" in os.environ: - config_path = os.path.join( - os.environ["_CONDOR_SCRATCH_DIR"], ".coffea.toml" - ) - - if config_path is not None and os.path.exists(config_path): - with open(config_path) as f: - return toml.loads(f.read()) - else: - return dict() - - def __post_init__(self): - if self.pre_executor is None: - self.pre_executor = self.executor - - assert isinstance( - self.executor, ExecutorBase - ), "Expected executor to derive from ExecutorBase" - assert isinstance( - self.pre_executor, ExecutorBase - ), "Expected pre_executor to derive from ExecutorBase" - - if self.metadata_cache is None: - self.metadata_cache = DEFAULT_METADATA_CACHE - - if self.align_clusters and self.dynamic_chunksize: - raise RuntimeError( - "align_clusters and dynamic_chunksize cannot be used simultaneously" - ) - if self.maxchunks and self.dynamic_chunksize: - raise RuntimeError( - "maxchunks and dynamic_chunksize cannot be used simultaneously" 
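[editor's note] Putting the ``Runner`` fields above together, a minimal sketch of a typical configuration; ``MyProcessor`` and the file names are placeholders, not part of this patch:

from coffea import processor
from coffea.nanoevents import NanoAODSchema

run = processor.Runner(
    executor=processor.IterativeExecutor(),
    schema=NanoAODSchema,
    chunksize=100_000,
    skipbadfiles=True,
)

fileset = {"ZJets": ["zjets_nano.root"], "Data": ["data_nano.root"]}
out = run(fileset, treename="Events", processor_instance=MyProcessor())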
- ) - if self.dynamic_chunksize and not isinstance(self.executor, WorkQueueExecutor): - raise RuntimeError( - "dynamic_chunksize currently only supported by the WorkQueueExecutor" - ) - - assert self.format in ("root", "parquet") - - @property - def retries(self): - if isinstance(self.executor, DaskExecutor): - retries = 0 - else: - retries = getattr(self.executor, "retries", 0) - assert retries >= 0 - return retries - - @property - def use_dataframes(self): - if isinstance(self.executor, DaskExecutor): - return self.executor.use_dataframes - else: - return False - - @staticmethod - def get_cache(cachestrategy): - cache = None - if cachestrategy == "dask-worker": - from distributed import get_worker - - from coffea.processor.dask import ColumnCache - - worker = get_worker() - try: - cache = worker.plugins[ColumnCache.name] - except KeyError: - # emit warning if not found? - pass - elif callable(cachestrategy): - cache = cachestrategy() - return cache - - @staticmethod - def automatic_retries(retries: int, skipbadfiles: bool, func, *args, **kwargs): - """This should probably defined on Executor-level.""" - import warnings - - retry_count = 0 - while retry_count <= retries: - try: - return func(*args, **kwargs) - # catch xrootd errors and optionally skip - # or retry to read the file - except Exception as e: - chain = _exception_chain(e) - if skipbadfiles and any( - isinstance(c, (FileNotFoundError, UprootMissTreeError)) - for c in chain - ): - warnings.warn(str(e)) - break - if ( - skipbadfiles - and (retries == retry_count) - and any( - e in str(c) - for c in chain - for e in [ - "Invalid redirect URL", - "Operation expired", - "Socket timeout", - ] - ) - ): - warnings.warn(str(e)) - break - if ( - not skipbadfiles - or any("Auth failed" in str(c) for c in chain) - or retries == retry_count - ): - raise e - warnings.warn("Attempt %d of %d." % (retry_count + 1, retries + 1)) - retry_count += 1 - - @staticmethod - def _normalize_fileset( - fileset: Dict, - treename: str, - ) -> Generator[FileMeta, None, None]: - if isinstance(fileset, str): - with open(fileset) as fin: - fileset = json.load(fin) - elif not isinstance(fileset, Mapping): - raise ValueError("Expected fileset to be a path string or mapping") - reserved_metakeys = _PROTECTED_NAMES - for dataset, filelist in fileset.items(): - user_meta = None - if isinstance(filelist, dict): - user_meta = filelist["metadata"] if "metadata" in filelist else None - if user_meta is not None: - for rkey in reserved_metakeys: - if rkey in user_meta.keys(): - raise ValueError( - f'Reserved word "{rkey}" in metadata section of fileset dictionary, please rename this entry!' 
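[editor's note] ``_normalize_fileset`` accepts either a plain file list per dataset or a dict carrying ``files`` plus optional ``treename`` and ``metadata`` (whose keys must avoid the reserved names checked above). A hedged sketch of both forms with placeholder file names:

fileset = {
    # plain list: the tree name must then be supplied to the Runner call
    "Data": ["data_2018A.root", "data_2018B.root"],
    # dict form: per-dataset tree name and user metadata
    "ZJets": {
        "treename": "Events",
        "files": ["zjets_1.root", "zjets_2.root"],
        "metadata": {"xsec": 6077.22},  # keys must not collide with the protected names
    },
}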
- ) - if "treename" not in filelist and treename is None: - raise ValueError( - "treename must be specified if the fileset does not contain tree names" - ) - local_treename = ( - filelist["treename"] if "treename" in filelist else treename - ) - filelist = filelist["files"] - elif isinstance(filelist, list): - if treename is None: - raise ValueError( - "treename must be specified if the fileset does not contain tree names" - ) - local_treename = treename - else: - raise ValueError( - "list of filenames in fileset must be a list or a dict" - ) - for filename in filelist: - yield FileMeta(dataset, filename, local_treename, user_meta) - - @staticmethod - def metadata_fetcher_root( - xrootdtimeout: int, align_clusters: bool, item: FileMeta - ) -> Accumulatable: - with uproot.open({item.filename: None}, timeout=xrootdtimeout) as file: - try: - tree = file[item.treename] - except uproot.exceptions.KeyInFileError as e: - raise UprootMissTreeError(str(e)) from e - - metadata = {} - if item.metadata: - metadata.update(item.metadata) - metadata.update({"numentries": tree.num_entries, "uuid": file.file.fUUID}) - if align_clusters: - metadata["clusters"] = tree.common_entry_offsets() - out = set_accumulator( - [FileMeta(item.dataset, item.filename, item.treename, metadata)] - ) - return out - - @staticmethod - def metadata_fetcher_parquet(item: FileMeta): - with ParquetFileContext(item.filename) as file: - metadata = {} - if item.metadata: - metadata.update(item.metadata) - metadata.update( - {"numentries": file.num_entries, "uuid": b"NO_UUID_0000_000"} - ) - out = set_accumulator( - [FileMeta(item.dataset, item.filename, item.treename, metadata)] - ) - return out - - def _preprocess_fileset_root(self, fileset: Dict) -> None: - # this is a bit of an abuse of map-reduce but ok - to_get = { - filemeta - for filemeta in fileset - if not filemeta.populated(clusters=self.align_clusters) - } - if len(to_get) > 0: - out = set_accumulator() - pre_arg_override = { - "function_name": "get_metadata", - "desc": "Preprocessing", - "unit": "file", - "compression": None, - } - if isinstance(self.pre_executor, (FuturesExecutor, ParslExecutor)): - pre_arg_override.update({"tailtimeout": None}) - if isinstance(self.pre_executor, (DaskExecutor)): - self.pre_executor.heavy_input = None - pre_arg_override.update({"worker_affinity": False}) - pre_executor = self.pre_executor.copy(**pre_arg_override) - closure = partial( - self.automatic_retries, - self.retries, - self.skipbadfiles, - partial( - self.metadata_fetcher_root, self.xrootdtimeout, self.align_clusters - ), - ) - out, _ = pre_executor(to_get, closure, out) - while out: - item = out.pop() - self.metadata_cache[item] = item.metadata - for filemeta in fileset: - filemeta.maybe_populate(self.metadata_cache) - - def _preprocess_fileset_parquet(self, fileset: Dict) -> None: - # this is a bit of an abuse of map-reduce but ok - to_get = { - filemeta - for filemeta in fileset - if not filemeta.populated(clusters=self.align_clusters) - } - if len(to_get) > 0: - out = set_accumulator() - pre_arg_override = { - "function_name": "get_metadata", - "desc": "Preprocessing", - "unit": "file", - "compression": None, - } - if isinstance(self.pre_executor, (FuturesExecutor, ParslExecutor)): - pre_arg_override.update({"tailtimeout": None}) - if isinstance(self.pre_executor, (DaskExecutor)): - self.pre_executor.heavy_input = None - pre_arg_override.update({"worker_affinity": False}) - pre_executor = self.pre_executor.copy(**pre_arg_override) - closure = partial( - 
self.automatic_retries, - self.retries, - self.skipbadfiles, - self.metadata_fetcher_parquet, - ) - out, _ = pre_executor(to_get, closure, out) - while out: - item = out.pop() - self.metadata_cache[item] = item.metadata - for filemeta in fileset: - filemeta.maybe_populate(self.metadata_cache) - - def _filter_badfiles(self, fileset: Dict) -> List: - final_fileset = [] - for filemeta in fileset: - if filemeta.populated(clusters=self.align_clusters): - final_fileset.append(filemeta) - elif not self.skipbadfiles: - raise RuntimeError( - f"Metadata for file {filemeta.filename} could not be accessed." - ) - return final_fileset - - def _chunk_generator(self, fileset: Dict, treename: str) -> Generator: - config = None - if self.use_skyhook: - config = Runner.read_coffea_config() - if not self.use_skyhook and (self.format == "root" or self.format == "parquet"): - if self.maxchunks is None: - last_chunksize = self.chunksize - for filemeta in fileset: - last_chunksize = yield from filemeta.chunks( - last_chunksize, - self.align_clusters, - ) - else: - # get just enough file info to compute chunking - nchunks = defaultdict(int) - chunks = [] - for filemeta in fileset: - if nchunks[filemeta.dataset] >= self.maxchunks: - continue - for chunk in filemeta.chunks(self.chunksize, self.align_clusters): - chunks.append(chunk) - nchunks[filemeta.dataset] += 1 - if nchunks[filemeta.dataset] >= self.maxchunks: - break - yield from (c for c in chunks) - else: - if self.use_skyhook and not config.get("skyhook", None): - print("No skyhook config found, using defaults") - config["skyhook"] = dict() - - dataset_filelist_map = {} - if self.use_skyhook: - import pyarrow.dataset as ds - - for dataset, basedir in fileset.items(): - ds_ = ds.dataset(basedir, format="parquet") - dataset_filelist_map[dataset] = ds_.files - else: - for dataset, maybe_filelist in fileset.items(): - if isinstance(maybe_filelist, list): - dataset_filelist_map[dataset] = maybe_filelist - elif isinstance(maybe_filelist, dict): - if "files" not in maybe_filelist: - raise ValueError( - "Dataset definition must have key 'files' defined!" 
- ) - dataset_filelist_map[dataset] = maybe_filelist["files"] - else: - raise ValueError( - "Dataset definition in fileset must be dict[str: list[str]] or dict[str: dict[str: Any]]" - ) - chunks = [] - for dataset, filelist in dataset_filelist_map.items(): - for filename in filelist: - # If skyhook config is provided and is not empty, - if self.use_skyhook: - ceph_config_path = config["skyhook"].get( - "ceph_config_path", "/etc/ceph/ceph.conf" - ) - ceph_data_pool = config["skyhook"].get( - "ceph_data_pool", "cephfs_data" - ) - filename = f"{ceph_config_path}:{ceph_data_pool}:{filename}" - chunks.append( - WorkItem( - dataset, - filename, - treename, - 0, - 0, - "", - fileset[dataset]["metadata"] - if "metadata" in fileset[dataset] - else None, - ) - ) - yield from iter(chunks) - - @staticmethod - def _work_function( - format: str, - xrootdtimeout: int, - mmap: bool, - schema: schemas.BaseSchema, - cache_function: Callable[[], MutableMapping], - use_dataframes: bool, - savemetrics: bool, - item: WorkItem, - processor_instance: ProcessorABC, - ) -> Dict: - if processor_instance == "heavy": - item, processor_instance = item - if not isinstance(processor_instance, ProcessorABC): - processor_instance = cloudpickle.loads(lz4f.decompress(processor_instance)) - - if format == "root": - filecontext = uproot.open( - {item.filename: None}, - timeout=xrootdtimeout, - file_handler=uproot.MemmapSource - if mmap - else uproot.MultithreadedFileSource, - ) - elif format == "parquet": - filecontext = ParquetFileContext(item.filename) - - metadata = { - "dataset": item.dataset, - "filename": item.filename, - "treename": item.treename, - "entrystart": item.entrystart, - "entrystop": item.entrystop, - "fileuuid": str(uuid.UUID(bytes=item.fileuuid)) - if len(item.fileuuid) > 0 - else "", - } - if item.usermeta is not None: - metadata.update(item.usermeta) - - with filecontext as file: - if schema is None: - # To deprecate - tree = None - events = None - if format == "root": - tree = file[item.treename] - events = uproot.dask(tree, ak_add_doc=True)[ - item.entrystart : item.entrystop - ] - setattr(events, "metadata", metadata) - elif format == "parquet": - import dask_awkward - - tree = file - events = dask_awkward.from_parquet(item.filename)[ - item.entrystart : item.entrystop - ] - setattr(events, "metadata", metadata) - else: - raise ValueError("Format can only be root or parquet!") - elif issubclass(schema, schemas.BaseSchema): - # change here - if format == "root": - materialized = [] - factory = NanoEventsFactory.from_root( - file=file, - treepath=item.treename, - persistent_cache=cache_function(), - schemaclass=schema, - metadata=metadata, - access_log=materialized, - delayed=True, - ) - events = factory.events()[item.entrystart : item.entrystop] - elif format == "parquet": - skyhook_options = {} - if ":" in item.filename: - ( - ceph_config_path, - ceph_data_pool, - filename, - ) = item.filename.split(":") - # patch back filename into item - item = WorkItem(**dict(asdict(item), filename=filename)) - skyhook_options["ceph_config_path"] = ceph_config_path - skyhook_options["ceph_data_pool"] = ceph_data_pool - - factory = NanoEventsFactory.from_parquet( - file=item.filename, - treepath=item.treename, - schemaclass=schema, - metadata=metadata, - skyhook_options=skyhook_options, - permit_dask=True, - ) - events = factory.events()[item.entrystart : item.entrystop] - else: - raise ValueError( - "Expected schema to derive from nanoevents.BaseSchema, instead got %r" - % schema - ) - tic = time.time() - try: - out 
= None - if isinstance(events, LazyDataFrame): - out = processor_instance.process(events) - else: - import dask - import dask_awkward - - to_compute = processor_instance.process(events) - # materialized = dask_awkward.report_necessary_buffers(to_compute) - out = dask.compute(to_compute, scheduler="single-threaded")[0] - except Exception as e: - raise Exception(f"Failed processing file: {item!r}") from e - if out is None: - raise ValueError( - "Output of process() should not be None. Make sure your processor's process() function returns an accumulator." - ) - toc = time.time() - if use_dataframes: - return out - else: - if savemetrics: - metrics = {} - if isinstance(file, uproot.ReadOnlyDirectory): - metrics["bytesread"] = file.file.source.num_requested_bytes - # metrics["data_and_shape_buffers"] = set(materialized) - # metrics["shape_only_buffers"] = set(materialized) - if schema is not None and issubclass(schema, schemas.BaseSchema): - metrics["entries"] = len(events) - else: - metrics["entries"] = events.size - metrics["processtime"] = toc - tic - return {"out": out, "metrics": metrics, "processed": {item}} - return {"out": out, "processed": {item}} - - def __call__( - self, - fileset: Dict, - treename: str, - processor_instance: ProcessorABC, - ) -> Accumulatable: - """Run the processor_instance on a given fileset - - Parameters - ---------- - fileset : dict - A dictionary ``{dataset: [file, file], }`` - Optionally, if some files' tree name differ, the dictionary can be specified: - ``{dataset: {'treename': 'name', 'files': [file, file]}, }`` - treename : str - name of tree inside each root file, can be ``None``; - treename can also be defined in fileset, which will override the passed treename - processor_instance : ProcessorABC - An instance of a class deriving from ProcessorABC - """ - if isinstance(self.executor, DaskExecutorBase): - return self.run_dask(fileset, processor_instance, treename) - - wrapped_out = self.run(fileset, processor_instance, treename) - if self.use_dataframes: - return wrapped_out # not wrapped anymore - if self.savemetrics: - return wrapped_out["out"], wrapped_out["metrics"] - return wrapped_out["out"] - - def preprocess( - self, - fileset: Dict, - treename: str, - ) -> Generator: - """Run the processor_instance on a given fileset - - Parameters - ---------- - fileset : dict - A dictionary ``{dataset: [file, file], }`` - Optionally, if some files' tree name differ, the dictionary can be specified: - ``{dataset: {'treename': 'name', 'files': [file, file]}, }`` - treename : str - name of tree inside each root file, can be ``None``; - treename can also be defined in fileset, which will override the passed treename - """ - - if not isinstance(fileset, (Mapping, str)): - raise ValueError( - "Expected fileset to be a mapping dataset: list(files) or filename" - ) - if self.format == "root": - fileset = list(self._normalize_fileset(fileset, treename)) - for filemeta in fileset: - filemeta.maybe_populate(self.metadata_cache) - - self._preprocess_fileset_root(fileset) - fileset = self._filter_badfiles(fileset) - - # reverse fileset list to match the order of files as presented in version - # v0.7.4. This fixes tests using maxchunks. 
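[editor's note] As the ``__call__`` body above shows, ``savemetrics=True`` changes the return value to an ``(output, metrics)`` pair; a hedged sketch reusing the placeholder ``fileset`` and ``MyProcessor`` names from the earlier sketches:

from coffea import processor
from coffea.nanoevents import NanoAODSchema

run = processor.Runner(
    executor=processor.IterativeExecutor(),
    schema=NanoAODSchema,
    savemetrics=True,
)
out, metrics = run(fileset, treename="Events", processor_instance=MyProcessor())
# metrics carries e.g. 'entries', 'processtime' and, for uproot reads, 'bytesread'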
- fileset.reverse() - elif self.format == "parquet": - fileset = list(self._normalize_fileset(fileset, treename)) - for filemeta in fileset: - filemeta.maybe_populate(self.metadata_cache) - - self._preprocess_fileset_parquet(fileset) - fileset = self._filter_badfiles(fileset) - - # reverse fileset list to match the order of files as presented in version - # v0.7.4. This fixes tests using maxchunks. - fileset.reverse() - - return self._chunk_generator(fileset, treename) - - def run_dask( - self, - fileset: Union[Dict, str, List[WorkItem], Generator], - processor_instance: ProcessorABC, - treename: str = None, - ) -> Accumulatable: - """Run the processor_instance on a given fileset - - Parameters - ---------- - fileset : dict | str | List[WorkItem] | Generator - - A dictionary ``{dataset: [file, file], }`` - Optionally, if some files' tree name differ, the dictionary can be specified: - ``{dataset: {'treename': 'name', 'files': [file, file]}, }`` - - A single file name - - File chunks for self.preprocess() - - Chunk generator - treename : str, optional - name of tree inside each root file, can be ``None``; - treename can also be defined in fileset, which will override the passed treename - Not needed if processing premade chunks - processor_instance : ProcessorABC - An instance of a class deriving from ProcessorABC - """ - pass - - def run( - self, - fileset: Union[Dict, str, List[WorkItem], Generator], - processor_instance: ProcessorABC, - treename: str = None, - ) -> Accumulatable: - """Run the processor_instance on a given fileset - - Parameters - ---------- - fileset : dict | str | List[WorkItem] | Generator - - A dictionary ``{dataset: [file, file], }`` - Optionally, if some files' tree name differ, the dictionary can be specified: - ``{dataset: {'treename': 'name', 'files': [file, file]}, }`` - - A single file name - - File chunks for self.preprocess() - - Chunk generator - treename : str, optional - name of tree inside each root file, can be ``None``; - treename can also be defined in fileset, which will override the passed treename - Not needed if processing premade chunks - processor_instance : ProcessorABC - An instance of a class deriving from ProcessorABC - """ - - meta = False - if not isinstance(fileset, (Mapping, str)): - if isinstance(fileset, Generator) or isinstance(fileset[0], WorkItem): - meta = True - else: - raise ValueError( - "Expected fileset to be a mapping dataset: list(files) or filename" - ) - if not isinstance(processor_instance, ProcessorABC): - raise ValueError("Expected processor_instance to derive from ProcessorABC") - - if meta: - chunks = fileset - else: - chunks = self.preprocess(fileset, treename) - - if self.processor_compression is None: - pi_to_send = processor_instance - else: - pi_to_send = lz4f.compress( - cloudpickle.dumps(processor_instance), - compression_level=self.processor_compression, - ) - # hack around dask/dask#5503 which is really a silly request but here we are - if isinstance(self.executor, DaskExecutor): - self.executor.heavy_input = pi_to_send - closure = partial( - self._work_function, - self.format, - self.xrootdtimeout, - self.mmap, - self.schema, - partial(self.get_cache, self.cachestrategy), - self.use_dataframes, - self.savemetrics, - processor_instance="heavy", - ) - else: - closure = partial( - self._work_function, - self.format, - self.xrootdtimeout, - self.mmap, - self.schema, - partial(self.get_cache, self.cachestrategy), - self.use_dataframes, - self.savemetrics, - processor_instance=pi_to_send, - ) - - if self.format == 
"root" and isinstance(self.executor, WorkQueueExecutor): - # keep chunks in generator, use a copy to count number of events - # this is cheap, as we are reading from the cache - chunks_to_count = self.preprocess(fileset, treename) - else: - # materialize chunks to list, then count that list - chunks = list(chunks) - chunks_to_count = chunks - - events_total = sum(len(c) for c in chunks_to_count) - - exe_args = { - "unit": "chunk", - "function_name": type(processor_instance).__name__, - } - if isinstance(self.executor, WorkQueueExecutor): - exe_args.update( - { - "unit": "event", - "events_total": events_total, - "dynamic_chunksize": self.dynamic_chunksize, - "chunksize": self.chunksize, - } - ) - - closure = partial( - self.automatic_retries, self.retries, self.skipbadfiles, closure - ) - - executor = self.executor.copy(**exe_args) - wrapped_out, e = executor(chunks, closure, None) - if wrapped_out is None: - raise ValueError( - "No chunks returned results, verify ``processor`` instance structure.\n\ - if you used skipbadfiles=True, it is possible all your files are bad." - ) - wrapped_out["exception"] = e - - if not self.use_dataframes: - processor_instance.postprocess(wrapped_out["out"]) - - if "metrics" in wrapped_out.keys(): - wrapped_out["metrics"]["chunks"] = len(chunks) - for k, v in wrapped_out["metrics"].items(): - if isinstance(v, set): - wrapped_out["metrics"][k] = list(v) - if self.use_dataframes: - return wrapped_out["out"] - else: - return wrapped_out - - -def run_spark_job( - fileset, - processor_instance, - executor, - executor_args={}, - spark=None, - partitionsize=200000, - thread_workers=16, -): - """A wrapper to submit spark jobs - - A convenience wrapper to submit jobs for spark datasets, which is a - dictionary of dataset: [file list] entries. Presently supports reading of - parquet files converted from root. For more customized processing, - e.g. to read other objects from the files and pass them into data frames, - one can write a similar function in their user code. - - Parameters - ---------- - fileset : dict - dictionary {dataset: [file, file], } - processor_instance : ProcessorABC - An instance of a class deriving from ProcessorABC - - .. 
note:: The processor instance must define all the columns in data and MC that it reads as ``.columns`` - executor: - anything that inherits from `SparkExecutor` like `spark_executor` - - In general, a function that takes 3 arguments: items, function accumulator - and performs some action equivalent to: - for item in items: accumulator += function(item) - executor_args: - arguments to send to the creation of a spark session - spark: - an optional already created spark instance - - if ``None`` then we create an ephemeral spark instance using a config - partitionsize: - partition size to try to aim for (coalescese only, repartition too expensive) - thread_workers: - how many spark jobs to let fly in parallel during processing steps - """ - - try: - import pyspark - except ImportError as e: - print( - "you must have pyspark installed to call run_spark_job()!", file=sys.stderr - ) - raise e - - import warnings - - import pyarrow as pa - from packaging import version - - arrow_env = ("ARROW_PRE_0_15_IPC_FORMAT", "1") - if version.parse(pa.__version__) >= version.parse("0.15.0") and version.parse( - pyspark.__version__ - ) < version.parse("3.0.0"): - import os - - if arrow_env[0] not in os.environ or os.environ[arrow_env[0]] != arrow_env[1]: - warnings.warn( - "If you are using pyarrow >= 0.15.0, make sure to set %s=%s in your environment!" - % arrow_env - ) - - import pyspark.sql - - from .spark.detail import _spark_initialize, _spark_make_dfs, _spark_stop - from .spark.spark_executor import SparkExecutor - - if not isinstance(fileset, Mapping): - raise ValueError("Expected fileset to be a mapping dataset: list(files)") - if not isinstance(processor_instance, ProcessorABC): - raise ValueError("Expected processor_instance to derive from ProcessorABC") - if not isinstance(executor, SparkExecutor): - raise ValueError("Expected executor to derive from SparkExecutor") - - executor_args.setdefault("config", None) - executor_args.setdefault("file_type", "parquet") - executor_args.setdefault("laurelin_version", "1.1.1") - executor_args.setdefault("treeName", "Events") - executor_args.setdefault("schema", None) - executor_args.setdefault("cache", True) - executor_args.setdefault("skipbadfiles", False) - executor_args.setdefault("retries", 0) - executor_args.setdefault("xrootdtimeout", None) - file_type = executor_args["file_type"] - treeName = executor_args["treeName"] - schema = executor_args["schema"] - if "flatten" in executor_args: - raise ValueError( - "Executor argument 'flatten' is deprecated, please refactor your processor to accept awkward arrays" - ) - if "nano" in executor_args: - raise ValueError( - "Awkward0 NanoEvents no longer supported.\n" - "Please use 'schema': processor.NanoAODSchema to enable awkward NanoEvents processing." 
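[editor's note] A hedged sketch of the ``run_spark_job`` entry point documented above; ``MyProcessor`` and the fileset are placeholders, and the module-level ``spark_executor`` instance is assumed to be importable as in coffea 0.7:

from coffea.nanoevents import NanoAODSchema
from coffea.processor import run_spark_job
from coffea.processor.spark.spark_executor import spark_executor

out = run_spark_job(
    fileset={"ZJets": ["zjets.parquet"]},
    processor_instance=MyProcessor(),
    executor=spark_executor,
    executor_args={"schema": NanoAODSchema, "file_type": "parquet"},
    partitionsize=200_000,
    thread_workers=8,
)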
- ) - use_cache = executor_args["cache"] - - if executor_args["config"] is None: - executor_args.pop("config") - - # initialize spark if we need to - # if we initialize, then we deconstruct - # when we're done - killSpark = False - if spark is None: - spark = _spark_initialize(**executor_args) - killSpark = True - use_cache = False # if we always kill spark then we cannot use the cache - else: - if not isinstance(spark, pyspark.sql.session.SparkSession): - raise ValueError( - "Expected 'spark' to be a pyspark.sql.session.SparkSession" - ) - - dfslist = {} - if executor._cacheddfs is None: - dfslist = _spark_make_dfs( - spark, - fileset, - partitionsize, - processor_instance.columns, - thread_workers, - file_type, - treeName, - ) - - output = executor( - spark, dfslist, processor_instance, None, thread_workers, use_cache, schema - ) - processor_instance.postprocess(output) - - if killSpark: - _spark_stop(spark) - del spark - spark = None - - return output diff --git a/src/coffea/processor/helpers.py b/src/coffea/processor/helpers.py deleted file mode 100644 index dcdf03ba2..000000000 --- a/src/coffea/processor/helpers.py +++ /dev/null @@ -1,273 +0,0 @@ -import numpy - -from coffea.util import deprecate - - -class Weights: - """Container for event weights and associated systematic shifts - - This container keeps track of correction factors and systematic - effects that can be encoded as multiplicative modifiers to the event weight. - All weights are stored in vector form. - - Parameters - ---------- - size : int - size of the weight arrays to be handled (i.e. the number of events / instances). - storeIndividual : bool, optional - store not only the total weight + variations, but also each individual weight. - Default is false. - """ - - def __init__(self, size, storeIndividual=False): - deprecate( - RuntimeError( - "This utility has moved to the `coffea.analysis_tools` subpackage and has new features, check it out!" - ), - 0.8, - ) - self._weight = numpy.ones(size) - self._weights = {} - self._modifiers = {} - self._weightStats = {} - self._storeIndividual = storeIndividual - - def add(self, name, weight, weightUp=None, weightDown=None, shift=False): - """Add a new weight - - Adds a named correction to the event weight, and optionally also associated - systematic uncertainties. - - Parameters - ---------- - name : str - name of correction - weight : numpy.ndarray - the nominal event weight associated with the correction - weightUp : numpy.ndarray, optional - weight with correction uncertainty shifted up (if available) - weightDown : numpy.ndarray, optional - weight with correction uncertainty shifted down. If ``weightUp`` is supplied, and - the correction uncertainty is symmetric, this can be set to None to auto-calculate - the down shift as ``1 / weightUp``. - shift : bool, optional - if True, interpret weightUp and weightDown as a relative difference (additive) to the - nominal value - - .. 
note:: ``weightUp`` and ``weightDown`` are assumed to be rvalue-like and may be modified in-place by this function - """ - if name.endswith("Up") or name.endswith("Down"): - raise ValueError( - "Avoid using 'Up' and 'Down' in weight names, instead pass appropriate shifts to add() call" - ) - weight = numpy.array(weight) - self._weight = self._weight * weight - if self._storeIndividual: - self._weights[name] = weight - if weightUp is not None: - weightUp = numpy.array(weightUp) - if shift: - weightUp += weight - weightUp[weight != 0.0] /= weight[weight != 0.0] - self._modifiers[name + "Up"] = weightUp - if weightDown is not None: - weightDown = numpy.array(weightDown) - if shift: - weightDown = weight - weightDown - weightDown[weight != 0.0] /= weight[weight != 0.0] - self._modifiers[name + "Down"] = weightDown - self._weightStats[name] = { - "sumw": weight.sum(), - "sumw2": (weight**2).sum(), - "min": weight.min(), - "max": weight.max(), - "n": weight.size, - } - - def weight(self, modifier=None): - """Current event weight vector - - Parameters - ---------- - modifier : str, optional - if supplied, provide event weight corresponding to a particular - systematic uncertainty shift, of form ``str(name + 'Up')`` or (Down) - - Returns - ------- - weight : numpy.ndarray - The weight vector, possibly modified by the effect of a given systematic variation. - """ - if modifier is None: - return self._weight - elif "Down" in modifier and modifier not in self._modifiers: - return self._weight / self._modifiers[modifier.replace("Down", "Up")] - return self._weight * self._modifiers[modifier] - - def partial_weight(self, include=[], exclude=[]): - """Partial event weight vector - - Return a partial weight by multiplying a subset of all weights. - Can be operated either by specifying weights to include or - weights to exclude, but not both at the same time. The method - can only be used if the individual weights are stored via the - ``storeIndividual`` argument in the `Weights` initializer. - - Parameters - ---------- - include : list | set - Weight names to include, defaults to [] - exclude : list | set - Weight names to exclude, defaults to [] - Returns - ------- - weight : numpy.ndarray - The weight vector, corresponding to only the effect of the - corrections specified. - """ - if not self._storeIndividual: - raise ValueError( - "To be able to request weight exclusion, use storeIndividual=True when creating Weights object." - ) - if (include and exclude) or not (include or exclude): - raise ValueError( - "Need to specify exactly one of the 'exclude' or 'include' arguments." 
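[editor's note] A hedged sketch of the ``Weights`` interface documented above (the numbers are illustrative); as the deprecation notice says, the maintained version of this API lives in ``coffea.analysis_tools``:

import numpy as np
from coffea.analysis_tools import Weights

weights = Weights(size=3, storeIndividual=True)
weights.add(
    "pileup",
    weight=np.array([1.02, 0.98, 1.00]),
    weightUp=np.array([1.05, 1.01, 1.03]),
    weightDown=np.array([0.99, 0.95, 0.97]),
)
nominal = weights.weight()
pileup_up = weights.weight("pileupUp")
partial = weights.partial_weight(include=["pileup"])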
- ) - if include and not isinstance(include, (list, set)): - raise ValueError("'include' should be a list or set of weight names") - if exclude and not isinstance(exclude, (list, set)): - raise ValueError("'exclude' should be a list or set of weight names") - - names = set(self._weights.keys()) - if include: - names = names & set(include) - if exclude: - names = names - set(exclude) - - w = numpy.ones(self._weight.size) - for name in names: - w = w * self._weights[name] - - return w - - @property - def variations(self): - """List of available modifiers""" - keys = set(self._modifiers.keys()) - # add any missing 'Down' variation - for k in self._modifiers.keys(): - keys.add(k.replace("Up", "Down")) - return keys - - -class PackedSelection: - """Store boolean mask vectors in a compact manner - - This class can store several boolean masks (cuts, selections) and - evaluate arbitrary combinations of the requirements in an CPU-efficient way - - Parameters - ---------- - dtype : str - internal bitwidth of mask vector, which governs the maximum - number of boolean masks storable in this object. - By default, up to 64 masks can be stored, but smaller values - for the `numpy.dtype` may be more efficient. - """ - - def __init__(self, dtype="uint64"): - """ - TODO: extend to multi-column for arbitrary bit depth - """ - deprecate( - RuntimeError( - "This utility has moved to the `coffea.analysis_tools` subpackage and has new features, check it out!" - ), - 0.8, - ) - self._dtype = numpy.dtype(dtype) - self._names = [] - self._mask = None - - @property - def names(self): - """Current list of mask names available""" - return self._names - - def add(self, name, selection): - """Add a named mask - - Parameters - ---------- - name : str - name of the mask - selection : numpy.ndarray - a flat array of dtype bool. - If not the first mask added, it must also have - the same shape as previously added masks. - """ - if isinstance(selection, numpy.ndarray) and selection.dtype == numpy.dtype( - "bool" - ): - if len(self._names) == 0: - self._mask = numpy.zeros(shape=selection.shape, dtype=self._dtype) - elif len(self._names) == 64: - raise RuntimeError( - "Exhausted all slots for %r, consider a larger dtype or fewer selections" - % self._dtype - ) - elif self._mask.shape != selection.shape: - raise ValueError( - "New selection '%s' has different shape than existing ones (%r vs. %r)" - % (name, selection.shape, self._mask.shape) - ) - self._mask |= selection.astype(self._dtype) << len(self._names) - self._names.append(name) - else: - raise ValueError( - "PackedSelection only understands numpy boolean arrays, got %r" - % selection - ) - - def require(self, **names): - """Return a mask vector corresponding to specific requirements - - Specify an exact requirement on an arbitrary subset of the masks - - Parameters - ---------- - ``**names`` : kwargs - Each argument to require specific value for, in form ``arg=True`` - or ``arg=False``. - - Examples - -------- - If - - >>> selection.names - ['cut1', 'cut2', 'cut3'] - - then - - >>> selection.require(cut1=True, cut2=False) - array([True, False, True, ...]) - - returns a boolean array where each entry passes if the corresponding entry has - ``cut1 == True``, ``cut2 == False``, and ``cut3`` arbitrary. 
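[editor's note] A hedged sketch of the ``PackedSelection`` usage implied by the docstring above; as with ``Weights``, the maintained implementation is in ``coffea.analysis_tools``:

import numpy as np
from coffea.analysis_tools import PackedSelection

selection = PackedSelection()
selection.add("two_jets", np.array([True, True, False, True]))
selection.add("met_cut", np.array([True, False, False, True]))

signal = selection.all("two_jets", "met_cut")              # both cuts required
control = selection.require(two_jets=True, met_cut=False)  # inverted MET cut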
- """ - mask = 0 - require = 0 - for name, val in names.items(): - if not isinstance(val, bool): - raise ValueError( - "Please use only booleans in PackedSelection.require(), received %r for %s" - % (val, name) - ) - idx = self._names.index(name) - mask |= 1 << idx - require |= int(val) << idx - return (self._mask & mask) == require - - def all(self, *names): - """Shorthand for `require`, where all the values are True""" - return self.require(**{name: True for name in names}) diff --git a/src/coffea/processor/parsl/__init__.py b/src/coffea/processor/parsl/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/coffea/processor/parsl/condor_config.py b/src/coffea/processor/parsl/condor_config.py deleted file mode 100644 index 49c066c81..000000000 --- a/src/coffea/processor/parsl/condor_config.py +++ /dev/null @@ -1,77 +0,0 @@ -import os -import os.path as osp - -from parsl.addresses import address_by_hostname -from parsl.channels import LocalChannel -from parsl.config import Config -from parsl.executors import HighThroughputExecutor -from parsl.providers import CondorProvider - -x509_proxy = f"x509up_u{os.getuid()}" - - -def condor_config( - cores_per_job=4, - mem_per_core=2048, - total_workers=24, - max_workers=200, - pyenv_dir="{}/.local".format(os.environ["HOME"]), - grid_proxy_dir="/tmp", - htex_label="coffea_parsl_condor_htex", - wrk_init=None, - condor_cfg=None, -): - pyenv_relpath = pyenv_dir.split("/")[-1] - - if wrk_init is None: - wrk_init = """ - source /cvmfs/sft.cern.ch/lcg/views/LCG_95apython3/x86_64-centos7-gcc7-opt/setup.sh - export PATH=`pwd`/{}:$PATH - export PYTHONPATH=`pwd`/{}:$PYTHONPATH - - export X509_USER_PROXY=`pwd`/{} - mkdir -p ./{} - """.format( - "%s/bin" % pyenv_relpath, - "%s/lib/python3.6/site-packages" % pyenv_relpath, - x509_proxy, - htex_label, - ) - - if condor_cfg is None: - condor_cfg = """ - transfer_output_files = %s - RequestMemory = %d - RequestCpus = %d - """ % ( - htex_label, - mem_per_core * cores_per_job, - cores_per_job, - ) - - xfer_files = [pyenv_dir, osp.join(grid_proxy_dir, x509_proxy)] - - condor_htex = Config( - executors=[ - HighThroughputExecutor( - label=htex_label, - address=address_by_hostname(), - prefetch_capacity=0, - cores_per_worker=1, - max_workers=cores_per_job, - worker_logdir_root="./", - provider=CondorProvider( - channel=LocalChannel(), - init_blocks=total_workers, - max_blocks=max_workers, - nodes_per_block=1, - worker_init=wrk_init, - transfer_input_files=xfer_files, - scheduler_options=condor_cfg, - ), - ) - ], - strategy=None, - ) - - return condor_htex diff --git a/src/coffea/processor/parsl/detail.py b/src/coffea/processor/parsl/detail.py deleted file mode 100644 index a618d99f5..000000000 --- a/src/coffea/processor/parsl/detail.py +++ /dev/null @@ -1,89 +0,0 @@ -import parsl -from parsl.app.app import python_app -from parsl.channels import LocalChannel -from parsl.config import Config -from parsl.executors import HighThroughputExecutor -from parsl.providers import LocalProvider - -from ..executor import _futures_handler -from .timeout import timeout - -_default_cfg = Config( - executors=[ - HighThroughputExecutor( - label="coffea_parsl_default", - cores_per_worker=1, - provider=LocalProvider( - channel=LocalChannel(), - init_blocks=1, - max_blocks=1, - ), - ) - ], - strategy=None, -) - - -def _parsl_initialize(config=None): - parsl.clear() - parsl.load(config) - - -def _parsl_stop(): - parsl.dfk().cleanup() - parsl.clear() - - -@timeout -@python_app -def derive_chunks(filename, treename, 
chunksize, ds, timeout=10): - from collections.abc import Sequence - - import uproot - - uproot.XRootDSource.defaults["parallel"] = False - - a_file = uproot.open({filename: None}) - - tree = None - if isinstance(treename, str): - tree = a_file[treename] - elif isinstance(treename, Sequence): - for name in reversed(treename): - if name in a_file: - tree = a_file[name] - else: - raise Exception( - "treename must be a str or Sequence but is a %s!" % repr(type(treename)) - ) - - if tree is None: - raise Exception( - "No tree found, out of possible tree names: %s" % repr(treename) - ) - - nentries = tree.numentries - return ( - ds, - treename, - [(filename, chunksize, index) for index in range(nentries // chunksize + 1)], - ) - - -def _parsl_get_chunking(filelist, chunksize, status=True, timeout=10): - futures = { - derive_chunks(fn, tn, chunksize, ds, timeout=timeout) for ds, fn, tn in filelist - } - - items = [] - - def chunk_accumulator(total, result): - ds, treename, chunks = result - for chunk in chunks: - total.append((ds, chunk[0], treename, chunk[1], chunk[2])) - - _futures_handler( - futures, items, status, "files", "Preprocessing", chunk_accumulator, None - ) - - return items diff --git a/src/coffea/processor/parsl/slurm_config.py b/src/coffea/processor/parsl/slurm_config.py deleted file mode 100644 index 48c33ae01..000000000 --- a/src/coffea/processor/parsl/slurm_config.py +++ /dev/null @@ -1,67 +0,0 @@ -import os -import os.path as osp -import shutil - -from parsl.addresses import address_by_hostname -from parsl.channels import LocalChannel -from parsl.config import Config -from parsl.executors import HighThroughputExecutor -from parsl.launchers import SrunLauncher -from parsl.providers import SlurmProvider - -x509_proxy = "x509up_u%s" % (os.getuid()) - - -def slurm_config( - cores_per_job=16, - mem_per_core=2048, - jobs_per_worker=1, - initial_workers=4, - max_workers=8, - work_dir="./", - grid_proxy_dir="/tmp", - partition="", - walltime="02:00:00", - htex_label="coffea_parsl_slurm_htex", -): - shutil.copy2(osp.join(grid_proxy_dir, x509_proxy), osp.join(work_dir, x509_proxy)) - - wrk_init = """ - export XRD_RUNFORKHANDLER=1 - export X509_USER_PROXY=%s - """ % ( - osp.join(work_dir, x509_proxy) - ) - - sched_opts = """ - #SBATCH --cpus-per-task=%d - #SBATCH --mem-per-cpu=%d - """ % ( - cores_per_job, - mem_per_core, - ) - - slurm_htex = Config( - executors=[ - HighThroughputExecutor( - label=htex_label, - address=address_by_hostname(), - prefetch_capacity=0, - max_workers=cores_per_job, - provider=SlurmProvider( - channel=LocalChannel(), - launcher=SrunLauncher(), - init_blocks=initial_workers, - max_blocks=max_workers, - nodes_per_block=jobs_per_worker, - partition=partition, - scheduler_options=sched_opts, # Enter scheduler_options if needed - worker_init=wrk_init, # Enter worker_init if needed - walltime=walltime, - ), - ) - ], - strategy=None, - ) - - return slurm_htex diff --git a/src/coffea/processor/parsl/timeout.py b/src/coffea/processor/parsl/timeout.py deleted file mode 100644 index 35c7b42dc..000000000 --- a/src/coffea/processor/parsl/timeout.py +++ /dev/null @@ -1,21 +0,0 @@ -from functools import wraps - - -def timeout(func): - @wraps(func) - def wrapper(*args, **kwargs): - import signal - - def _timeout_handler(signum, frame): - raise Exception("Timeout hit") - - signal.signal(signal.SIGALRM, _timeout_handler) - if kwargs.get("timeout"): - signal.alarm(max(1, int(kwargs["timeout"]))) - try: - result = func(*args, **kwargs) - finally: - signal.alarm(0) - return result 
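[editor's note] The ``condor_config`` and ``slurm_config`` helpers above both return a parsl ``Config`` meant to be loaded before processing; a hedged sketch using the module path that this patch removes (site-specific values are placeholders, and a valid grid proxy is expected in ``grid_proxy_dir``):

import parsl
from coffea.processor.parsl.slurm_config import slurm_config

parsl.load(
    slurm_config(
        cores_per_job=16,
        initial_workers=4,
        max_workers=8,
        partition="short",
        walltime="02:00:00",
    )
)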
- - return wrapper diff --git a/src/coffea/processor/servicex/__init__.py b/src/coffea/processor/servicex/__init__.py deleted file mode 100644 index 12d26e635..000000000 --- a/src/coffea/processor/servicex/__init__.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2019, IRIS-HEP -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from .analysis import * -from .dask_executor import * -from .data_source import * -from .local_executor import * - -__all__ = [ - "DataSource", - "Analysis", - "LocalExecutor", - "DaskExecutor", -] diff --git a/src/coffea/processor/servicex/analysis.py b/src/coffea/processor/servicex/analysis.py deleted file mode 100644 index 669ba8662..000000000 --- a/src/coffea/processor/servicex/analysis.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) 2019, IRIS-HEP -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from abc import ABC, abstractmethod - -from coffea.nanoevents.methods.base import NanoEvents - - -class Analysis(ABC): - @staticmethod - @abstractmethod - def process(events: NanoEvents) -> dict: - """ - Implement this abstract method to perform the actual analysis operations. The - executor will wrap this in code to construct a NanoEvents instance and will pass - in the analysis instance's accumulator. - :param events: NanoEvents - :return: dict[str, Accumulatable] - Filled with the results from this analysis - """ - raise NotImplementedError diff --git a/src/coffea/processor/servicex/dask_executor.py b/src/coffea/processor/servicex/dask_executor.py deleted file mode 100644 index f9dcc4851..000000000 --- a/src/coffea/processor/servicex/dask_executor.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2019, IRIS-HEP -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from typing import Callable, Dict, Optional - -from dask.distributed import Client - -from .executor import Executor, run_coffea_processor - - -class DaskExecutor(Executor): - def __init__( - self, - client_addr: Optional[str] = None, - provided_dask_client: Optional[Client] = None, - ): - """Create a Dask executor to process the analysis - - Args: - client_addr (Optional[str]): If `None` then create a local cluster that runs - in-process. Otherwise connect to an already - existing cluster. - provided_dask_client (Optional[Client]): Pass in an initialized Dask Client. - This client must have asynchronous=True. 
- """ - if not provided_dask_client: - self.is_local = not client_addr - - self.dask = ( - Client(threads_per_worker=10, asynchronous=True) - if self.is_local - else Client(client_addr, asynchronous=True) - ) - else: - assert provided_dask_client.asynchronous - self.dask = provided_dask_client - self.is_local = False - - def get_result_file_stream(self, datasource, title): - if self.is_local: - return datasource.stream_result_files(title) - else: - return datasource.stream_result_file_uris(title) - - def run_async_analysis( - self, - file_url: str, - tree_name: Optional[str], - data_type: str, - meta_data: Dict[str, str], - process_func: Callable, - schema, - ): - """Create a dask future for a dask task to run the analysis.""" - data_result = self.dask.submit( - run_coffea_processor, - events_url=file_url, - tree_name=tree_name, - data_type=data_type, - meta_data=meta_data, - proc=process_func, - schema=schema, - ) - - return data_result diff --git a/src/coffea/processor/servicex/data_source.py b/src/coffea/processor/servicex/data_source.py deleted file mode 100644 index 741363e67..000000000 --- a/src/coffea/processor/servicex/data_source.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright (c) 2021, IRIS-HEP -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -from typing import AsyncGenerator, Dict, List, Optional, Tuple - -from func_adl import ObjectStream, find_EventDataset -from servicex import ServiceXDataset, StreamInfoPath, StreamInfoUrl - - -class DataSource: - def __init__( - self, - query: ObjectStream, - metadata: Dict[str, str] = {}, - datasets: List[ServiceXDataset] = [], - ): - self.query = query - self.metadata = metadata - self.schema = None - self.datasets = datasets - - async def _get_query(self) -> str: - """Return the qastle query. - - Note: To do this we have to forward-cast the object: by design, not all `func_adl` - queries are `ServiceX` queries. But this library only works with datasets that are - based in `ServiceX`. Thus some duck typing occurs in this method. 
- """ - event_dataset_ast = find_EventDataset(self.query.query_ast) - event_dataset = event_dataset_ast._eds_object # type: ignore - if not hasattr(event_dataset, "return_qastle"): - raise Exception( - f"Base func_adl query {str(event_dataset)} does not have a way to generate qastle!" - ) - event_dataset.return_qastle = True # type: ignore - return await self.query.value_async() - - async def stream_result_file_uris( - self, title: Optional[str] = None - ) -> AsyncGenerator[Tuple[str, str, StreamInfoUrl], None]: - """Launch all datasources off to servicex - - Yields: - Tuple[str, StreamInfoUrl]: List of data types and url's to process - """ - qastle = await self._get_query() - - # TODO: Make this for loop parallel - for dataset in self.datasets: - data_type = dataset.first_supported_datatype(["parquet", "root"]) - if data_type == "root": - async for file in dataset.get_data_rootfiles_uri_stream( - qastle, title=title, as_signed_url=True - ): - yield (data_type, dataset.dataset_as_name, file) - elif data_type == "parquet": - async for file in dataset.get_data_parquet_uri_stream( - qastle, title=title, as_signed_url=True - ): - yield (data_type, dataset.dataset_as_name, file) - else: - raise Exception( - f"This dataset ({str(dataset)}) supports unknown datatypes" - ) - - async def stream_result_files( - self, title: Optional[str] = None - ) -> AsyncGenerator[Tuple[str, str, StreamInfoPath], None]: - """Launch all datasources at once off to servicex - - Yields: - Tuple[str, StreamInfoPath]: List of data types and file paths to process - """ - qastle = await self._get_query() - - # TODO: Make this for loop parallel - for dataset in self.datasets: - data_type = dataset.first_supported_datatype(["parquet", "root"]) - if data_type == "root": - async for file in dataset.get_data_rootfiles_stream( - qastle, title=title - ): - yield (data_type, dataset.dataset_as_name, file) - elif data_type == "parquet": - async for file in dataset.get_data_parquet_stream(qastle, title=title): - yield (data_type, dataset.dataset_as_name, file) - else: - raise Exception( - f"This dataset ({str(dataset)}) supports unknown datatypes" - ) diff --git a/src/coffea/processor/servicex/executor.py b/src/coffea/processor/servicex/executor.py deleted file mode 100644 index 704f3da8a..000000000 --- a/src/coffea/processor/servicex/executor.py +++ /dev/null @@ -1,183 +0,0 @@ -# Copyright (c) 2019, IRIS-HEP -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from abc import ABC, abstractmethod -from typing import Any, AsyncGenerator, Callable, Dict, Optional, Tuple - -import aiostream -import uproot -from servicex import StreamInfoUrl - -from ..accumulator import async_accumulate - -# from urllib.parse import urlparse, unquote -# from urllib.request import url2pathname - - -class Executor(ABC): - @abstractmethod - def run_async_analysis( - self, - file_url: str, - tree_name: Optional[str], - data_type: str, - meta_data: Dict[str, str], - process_func: Callable, - ): - raise NotImplementedError - - def get_result_file_stream(self, datasource, title: Optional[str] = None): - return datasource.stream_result_file_uris(title) - - async def execute( - self, analysis, datasource, title: Optional[str] = None, schema=None - ): - """ - Launch an analysis against the given dataset on the implementation's task framework - :param analysis: - The analysis to run - :param datasource: - The datasource to run against - :param schema: - The schema to apply to data, defaults to None (will then use auto_schema). - :return: - Stream of up to date histograms. Grows as each result is received - """ - # Stream transformed file references from ServiceX - result_file_stream = self.get_result_file_stream(datasource, title=title) - - # Launch a task against this file - func_results = self.launch_analysis_tasks_from_stream( - result_file_stream, datasource.metadata, analysis.process, schema=schema - ) - - # Wait for all the data to show up - async def inline_wait(r): - "This could be inline, but python 3.6" - x = await r - return x - - finished_events = aiostream.stream.map(func_results, inline_wait, ordered=False) - # Finally, accumulate! - # There is an accumulate pattern in the aiostream lib - async with finished_events.stream() as streamer: - async for results in async_accumulate(streamer): - yield results - - async def launch_analysis_tasks_from_stream( - self, - result_file_stream: AsyncGenerator[Tuple[str, str, StreamInfoUrl], None], - meta_data: Dict[str, str], - process_func: Callable, - schema, - ) -> AsyncGenerator[Any, None]: - """ - Invoke the implementation's task runner on each file from the serviceX stream. - We don't know the file's tree name in advance, so grab a sample the first time - around to inspect the tree name - :param result_file_stream: - :param accumulator: - :param process_func: - :param schema: - The schema to apply to data. 
- :return: - """ - tree_name = None - async for sx_data in result_file_stream: - file_url = sx_data[2].url - sample_md = dict(meta_data, dataset=sx_data[1]) - data_type = sx_data[0] - - # Determine the tree name if we've not gotten it already - if data_type == "root": - if tree_name is None: - with uproot.open({file_url: None}) as sample: - tree_name = sample.keys()[0] - - # Invoke the implementation's task launcher - data_result = self.run_async_analysis( - file_url=file_url, - tree_name=tree_name, - data_type=data_type, - meta_data=sample_md, - process_func=process_func, - schema=schema, - ) - - # Pass this down to the next item in the stream. - yield data_result - - -def run_coffea_processor( - events_url: str, tree_name: Optional[str], proc, data_type, meta_data, schema -): - """ - Process a single file from a tree via a coffea processor on the remote node - :param events_url: - a URL to a ROOT file that uproot4 can open - :param tree_name: - The tree in the ROOT file to use for our data. Can be null if the data isn't a root - tree! - :param accumulator: - Accumulator to store the results - :param proc: - Analysis function to execute. Must have signature - :param data_type: - What datatype is the data (root, parquet?) - :param schema: - The schema to apply to data (if None, will use auto_schema). - :return: - Populated accumulator - """ - # Since we execute remotely, explicitly include everything we need. - from coffea.nanoevents import NanoEventsFactory - - if schema is None: - from coffea.nanoevents.schemas.schema import auto_schema - - schema = auto_schema - - if data_type == "root": - # Use NanoEvents to build a 4-vector - assert tree_name is not None - events = NanoEventsFactory.from_root( - file=str(events_url), - treepath=f"/{tree_name}", - schemaclass=schema, - metadata=dict(meta_data, filename=str(events_url)), - ).events() - elif data_type == "parquet": - events = NanoEventsFactory.from_parquet( - file=str(events_url), - treepath="/", - schemaclass=schema, - metadata=dict(meta_data, filename=str(events_url)), - ).events() - else: - raise Exception(f"Unknown stream data type of {data_type} - cannot process.") - - return proc(events) diff --git a/src/coffea/processor/servicex/local_executor.py b/src/coffea/processor/servicex/local_executor.py deleted file mode 100644 index fd8670c0c..000000000 --- a/src/coffea/processor/servicex/local_executor.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) 2019, IRIS-HEP -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from typing import Callable, Dict, Optional - -from .executor import Executor, run_coffea_processor - - -class LocalExecutor(Executor): - def __init__(self): - pass - - def get_result_file_stream(self, datasource, title): - return datasource.stream_result_files(title) - - def run_async_analysis( - self, - file_url: str, - tree_name: Optional[str], - data_type: str, - meta_data: Dict[str, str], - process_func: Callable, - schema, - ): - # TODO: Do we need a second routine here? Can we just use this one? - return self._async_analysis( - events_url=file_url, - tree_name=tree_name, - data_type=data_type, - meta_data=meta_data, - process_func=process_func, - schema=schema, - ) - - async def _async_analysis( - self, events_url, tree_name, data_type, meta_data, process_func, schema - ): - return run_coffea_processor( - events_url=events_url, - tree_name=tree_name, - data_type=data_type, - meta_data=meta_data, - proc=process_func, - schema=schema, - ) diff --git a/src/coffea/processor/spark/__init__.py b/src/coffea/processor/spark/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/coffea/processor/spark/detail.py b/src/coffea/processor/spark/detail.py deleted file mode 100644 index 6e372bd2b..000000000 --- a/src/coffea/processor/spark/detail.py +++ /dev/null @@ -1,133 +0,0 @@ -from concurrent.futures import ThreadPoolExecutor - -import pyspark.sql -import pyspark.sql.functions as fn -from pyarrow.util import guid -from tqdm import tqdm - -try: - from collections.abc import Sequence -except ImportError: - from collections.abc import Sequence - -from coffea.processor.executor import _futures_handler - -# this is a reasonable local spark configuration -_default_config = ( - pyspark.sql.SparkSession.builder.appName("coffea-analysis-%s" % guid()) - .master("local[*]") - .config("spark.sql.execution.arrow.enabled", "true") - .config("spark.sql.execution.arrow.maxRecordsPerBatch", 200000) -) - - -def _spark_initialize(config=_default_config, **kwargs): - spark_progress = False - if "spark_progress" in kwargs.keys(): - spark_progress = kwargs["spark_progress"] - - cfg_actual = config - # get spark to not complain about missing log configs - cfg_actual = cfg_actual.config( - "spark.driver.extraJavaOptions", "-Dlog4jspark.root.logger=ERROR,console" - ) - if not spark_progress: - cfg_actual = cfg_actual.config("spark.ui.showConsoleProgress", "false") - - kwargs.setdefault("bindAddress", None) - if kwargs["bindAddress"] is not None: - cfg_actual = cfg_actual.config( - "spark.driver.bindAddress", kwargs["bindAddress"] - ) - kwargs.setdefault("host", None) - if kwargs["host"] is not None: - cfg_actual = cfg_actual.config("spark.driver.host", kwargs["host"]) - - session = cfg_actual.getOrCreate() - sc = session.sparkContext - - if "log_level" in kwargs.keys(): - sc.setLogLevel(kwargs["log_level"]) - else: - sc.setLogLevel("ERROR") - - return session - - -def _read_df( - spark, dataset, files_or_dirs, ana_cols, partitionsize, file_type, treeName -): - flist = 
files_or_dirs - tname = treeName - if isinstance(files_or_dirs, dict): - tname = files_or_dirs["treename"] - flist = files_or_dirs["files"] - if not isinstance(flist, Sequence): - raise ValueError("spark dataset file list must be a Sequence (like list())") - df = ( - spark.read.format(file_type) - .option("tree", tname) - .option("threadCount", "-1") - .load(flist) - ) - count = df.count() - - df_cols = set(df.columns) - cols_in_df = ana_cols.intersection(df_cols) - df = df.select(*cols_in_df) - missing_cols = ana_cols - cols_in_df - for missing in missing_cols: - df = df.withColumn(missing, fn.lit(0.0)) - # compatibility with older pyarrow which doesn't understand array - for col, dtype in df.dtypes: - if dtype == "array": - tempcol = col + "tempbool" - df = df.withColumnRenamed(col, tempcol) - df = df.withColumn(col, df[tempcol].cast("array")).drop(tempcol) - df = df.withColumn("dataset", fn.lit(dataset)) - npartitions = (count // partitionsize) + 1 - actual_partitions = df.rdd.getNumPartitions() - avg_counts = count / actual_partitions - if actual_partitions > 1.50 * npartitions or avg_counts > partitionsize: - df = df.repartition(npartitions) - - return df, dataset, count - - -def _spark_make_dfs( - spark, - fileset, - partitionsize, - columns, - thread_workers, - file_type, - treeName, - status=True, -): - dfs = {} - ana_cols = set(columns) - - with ThreadPoolExecutor(max_workers=thread_workers) as executor: - futures = { - executor.submit( - _read_df, spark, ds, files, ana_cols, partitionsize, file_type, treeName - ) - for ds, files in fileset.items() - } - - for df, ds, count in tqdm( - _futures_handler(futures, timeout=None), - disable=not status, - unit="dataset", - total=len(fileset), - desc="loading", - ): - dfs[ds] = (df, count) - - return dfs - - -def _spark_stop(spark): - # this may do more later? 
- spark._jvm.SparkSession.clearActiveSession() - spark.stop() diff --git a/src/coffea/processor/spark/spark_executor.py b/src/coffea/processor/spark/spark_executor.py deleted file mode 100644 index 0db32f475..000000000 --- a/src/coffea/processor/spark/spark_executor.py +++ /dev/null @@ -1,195 +0,0 @@ -import pickle # noqa: F401 -from concurrent.futures import ThreadPoolExecutor - -import awkward # noqa: F401 -import lz4.frame # noqa: F401 - -# must preload these for exec calls -import numpy # noqa: F401 -import pandas # noqa: F401 -import pyspark.sql.functions as fn -from jinja2 import Environment, PackageLoader, select_autoescape -from pyspark.sql.types import StringType # noqa: F401 -from pyspark.sql.types import BinaryType, StructField, StructType -from tqdm import tqdm - -from coffea.nanoevents import NanoEventsFactory, schemas # noqa: F401 -from coffea.nanoevents.mapping import SimplePreloadedColumnSource # noqa: F401 -from coffea.processor.accumulator import accumulate -from coffea.processor.executor import _decompress, _futures_handler, _reduce - -lz4_clevel = 1 - - -# this is a UDF that takes care of summing histograms across -# various spark results where the outputs are histogram blobs -def agg_histos_raw(series, lz4_clevel): - goodlines = series[series.str.len() > 0] - if goodlines.size == 1: # short-circuit trivial aggregations - return goodlines[0] - return _reduce(lz4_clevel)(goodlines) - - -@fn.pandas_udf(BinaryType()) -def agg_histos(series: pandas.Series) -> bytes: - global lz4_clevel - return agg_histos_raw(series, lz4_clevel) - - -def reduce_histos_raw(df, lz4_clevel): - histos = df["histos"] - outhist = _reduce(lz4_clevel)(histos[histos.str.len() > 0]) - return pandas.DataFrame(data={"histos": numpy.array([outhist], dtype="O")}) - - -@fn.pandas_udf( - StructType([StructField("histos", BinaryType(), True)]), -) -def reduce_histos(df: pandas.DataFrame) -> pandas.DataFrame: - global lz4_clevel - return reduce_histos_raw(df, lz4_clevel) - - -def _get_ds_bistream(item): - global lz4_clevel - ds, bitstream = item - if bitstream is None: - raise Exception( - "No pandas dataframe returned from spark in dataset: %s, something went wrong!" - % ds - ) - if bitstream.empty: - raise Exception( - "The histogram list returned from spark is empty in dataset: %s, something went wrong!" 
- % ds - ) - out = bitstream[bitstream.columns[0]][0] - if lz4_clevel is not None: - return _decompress(out) - return out - - -class SparkExecutor: - _template_name = "spark.py.tmpl" - - def __init__(self): - self._cacheddfs = None - self._counts = None - self._env = Environment( - loader=PackageLoader("coffea.processor", "templates"), - autoescape=select_autoescape(["py"]), - ) - - @property - def counts(self): - return self._counts - - def __call__( - self, - spark, - dfslist, - theprocessor, - output, - thread_workers, - use_df_cache, - schema, - status=True, - unit="datasets", - desc="Processing", - ): - # processor needs to be a global - global processor_instance, coffea_udf, nano_schema - processor_instance = theprocessor - if schema is None: - schema = schemas.BaseSchema - if not issubclass(schema, schemas.BaseSchema): - raise ValueError( - "Expected schema to derive from BaseSchema (%s)" - % (str(schema.__name__)) - ) - nano_schema = schema - # get columns from processor - columns = processor_instance.columns - cols_w_ds = ["dataset"] + columns - # make our udf - tmpl = self._env.get_template(self._template_name) - render = tmpl.render(cols=columns) - print(render) - exec(render) - - # cache the input datasets if it's not already done - if self._counts is None: - self._counts = {} - # go through each dataset and thin down to the columns we want - for ds, (df, counts) in dfslist.items(): - self._counts[ds] = counts - - if self._cacheddfs is None: - self._cacheddfs = {} - cachedesc = "caching" if use_df_cache else "pruning" - with ThreadPoolExecutor(max_workers=thread_workers) as executor: - futures = set() - for ds, (df, counts) in dfslist.items(): - futures.add( - executor.submit( - self._pruneandcache_data, ds, df, cols_w_ds, use_df_cache - ) - ) - gen = _futures_handler(futures, timeout=None) - try: - for ds, df in tqdm( - gen, - disable=not status, - unit=unit, - total=len(dfslist), - desc=cachedesc, - ): - self._cacheddfs[ds] = df - finally: - gen.close() - - with ThreadPoolExecutor(max_workers=thread_workers) as executor: - futures = set() - for ds, df in self._cacheddfs.items(): - co_udf = coffea_udf - futures.add( - executor.submit(self._launch_analysis, ds, df, co_udf, cols_w_ds) - ) - gen = _futures_handler(futures, timeout=None) - try: - output = accumulate( - tqdm( - map(_get_ds_bistream, gen), - disable=not status, - unit=unit, - total=len(self._cacheddfs), - desc=desc, - ), - output, - ) - finally: - gen.close() - - return output - - def _pruneandcache_data(self, ds, df, columns, cacheit): - if cacheit: - return ds, df.select(*columns).cache() - return ds, df.select(*columns) - - def _launch_analysis(self, ds, df, udf, columns): - histo_map_parts = (df.rdd.getNumPartitions() // 20) + 1 - return ( - ds, - df.select(udf(*columns).alias("histos")) - .withColumn("hpid", fn.spark_partition_id() % histo_map_parts) - .repartition(histo_map_parts, "hpid") - .groupBy("hpid") - .apply(reduce_histos) - .groupBy() - .agg(agg_histos("histos")) - .toPandas(), - ) - - -spark_executor = SparkExecutor() diff --git a/src/coffea/processor/templates/__init__.py b/src/coffea/processor/templates/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/coffea/processor/templates/spark.py.tmpl b/src/coffea/processor/templates/spark.py.tmpl deleted file mode 100644 index 5e6e5f818..000000000 --- a/src/coffea/processor/templates/spark.py.tmpl +++ /dev/null @@ -1,24 +0,0 @@ -global coffea_udf - - -def coffea_udf(dataset: pd.Series, {% for col in cols %}{{col + ": 
pd.Series"}}{{ "," if not loop.last }}{% endfor %}): - global processor_instance, lz4_clevel, nano_schema - - columns = [{% for col in cols %}awkward.Array({{col}}){{ "," if not loop.last }}{% endfor %}] - names = [{% for col in cols %}{{"'"|safe+col+"'"|safe}}{{ "," if not loop.last }}{% endfor %}] - - size = len(dataset) - src = SimplePreloadedColumnSource(dict(zip(names, columns)), None, size, object_path='/Events') - - events = NanoEventsFactory \ - .from_preloaded(src, metadata={'dataset': dataset[0]}, schemaclass=nano_schema) \ - .events() - - vals = processor_instance.process(events) - - valsblob = lz4.frame.compress(pickle.dumps(vals), compression_level=lz4_clevel) - - outs = numpy.full(shape=(size, ), fill_value=b'', dtype='O') - outs[0] = valsblob - - return pandas.Series(outs) diff --git a/tests/test_local_executors.py b/tests/test_local_executors.py deleted file mode 100644 index ccc35f5f4..000000000 --- a/tests/test_local_executors.py +++ /dev/null @@ -1,125 +0,0 @@ -import os.path as osp -import sys - -import pytest - -from coffea import processor -from coffea.nanoevents import schemas -from coffea.processor.executor import UprootMissTreeError - -if sys.platform.startswith("win"): - pytest.skip("skipping tests that only function in linux", allow_module_level=True) - - -@pytest.mark.parametrize("filetype", ["root", "parquet"]) -@pytest.mark.parametrize("skipbadfiles", [True, False]) -@pytest.mark.parametrize("maxchunks", [1, None]) -@pytest.mark.parametrize("chunksize", [100000, 5]) -@pytest.mark.parametrize("schema", [None, schemas.BaseSchema]) -@pytest.mark.parametrize( - "executor", [processor.IterativeExecutor] # , processor.FuturesExecutor -) -def test_dataframe_analysis( - executor, schema, chunksize, maxchunks, skipbadfiles, filetype -): - from coffea.processor.test_items import NanoTestProcessor - - if schema is not None and filetype == "parquet": - pytest.xfail("parquet nanoevents not supported yet") - - filelist = { - "ZJets": {"files": [osp.abspath(f"tests/samples/nano_dy.{filetype}")]}, - "Data": {"files": [osp.abspath(f"tests/samples/nano_dimuon.{filetype}")]}, - } - - executor = executor() - run = processor.Runner( - executor=executor, - schema=schema, - chunksize=chunksize, - maxchunks=maxchunks, - skipbadfiles=skipbadfiles, - format=filetype, - ) - - hists = run(filelist, "Events", processor_instance=NanoTestProcessor()) - - if maxchunks is None: - assert hists["cutflow"]["ZJets_pt"] == 18 - assert hists["cutflow"]["ZJets_mass"] == 6 - assert hists["cutflow"]["Data_pt"] == 84 - assert hists["cutflow"]["Data_mass"] == 66 - else: - assert maxchunks == 1 - print(hists["cutflow"]["ZJets_pt"]) - assert hists["cutflow"]["ZJets_pt"] == (18 if chunksize == 100_000 else 2) - assert hists["cutflow"]["ZJets_mass"] == (6 if chunksize == 100_000 else 1) - assert hists["cutflow"]["Data_pt"] == (84 if chunksize == 100_000 else 13) - assert hists["cutflow"]["Data_mass"] == (66 if chunksize == 100_000 else 12) - - -@pytest.mark.parametrize("filetype", ["root", "parquet"]) -@pytest.mark.parametrize("skipbadfiles", [True, False]) -@pytest.mark.parametrize("maxchunks", [None, 1000]) -@pytest.mark.parametrize("compression", [None, 0, 2]) -@pytest.mark.parametrize( - "executor", [processor.IterativeExecutor] # , processor.FuturesExecutor -) -def test_nanoevents_analysis(executor, compression, maxchunks, skipbadfiles, filetype): - from coffea.processor.test_items import NanoEventsProcessor - - if filetype == "parquet": - pytest.xfail("parquet nanoevents not supported yet") - - 
filelist = { - "DummyBadMissingFile": { - "treename": "Events", - "files": [osp.abspath(f"tests/samples/non_existent.{filetype}")], - }, - "ZJetsBadMissingTree": { - "treename": "NotEvents", - "files": [ - osp.abspath(f"tests/samples/nano_dy.{filetype}"), - osp.abspath(f"tests/samples/nano_dy_SpecialTree.{filetype}"), - ], - }, - "ZJetsBadMissingTreeAllFiles": { - "treename": "NotEvents", - "files": [osp.abspath(f"tests/samples/nano_dy.{filetype}")], - }, - "ZJets": { - "treename": "Events", - "files": [osp.abspath(f"tests/samples/nano_dy.{filetype}")], - "metadata": {"checkusermeta": True, "someusermeta": "hello"}, - }, - "Data": { - "treename": "Events", - "files": [osp.abspath(f"tests/samples/nano_dimuon.{filetype}")], - "metadata": {"checkusermeta": True, "someusermeta2": "world"}, - }, - } - - executor = executor(compression=compression) - run = processor.Runner( - executor=executor, - skipbadfiles=skipbadfiles, - schema=processor.NanoAODSchema, - maxchunks=maxchunks, - format=filetype, - ) - - if skipbadfiles: - hists = run(filelist, "Events", processor_instance=NanoEventsProcessor()) - assert hists["cutflow"]["ZJets_pt"] == 18 - assert hists["cutflow"]["ZJets_mass"] == 6 - assert hists["cutflow"]["ZJetsBadMissingTree_pt"] == 18 - assert hists["cutflow"]["ZJetsBadMissingTree_mass"] == 6 - assert hists["cutflow"]["Data_pt"] == 84 - assert hists["cutflow"]["Data_mass"] == 66 - - else: - LookForError = (FileNotFoundError, UprootMissTreeError) - with pytest.raises(LookForError): - hists = run(filelist, "Events", processor_instance=NanoEventsProcessor()) - with pytest.raises(LookForError): - hists = run(filelist, "NotEvents", processor_instance=NanoEventsProcessor()) diff --git a/tests/test_workitem.py b/tests/test_workitem.py deleted file mode 100644 index 205c4d161..000000000 --- a/tests/test_workitem.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python3 - -from coffea.processor.executor import WorkItem - - -def test_work_item(): - item1 = WorkItem("TestDataSet", "/a/b/c.root", "Events", 500, 670, "abc", {}) - item2 = WorkItem( - "TestDataSet", "/a/b/c.root", "Events", 500, 670, "abc", {"meta": "data"} - ) - item3 = WorkItem("TestDataSet", "/a/b/c.root", "Events", 500, 760, "abc", {}) - - assert item1 == item1 - assert item1 == item2 - assert item1 != item3 - assert item1.dataset == "TestDataSet" - assert item1.filename == "/a/b/c.root" - assert item1.treename == "Events" - assert item1.entrystart == 500 - assert item1.entrystop == 670 - assert item1.fileuuid == "abc" - assert len(item1) == 670 - 500 - assert len(item3) == 760 - 500 - - # Test if hashable - hash(item2) - - # Test if usermeta is mutable - item1.usermeta["user"] = "meta" diff --git a/tests/wq.py b/tests/wq.py deleted file mode 100755 index 888cd4b01..000000000 --- a/tests/wq.py +++ /dev/null @@ -1,66 +0,0 @@ -import sys - -try: - import work_queue as wq - - work_queue_port = 9123 -except ImportError: - print("work_queue is not installed. 
Omitting test.") - sys.exit(0) - - -def template_analysis(environment_file, filelist, executor): - from coffea.processor import Runner - from coffea.processor.test_items import NanoTestProcessor - - executor = executor( - environment_file=environment_file, - cores=2, - memory=500, # MB - disk=1000, # MB - manager_name="coffea_test", - port=work_queue_port, - print_stdout=True, - ) - - run = Runner(executor) - - hists = run(filelist, "Events", NanoTestProcessor()) - - print(hists) - assert hists["cutflow"]["ZJets_pt"] == 18 - assert hists["cutflow"]["ZJets_mass"] == 6 - assert hists["cutflow"]["Data_pt"] == 84 - assert hists["cutflow"]["Data_mass"] == 66 - - -def work_queue_example(environment_file): - from coffea.processor import WorkQueueExecutor - - # Work Queue does not allow absolute paths - filelist = { - "ZJets": ["./samples/nano_dy.root"], - "Data": ["./samples/nano_dimuon.root"], - } - - workers = wq.Factory( - batch_type="local", manager_host_port=f"localhost:{work_queue_port}" - ) - workers.max_workers = 1 - workers.min_workers = 1 - workers.cores = 4 - workers.memory = 1000 # MB - workers.disk = 4000 # MB - - with workers: - template_analysis(environment_file, filelist, WorkQueueExecutor) - - -if __name__ == "__main__": - try: - # see https://coffeateam.github.io/coffea/wq.html for constructing an - # environment that can be shipped with a task. - environment_file = sys.argv[1] - except IndexError: - environment_file = None - work_queue_example(environment_file) From 707195eecbaf8117aaf39591bd9ebc5164619e1d Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 21 Nov 2023 17:01:29 -0600 Subject: [PATCH 27/80] disable workqueue tests (to be replaced with taskvine) --- .github/workflows/ci.yml | 54 ++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ea61615a7..a5263f459 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -126,33 +126,33 @@ jobs: env: GH_PAT: ${{ secrets.GITHUB_OAUTH }} - testwq: - runs-on: ubuntu-latest - needs: pre-commit - strategy: - matrix: - python-version: ["3.11"] - name: test coffea-workqueue - - steps: - - uses: actions/checkout@v4 - - name: Set up Conda - uses: conda-incubator/setup-miniconda@v2 - env: - ACTIONS_ALLOW_UNSECURE_COMMANDS: true - with: - auto-update-conda: true - python-version: ${{ matrix.python-version }} - channels: conda-forge - - name: Test work_queue - shell: bash -l {0} - run: | - conda create --yes --name coffea-env -c conda-forge python=${{ matrix.python-version }} ndcctools dill conda-pack conda - conda activate coffea-env - python -m pip install --ignore-installed . - cd tests - conda-pack --output coffea-env.tar.gz - python wq.py coffea-env.tar.gz +# testwq: +# runs-on: ubuntu-latest +# needs: pre-commit +# strategy: +# matrix: +# python-version: ["3.11"] +# name: test coffea-workqueue +# +# steps: +# - uses: actions/checkout@v4 +# - name: Set up Conda +# uses: conda-incubator/setup-miniconda@v2 +# env: +# ACTIONS_ALLOW_UNSECURE_COMMANDS: true +# with: +# auto-update-conda: true +# python-version: ${{ matrix.python-version }} +# channels: conda-forge +# - name: Test work_queue +# shell: bash -l {0} +# run: | +# conda create --yes --name coffea-env -c conda-forge python=${{ matrix.python-version }} ndcctools dill conda-pack conda +# conda activate coffea-env +# python -m pip install --ignore-installed . 
+# cd tests +# conda-pack --output coffea-env.tar.gz +# python wq.py coffea-env.tar.gz # testskyhookjob: # runs-on: ubuntu-latest From 2c65b36ce3ff16b874998f3895d1d1334e817f5b Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 21 Nov 2023 17:08:05 -0600 Subject: [PATCH 28/80] patch up ci --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a5263f459..5534495d4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -201,7 +201,7 @@ jobs: password: ${{ secrets.PYPI_TOKEN }} pass: - needs: [test, testwq] + needs: [test] runs-on: ubuntu-latest steps: - run: echo "All jobs passed" From cc22be89ae54dbc41758f0706460a910eed88901 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 21 Nov 2023 17:09:56 -0600 Subject: [PATCH 29/80] more testwq removal --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5534495d4..cc635a4a6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -178,7 +178,7 @@ jobs: release: if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') runs-on: ubuntu-latest - needs: [test, testwq] + needs: [test] strategy: matrix: python-version: ["3.11"] From dfed4b9282eaf78cd59bf0f1a5f39606955cf8ed Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 21 Nov 2023 17:46:47 -0600 Subject: [PATCH 30/80] adjustments to removing executor / lazydataframe --- src/coffea/processor/__init__.py | 4 +- tests/test_parsl.py | 144 ------------------------------- tests/test_processor.py | 117 ------------------------- tests/test_spark.py | 136 ----------------------------- 4 files changed, 3 insertions(+), 398 deletions(-) delete mode 100644 tests/test_parsl.py delete mode 100644 tests/test_spark.py diff --git a/src/coffea/processor/__init__.py b/src/coffea/processor/__init__.py index 91b6f357d..62b533d31 100644 --- a/src/coffea/processor/__init__.py +++ b/src/coffea/processor/__init__.py @@ -2,11 +2,13 @@ """ -from .accumulator import AccumulatorABC, dict_accumulator +from .accumulator import AccumulatorABC, accumulate, dict_accumulator, value_accumulator from .processor import ProcessorABC __all__ = [ "dict_accumulator", + "value_accumulator", + "accumulate", "AccumulatorABC", "ProcessorABC", ] diff --git a/tests/test_parsl.py b/tests/test_parsl.py deleted file mode 100644 index c879e80be..000000000 --- a/tests/test_parsl.py +++ /dev/null @@ -1,144 +0,0 @@ -import multiprocessing -import sys - -import pytest - -from coffea import processor - - -def test_parsl_start_stop(): - pytest.importorskip("parsl", minversion="0.7.2") - - from coffea.processor.parsl.detail import ( - _default_cfg, - _parsl_initialize, - _parsl_stop, - ) - - _parsl_initialize(config=_default_cfg) - - _parsl_stop() - - -def do_parsl_job(filelist, flatten=False, compression=0, config=None): - from coffea.processor.test_items import NanoTestProcessor - - executor = processor.ParslExecutor(compression=compression, config=config) - run = processor.Runner(executor=executor) - - hists = run(filelist, "Events", processor_instance=NanoTestProcessor()) - - assert hists["cutflow"]["ZJets_pt"] == 18 - assert hists["cutflow"]["ZJets_mass"] == 6 - assert hists["cutflow"]["Data_pt"] == 84 - assert hists["cutflow"]["Data_mass"] == 66 - - -# @pytest.mark.skipif(sys.platform.startswith('darwin'), reason='parsl htex not working on osx again') -def test_parsl_htex_executor(): - 
pytest.importorskip("parsl", minversion="0.7.2") - import os - import os.path as osp - - import parsl - from parsl.channels import LocalChannel - from parsl.config import Config - from parsl.executors import HighThroughputExecutor - from parsl.providers import LocalProvider - - parsl_config = Config( - executors=[ - HighThroughputExecutor( - label="coffea_parsl_default", - address="127.0.0.1", - cores_per_worker=max(multiprocessing.cpu_count() // 2, 1), - max_workers=1, - provider=LocalProvider( - channel=LocalChannel(), - init_blocks=1, - max_blocks=1, - nodes_per_block=1, - ), - ) - ], - strategy=None, - ) - parsl.load(parsl_config) - - filelist = { - "ZJets": [osp.join(os.getcwd(), "tests/samples/nano_dy.root")], - "Data": [osp.join(os.getcwd(), "tests/samples/nano_dimuon.root")], - } - - do_parsl_job(filelist) - do_parsl_job(filelist, compression=1) - - filelist = { - "ZJets": { - "treename": "Events", - "files": [osp.join(os.getcwd(), "tests/samples/nano_dy.root")], - }, - "Data": { - "treename": "Events", - "files": [osp.join(os.getcwd(), "tests/samples/nano_dimuon.root")], - }, - } - - do_parsl_job(filelist) - - -@pytest.mark.skipif( - sys.platform.startswith("win"), reason="signals are different on windows" -) -def test_timeout(): - import signal - - from coffea.processor.parsl.timeout import timeout - - @timeout - def too_long(timeout=None): - import time - - time.sleep(20) - - @timeout - def make_except(timeout=None): - import time - - time.sleep(1) - raise Exception("oops!") - - try: - too_long(timeout=5) - except Exception as e: - assert e.args[0] == "Timeout hit" - - try: - make_except(timeout=20) - except Exception as e: - assert e.args[0] == "oops!" - - # reset alarms for other tests, this is suspicious - signal.alarm(0) - - -def test_parsl_condor_cfg(): - pytest.importorskip("parsl", minversion="0.7.2") - - from coffea.processor.parsl.condor_config import condor_config - - print(condor_config()) - - -def test_parsl_slurm_cfg(): - pytest.importorskip("parsl", minversion="0.7.2") - import os - - x509_proxy = "x509up_u%s" % (os.getuid()) - fname = "/tmp/%s" % x509_proxy - with open(fname, "w+"): - os.utime(fname, None) - - from coffea.processor.parsl.slurm_config import slurm_config - - print(slurm_config()) diff --git a/tests/test_processor.py b/tests/test_processor.py index 732762b08..b5d836b67 100644 --- a/tests/test_processor.py +++ b/tests/test_processor.py @@ -1,6 +1,3 @@ -import os.path as osp -import sys - import pytest @@ -28,117 +25,3 @@ def postprocess(self, accumulator): acc = None super(test, proc).postprocess(acc) - - -@pytest.mark.skipif( - sys.platform.startswith("win"), reason="problems with paths on windows" -) -def test_lazy_dataframe(): - import uproot - - from coffea.processor import LazyDataFrame - - tree = uproot.open(osp.abspath("tests/samples/nano_dy.root"))["Events"] - entrystart = 0 - entrystop = 100 - - df = LazyDataFrame(tree, entrystart, entrystop, preload_items=["nMuon"]) - - assert len(df) == 1 - - pt = df["Muon_pt"] - assert len(df) == 2 - df["Muon_pt_up"] = pt * 1.05 - assert len(df) == 3 - assert "Muon_pt" in df.materialized - - assert "Muon_eta" in df.available - - assert df.size == tree.num_entries - - with pytest.raises(KeyError): - df["notthere"] - - -@pytest.mark.skipif( - sys.platform.startswith("win"), reason="problems with paths on windows" -) -def test_lazy_dataframe_getattr(): - import uproot - - from coffea.processor import LazyDataFrame - - tree = uproot.open(osp.abspath("tests/samples/nano_dy.root"))["Events"] - entrystart = 0 - 
entrystop = 100 - - df = LazyDataFrame(tree, entrystart, entrystop, preload_items=["nMuon"]) - - assert len(df) == 1 - - df.Muon_pt - assert len(df) == 2 - assert "Muon_pt" in df.materialized - - assert "Muon_eta" in df.available - - assert df.size == tree.num_entries - - with pytest.raises(AttributeError): - df.notthere - - import copy - - df2 = copy.copy(df) - df2.Muon_pt - - with pytest.raises(AttributeError): - df2.notthere - - -def test_processor_newaccumulator(): - from coffea.processor import ( - IterativeExecutor, - ProcessorABC, - defaultdict_accumulator, - ) - - class Test(ProcessorABC): - def process(self, item): - return {"itemsum": item} - - def postprocess(self, accumulator): - pass - - proc = Test() - - exe = IterativeExecutor() - out = exe( - range(10), - proc.process, - None, - ) - assert out == ({"itemsum": 45}, 0) - - class TestOldStyle(ProcessorABC): - @property - def accumulator(self): - return defaultdict_accumulator(int) - - def process(self, item): - out = self.accumulator.identity() - out["itemsum"] += item - return out - - def postprocess(self, accumulator): - pass - - proc = TestOldStyle() - - exe = IterativeExecutor() - out = exe( - range(10), - proc.process, - proc.accumulator, - ) - assert out[0]["itemsum"] == 45 diff --git a/tests/test_spark.py b/tests/test_spark.py deleted file mode 100644 index 25581213a..000000000 --- a/tests/test_spark.py +++ /dev/null @@ -1,136 +0,0 @@ -import pytest - - -def test_spark_imports(): - pytest.importorskip("pyspark", minversion="3.3.0") - - from coffea.processor.spark.detail import _spark_initialize, _spark_stop - - spark = _spark_initialize(bindAddress="127.0.0.1", host="127.0.0.1") - _spark_stop(spark) - - -@pytest.mark.skip(reason="pyspark executor work currently in progress") -def test_spark_executor(): - pyspark = pytest.importorskip("pyspark", minversion="3.3.0") - import os - import os.path as osp - - import pyspark.sql - from pyarrow.util import guid - - from coffea.nanoevents import schemas - from coffea.processor import run_spark_job - from coffea.processor.spark.detail import _spark_initialize, _spark_stop - - spark_config = ( - pyspark.sql.SparkSession.builder.appName("spark-executor-test-%s" % guid()) - .master("local[*]") - .config("spark.sql.execution.arrow.enabled", "true") - .config("spark.driver.host", "127.0.0.1") - .config("spark.driver.bindAddress", "127.0.0.1") - .config("spark.executor.x509proxyname", "x509_u12409") - .config("spark.sql.execution.arrow.maxRecordsPerBatch", 200000) - ) - - spark = _spark_initialize( - config=spark_config, log_level="ERROR", spark_progress=False - ) - - filelist = { - "ZJets": { - "files": ["file:" + osp.join(os.getcwd(), "tests/samples/nano_dy.root")], - "treename": "Events", - }, - "Data": { - "files": [ - "file:" + osp.join(os.getcwd(), "tests/samples/nano_dimuon.root") - ], - "treename": "Events", - }, - } - - from coffea.processor.spark.spark_executor import spark_executor - from coffea.processor.test_items import NanoEventsProcessor, NanoTestProcessor - - columns = ["nMuon", "Muon_pt", "Muon_eta", "Muon_phi", "Muon_mass", "Muon_charge"] - proc = NanoTestProcessor(columns=columns) - - hists = run_spark_job( - filelist, - processor_instance=proc, - executor=spark_executor, - spark=spark, - thread_workers=1, - executor_args={"file_type": "root"}, - ) - - assert sum(spark_executor.counts.values()) == 80 - assert hists["cutflow"]["ZJets_pt"] == 18 - assert hists["cutflow"]["ZJets_mass"] == 6 - assert hists["cutflow"]["Data_pt"] == 84 - assert 
hists["cutflow"]["Data_mass"] == 66 - - hists = run_spark_job( - filelist, - processor_instance=proc, - executor=spark_executor, - spark=spark, - thread_workers=1, - executor_args={"file_type": "root"}, - ) - - assert sum(spark_executor.counts.values()) == 80 - assert hists["cutflow"]["ZJets_pt"] == 18 - assert hists["cutflow"]["ZJets_mass"] == 6 - assert hists["cutflow"]["Data_pt"] == 84 - assert hists["cutflow"]["Data_mass"] == 66 - - proc = NanoEventsProcessor(columns=columns) - hists = run_spark_job( - filelist, - processor_instance=proc, - executor=spark_executor, - spark=spark, - thread_workers=1, - executor_args={"file_type": "root", "schema": schemas.NanoAODSchema}, - ) - - _spark_stop(spark) - - assert sum(spark_executor.counts.values()) == 80 - assert hists["cutflow"]["ZJets_pt"] == 18 - assert hists["cutflow"]["ZJets_mass"] == 6 - assert hists["cutflow"]["Data_pt"] == 84 - assert hists["cutflow"]["Data_mass"] == 66 - - -def test_spark_hist_adders(): - pytest.importorskip("pyspark", minversion="3.3.0") - - import pickle as pkl - - import lz4.frame as lz4f - import pandas as pd - - from coffea.processor.spark.spark_executor import agg_histos_raw, reduce_histos_raw - from coffea.processor.test_items import NanoTestProcessor - from coffea.util import numpy as np - - proc = NanoTestProcessor() - - one = proc.accumulator - two = proc.accumulator - hlist1 = [lz4f.compress(pkl.dumps(one))] - hlist2 = [lz4f.compress(pkl.dumps(one)), lz4f.compress(pkl.dumps(two))] - harray1 = np.array(hlist1, dtype="O") - harray2 = np.array(hlist2, dtype="O") - - series1 = pd.Series(harray1) - series2 = pd.Series(harray2) - df = pd.DataFrame({"histos": harray2}) - - # correctness of these functions is checked in test_spark_executor - agg_histos_raw(series1, 1) - agg_histos_raw(series2, 1) - reduce_histos_raw(df, 1) From a6619875b02a71bec69f51320938d194ff8425b9 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 21 Nov 2023 18:15:32 -0600 Subject: [PATCH 31/80] accumulator tests --- src/coffea/processor/__init__.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/coffea/processor/__init__.py b/src/coffea/processor/__init__.py index 62b533d31..465fc0ed0 100644 --- a/src/coffea/processor/__init__.py +++ b/src/coffea/processor/__init__.py @@ -2,11 +2,24 @@ """ -from .accumulator import AccumulatorABC, accumulate, dict_accumulator, value_accumulator +from .accumulator import ( + AccumulatorABC, + accumulate, + column_accumulator, + defaultdict_accumulator, + dict_accumulator, + list_accumulator, + set_accumulator, + value_accumulator, +) from .processor import ProcessorABC __all__ = [ + "column_accumulator", + "defaultdict_accumulator", "dict_accumulator", + "list_accumulator", + "set_accumulator", "value_accumulator", "accumulate", "AccumulatorABC", From 080f2af1c09f287a91bf37e6f1fc4601ca29a1c8 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Wed, 22 Nov 2023 11:46:10 -0600 Subject: [PATCH 32/80] also allow Callables in apply_to_fileset --- src/coffea/dataset_tools/apply_processor.py | 25 ++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/src/coffea/dataset_tools/apply_processor.py b/src/coffea/dataset_tools/apply_processor.py index 610bae8fa..798c73623 100644 --- a/src/coffea/dataset_tools/apply_processor.py +++ b/src/coffea/dataset_tools/apply_processor.py @@ -1,9 +1,15 @@ +import copy +from typing import Callable, Union + from coffea.nanoevents import NanoAODSchema, NanoEventsFactory from coffea.processor import ProcessorABC def 
apply_to_one_dataset( - proc: ProcessorABC, dataset, schemaclass=NanoAODSchema, metadata={} + data_manipulation: Union[ProcessorABC, Callable], + dataset, + schemaclass=NanoAODSchema, + metadata={}, ): files = dataset["files"] events = NanoEventsFactory.from_root( @@ -11,13 +17,22 @@ def apply_to_one_dataset( metadata=metadata, schemaclass=schemaclass, ).events() - return proc.process(events) + if isinstance(data_manipulation, ProcessorABC): + return data_manipulation.process(events) + elif isinstance(data_manipulation, Callable): + return data_manipulation(events) + else: + raise ValueError("data_manipulation must either be a ProcessorABC or Callable") -def apply_to_fileset(proc: ProcessorABC, fileset, schemaclass=NanoAODSchema): +def apply_to_fileset( + data_manipulation: Union[ProcessorABC, Callable], fileset, schemaclass=NanoAODSchema +): out = {} for name, dataset in fileset.items(): - metadata = dataset.get("metadata", {}).copy() + metadata = copy.deepcopy(dataset.get("metadata", {})) metadata["dataset"] = name - out[name] = apply_to_one_dataset(proc, dataset, schemaclass, metadata) + out[name] = apply_to_one_dataset( + data_manipulation, dataset, schemaclass, metadata + ) return out From 4a600536bbb5e4f2549aab52ebbf88bdf88e8657 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Mon, 27 Nov 2023 13:43:50 -0600 Subject: [PATCH 33/80] add dataset tools test --- tests/test_dataset_tools.py | 220 ++++++++++++++++++++++++++++++++++++ 1 file changed, 220 insertions(+) create mode 100644 tests/test_dataset_tools.py diff --git a/tests/test_dataset_tools.py b/tests/test_dataset_tools.py new file mode 100644 index 000000000..ca46c0060 --- /dev/null +++ b/tests/test_dataset_tools.py @@ -0,0 +1,220 @@ +import dask +import pytest +from distributed import Client + +from coffea.dataset_tools import apply_to_fileset, max_chunks, preprocess, slice_chunks +from coffea.nanoevents import BaseSchema, NanoAODSchema +from coffea.processor.test_items import NanoEventsProcessor, NanoTestProcessor + +_starting_fileset = { + "ZJets": { + "files": { + "tests/samples/nano_dy.root": { + "object_path": "Events", + "steps": [ + [0, 5], + [5, 10], + [10, 15], + [15, 20], + [20, 25], + [25, 30], + [30, 35], + [35, 40], + ], + } + } + }, + "Data": { + "files": { + "tests/samples/nano_dimuon.root": "Events", + "tests/samples/nano_dimuon_not_there.root": "Events", + } + }, +} + +_runnable_result = { + "ZJets": { + "files": { + "tests/samples/nano_dy.root": { + "object_path": "Events", + "steps": [ + [0, 7], + [7, 14], + [14, 21], + [21, 28], + [28, 35], + [35, 40], + ], + "uuid": "a9490124-3648-11ea-89e9-f5b55c90beef", + } + } + }, + "Data": { + "files": { + "tests/samples/nano_dimuon.root": { + "object_path": "Events", + "steps": [ + [0, 7], + [7, 14], + [14, 21], + [21, 28], + [28, 35], + [35, 40], + ], + "uuid": "a210a3f8-3648-11ea-a29f-f5b55c90beef", + } + } + }, +} + +_updated_result = { + "ZJets": { + "files": { + "tests/samples/nano_dy.root": { + "object_path": "Events", + "steps": [ + [0, 7], + [7, 14], + [14, 21], + [21, 28], + [28, 35], + [35, 40], + ], + "uuid": "a9490124-3648-11ea-89e9-f5b55c90beef", + } + } + }, + "Data": { + "files": { + "tests/samples/nano_dimuon.root": { + "object_path": "Events", + "steps": [ + [0, 7], + [7, 14], + [14, 21], + [21, 28], + [28, 35], + [35, 40], + ], + "uuid": "a210a3f8-3648-11ea-a29f-f5b55c90beef", + }, + "tests/samples/nano_dimuon_not_there.root": { + "object_path": "Events", + "steps": None, + "uuid": None, + }, + } + }, +} + + +@pytest.mark.parametrize( + 
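For orientation, an illustrative sketch (not taken from the patch itself) of the two call styles that apply_to_fileset now supports: a ProcessorABC instance, or any bare callable acting on the events of one dataset. The simple {path: treename} fileset form and the sample file path below are assumptions made for the example.

    import dask

    from coffea.dataset_tools import apply_to_fileset
    from coffea.nanoevents import NanoAODSchema
    from coffea.processor.test_items import NanoEventsProcessor

    fileset = {
        "ZJets": {"files": {"tests/samples/nano_dy.root": "Events"}},
    }

    # 1) the existing route: a ProcessorABC instance
    to_compute = apply_to_fileset(
        NanoEventsProcessor(), fileset, schemaclass=NanoAODSchema
    )

    # 2) the new route: any callable taking the events of one dataset
    to_compute_pt = apply_to_fileset(
        lambda events: events.Muon.pt, fileset, schemaclass=NanoAODSchema
    )

    (out,) = dask.compute(to_compute)         # {"ZJets": <processor output>}
    (muon_pt,) = dask.compute(to_compute_pt)  # {"ZJets": <muon pt array>}
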
"proc_and_schema", + [(NanoTestProcessor, BaseSchema), (NanoEventsProcessor, NanoAODSchema)], +) +def test_apply_to_fileset(proc_and_schema): + proc, schemaclass = proc_and_schema + + with Client() as _: + to_compute = apply_to_fileset( + proc(), + _runnable_result, + schemaclass=schemaclass, + ) + out = dask.compute(to_compute)[0] + + assert out["ZJets"]["cutflow"]["ZJets_pt"] == 18 + assert out["ZJets"]["cutflow"]["ZJets_mass"] == 6 + assert out["Data"]["cutflow"]["Data_pt"] == 84 + assert out["Data"]["cutflow"]["Data_mass"] == 66 + + to_compute = apply_to_fileset( + proc(), + max_chunks(_runnable_result, 1), + schemaclass=schemaclass, + ) + out = dask.compute(to_compute)[0] + + assert out["ZJets"]["cutflow"]["ZJets_pt"] == 5 + assert out["ZJets"]["cutflow"]["ZJets_mass"] == 2 + assert out["Data"]["cutflow"]["Data_pt"] == 17 + assert out["Data"]["cutflow"]["Data_mass"] == 14 + + +def test_preprocess(): + with Client() as _: + starting_fileset = _starting_fileset + + dataset_runnable, dataset_updated = preprocess( + starting_fileset, + maybe_step_size=7, + align_clusters=False, + files_per_batch=10, + skip_bad_files=True, + ) + + assert dataset_runnable == _runnable_result + assert dataset_updated == _updated_result + + +def test_preprocess_failed_file(): + with Client() as _, pytest.raises(FileNotFoundError): + starting_fileset = _starting_fileset + + dataset_runnable, dataset_updated = preprocess( + starting_fileset, + maybe_step_size=7, + align_clusters=False, + files_per_batch=10, + skip_bad_files=False, + ) + + +def test_maxchunks(): + max_chunked = max_chunks(_runnable_result, 3) + + assert max_chunked == { + "ZJets": { + "files": { + "tests/samples/nano_dy.root": { + "object_path": "Events", + "steps": [[0, 7], [7, 14], [14, 21]], + "uuid": "a9490124-3648-11ea-89e9-f5b55c90beef", + } + } + }, + "Data": { + "files": { + "tests/samples/nano_dimuon.root": { + "object_path": "Events", + "steps": [[0, 7], [7, 14], [14, 21]], + "uuid": "a210a3f8-3648-11ea-a29f-f5b55c90beef", + } + } + }, + } + + +def test_slicechunks(): + slice_chunked = slice_chunks(_runnable_result, slice(None, None, 2)) + + assert slice_chunked == { + "ZJets": { + "files": { + "tests/samples/nano_dy.root": { + "object_path": "Events", + "steps": [[0, 7], [14, 21], [28, 35]], + "uuid": "a9490124-3648-11ea-89e9-f5b55c90beef", + } + } + }, + "Data": { + "files": { + "tests/samples/nano_dimuon.root": { + "object_path": "Events", + "steps": [[0, 7], [14, 21], [28, 35]], + "uuid": "a210a3f8-3648-11ea-a29f-f5b55c90beef", + } + } + }, + } From 98a719cb98a8137a8e5cd1909c2c34b9f71777c2 Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Tue, 28 Nov 2023 16:33:32 +0100 Subject: [PATCH 34/80] Make scope of dataset_query less cms-only. 
Edits language --- src/coffea/dataset_tools/dataset_query.py | 54 +++++++++++------------ src/coffea/dataset_tools/rucio_utils.py | 37 +++++++++------- 2 files changed, 49 insertions(+), 42 deletions(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 91b3910c7..a4d53df20 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -67,8 +67,8 @@ def __init__(self): self.last_query = "" self.last_query_tree = None self.last_query_list = None - self.sites_whitelist = None - self.sites_blacklist = None + self.sites_allowlist = None + self.sites_blocklist = None self.sites_regex = None self.last_replicas_results = None @@ -98,7 +98,7 @@ def do_query(self, args): # Your code here with self.console.status(f"Querying rucio for: [bold red]{args}[/]"): outlist, outtree = rucio_utils.query_dataset( - args.arg_list[0], client=self.rucio_client, tree=True + args.arg_list[0], client=self.rucio_client, tree=True, scope="cms" #TODO configure scope ) # Now let's print the results as a tree print_dataset_query(args, outtree, self.selected_datasets, self.console) @@ -176,8 +176,8 @@ def do_replicas(self, args): ): outfiles, outsites, sites_counts = rucio_utils.get_dataset_files_replicas( dataset, - whitelist_sites=self.sites_whitelist, - blacklist_sites=self.sites_blacklist, + allowlist_sites=self.sites_allowlist, + blocklist_sites=self.sites_blocklist, regex_sites=self.sites_regex, mode="full", client=self.rucio_client, @@ -263,22 +263,22 @@ def do_replicas(self, args): T.add(f"[cyan]{f}") self.console.print(tree) - def do_whitelist_sites(self, args): - if self.sites_whitelist is None: - self.sites_whitelist = args.arg_list + def do_allowlist_sites(self, args): + if self.sites_allowlist is None: + self.sites_allowlist = args.arg_list else: - self.sites_whitelist += args.arg_list - print("[green]Whitelisted sites:") - for s in self.sites_whitelist: + self.sites_allowlist += args.arg_list + print("[green]Allowlisted sites:") + for s in self.sites_allowlist: print(f"- {s}") - def do_blacklist_sites(self, args): - if self.sites_blacklist is None: - self.sites_blacklist = args.arg_list + def do_blocklist_sites(self, args): + if self.sites_blocklist is None: + self.sites_blocklist = args.arg_list else: - self.sites_blacklist += args.arg_list - print("[red]Blacklisted sites:") - for s in self.sites_blacklist: + self.sites_blocklist += args.arg_list + print("[red]Blocklisted sites:") + for s in self.sites_blocklist: print(f"- {s}") def do_regex_sites(self, args): @@ -291,20 +291,20 @@ def do_regex_sites(self, args): def do_sites_filters(self, args): if args == "": - print("[green bold]Whitelisted sites:") - if self.sites_whitelist: - for s in self.sites_whitelist: + print("[green bold]Allow-listed sites:") + if self.sites_allowlist: + for s in self.sites_allowlist: print(f"- {s}") - print("[bold red]Blacklisted sites:") - if self.sites_blacklist: - for s in self.sites_blacklist: + print("[bold red]Block-listed sites:") + if self.sites_blocklist: + for s in self.sites_blocklist: print(f"- {s}") print(f"[bold cyan]Sites regex: [italics]{self.sites_regex}") if args == "clear": - self.sites_whitelist = None - self.sites_blacklist = None + self.sites_allowlist = None + self.sites_blocklist = None self.sites_regex = None print("[bold green]Sites filters cleared") @@ -353,8 +353,8 @@ def do_save(self, args): - [bold cyan]list_replicas (LR) index[/]: Print the selected files replicas for the selected dataset - [bold 
cyan]sites_filters[/]: show the active sites filters - [bold cyan]sites_filters clear[/]: clear all the active sites filters - - [bold cyan]whitelist_sites[/]: Select sites to whitelist for replica queries - - [bold cyan]blacklist_sites[/]: Select sites to blacklist for replica queries + - [bold cyan]allowlist_sites[/]: Select sites to allowlist them for replica queries + - [bold cyan]blocklist_sites[/]: Select sites to blocklist them for replica queries - [bold cyan]regex_sites[/]: Select sites with a regex for replica queries: please wrap the regex like "T[123]_(FR|IT|BE|CH|DE)_\w+" - [bold cyan]save (S) file.yaml[/]: Save the replicas results to file for further processing - [bold cyan]help[/]: get help! diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index 96b17b454..f4a3f3a7d 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -4,12 +4,17 @@ import re import subprocess from collections import defaultdict +import tomli from rucio.client import Client # Rucio needs the default configuration --> taken from CMS cvmfs defaults -os.environ["RUCIO_HOME"] = "/cvmfs/cms.cern.ch/rucio/current" +if not "RUCIO_HOME" in os.environ: + os.environ["RUCIO_HOME"] = "/cvmfs/cms.cern.ch/rucio/current" +# with open(f"{os.environ['RUCIO_HOME']}/etc/rucio.cfg", "rb") as f: +# rucio_cfg = tomli.load(f) +# print(rucio_cfg) def get_proxy_path() -> str: """ @@ -125,19 +130,20 @@ def _get_pfn_for_site(path, rules): def get_dataset_files_replicas( dataset, - whitelist_sites=None, - blacklist_sites=None, + allowlist_sites=None, + blocklist_sites=None, regex_sites=None, mode="full", client=None, + scope="cms" ): """ This function queries the Rucio server to get information about the location of all the replicas of the files in a CMS dataset. The sites can be filtered in 3 different ways: - - `whilist_sites`: list of sites to select from. If the file is not found there, raise an Exception. - - `blacklist_sites`: list of sites to avoid. If the file has no left site, raise an Exception + - `allowlist_sites`: list of sites to select from. If the file is not found there, raise an Exception. + - `blocklist_sites`: list of sites to avoid. If the file has no left site, raise an Exception - `regex_sites`: regex expression to restrict the list of sites. 
The fileset returned by the function is controlled by the `mode` parameter: @@ -150,11 +156,12 @@ def get_dataset_files_replicas( ---------- dataset: str - whilelist_sites: list - blacklist_sites: list + allowlist_sites: list + blocklist_sites: list regex_sites: list mode: str, default "full" client: rucio Client, optional + scope: rucio scope, "cms" Returns ------- @@ -176,13 +183,13 @@ def get_dataset_files_replicas( client = client if client else get_rucio_client() outsites = [] outfiles = [] - for filedata in client.list_replicas([{"scope": "cms", "name": dataset}]): + for filedata in client.list_replicas([{"scope": scope, "name": dataset}]): outfile = [] outsite = [] rses = filedata["rses"] found = False - if whitelist_sites: - for site in whitelist_sites: + if allowlist_sites: + for site in allowlist_sites: if site in rses: # Check actual availability meta = filedata["pfns"][rses[site][0]] @@ -201,13 +208,13 @@ def get_dataset_files_replicas( if not found: raise Exception( - f"No SITE available in the whitelist for file {filedata['name']}" + f"No SITE available in the allowlist for file {filedata['name']}" ) else: possible_sites = list(rses.keys()) - if blacklist_sites: + if blocklist_sites: possible_sites = list( - filter(lambda key: key not in blacklist_sites, possible_sites) + filter(lambda key: key not in blocklist_sites, possible_sites) ) if len(possible_sites) == 0: @@ -275,11 +282,11 @@ def get_dataset_files_replicas( return outfiles, outsites, sites_counts -def query_dataset(query, client=None, tree=False): +def query_dataset(query, client=None, tree=False, scope="cms"): client = client if client else get_rucio_client() out = list( client.list_dids( - scope="cms", filters={"name": query, "type": "container"}, long=False + scope=scope, filters={"name": query, "type": "container"}, long=False ) ) if tree: From 8f669a000933280a6a61a80f5b08959d1e49f5c7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 28 Nov 2023 15:36:21 +0000 Subject: [PATCH 35/80] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/coffea/dataset_tools/dataset_query.py | 5 ++++- src/coffea/dataset_tools/rucio_utils.py | 5 +++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index a4d53df20..46236b805 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -98,7 +98,10 @@ def do_query(self, args): # Your code here with self.console.status(f"Querying rucio for: [bold red]{args}[/]"): outlist, outtree = rucio_utils.query_dataset( - args.arg_list[0], client=self.rucio_client, tree=True, scope="cms" #TODO configure scope + args.arg_list[0], + client=self.rucio_client, + tree=True, + scope="cms", # TODO configure scope ) # Now let's print the results as a tree print_dataset_query(args, outtree, self.selected_datasets, self.console) diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index f4a3f3a7d..2df9a62ec 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -4,8 +4,8 @@ import re import subprocess from collections import defaultdict -import tomli +import tomli from rucio.client import Client # Rucio needs the default configuration --> taken from CMS cvmfs defaults @@ -16,6 +16,7 @@ # rucio_cfg = tomli.load(f) # print(rucio_cfg) + def get_proxy_path() 
-> str: """ Checks if the VOMS proxy exists and if it is valid @@ -135,7 +136,7 @@ def get_dataset_files_replicas( regex_sites=None, mode="full", client=None, - scope="cms" + scope="cms", ): """ This function queries the Rucio server to get information about the location From 5e127c25911eea67178e48ec0b8b112fe259f530 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 28 Nov 2023 11:08:29 -0600 Subject: [PATCH 36/80] remove accumulator concept --- src/coffea/analysis_tools.py | 13 +- src/coffea/processor/__init__.py | 18 -- src/coffea/processor/accumulator.py | 380 ---------------------------- src/coffea/processor/processor.py | 2 +- tests/test_accumulators.py | 184 -------------- 5 files changed, 10 insertions(+), 587 deletions(-) delete mode 100644 src/coffea/processor/accumulator.py delete mode 100644 tests/test_accumulators.py diff --git a/src/coffea/analysis_tools.py b/src/coffea/analysis_tools.py index facf14e97..0f88cab27 100644 --- a/src/coffea/analysis_tools.py +++ b/src/coffea/analysis_tools.py @@ -19,7 +19,7 @@ import coffea.util -class WeightStatistics(coffea.processor.AccumulatorABC): +class WeightStatistics: def __init__(self, sumw=0.0, sumw2=0.0, minw=numpy.inf, maxw=-numpy.inf, n=0): self.sumw = sumw self.sumw2 = sumw2 @@ -40,6 +40,13 @@ def add(self, other): self.maxw = max(self.maxw, other.maxw) self.n += other.n + def __add__(self, other): + temp = WeightStatistics(self.sumw, self.sumw2, self.minw, self.maxw, self.n) + return temp.add(other) + + def __iadd__(self, other): + return self.add(other) + class Weights: """Container for event weights and associated systematic shifts @@ -62,7 +69,7 @@ def __init__(self, size, storeIndividual=False): self._weight = None if size is None else numpy.ones(size) self._weights = {} self._modifiers = {} - self._weightStats = coffea.processor.dict_accumulator() + self._weightStats = {} self._storeIndividual = storeIndividual @property @@ -102,8 +109,6 @@ def __add_delayed(self, name, weight, weightUp, weightDown, shift): if self._storeIndividual: self._weights[name] = weight self.__add_variation(name, weight, weightUp, weightDown, shift) - if isinstance(self._weightStats, coffea.processor.dict_accumulator): - self._weightStats = {} self._weightStats[name] = { "sumw": dask_awkward.to_dask_array(weight).sum(), "sumw2": dask_awkward.to_dask_array(weight**2).sum(), diff --git a/src/coffea/processor/__init__.py b/src/coffea/processor/__init__.py index 465fc0ed0..888ed0cf2 100644 --- a/src/coffea/processor/__init__.py +++ b/src/coffea/processor/__init__.py @@ -2,26 +2,8 @@ """ -from .accumulator import ( - AccumulatorABC, - accumulate, - column_accumulator, - defaultdict_accumulator, - dict_accumulator, - list_accumulator, - set_accumulator, - value_accumulator, -) from .processor import ProcessorABC __all__ = [ - "column_accumulator", - "defaultdict_accumulator", - "dict_accumulator", - "list_accumulator", - "set_accumulator", - "value_accumulator", - "accumulate", - "AccumulatorABC", "ProcessorABC", ] diff --git a/src/coffea/processor/accumulator.py b/src/coffea/processor/accumulator.py deleted file mode 100644 index 8ad12dab1..000000000 --- a/src/coffea/processor/accumulator.py +++ /dev/null @@ -1,380 +0,0 @@ -import copy -import operator -from abc import ABCMeta, abstractmethod -from collections import defaultdict -from collections.abc import MutableMapping, MutableSet -from typing import Iterable, Optional, TypeVar, Union - -from dask.base import DaskMethodsMixin - -try: - from typing import Protocol, runtime_checkable # type: ignore 
-except ImportError: - from typing_extensions import Protocol # type: ignore - from typing import runtime_checkable - -import numpy - -T = TypeVar("T") - - -@runtime_checkable -class Addable(Protocol): - def __add__(self: T, other: T) -> T: - ... - - -Accumulatable = Union[Addable, MutableSet, MutableMapping] - - -def add(a: Accumulatable, b: Accumulatable) -> Accumulatable: - """Add two accumulatables together, without altering inputs - - This may make copies in certain situations - """ - if isinstance(a, Addable) and isinstance(b, Addable): - return operator.add(a, b) - if isinstance(a, MutableSet) and isinstance(b, MutableSet): - return operator.or_(a, b) - elif isinstance(a, MutableMapping) and isinstance(b, MutableMapping): - # capture type(X) by shallow copy and clear - # since we don't know the signature of type(X).__init__ - if isinstance(b, type(a)): - out = copy.copy(a) - elif isinstance(a, type(b)): - out = copy.copy(b) - else: - raise ValueError( - f"Cannot add two mappings of incompatible type ({type(a)} vs. {type(b)})" - ) - out.clear() - lhs, rhs = set(a), set(b) - # Keep the order of elements as far as possible - for key in a: - if key in rhs: - out[key] = add(a[key], b[key]) - else: - out[key] = ( - copy.deepcopy(a[key]) - if not isinstance(a[key], DaskMethodsMixin) - else copy.copy(a[key]) - ) - for key in b: - if key not in lhs: - out[key] = ( - copy.deepcopy(b[key]) - if not isinstance(b[key], DaskMethodsMixin) - else copy.copy(b[key]) - ) - return out - raise ValueError( - f"Cannot add accumulators of incompatible type ({type(a)} vs. {type(b)})" - ) - - -def iadd(a: Accumulatable, b: Accumulatable) -> Accumulatable: - """Add two accumulatables together, assuming the first is mutable""" - if isinstance(a, Addable) and isinstance(b, Addable): - return operator.iadd(a, b) - elif isinstance(a, MutableSet) and isinstance(b, MutableSet): - return operator.ior(a, b) - elif isinstance(a, MutableMapping) and isinstance(b, MutableMapping): - if not isinstance(b, type(a)): - raise ValueError( - f"Cannot add two mappings of incompatible type ({type(a)} vs. {type(b)})" - ) - lhs, rhs = set(a), set(b) - # Keep the order of elements as far as possible - for key in a: - if key in rhs: - a[key] = iadd(a[key], b[key]) - for key in b: - if key not in lhs: - a[key] = ( - copy.deepcopy(b[key]) - if not isinstance(b[key], DaskMethodsMixin) - else copy.copy(b[key]) - ) - return a - raise ValueError( - f"Cannot add accumulators of incompatible type ({type(a)} vs. {type(b)})" - ) - - -def accumulate( - items: Iterable[Optional[Accumulatable]], accum: Optional[Accumulatable] = None -) -> Optional[Accumulatable]: - gen = (x for x in items if x is not None) - try: - if accum is None: - accum = next(gen) - # we want to produce a new object so that the input is not mutated - accum = add(accum, next(gen)) - while True: - # subsequent additions can happen in-place, which may be more performant - accum = iadd(accum, next(gen)) - except StopIteration: - pass - return accum - - -async def async_accumulate(result_stream): - output = None - async for results in result_stream: - if output: - output = iadd(output, results) - else: - output = results - yield output - - -class AccumulatorABC(metaclass=ABCMeta): - """Abstract base class for an accumulator - - Accumulators are abstract objects that enable the reduce stage of the typical map-reduce - scaleout that we do in Coffea. One concrete example is a histogram. 
The idea is that an - accumulator definition holds enough information to be able to create an empty accumulator - (the ``identity()`` method) and add two compatible accumulators together (the ``add()`` method). - The former is not strictly necessary, but helps with book-keeping. Here we show an example usage - of a few accumulator types. An arbitrary-depth nesting of dictionary accumulators is supported, much - like the behavior of directories in ROOT hadd. - - After defining an accumulator:: - - from coffea.processor import dict_accumulator, column_accumulator, defaultdict_accumulator - from coffea.hist import Hist, Bin - import numpy as np - - adef = dict_accumulator({ - 'cutflow': defaultdict_accumulator(int), - 'pt': Hist("counts", Bin("pt", "$p_T$", 100, 0, 100)), - 'final_pt': column_accumulator(np.zeros(shape=(0,))), - }) - - Notice that this function does not mutate ``adef``:: - - def fill(n): - ptvals = np.random.exponential(scale=30, size=n) - cut = ptvals > 200. - acc = adef.identity() - acc['cutflow']['pt>200'] += cut.sum() - acc['pt'].fill(pt=ptvals) - acc['final_pt'] += column_accumulator(ptvals[cut]) - return acc - - As such, we can execute it several times in parallel and reduce the result:: - - import concurrent.futures - with concurrent.futures.ThreadPoolExecutor() as executor: - outputs = executor.map(fill, [2000, 2000]) - - combined = sum(outputs, adef.identity()) - - - Derived classes must implement - - ``identity()``: returns a new object of same type as self, - such that ``self + self.identity() == self`` - - ``add(other)``: adds an object of same type as self to self - - Concrete implementations are then provided for ``__add__``, ``__radd__``, and ``__iadd__``. - """ - - @abstractmethod - def identity(self): - """Identity of the accumulator - - A value such that any other value added to it will return - the other value - """ - pass - - @abstractmethod - def add(self, other): - """Add another accumulator to this one in-place""" - pass - - def __add__(self, other): - ret = self.identity() - ret.add(self) - ret.add(other) - return ret - - def __radd__(self, other): - ret = self.identity() - ret.add(other) - ret.add(self) - return ret - - def __iadd__(self, other): - self.add(other) - return self - - -class value_accumulator(AccumulatorABC): - """Holds a value of arbitrary type - - Parameters - ---------- - default_factory : callable - a function that returns an instance of the desired identity value - initial : bool, optional - an initial value, if the identity is not the desired initial value - """ - - def __init__(self, default_factory, initial=None): - self.value = default_factory() if initial is None else initial - self.default_factory = default_factory - - def __repr__(self): - if type(self.default_factory) is type: - defrepr = self.default_factory.__name__ - else: - defrepr = repr(self.default_factory) - return f"value_accumulator({defrepr}, {self.value!r})" - - def identity(self): - return value_accumulator(self.default_factory) - - def add(self, other): - if isinstance(other, value_accumulator): - self.value = self.value + other.value - else: - self.value = self.value + other - - -class list_accumulator(list, AccumulatorABC): - """A list with accumulator semantics - - See `list` for further info - """ - - def identity(self): - return list() - - def add(self, other): - """Add another accumulator to this one in-place""" - if isinstance(other, list): - list.extend(self, other) - else: - raise ValueError - - -class set_accumulator(set, AccumulatorABC): - """A set 
with accumulator semantics - - See `set` for further info - """ - - def identity(self): - return set_accumulator() - - def add(self, other): - """Add another accumulator to this one in-place - - Note - ---- - This replaces `set.add` behavior, unfortunately. - A workaround is to use `set.update`, e.g. ``a.update({'val'})`` - """ - if isinstance(other, MutableSet): - set.update(self, other) - else: - set.add(self, other) - - -class dict_accumulator(dict, AccumulatorABC): - """A dictionary with accumulator semantics - - See `dict` for further info. - It is assumed that the contents of the dict have accumulator semantics. - """ - - def identity(self): - ret = dict_accumulator() - for key, value in self.items(): - ret[key] = value.identity() - return ret - - def add(self, other): - if isinstance(other, MutableMapping): - for key, value in other.items(): - if key not in self: - if isinstance(value, AccumulatorABC): - self[key] = value.identity() - else: - raise ValueError - self[key] += value - else: - raise ValueError - - -class defaultdict_accumulator(defaultdict, AccumulatorABC): - """A defaultdict with accumulator semantics - - See `collections.defaultdict` for further info. - It is assumed that the contents of the dict have accumulator semantics - """ - - def identity(self): - return defaultdict_accumulator(self.default_factory) - - def add(self, other): - for key, value in other.items(): - self[key] += value - - -class column_accumulator(AccumulatorABC): - """An appendable numpy ndarray - - Parameters - ---------- - value : numpy.ndarray - The identity value array, which should be an empty ndarray - with the desired row shape. The column dimension will correspond to - the first index of `value` shape. - - Examples - -------- - If a set of accumulators is defined as:: - - a = column_accumulator(np.array([])) - b = column_accumulator(np.array([1., 2., 3.])) - c = column_accumulator(np.array([4., 5., 6.])) - - then: - - >>> a + b - column_accumulator(array([1., 2., 3.])) - >>> c + b + a - column_accumulator(array([4., 5., 6., 1., 2., 3.])) - """ - - def __init__(self, value): - if not isinstance(value, numpy.ndarray): - raise ValueError("column_accumulator only works with numpy arrays") - self._empty = numpy.zeros(dtype=value.dtype, shape=(0,) + value.shape[1:]) - self._value = value - - def __repr__(self): - return "column_accumulator(%r)" % self.value - - def identity(self): - return column_accumulator(self._empty) - - def add(self, other): - if not isinstance(other, column_accumulator): - raise ValueError("column_accumulator cannot be added to %r" % type(other)) - if other._empty.shape != self._empty.shape: - raise ValueError( - "Cannot add two column_accumulator objects of dissimilar shape (%r vs %r)" - % (self._empty.shape, other._empty.shape) - ) - self._value = numpy.concatenate((self._value, other._value)) - - @property - def value(self): - """The current value of the column - - Returns a numpy array where the first dimension is the column dimension - """ - return self._value diff --git a/src/coffea/processor/processor.py b/src/coffea/processor/processor.py index edc2d6162..6cbc427a8 100644 --- a/src/coffea/processor/processor.py +++ b/src/coffea/processor/processor.py @@ -22,7 +22,7 @@ def __init__(self, flag=False): self._flag = flag def process(self, events): - out = {"sumw": len(events)} + out = {"sumw": ak.num(events, axis=0)} # ... 
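With AccumulatorABC removed, processor outputs in this series become plain Python mappings whose values are ordinary (often dask-backed) objects that are combined by normal addition rather than by accumulator subclasses. The short sketch below is illustrative only and is not part of any patch in this series; merge_outputs and its inputs are hypothetical, and it assumes every value supports "+".

# Hypothetical sketch (not from these patches): combine two plain-dict
# processor outputs by adding their values key-by-key.
def merge_outputs(a, b):
    merged = dict(a)
    for key, value in b.items():
        merged[key] = merged[key] + value if key in merged else value
    return merged

print(merge_outputs({"sumw": 40, "nevents": 5}, {"sumw": 35, "nevents": 3}))
# {'sumw': 75, 'nevents': 8}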
diff --git a/tests/test_accumulators.py b/tests/test_accumulators.py deleted file mode 100644 index ccf5d1d11..000000000 --- a/tests/test_accumulators.py +++ /dev/null @@ -1,184 +0,0 @@ -from collections import defaultdict -from functools import partial - -import numpy as np -import pytest - -from coffea import processor - - -def test_accumulators(): - a = processor.value_accumulator(float) - a += 3.0 - assert a.value == 3.0 - assert a.identity().value == 0.0 - - a = processor.value_accumulator(partial(np.array, [2.0])) - a += 3.0 - assert np.array_equal(a.value, np.array([5.0])) - assert np.array_equal(a.identity().value, np.array([2.0])) - - lacc = processor.list_accumulator(range(4)) - lacc += [3] - lacc += processor.list_accumulator([1, 2]) - assert lacc == [0, 1, 2, 3, 3, 1, 2] - - b = processor.set_accumulator({"apples", "oranges"}) - b += {"pears"} - b += "grapes" - assert b == {"apples", "oranges", "pears", "grapes"} - - c = processor.dict_accumulator({"num": a, "fruit": b}) - c["num"] += 2.0 - c += processor.dict_accumulator( - { - "num2": processor.value_accumulator(int), - "fruit": processor.set_accumulator({"apples", "cherries"}), - } - ) - assert c["num2"].value == 0 - assert np.array_equal(c["num"].value, np.array([7.0])) - assert c["fruit"] == {"apples", "oranges", "pears", "grapes", "cherries"} - - d = processor.defaultdict_accumulator(float) - d["x"] = 0.0 - d["x"] += 4.0 - d["y"] += 5.0 - d["z"] += d["x"] - d["x"] += d["y"] - assert d["x"] == 9.0 - assert d["y"] == 5.0 - assert d["z"] == 4.0 - assert d["w"] == 0.0 - - f = processor.defaultdict_accumulator(lambda: 2.0) - f["x"] += 4.0 - assert f["x"] == 6.0 - - f += f - assert f["x"] == 12.0 - assert f["y"] == 2.0 - - a = processor.column_accumulator(np.arange(6).reshape(2, 3)) - b = processor.column_accumulator(np.arange(12).reshape(4, 3)) - a += b - assert a.value.sum() == 81 - - -def test_new_accumulators(): - a = processor.accumulate((0.0, 3.0)) - assert a == 3.0 - - a = processor.accumulate( - ( - np.array([2.0]), - 3.0, - ) - ) - assert np.array_equal(a, np.array([5.0])) - - lacc = processor.accumulate( - ( - list(range(4)), - [3], - [1, 2], - ) - ) - assert lacc == [0, 1, 2, 3, 3, 1, 2] - - b = processor.accumulate( - ( - {"apples", "oranges"}, - {"pears"}, - {"grapes"}, - ) - ) - assert b == {"apples", "oranges", "pears", "grapes"} - - c = processor.accumulate( - ( - {"num": a, "fruit": b}, - {"num": 2.0}, - { - "num2": 0, - "fruit": {"apples", "cherries"}, - }, - ) - ) - assert c["num2"] == 0 - assert np.array_equal(c["num"], np.array([7.0])) - assert c["fruit"] == {"apples", "oranges", "pears", "grapes", "cherries"} - - d = processor.accumulate( - ( - defaultdict(float), - {"x": 4.0, "y": 5.0}, - {"z": 4.0, "x": 5.0}, - ) - ) - assert d["x"] == 9.0 - assert d["y"] == 5.0 - assert d["z"] == 4.0 - # this is different than old style! - with pytest.raises(KeyError): - d["w"] - - f = processor.accumulate( - ( - defaultdict(lambda: 2.0), - defaultdict(lambda: 2, {"x": 4.0}), - ) - ) - assert f["x"] == 4.0 - assert f["y"] == 2.0 - - # this is different than old style! 
- f = processor.accumulate([f], f) - assert f["x"] == 8.0 - assert f["y"] == 4.0 - assert f["z"] == 2.0 - - a = processor.accumulate( - ( - processor.column_accumulator(np.arange(6).reshape(2, 3)), - processor.column_accumulator(np.arange(12).reshape(4, 3)), - ) - ) - assert a.value.sum() == 81 - - -def test_accumulator_types(): - class MyDict(dict): - pass - - out = processor.accumulate( - ( - {"x": 2}, - MyDict({"x": 3}), - ) - ) - assert type(out) is dict - - with pytest.raises(ValueError): - processor.accumulate( - ( - defaultdict(lambda: 2), - MyDict({"x": 3}), - ) - ) - - out = processor.accumulate( - ( - MyDict({"x": 3}), - {"x": 2}, - ) - ) - assert type(out) is dict - - with pytest.raises(ValueError): - processor.accumulate( - ( - MyDict({"x": 3}), - defaultdict(lambda: 2), - ) - ) From 1de78e605b2cf0576c294e137ffab7bc929315dc Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 28 Nov 2023 11:55:27 -0600 Subject: [PATCH 37/80] typing for apply_to_dataset/fileset, use setdefault --- src/coffea/dataset_tools/__init__.py | 4 +-- src/coffea/dataset_tools/apply_processor.py | 33 ++++++++++++++++----- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/src/coffea/dataset_tools/__init__.py b/src/coffea/dataset_tools/__init__.py index 888df7eaf..dfa40296c 100644 --- a/src/coffea/dataset_tools/__init__.py +++ b/src/coffea/dataset_tools/__init__.py @@ -1,10 +1,10 @@ -from coffea.dataset_tools.apply_processor import apply_to_fileset, apply_to_one_dataset +from coffea.dataset_tools.apply_processor import apply_to_dataset, apply_to_fileset from coffea.dataset_tools.manipulations import max_chunks, slice_chunks from coffea.dataset_tools.preprocess import preprocess __all__ = [ "preprocess", - "apply_to_one_dataset", + "apply_to_dataset", "apply_to_fileset", "max_chunks", "slice_chunks", diff --git a/src/coffea/dataset_tools/apply_processor.py b/src/coffea/dataset_tools/apply_processor.py index 798c73623..d08b772dd 100644 --- a/src/coffea/dataset_tools/apply_processor.py +++ b/src/coffea/dataset_tools/apply_processor.py @@ -1,12 +1,29 @@ import copy -from typing import Callable, Union +from typing import Callable, Dict, Hashable, List, Set, Tuple, Union + +import dask.base +import dask_awkward from coffea.nanoevents import NanoAODSchema, NanoEventsFactory from coffea.processor import ProcessorABC +GenericHEPAnalysis = Callable[ + [dask_awkward.Array], + Tuple[ + Union[ + dask.base.DaskMethodsMixin, + Dict[Hashable, dask.base.DaskMethodsMixin], + Set[dask.base.DaskMethodsMixin], + List[dask.base.DaskMethodsMixin], + Tuple[dask.base.DaskMethodsMixin], + ], + ..., + ], # NOTE TO USERS: You can use nested python containers as arguments to dask.compute! 
+] + -def apply_to_one_dataset( - data_manipulation: Union[ProcessorABC, Callable], +def apply_to_dataset( + data_manipulation: Union[ProcessorABC, GenericHEPAnalysis], dataset, schemaclass=NanoAODSchema, metadata={}, @@ -26,13 +43,13 @@ def apply_to_one_dataset( def apply_to_fileset( - data_manipulation: Union[ProcessorABC, Callable], fileset, schemaclass=NanoAODSchema + data_manipulation: Union[ProcessorABC, GenericHEPAnalysis], + fileset, + schemaclass=NanoAODSchema, ): out = {} for name, dataset in fileset.items(): metadata = copy.deepcopy(dataset.get("metadata", {})) - metadata["dataset"] = name - out[name] = apply_to_one_dataset( - data_manipulation, dataset, schemaclass, metadata - ) + metadata.setdefault("dataset", name) + out[name] = apply_to_dataset(data_manipulation, dataset, schemaclass, metadata) return out From 5ee0b20d025d2a70245a73dacd1468cb94edffc9 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 28 Nov 2023 13:59:21 -0600 Subject: [PATCH 38/80] typing for preprocess --- src/coffea/dataset_tools/preprocess.py | 70 ++++++++++++++++++++------ 1 file changed, 55 insertions(+), 15 deletions(-) diff --git a/src/coffea/dataset_tools/preprocess.py b/src/coffea/dataset_tools/preprocess.py index ad4d4bc6d..b5dbe9168 100644 --- a/src/coffea/dataset_tools/preprocess.py +++ b/src/coffea/dataset_tools/preprocess.py @@ -1,5 +1,9 @@ +from __future__ import annotations + import copy import math +from dataclasses import dataclass +from typing import Any, Hashable import awkward import dask @@ -9,13 +13,13 @@ def _get_steps( - normed_files, - maybe_step_size=None, - align_clusters=False, - recalculate_seen_steps=False, - skip_bad_files=False, - file_exceptions=(FileNotFoundError, OSError), -): + normed_files: awkward.Array | dask_awkward.Array, + maybe_step_size: None | int = None, + align_clusters: bool = False, + recalculate_seen_steps: bool = False, + skip_bad_files: bool = False, + file_exceptions: Exception | Warning = (FileNotFoundError, OSError), +) -> awkward.Array | dask_awkward.Array: nf_backend = awkward.backend(normed_files) lz_or_nf = awkward.typetracer.length_zero_if_typetracer(normed_files) @@ -95,15 +99,51 @@ def _get_steps( return array +@dataclass +class UprootFileSpec: + object_path: str + steps: list[list[int]] | list[int] | None + + +@dataclass +class CoffeaFileSpec: + object_path: str + steps: list[list[int]] + uuid: str + + +@dataclass +class CoffeaFileSpecOptional(UprootFileSpec): + uuid: str | None + + +@dataclass +class DatasetSpecOptional: + files: ( + dict[str, str] | list[str] | dict[str, UprootFileSpec | CoffeaFileSpecOptional] + ) + metadata: dict[Hashable, Any] | None + + +@dataclass +class DatasetSpec: + files: dict[str, CoffeaFileSpec] + metadata: dict[Hashable, Any] | None + + +FilesetSpecOptional = dict[str, DatasetSpecOptional] +FilesetSpec = dict[str, DatasetSpec] + + def preprocess( - fileset, - maybe_step_size=None, - align_clusters=False, - recalculate_seen_steps=False, - files_per_batch=1, - skip_bad_files=False, - file_exceptions=(FileNotFoundError, OSError), -): + fileset: FilesetSpecOptional, + maybe_step_size: None | int = None, + align_clusters: bool = False, + recalculate_seen_steps: bool = False, + files_per_batch: int = 1, + skip_bad_files: bool = False, + file_exceptions: Exception | Warning = (FileNotFoundError, OSError), +) -> tuple[FilesetSpec, FilesetSpecOptional]: out_updated = copy.deepcopy(fileset) out_available = copy.deepcopy(fileset) all_ak_norm_files = {} From 009fdd0e1013a63390cbdc8c71120981ba73ee77 Mon Sep 17 00:00:00 
2001 From: Lindsey Gray Date: Tue, 28 Nov 2023 14:02:50 -0600 Subject: [PATCH 39/80] flake8 --- src/coffea/dataset_tools/rucio_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index 2df9a62ec..76ce58971 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -5,11 +5,10 @@ import subprocess from collections import defaultdict -import tomli from rucio.client import Client # Rucio needs the default configuration --> taken from CMS cvmfs defaults -if not "RUCIO_HOME" in os.environ: +if "RUCIO_HOME" not in os.environ: os.environ["RUCIO_HOME"] = "/cvmfs/cms.cern.ch/rucio/current" # with open(f"{os.environ['RUCIO_HOME']}/etc/rucio.cfg", "rb") as f: From e00e691bd4ccb5c2a320308e49fcd4e24b4d5109 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 28 Nov 2023 14:05:57 -0600 Subject: [PATCH 40/80] fix up typing --- src/coffea/dataset_tools/preprocess.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/coffea/dataset_tools/preprocess.py b/src/coffea/dataset_tools/preprocess.py index b5dbe9168..da4ca7d6a 100644 --- a/src/coffea/dataset_tools/preprocess.py +++ b/src/coffea/dataset_tools/preprocess.py @@ -3,7 +3,7 @@ import copy import math from dataclasses import dataclass -from typing import Any, Hashable +from typing import Any, Dict, Hashable import awkward import dask @@ -131,8 +131,8 @@ class DatasetSpec: metadata: dict[Hashable, Any] | None -FilesetSpecOptional = dict[str, DatasetSpecOptional] -FilesetSpec = dict[str, DatasetSpec] +FilesetSpecOptional = Dict[str, DatasetSpecOptional] +FilesetSpec = Dict[str, DatasetSpec] def preprocess( From 7644fe71de0793e744a54d2c2ab6c145fc4e3336 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 28 Nov 2023 17:14:35 -0600 Subject: [PATCH 41/80] more typing for apply_processor --- src/coffea/dataset_tools/apply_processor.py | 55 ++++++++++++--------- 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/src/coffea/dataset_tools/apply_processor.py b/src/coffea/dataset_tools/apply_processor.py index d08b772dd..883ec7f0b 100644 --- a/src/coffea/dataset_tools/apply_processor.py +++ b/src/coffea/dataset_tools/apply_processor.py @@ -1,33 +1,40 @@ +from __future__ import annotations + import copy -from typing import Callable, Dict, Hashable, List, Set, Tuple, Union +from typing import Any, Callable, Dict, Hashable, List, Set, Tuple, Union import dask.base import dask_awkward -from coffea.nanoevents import NanoAODSchema, NanoEventsFactory +from coffea.dataset_tools.preprocess import ( + DatasetSpec, + DatasetSpecOptional, + FilesetSpec, + FilesetSpecOptional, +) +from coffea.nanoevents import BaseSchema, NanoAODSchema, NanoEventsFactory from coffea.processor import ProcessorABC -GenericHEPAnalysis = Callable[ - [dask_awkward.Array], - Tuple[ - Union[ - dask.base.DaskMethodsMixin, - Dict[Hashable, dask.base.DaskMethodsMixin], - Set[dask.base.DaskMethodsMixin], - List[dask.base.DaskMethodsMixin], - Tuple[dask.base.DaskMethodsMixin], - ], - ..., - ], # NOTE TO USERS: You can use nested python containers as arguments to dask.compute! -] +DaskOutputType = Tuple[ + Union[ + dask.base.DaskMethodsMixin, + Dict[Hashable, dask.base.DaskMethodsMixin], + Set[dask.base.DaskMethodsMixin], + List[dask.base.DaskMethodsMixin], + Tuple[dask.base.DaskMethodsMixin], + ], + ..., +] # NOTE TO USERS: You can use nested python containers as arguments to dask.compute! 
+ +GenericHEPAnalysis = Callable[[dask_awkward.Array], DaskOutputType] def apply_to_dataset( - data_manipulation: Union[ProcessorABC, GenericHEPAnalysis], - dataset, - schemaclass=NanoAODSchema, - metadata={}, -): + data_manipulation: ProcessorABC | GenericHEPAnalysis, + dataset: DatasetSpec | DatasetSpecOptional, + schemaclass: BaseSchema = NanoAODSchema, + metadata: dict[Hashable, Any] = {}, +) -> DaskOutputType: files = dataset["files"] events = NanoEventsFactory.from_root( files, @@ -43,10 +50,10 @@ def apply_to_dataset( def apply_to_fileset( - data_manipulation: Union[ProcessorABC, GenericHEPAnalysis], - fileset, - schemaclass=NanoAODSchema, -): + data_manipulation: ProcessorABC | GenericHEPAnalysis, + fileset: FilesetSpec | FilesetSpecOptional, + schemaclass: BaseSchema = NanoAODSchema, +) -> dict[str, DaskOutputType]: out = {} for name, dataset in fileset.items(): metadata = copy.deepcopy(dataset.get("metadata", {})) From 9fee7d36e712526f4afb66bb2557886ff262cc53 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Wed, 29 Nov 2023 14:52:36 -0600 Subject: [PATCH 42/80] being pedantic about types --- src/coffea/dataset_tools/apply_processor.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/coffea/dataset_tools/apply_processor.py b/src/coffea/dataset_tools/apply_processor.py index 883ec7f0b..186eb069e 100644 --- a/src/coffea/dataset_tools/apply_processor.py +++ b/src/coffea/dataset_tools/apply_processor.py @@ -15,16 +15,16 @@ from coffea.nanoevents import BaseSchema, NanoAODSchema, NanoEventsFactory from coffea.processor import ProcessorABC -DaskOutputType = Tuple[ - Union[ - dask.base.DaskMethodsMixin, - Dict[Hashable, dask.base.DaskMethodsMixin], - Set[dask.base.DaskMethodsMixin], - List[dask.base.DaskMethodsMixin], - Tuple[dask.base.DaskMethodsMixin], - ], - ..., -] # NOTE TO USERS: You can use nested python containers as arguments to dask.compute! +DaskOutputBaseType = Union[ + dask.base.DaskMethodsMixin, + Dict[Hashable, dask.base.DaskMethodsMixin], + Set[dask.base.DaskMethodsMixin], + List[dask.base.DaskMethodsMixin], + Tuple[dask.base.DaskMethodsMixin], +] + +# NOTE TO USERS: You can use nested python containers as arguments to dask.compute! +DaskOutputType = Union[DaskOutputBaseType, Tuple[DaskOutputBaseType, ...]] GenericHEPAnalysis = Callable[[dask_awkward.Array], DaskOutputType] From 9ba1442ecb110bba03c82dfabd4ace3ed8f3403a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 29 Nov 2023 20:53:11 +0000 Subject: [PATCH 43/80] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/coffea/dataset_tools/apply_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coffea/dataset_tools/apply_processor.py b/src/coffea/dataset_tools/apply_processor.py index 186eb069e..eb84580f7 100644 --- a/src/coffea/dataset_tools/apply_processor.py +++ b/src/coffea/dataset_tools/apply_processor.py @@ -24,7 +24,7 @@ ] # NOTE TO USERS: You can use nested python containers as arguments to dask.compute! 
-DaskOutputType = Union[DaskOutputBaseType, Tuple[DaskOutputBaseType, ...]] +DaskOutputType = Union[DaskOutputBaseType, Tuple[DaskOutputBaseType, ...]] GenericHEPAnalysis = Callable[[dask_awkward.Array], DaskOutputType] From 059061a676c887b608acf64dfafa5c2313a26d5e Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Mon, 4 Dec 2023 16:00:07 -0600 Subject: [PATCH 44/80] taskvine test was using old location of NanoAODSchema --- tests/test_taskvine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_taskvine.py b/tests/test_taskvine.py index 8e533919e..e6f4580be 100755 --- a/tests/test_taskvine.py +++ b/tests/test_taskvine.py @@ -5,12 +5,12 @@ import pytest from coffea import processor -from coffea.nanoevents import NanoEventsFactory +from coffea.nanoevents import NanoEventsFactory, NanoAODSchema def histogram_common(): # The opendata files are non-standard NanoAOD, so some optional data columns are missing - processor.NanoAODSchema.warn_missing_crossrefs = False + NanoAODSchema.warn_missing_crossrefs = False # "file:/tmp/Run2012B_SingleMu.root", events = NanoEventsFactory.from_root( From 3385bdb313151978c48a481758d682dc721401e1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 4 Dec 2023 22:00:31 +0000 Subject: [PATCH 45/80] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_taskvine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_taskvine.py b/tests/test_taskvine.py index e6f4580be..a35ea741e 100755 --- a/tests/test_taskvine.py +++ b/tests/test_taskvine.py @@ -5,7 +5,7 @@ import pytest from coffea import processor -from coffea.nanoevents import NanoEventsFactory, NanoAODSchema +from coffea.nanoevents import NanoAODSchema, NanoEventsFactory def histogram_common(): From 0e2baf102d43c9eaf66a5c50fa1c8ad150c18b78 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Mon, 4 Dec 2023 16:01:56 -0600 Subject: [PATCH 46/80] lint: no longer need to import processor --- tests/test_taskvine.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_taskvine.py b/tests/test_taskvine.py index a35ea741e..858a35349 100755 --- a/tests/test_taskvine.py +++ b/tests/test_taskvine.py @@ -4,7 +4,6 @@ import hist.dask as hda import pytest -from coffea import processor from coffea.nanoevents import NanoAODSchema, NanoEventsFactory From 59ef3529184f3ed7af9da241e75a4d2a0131e6eb Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Wed, 6 Dec 2023 22:11:30 +0100 Subject: [PATCH 47/80] Getting rucio client from config in environmental variable --- src/coffea/dataset_tools/rucio_utils.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index 76ce58971..71ab31360 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -11,10 +11,6 @@ if "RUCIO_HOME" not in os.environ: os.environ["RUCIO_HOME"] = "/cvmfs/cms.cern.ch/rucio/current" -# with open(f"{os.environ['RUCIO_HOME']}/etc/rucio.cfg", "rb") as f: -# rucio_cfg = tomli.load(f) -# print(rucio_cfg) - def get_proxy_path() -> str: """ @@ -52,14 +48,7 @@ def get_rucio_client(proxy=None) -> Client: try: if not proxy: proxy = get_proxy_path() - - nativeClient = Client( - rucio_host="https://cms-rucio.cern.ch", - auth_host="https://cms-rucio-auth.cern.ch", - account=getpass.getuser(), - creds={"client_cert": proxy, "client_key": 
proxy}, - auth_type="x509", - ) + nativeClient = Client() return nativeClient except Exception as e: From 25df8b16be6a66c1a358c2037464adb4dbabde03 Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Wed, 6 Dec 2023 23:57:02 +0100 Subject: [PATCH 48/80] Added preprocess command to cli --- src/coffea/dataset_tools/dataset_query.py | 62 ++++++++++++++++++++--- 1 file changed, 56 insertions(+), 6 deletions(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 46236b805..550f3f60d 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -1,8 +1,9 @@ import random from collections import defaultdict +import os import cmd2 -import yaml +import yaml, json from rich import print from rich.console import Console from rich.prompt import Prompt @@ -10,6 +11,8 @@ from rich.tree import Tree from . import rucio_utils +from .preprocess import preprocess +from dask.distributed import Client def print_dataset_query(query, dataset_list, selected, console): @@ -59,6 +62,8 @@ def __init__(self): "S": "select", "LS": "list_selected", "LR": "list_replicas", + "O": "save", + "P": "preprocess", } ) self.console = Console() @@ -326,6 +331,7 @@ def do_list_replicas(self, args): print( f"[red bold]No replica info for dataset {dataset}. You need to selected the replicas with [cyan] replicas {args}" ) + return tree = Tree(label=f"[bold orange]Replicas for [green]{dataset}") for site, files in self.replica_results_bysite[dataset].items(): @@ -338,11 +344,54 @@ def do_list_replicas(self, args): def do_save(self, args): """Save the replica information in yaml format""" if not len(args): - print("[red]Please provide an output filename") + print("[red]Please provide an output filename and format") + return + format = os.path.splitext(args)[1] + output = {} + for fileset, files in self.replica_results.items(): + output[fileset] = {"files": files, "metadata": {}} + + with open(args, "w") as file: + if format == ".yaml": + yaml.dump(output, file, default_flow_style=False) + elif format == ".json": + json.dump(output, file, indent=2) + print(f"[green]File {args} saved!") + + def do_preprocess(self, args): + """Perform preprocessing for concrete fileset extraction. 
+ Args: output_name [step_size] [dask cluster url]""" + args_list = args.split() + if len(args_list) < 1: + print( + "Please provide an output name and optionally a step size and dask cluster url" + ) + return else: - with open(args, "w") as file: - yaml.dump(dict(self.replica_results), file, default_flow_style=False) - print(f"[green]File {args} saved!") + output_file = args_list[0] + step_size = None + dask_url = None + if len(args_list) == 2: + step_size = args_list[1] + elif len(args_list) == 3: + dask_url = args_list[2] + replicas = {} + for fileset, files in self.replica_results.items(): + replicas[fileset] = {"files": {f: "Events" for f in files}, "metadata": {}} + # init a local Dask cluster + with self.console.status( + "[red] Preprocessing files to extract available chunks with dask[/]" + ): + client = Client(dask_url) if dask_url else Client() + fileset = preprocess(replicas) + out_available, out_updated = preprocess(replicas) + + with open(f"{output_file}_available.json", "w") as file: + print(f"Saved available fileset chunks to {output_file}_available.json") + json.dump(out_available, file, indent=2) + with open(f"{output_file}_all.json", "w") as file: + print(f"Saved all fileset chunks to {output_file}_all.json") + json.dump(out_updated, file, indent=2) if __name__ == "__main__": @@ -359,7 +408,8 @@ def do_save(self, args): - [bold cyan]allowlist_sites[/]: Select sites to allowlist them for replica queries - [bold cyan]blocklist_sites[/]: Select sites to blocklist them for replica queries - [bold cyan]regex_sites[/]: Select sites with a regex for replica queries: please wrap the regex like "T[123]_(FR|IT|BE|CH|DE)_\w+" - - [bold cyan]save (S) file.yaml[/]: Save the replicas results to file for further processing + - [bold cyan]save (O) OUTPUTFILE[/]: Save the replicas results to file (json or yaml) for further processing + - [bold cyan]preprocess (P) OUTPUTFILE[/]: Preprocess the replicas with dask and save the fileset to the outputfile (yaml or json) - [bold cyan]help[/]: get help! """ console = Console() From c8a87b351936eb90264b34b91e41230c7ca46c21 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 6 Dec 2023 22:57:23 +0000 Subject: [PATCH 49/80] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/coffea/dataset_tools/dataset_query.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 550f3f60d..b9d00b4b8 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -1,9 +1,11 @@ +import json +import os import random from collections import defaultdict -import os import cmd2 -import yaml, json +import yaml +from dask.distributed import Client from rich import print from rich.console import Console from rich.prompt import Prompt @@ -12,7 +14,6 @@ from . 
import rucio_utils from .preprocess import preprocess -from dask.distributed import Client def print_dataset_query(query, dataset_list, selected, console): From 31e74eb0d999a25f6de17ea7ec372385906db255 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Wed, 6 Dec 2023 17:18:16 -0600 Subject: [PATCH 50/80] save json as gzipped, add some options --- src/coffea/dataset_tools/dataset_query.py | 37 +++++++++++++++-------- 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index b9d00b4b8..3c16108cc 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -1,3 +1,4 @@ +import gzip import json import os import random @@ -361,21 +362,34 @@ def do_save(self, args): def do_preprocess(self, args): """Perform preprocessing for concrete fileset extraction. - Args: output_name [step_size] [dask cluster url]""" + Args: output_name [step_size] [align to file cluster boundaries] [dask cluster url]""" args_list = args.split() if len(args_list) < 1: print( - "Please provide an output name and optionally a step size and dask cluster url" + "Please provide an output name and optionally a step size, if you want to align to file clusters, or a dask cluster url" ) return else: output_file = args_list[0] step_size = None + align_to_clusters = False dask_url = None - if len(args_list) == 2: + if len(args_list) >= 2: step_size = args_list[1] - elif len(args_list) == 3: - dask_url = args_list[2] + if len(args_list) >= 3: + if args_list[2] == "True": + align_to_clusters = True + elif args_list[2] == "False": + align_to_clusters = False + else: + raise ValueError("align_to_clusters must be either \"True\" or \"False\"") + if len(args_list) == 4: + dask_url = args_list[3] + if len(args_list) > 4: + print( + "preprocess accepts at most 3 commandline arguments!" 
+ ) + return replicas = {} for fileset, files in self.replica_results.items(): replicas[fileset] = {"files": {f: "Events" for f in files}, "metadata": {}} @@ -383,15 +397,14 @@ def do_preprocess(self, args): with self.console.status( "[red] Preprocessing files to extract available chunks with dask[/]" ): - client = Client(dask_url) if dask_url else Client() - fileset = preprocess(replicas) - out_available, out_updated = preprocess(replicas) + with Client(dask_url) as _: + out_available, out_updated = preprocess(replicas, maybe_step_size=step_size, align_clusters=align_to_clusters, skip_bad_files=True) - with open(f"{output_file}_available.json", "w") as file: - print(f"Saved available fileset chunks to {output_file}_available.json") + with gzip.open(f"{output_file}_available.json.gz", "w") as file: + print(f"Saved available fileset chunks to {output_file}_available.json.gz") json.dump(out_available, file, indent=2) - with open(f"{output_file}_all.json", "w") as file: - print(f"Saved all fileset chunks to {output_file}_all.json") + with gzip.open(f"{output_file}_all.json.gz", "w") as file: + print(f"Saved all fileset chunks to {output_file}_all.json.gz") json.dump(out_updated, file, indent=2) From 048767195a47b1760fa186ee607c3ff7afd66756 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 6 Dec 2023 23:18:31 +0000 Subject: [PATCH 51/80] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/coffea/dataset_tools/dataset_query.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 3c16108cc..04a9f9f25 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -362,7 +362,8 @@ def do_save(self, args): def do_preprocess(self, args): """Perform preprocessing for concrete fileset extraction. - Args: output_name [step_size] [align to file cluster boundaries] [dask cluster url]""" + Args: output_name [step_size] [align to file cluster boundaries] [dask cluster url] + """ args_list = args.split() if len(args_list) < 1: print( @@ -382,13 +383,11 @@ def do_preprocess(self, args): elif args_list[2] == "False": align_to_clusters = False else: - raise ValueError("align_to_clusters must be either \"True\" or \"False\"") + raise ValueError('align_to_clusters must be either "True" or "False"') if len(args_list) == 4: dask_url = args_list[3] if len(args_list) > 4: - print( - "preprocess accepts at most 3 commandline arguments!" 
- ) + print("preprocess accepts at most 3 commandline arguments!") return replicas = {} for fileset, files in self.replica_results.items(): @@ -397,8 +396,13 @@ def do_preprocess(self, args): with self.console.status( "[red] Preprocessing files to extract available chunks with dask[/]" ): - with Client(dask_url) as _: - out_available, out_updated = preprocess(replicas, maybe_step_size=step_size, align_clusters=align_to_clusters, skip_bad_files=True) + with Client(dask_url) as _: + out_available, out_updated = preprocess( + replicas, + maybe_step_size=step_size, + align_clusters=align_to_clusters, + skip_bad_files=True, + ) with gzip.open(f"{output_file}_available.json.gz", "w") as file: print(f"Saved available fileset chunks to {output_file}_available.json.gz") From 9c970e4ea338cb0eac01c5cb88eaa0df9d22ed27 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Wed, 6 Dec 2023 17:19:52 -0600 Subject: [PATCH 52/80] flake: drop getpass since it is not used --- src/coffea/dataset_tools/rucio_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index 71ab31360..a2cf11fe9 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -1,4 +1,4 @@ -import getpass +# import getpass import json import os import re From 681782debc32ee41b04d2e6c6ab0e75b4bb55c68 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Thu, 7 Dec 2023 16:13:05 -0600 Subject: [PATCH 53/80] add failed-tail processing for uproot reports --- src/coffea/dataset_tools/manipulations.py | 40 +++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/src/coffea/dataset_tools/manipulations.py b/src/coffea/dataset_tools/manipulations.py index e963a067c..e80ffb4cb 100644 --- a/src/coffea/dataset_tools/manipulations.py +++ b/src/coffea/dataset_tools/manipulations.py @@ -1,4 +1,6 @@ +import awkward import copy +import numpy def max_chunks(fileset, maxchunks=None): @@ -15,3 +17,41 @@ def slice_chunks(fileset, theslice=slice(None)): out[name]["files"][fname]["steps"] = finfo["steps"][theslice] return out + + +def get_failed_steps_for_dataset(dataset, report): + failed_dataset = {} + failures = report[~awkward.is_none(report.exception)] + + if not awkward.all(report.args[:,4] == "True"): + raise RuntimeError("step specification is not completely in starts/stops form, failed-step extraction is not available for steps_per_file.") + + for fdesc in dataset.values(): + if "steps" not in fdesc: + raise RuntimeError("steps specification not found in dataset, please specify steps in input dataset.") + + fnames = set(dataset.keys()) + rnames = set(np.unique(report.args[:, 0][:, 1:-1:])) + if not rnames.issubset(fnames): + raise RuntimeError(f"Files: {rnames - fnames} are not in input dataset, please sure report correspond to input dataset!") + + for failure in failures: + args_as_types = tuple(eval(arg) for arg in failure.args) + + fname, object_path, start, stop, is_step = args_as_types + + if fname in failed_dataset: + failed_dataset[fname]["steps"].append([start, stop]) + else: + failed_dataset[fname] = copy.deepcopy(dataset[fname]) + failed_dataset[fname]["steps"] = [[start, stop]] + + return failed_dataset + +def get_failed_steps_for_fileset(fileset, report): + failed_fileset = {} + for name, dataset in fileset.items(): + failed_dataset = get_failed_steps_for_dataset(dataset, report) + if len(failed_dataset) > 0: + failed_fileset[name] = failed_dataset + return failed_fileset From 
3ca2d96b55a4e912147e18f46bcb2def88a97113 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Thu, 7 Dec 2023 16:13:42 -0600 Subject: [PATCH 54/80] add failed-tail stuff to __all__ of dataset_tools --- src/coffea/dataset_tools/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/coffea/dataset_tools/__init__.py b/src/coffea/dataset_tools/__init__.py index dfa40296c..1895bf722 100644 --- a/src/coffea/dataset_tools/__init__.py +++ b/src/coffea/dataset_tools/__init__.py @@ -8,4 +8,6 @@ "apply_to_fileset", "max_chunks", "slice_chunks", + "get_failed_steps_for_dataset", + "get_failed_steps_for_fileset", ] From 3bd760a0a95e5542571e5965c0c881b771901627 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 7 Dec 2023 22:15:58 +0000 Subject: [PATCH 55/80] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/coffea/dataset_tools/manipulations.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/src/coffea/dataset_tools/manipulations.py b/src/coffea/dataset_tools/manipulations.py index e80ffb4cb..0076b7117 100644 --- a/src/coffea/dataset_tools/manipulations.py +++ b/src/coffea/dataset_tools/manipulations.py @@ -1,5 +1,6 @@ -import awkward import copy + +import awkward import numpy @@ -23,23 +24,29 @@ def get_failed_steps_for_dataset(dataset, report): failed_dataset = {} failures = report[~awkward.is_none(report.exception)] - if not awkward.all(report.args[:,4] == "True"): - raise RuntimeError("step specification is not completely in starts/stops form, failed-step extraction is not available for steps_per_file.") + if not awkward.all(report.args[:, 4] == "True"): + raise RuntimeError( + "step specification is not completely in starts/stops form, failed-step extraction is not available for steps_per_file." + ) for fdesc in dataset.values(): if "steps" not in fdesc: - raise RuntimeError("steps specification not found in dataset, please specify steps in input dataset.") + raise RuntimeError( + "steps specification not found in dataset, please specify steps in input dataset." + ) fnames = set(dataset.keys()) rnames = set(np.unique(report.args[:, 0][:, 1:-1:])) if not rnames.issubset(fnames): - raise RuntimeError(f"Files: {rnames - fnames} are not in input dataset, please sure report correspond to input dataset!") - + raise RuntimeError( + f"Files: {rnames - fnames} are not in input dataset, please sure report correspond to input dataset!" 
+ ) + for failure in failures: args_as_types = tuple(eval(arg) for arg in failure.args) fname, object_path, start, stop, is_step = args_as_types - + if fname in failed_dataset: failed_dataset[fname]["steps"].append([start, stop]) else: @@ -48,6 +55,7 @@ def get_failed_steps_for_dataset(dataset, report): return failed_dataset + def get_failed_steps_for_fileset(fileset, report): failed_fileset = {} for name, dataset in fileset.items(): From a29e9102132c574a7e3b2c8bd2704ae0b7de9fae Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Thu, 7 Dec 2023 16:17:28 -0600 Subject: [PATCH 56/80] lint --- src/coffea/dataset_tools/manipulations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coffea/dataset_tools/manipulations.py b/src/coffea/dataset_tools/manipulations.py index 0076b7117..49b4486fc 100644 --- a/src/coffea/dataset_tools/manipulations.py +++ b/src/coffea/dataset_tools/manipulations.py @@ -36,7 +36,7 @@ def get_failed_steps_for_dataset(dataset, report): ) fnames = set(dataset.keys()) - rnames = set(np.unique(report.args[:, 0][:, 1:-1:])) + rnames = set(numpy.unique(report.args[:, 0][:, 1:-1:])) if not rnames.issubset(fnames): raise RuntimeError( f"Files: {rnames - fnames} are not in input dataset, please sure report correspond to input dataset!" From 13921ab2a20e48a3f457d5d724e9cecc026e1d81 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Thu, 7 Dec 2023 16:25:01 -0600 Subject: [PATCH 57/80] typo --- src/coffea/dataset_tools/manipulations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coffea/dataset_tools/manipulations.py b/src/coffea/dataset_tools/manipulations.py index 49b4486fc..89adf8169 100644 --- a/src/coffea/dataset_tools/manipulations.py +++ b/src/coffea/dataset_tools/manipulations.py @@ -39,7 +39,7 @@ def get_failed_steps_for_dataset(dataset, report): rnames = set(numpy.unique(report.args[:, 0][:, 1:-1:])) if not rnames.issubset(fnames): raise RuntimeError( - f"Files: {rnames - fnames} are not in input dataset, please sure report correspond to input dataset!" + f"Files: {rnames - fnames} are not in input dataset, please ensure report correspond to input dataset!" ) for failure in failures: From 8e4424d5abc20aa4fdb67ff2fa644caa4c515c3f Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Thu, 7 Dec 2023 18:15:09 -0600 Subject: [PATCH 58/80] typo, and fileset entrypoint needs dict of reports. --- src/coffea/dataset_tools/manipulations.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/coffea/dataset_tools/manipulations.py b/src/coffea/dataset_tools/manipulations.py index 89adf8169..a749561e0 100644 --- a/src/coffea/dataset_tools/manipulations.py +++ b/src/coffea/dataset_tools/manipulations.py @@ -39,7 +39,7 @@ def get_failed_steps_for_dataset(dataset, report): rnames = set(numpy.unique(report.args[:, 0][:, 1:-1:])) if not rnames.issubset(fnames): raise RuntimeError( - f"Files: {rnames - fnames} are not in input dataset, please ensure report correspond to input dataset!" + f"Files: {rnames - fnames} are not in input dataset, please ensure report corresponds to input dataset!" 
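
A note on the failed-step extraction above: each entry of report.args carries the uproot open/read arguments as strings (file name, object path, entry start, entry stop, and a flag marking whether the step is in start/stop form), which is why the helper evals them back into Python values. A minimal sketch of that decoding, with an illustrative argument tuple — the exact string form is an assumption inferred from the slicing and eval logic in the patch, not a documented API:

    # Hypothetical contents of one failure's `args` field (values are illustrative).
    failure_args = ["'/store/mc/sample/file_1.root'", "'Events'", "0", "1000", "True"]

    # Recover the typed values the same way get_failed_steps_for_dataset does.
    fname, object_path, start, stop, is_step = tuple(eval(arg) for arg in failure_args)

    print(fname, object_path, start, stop, is_step)
    # -> /store/mc/sample/file_1.root Events 0 1000 True
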
) for failure in failures: @@ -56,10 +56,10 @@ def get_failed_steps_for_dataset(dataset, report): return failed_dataset -def get_failed_steps_for_fileset(fileset, report): +def get_failed_steps_for_fileset(fileset, report_dict): failed_fileset = {} for name, dataset in fileset.items(): - failed_dataset = get_failed_steps_for_dataset(dataset, report) + failed_dataset = get_failed_steps_for_dataset(dataset, report_dict[name]) if len(failed_dataset) > 0: failed_fileset[name] = failed_dataset return failed_fileset From fe7984f844261935c721729165545adc580c45af Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Thu, 7 Dec 2023 18:21:42 -0600 Subject: [PATCH 59/80] adapt apply_processor to possibility of reports --- src/coffea/dataset_tools/apply_processor.py | 28 ++++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/src/coffea/dataset_tools/apply_processor.py b/src/coffea/dataset_tools/apply_processor.py index eb84580f7..ed242ed4d 100644 --- a/src/coffea/dataset_tools/apply_processor.py +++ b/src/coffea/dataset_tools/apply_processor.py @@ -34,29 +34,49 @@ def apply_to_dataset( dataset: DatasetSpec | DatasetSpecOptional, schemaclass: BaseSchema = NanoAODSchema, metadata: dict[Hashable, Any] = {}, + uproot_options: dict[str, Any] = {}, ) -> DaskOutputType: files = dataset["files"] events = NanoEventsFactory.from_root( files, metadata=metadata, - schemaclass=schemaclass, + schemaclass=schemaclass, + uproot_options=uproot_options, ).events() + + report = None + if isinstance(events, tuple): + events, report = events + + out = None if isinstance(data_manipulation, ProcessorABC): - return data_manipulation.process(events) + out = data_manipulation.process(events) elif isinstance(data_manipulation, Callable): - return data_manipulation(events) + out = data_manipulation(events) else: raise ValueError("data_manipulation must either be a ProcessorABC or Callable") + if report is not None: + return out, report + return out + def apply_to_fileset( data_manipulation: ProcessorABC | GenericHEPAnalysis, fileset: FilesetSpec | FilesetSpecOptional, schemaclass: BaseSchema = NanoAODSchema, + uproot_options: dict[str, Any] = {}, ) -> dict[str, DaskOutputType]: out = {} + report = {} for name, dataset in fileset.items(): metadata = copy.deepcopy(dataset.get("metadata", {})) metadata.setdefault("dataset", name) - out[name] = apply_to_dataset(data_manipulation, dataset, schemaclass, metadata) + out = apply_to_dataset(data_manipulation, dataset, schemaclass, metadata) + if isinstance(out, tuple): + out[name], report[name] = out + else: + out[name] = out + if len(report) > 0: + return out, report return out From 87332b87baa5a12b207dfa7cd92f77bb3058e898 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 8 Dec 2023 00:21:56 +0000 Subject: [PATCH 60/80] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/coffea/dataset_tools/apply_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coffea/dataset_tools/apply_processor.py b/src/coffea/dataset_tools/apply_processor.py index ed242ed4d..859765fe9 100644 --- a/src/coffea/dataset_tools/apply_processor.py +++ b/src/coffea/dataset_tools/apply_processor.py @@ -40,7 +40,7 @@ def apply_to_dataset( events = NanoEventsFactory.from_root( files, metadata=metadata, - schemaclass=schemaclass, + schemaclass=schemaclass, uproot_options=uproot_options, ).events() From f846c7011198a95ee8f712cbb5f518120553df93 
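
Taken together with the report-aware apply_to_dataset/apply_to_fileset changes in the surrounding patches, the failed-step helpers are meant to support a retry loop: run over a fileset, collect the per-dataset read reports, and build a trimmed fileset containing only the failing files and step ranges. A rough sketch of that flow, assuming my_processor and fileset already exist (a ProcessorABC instance or callable, and an uproot-style fileset whose files carry explicit "steps" boundaries); the uproot option name shown is an assumption, and the precise fileset nesting expected by the helper may differ at this point in the series:

    import dask
    from coffea.dataset_tools.apply_processor import apply_to_fileset
    from coffea.dataset_tools.manipulations import get_failed_steps_for_fileset

    # With a report-enabled read, apply_to_fileset returns (outputs, reports),
    # two dicts keyed by dataset name.
    out_graphs, report_graphs = apply_to_fileset(
        my_processor,
        fileset,
        uproot_options={"allow_read_errors_with_report": True},  # assumed option name
    )
    outputs, reports = dask.compute(out_graphs, report_graphs)

    # Keep only the files and step ranges whose reads raised, for a retry pass.
    retry_fileset = get_failed_steps_for_fileset(fileset, reports)
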
Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Thu, 7 Dec 2023 20:06:37 -0600 Subject: [PATCH 61/80] fix bugs --- src/coffea/dataset_tools/apply_processor.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/coffea/dataset_tools/apply_processor.py b/src/coffea/dataset_tools/apply_processor.py index 859765fe9..69994b127 100644 --- a/src/coffea/dataset_tools/apply_processor.py +++ b/src/coffea/dataset_tools/apply_processor.py @@ -72,11 +72,13 @@ def apply_to_fileset( for name, dataset in fileset.items(): metadata = copy.deepcopy(dataset.get("metadata", {})) metadata.setdefault("dataset", name) - out = apply_to_dataset(data_manipulation, dataset, schemaclass, metadata) + dataset_out = apply_to_dataset( + data_manipulation, dataset, schemaclass, metadata + ) if isinstance(out, tuple): - out[name], report[name] = out + out[name], report[name] = dataset_out else: - out[name] = out + out[name] = dataset_out if len(report) > 0: return out, report return out From 6e5caddfddac175f21cd3fe52ae500c91693fe2c Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Fri, 8 Dec 2023 19:29:16 +0100 Subject: [PATCH 62/80] Added processing of multiple datasets to get replicas in CLI --- src/coffea/dataset_tools/dataset_query.py | 219 ++++++++++++---------- 1 file changed, 119 insertions(+), 100 deletions(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 04a9f9f25..1c74737d1 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -165,113 +165,123 @@ def do_list_selected(self, args): def do_replicas(self, args): if len(args.arg_list) == 0: print( - "[red] Please provide the index of the [bold]selected[/bold] dataset to analyze or the [bold]full dataset name[/bold]" + "[red] Please provide a list of indices of the [bold]selected[/bold] datasets to analyze or [bold]all[/bold] to loop on all the selected datasets" ) return - if args.isdigit(): - if int(args) <= len(self.selected_datasets): - dataset = self.selected_datasets[int(args) - 1] - else: - print( - f"[red]The requested dataset is not in the list. Please insert a position <={len(self.selected_datasets)}" - ) + if args == "all": + datasets = self.selected_datasets else: - dataset = args.arg_list[0] - # adding it to the selected datasets - self.selected_datasets.append(dataset) - - with self.console.status( - f"Querying rucio for replicas: [bold red]{dataset}[/]" - ): - outfiles, outsites, sites_counts = rucio_utils.get_dataset_files_replicas( - dataset, - allowlist_sites=self.sites_allowlist, - blocklist_sites=self.sites_blocklist, - regex_sites=self.sites_regex, - mode="full", - client=self.rucio_client, + for index in args.arg_list: + if index.isdigit(): + if int(index) <= len(self.selected_datasets): + datasets = [self.selected_datasets[int(index) - 1]] + else: + print( + f"[red]The requested dataset is not in the list. 
Please insert a position <={len(self.selected_datasets)}" + ) + + for dataset in datasets: + with self.console.status( + f"Querying rucio for replicas: [bold red]{dataset}[/]" + ): + ( + outfiles, + outsites, + sites_counts, + ) = rucio_utils.get_dataset_files_replicas( + dataset, + allowlist_sites=self.sites_allowlist, + blocklist_sites=self.sites_blocklist, + regex_sites=self.sites_regex, + mode="full", + client=self.rucio_client, + ) + self.last_replicas_results = (outfiles, outsites, sites_counts) + print(f"[cyan]Sites availability for dataset: [red]{dataset}") + table = Table(title="Available replicas") + table.add_column("Index", justify="center") + table.add_column("Site", justify="left", style="cyan", no_wrap=True) + table.add_column("Files", style="magenta", no_wrap=True) + table.add_column("Availability", justify="center") + table.row_styles = ["dim", "none"] + Nfiles = len(outfiles) + + sorted_sites = dict( + sorted(sites_counts.items(), key=lambda x: x[1], reverse=True) ) - self.last_replicas_results = (outfiles, outsites, sites_counts) - print(f"[cyan]Sites availability for dataset: [red]{dataset}") - table = Table(title="Available replicas") - table.add_column("Index", justify="center") - table.add_column("Site", justify="left", style="cyan", no_wrap=True) - table.add_column("Files", style="magenta", no_wrap=True) - table.add_column("Availability", justify="center") - table.row_styles = ["dim", "none"] - Nfiles = len(outfiles) - - sorted_sites = dict( - sorted(sites_counts.items(), key=lambda x: x[1], reverse=True) - ) - for i, (site, stat) in enumerate(sorted_sites.items()): - table.add_row(str(i), site, f"{stat} / {Nfiles}", f"{stat*100/Nfiles:.1f}%") - - self.console.print(table) - strategy = Prompt.ask( - "Select sites", - choices=["round-robin", "choice", "quit"], - default="round-robin", - ) - - files_by_site = defaultdict(list) + for i, (site, stat) in enumerate(sorted_sites.items()): + table.add_row( + str(i), site, f"{stat} / {Nfiles}", f"{stat*100/Nfiles:.1f}%" + ) - if strategy == "choice": - ind = list( - map(int, Prompt.ask("Enter list of sites index to be used").split(" ")) + self.console.print(table) + strategy = Prompt.ask( + "Select sites", + choices=["round-robin", "choice", "quit"], + default="round-robin", ) - sites_to_use = [list(sorted_sites.keys())[i] for i in ind] - print(f"Filtering replicas with [green]: {' '.join(sites_to_use)}") - - output = [] - for ifile, (files, sites) in enumerate(zip(outfiles, outsites)): - random.shuffle(sites_to_use) - found = False - # loop on shuffled selected sites until one is found - for site in sites_to_use: - try: - iS = sites.index(site) - output.append(files[iS]) - files_by_site[sites[iS]].append(files[iS]) - found = True - break # keep only one replica - except ValueError: - # if the index is not found just go to the next site - pass - - if not found: - print( - f"[bold red]No replica found compatible with sites selection for file #{ifile}. 
The available sites are:" - ) - for f, s in zip(files, sites): - print(f"\t- [green]{s} [cyan]{f}") - return - - self.replica_results[dataset] = output - elif strategy == "round-robin": - output = [] - for ifile, (files, sites) in enumerate(zip(outfiles, outsites)): - # selecting randomly from the sites - iS = random.randint(0, len(sites) - 1) - output.append(files[iS]) - files_by_site[sites[iS]].append(files[iS]) - self.replica_results[dataset] = output + files_by_site = defaultdict(list) - elif strategy == "quit": - print("[orange]Doing nothing...") - return + if strategy == "choice": + ind = list( + map( + int, + Prompt.ask("Enter list of sites index to be used").split(" "), + ) + ) + sites_to_use = [list(sorted_sites.keys())[i] for i in ind] + print(f"Filtering replicas with [green]: {' '.join(sites_to_use)}") + + output = [] + for ifile, (files, sites) in enumerate(zip(outfiles, outsites)): + random.shuffle(sites_to_use) + found = False + # loop on shuffled selected sites until one is found + for site in sites_to_use: + try: + iS = sites.index(site) + output.append(files[iS]) + files_by_site[sites[iS]].append(files[iS]) + found = True + break # keep only one replica + except ValueError: + # if the index is not found just go to the next site + pass + + if not found: + print( + f"[bold red]No replica found compatible with sites selection for file #{ifile}. The available sites are:" + ) + for f, s in zip(files, sites): + print(f"\t- [green]{s} [cyan]{f}") + return + + self.replica_results[dataset] = output + + elif strategy == "round-robin": + output = [] + for ifile, (files, sites) in enumerate(zip(outfiles, outsites)): + # selecting randomly from the sites + iS = random.randint(0, len(sites) - 1) + output.append(files[iS]) + files_by_site[sites[iS]].append(files[iS]) + self.replica_results[dataset] = output + + elif strategy == "quit": + print("[orange]Doing nothing...") + return - self.replica_results_bysite[dataset] = files_by_site + self.replica_results_bysite[dataset] = files_by_site - # Now let's print the results - tree = Tree(label=f"[bold orange]Replicas for [green]{dataset}") - for site, files in files_by_site.items(): - T = tree.add(f"[green]{site}") - for f in files: - T.add(f"[cyan]{f}") - self.console.print(tree) + # Now let's print the results + tree = Tree(label=f"[bold orange]Replicas for [green]{dataset}") + for site, files in files_by_site.items(): + T = tree.add(f"[green]{site}") + for f in files: + T.add(f"[cyan]{f}") + self.console.print(tree) def do_allowlist_sites(self, args): if self.sites_allowlist is None: @@ -413,6 +423,14 @@ def do_preprocess(self, args): if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + "--cli", help="Start interactive CLI for dataset discovery", action="store_true" + ) + args = parser.parse_args() + intro_msg = r"""[bold yellow]Welcome to the datasets discovery coffea CLI![/bold yellow] Use this CLI tool to query the CMS datasets and to select interactively the grid sites to use for reading the files in your analysis. Some basic commands: @@ -430,8 +448,9 @@ def do_preprocess(self, args): - [bold cyan]preprocess (P) OUTPUTFILE[/]: Preprocess the replicas with dask and save the fileset to the outputfile (yaml or json) - [bold cyan]help[/]: get help! 
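
For reference, the "round-robin" strategy in the replica-selection code above does not cycle through sites in order; for every file it draws one replica at random among the sites that host it. A self-contained illustration of that selection (file and site names are made up):

    import random
    from collections import defaultdict

    outfiles = [
        ["root://siteA//store/f1.root", "root://siteB//store/f1.root"],
        ["root://siteB//store/f2.root"],
    ]
    outsites = [["T2_IT_A", "T2_DE_B"], ["T2_DE_B"]]

    output, files_by_site = [], defaultdict(list)
    for files, sites in zip(outfiles, outsites):
        iS = random.randint(0, len(sites) - 1)  # pick one replica at random
        output.append(files[iS])
        files_by_site[sites[iS]].append(files[iS])

    print(output)
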
""" - console = Console() - console.print(intro_msg, justify="left") - app = DatasetQueryApp() - app.cmdloop() + if args.cli: + console = Console() + console.print(intro_msg, justify="left") + app = DatasetQueryApp() + app.cmdloop() From f04b60f6beca291c12b5fb8d04ed1683d5a78964 Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Fri, 8 Dec 2023 19:51:54 +0100 Subject: [PATCH 63/80] Added Select all options to cli --- src/coffea/dataset_tools/dataset_query.py | 24 +++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 1c74737d1..d62ec5e17 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -129,16 +129,22 @@ def do_query_results(self, args): print("First [bold red]query (Q)[/] for a dataset") def do_select(self, args): + """Selected the datasets from the list of query results. Input a list of indices of "all".""" if not self.last_query_list: print("First [bold red]query (Q)[/] for a dataset") return Nresults = len(self.last_query_list) print("[cyan]Selected datasets:") - for s in map(int, args.arg_list): - if s <= Nresults: - self.selected_datasets.append(self.last_query_list[s - 1]) - print(f"- ({s}) {self.last_query_list[s-1]}") + if args == "all": + indices = range(0, len(self.last_query_list)) # 1 based list + else: + indices = map(lambda k: int(k) - 1, args.arg_list) + + for s in indices: + if s < Nresults: + self.selected_datasets.append(self.last_query_list[s]) + print(f"- ({s+1}) {self.last_query_list[s]}") else: print( f"[red]The requested dataset is not in the list. Please insert a position <={Nresults}" @@ -374,7 +380,7 @@ def do_preprocess(self, args): """Perform preprocessing for concrete fileset extraction. 
Args: output_name [step_size] [align to file cluster boundaries] [dask cluster url] """ - args_list = args.split() + args_list = args.arg_list if len(args_list) < 1: print( "Please provide an output name and optionally a step size, if you want to align to file clusters, or a dask cluster url" @@ -386,7 +392,7 @@ def do_preprocess(self, args): align_to_clusters = False dask_url = None if len(args_list) >= 2: - step_size = args_list[1] + step_size = int(args_list[1]) if len(args_list) >= 3: if args_list[2] == "True": align_to_clusters = True @@ -413,11 +419,13 @@ def do_preprocess(self, args): align_clusters=align_to_clusters, skip_bad_files=True, ) + from IPython import embed - with gzip.open(f"{output_file}_available.json.gz", "w") as file: + embed() + with gzip.open(f"{output_file}_available.json.gz", "wb") as file: print(f"Saved available fileset chunks to {output_file}_available.json.gz") json.dump(out_available, file, indent=2) - with gzip.open(f"{output_file}_all.json.gz", "w") as file: + with gzip.open(f"{output_file}_all.json.gz", "wb") as file: print(f"Saved all fileset chunks to {output_file}_all.json.gz") json.dump(out_updated, file, indent=2) From 42af84bd7242d2ac7b15dfd35c1effeebcad65c7 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Fri, 8 Dec 2023 14:15:13 -0600 Subject: [PATCH 64/80] lint --- src/coffea/dataset_tools/dataset_query.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index d62ec5e17..956ad25f7 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -171,7 +171,8 @@ def do_list_selected(self, args): def do_replicas(self, args): if len(args.arg_list) == 0: print( - "[red] Please provide a list of indices of the [bold]selected[/bold] datasets to analyze or [bold]all[/bold] to loop on all the selected datasets" + "[red] Please provide a list of indices of the [bold]selected[/bold] datasets " + "to analyze or [bold]all[/bold] to loop on all the selected datasets" ) return From 4ba8c0a8626fba5aeb4cb172eb29adff7da544bb Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Mon, 11 Dec 2023 13:04:41 +0100 Subject: [PATCH 65/80] Moved from cmd2 to pure rich interface for the CLI --- pyproject.toml | 1 - src/coffea/dataset_tools/dataset_query.py | 432 ++++++++++++---------- 2 files changed, 244 insertions(+), 189 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 04ede6167..66f9e43b5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -88,7 +88,6 @@ servicex = [ rucio = [ "rucio-clients>=32;python_version>'3.8'", "rucio-clients<32;python_version<'3.9'", - "cmd2", ] dev = [ "pre-commit", diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index d62ec5e17..0700879fe 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -3,13 +3,13 @@ import os import random from collections import defaultdict +from typing import List -import cmd2 import yaml from dask.distributed import Client from rich import print from rich.console import Console -from rich.prompt import Prompt +from rich.prompt import Prompt, IntPrompt, Confirm from rich.table import Table from rich.tree import Tree @@ -50,24 +50,44 @@ def print_dataset_query(query, dataset_list, selected, console): console.print(table) -class DatasetQueryApp(cmd2.Cmd): - prompt = "\033[1;34m" + "cms-datasets" + "\033[0m > " +def get_indices_query(input_str: str, maxN: 
int) -> List[int]: + tokens = input_str.strip().split(" ") + final_tokens = [] + for t in tokens: + if t.isdigit(): + if int(t) > maxN: + print( + f"[red bold]Requested index {t} larger than available elements {maxN}" + ) + return False + final_tokens.append(int(t) - 1) # index 0 + elif "-" in t: + rng = t.split("-") + try: + for i in range( + int(rng[0]), int(rng[1]) + 1 + ): # including the last index + if i > maxN: + print( + f"[red bold]Requested index {t} larger than available elements {maxN}" + ) + return False + final_tokens.append(i - 1) + except: + print( + "[red]Error! Bad formatting for selection string. Use e.g. 1 4 5-9" + ) + return False + elif t == "all": + final_tokens = list(range(0, maxN)) + else: + print("[red]Error! Bad formatting for selection string. Use e.g. 1 4 5-9") + return False + return final_tokens + +class DataDiscoveryCLI: def __init__(self): - shortcuts = cmd2.DEFAULT_SHORTCUTS - shortcuts.update( - { - "L": "login", - "Q": "query", - "QR": "query_results", - "R": "replicas", - "S": "select", - "LS": "list_selected", - "LR": "list_replicas", - "O": "save", - "P": "preprocess", - } - ) self.console = Console() self.rucio_client = None self.selected_datasets = [] @@ -81,43 +101,116 @@ def __init__(self): self.replica_results = defaultdict(list) self.replica_results_bysite = {} - super().__init__(shortcuts=shortcuts) - def do_login(self, args): + self.commands = [ + "help", + "login", + "query", + "query-results", + "select", + "list-selected", + "replicas", + "list-replicas", + "save", + "preprocess", + "allow-sites", + "block-sites", + "regex-sites", + "sites-filters", + "quit", + ] + + def start_cli(self): + while True: + command = Prompt.ask(">", choices=self.commands) + if command == "help": + print( + r"""[bold yellow]Welcome to the datasets discovery coffea CLI![/bold yellow] +Use this CLI tool to query the CMS datasets and to select interactively the grid sites to use for reading the files in your analysis. +Some basic commands: + - [bold cyan]query (Q)[/]: Look for datasets with * wildcards (like in DAS) + - [bold cyan]select (S)[/]: Select datasets to process further from query results + - [bold cyan]replicas (R)[/]: Query rucio to look for files replica and then select the preferred sites + - [bold cyan]list_selected (LS)[/]: Print a list of the selected datasets + - [bold cyan]list_replicas (LR) index[/]: Print the selected files replicas for the selected dataset + - [bold cyan]sites_filters[/]: show the active sites filters + - [bold cyan]sites_filters clear[/]: clear all the active sites filters + - [bold cyan]allowlist_sites[/]: Select sites to allowlist them for replica queries + - [bold cyan]blocklist_sites[/]: Select sites to blocklist them for replica queries + - [bold cyan]regex_sites[/]: Select sites with a regex for replica queries: please wrap the regex like "T[123]_(FR|IT|BE|CH|DE)_\w+" + - [bold cyan]save (O) OUTPUTFILE[/]: Save the replicas results to file (json or yaml) for further processing + - [bold cyan]preprocess (P) OUTPUTFILE[/]: Preprocess the replicas with dask and save the fileset to the outputfile (yaml or json) + - [bold cyan]help[/]: get help! 
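
The get_indices_query helper introduced above turns the 1-based selection strings typed at the prompt ("1 4 6-8", "all", ...) into 0-based indices, returning False on malformed or out-of-range input. Its expected behaviour, written as assertions; this assumes the module's optional dask/rucio dependencies are installed (importing dataset_query pulls them in), and the helper is internal so its location may change:

    from coffea.dataset_tools.dataset_query import get_indices_query

    assert get_indices_query("1 4 6-8", maxN=10) == [0, 3, 5, 6, 7]
    assert get_indices_query("all", maxN=3) == [0, 1, 2]
    assert get_indices_query("2-20", maxN=10) is False  # out-of-range selections are rejected
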
+ """ + ) + elif command == "login": + self.do_login() + elif command == "quit": + print("Bye!") + break + elif command == "query": + self.do_query() + elif command == "query-results": + self.do_query_results() + elif command == "select": + self.do_select() + elif command == "list-selected": + self.do_list_selected() + elif command == "replicas": + self.do_replicas() + elif command == "list-replicas": + self.do_list_replicas() + elif command == "save": + self.do_save() + elif command == "preprocess": + self.do_preprocess() + elif command == "allow-sites": + self.do_allowlist_sites() + elif command == "block-sites": + self.do_blocklist_sites() + elif command == "regex-sites": + self.do_regex_sites() + elif command == "sites-filters": + self.do_sites_filters() + else: + break + + def do_login(self, proxy=None): """Login to the rucio client. Optionally a specific proxy file can be passed to the command. If the proxy file is not specified, `voms-proxy-info` is used""" - if args: - self.rucio_client = rucio_utils.get_rucio_client(args[0]) + if proxy: + self.rucio_client = rucio_utils.get_rucio_client(proxy) else: self.rucio_client = rucio_utils.get_rucio_client() - print(self.rucio_client) - # pprint(self.rucio_client.whoami()) - def do_whoami(self, args): + def do_whoami(self): # Your code here if not self.rucio_client: print("First [bold]login (L)[/] to the rucio server") return print(self.rucio_client.whoami()) - def do_query(self, args): + def do_query(self): # Your code here - with self.console.status(f"Querying rucio for: [bold red]{args}[/]"): + query = Prompt.ask( + "[yellow bold]Query for[/]", + ) + with self.console.status(f"Querying rucio for: [bold red]{query}[/]"): outlist, outtree = rucio_utils.query_dataset( - args.arg_list[0], + query, client=self.rucio_client, tree=True, scope="cms", # TODO configure scope ) # Now let's print the results as a tree - print_dataset_query(args, outtree, self.selected_datasets, self.console) - self.last_query = args + print_dataset_query(query, outtree, self.selected_datasets, self.console) + self.last_query = query self.last_query_list = outlist self.last_query_tree = outtree print("Use the command [bold red]select (S)[/] to selected the datasets") - def do_query_results(self, args): + def do_query_results(self): if self.last_query_list: print_dataset_query( self.last_query, @@ -128,20 +221,24 @@ def do_query_results(self, args): else: print("First [bold red]query (Q)[/] for a dataset") - def do_select(self, args): - """Selected the datasets from the list of query results. Input a list of indices of "all".""" + def do_select(self): + """Selected the datasets from the list of query results. Input a list of indices + also with range 4-6 or "all".""" if not self.last_query_list: print("First [bold red]query (Q)[/] for a dataset") return + selection = Prompt.ask( + "[yellow bold]Select datasets indices[/] (e.g 1 4 6-10)", default="all" + ) + final_tokens = get_indices_query(selection, len(self.last_query_list)) + if not final_tokens: + return + Nresults = len(self.last_query_list) print("[cyan]Selected datasets:") - if args == "all": - indices = range(0, len(self.last_query_list)) # 1 based list - else: - indices = map(lambda k: int(k) - 1, args.arg_list) - for s in indices: + for s in final_tokens: if s < Nresults: self.selected_datasets.append(self.last_query_list[s]) print(f"- ({s+1}) {self.last_query_list[s]}") @@ -150,7 +247,7 @@ def do_select(self, args): f"[red]The requested dataset is not in the list. 
Please insert a position <={Nresults}" ) - def do_list_selected(self, args): + def do_list_selected(self): print("[cyan]Selected datasets:") table = Table(title="Selected datasets") table.add_column("Index", justify="left", style="cyan", no_wrap=True) @@ -168,42 +265,37 @@ def do_list_selected(self, args): ) self.console.print(table) - def do_replicas(self, args): - if len(args.arg_list) == 0: - print( - "[red] Please provide a list of indices of the [bold]selected[/bold] datasets to analyze or [bold]all[/bold] to loop on all the selected datasets" - ) + def do_replicas(self): + selection = Prompt.ask( + "[yellow bold]Select datasets indices[/] (e.g 1 4 6-10)", default="all" + ) + indices = get_indices_query(selection, len(self.selected_datasets)) + if not indices: return - - if args == "all": - datasets = self.selected_datasets - else: - for index in args.arg_list: - if index.isdigit(): - if int(index) <= len(self.selected_datasets): - datasets = [self.selected_datasets[int(index) - 1]] - else: - print( - f"[red]The requested dataset is not in the list. Please insert a position <={len(self.selected_datasets)}" - ) + datasets = [self.selected_datasets[ind] for ind in indices] for dataset in datasets: with self.console.status( f"Querying rucio for replicas: [bold red]{dataset}[/]" ): - ( - outfiles, - outsites, - sites_counts, - ) = rucio_utils.get_dataset_files_replicas( - dataset, - allowlist_sites=self.sites_allowlist, - blocklist_sites=self.sites_blocklist, - regex_sites=self.sites_regex, - mode="full", - client=self.rucio_client, - ) + try: + ( + outfiles, + outsites, + sites_counts, + ) = rucio_utils.get_dataset_files_replicas( + dataset, + allowlist_sites=self.sites_allowlist, + blocklist_sites=self.sites_blocklist, + regex_sites=self.sites_regex, + mode="full", + client=self.rucio_client, + ) + except Exception as e: + print(f"\n[red bold] Exception: {e}[/]") + return self.last_replicas_results = (outfiles, outsites, sites_counts) + print(f"[cyan]Sites availability for dataset: [red]{dataset}") table = Table(title="Available replicas") table.add_column("Index", justify="center") @@ -289,122 +381,118 @@ def do_replicas(self, args): T.add(f"[cyan]{f}") self.console.print(tree) - def do_allowlist_sites(self, args): + def do_allowlist_sites(self): + sites = Prompt.ask( + "[yellow]Restrict the available sites to (comma-separated list)" + ).split(",") if self.sites_allowlist is None: - self.sites_allowlist = args.arg_list + self.sites_allowlist = sites else: - self.sites_allowlist += args.arg_list + self.sites_allowlist += sites print("[green]Allowlisted sites:") for s in self.sites_allowlist: print(f"- {s}") - def do_blocklist_sites(self, args): + def do_blocklist_sites(self): + sites = Prompt.ask("[yellow]Exclude the sites (comma-separated list)").split( + "," + ) if self.sites_blocklist is None: - self.sites_blocklist = args.arg_list + self.sites_blocklist = sites else: - self.sites_blocklist += args.arg_list + self.sites_blocklist += sites print("[red]Blocklisted sites:") for s in self.sites_blocklist: print(f"- {s}") - def do_regex_sites(self, args): - if args.startswith('"'): - args = args[1:] - if args.endswith('"'): - args = args[:-1] - self.sites_regex = rf"{args}" - print(f"New sites regex: [cyan]{self.sites_regex}") - - def do_sites_filters(self, args): - if args == "": - print("[green bold]Allow-listed sites:") - if self.sites_allowlist: - for s in self.sites_allowlist: - print(f"- {s}") - - print("[bold red]Block-listed sites:") - if self.sites_blocklist: - for s in 
self.sites_blocklist: - print(f"- {s}") - - print(f"[bold cyan]Sites regex: [italics]{self.sites_regex}") - if args == "clear": + def do_regex_sites(self): + regex = Prompt.ask("[yellow]Regex to restrict the available sites") + if len(regex): + self.sites_regex = rf"{regex}" + print(f"New sites regex: [cyan]{self.sites_regex}") + + def do_sites_filters(self): + print("[green bold]Allow-listed sites:") + if self.sites_allowlist: + for s in self.sites_allowlist: + print(f"- {s}") + + print("[bold red]Block-listed sites:") + if self.sites_blocklist: + for s in self.sites_blocklist: + print(f"- {s}") + + print(f"[bold cyan]Sites regex: [italics]{self.sites_regex}") + + if Confirm.ask("Clear sites restrinction?", default=False): self.sites_allowlist = None self.sites_blocklist = None self.sites_regex = None print("[bold green]Sites filters cleared") - def do_list_replicas(self, args): - if len(args.arg_list) == 0: - print("[red]Please call the command with the index of a selected dataset") - else: - if int(args) > len(self.selected_datasets): + def do_list_replicas(self): + selection = Prompt.ask( + "[yellow bold]Select datasets indices[/] (e.g 1 4 6-10)", default="all" + ) + indices = get_indices_query(selection, len(self.selected_datasets)) + datasets = [self.selected_datasets[ind] for ind in indices] + + for dataset in datasets: + if dataset not in self.replica_results: print( - f"[red] Select the replica with index < {len(self.selected_datasets)}" + f"[red bold]No replica info for dataset {dataset}. You need to selected the replicas with [cyan] replicas [/cyan] command[/]" ) return - else: - dataset = self.selected_datasets[int(args) - 1] - if dataset not in self.replica_results: - print( - f"[red bold]No replica info for dataset {dataset}. You need to selected the replicas with [cyan] replicas {args}" - ) - return - tree = Tree(label=f"[bold orange]Replicas for [green]{dataset}") - - for site, files in self.replica_results_bysite[dataset].items(): - T = tree.add(f"[green]{site}") - for f in files: - T.add(f"[cyan]{f}") + tree = Tree(label=f"[bold orange]Replicas for [/][green]{dataset}[/]") + for site, files in self.replica_results_bysite[dataset].items(): + T = tree.add(f"[green]{site}") + for f in files: + T.add(f"[cyan]{f}") - self.console.print(tree) + self.console.print(tree) - def do_save(self, args): + def do_save(self, filename=None): """Save the replica information in yaml format""" - if not len(args): - print("[red]Please provide an output filename and format") - return - format = os.path.splitext(args)[1] + if not filename: + filename = Prompt.ask( + "[yellow bold]Output file name (.yaml or .json)", default="output.json" + ) + format = os.path.splitext(filename)[1] output = {} for fileset, files in self.replica_results.items(): output[fileset] = {"files": files, "metadata": {}} - - with open(args, "w") as file: + with open(filename, "w") as file: if format == ".yaml": yaml.dump(output, file, default_flow_style=False) elif format == ".json": json.dump(output, file, indent=2) - print(f"[green]File {args} saved!") - - def do_preprocess(self, args): + print(f"[green]File {filename} saved!") + + def do_preprocess( + self, + output_file=None, + step_size=None, + align_to_clusters=None, + dask_cluster=None, + ): """Perform preprocessing for concrete fileset extraction. 
- Args: output_name [step_size] [align to file cluster boundaries] [dask cluster url] + Args: output_file [step_size] [align to file cluster boundaries] [dask cluster url] """ - args_list = args.arg_list - if len(args_list) < 1: - print( - "Please provide an output name and optionally a step size, if you want to align to file clusters, or a dask cluster url" + if not output_file: + output_file = Prompt.ask( + "[yellow bold]Output name", default="output_preprocessing" ) - return - else: - output_file = args_list[0] - step_size = None - align_to_clusters = False - dask_url = None - if len(args_list) >= 2: - step_size = int(args_list[1]) - if len(args_list) >= 3: - if args_list[2] == "True": - align_to_clusters = True - elif args_list[2] == "False": - align_to_clusters = False - else: - raise ValueError('align_to_clusters must be either "True" or "False"') - if len(args_list) == 4: - dask_url = args_list[3] - if len(args_list) > 4: - print("preprocess accepts at most 3 commandline arguments!") - return + if not step_size: + step_size = IntPrompt.ask("[yellow bold]Step size", default=None) + if align_to_clusters is None: + align_to_clusters = Confirm.ask( + "[yellow bold]Align to clusters", default=True + ) + if not dask_cluster: + dask_cluster = Prompt.ask("[yellow bold]Dask cluster url", default="None") + if dask_cluster == "None": + dask_cluster = None + replicas = {} for fileset, files in self.replica_results.items(): replicas[fileset] = {"files": {f: "Events" for f in files}, "metadata": {}} @@ -412,53 +500,21 @@ def do_preprocess(self, args): with self.console.status( "[red] Preprocessing files to extract available chunks with dask[/]" ): - with Client(dask_url) as _: + with Client(dask_cluster) as _: out_available, out_updated = preprocess( replicas, maybe_step_size=step_size, align_clusters=align_to_clusters, skip_bad_files=True, ) - from IPython import embed - - embed() - with gzip.open(f"{output_file}_available.json.gz", "wb") as file: + with gzip.open(f"{output_file}_available.json.gz", "wt") as file: print(f"Saved available fileset chunks to {output_file}_available.json.gz") json.dump(out_available, file, indent=2) - with gzip.open(f"{output_file}_all.json.gz", "wb") as file: + with gzip.open(f"{output_file}_all.json.gz", "wt") as file: print(f"Saved all fileset chunks to {output_file}_all.json.gz") json.dump(out_updated, file, indent=2) if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser() - parser.add_argument( - "--cli", help="Start interactive CLI for dataset discovery", action="store_true" - ) - args = parser.parse_args() - - intro_msg = r"""[bold yellow]Welcome to the datasets discovery coffea CLI![/bold yellow] -Use this CLI tool to query the CMS datasets and to select interactively the grid sites to use for reading the files in your analysis. 
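
The preprocessing step above writes two gzip-compressed JSON filesets: *_available.json.gz with only the chunks that could be opened, and *_all.json.gz with everything. Reading one back for use in an analysis is just the reverse of the dump, shown here with the default output name offered by the prompt:

    import gzip
    import json

    with gzip.open("output_preprocessing_available.json.gz", "rt") as f:
        fileset_available = json.load(f)

    # Keys are dataset names; each entry carries the per-file "steps" chunk
    # boundaries determined by preprocess(), plus the metadata saved alongside.
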
-Some basic commands: - - [bold cyan]query (Q)[/]: Look for datasets with * wildcards (like in DAS) - - [bold cyan]select (S)[/]: Select datasets to process further from query results - - [bold cyan]replicas (R)[/]: Query rucio to look for files replica and then select the preferred sites - - [bold cyan]list_selected (LS)[/]: Print a list of the selected datasets - - [bold cyan]list_replicas (LR) index[/]: Print the selected files replicas for the selected dataset - - [bold cyan]sites_filters[/]: show the active sites filters - - [bold cyan]sites_filters clear[/]: clear all the active sites filters - - [bold cyan]allowlist_sites[/]: Select sites to allowlist them for replica queries - - [bold cyan]blocklist_sites[/]: Select sites to blocklist them for replica queries - - [bold cyan]regex_sites[/]: Select sites with a regex for replica queries: please wrap the regex like "T[123]_(FR|IT|BE|CH|DE)_\w+" - - [bold cyan]save (O) OUTPUTFILE[/]: Save the replicas results to file (json or yaml) for further processing - - [bold cyan]preprocess (P) OUTPUTFILE[/]: Preprocess the replicas with dask and save the fileset to the outputfile (yaml or json) - - [bold cyan]help[/]: get help! -""" - - if args.cli: - console = Console() - console.print(intro_msg, justify="left") - app = DatasetQueryApp() - app.cmdloop() + cli = DataDiscoveryCLI() + cli.start_cli() From dd21cbc42ab65b78a0319ff43024aedf1fe494b3 Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Mon, 11 Dec 2023 13:12:22 +0100 Subject: [PATCH 66/80] Updated help message --- src/coffea/dataset_tools/dataset_query.py | 26 +++++++++++------------ 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 0700879fe..3a28a486d 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -128,19 +128,19 @@ def start_cli(self): r"""[bold yellow]Welcome to the datasets discovery coffea CLI![/bold yellow] Use this CLI tool to query the CMS datasets and to select interactively the grid sites to use for reading the files in your analysis. Some basic commands: - - [bold cyan]query (Q)[/]: Look for datasets with * wildcards (like in DAS) - - [bold cyan]select (S)[/]: Select datasets to process further from query results - - [bold cyan]replicas (R)[/]: Query rucio to look for files replica and then select the preferred sites - - [bold cyan]list_selected (LS)[/]: Print a list of the selected datasets - - [bold cyan]list_replicas (LR) index[/]: Print the selected files replicas for the selected dataset - - [bold cyan]sites_filters[/]: show the active sites filters - - [bold cyan]sites_filters clear[/]: clear all the active sites filters - - [bold cyan]allowlist_sites[/]: Select sites to allowlist them for replica queries - - [bold cyan]blocklist_sites[/]: Select sites to blocklist them for replica queries - - [bold cyan]regex_sites[/]: Select sites with a regex for replica queries: please wrap the regex like "T[123]_(FR|IT|BE|CH|DE)_\w+" - - [bold cyan]save (O) OUTPUTFILE[/]: Save the replicas results to file (json or yaml) for further processing - - [bold cyan]preprocess (P) OUTPUTFILE[/]: Preprocess the replicas with dask and save the fileset to the outputfile (yaml or json) - - [bold cyan]help[/]: get help! 
+ - [bold cyan]query[/]: Look for datasets with * wildcards (like in DAS) + - [bold cyan]select[/]: Select datasets to process further from query results + - [bold cyan]replicas[/]: Query rucio to look for files replica and then select the preferred sites + - [bold cyan]query-results[/]: List the results of the last dataset query + - [bold cyan]list-selected[/]: Print a list of the selected datasets + - [bold cyan]list-replicas[/]: Print the selected files replicas for the selected dataset + - [bold cyan]sites-filters[/]: show the active sites filters and aks to clear them + - [bold cyan]allow-sites[/]: Restrict the grid sites available for replicas query only to the requested list + - [bold cyan]block-sites[/]: Exclude grid sites from the available sites for replicas query + - [bold cyan]regex-sites[/]: Select sites with a regex for replica queries: e.g. "T[123]_(FR|IT|BE|CH|DE)_\w+" + - [bold cyan]save[/]: Save the replicas query results to file (json or yaml) for further processing + - [bold cyan]preprocess[/]: Preprocess the replicas with dask and save the fileset for further processing with uproot/coffea + - [bold cyan]help[/]: Print this help message """ ) elif command == "login": From c07963463a32940153efe2faad21413a2b3b8383 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 11 Dec 2023 12:13:01 +0000 Subject: [PATCH 67/80] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/coffea/dataset_tools/dataset_query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 3a28a486d..8a92e8b5d 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -9,7 +9,7 @@ from dask.distributed import Client from rich import print from rich.console import Console -from rich.prompt import Prompt, IntPrompt, Confirm +from rich.prompt import Confirm, IntPrompt, Prompt from rich.table import Table from rich.tree import Tree From f09b64f4188beb42f1d6062c1027c3f72eb55879 Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Mon, 11 Dec 2023 13:14:58 +0100 Subject: [PATCH 68/80] linting --- src/coffea/dataset_tools/dataset_query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 3a28a486d..9f73f61fb 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -73,7 +73,7 @@ def get_indices_query(input_str: str, maxN: int) -> List[int]: ) return False final_tokens.append(i - 1) - except: + except Exception: print( "[red]Error! Bad formatting for selection string. Use e.g. 
1 4 5-9" ) From 2049913a52817e3d40d744b59055da6b38adcbce Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Mon, 11 Dec 2023 07:56:34 -0600 Subject: [PATCH 69/80] typo --- src/coffea/dataset_tools/dataset_query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index b5323723f..8bf24e8a9 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -134,7 +134,7 @@ def start_cli(self): - [bold cyan]query-results[/]: List the results of the last dataset query - [bold cyan]list-selected[/]: Print a list of the selected datasets - [bold cyan]list-replicas[/]: Print the selected files replicas for the selected dataset - - [bold cyan]sites-filters[/]: show the active sites filters and aks to clear them + - [bold cyan]sites-filters[/]: show the active sites filters and ask to clear them - [bold cyan]allow-sites[/]: Restrict the grid sites available for replicas query only to the requested list - [bold cyan]block-sites[/]: Exclude grid sites from the available sites for replicas query - [bold cyan]regex-sites[/]: Select sites with a regex for replica queries: e.g. "T[123]_(FR|IT|BE|CH|DE)_\w+" From 524011066bfdb50e8dedab305be129263c9154b7 Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Mon, 11 Dec 2023 16:39:39 +0100 Subject: [PATCH 70/80] Adding non-cli interaction from datacard --- src/coffea/dataset_tools/dataset_query.py | 164 ++++++++++++++++++---- 1 file changed, 139 insertions(+), 25 deletions(-) diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index b5323723f..61c45f559 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -6,6 +6,7 @@ from typing import List import yaml +import argparse from dask.distributed import Client from rich import print from rich.console import Console @@ -91,6 +92,7 @@ def __init__(self): self.console = Console() self.rucio_client = None self.selected_datasets = [] + self.selected_datasets_metadata = [] self.last_query = "" self.last_query_tree = None self.last_query_list = None @@ -100,6 +102,7 @@ def __init__(self): self.last_replicas_results = None self.replica_results = defaultdict(list) + self.replica_results_metadata = {} self.replica_results_bysite = {} self.commands = [ @@ -191,11 +194,12 @@ def do_whoami(self): return print(self.rucio_client.whoami()) - def do_query(self): + def do_query(self, query=None): # Your code here - query = Prompt.ask( - "[yellow bold]Query for[/]", - ) + if query is None: + query = Prompt.ask( + "[yellow bold]Query for[/]", + ) with self.console.status(f"Querying rucio for: [bold red]{query}[/]"): outlist, outtree = rucio_utils.query_dataset( query, @@ -221,16 +225,17 @@ def do_query_results(self): else: print("First [bold red]query (Q)[/] for a dataset") - def do_select(self): + def do_select(self, selection=None, metadata=None): """Selected the datasets from the list of query results. 
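
With the optional arguments added in this patch, the same DataDiscoveryCLI object can be driven entirely from a script instead of the interactive prompt. A rough sketch of that scripted use; the query string is the one used elsewhere in this series, while the metadata dict and output file name are illustrative, and do_login still requires a valid grid proxy and rucio configuration:

    from coffea.dataset_tools.dataset_query import DataDiscoveryCLI

    ddc = DataDiscoveryCLI()
    ddc.do_login()  # needs a valid grid proxy / rucio configuration
    ddc.do_query(
        "/TTToSemiLeptonic_*_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9*/NANOAODSIM"
    )
    ddc.do_select(selection="all", metadata={"process": "ttbar_semileptonic"})
    ddc.do_replicas(mode="round-robin", selection="all")
    ddc.do_save("ttbar_fileset.json")
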
Input a list of indices also with range 4-6 or "all".""" if not self.last_query_list: print("First [bold red]query (Q)[/] for a dataset") return - selection = Prompt.ask( - "[yellow bold]Select datasets indices[/] (e.g 1 4 6-10)", default="all" - ) + if selection is None: + selection = Prompt.ask( + "[yellow bold]Select datasets indices[/] (e.g 1 4 6-10)", default="all" + ) final_tokens = get_indices_query(selection, len(self.last_query_list)) if not final_tokens: return @@ -241,6 +246,10 @@ def do_select(self): for s in final_tokens: if s < Nresults: self.selected_datasets.append(self.last_query_list[s]) + if metadata: + self.selected_datasets_metadata.append(metadata) + else: + self.selected_datasets_metadata.append({}) print(f"- ({s+1}) {self.last_query_list[s]}") else: print( @@ -265,16 +274,25 @@ def do_list_selected(self): ) self.console.print(table) - def do_replicas(self): - selection = Prompt.ask( - "[yellow bold]Select datasets indices[/] (e.g 1 4 6-10)", default="all" - ) + def do_replicas(self, mode=None, selection=None): + """Query Rucio for replicas. + Mode: - None: ask the user about the mode + - round-robin (take files randomly from available sites), + - choice: ask the user to choose the specific site + """ + if selection is None: + selection = Prompt.ask( + "[yellow bold]Select datasets indices[/] (e.g 1 4 6-10)", default="all" + ) indices = get_indices_query(selection, len(self.selected_datasets)) if not indices: return - datasets = [self.selected_datasets[ind] for ind in indices] + datasets = [ + (self.selected_datasets[ind], self.selected_datasets_metadata[ind]) + for ind in indices + ] - for dataset in datasets: + for dataset, dataset_metadata in datasets: with self.console.status( f"Querying rucio for replicas: [bold red]{dataset}[/]" ): @@ -314,15 +332,16 @@ def do_replicas(self): ) self.console.print(table) - strategy = Prompt.ask( - "Select sites", - choices=["round-robin", "choice", "quit"], - default="round-robin", - ) + if mode is None: + mode = Prompt.ask( + "Select sites", + choices=["round-robin", "choice", "quit"], + default="round-robin", + ) files_by_site = defaultdict(list) - if strategy == "choice": + if mode == "choice": ind = list( map( int, @@ -357,8 +376,9 @@ def do_replicas(self): return self.replica_results[dataset] = output + self.replica_results_metadata[dataset] = dataset_metadata - elif strategy == "round-robin": + elif mode == "round-robin": output = [] for ifile, (files, sites) in enumerate(zip(outfiles, outsites)): # selecting randomly from the sites @@ -366,8 +386,9 @@ def do_replicas(self): output.append(files[iS]) files_by_site[sites[iS]].append(files[iS]) self.replica_results[dataset] = output + self.replica_results_metadata[dataset] = dataset_metadata - elif strategy == "quit": + elif mode == "quit": print("[orange]Doing nothing...") return @@ -460,7 +481,10 @@ def do_save(self, filename=None): format = os.path.splitext(filename)[1] output = {} for fileset, files in self.replica_results.items(): - output[fileset] = {"files": files, "metadata": {}} + output[fileset] = { + "files": files, + "metadata": self.replica_results_metadata[fileset], + } with open(filename, "w") as file: if format == ".yaml": yaml.dump(output, file, default_flow_style=False) @@ -495,7 +519,10 @@ def do_preprocess( replicas = {} for fileset, files in self.replica_results.items(): - replicas[fileset] = {"files": {f: "Events" for f in files}, "metadata": {}} + replicas[fileset] = { + "files": {f: "Events" for f in files}, + "metadata": 
self.replica_results_metadata[fileset],
+            }
 
         # init a local Dask cluster
         with self.console.status(
             "[red] Preprocessing files to extract available chunks with dask[/]"
@@ -516,5 +543,92 @@ def do_preprocess(
 
 
 if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--cli", help="Start the dataset discovery CLI", action="store_true"
+    )
+    parser.add_argument(
+        "-d",
+        "--dataset-definition",
+        help="Dataset definition file",
+        type=str,
+        required=False,
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        help="Output name for fileset",
+        type=str,
+        required=False,
+        default="output_fileset",
+    )
+    parser.add_argument(
+        "-as",
+        "--allow-sites",
+        help="List of sites to be allowlisted",
+        nargs="+",
+        type=str,
+    )
+    parser.add_argument(
+        "-bs",
+        "--block-sites",
+        help="List of sites to be blocklisted",
+        nargs="+",
+        type=str,
+    )
+    parser.add_argument(
+        "-rs",
+        "--regex-sites",
+        help="Regex string to be used to filter the sites",
+        type=str,
+    )
+    parser.add_argument(
+        "--replicas-strategy",
+        help="Mode for selecting replicas for datasets: [manual|round-robin|choice]",
+        default="round-robin",
+        required=False,
+    )
+    args = parser.parse_args()
+
     cli = DataDiscoveryCLI()
-    cli.start_cli()
+
+    if args.dataset_definition:
+        # Load the dataset definition if present:
+        with open(args.dataset_definition, "r") as file:
+            dataset_definition = json.load(file)
+
+        for dataset_query, dataset_meta in dataset_definition.items():
+            print(f"\nProcessing query: {dataset_query}")
+            # Adding queries
+            cli.do_query(dataset_query)
+            # Now selecting the results depending on the interactive mode or not.
+            # Metadata are passed to the selection function to associated them with the selected dataset.
+            cli.do_select(selection="all", metadata=dataset_meta)
+
+        # Now list all
+        cli.do_list_selected()
+
+        if args.allow_sites:
+            cli.sites_allowlist = args.allow_sites
+        if args.block_sites:
+            cli.sites_blocklist = args.block_sites
+        if args.regex_sites:
+            cli.sites_regex = args.regex_sites
+
+        # selecting replicas
+        if args.replicas_strategy == "manual":
+            cli.do_replicas(mode=None, selection="all")
+        else:
+            if args.replicas_strategy not in ["round-robin", "choice"]:
+                print(
+                    "Invalid replicas-strategy: please choice between manual|round-robin|choice"
+                )
+                exit(1)
+            cli.do_replicas(mode=args.replicas_strategy, selection="all")
+
+        # Now list all
+        cli.do_list_selected()
+        print("CIAO")
+
+    if args.cli:
+        cli.start_cli()

From d8260301f22c88b7d06ab305598cdbe02dea41a9 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 11 Dec 2023 15:40:49 +0000
Subject: [PATCH 71/80] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/coffea/dataset_tools/dataset_query.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py
index 4b3c26724..2ce4dc421 100644
--- a/src/coffea/dataset_tools/dataset_query.py
+++ b/src/coffea/dataset_tools/dataset_query.py
@@ -1,3 +1,4 @@
+import argparse
 import gzip
 import json
 import os
@@ -6,7 +7,6 @@
 from typing import List
 
 import yaml
-import argparse
 from dask.distributed import Client
 from rich import print
 from rich.console import Console
@@ -594,7 +594,7 @@ def do_preprocess(
 
     if args.dataset_definition:
         # Load the dataset definition if present:
-        with open(args.dataset_definition, "r") as file:
+        with open(args.dataset_definition) as file:
             dataset_definition = json.load(file)
 
         for dataset_query, dataset_meta in dataset_definition.items():

From dd31a2a9a16840d61b3da5648fe14f01ffb49f89 Mon Sep 17 00:00:00 2001
From: Davide Valsecchi
Date: Mon, 11 Dec 2023 20:06:34 +0100
Subject: [PATCH 72/80] better defaults and typos

---
 src/coffea/dataset_tools/dataset_query.py | 88 +++++++++++++++++------
 1 file changed, 65 insertions(+), 23 deletions(-)

diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py
index 2ce4dc421..80db62e53 100644
--- a/src/coffea/dataset_tools/dataset_query.py
+++ b/src/coffea/dataset_tools/dataset_query.py
@@ -278,7 +278,7 @@ def do_replicas(self, mode=None, selection=None):
         """Query Rucio for replicas.
         Mode: - None: ask the user about the mode
               - round-robin (take files randomly from available sites),
-              - choice: ask the user to choose the specific site
+              - choose: ask the user to choose the specific site
         """
         if selection is None:
             selection = Prompt.ask(
@@ -335,13 +335,13 @@ def do_replicas(self, mode=None, selection=None):
         if mode is None:
             mode = Prompt.ask(
                 "Select sites",
-                choices=["round-robin", "choice", "quit"],
+                choices=["round-robin", "choose", "quit"],
                 default="round-robin",
             )
 
         files_by_site = defaultdict(list)
 
-        if mode == "choice":
+        if mode == "choose":
             ind = list(
                 map(
                     int,
@@ -432,7 +432,7 @@ def do_regex_sites(self):
             self.sites_regex = rf"{regex}"
             print(f"New sites regex: [cyan]{self.sites_regex}")
 
-    def do_sites_filters(self):
+    def do_sites_filters(self, ask_clear=True):
         print("[green bold]Allow-listed sites:")
         if self.sites_allowlist:
             for s in self.sites_allowlist:
@@ -445,11 +445,12 @@ def do_sites_filters(self):
 
         print(f"[bold cyan]Sites regex: [italics]{self.sites_regex}")
 
-        if Confirm.ask("Clear sites restrinction?", default=False):
-            self.sites_allowlist = None
-            self.sites_blocklist = None
-            self.sites_regex = None
-            print("[bold green]Sites filters cleared")
+        if ask_clear:
+            if Confirm.ask("Clear sites restrinction?", default=False):
+                self.sites_allowlist = None
+                self.sites_blocklist = None
+                self.sites_regex = None
+                print("[bold green]Sites filters cleared")
 
     def do_list_replicas(self):
         selection = Prompt.ask(
@@ -506,13 +507,13 @@ def do_preprocess(
             output_file = Prompt.ask(
                 "[yellow bold]Output name", default="output_preprocessing"
             )
-        if not step_size:
+        if step_size is None:
             step_size = IntPrompt.ask("[yellow bold]Step size", default=None)
         if align_to_clusters is None:
             align_to_clusters = Confirm.ask(
                 "[yellow bold]Align to clusters", default=True
             )
-        if not dask_cluster:
+        if dask_cluster is None:
             dask_cluster = Prompt.ask("[yellow bold]Dask cluster url", default="None")
         if dask_cluster == "None":
             dask_cluster = None
@@ -557,11 +558,26 @@ def do_preprocess(
     parser.add_argument(
         "-o",
         "--output",
+        help="Output name for dataset discovery output (no fileset preprocessing)",
+        type=str,
+        required=False,
+        default="output_dataset",
+    )
+    parser.add_argument(
+        "-fo",
+        "--fileset-output",
         help="Output name for fileset",
         type=str,
         required=False,
         default="output_fileset",
     )
+    parser.add_argument(
+        "-p", "--preprocess", help="Preprocess with dask", action="store_true"
+    )
+    parser.add_argument(
+        "--step-size", help="Step size for preprocessing", type=int, default=500000
+    )
+    parser.add_argument("--dask-cluster", help="Dask cluster url", type=str, default="")
     parser.add_argument(
         "-as",
         "--allow-sites",
         help="List of sites to be allowlisted",
         nargs="+",
         type=str,
     )
     parser.add_argument(
         "-bs",
         "--block-sites",
         help="List of sites to be blocklisted",
         nargs="+",
         type=str,
     )
     parser.add_argument(
         "-rs",
         "--regex-sites",
         help="Regex string to be used to filter the sites",
         type=str,
     )
     parser.add_argument(
+        "--query-results-strategy",
+        help="Mode for query results selection: [all|manual]",
+        type=str,
+        default="all",
+    )
     parser.add_argument(
         "--replicas-strategy",
-        help="Mode for selecting replicas for datasets: [manual|round-robin|choice]",
+        help="Mode for selecting replicas for datasets: [manual|round-robin|choose]",
         default="round-robin",
         required=False,
     )
@@ -592,6 +614,13 @@ def do_preprocess(
 
     cli = DataDiscoveryCLI()
 
+    if args.allow_sites:
+        cli.sites_allowlist = args.allow_sites
+    if args.block_sites:
+        cli.sites_blocklist = args.block_sites
+    if args.regex_sites:
+        cli.sites_regex = args.regex_sites
+
     if args.dataset_definition:
         # Load the dataset definition if present:
         with open(args.dataset_definition) as file:
             dataset_definition = json.load(file)
@@ -603,32 +632,45 @@ def do_preprocess(
             cli.do_query(dataset_query)
             # Now selecting the results depending on the interactive mode or not.
             # Metadata are passed to the selection function to associated them with the selected dataset.
-            cli.do_select(selection="all", metadata=dataset_meta)
+            if args.query_results_strategy not in ["all", "manual"]:
+                print(
+                    "Invalid query-results-strategy option: please choose between: manual|all"
+                )
+                exit(1)
+            elif args.query_results_strategy == "manual":
+                cli.do_select(selection=None, metadata=dataset_meta)
+            else:
+                cli.do_select(selection="all", metadata=dataset_meta)
 
         # Now list all
        cli.do_list_selected()
 
-        if args.allow_sites:
-            cli.sites_allowlist = args.allow_sites
-        if args.block_sites:
-            cli.sites_blocklist = args.block_sites
-        if args.regex_sites:
-            cli.sites_regex = args.regex_sites
-
         # selecting replicas
+        cli.do_sites_filters(ask_clear=False)
+        print("Getting replicas")
         if args.replicas_strategy == "manual":
             cli.do_replicas(mode=None, selection="all")
         else:
-            if args.replicas_strategy not in ["round-robin", "choice"]:
+            if args.replicas_strategy not in ["round-robin", "choose"]:
                 print(
-                    "Invalid replicas-strategy: please choice between manual|round-robin|choice"
+                    "Invalid replicas-strategy: please choose between manual|round-robin|choose"
                 )
                 exit(1)
             cli.do_replicas(mode=args.replicas_strategy, selection="all")
 
         # Now list all
         cli.do_list_selected()
-        print("CIAO")
+
+        # Save
+        if args.output:
+            cli.do_save(filename=args.output)
+        if args.preprocess:
+            cli.do_preprocess(
+                output_file=args.fileset_output,
+                step_size=args.step_size,
+                dask_cluster=args.dask_cluster,
+                align_to_clusters=False,
+            )
 
     if args.cli:
         cli.start_cli()

From 6986a109ff590638295501c2552ea2033eeb1b92 Mon Sep 17 00:00:00 2001
From: Davide Valsecchi
Date: Mon, 11 Dec 2023 23:17:43 +0100
Subject: [PATCH 73/80] Adding docs and dataset_discovery notebook

---
 binder/dataset_discovery.ipynb            | 1436 +++++++++++++++++++++
 src/coffea/dataset_tools/dataset_query.py |  122 +-
 src/coffea/dataset_tools/rucio_utils.py   |   25 +-
 3 files changed, 1524 insertions(+), 59 deletions(-)
 create mode 100644 binder/dataset_discovery.ipynb

diff --git a/binder/dataset_discovery.ipynb b/binder/dataset_discovery.ipynb
new file mode 100644
index 000000000..9c29063fe
--- /dev/null
+++ b/binder/dataset_discovery.ipynb
@@ -0,0 +1,1436 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "c5754206-f41b-4e08-bc4d-496df85e8194",
+   "metadata": {},
+   "source": [
+    "# Dataset discovery tools"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "42242097-c04e-459e-9f3a-1d746df4e9dd",
+   "metadata": {},
+   "source": [
+    "# Using Rucio utils directly"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "09103c77-b8e6-4d61-920b-b1ff8fba8791",
+   "metadata": {},
+   "outputs": [],
"source": [ + "from coffea.dataset_tools import rucio_utils\n", + "from coffea.dataset_tools.dataset_query import print_dataset_query\n", + "from rich.console import Console\n", + "from rich.table import Table" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d62b43cb-53c0-4e2d-b571-1a0683e34dc5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client = rucio_utils.get_rucio_client()\n", + "client" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "0359afc0-fc98-4aa8-acf4-288ef19ac7db", + "metadata": {}, + "outputs": [], + "source": [ + "query = \"/TTToSemiLeptonic_*_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9*/NANOAODSIM\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "659bee88-9fb0-4d1a-9544-a97372595f18", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['/TTToSemiLeptonic_TuneCP5CR1_erdON_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM',\n", + " '/TTToSemiLeptonic_TuneCP5CR2_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/NANOAODSIM',\n", + " '/TTToSemiLeptonic_TuneCP5_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM',\n", + " '/TTToSemiLeptonic_TuneCP5_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-20UL18JMENano_106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "outlist, outtree = rucio_utils.query_dataset(\n", + " query,\n", + " client=client,\n", + " tree=True,\n", + " scope=\"cms\", \n", + " )\n", + "\n", + "outlist[1:5]" + ] + }, + { + "cell_type": "markdown", + "id": "9bc2a454-4915-4366-9c02-2e389e9eb6fb", + "metadata": {}, + "source": [ + "Let's now pretty-print the results in a table using an utility function in the `dataset_query` module." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "4487d997-dc22-4a47-87df-4da14fa5b35a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
              Query: /TTToSemiLeptonic_*_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9*/NANOAODSIM               \n",
+       "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳┓\n",
+       "┃ Name                               Tag                                                                        ┃\n",
+       "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇┩\n",
+       "│ TTToSemiLeptonic_TuneCP5CR1_13Te…  (1) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/NAN… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_TuneCP5CR1_erdO…  (2) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NAN… ││\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_TuneCP5CR2_13Te…  (3) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/NAN… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_TuneCP5_13TeV-p…  (4) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NAN… ││\n",
+       "│                                    (5) RunIISummer20UL18NanoAODv9-20UL18JMENano_106X_upgrade2018_realistic_v… │\n",
+       "│                                    (6) RunIISummer20UL18NanoAODv9-PUForMUOVal_106X_upgrade2018_realistic_v16… ││\n",
+       "│                                    (7) RunIISummer20UL18NanoAODv9-PUForTRK_TRK_106X_upgrade2018_realistic_v1… │\n",
+       "│                                    (8) RunIISummer20UL18NanoAODv9-PUForTRKv2_TRKv2_106X_upgrade2018_realisti… ││\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_TuneCP5_erdON_1…  (9) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NAN… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_TuneCP5down_13T…  (10) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… ││\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_TuneCP5up_13TeV…  (11) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_Vcb_TuneCP5_13T…  (12) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/NA… ││\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_hdampDOWN_TuneC…  (13) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_hdampUP_TuneCP5…  (14) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… ││\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_mtop166p5_TuneC…  (15) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_mtop169p5_TuneC…  (16) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… ││\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_mtop171p5_TuneC…  (17) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_mtop173p5_TuneC…  (18) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… ││\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_mtop175p5_TuneC…  (19) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_mtop178p5_TuneC…  (20) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… ││\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_widthx0p55_Tune…  (21) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_widthx0p7_TuneC…  (22) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… ││\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_widthx0p85_Tune…  (23) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_widthx1p15_Tune…  (24) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… ││\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_widthx1p3_TuneC…  (25) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_widthx1p45_Tune…  (26) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… ││\n",
+       "└───────────────────────────────────┴────────────────────────────────────────────────────────────────────────────┴┘\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Query: \u001b[0m\u001b[1;3;31m/TTToSemiLeptonic_*_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9*/NANOAODSIM\u001b[0m\u001b[3m \u001b[0m\n", + "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mName \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mTag \u001b[0m\u001b[1m \u001b[0m┃┃\n", + "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇┩\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_TuneCP5CR1_13Te…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(1)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/NAN…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_TuneCP5CR1_erdO…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(2)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NAN…\u001b[0m\u001b[35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_TuneCP5CR2_13Te…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(3)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/NAN…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_TuneCP5_13TeV-p…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(4)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NAN…\u001b[0m\u001b[35m \u001b[0m││\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(5)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-20UL18JMENano_106X_upgrade2018_realistic_v…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "│\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(6)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-PUForMUOVal_106X_upgrade2018_realistic_v16…\u001b[0m\u001b[35m \u001b[0m││\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(7)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-PUForTRK_TRK_106X_upgrade2018_realistic_v1…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "│\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(8)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-PUForTRKv2_TRKv2_106X_upgrade2018_realisti…\u001b[0m\u001b[35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_TuneCP5_erdON_1…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(9)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NAN…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_TuneCP5down_13T…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m 
\u001b[0m\u001b[1;35m(10)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_TuneCP5up_13TeV…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(11)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_Vcb_TuneCP5_13T…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(12)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/NA…\u001b[0m\u001b[35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_hdampDOWN_TuneC…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(13)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_hdampUP_TuneCP5…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(14)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_mtop166p5_TuneC…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(15)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_mtop169p5_TuneC…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(16)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_mtop171p5_TuneC…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(17)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_mtop173p5_TuneC…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(18)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_mtop175p5_TuneC…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(19)\u001b[0m\u001b[2;35m 
RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_mtop178p5_TuneC…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(20)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_widthx0p55_Tune…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(21)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_widthx0p7_TuneC…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(22)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_widthx0p85_Tune…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(23)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_widthx1p15_Tune…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(24)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_widthx1p3_TuneC…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(25)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_widthx1p45_Tune…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(26)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[35m \u001b[0m││\n", + "└───────────────────────────────────┴────────────────────────────────────────────────────────────────────────────┴┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "console = Console()\n", + "print_dataset_query(query, outtree, console)" + ] + }, + { + "cell_type": "markdown", + "id": "c213d5fc-6424-4cdf-8751-88ced7987a59", + "metadata": {}, + "source": [ + "### Dataset replicas" + ] + }, + { + "cell_type": "markdown", + "id": "961b4ad8-e3d6-49b1-a2ce-7cad49b46f06", + "metadata": {}, + "source": [ + "Let's select one dataset and look for available replicas" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d08fd6ed-4b3a-4e9f-994a-d1bd529421a7", + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/plain": [ + "'/TTToSemiLeptonic_TuneCP5CR1_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/NANOAODSIM'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = outlist[0]\n", + "dataset" + ] + }, + { + "cell_type": "markdown", + "id": "a605fb64-6e0b-4fbe-8807-84b9d75f2d53", + "metadata": {}, + "source": [ + "Using the option `mode='full'` in the function `rucio_utils.get_dataset_file_replicas()` one gets all the available replicas. " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "2d64069e-ea8f-48c2-bd33-43fc555f6ec8", + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " (\n", + " outfiles,\n", + " outsites,\n", + " sites_counts,\n", + " ) = rucio_utils.get_dataset_files_replicas(\n", + " dataset,\n", + " allowlist_sites=[],\n", + " blocklist_sites=[],\n", + " regex_sites=[],\n", + " mode=\"full\", # full or first. \"full\"==all the available replicas\n", + " client=client,\n", + " )\n", + "except Exception as e:\n", + " print(f\"\\n[red bold] Exception: {e}[/]\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "3e4fc6c2-f378-40d2-a4ea-f265b6c18887", + "metadata": {}, + "outputs": [], + "source": [ + "def print_replicas(sites_counts):\n", + " console.print(f\"[cyan]Sites availability for dataset: [red]{dataset}\")\n", + " table = Table(title=\"Available replicas\")\n", + " table.add_column(\"Index\", justify=\"center\")\n", + " table.add_column(\"Site\", justify=\"left\", style=\"cyan\", no_wrap=True)\n", + " table.add_column(\"Files\", style=\"magenta\", no_wrap=True)\n", + " table.add_column(\"Availability\", justify=\"center\")\n", + " table.row_styles = [\"dim\", \"none\"]\n", + " Nfiles = len(outfiles)\n", + " \n", + " sorted_sites = dict(\n", + " sorted(sites_counts.items(), key=lambda x: x[1], reverse=True)\n", + " )\n", + " for i, (site, stat) in enumerate(sorted_sites.items()):\n", + " table.add_row(\n", + " str(i), site, f\"{stat} / {Nfiles}\", f\"{stat*100/Nfiles:.1f}%\"\n", + " )\n", + " console.print(table)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "79c68044-dc3b-4dd5-a0d3-c3f6ddd0bea1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Sites availability for dataset: \n",
+       "/TTToSemiLeptonic_TuneCP5CR1_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2\n",
+       "/NANOAODSIM\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[36mSites availability for dataset: \u001b[0m\n", + "\u001b[31m/TTToSemiLeptonic_TuneCP5CR1_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2\u001b[0m\n", + "\u001b[31m/\u001b[0m\u001b[31mNANOAODSIM\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                    Available replicas                    \n",
+       "┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n",
+       "┃ Index  Site                 Files      Availability ┃\n",
+       "┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n",
+       "│   0    T2_DE_DESY           294 / 294     100.0%    │\n",
+       "│   1   │ T1_DE_KIT_Disk       294 / 294 │    100.0%    │\n",
+       "│   2    T1_UK_RAL_Disk       294 / 294     100.0%    │\n",
+       "│   3   │ T1_RU_JINR_Disk      294 / 294 │    100.0%    │\n",
+       "│   4    T3_CH_PSI            294 / 294     100.0%    │\n",
+       "│   5   │ T3_KR_UOS            294 / 294 │    100.0%    │\n",
+       "│   6    T1_US_FNAL_Disk      193 / 294     65.6%     │\n",
+       "│   7   │ T2_US_Nebraska       99 / 294  │    33.7%     │\n",
+       "│   8    T1_IT_CNAF_Disk      58 / 294      19.7%     │\n",
+       "│   9   │ T2_US_Purdue         53 / 294  │    18.0%     │\n",
+       "│  10    T2_BE_IIHE           50 / 294      17.0%     │\n",
+       "│  11   │ T2_US_MIT            50 / 294  │    17.0%     │\n",
+       "│  12    T1_ES_PIC_Disk       43 / 294      14.6%     │\n",
+       "│  13   │ T2_US_Vanderbilt     40 / 294  │    13.6%     │\n",
+       "│  14    T2_BR_SPRACE         39 / 294      13.3%     │\n",
+       "│  15   │ T2_US_Florida        33 / 294  │    11.2%     │\n",
+       "│  16    T2_IT_Legnaro        28 / 294       9.5%     │\n",
+       "│  17   │ T2_US_UCSD           28 / 294  │     9.5%     │\n",
+       "│  18    T2_UA_KIPT           26 / 294       8.8%     │\n",
+       "│  19   │ T2_US_Caltech        24 / 294  │     8.2%     │\n",
+       "│  20    T2_US_Wisconsin      22 / 294       7.5%     │\n",
+       "│  21   │ T2_TR_METU           18 / 294  │     6.1%     │\n",
+       "│  22    T2_ES_CIEMAT         17 / 294       5.8%     │\n",
+       "│  23   │ T2_DE_RWTH           11 / 294  │     3.7%     │\n",
+       "│  24    T2_BR_UERJ           7 / 294        2.4%     │\n",
+       "│  25   │ T2_UK_SGrid_Bristol  3 / 294   │     1.0%     │\n",
+       "│  26    T2_ES_IFCA           2 / 294        0.7%     │\n",
+       "└───────┴─────────────────────┴───────────┴──────────────┘\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Available replicas \u001b[0m\n", + "┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mIndex\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mSite \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mFiles \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mAvailability\u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n", + "│\u001b[2m \u001b[0m\u001b[2m 0 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_DE_DESY \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m294 / 294\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 1 │\u001b[36m \u001b[0m\u001b[36mT1_DE_KIT_Disk \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m294 / 294\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 2 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_UK_RAL_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m294 / 294\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 3 │\u001b[36m \u001b[0m\u001b[36mT1_RU_JINR_Disk \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m294 / 294\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 4 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT3_CH_PSI \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m294 / 294\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 5 │\u001b[36m \u001b[0m\u001b[36mT3_KR_UOS \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m294 / 294\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 6 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_US_FNAL_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m193 / 294\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 65.6% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 7 │\u001b[36m \u001b[0m\u001b[36mT2_US_Nebraska \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m99 / 294 \u001b[0m\u001b[35m \u001b[0m│ 33.7% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 8 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_IT_CNAF_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m58 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 19.7% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 9 │\u001b[36m \u001b[0m\u001b[36mT2_US_Purdue \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m53 / 294 \u001b[0m\u001b[35m \u001b[0m│ 18.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 10 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_BE_IIHE \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m50 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 17.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 11 │\u001b[36m \u001b[0m\u001b[36mT2_US_MIT \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m50 / 294 \u001b[0m\u001b[35m \u001b[0m│ 17.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 12 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_ES_PIC_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m43 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 14.6% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 13 │\u001b[36m \u001b[0m\u001b[36mT2_US_Vanderbilt \u001b[0m\u001b[36m 
\u001b[0m│\u001b[35m \u001b[0m\u001b[35m40 / 294 \u001b[0m\u001b[35m \u001b[0m│ 13.6% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 14 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_BR_SPRACE \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m39 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 13.3% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 15 │\u001b[36m \u001b[0m\u001b[36mT2_US_Florida \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m33 / 294 \u001b[0m\u001b[35m \u001b[0m│ 11.2% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 16 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_IT_Legnaro \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m28 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 9.5% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 17 │\u001b[36m \u001b[0m\u001b[36mT2_US_UCSD \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m28 / 294 \u001b[0m\u001b[35m \u001b[0m│ 9.5% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 18 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_UA_KIPT \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m26 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 8.8% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 19 │\u001b[36m \u001b[0m\u001b[36mT2_US_Caltech \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m24 / 294 \u001b[0m\u001b[35m \u001b[0m│ 8.2% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 20 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_US_Wisconsin \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m22 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 7.5% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 21 │\u001b[36m \u001b[0m\u001b[36mT2_TR_METU \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m18 / 294 \u001b[0m\u001b[35m \u001b[0m│ 6.1% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 22 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_ES_CIEMAT \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m17 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 5.8% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 23 │\u001b[36m \u001b[0m\u001b[36mT2_DE_RWTH \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m11 / 294 \u001b[0m\u001b[35m \u001b[0m│ 3.7% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 24 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_BR_UERJ \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m7 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 2.4% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 25 │\u001b[36m \u001b[0m\u001b[36mT2_UK_SGrid_Bristol\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m3 / 294 \u001b[0m\u001b[35m \u001b[0m│ 1.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 26 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_ES_IFCA \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m2 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 0.7% \u001b[0m\u001b[2m \u001b[0m│\n", + "└───────┴─────────────────────┴───────────┴──────────────┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "print_replicas(sites_counts)" + ] + }, + { + "cell_type": "markdown", + "id": "c9544ceb-5949-4bd3-b997-14da4aa2d956", + "metadata": {}, + "source": [ + "### Filtering sites\n", + "Grid sites can be filtered in 3 different ways\n", + "- **allowlist**: if this list of specified, only the sites in the list are considered. 
No blocklist and regex are considered\n", + "- **blocklist**: if this list is specified, those sites are excluded from the replicas\n", + "- **regex_sites**: regex filter the sites to be considered, on top of the blocklist" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "1f6b586c-a8b7-40d8-a25a-b02e94f4a892", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Sites availability for dataset: \n",
+       "/TTToSemiLeptonic_TuneCP5CR1_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2\n",
+       "/NANOAODSIM\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[36mSites availability for dataset: \u001b[0m\n", + "\u001b[31m/TTToSemiLeptonic_TuneCP5CR1_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2\u001b[0m\n", + "\u001b[31m/\u001b[0m\u001b[31mNANOAODSIM\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                  Available replicas                  \n",
+       "┏━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n",
+       "┃ Index  Site             Files      Availability ┃\n",
+       "┡━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n",
+       "│   0    T2_DE_DESY       294 / 294     100.0%    │\n",
+       "│   1   │ T1_US_FNAL_Disk  193 / 294 │    65.6%     │\n",
+       "└───────┴─────────────────┴───────────┴──────────────┘\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Available replicas \u001b[0m\n", + "┏━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mIndex\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mSite \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mFiles \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mAvailability\u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n", + "│\u001b[2m \u001b[0m\u001b[2m 0 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_DE_DESY \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m294 / 294\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 1 │\u001b[36m \u001b[0m\u001b[36mT1_US_FNAL_Disk\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m193 / 294\u001b[0m\u001b[35m \u001b[0m│ 65.6% │\n", + "└───────┴─────────────────┴───────────┴──────────────┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Example with allowlist\n", + "try:\n", + " (\n", + " outfiles,\n", + " outsites,\n", + " sites_counts,\n", + " ) = rucio_utils.get_dataset_files_replicas(\n", + " dataset,\n", + " allowlist_sites=[\"T2_DE_DESY\", \"T1_US_FNAL_Disk\"],\n", + " blocklist_sites=[],\n", + " regex_sites=None,\n", + " mode=\"full\", # full or first. \"full\"==all the available replicas\n", + " client=client,\n", + " )\n", + "except Exception as e:\n", + " print(f\"\\n[red bold] Exception: {e}[/]\")\n", + "\n", + "print_replicas(sites_counts)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "12f7e403-67fe-42c0-a3ee-a668006b1836", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Sites availability for dataset: \n",
+       "/TTToSemiLeptonic_TuneCP5CR1_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2\n",
+       "/NANOAODSIM\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[36mSites availability for dataset: \u001b[0m\n", + "\u001b[31m/TTToSemiLeptonic_TuneCP5CR1_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2\u001b[0m\n", + "\u001b[31m/\u001b[0m\u001b[31mNANOAODSIM\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                    Available replicas                    \n",
+       "┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n",
+       "┃ Index  Site                 Files      Availability ┃\n",
+       "┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n",
+       "│   0    T1_DE_KIT_Disk       294 / 294     100.0%    │\n",
+       "│   1   │ T1_UK_RAL_Disk       294 / 294 │    100.0%    │\n",
+       "│   2    T1_RU_JINR_Disk      294 / 294     100.0%    │\n",
+       "│   3   │ T3_KR_UOS            294 / 294 │    100.0%    │\n",
+       "│   4    T1_US_FNAL_Disk      193 / 294     65.6%     │\n",
+       "│   5   │ T2_US_Nebraska       99 / 294  │    33.7%     │\n",
+       "│   6    T1_IT_CNAF_Disk      58 / 294      19.7%     │\n",
+       "│   7   │ T2_US_Purdue         53 / 294  │    18.0%     │\n",
+       "│   8    T2_BE_IIHE           50 / 294      17.0%     │\n",
+       "│   9   │ T2_US_MIT            50 / 294  │    17.0%     │\n",
+       "│  10    T1_ES_PIC_Disk       43 / 294      14.6%     │\n",
+       "│  11   │ T2_US_Vanderbilt     40 / 294  │    13.6%     │\n",
+       "│  12    T2_BR_SPRACE         39 / 294      13.3%     │\n",
+       "│  13   │ T2_US_Florida        33 / 294  │    11.2%     │\n",
+       "│  14    T2_IT_Legnaro        28 / 294       9.5%     │\n",
+       "│  15   │ T2_US_UCSD           28 / 294  │     9.5%     │\n",
+       "│  16    T2_UA_KIPT           26 / 294       8.8%     │\n",
+       "│  17   │ T2_US_Caltech        24 / 294  │     8.2%     │\n",
+       "│  18    T2_US_Wisconsin      22 / 294       7.5%     │\n",
+       "│  19   │ T2_TR_METU           18 / 294  │     6.1%     │\n",
+       "│  20    T2_ES_CIEMAT         17 / 294       5.8%     │\n",
+       "│  21   │ T2_DE_RWTH           11 / 294  │     3.7%     │\n",
+       "│  22    T2_BR_UERJ           7 / 294        2.4%     │\n",
+       "│  23   │ T2_UK_SGrid_Bristol  3 / 294   │     1.0%     │\n",
+       "│  24    T2_ES_IFCA           2 / 294        0.7%     │\n",
+       "└───────┴─────────────────────┴───────────┴──────────────┘\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Available replicas \u001b[0m\n", + "┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mIndex\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mSite \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mFiles \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mAvailability\u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n", + "│\u001b[2m \u001b[0m\u001b[2m 0 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_DE_KIT_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m294 / 294\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 1 │\u001b[36m \u001b[0m\u001b[36mT1_UK_RAL_Disk \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m294 / 294\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 2 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_RU_JINR_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m294 / 294\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 3 │\u001b[36m \u001b[0m\u001b[36mT3_KR_UOS \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m294 / 294\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 4 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_US_FNAL_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m193 / 294\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 65.6% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 5 │\u001b[36m \u001b[0m\u001b[36mT2_US_Nebraska \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m99 / 294 \u001b[0m\u001b[35m \u001b[0m│ 33.7% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 6 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_IT_CNAF_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m58 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 19.7% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 7 │\u001b[36m \u001b[0m\u001b[36mT2_US_Purdue \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m53 / 294 \u001b[0m\u001b[35m \u001b[0m│ 18.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 8 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_BE_IIHE \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m50 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 17.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 9 │\u001b[36m \u001b[0m\u001b[36mT2_US_MIT \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m50 / 294 \u001b[0m\u001b[35m \u001b[0m│ 17.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 10 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_ES_PIC_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m43 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 14.6% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 11 │\u001b[36m \u001b[0m\u001b[36mT2_US_Vanderbilt \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m40 / 294 \u001b[0m\u001b[35m \u001b[0m│ 13.6% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 12 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_BR_SPRACE \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m39 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 13.3% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 13 │\u001b[36m \u001b[0m\u001b[36mT2_US_Florida \u001b[0m\u001b[36m 
\u001b[0m│\u001b[35m \u001b[0m\u001b[35m33 / 294 \u001b[0m\u001b[35m \u001b[0m│ 11.2% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 14 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_IT_Legnaro \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m28 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 9.5% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 15 │\u001b[36m \u001b[0m\u001b[36mT2_US_UCSD \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m28 / 294 \u001b[0m\u001b[35m \u001b[0m│ 9.5% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 16 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_UA_KIPT \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m26 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 8.8% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 17 │\u001b[36m \u001b[0m\u001b[36mT2_US_Caltech \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m24 / 294 \u001b[0m\u001b[35m \u001b[0m│ 8.2% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 18 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_US_Wisconsin \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m22 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 7.5% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 19 │\u001b[36m \u001b[0m\u001b[36mT2_TR_METU \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m18 / 294 \u001b[0m\u001b[35m \u001b[0m│ 6.1% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 20 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_ES_CIEMAT \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m17 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 5.8% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 21 │\u001b[36m \u001b[0m\u001b[36mT2_DE_RWTH \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m11 / 294 \u001b[0m\u001b[35m \u001b[0m│ 3.7% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 22 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_BR_UERJ \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m7 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 2.4% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 23 │\u001b[36m \u001b[0m\u001b[36mT2_UK_SGrid_Bristol\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m3 / 294 \u001b[0m\u001b[35m \u001b[0m│ 1.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 24 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_ES_IFCA \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m2 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 0.7% \u001b[0m\u001b[2m \u001b[0m│\n", + "└───────┴─────────────────────┴───────────┴──────────────┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Example with blocklist\n", + "try:\n", + " (\n", + " outfiles,\n", + " outsites,\n", + " sites_counts,\n", + " ) = rucio_utils.get_dataset_files_replicas(\n", + " dataset,\n", + " allowlist_sites=[],\n", + " blocklist_sites=[\"T2_DE_DESY\", \"T3_CH_PSI\"],\n", + " regex_sites=None,\n", + " mode=\"full\", # full or first. \"full\"==all the available replicas\n", + " client=client,\n", + " )\n", + "except Exception as e:\n", + " print(f\"\\n[red bold] Exception: {e}[/]\")\n", + "\n", + "print_replicas(sites_counts)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "f5dafcc2-c32e-4e33-9878-183a8e476b73", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Sites availability for dataset: \n",
+       "/TTToSemiLeptonic_TuneCP5CR1_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2\n",
+       "/NANOAODSIM\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[36mSites availability for dataset: \u001b[0m\n", + "\u001b[31m/TTToSemiLeptonic_TuneCP5CR1_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2\u001b[0m\n", + "\u001b[31m/\u001b[0m\u001b[31mNANOAODSIM\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                    Available replicas                    \n",
+       "┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n",
+       "┃ Index  Site                 Files      Availability ┃\n",
+       "┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n",
+       "│   0    T2_DE_DESY           294 / 294     100.0%    │\n",
+       "│   1   │ T1_DE_KIT_Disk       294 / 294 │    100.0%    │\n",
+       "│   2    T1_UK_RAL_Disk       294 / 294     100.0%    │\n",
+       "│   3   │ T3_CH_PSI            294 / 294 │    100.0%    │\n",
+       "│   4    T1_IT_CNAF_Disk      58 / 294      19.7%     │\n",
+       "│   5   │ T2_BE_IIHE           50 / 294  │    17.0%     │\n",
+       "│   6    T1_ES_PIC_Disk       43 / 294      14.6%     │\n",
+       "│   7   │ T2_IT_Legnaro        28 / 294  │     9.5%     │\n",
+       "│   8    T2_ES_CIEMAT         17 / 294       5.8%     │\n",
+       "│   9   │ T2_DE_RWTH           11 / 294  │     3.7%     │\n",
+       "│  10    T2_UK_SGrid_Bristol  3 / 294        1.0%     │\n",
+       "│  11   │ T2_ES_IFCA           2 / 294   │     0.7%     │\n",
+       "└───────┴─────────────────────┴───────────┴──────────────┘\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Available replicas \u001b[0m\n", + "┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mIndex\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mSite \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mFiles \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mAvailability\u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n", + "│\u001b[2m \u001b[0m\u001b[2m 0 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_DE_DESY \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m294 / 294\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 1 │\u001b[36m \u001b[0m\u001b[36mT1_DE_KIT_Disk \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m294 / 294\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 2 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_UK_RAL_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m294 / 294\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 3 │\u001b[36m \u001b[0m\u001b[36mT3_CH_PSI \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m294 / 294\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 4 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_IT_CNAF_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m58 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 19.7% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 5 │\u001b[36m \u001b[0m\u001b[36mT2_BE_IIHE \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m50 / 294 \u001b[0m\u001b[35m \u001b[0m│ 17.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 6 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_ES_PIC_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m43 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 14.6% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 7 │\u001b[36m \u001b[0m\u001b[36mT2_IT_Legnaro \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m28 / 294 \u001b[0m\u001b[35m \u001b[0m│ 9.5% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 8 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_ES_CIEMAT \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m17 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 5.8% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 9 │\u001b[36m \u001b[0m\u001b[36mT2_DE_RWTH \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m11 / 294 \u001b[0m\u001b[35m \u001b[0m│ 3.7% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 10 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_UK_SGrid_Bristol\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m3 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 1.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 11 │\u001b[36m \u001b[0m\u001b[36mT2_ES_IFCA \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m2 / 294 \u001b[0m\u001b[35m \u001b[0m│ 0.7% │\n", + "└───────┴─────────────────────┴───────────┴──────────────┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Example with regex\n", + "try:\n", + " (\n", + " outfiles,\n", + " outsites,\n", + " sites_counts,\n", + " ) = rucio_utils.get_dataset_files_replicas(\n", + " dataset,\n", + " allowlist_sites=[],\n", + " blocklist_sites=[],\n", 
+ " regex_sites= r\"T[123]_(FR|IT|BE|CH|DE|ES|UK)_\\w+\",\n", + " mode=\"full\", # full or first. \"full\"==all the available replicas\n", + " client=client,\n", + " )\n", + "except Exception as e:\n", + " print(f\"\\n[red bold] Exception: {e}[/]\")\n", + "\n", + "print_replicas(sites_counts)" + ] + }, + { + "cell_type": "markdown", + "id": "0b805dde-dd38-46a4-92ad-55ab2e4a4876", + "metadata": {}, + "source": [ + "# Using the DataDiscoveryCLI\n", + "Manipulating the dataset query and replicas is simplified by the `DataDiscoveryCLI` class in `dataset_query` module." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "39846193-d6f2-4de5-ba42-a089d1b0786d", + "metadata": {}, + "outputs": [], + "source": [ + "from coffea.dataset_tools import rucio_utils\n", + "from coffea.dataset_tools.dataset_query import print_dataset_query\n", + "from rich.console import Console\n", + "from rich.table import Table\n", + "from coffea.dataset_tools.dataset_query import DataDiscoveryCLI" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "eaba3e39-c95a-4282-83e2-3aadf748adca", + "metadata": {}, + "outputs": [], + "source": [ + "dataset_definition = {\n", + " \"/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18NanoAODv9-106X*/NANOAODSIM\": {\"short_name\": \"ZJets\",\n", + " \"metadata\": {\"xsec\": 100.0,\"isMC\":True}},\n", + " \"/SingleMuon/Run2018C-UL20*_MiniAODv2_NanoAODv9_GT36*/NANOAOD\": {\"short_name\": \"SingleMuon\", \"metadata\": {\"isMC\":False}}\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "716a6c0c-ea07-498a-a010-f9e7f87ba3a3", + "metadata": {}, + "outputs": [], + "source": [ + "ddc = DataDiscoveryCLI()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "51a8ef69-5d4e-4089-b3a8-d0290cc973c1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Help on method load_dataset_definition in module coffea.dataset_tools.dataset_query:\n", + "\n", + "load_dataset_definition(dataset_definition, query_results_strategy='all', replicas_strategy='round-robin') method of coffea.dataset_tools.dataset_query.DataDiscoveryCLI instance\n", + "\n" + ] + } + ], + "source": [ + "help(ddc.load_dataset_definition)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "db3ab214-93d3-49b1-b6e1-9374b9fcc1f0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
 Querying rucio for replicas: /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[32m⠧\u001b[0m Querying rucio for replicas: \u001b[1;31m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
Sites availability for dataset: /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[36mSites availability for dataset: \u001b[0m\u001b[31m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\u001b[31mNANOAOD\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                   Available replicas                   \n",
+       "┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━┓\n",
+       "┃ Index  Site                 Files    Availability ┃\n",
+       "┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━┩\n",
+       "│   0    T2_DE_DESY           67 / 67     100.0%    │\n",
+       "│   1   │ T3_KR_KISTI          67 / 67 │    100.0%    │\n",
+       "│   2    T2_TW_NCHC           67 / 67     100.0%    │\n",
+       "│   3   │ T2_BE_IIHE           67 / 67 │    100.0%    │\n",
+       "│   4    T2_US_Purdue         67 / 67     100.0%    │\n",
+       "│   5   │ T2_ES_CIEMAT         67 / 67 │    100.0%    │\n",
+       "│   6    T3_FR_IPNL           67 / 67     100.0%    │\n",
+       "│   7   │ T1_US_FNAL_Disk      61 / 67 │    91.0%     │\n",
+       "│   8    T2_UK_London_IC      39 / 67     58.2%     │\n",
+       "│   9   │ T1_FR_CCIN2P3_Disk   38 / 67 │    56.7%     │\n",
+       "│  10    T2_US_Caltech        26 / 67     38.8%     │\n",
+       "│  11   │ T2_CH_CERN           25 / 67 │    37.3%     │\n",
+       "│  12    T2_DE_RWTH           22 / 67     32.8%     │\n",
+       "│  13   │ T1_IT_CNAF_Disk      20 / 67 │    29.9%     │\n",
+       "│  14    T2_US_Wisconsin      16 / 67     23.9%     │\n",
+       "│  15   │ T2_US_Florida        16 / 67 │    23.9%     │\n",
+       "│  16    T2_US_Nebraska       13 / 67     19.4%     │\n",
+       "│  17   │ T2_TR_METU           11 / 67 │    16.4%     │\n",
+       "│  18    T1_DE_KIT_Disk       11 / 67     16.4%     │\n",
+       "│  19   │ T2_UK_SGrid_RALPP    6 / 67  │     9.0%     │\n",
+       "│  20    T2_IT_Legnaro        6 / 67       9.0%     │\n",
+       "│  21   │ T2_ES_IFCA           4 / 67  │     6.0%     │\n",
+       "│  22    T2_FR_IPHC           2 / 67       3.0%     │\n",
+       "│  23   │ T2_UK_London_Brunel  1 / 67  │     1.5%     │\n",
+       "└───────┴─────────────────────┴─────────┴──────────────┘\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Available replicas \u001b[0m\n", + "┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mIndex\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mSite \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mFiles \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mAvailability\u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━┩\n", + "│\u001b[2m \u001b[0m\u001b[2m 0 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_DE_DESY \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m67 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 1 │\u001b[36m \u001b[0m\u001b[36mT3_KR_KISTI \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m67 / 67\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 2 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_TW_NCHC \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m67 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 3 │\u001b[36m \u001b[0m\u001b[36mT2_BE_IIHE \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m67 / 67\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 4 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_US_Purdue \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m67 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 5 │\u001b[36m \u001b[0m\u001b[36mT2_ES_CIEMAT \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m67 / 67\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 6 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT3_FR_IPNL \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m67 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 7 │\u001b[36m \u001b[0m\u001b[36mT1_US_FNAL_Disk \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m61 / 67\u001b[0m\u001b[35m \u001b[0m│ 91.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 8 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_UK_London_IC \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m39 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 58.2% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 9 │\u001b[36m \u001b[0m\u001b[36mT1_FR_CCIN2P3_Disk \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m38 / 67\u001b[0m\u001b[35m \u001b[0m│ 56.7% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 10 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_US_Caltech \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m26 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 38.8% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 11 │\u001b[36m \u001b[0m\u001b[36mT2_CH_CERN \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m25 / 67\u001b[0m\u001b[35m \u001b[0m│ 37.3% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 12 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_DE_RWTH \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m22 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 32.8% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 13 │\u001b[36m \u001b[0m\u001b[36mT1_IT_CNAF_Disk \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m20 
/ 67\u001b[0m\u001b[35m \u001b[0m│ 29.9% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 14 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_US_Wisconsin \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m16 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 23.9% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 15 │\u001b[36m \u001b[0m\u001b[36mT2_US_Florida \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m16 / 67\u001b[0m\u001b[35m \u001b[0m│ 23.9% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 16 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_US_Nebraska \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m13 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 19.4% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 17 │\u001b[36m \u001b[0m\u001b[36mT2_TR_METU \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m11 / 67\u001b[0m\u001b[35m \u001b[0m│ 16.4% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 18 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_DE_KIT_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m11 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 16.4% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 19 │\u001b[36m \u001b[0m\u001b[36mT2_UK_SGrid_RALPP \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m6 / 67 \u001b[0m\u001b[35m \u001b[0m│ 9.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 20 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_IT_Legnaro \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m6 / 67 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 9.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 21 │\u001b[36m \u001b[0m\u001b[36mT2_ES_IFCA \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m4 / 67 \u001b[0m\u001b[35m \u001b[0m│ 6.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 22 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_FR_IPHC \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m2 / 67 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 3.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 23 │\u001b[36m \u001b[0m\u001b[36mT2_UK_London_Brunel\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m1 / 67 \u001b[0m\u001b[35m \u001b[0m│ 1.5% │\n", + "└───────┴─────────────────────┴─────────┴──────────────┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Replicas for /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\n",
+       "├── T2_US_Wisconsin\n",
+       "│   ├── root://cmsxrootd.hep.wisc.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
+       "│   │   -v1/2520000/0144EC47-BFA3-EA43-BF05-BD4248ED6031.root\n",
+       "│   └── root://cmsxrootd.hep.wisc.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
+       "│       -v1/2520000/39D52C69-2035-A24B-A413-40976993651D.root\n",
+       "├── T2_DE_DESY\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/0C9615C1-7EE6-CD44-8FC0-04F63B2C16FD.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/62789325-3C0B-FC4D-B578-B41A396399E4.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/81CEA7BA-9E66-BC4F-A96F-32642D59B653.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/D8D41BBC-D514-D342-A514-CCF48575D184.root\n",
+       "│   └── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│       36-v1/2520000/F1B3977A-E777-EC4D-8FC7-981FE4ED5E0C.root\n",
+       "├── T3_FR_IPNL\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/12FAE9F1-7139-924C-A8DE-9699A00FC994.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/1DD0FAC6-3087-E44E-ABCB-8AF812C1310D.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/648ECD9C-8AAA-BB46-8683-C8987CCC73B9.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/8C8690F8-4FEE-1047-85F4-29E414B3D12C.root\n",
+       "│   └── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│       Dv2_NanoAODv9_GT36-v1/2520000/BAAA6E00-7AC3-9947-9262-D9833D3A8B19.root\n",
+       "├── T1_US_FNAL_Disk\n",
+       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│   │   Dv9_GT36-v1/2520000/152C304A-97AD-1649-BCB6-3EA0CCD0DD33.root\n",
+       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│   │   Dv9_GT36-v1/2520000/26FC8C40-EA29-804C-B17D-84FB1C6BC505.root\n",
+       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│   │   Dv9_GT36-v1/2520000/78AC6A39-C303-EB44-9264-71819CC70FCC.root\n",
+       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│   │   Dv9_GT36-v1/2520000/BCBF89A2-329C-744B-A38F-139EA8F94007.root\n",
+       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│   │   Dv9_GT36-v1/2520000/C4F476DA-3D00-334B-867C-7E12F94EE3AB.root\n",
+       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│   │   Dv9_GT36-v1/2520000/F34F4F00-3370-EF4D-AF44-39E474E6530F.root\n",
+       "│   └── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│       Dv9_GT36-v1/2520000/FE3D79A6-27D4-8948-A89B-2F966C5B29D4.root\n",
+       "├── T2_US_Caltech\n",
+       "│   ├── root://xrootd-redir.ultralight.org:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9\n",
+       "│   │   _GT36-v1/2520000/1CEB718A-7DC1-C74A-A7BE-A3C8D9FA785A.root\n",
+       "│   └── root://xrootd-redir.ultralight.org:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9\n",
+       "│       _GT36-v1/2520000/7B14228A-5331-DF4E-B677-7B8AA281D460.root\n",
+       "├── T2_TR_METU\n",
+       "│   ├── root://eymir.grid.metu.edu.tr//dpm/grid.metu.edu.tr/home/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018\n",
+       "│   │   _MiniAODv2_NanoAODv9_GT36-v1/2520000/2747DEFE-A247-1F42-B0EF-E7B7F1D3FCD6.root\n",
+       "│   ├── root://eymir.grid.metu.edu.tr//dpm/grid.metu.edu.tr/home/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018\n",
+       "│   │   _MiniAODv2_NanoAODv9_GT36-v1/2520000/30A3A1AB-2F27-C84E-9437-6BB3881F6856.root\n",
+       "│   └── root://eymir.grid.metu.edu.tr//dpm/grid.metu.edu.tr/home/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018\n",
+       "│       _MiniAODv2_NanoAODv9_GT36-v1/2520000/69ABD79C-C684-8244-9F0D-153C6B8C2D9C.root\n",
+       "├── T2_UK_London_IC\n",
+       "│   ├── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│   │   OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/2D58C3FE-512A-1F48-9AEB-6F80379B8F4A.root\n",
+       "│   ├── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│   │   OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/B78A9B75-3B32-CF4E-A144-375189CF48AE.root\n",
+       "│   └── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│       OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/CBD43A1E-AE2F-0B4D-A642-29FB2E9EB33B.root\n",
+       "├── T1_IT_CNAF_Disk\n",
+       "│   ├── root://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/2DA9130E-8423-304C-9902-1E42CD72E658.root\n",
+       "│   └── root://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│       2520000/CDD2CDF9-72D0-4045-B28F-89002077FB89.root\n",
+       "├── T3_KR_KISTI\n",
+       "│   ├── root://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
+       "│   │   -v1/2520000/365F32F6-F971-1B4D-8E9D-C0ACD74FFB03.root\n",
+       "│   ├── root://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
+       "│   │   -v1/2520000/42DC0F42-82E8-BE47-B04D-544B67274829.root\n",
+       "│   ├── root://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
+       "│   │   -v1/2520000/A350E2E4-705C-2C4D-9B11-3436056EEBE7.root\n",
+       "│   └── root://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
+       "│       -v1/2520000/AB8DD69D-A522-D44C-BB9C-209623F7D41A.root\n",
+       "├── T2_DE_RWTH\n",
+       "│   ├── root://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\n",
+       "│   │   _NanoAODv9_GT36-v1/2520000/37312354-59AB-E44B-BC94-CF424D4B7DDB.root\n",
+       "│   ├── root://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\n",
+       "│   │   _NanoAODv9_GT36-v1/2520000/459261DD-4441-6047-9FF2-1EDE468452C9.root\n",
+       "│   └── root://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\n",
+       "│       _NanoAODv9_GT36-v1/2520000/59DA0585-BD57-CE49-A15E-CDBAC5473EDE.root\n",
+       "├── T2_US_Purdue\n",
+       "│   ├── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/3FE5B677-9AB3-0245-A1CF-4B320592F18F.root\n",
+       "│   ├── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/410C32AB-DEB5-404F-BC6B-92E8F560563F.root\n",
+       "│   ├── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/63047CC0-38C6-F74C-9A00-0DF9050F7CF1.root\n",
+       "│   ├── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/7CCCB2C3-F210-2C42-85DF-AA00293FACFB.root\n",
+       "│   ├── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/D7875684-9F26-084E-9B2B-5E9BB5D353E8.root\n",
+       "│   ├── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/FAF0C67B-A8B4-8A4F-83B1-E43675CE9630.root\n",
+       "│   └── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│       2520000/FE5EEFA5-C07A-5C44-B66D-5B31BE02C7D3.root\n",
+       "├── T2_US_Florida\n",
+       "│   ├── root://cmsio2.rc.ufl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "│   │   520000/51515E3C-C640-3A4C-A16C-DC267FD142BF.root\n",
+       "│   └── root://cmsio2.rc.ufl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "│       520000/6EAA5EDB-0DB3-6E40-87DC-7AB582295D29.root\n",
+       "├── T2_TW_NCHC\n",
+       "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/6809B5E3-6DE6-1541-AE4C-E1804C877EDE.root\n",
+       "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/8369B0EA-E4CC-AC4D-BD3F-0679B3310E09.root\n",
+       "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/AE014F55-84BE-E84E-B447-0B614070CD17.root\n",
+       "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/B3487FE0-B172-AD47-A13A-388C0A9BF93F.root\n",
+       "│   └── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│       1/2520000/BA02D468-A8CE-4F49-884F-F836BB481AD5.root\n",
+       "├── T2_BE_IIHE\n",
+       "│   ├── root://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\n",
+       "│   │   0000/6DDF448B-4605-5C41-9711-1C73EC5F01D3.root\n",
+       "│   ├── root://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\n",
+       "│   │   0000/7B181B92-AA2C-1E44-86FE-B074D359BBB3.root\n",
+       "│   └── root://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\n",
+       "│       0000/F09135D8-FCBE-AF40-BCE8-03A529C5C87F.root\n",
+       "├── T2_US_Nebraska\n",
+       "│   └── root://xrootd-local.unl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│       1/2520000/74A75B73-E5B8-C942-BBC9-1DDDD7F752FB.root\n",
+       "├── T2_ES_CIEMAT\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/7DEA3718-B7BC-EE42-A8BE-11C62BB8536D.root\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/8223C4A3-D4BD-6A4B-A513-54B6668C7122.root\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/A59D511A-A419-714F-8EE1-8B8BAFEC04D5.root\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/A74EFE57-BAD2-C143-B8DC-817CE4F96FD7.root\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/B1B449CE-5952-8347-A9A7-35FE231D0C72.root\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/B9E9087C-255C-C24D-A733-FB9291DC7C3C.root\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/D40D1285-B075-D446-B1BF-86A463EF6993.root\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/DA47C0B6-BCAB-C54C-A6BF-B0A64E88E3D4.root\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/ECD4877E-707B-EA43-A38B-D1B700FBDE79.root\n",
+       "│   └── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│       /2520000/ED95384D-9D3D-AE45-8425-C4C080E691C5.root\n",
+       "├── T1_FR_CCIN2P3_Disk\n",
+       "│   └── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│       18_MiniAODv2_NanoAODv9_GT36-v1/2520000/F16A9138-7563-E540-B6AD-8A8A688B3830.root\n",
+       "├── T2_IT_Legnaro\n",
+       "│   └── root://t2-xrdcms.lnl.infn.it:7070///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-\n",
+       "│       v1/2520000/F6E44EA5-F4C6-E746-AD43-7A263F1E316E.root\n",
+       "└── T2_CH_CERN\n",
+       "    └── root://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "        520000/FCAF4145-8E3F-2142-BDCB-5E276523B592.root\n",
+       "
\n" + ], + "text/plain": [ + "Replicas for \u001b[32m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\u001b[0m\n", + "├── \u001b[32mT2_US_Wisconsin\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsxrootd.hep.wisc.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", + "│ │ \u001b[36m-v1/2520000/0144EC47-BFA3-EA43-BF05-BD4248ED6031.root\u001b[0m\n", + "│ └── \u001b[36mroot://cmsxrootd.hep.wisc.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", + "│ \u001b[36m-v1/2520000/39D52C69-2035-A24B-A413-40976993651D.root\u001b[0m\n", + "├── \u001b[32mT2_DE_DESY\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/0C9615C1-7EE6-CD44-8FC0-04F63B2C16FD.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/62789325-3C0B-FC4D-B578-B41A396399E4.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/81CEA7BA-9E66-BC4F-A96F-32642D59B653.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/D8D41BBC-D514-D342-A514-CCF48575D184.root\u001b[0m\n", + "│ └── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ \u001b[36m36-v1/2520000/F1B3977A-E777-EC4D-8FC7-981FE4ED5E0C.root\u001b[0m\n", + "├── \u001b[32mT3_FR_IPNL\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/12FAE9F1-7139-924C-A8DE-9699A00FC994.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/1DD0FAC6-3087-E44E-ABCB-8AF812C1310D.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/648ECD9C-8AAA-BB46-8683-C8987CCC73B9.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/8C8690F8-4FEE-1047-85F4-29E414B3D12C.root\u001b[0m\n", + "│ └── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/BAAA6E00-7AC3-9947-9262-D9833D3A8B19.root\u001b[0m\n", + "├── \u001b[32mT1_US_FNAL_Disk\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ │ \u001b[36mDv9_GT36-v1/2520000/152C304A-97AD-1649-BCB6-3EA0CCD0DD33.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ │ \u001b[36mDv9_GT36-v1/2520000/26FC8C40-EA29-804C-B17D-84FB1C6BC505.root\u001b[0m\n", + "│ 
├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ │ \u001b[36mDv9_GT36-v1/2520000/78AC6A39-C303-EB44-9264-71819CC70FCC.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ │ \u001b[36mDv9_GT36-v1/2520000/BCBF89A2-329C-744B-A38F-139EA8F94007.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ │ \u001b[36mDv9_GT36-v1/2520000/C4F476DA-3D00-334B-867C-7E12F94EE3AB.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ │ \u001b[36mDv9_GT36-v1/2520000/F34F4F00-3370-EF4D-AF44-39E474E6530F.root\u001b[0m\n", + "│ └── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ \u001b[36mDv9_GT36-v1/2520000/FE3D79A6-27D4-8948-A89B-2F966C5B29D4.root\u001b[0m\n", + "├── \u001b[32mT2_US_Caltech\u001b[0m\n", + "│ ├── \u001b[36mroot://xrootd-redir.ultralight.org:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9\u001b[0m\n", + "│ │ \u001b[36m_GT36-v1/2520000/1CEB718A-7DC1-C74A-A7BE-A3C8D9FA785A.root\u001b[0m\n", + "│ └── \u001b[36mroot://xrootd-redir.ultralight.org:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9\u001b[0m\n", + "│ \u001b[36m_GT36-v1/2520000/7B14228A-5331-DF4E-B677-7B8AA281D460.root\u001b[0m\n", + "├── \u001b[32mT2_TR_METU\u001b[0m\n", + "│ ├── \u001b[36mroot://eymir.grid.metu.edu.tr//dpm/grid.metu.edu.tr/home/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018\u001b[0m\n", + "│ │ \u001b[36m_MiniAODv2_NanoAODv9_GT36-v1/2520000/2747DEFE-A247-1F42-B0EF-E7B7F1D3FCD6.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eymir.grid.metu.edu.tr//dpm/grid.metu.edu.tr/home/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018\u001b[0m\n", + "│ │ \u001b[36m_MiniAODv2_NanoAODv9_GT36-v1/2520000/30A3A1AB-2F27-C84E-9437-6BB3881F6856.root\u001b[0m\n", + "│ └── \u001b[36mroot://eymir.grid.metu.edu.tr//dpm/grid.metu.edu.tr/home/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018\u001b[0m\n", + "│ \u001b[36m_MiniAODv2_NanoAODv9_GT36-v1/2520000/69ABD79C-C684-8244-9F0D-153C6B8C2D9C.root\u001b[0m\n", + "├── \u001b[32mT2_UK_London_IC\u001b[0m\n", + "│ ├── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ │ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/2D58C3FE-512A-1F48-9AEB-6F80379B8F4A.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ │ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/B78A9B75-3B32-CF4E-A144-375189CF48AE.root\u001b[0m\n", + "│ └── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/CBD43A1E-AE2F-0B4D-A642-29FB2E9EB33B.root\u001b[0m\n", + "├── \u001b[32mT1_IT_CNAF_Disk\u001b[0m\n", + "│ ├── \u001b[36mroot://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ \u001b[36m2520000/2DA9130E-8423-304C-9902-1E42CD72E658.root\u001b[0m\n", + "│ └── 
\u001b[36mroot://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ \u001b[36m2520000/CDD2CDF9-72D0-4045-B28F-89002077FB89.root\u001b[0m\n", + "├── \u001b[32mT3_KR_KISTI\u001b[0m\n", + "│ ├── \u001b[36mroot://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", + "│ │ \u001b[36m-v1/2520000/365F32F6-F971-1B4D-8E9D-C0ACD74FFB03.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", + "│ │ \u001b[36m-v1/2520000/42DC0F42-82E8-BE47-B04D-544B67274829.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", + "│ │ \u001b[36m-v1/2520000/A350E2E4-705C-2C4D-9B11-3436056EEBE7.root\u001b[0m\n", + "│ └── \u001b[36mroot://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", + "│ \u001b[36m-v1/2520000/AB8DD69D-A522-D44C-BB9C-209623F7D41A.root\u001b[0m\n", + "├── \u001b[32mT2_DE_RWTH\u001b[0m\n", + "│ ├── \u001b[36mroot://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\u001b[0m\n", + "│ │ \u001b[36m_NanoAODv9_GT36-v1/2520000/37312354-59AB-E44B-BC94-CF424D4B7DDB.root\u001b[0m\n", + "│ ├── \u001b[36mroot://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\u001b[0m\n", + "│ │ \u001b[36m_NanoAODv9_GT36-v1/2520000/459261DD-4441-6047-9FF2-1EDE468452C9.root\u001b[0m\n", + "│ └── \u001b[36mroot://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\u001b[0m\n", + "│ \u001b[36m_NanoAODv9_GT36-v1/2520000/59DA0585-BD57-CE49-A15E-CDBAC5473EDE.root\u001b[0m\n", + "├── \u001b[32mT2_US_Purdue\u001b[0m\n", + "│ ├── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ \u001b[36m2520000/3FE5B677-9AB3-0245-A1CF-4B320592F18F.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ \u001b[36m2520000/410C32AB-DEB5-404F-BC6B-92E8F560563F.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ \u001b[36m2520000/63047CC0-38C6-F74C-9A00-0DF9050F7CF1.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ \u001b[36m2520000/7CCCB2C3-F210-2C42-85DF-AA00293FACFB.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ \u001b[36m2520000/D7875684-9F26-084E-9B2B-5E9BB5D353E8.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ \u001b[36m2520000/FAF0C67B-A8B4-8A4F-83B1-E43675CE9630.root\u001b[0m\n", + "│ └── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ \u001b[36m2520000/FE5EEFA5-C07A-5C44-B66D-5B31BE02C7D3.root\u001b[0m\n", + "├── \u001b[32mT2_US_Florida\u001b[0m\n", + "│ ├── 
\u001b[36mroot://cmsio2.rc.ufl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + "│ │ \u001b[36m520000/51515E3C-C640-3A4C-A16C-DC267FD142BF.root\u001b[0m\n", + "│ └── \u001b[36mroot://cmsio2.rc.ufl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + "│ \u001b[36m520000/6EAA5EDB-0DB3-6E40-87DC-7AB582295D29.root\u001b[0m\n", + "├── \u001b[32mT2_TW_NCHC\u001b[0m\n", + "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/6809B5E3-6DE6-1541-AE4C-E1804C877EDE.root\u001b[0m\n", + "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/8369B0EA-E4CC-AC4D-BD3F-0679B3310E09.root\u001b[0m\n", + "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/AE014F55-84BE-E84E-B447-0B614070CD17.root\u001b[0m\n", + "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/B3487FE0-B172-AD47-A13A-388C0A9BF93F.root\u001b[0m\n", + "│ └── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ \u001b[36m1/2520000/BA02D468-A8CE-4F49-884F-F836BB481AD5.root\u001b[0m\n", + "├── \u001b[32mT2_BE_IIHE\u001b[0m\n", + "│ ├── \u001b[36mroot://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\u001b[0m\n", + "│ │ \u001b[36m0000/6DDF448B-4605-5C41-9711-1C73EC5F01D3.root\u001b[0m\n", + "│ ├── \u001b[36mroot://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\u001b[0m\n", + "│ │ \u001b[36m0000/7B181B92-AA2C-1E44-86FE-B074D359BBB3.root\u001b[0m\n", + "│ └── \u001b[36mroot://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\u001b[0m\n", + "│ \u001b[36m0000/F09135D8-FCBE-AF40-BCE8-03A529C5C87F.root\u001b[0m\n", + "├── \u001b[32mT2_US_Nebraska\u001b[0m\n", + "│ └── \u001b[36mroot://xrootd-local.unl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ \u001b[36m1/2520000/74A75B73-E5B8-C942-BBC9-1DDDD7F752FB.root\u001b[0m\n", + "├── \u001b[32mT2_ES_CIEMAT\u001b[0m\n", + "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ │ \u001b[36m/2520000/7DEA3718-B7BC-EE42-A8BE-11C62BB8536D.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ │ \u001b[36m/2520000/8223C4A3-D4BD-6A4B-A513-54B6668C7122.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ │ \u001b[36m/2520000/A59D511A-A419-714F-8EE1-8B8BAFEC04D5.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ │ \u001b[36m/2520000/A74EFE57-BAD2-C143-B8DC-817CE4F96FD7.root\u001b[0m\n", + "│ ├── 
\u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ │ \u001b[36m/2520000/B1B449CE-5952-8347-A9A7-35FE231D0C72.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ │ \u001b[36m/2520000/B9E9087C-255C-C24D-A733-FB9291DC7C3C.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ │ \u001b[36m/2520000/D40D1285-B075-D446-B1BF-86A463EF6993.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ │ \u001b[36m/2520000/DA47C0B6-BCAB-C54C-A6BF-B0A64E88E3D4.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ │ \u001b[36m/2520000/ECD4877E-707B-EA43-A38B-D1B700FBDE79.root\u001b[0m\n", + "│ └── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ \u001b[36m/2520000/ED95384D-9D3D-AE45-8425-C4C080E691C5.root\u001b[0m\n", + "├── \u001b[32mT1_FR_CCIN2P3_Disk\u001b[0m\n", + "│ └── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/F16A9138-7563-E540-B6AD-8A8A688B3830.root\u001b[0m\n", + "├── \u001b[32mT2_IT_Legnaro\u001b[0m\n", + "│ └── \u001b[36mroot://t2-xrdcms.lnl.infn.it:7070///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-\u001b[0m\n", + "│ \u001b[36mv1/2520000/F6E44EA5-F4C6-E746-AD43-7A263F1E316E.root\u001b[0m\n", + "└── \u001b[32mT2_CH_CERN\u001b[0m\n", + " └── \u001b[36mroot://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + " \u001b[36m520000/FCAF4145-8E3F-2142-BDCB-5E276523B592.root\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Selected datasets:\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[36mSelected datasets:\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                                                 Selected datasets                                                 \n",
+       "┏━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳┳┓\n",
+       "┃ Dataset                                                                                                   ┃\n",
+       "┡━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇╇┩\n",
+       "│ 1  /DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realisti… │││\n",
+       "│ 2  /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD                                           │││\n",
+       "└───┴───────────────────────────────────────────────────────────────────────────────────────────────────────────┴┴┘\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Selected datasets \u001b[0m\n", + "┏━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳┳┓\n", + "┃\u001b[1m \u001b[0m\u001b[1m…\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mDataset \u001b[0m\u001b[1m \u001b[0m┃┃┃\n", + "┡━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇╇┩\n", + "│\u001b[36m \u001b[0m\u001b[36m1\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realisti…\u001b[0m\u001b[35m \u001b[0m│││\n", + "│\u001b[36m \u001b[0m\u001b[36m2\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD \u001b[0m\u001b[35m \u001b[0m│││\n", + "└───┴───────────────────────────────────────────────────────────────────────────────────────────────────────────┴┴┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ddc.load_dataset_definition(dataset_definition)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "dd9ca4ea-039d-4ebb-bbf2-79092ba6e7d0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Selected datasets:\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[36mSelected datasets:\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                                                 Selected datasets                                                 \n",
+       "┏━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳┳┓\n",
+       "┃ Dataset                                                                                                   ┃\n",
+       "┡━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇╇┩\n",
+       "│ 1  /DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realisti… │││\n",
+       "│ 2  /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD                                           │││\n",
+       "└───┴───────────────────────────────────────────────────────────────────────────────────────────────────────────┴┴┘\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Selected datasets \u001b[0m\n", + "┏━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳┳┓\n", + "┃\u001b[1m \u001b[0m\u001b[1m…\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mDataset \u001b[0m\u001b[1m \u001b[0m┃┃┃\n", + "┡━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇╇┩\n", + "│\u001b[36m \u001b[0m\u001b[36m1\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realisti…\u001b[0m\u001b[35m \u001b[0m│││\n", + "│\u001b[36m \u001b[0m\u001b[36m2\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD \u001b[0m\u001b[35m \u001b[0m│││\n", + "└───┴───────────────────────────────────────────────────────────────────────────────────────────────────────────┴┴┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ddc.do_list_selected()" + ] + }, + { + "cell_type": "markdown", + "id": "a6ffbefb-8276-4733-aedb-cc12898f4ed8", + "metadata": {}, + "source": [ + "### Save the replicas metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b0e3e4b8-34d4-4558-988a-edacd1df9b37", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
File replicas_info.json saved!\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[32mFile replicas_info.json saved!\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ddc.do_save(\"replicas_info.json\")" + ] + }, + { + "cell_type": "markdown", + "id": "f7d52663-c5e3-4abe-9c2f-4bf8f08d8919", + "metadata": {}, + "source": [ + "## Preprocess the fileset with dask" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "125cd0ea-ff05-414a-9177-2be98eb88362", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\u001b[0;31mSignature:\u001b[0m\n", + "\u001b[0mddc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdo_preprocess\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0moutput_file\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mstep_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0malign_to_clusters\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdask_cluster\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mDocstring:\u001b[0m\n", + "Perform preprocessing for concrete fileset extraction.\n", + "Args: output_file [step_size] [align to file cluster boundaries] [dask cluster url]\n", + "\u001b[0;31mFile:\u001b[0m /work/dvalsecc/coffea/src/coffea/dataset_tools/dataset_query.py\n", + "\u001b[0;31mType:\u001b[0m method" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ddc.do_preprocess?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "04a2aeca-9c9f-4baf-b33b-b4f1b5ba4d4a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
  Preprocessing files to extract available chunks with dask\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[32m⠧\u001b[0m \u001b[31m Preprocessing files to extract available chunks with dask\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ddc.do_preprocess(output_file=\"fileset\", \n", + " step_size=10000,\n", + " align_to_clusters=False,\n", + " dask_cluster=None)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1206bce-b726-43cc-b217-d74fd5516147", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 80db62e53..2dd7ca171 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -18,7 +18,7 @@ from .preprocess import preprocess -def print_dataset_query(query, dataset_list, selected, console): +def print_dataset_query(query, dataset_list, console, selected=[]): table = Table(title=f"Query: [bold red]{query}") table.add_column("Name", justify="left", style="cyan", no_wrap=True) table.add_column("Tag", style="magenta", no_wrap=True) @@ -208,19 +208,19 @@ def do_query(self, query=None): scope="cms", # TODO configure scope ) # Now let's print the results as a tree - print_dataset_query(query, outtree, self.selected_datasets, self.console) + print_dataset_query(query, outtree, self.console, self.selected_datasets) self.last_query = query self.last_query_list = outlist self.last_query_tree = outtree - print("Use the command [bold red]select (S)[/] to selected the datasets") + print("Use the command [bold red]select[/] to selected the datasets") def do_query_results(self): if self.last_query_list: print_dataset_query( self.last_query, self.last_query_tree, - self.selected_datasets, self.console, + self.selected_datasets, ) else: print("First [bold red]query (Q)[/] for a dataset") @@ -402,10 +402,11 @@ def do_replicas(self, mode=None, selection=None): T.add(f"[cyan]{f}") self.console.print(tree) - def do_allowlist_sites(self): - sites = Prompt.ask( - "[yellow]Restrict the available sites to (comma-separated list)" - ).split(",") + def do_allowlist_sites(self, sites=None): + if sites is None: + sites = Prompt.ask( + "[yellow]Restrict the available sites to (comma-separated list)" + ).split(",") if self.sites_allowlist is None: self.sites_allowlist = sites else: @@ -414,10 +415,11 @@ def do_allowlist_sites(self): for s in self.sites_allowlist: print(f"- {s}") - def do_blocklist_sites(self): - sites = Prompt.ask("[yellow]Exclude the sites (comma-separated list)").split( - "," - ) + def do_blocklist_sites(self, sites=None): + if sites is None: + sites = Prompt.ask( + "[yellow]Exclude the sites (comma-separated list)" + ).split(",") if self.sites_blocklist is None: self.sites_blocklist = sites else: @@ -426,8 +428,9 @@ def do_blocklist_sites(self): for s in self.sites_blocklist: print(f"- {s}") - def do_regex_sites(self): - regex = Prompt.ask("[yellow]Regex to restrict the available sites") + def do_regex_sites(self, regex=None): + if regex is None: + regex = Prompt.ask("[yellow]Regex to restrict the available sites") if 
len(regex): self.sites_regex = rf"{regex}" print(f"New sites regex: [cyan]{self.sites_regex}") @@ -513,10 +516,6 @@ def do_preprocess( align_to_clusters = Confirm.ask( "[yellow bold]Align to clusters", default=True ) - if dask_cluster is None: - dask_cluster = Prompt.ask("[yellow bold]Dask cluster url", default="None") - if dask_cluster == "None": - dask_cluster = None replicas = {} for fileset, files in self.replica_results.items(): @@ -541,6 +540,47 @@ def do_preprocess( with gzip.open(f"{output_file}_all.json.gz", "wt") as file: print(f"Saved all fileset chunks to {output_file}_all.json.gz") json.dump(out_updated, file, indent=2) + return out_updated + + def load_dataset_definition( + self, + dataset_definition, + query_results_strategy="all", + replicas_strategy="round-robin", + ): + for dataset_query, dataset_meta in dataset_definition.items(): + print(f"\nProcessing query: {dataset_query}") + # Adding queries + self.do_query(dataset_query) + # Now selecting the results depending on the interactive mode or not. + # Metadata are passed to the selection function to associated them with the selected dataset. + if query_results_strategy not in ["all", "manual"]: + print( + "Invalid query-results-strategy option: please choose between: manual|all" + ) + exit(1) + elif query_results_strategy == "manual": + self.do_select(selection=None, metadata=dataset_meta) + else: + self.do_select(selection="all", metadata=dataset_meta) + + # Now list all + self.do_list_selected() + + # selecting replicas + self.do_sites_filters(ask_clear=False) + print("Getting replicas") + if replicas_strategy == "manual": + self.do_replicas(mode=None, selection="all") + else: + if replicas_strategy not in ["round-robin", "choose"]: + print( + "Invalid replicas-strategy: please choose between manual|round-robin|choose" + ) + exit(1) + self.do_replicas(mode=replicas_strategy, selection="all") + # Now list all + self.do_list_selected() if __name__ == "__main__": @@ -622,49 +662,17 @@ def do_preprocess( cli.sites_regex = args.regex_sites if args.dataset_definition: - # Load the dataset definition if present: with open(args.dataset_definition) as file: - dataset_definition = json.load(file) - - for dataset_query, dataset_meta in dataset_definition.items(): - print(f"\nProcessing query: {dataset_query}") - # Adding queries - cli.do_query(dataset_query) - # Now selecting the results depending on the interactive mode or not. - # Metadata are passed to the selection function to associated them with the selected dataset. 
- if args.query_results_strategy not in ["all", "manual"]: - print( - "Invalid query-results-strategy option: please choose between: manual|all" - ) - exit(1) - elif args.query_results_strategy == "manual": - cli.do_select(selection=None, metadata=dataset_meta) - else: - cli.do_select(selection="all", metadata=dataset_meta) - - # Now list all - cli.do_list_selected() - - # selecting replicas - cli.do_sites_filters(ask_clear=False) - print("Getting replicas") - if args.replicas_strategy == "manual": - cli.do_replicas(mode=None, selection="all") - else: - if args.replicas_strategy not in ["round-robin", "choose"]: - print( - "Invalid replicas-strategy: please choose between manual|round-robin|choose" - ) - exit(1) - cli.do_replicas(mode=args.replicas_strategy, selection="all") - - # Now list all - cli.do_list_selected() - + dd = json.load(file) + cli.load_dataset_definition( + dd, + query_results_strategy=args.query_results_strategy, + replicas_strategy=args.replicas_strategy, + ) # Save if args.output: cli.do_save(filename=args.output) - if args.preprocess: + if preprocess: cli.do_preprocess( output_file=args.fileset_output, step_size=args.step_size, diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index a2cf11fe9..3940df2e1 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -271,11 +271,32 @@ def get_dataset_files_replicas( return outfiles, outsites, sites_counts -def query_dataset(query, client=None, tree=False, scope="cms"): +def query_dataset( + query: str, client=None, tree: bool = False, datatype="container", scope="cms" +): + """ + This function uses the rucio client to query for containers or datasets. + + Parameters + --------- + query: str = query to filter datasets / containers with the rucio list_dids functions + client: rucio client + tree: bool = if True return the results splitting the dataset name in parts parts + datatype: "container/dataset": rucio terminology. "Container"==CMS dataset. "Dataset" == CMS block. + scope: "cms". Rucio instance + + Returns + ------- + list of containers/datasets + + if tree==True, returns the list of dataset and also a dictionary decomposing the datasets + names in the 1st commond part and a list of available 2nd parts. + + """ client = client if client else get_rucio_client() out = list( client.list_dids( - scope=scope, filters={"name": query, "type": "container"}, long=False + scope=scope, filters={"name": query, "type": datatype}, long=False ) ) if tree: From 95b994941bf3504fb68783d1bd10f66280b5016d Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Mon, 11 Dec 2023 16:26:30 -0600 Subject: [PATCH 74/80] typo --- src/coffea/dataset_tools/rucio_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py index 3940df2e1..b626f518b 100644 --- a/src/coffea/dataset_tools/rucio_utils.py +++ b/src/coffea/dataset_tools/rucio_utils.py @@ -290,7 +290,7 @@ def query_dataset( list of containers/datasets if tree==True, returns the list of dataset and also a dictionary decomposing the datasets - names in the 1st commond part and a list of available 2nd parts. + names in the 1st command part and a list of available 2nd parts. 
""" client = client if client else get_rucio_client() From a166a813989040efd54dbbdeeed9d4ca612ecf7e Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Mon, 11 Dec 2023 23:56:35 +0100 Subject: [PATCH 75/80] more docs in the notebook --- binder/dataset_discovery.ipynb | 1211 +++++++++++++++------ src/coffea/dataset_tools/dataset_query.py | 11 +- 2 files changed, 899 insertions(+), 323 deletions(-) diff --git a/binder/dataset_discovery.ipynb b/binder/dataset_discovery.ipynb index 9c29063fe..2b19cf1c0 100644 --- a/binder/dataset_discovery.ipynb +++ b/binder/dataset_discovery.ipynb @@ -5,7 +5,14 @@ "id": "c5754206-f41b-4e08-bc4d-496df85e8194", "metadata": {}, "source": [ - "# Dataset discovery tools" + "# Dataset discovery tools\n", + "\n", + "This notebook shows some features to make the dataset discovery for CMS analysis easier. \n", + "The rucio sytem is queried to look for dataset and access to the list of all available file replicas.\n", + "\n", + "Users can exploit these tools at 2 different levels:\n", + "- low level: use the `rucio_utils` module directly to just query rucio\n", + "- high level: use the `DataDiscoveryCLI` class to simplify dataset query, replicas filters and uproot preprocessing with dask" ] }, { @@ -753,50 +760,27 @@ ] }, { - "cell_type": "code", - "execution_count": 3, - "id": "716a6c0c-ea07-498a-a010-f9e7f87ba3a3", - "metadata": {}, - "outputs": [], - "source": [ - "ddc = DataDiscoveryCLI()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "51a8ef69-5d4e-4089-b3a8-d0290cc973c1", + "cell_type": "markdown", + "id": "ecb84b02-b85f-4037-a08d-cce001bc35c7", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Help on method load_dataset_definition in module coffea.dataset_tools.dataset_query:\n", - "\n", - "load_dataset_definition(dataset_definition, query_results_strategy='all', replicas_strategy='round-robin') method of coffea.dataset_tools.dataset_query.DataDiscoveryCLI instance\n", - "\n" - ] - } - ], "source": [ - "help(ddc.load_dataset_definition)" + "The dataset definition is passed to a `DataDiscoveryCLI` to automatically query rucio and get replicas" ] }, { "cell_type": "code", - "execution_count": 5, - "id": "db3ab214-93d3-49b1-b6e1-9374b9fcc1f0", + "execution_count": 11, + "id": "716a6c0c-ea07-498a-a010-f9e7f87ba3a3", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
 Querying rucio for replicas: /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\n",
+       "
 Querying rucio for replicas: /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\n",
        "
\n" ], "text/plain": [ - "\u001b[32m⠧\u001b[0m Querying rucio for replicas: \u001b[1;31m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\u001b[0m\n" + "\u001b[32m⠇\u001b[0m Querying rucio for replicas: \u001b[1;31m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\u001b[0m\n" ] }, "metadata": {}, @@ -898,316 +882,312 @@ "data": { "text/html": [ "
Replicas for /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\n",
-       "├── T2_US_Wisconsin\n",
-       "│   ├── root://cmsxrootd.hep.wisc.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
-       "│   │   -v1/2520000/0144EC47-BFA3-EA43-BF05-BD4248ED6031.root\n",
-       "│   └── root://cmsxrootd.hep.wisc.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
-       "│       -v1/2520000/39D52C69-2035-A24B-A413-40976993651D.root\n",
        "├── T2_DE_DESY\n",
        "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
-       "│   │   36-v1/2520000/0C9615C1-7EE6-CD44-8FC0-04F63B2C16FD.root\n",
+       "│   │   36-v1/2520000/0144EC47-BFA3-EA43-BF05-BD4248ED6031.root\n",
        "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
-       "│   │   36-v1/2520000/62789325-3C0B-FC4D-B578-B41A396399E4.root\n",
+       "│   │   36-v1/2520000/2747DEFE-A247-1F42-B0EF-E7B7F1D3FCD6.root\n",
        "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
-       "│   │   36-v1/2520000/81CEA7BA-9E66-BC4F-A96F-32642D59B653.root\n",
+       "│   │   36-v1/2520000/2DA9130E-8423-304C-9902-1E42CD72E658.root\n",
        "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
-       "│   │   36-v1/2520000/D8D41BBC-D514-D342-A514-CCF48575D184.root\n",
+       "│   │   36-v1/2520000/63047CC0-38C6-F74C-9A00-0DF9050F7CF1.root\n",
        "│   └── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
-       "│       36-v1/2520000/F1B3977A-E777-EC4D-8FC7-981FE4ED5E0C.root\n",
-       "├── T3_FR_IPNL\n",
-       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
-       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/12FAE9F1-7139-924C-A8DE-9699A00FC994.root\n",
-       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
-       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/1DD0FAC6-3087-E44E-ABCB-8AF812C1310D.root\n",
-       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
-       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/648ECD9C-8AAA-BB46-8683-C8987CCC73B9.root\n",
-       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
-       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/8C8690F8-4FEE-1047-85F4-29E414B3D12C.root\n",
-       "│   └── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
-       "│       Dv2_NanoAODv9_GT36-v1/2520000/BAAA6E00-7AC3-9947-9262-D9833D3A8B19.root\n",
-       "├── T1_US_FNAL_Disk\n",
-       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
-       "│   │   Dv9_GT36-v1/2520000/152C304A-97AD-1649-BCB6-3EA0CCD0DD33.root\n",
-       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
-       "│   │   Dv9_GT36-v1/2520000/26FC8C40-EA29-804C-B17D-84FB1C6BC505.root\n",
-       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
-       "│   │   Dv9_GT36-v1/2520000/78AC6A39-C303-EB44-9264-71819CC70FCC.root\n",
-       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
-       "│   │   Dv9_GT36-v1/2520000/BCBF89A2-329C-744B-A38F-139EA8F94007.root\n",
-       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
-       "│   │   Dv9_GT36-v1/2520000/C4F476DA-3D00-334B-867C-7E12F94EE3AB.root\n",
-       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
-       "│   │   Dv9_GT36-v1/2520000/F34F4F00-3370-EF4D-AF44-39E474E6530F.root\n",
-       "│   └── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
-       "│       Dv9_GT36-v1/2520000/FE3D79A6-27D4-8948-A89B-2F966C5B29D4.root\n",
-       "├── T2_US_Caltech\n",
-       "│   ├── root://xrootd-redir.ultralight.org:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9\n",
-       "│   │   _GT36-v1/2520000/1CEB718A-7DC1-C74A-A7BE-A3C8D9FA785A.root\n",
-       "│   └── root://xrootd-redir.ultralight.org:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9\n",
-       "│       _GT36-v1/2520000/7B14228A-5331-DF4E-B677-7B8AA281D460.root\n",
-       "├── T2_TR_METU\n",
-       "│   ├── root://eymir.grid.metu.edu.tr//dpm/grid.metu.edu.tr/home/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018\n",
-       "│   │   _MiniAODv2_NanoAODv9_GT36-v1/2520000/2747DEFE-A247-1F42-B0EF-E7B7F1D3FCD6.root\n",
-       "│   ├── root://eymir.grid.metu.edu.tr//dpm/grid.metu.edu.tr/home/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018\n",
-       "│   │   _MiniAODv2_NanoAODv9_GT36-v1/2520000/30A3A1AB-2F27-C84E-9437-6BB3881F6856.root\n",
-       "│   └── root://eymir.grid.metu.edu.tr//dpm/grid.metu.edu.tr/home/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018\n",
-       "│       _MiniAODv2_NanoAODv9_GT36-v1/2520000/69ABD79C-C684-8244-9F0D-153C6B8C2D9C.root\n",
-       "├── T2_UK_London_IC\n",
-       "│   ├── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
-       "│   │   OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/2D58C3FE-512A-1F48-9AEB-6F80379B8F4A.root\n",
-       "│   ├── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
-       "│   │   OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/B78A9B75-3B32-CF4E-A144-375189CF48AE.root\n",
-       "│   └── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
-       "│       OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/CBD43A1E-AE2F-0B4D-A642-29FB2E9EB33B.root\n",
-       "├── T1_IT_CNAF_Disk\n",
-       "│   ├── root://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
-       "│   │   2520000/2DA9130E-8423-304C-9902-1E42CD72E658.root\n",
-       "│   └── root://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
-       "│       2520000/CDD2CDF9-72D0-4045-B28F-89002077FB89.root\n",
+       "│       36-v1/2520000/8369B0EA-E4CC-AC4D-BD3F-0679B3310E09.root\n",
        "├── T3_KR_KISTI\n",
        "│   ├── root://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
-       "│   │   -v1/2520000/365F32F6-F971-1B4D-8E9D-C0ACD74FFB03.root\n",
+       "│   │   -v1/2520000/0C9615C1-7EE6-CD44-8FC0-04F63B2C16FD.root\n",
        "│   ├── root://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
-       "│   │   -v1/2520000/42DC0F42-82E8-BE47-B04D-544B67274829.root\n",
+       "│   │   -v1/2520000/152C304A-97AD-1649-BCB6-3EA0CCD0DD33.root\n",
        "│   ├── root://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
-       "│   │   -v1/2520000/A350E2E4-705C-2C4D-9B11-3436056EEBE7.root\n",
+       "│   │   -v1/2520000/1CEB718A-7DC1-C74A-A7BE-A3C8D9FA785A.root\n",
+       "│   ├── root://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
+       "│   │   -v1/2520000/51515E3C-C640-3A4C-A16C-DC267FD142BF.root\n",
+       "│   ├── root://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
+       "│   │   -v1/2520000/7DEA3718-B7BC-EE42-A8BE-11C62BB8536D.root\n",
+       "│   ├── root://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
+       "│   │   -v1/2520000/81CEA7BA-9E66-BC4F-A96F-32642D59B653.root\n",
        "│   └── root://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
-       "│       -v1/2520000/AB8DD69D-A522-D44C-BB9C-209623F7D41A.root\n",
-       "├── T2_DE_RWTH\n",
-       "│   ├── root://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\n",
-       "│   │   _NanoAODv9_GT36-v1/2520000/37312354-59AB-E44B-BC94-CF424D4B7DDB.root\n",
-       "│   ├── root://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\n",
-       "│   │   _NanoAODv9_GT36-v1/2520000/459261DD-4441-6047-9FF2-1EDE468452C9.root\n",
-       "│   └── root://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\n",
-       "│       _NanoAODv9_GT36-v1/2520000/59DA0585-BD57-CE49-A15E-CDBAC5473EDE.root\n",
+       "│       -v1/2520000/C4F476DA-3D00-334B-867C-7E12F94EE3AB.root\n",
+       "├── T2_ES_CIEMAT\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/12FAE9F1-7139-924C-A8DE-9699A00FC994.root\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/1DD0FAC6-3087-E44E-ABCB-8AF812C1310D.root\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/3FE5B677-9AB3-0245-A1CF-4B320592F18F.root\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/74A75B73-E5B8-C942-BBC9-1DDDD7F752FB.root\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/8C8690F8-4FEE-1047-85F4-29E414B3D12C.root\n",
+       "│   └── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│       /2520000/DA47C0B6-BCAB-C54C-A6BF-B0A64E88E3D4.root\n",
+       "├── T1_FR_CCIN2P3_Disk\n",
+       "│   ├── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│   │   18_MiniAODv2_NanoAODv9_GT36-v1/2520000/26FC8C40-EA29-804C-B17D-84FB1C6BC505.root\n",
+       "│   ├── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│   │   18_MiniAODv2_NanoAODv9_GT36-v1/2520000/2D58C3FE-512A-1F48-9AEB-6F80379B8F4A.root\n",
+       "│   ├── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│   │   18_MiniAODv2_NanoAODv9_GT36-v1/2520000/30A3A1AB-2F27-C84E-9437-6BB3881F6856.root\n",
+       "│   └── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│       18_MiniAODv2_NanoAODv9_GT36-v1/2520000/A350E2E4-705C-2C4D-9B11-3436056EEBE7.root\n",
+       "├── T2_BE_IIHE\n",
+       "│   ├── root://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\n",
+       "│   │   0000/365F32F6-F971-1B4D-8E9D-C0ACD74FFB03.root\n",
+       "│   ├── root://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\n",
+       "│   │   0000/410C32AB-DEB5-404F-BC6B-92E8F560563F.root\n",
+       "│   ├── root://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\n",
+       "│   │   0000/6809B5E3-6DE6-1541-AE4C-E1804C877EDE.root\n",
+       "│   ├── root://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\n",
+       "│   │   0000/78AC6A39-C303-EB44-9264-71819CC70FCC.root\n",
+       "│   └── root://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\n",
+       "│       0000/7CCCB2C3-F210-2C42-85DF-AA00293FACFB.root\n",
        "├── T2_US_Purdue\n",
        "│   ├── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
-       "│   │   2520000/3FE5B677-9AB3-0245-A1CF-4B320592F18F.root\n",
-       "│   ├── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
-       "│   │   2520000/410C32AB-DEB5-404F-BC6B-92E8F560563F.root\n",
-       "│   ├── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
-       "│   │   2520000/63047CC0-38C6-F74C-9A00-0DF9050F7CF1.root\n",
+       "│   │   2520000/37312354-59AB-E44B-BC94-CF424D4B7DDB.root\n",
        "│   ├── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
-       "│   │   2520000/7CCCB2C3-F210-2C42-85DF-AA00293FACFB.root\n",
+       "│   │   2520000/42DC0F42-82E8-BE47-B04D-544B67274829.root\n",
        "│   ├── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
        "│   │   2520000/D7875684-9F26-084E-9B2B-5E9BB5D353E8.root\n",
        "│   ├── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
        "│   │   2520000/FAF0C67B-A8B4-8A4F-83B1-E43675CE9630.root\n",
        "│   └── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
        "│       2520000/FE5EEFA5-C07A-5C44-B66D-5B31BE02C7D3.root\n",
-       "├── T2_US_Florida\n",
-       "│   ├── root://cmsio2.rc.ufl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
-       "│   │   520000/51515E3C-C640-3A4C-A16C-DC267FD142BF.root\n",
-       "│   └── root://cmsio2.rc.ufl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
-       "│       520000/6EAA5EDB-0DB3-6E40-87DC-7AB582295D29.root\n",
+       "├── T2_US_Wisconsin\n",
+       "│   ├── root://cmsxrootd.hep.wisc.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
+       "│   │   -v1/2520000/39D52C69-2035-A24B-A413-40976993651D.root\n",
+       "│   └── root://cmsxrootd.hep.wisc.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
+       "│       -v1/2520000/FCAF4145-8E3F-2142-BDCB-5E276523B592.root\n",
        "├── T2_TW_NCHC\n",
        "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
-       "│   │   1/2520000/6809B5E3-6DE6-1541-AE4C-E1804C877EDE.root\n",
+       "│   │   1/2520000/459261DD-4441-6047-9FF2-1EDE468452C9.root\n",
+       "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/6DDF448B-4605-5C41-9711-1C73EC5F01D3.root\n",
        "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
-       "│   │   1/2520000/8369B0EA-E4CC-AC4D-BD3F-0679B3310E09.root\n",
+       "│   │   1/2520000/7B14228A-5331-DF4E-B677-7B8AA281D460.root\n",
+       "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/7B181B92-AA2C-1E44-86FE-B074D359BBB3.root\n",
+       "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/8223C4A3-D4BD-6A4B-A513-54B6668C7122.root\n",
+       "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/A74EFE57-BAD2-C143-B8DC-817CE4F96FD7.root\n",
        "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
        "│   │   1/2520000/AE014F55-84BE-E84E-B447-0B614070CD17.root\n",
        "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
-       "│   │   1/2520000/B3487FE0-B172-AD47-A13A-388C0A9BF93F.root\n",
+       "│   │   1/2520000/BCBF89A2-329C-744B-A38F-139EA8F94007.root\n",
+       "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/D8D41BBC-D514-D342-A514-CCF48575D184.root\n",
        "│   └── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
-       "│       1/2520000/BA02D468-A8CE-4F49-884F-F836BB481AD5.root\n",
-       "├── T2_BE_IIHE\n",
-       "│   ├── root://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\n",
-       "│   │   0000/6DDF448B-4605-5C41-9711-1C73EC5F01D3.root\n",
-       "│   ├── root://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\n",
-       "│   │   0000/7B181B92-AA2C-1E44-86FE-B074D359BBB3.root\n",
-       "│   └── root://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\n",
-       "│       0000/F09135D8-FCBE-AF40-BCE8-03A529C5C87F.root\n",
+       "│       1/2520000/F1B3977A-E777-EC4D-8FC7-981FE4ED5E0C.root\n",
+       "├── T2_UK_London_IC\n",
+       "│   ├── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│   │   OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/59DA0585-BD57-CE49-A15E-CDBAC5473EDE.root\n",
+       "│   ├── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│   │   OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/F16A9138-7563-E540-B6AD-8A8A688B3830.root\n",
+       "│   └── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│       OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/FE3D79A6-27D4-8948-A89B-2F966C5B29D4.root\n",
+       "├── T1_US_FNAL_Disk\n",
+       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│   │   Dv9_GT36-v1/2520000/62789325-3C0B-FC4D-B578-B41A396399E4.root\n",
+       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│   │   Dv9_GT36-v1/2520000/6EAA5EDB-0DB3-6E40-87DC-7AB582295D29.root\n",
+       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│   │   Dv9_GT36-v1/2520000/A59D511A-A419-714F-8EE1-8B8BAFEC04D5.root\n",
+       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│   │   Dv9_GT36-v1/2520000/B78A9B75-3B32-CF4E-A144-375189CF48AE.root\n",
+       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│   │   Dv9_GT36-v1/2520000/B9E9087C-255C-C24D-A733-FB9291DC7C3C.root\n",
+       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│   │   Dv9_GT36-v1/2520000/CDD2CDF9-72D0-4045-B28F-89002077FB89.root\n",
+       "│   └── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│       Dv9_GT36-v1/2520000/ED95384D-9D3D-AE45-8425-C4C080E691C5.root\n",
+       "├── T1_IT_CNAF_Disk\n",
+       "│   └── root://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│       2520000/648ECD9C-8AAA-BB46-8683-C8987CCC73B9.root\n",
        "├── T2_US_Nebraska\n",
+       "│   ├── root://xrootd-local.unl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/69ABD79C-C684-8244-9F0D-153C6B8C2D9C.root\n",
+       "│   ├── root://xrootd-local.unl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/AB8DD69D-A522-D44C-BB9C-209623F7D41A.root\n",
        "│   └── root://xrootd-local.unl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
-       "│       1/2520000/74A75B73-E5B8-C942-BBC9-1DDDD7F752FB.root\n",
-       "├── T2_ES_CIEMAT\n",
-       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
-       "│   │   /2520000/7DEA3718-B7BC-EE42-A8BE-11C62BB8536D.root\n",
-       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
-       "│   │   /2520000/8223C4A3-D4BD-6A4B-A513-54B6668C7122.root\n",
-       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
-       "│   │   /2520000/A59D511A-A419-714F-8EE1-8B8BAFEC04D5.root\n",
-       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
-       "│   │   /2520000/A74EFE57-BAD2-C143-B8DC-817CE4F96FD7.root\n",
-       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
-       "│   │   /2520000/B1B449CE-5952-8347-A9A7-35FE231D0C72.root\n",
-       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
-       "│   │   /2520000/B9E9087C-255C-C24D-A733-FB9291DC7C3C.root\n",
-       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
-       "│   │   /2520000/D40D1285-B075-D446-B1BF-86A463EF6993.root\n",
-       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
-       "│   │   /2520000/DA47C0B6-BCAB-C54C-A6BF-B0A64E88E3D4.root\n",
-       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
-       "│   │   /2520000/ECD4877E-707B-EA43-A38B-D1B700FBDE79.root\n",
-       "│   └── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
-       "│       /2520000/ED95384D-9D3D-AE45-8425-C4C080E691C5.root\n",
-       "├── T1_FR_CCIN2P3_Disk\n",
-       "│   └── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
-       "│       18_MiniAODv2_NanoAODv9_GT36-v1/2520000/F16A9138-7563-E540-B6AD-8A8A688B3830.root\n",
+       "│       1/2520000/B3487FE0-B172-AD47-A13A-388C0A9BF93F.root\n",
        "├── T2_IT_Legnaro\n",
        "│   └── root://t2-xrdcms.lnl.infn.it:7070///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-\n",
-       "│       v1/2520000/F6E44EA5-F4C6-E746-AD43-7A263F1E316E.root\n",
-       "└── T2_CH_CERN\n",
-       "    └── root://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
-       "        520000/FCAF4145-8E3F-2142-BDCB-5E276523B592.root\n",
+       "│       v1/2520000/B1B449CE-5952-8347-A9A7-35FE231D0C72.root\n",
+       "├── T3_FR_IPNL\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/BA02D468-A8CE-4F49-884F-F836BB481AD5.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/BAAA6E00-7AC3-9947-9262-D9833D3A8B19.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/CBD43A1E-AE2F-0B4D-A642-29FB2E9EB33B.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/ECD4877E-707B-EA43-A38B-D1B700FBDE79.root\n",
+       "│   └── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│       Dv2_NanoAODv9_GT36-v1/2520000/F09135D8-FCBE-AF40-BCE8-03A529C5C87F.root\n",
+       "├── T2_DE_RWTH\n",
+       "│   └── root://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\n",
+       "│       _NanoAODv9_GT36-v1/2520000/D40D1285-B075-D446-B1BF-86A463EF6993.root\n",
+       "├── T2_TR_METU\n",
+       "│   └── root://eymir.grid.metu.edu.tr//dpm/grid.metu.edu.tr/home/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018\n",
+       "│       _MiniAODv2_NanoAODv9_GT36-v1/2520000/F34F4F00-3370-EF4D-AF44-39E474E6530F.root\n",
+       "└── T2_US_Florida\n",
+       "    └── root://cmsio2.rc.ufl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "        520000/F6E44EA5-F4C6-E746-AD43-7A263F1E316E.root\n",
        "
\n" ], "text/plain": [ "Replicas for \u001b[32m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\u001b[0m\n", - "├── \u001b[32mT2_US_Wisconsin\u001b[0m\n", - "│ ├── \u001b[36mroot://cmsxrootd.hep.wisc.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", - "│ │ \u001b[36m-v1/2520000/0144EC47-BFA3-EA43-BF05-BD4248ED6031.root\u001b[0m\n", - "│ └── \u001b[36mroot://cmsxrootd.hep.wisc.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", - "│ \u001b[36m-v1/2520000/39D52C69-2035-A24B-A413-40976993651D.root\u001b[0m\n", "├── \u001b[32mT2_DE_DESY\u001b[0m\n", "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", - "│ │ \u001b[36m36-v1/2520000/0C9615C1-7EE6-CD44-8FC0-04F63B2C16FD.root\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/0144EC47-BFA3-EA43-BF05-BD4248ED6031.root\u001b[0m\n", "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", - "│ │ \u001b[36m36-v1/2520000/62789325-3C0B-FC4D-B578-B41A396399E4.root\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/2747DEFE-A247-1F42-B0EF-E7B7F1D3FCD6.root\u001b[0m\n", "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", - "│ │ \u001b[36m36-v1/2520000/81CEA7BA-9E66-BC4F-A96F-32642D59B653.root\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/2DA9130E-8423-304C-9902-1E42CD72E658.root\u001b[0m\n", "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", - "│ │ \u001b[36m36-v1/2520000/D8D41BBC-D514-D342-A514-CCF48575D184.root\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/63047CC0-38C6-F74C-9A00-0DF9050F7CF1.root\u001b[0m\n", "│ └── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", - "│ \u001b[36m36-v1/2520000/F1B3977A-E777-EC4D-8FC7-981FE4ED5E0C.root\u001b[0m\n", - "├── \u001b[32mT3_FR_IPNL\u001b[0m\n", - "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", - "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/12FAE9F1-7139-924C-A8DE-9699A00FC994.root\u001b[0m\n", - "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", - "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/1DD0FAC6-3087-E44E-ABCB-8AF812C1310D.root\u001b[0m\n", - "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", - "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/648ECD9C-8AAA-BB46-8683-C8987CCC73B9.root\u001b[0m\n", - "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", - "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/8C8690F8-4FEE-1047-85F4-29E414B3D12C.root\u001b[0m\n", - "│ └── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", - "│ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/BAAA6E00-7AC3-9947-9262-D9833D3A8B19.root\u001b[0m\n", - "├── \u001b[32mT1_US_FNAL_Disk\u001b[0m\n", - "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", - 
"│ │ \u001b[36mDv9_GT36-v1/2520000/152C304A-97AD-1649-BCB6-3EA0CCD0DD33.root\u001b[0m\n", - "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", - "│ │ \u001b[36mDv9_GT36-v1/2520000/26FC8C40-EA29-804C-B17D-84FB1C6BC505.root\u001b[0m\n", - "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", - "│ │ \u001b[36mDv9_GT36-v1/2520000/78AC6A39-C303-EB44-9264-71819CC70FCC.root\u001b[0m\n", - "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", - "│ │ \u001b[36mDv9_GT36-v1/2520000/BCBF89A2-329C-744B-A38F-139EA8F94007.root\u001b[0m\n", - "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", - "│ │ \u001b[36mDv9_GT36-v1/2520000/C4F476DA-3D00-334B-867C-7E12F94EE3AB.root\u001b[0m\n", - "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", - "│ │ \u001b[36mDv9_GT36-v1/2520000/F34F4F00-3370-EF4D-AF44-39E474E6530F.root\u001b[0m\n", - "│ └── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", - "│ \u001b[36mDv9_GT36-v1/2520000/FE3D79A6-27D4-8948-A89B-2F966C5B29D4.root\u001b[0m\n", - "├── \u001b[32mT2_US_Caltech\u001b[0m\n", - "│ ├── \u001b[36mroot://xrootd-redir.ultralight.org:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9\u001b[0m\n", - "│ │ \u001b[36m_GT36-v1/2520000/1CEB718A-7DC1-C74A-A7BE-A3C8D9FA785A.root\u001b[0m\n", - "│ └── \u001b[36mroot://xrootd-redir.ultralight.org:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9\u001b[0m\n", - "│ \u001b[36m_GT36-v1/2520000/7B14228A-5331-DF4E-B677-7B8AA281D460.root\u001b[0m\n", - "├── \u001b[32mT2_TR_METU\u001b[0m\n", - "│ ├── \u001b[36mroot://eymir.grid.metu.edu.tr//dpm/grid.metu.edu.tr/home/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018\u001b[0m\n", - "│ │ \u001b[36m_MiniAODv2_NanoAODv9_GT36-v1/2520000/2747DEFE-A247-1F42-B0EF-E7B7F1D3FCD6.root\u001b[0m\n", - "│ ├── \u001b[36mroot://eymir.grid.metu.edu.tr//dpm/grid.metu.edu.tr/home/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018\u001b[0m\n", - "│ │ \u001b[36m_MiniAODv2_NanoAODv9_GT36-v1/2520000/30A3A1AB-2F27-C84E-9437-6BB3881F6856.root\u001b[0m\n", - "│ └── \u001b[36mroot://eymir.grid.metu.edu.tr//dpm/grid.metu.edu.tr/home/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018\u001b[0m\n", - "│ \u001b[36m_MiniAODv2_NanoAODv9_GT36-v1/2520000/69ABD79C-C684-8244-9F0D-153C6B8C2D9C.root\u001b[0m\n", - "├── \u001b[32mT2_UK_London_IC\u001b[0m\n", - "│ ├── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", - "│ │ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/2D58C3FE-512A-1F48-9AEB-6F80379B8F4A.root\u001b[0m\n", - "│ ├── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", - "│ │ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/B78A9B75-3B32-CF4E-A144-375189CF48AE.root\u001b[0m\n", - "│ └── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", - "│ 
\u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/CBD43A1E-AE2F-0B4D-A642-29FB2E9EB33B.root\u001b[0m\n", - "├── \u001b[32mT1_IT_CNAF_Disk\u001b[0m\n", - "│ ├── \u001b[36mroot://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", - "│ │ \u001b[36m2520000/2DA9130E-8423-304C-9902-1E42CD72E658.root\u001b[0m\n", - "│ └── \u001b[36mroot://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", - "│ \u001b[36m2520000/CDD2CDF9-72D0-4045-B28F-89002077FB89.root\u001b[0m\n", + "│ \u001b[36m36-v1/2520000/8369B0EA-E4CC-AC4D-BD3F-0679B3310E09.root\u001b[0m\n", "├── \u001b[32mT3_KR_KISTI\u001b[0m\n", "│ ├── \u001b[36mroot://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", - "│ │ \u001b[36m-v1/2520000/365F32F6-F971-1B4D-8E9D-C0ACD74FFB03.root\u001b[0m\n", + "│ │ \u001b[36m-v1/2520000/0C9615C1-7EE6-CD44-8FC0-04F63B2C16FD.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", + "│ │ \u001b[36m-v1/2520000/152C304A-97AD-1649-BCB6-3EA0CCD0DD33.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", + "│ │ \u001b[36m-v1/2520000/1CEB718A-7DC1-C74A-A7BE-A3C8D9FA785A.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", + "│ │ \u001b[36m-v1/2520000/51515E3C-C640-3A4C-A16C-DC267FD142BF.root\u001b[0m\n", "│ ├── \u001b[36mroot://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", - "│ │ \u001b[36m-v1/2520000/42DC0F42-82E8-BE47-B04D-544B67274829.root\u001b[0m\n", + "│ │ \u001b[36m-v1/2520000/7DEA3718-B7BC-EE42-A8BE-11C62BB8536D.root\u001b[0m\n", "│ ├── \u001b[36mroot://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", - "│ │ \u001b[36m-v1/2520000/A350E2E4-705C-2C4D-9B11-3436056EEBE7.root\u001b[0m\n", + "│ │ \u001b[36m-v1/2520000/81CEA7BA-9E66-BC4F-A96F-32642D59B653.root\u001b[0m\n", "│ └── \u001b[36mroot://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", - "│ \u001b[36m-v1/2520000/AB8DD69D-A522-D44C-BB9C-209623F7D41A.root\u001b[0m\n", - "├── \u001b[32mT2_DE_RWTH\u001b[0m\n", - "│ ├── \u001b[36mroot://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\u001b[0m\n", - "│ │ \u001b[36m_NanoAODv9_GT36-v1/2520000/37312354-59AB-E44B-BC94-CF424D4B7DDB.root\u001b[0m\n", - "│ ├── \u001b[36mroot://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\u001b[0m\n", - "│ │ \u001b[36m_NanoAODv9_GT36-v1/2520000/459261DD-4441-6047-9FF2-1EDE468452C9.root\u001b[0m\n", - "│ └── \u001b[36mroot://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\u001b[0m\n", - "│ \u001b[36m_NanoAODv9_GT36-v1/2520000/59DA0585-BD57-CE49-A15E-CDBAC5473EDE.root\u001b[0m\n", + "│ \u001b[36m-v1/2520000/C4F476DA-3D00-334B-867C-7E12F94EE3AB.root\u001b[0m\n", + "├── \u001b[32mT2_ES_CIEMAT\u001b[0m\n", + "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ 
│ \u001b[36m/2520000/12FAE9F1-7139-924C-A8DE-9699A00FC994.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ │ \u001b[36m/2520000/1DD0FAC6-3087-E44E-ABCB-8AF812C1310D.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ │ \u001b[36m/2520000/3FE5B677-9AB3-0245-A1CF-4B320592F18F.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ │ \u001b[36m/2520000/74A75B73-E5B8-C942-BBC9-1DDDD7F752FB.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ │ \u001b[36m/2520000/8C8690F8-4FEE-1047-85F4-29E414B3D12C.root\u001b[0m\n", + "│ └── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ \u001b[36m/2520000/DA47C0B6-BCAB-C54C-A6BF-B0A64E88E3D4.root\u001b[0m\n", + "├── \u001b[32mT1_FR_CCIN2P3_Disk\u001b[0m\n", + "│ ├── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ │ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/26FC8C40-EA29-804C-B17D-84FB1C6BC505.root\u001b[0m\n", + "│ ├── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ │ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/2D58C3FE-512A-1F48-9AEB-6F80379B8F4A.root\u001b[0m\n", + "│ ├── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ │ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/30A3A1AB-2F27-C84E-9437-6BB3881F6856.root\u001b[0m\n", + "│ └── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/A350E2E4-705C-2C4D-9B11-3436056EEBE7.root\u001b[0m\n", + "├── \u001b[32mT2_BE_IIHE\u001b[0m\n", + "│ ├── \u001b[36mroot://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\u001b[0m\n", + "│ │ \u001b[36m0000/365F32F6-F971-1B4D-8E9D-C0ACD74FFB03.root\u001b[0m\n", + "│ ├── \u001b[36mroot://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\u001b[0m\n", + "│ │ \u001b[36m0000/410C32AB-DEB5-404F-BC6B-92E8F560563F.root\u001b[0m\n", + "│ ├── \u001b[36mroot://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\u001b[0m\n", + "│ │ \u001b[36m0000/6809B5E3-6DE6-1541-AE4C-E1804C877EDE.root\u001b[0m\n", + "│ ├── \u001b[36mroot://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\u001b[0m\n", + "│ │ \u001b[36m0000/78AC6A39-C303-EB44-9264-71819CC70FCC.root\u001b[0m\n", + "│ └── \u001b[36mroot://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\u001b[0m\n", + "│ \u001b[36m0000/7CCCB2C3-F210-2C42-85DF-AA00293FACFB.root\u001b[0m\n", "├── \u001b[32mT2_US_Purdue\u001b[0m\n", "│ ├── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", - "│ │ 
\u001b[36m2520000/3FE5B677-9AB3-0245-A1CF-4B320592F18F.root\u001b[0m\n", - "│ ├── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", - "│ │ \u001b[36m2520000/410C32AB-DEB5-404F-BC6B-92E8F560563F.root\u001b[0m\n", + "│ │ \u001b[36m2520000/37312354-59AB-E44B-BC94-CF424D4B7DDB.root\u001b[0m\n", "│ ├── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", - "│ │ \u001b[36m2520000/63047CC0-38C6-F74C-9A00-0DF9050F7CF1.root\u001b[0m\n", - "│ ├── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", - "│ │ \u001b[36m2520000/7CCCB2C3-F210-2C42-85DF-AA00293FACFB.root\u001b[0m\n", + "│ │ \u001b[36m2520000/42DC0F42-82E8-BE47-B04D-544B67274829.root\u001b[0m\n", "│ ├── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", "│ │ \u001b[36m2520000/D7875684-9F26-084E-9B2B-5E9BB5D353E8.root\u001b[0m\n", "│ ├── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", "│ │ \u001b[36m2520000/FAF0C67B-A8B4-8A4F-83B1-E43675CE9630.root\u001b[0m\n", "│ └── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", "│ \u001b[36m2520000/FE5EEFA5-C07A-5C44-B66D-5B31BE02C7D3.root\u001b[0m\n", - "├── \u001b[32mT2_US_Florida\u001b[0m\n", - "│ ├── \u001b[36mroot://cmsio2.rc.ufl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", - "│ │ \u001b[36m520000/51515E3C-C640-3A4C-A16C-DC267FD142BF.root\u001b[0m\n", - "│ └── \u001b[36mroot://cmsio2.rc.ufl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", - "│ \u001b[36m520000/6EAA5EDB-0DB3-6E40-87DC-7AB582295D29.root\u001b[0m\n", + "├── \u001b[32mT2_US_Wisconsin\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsxrootd.hep.wisc.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", + "│ │ \u001b[36m-v1/2520000/39D52C69-2035-A24B-A413-40976993651D.root\u001b[0m\n", + "│ └── \u001b[36mroot://cmsxrootd.hep.wisc.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", + "│ \u001b[36m-v1/2520000/FCAF4145-8E3F-2142-BDCB-5E276523B592.root\u001b[0m\n", "├── \u001b[32mT2_TW_NCHC\u001b[0m\n", "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", - "│ │ \u001b[36m1/2520000/6809B5E3-6DE6-1541-AE4C-E1804C877EDE.root\u001b[0m\n", + "│ │ \u001b[36m1/2520000/459261DD-4441-6047-9FF2-1EDE468452C9.root\u001b[0m\n", + "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/6DDF448B-4605-5C41-9711-1C73EC5F01D3.root\u001b[0m\n", + "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/7B14228A-5331-DF4E-B677-7B8AA281D460.root\u001b[0m\n", + "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/7B181B92-AA2C-1E44-86FE-B074D359BBB3.root\u001b[0m\n", + "│ ├── 
\u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/8223C4A3-D4BD-6A4B-A513-54B6668C7122.root\u001b[0m\n", "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", - "│ │ \u001b[36m1/2520000/8369B0EA-E4CC-AC4D-BD3F-0679B3310E09.root\u001b[0m\n", + "│ │ \u001b[36m1/2520000/A74EFE57-BAD2-C143-B8DC-817CE4F96FD7.root\u001b[0m\n", "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", "│ │ \u001b[36m1/2520000/AE014F55-84BE-E84E-B447-0B614070CD17.root\u001b[0m\n", "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", - "│ │ \u001b[36m1/2520000/B3487FE0-B172-AD47-A13A-388C0A9BF93F.root\u001b[0m\n", + "│ │ \u001b[36m1/2520000/BCBF89A2-329C-744B-A38F-139EA8F94007.root\u001b[0m\n", + "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/D8D41BBC-D514-D342-A514-CCF48575D184.root\u001b[0m\n", "│ └── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", - "│ \u001b[36m1/2520000/BA02D468-A8CE-4F49-884F-F836BB481AD5.root\u001b[0m\n", - "├── \u001b[32mT2_BE_IIHE\u001b[0m\n", - "│ ├── \u001b[36mroot://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\u001b[0m\n", - "│ │ \u001b[36m0000/6DDF448B-4605-5C41-9711-1C73EC5F01D3.root\u001b[0m\n", - "│ ├── \u001b[36mroot://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\u001b[0m\n", - "│ │ \u001b[36m0000/7B181B92-AA2C-1E44-86FE-B074D359BBB3.root\u001b[0m\n", - "│ └── \u001b[36mroot://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\u001b[0m\n", - "│ \u001b[36m0000/F09135D8-FCBE-AF40-BCE8-03A529C5C87F.root\u001b[0m\n", + "│ \u001b[36m1/2520000/F1B3977A-E777-EC4D-8FC7-981FE4ED5E0C.root\u001b[0m\n", + "├── \u001b[32mT2_UK_London_IC\u001b[0m\n", + "│ ├── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ │ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/59DA0585-BD57-CE49-A15E-CDBAC5473EDE.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ │ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/F16A9138-7563-E540-B6AD-8A8A688B3830.root\u001b[0m\n", + "│ └── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/FE3D79A6-27D4-8948-A89B-2F966C5B29D4.root\u001b[0m\n", + "├── \u001b[32mT1_US_FNAL_Disk\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ │ \u001b[36mDv9_GT36-v1/2520000/62789325-3C0B-FC4D-B578-B41A396399E4.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ │ 
\u001b[36mDv9_GT36-v1/2520000/6EAA5EDB-0DB3-6E40-87DC-7AB582295D29.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ │ \u001b[36mDv9_GT36-v1/2520000/A59D511A-A419-714F-8EE1-8B8BAFEC04D5.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ │ \u001b[36mDv9_GT36-v1/2520000/B78A9B75-3B32-CF4E-A144-375189CF48AE.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ │ \u001b[36mDv9_GT36-v1/2520000/B9E9087C-255C-C24D-A733-FB9291DC7C3C.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ │ \u001b[36mDv9_GT36-v1/2520000/CDD2CDF9-72D0-4045-B28F-89002077FB89.root\u001b[0m\n", + "│ └── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ \u001b[36mDv9_GT36-v1/2520000/ED95384D-9D3D-AE45-8425-C4C080E691C5.root\u001b[0m\n", + "├── \u001b[32mT1_IT_CNAF_Disk\u001b[0m\n", + "│ └── \u001b[36mroot://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ \u001b[36m2520000/648ECD9C-8AAA-BB46-8683-C8987CCC73B9.root\u001b[0m\n", "├── \u001b[32mT2_US_Nebraska\u001b[0m\n", + "│ ├── \u001b[36mroot://xrootd-local.unl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/69ABD79C-C684-8244-9F0D-153C6B8C2D9C.root\u001b[0m\n", + "│ ├── \u001b[36mroot://xrootd-local.unl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/AB8DD69D-A522-D44C-BB9C-209623F7D41A.root\u001b[0m\n", "│ └── \u001b[36mroot://xrootd-local.unl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", - "│ \u001b[36m1/2520000/74A75B73-E5B8-C942-BBC9-1DDDD7F752FB.root\u001b[0m\n", - "├── \u001b[32mT2_ES_CIEMAT\u001b[0m\n", - "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", - "│ │ \u001b[36m/2520000/7DEA3718-B7BC-EE42-A8BE-11C62BB8536D.root\u001b[0m\n", - "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", - "│ │ \u001b[36m/2520000/8223C4A3-D4BD-6A4B-A513-54B6668C7122.root\u001b[0m\n", - "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", - "│ │ \u001b[36m/2520000/A59D511A-A419-714F-8EE1-8B8BAFEC04D5.root\u001b[0m\n", - "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", - "│ │ \u001b[36m/2520000/A74EFE57-BAD2-C143-B8DC-817CE4F96FD7.root\u001b[0m\n", - "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", - "│ │ \u001b[36m/2520000/B1B449CE-5952-8347-A9A7-35FE231D0C72.root\u001b[0m\n", - "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", - "│ │ 
\u001b[36m/2520000/B9E9087C-255C-C24D-A733-FB9291DC7C3C.root\u001b[0m\n", - "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", - "│ │ \u001b[36m/2520000/D40D1285-B075-D446-B1BF-86A463EF6993.root\u001b[0m\n", - "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", - "│ │ \u001b[36m/2520000/DA47C0B6-BCAB-C54C-A6BF-B0A64E88E3D4.root\u001b[0m\n", - "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", - "│ │ \u001b[36m/2520000/ECD4877E-707B-EA43-A38B-D1B700FBDE79.root\u001b[0m\n", - "│ └── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", - "│ \u001b[36m/2520000/ED95384D-9D3D-AE45-8425-C4C080E691C5.root\u001b[0m\n", - "├── \u001b[32mT1_FR_CCIN2P3_Disk\u001b[0m\n", - "│ └── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", - "│ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/F16A9138-7563-E540-B6AD-8A8A688B3830.root\u001b[0m\n", + "│ \u001b[36m1/2520000/B3487FE0-B172-AD47-A13A-388C0A9BF93F.root\u001b[0m\n", "├── \u001b[32mT2_IT_Legnaro\u001b[0m\n", "│ └── \u001b[36mroot://t2-xrdcms.lnl.infn.it:7070///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-\u001b[0m\n", - "│ \u001b[36mv1/2520000/F6E44EA5-F4C6-E746-AD43-7A263F1E316E.root\u001b[0m\n", - "└── \u001b[32mT2_CH_CERN\u001b[0m\n", - " └── \u001b[36mroot://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", - " \u001b[36m520000/FCAF4145-8E3F-2142-BDCB-5E276523B592.root\u001b[0m\n" + "│ \u001b[36mv1/2520000/B1B449CE-5952-8347-A9A7-35FE231D0C72.root\u001b[0m\n", + "├── \u001b[32mT3_FR_IPNL\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/BA02D468-A8CE-4F49-884F-F836BB481AD5.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/BAAA6E00-7AC3-9947-9262-D9833D3A8B19.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/CBD43A1E-AE2F-0B4D-A642-29FB2E9EB33B.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/ECD4877E-707B-EA43-A38B-D1B700FBDE79.root\u001b[0m\n", + "│ └── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/F09135D8-FCBE-AF40-BCE8-03A529C5C87F.root\u001b[0m\n", + "├── \u001b[32mT2_DE_RWTH\u001b[0m\n", + "│ └── \u001b[36mroot://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\u001b[0m\n", + "│ \u001b[36m_NanoAODv9_GT36-v1/2520000/D40D1285-B075-D446-B1BF-86A463EF6993.root\u001b[0m\n", + "├── \u001b[32mT2_TR_METU\u001b[0m\n", + "│ └── 
\u001b[36mroot://eymir.grid.metu.edu.tr//dpm/grid.metu.edu.tr/home/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018\u001b[0m\n", + "│ \u001b[36m_MiniAODv2_NanoAODv9_GT36-v1/2520000/F34F4F00-3370-EF4D-AF44-39E474E6530F.root\u001b[0m\n", + "└── \u001b[32mT2_US_Florida\u001b[0m\n", + " └── \u001b[36mroot://cmsio2.rc.ufl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + " \u001b[36m520000/F6E44EA5-F4C6-E746-AD43-7A263F1E316E.root\u001b[0m\n" ] }, "metadata": {}, @@ -1253,23 +1233,42 @@ } ], "source": [ - "ddc.load_dataset_definition(dataset_definition)" + "ddc = DataDiscoveryCLI()\n", + "ddc.load_dataset_definition(dataset_definition, \n", + " query_results_strategy=\"all\",\n", + " replicas_strategy=\"round-robin\")" + ] + }, + { + "cell_type": "markdown", + "id": "db7798eb-eb9f-47e5-9239-92cdea20600f", + "metadata": {}, + "source": [ + "### Filtering sites" + ] + }, + { + "cell_type": "markdown", + "id": "bd57fe7b-0642-48b8-9f9f-cd209e50d867", + "metadata": {}, + "source": [ + "Sites filtering works in a very similar way for `DataDiscoveryCLI`" ] }, { "cell_type": "code", - "execution_count": 6, - "id": "dd9ca4ea-039d-4ebb-bbf2-79092ba6e7d0", + "execution_count": 17, + "id": "d85ca119-0a56-4c67-bb21-ebbca8164728", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
Selected datasets:\n",
+       "
 Querying rucio for replicas: /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\n",
        "
\n" ], "text/plain": [ - "\u001b[36mSelected datasets:\u001b[0m\n" + "\u001b[32m⠇\u001b[0m Querying rucio for replicas: \u001b[1;31m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\u001b[0m\n" ] }, "metadata": {}, @@ -1278,20 +1277,455 @@ { "data": { "text/html": [ - "
                                                 Selected datasets                                                 \n",
-       "┏━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳┳┓\n",
-       "┃ Dataset                                                                                                   ┃\n",
-       "┡━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇╇┩\n",
-       "│ 1  /DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realisti… │││\n",
-       "│ 2  /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD                                           │││\n",
-       "└───┴───────────────────────────────────────────────────────────────────────────────────────────────────────────┴┴┘\n",
-       "
\n" + "
\n"
       ],
-      "text/plain": [
-       "\u001b[3m                                                 Selected datasets                                                 \u001b[0m\n",
-       "┏━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳┳┓\n",
-       "┃\u001b[1m \u001b[0m\u001b[1m…\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mDataset                                                                                                  \u001b[0m\u001b[1m \u001b[0m┃┃┃\n",
-       "┡━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇╇┩\n",
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
Sites availability for dataset: /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[36mSites availability for dataset: \u001b[0m\u001b[31m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\u001b[31mNANOAOD\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                   Available replicas                   \n",
+       "┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━┓\n",
+       "┃ Index  Site                 Files    Availability ┃\n",
+       "┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━┩\n",
+       "│   0    T2_DE_DESY           67 / 67     100.0%    │\n",
+       "│   1   │ T3_FR_IPNL           67 / 67 │    100.0%    │\n",
+       "│   2    T2_UK_London_IC      39 / 67     58.2%     │\n",
+       "│   3   │ T1_FR_CCIN2P3_Disk   38 / 67 │    56.7%     │\n",
+       "│   4    T2_CH_CERN           25 / 67     37.3%     │\n",
+       "│   5   │ T2_DE_RWTH           22 / 67 │    32.8%     │\n",
+       "│   6    T1_IT_CNAF_Disk      20 / 67     29.9%     │\n",
+       "│   7   │ T1_DE_KIT_Disk       11 / 67 │    16.4%     │\n",
+       "│   8    T2_UK_SGrid_RALPP    6 / 67       9.0%     │\n",
+       "│   9   │ T2_IT_Legnaro        6 / 67  │     9.0%     │\n",
+       "│  10    T2_FR_IPHC           2 / 67       3.0%     │\n",
+       "│  11   │ T2_UK_London_Brunel  1 / 67  │     1.5%     │\n",
+       "└───────┴─────────────────────┴─────────┴──────────────┘\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Available replicas \u001b[0m\n", + "┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mIndex\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mSite \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mFiles \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mAvailability\u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━┩\n", + "│\u001b[2m \u001b[0m\u001b[2m 0 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_DE_DESY \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m67 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 1 │\u001b[36m \u001b[0m\u001b[36mT3_FR_IPNL \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m67 / 67\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 2 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_UK_London_IC \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m39 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 58.2% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 3 │\u001b[36m \u001b[0m\u001b[36mT1_FR_CCIN2P3_Disk \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m38 / 67\u001b[0m\u001b[35m \u001b[0m│ 56.7% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 4 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_CH_CERN \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m25 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 37.3% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 5 │\u001b[36m \u001b[0m\u001b[36mT2_DE_RWTH \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m22 / 67\u001b[0m\u001b[35m \u001b[0m│ 32.8% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 6 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_IT_CNAF_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m20 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 29.9% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 7 │\u001b[36m \u001b[0m\u001b[36mT1_DE_KIT_Disk \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m11 / 67\u001b[0m\u001b[35m \u001b[0m│ 16.4% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 8 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_UK_SGrid_RALPP \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m6 / 67 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 9.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 9 │\u001b[36m \u001b[0m\u001b[36mT2_IT_Legnaro \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m6 / 67 \u001b[0m\u001b[35m \u001b[0m│ 9.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 10 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_FR_IPHC \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m2 / 67 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 3.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 11 │\u001b[36m \u001b[0m\u001b[36mT2_UK_London_Brunel\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m1 / 67 \u001b[0m\u001b[35m \u001b[0m│ 1.5% │\n", + "└───────┴─────────────────────┴─────────┴──────────────┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Replicas for /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\n",
+       "├── T2_CH_CERN\n",
+       "│   ├── root://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "│   │   520000/0144EC47-BFA3-EA43-BF05-BD4248ED6031.root\n",
+       "│   ├── root://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "│   │   520000/1DD0FAC6-3087-E44E-ABCB-8AF812C1310D.root\n",
+       "│   ├── root://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "│   │   520000/2747DEFE-A247-1F42-B0EF-E7B7F1D3FCD6.root\n",
+       "│   ├── root://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "│   │   520000/2DA9130E-8423-304C-9902-1E42CD72E658.root\n",
+       "│   ├── root://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "│   │   520000/39D52C69-2035-A24B-A413-40976993651D.root\n",
+       "│   ├── root://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "│   │   520000/69ABD79C-C684-8244-9F0D-153C6B8C2D9C.root\n",
+       "│   ├── root://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "│   │   520000/7CCCB2C3-F210-2C42-85DF-AA00293FACFB.root\n",
+       "│   └── root://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "│       520000/F34F4F00-3370-EF4D-AF44-39E474E6530F.root\n",
+       "├── T3_FR_IPNL\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/0C9615C1-7EE6-CD44-8FC0-04F63B2C16FD.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/30A3A1AB-2F27-C84E-9437-6BB3881F6856.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/410C32AB-DEB5-404F-BC6B-92E8F560563F.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/42DC0F42-82E8-BE47-B04D-544B67274829.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/62789325-3C0B-FC4D-B578-B41A396399E4.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/6809B5E3-6DE6-1541-AE4C-E1804C877EDE.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/78AC6A39-C303-EB44-9264-71819CC70FCC.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/A350E2E4-705C-2C4D-9B11-3436056EEBE7.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/FCAF4145-8E3F-2142-BDCB-5E276523B592.root\n",
+       "│   └── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│       Dv2_NanoAODv9_GT36-v1/2520000/FE3D79A6-27D4-8948-A89B-2F966C5B29D4.root\n",
+       "├── T2_UK_London_IC\n",
+       "│   ├── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│   │   OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/12FAE9F1-7139-924C-A8DE-9699A00FC994.root\n",
+       "│   ├── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│   │   OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/63047CC0-38C6-F74C-9A00-0DF9050F7CF1.root\n",
+       "│   ├── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│   │   OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/8369B0EA-E4CC-AC4D-BD3F-0679B3310E09.root\n",
+       "│   ├── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│   │   OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/AE014F55-84BE-E84E-B447-0B614070CD17.root\n",
+       "│   ├── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│   │   OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/F16A9138-7563-E540-B6AD-8A8A688B3830.root\n",
+       "│   └── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│       OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/FAF0C67B-A8B4-8A4F-83B1-E43675CE9630.root\n",
+       "├── T1_FR_CCIN2P3_Disk\n",
+       "│   ├── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│   │   18_MiniAODv2_NanoAODv9_GT36-v1/2520000/152C304A-97AD-1649-BCB6-3EA0CCD0DD33.root\n",
+       "│   ├── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│   │   18_MiniAODv2_NanoAODv9_GT36-v1/2520000/37312354-59AB-E44B-BC94-CF424D4B7DDB.root\n",
+       "│   ├── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│   │   18_MiniAODv2_NanoAODv9_GT36-v1/2520000/7B14228A-5331-DF4E-B677-7B8AA281D460.root\n",
+       "│   ├── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│   │   18_MiniAODv2_NanoAODv9_GT36-v1/2520000/7B181B92-AA2C-1E44-86FE-B074D359BBB3.root\n",
+       "│   ├── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│   │   18_MiniAODv2_NanoAODv9_GT36-v1/2520000/C4F476DA-3D00-334B-867C-7E12F94EE3AB.root\n",
+       "│   ├── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│   │   18_MiniAODv2_NanoAODv9_GT36-v1/2520000/D8D41BBC-D514-D342-A514-CCF48575D184.root\n",
+       "│   └── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│       18_MiniAODv2_NanoAODv9_GT36-v1/2520000/FE5EEFA5-C07A-5C44-B66D-5B31BE02C7D3.root\n",
+       "├── T2_FR_IPHC\n",
+       "│   └── root://sbgdcache.in2p3.fr///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/25200\n",
+       "│       00/1CEB718A-7DC1-C74A-A7BE-A3C8D9FA785A.root\n",
+       "├── T2_DE_DESY\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/26FC8C40-EA29-804C-B17D-84FB1C6BC505.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/2D58C3FE-512A-1F48-9AEB-6F80379B8F4A.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/459261DD-4441-6047-9FF2-1EDE468452C9.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/51515E3C-C640-3A4C-A16C-DC267FD142BF.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/648ECD9C-8AAA-BB46-8683-C8987CCC73B9.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/74A75B73-E5B8-C942-BBC9-1DDDD7F752FB.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/81CEA7BA-9E66-BC4F-A96F-32642D59B653.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/8223C4A3-D4BD-6A4B-A513-54B6668C7122.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/8C8690F8-4FEE-1047-85F4-29E414B3D12C.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/B78A9B75-3B32-CF4E-A144-375189CF48AE.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/BAAA6E00-7AC3-9947-9262-D9833D3A8B19.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/BCBF89A2-329C-744B-A38F-139EA8F94007.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/CBD43A1E-AE2F-0B4D-A642-29FB2E9EB33B.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/D40D1285-B075-D446-B1BF-86A463EF6993.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/DA47C0B6-BCAB-C54C-A6BF-B0A64E88E3D4.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/ECD4877E-707B-EA43-A38B-D1B700FBDE79.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/ED95384D-9D3D-AE45-8425-C4C080E691C5.root\n",
+       "│   └── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│       36-v1/2520000/F1B3977A-E777-EC4D-8FC7-981FE4ED5E0C.root\n",
+       "├── T1_DE_KIT_Disk\n",
+       "│   ├── root://cmsxrootd-kit-disk.gridka.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv\n",
+       "│   │   9_GT36-v1/2520000/365F32F6-F971-1B4D-8E9D-C0ACD74FFB03.root\n",
+       "│   ├── root://cmsxrootd-kit-disk.gridka.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv\n",
+       "│   │   9_GT36-v1/2520000/3FE5B677-9AB3-0245-A1CF-4B320592F18F.root\n",
+       "│   ├── root://cmsxrootd-kit-disk.gridka.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv\n",
+       "│   │   9_GT36-v1/2520000/6DDF448B-4605-5C41-9711-1C73EC5F01D3.root\n",
+       "│   ├── root://cmsxrootd-kit-disk.gridka.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv\n",
+       "│   │   9_GT36-v1/2520000/6EAA5EDB-0DB3-6E40-87DC-7AB582295D29.root\n",
+       "│   └── root://cmsxrootd-kit-disk.gridka.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv\n",
+       "│       9_GT36-v1/2520000/7DEA3718-B7BC-EE42-A8BE-11C62BB8536D.root\n",
+       "├── T2_DE_RWTH\n",
+       "│   ├── root://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\n",
+       "│   │   _NanoAODv9_GT36-v1/2520000/59DA0585-BD57-CE49-A15E-CDBAC5473EDE.root\n",
+       "│   ├── root://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\n",
+       "│   │   _NanoAODv9_GT36-v1/2520000/A59D511A-A419-714F-8EE1-8B8BAFEC04D5.root\n",
+       "│   └── root://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\n",
+       "│       _NanoAODv9_GT36-v1/2520000/B9E9087C-255C-C24D-A733-FB9291DC7C3C.root\n",
+       "├── T1_IT_CNAF_Disk\n",
+       "│   ├── root://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/A74EFE57-BAD2-C143-B8DC-817CE4F96FD7.root\n",
+       "│   ├── root://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/AB8DD69D-A522-D44C-BB9C-209623F7D41A.root\n",
+       "│   ├── root://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/B3487FE0-B172-AD47-A13A-388C0A9BF93F.root\n",
+       "│   ├── root://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/CDD2CDF9-72D0-4045-B28F-89002077FB89.root\n",
+       "│   ├── root://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/D7875684-9F26-084E-9B2B-5E9BB5D353E8.root\n",
+       "│   └── root://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│       2520000/F09135D8-FCBE-AF40-BCE8-03A529C5C87F.root\n",
+       "└── T2_UK_SGrid_RALPP\n",
+       "    ├── root://mover.pp.rl.ac.uk:1094/pnfs/pp.rl.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_Mini\n",
+       "    │   AODv2_NanoAODv9_GT36-v1/2520000/B1B449CE-5952-8347-A9A7-35FE231D0C72.root\n",
+       "    ├── root://mover.pp.rl.ac.uk:1094/pnfs/pp.rl.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_Mini\n",
+       "    │   AODv2_NanoAODv9_GT36-v1/2520000/BA02D468-A8CE-4F49-884F-F836BB481AD5.root\n",
+       "    └── root://mover.pp.rl.ac.uk:1094/pnfs/pp.rl.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_Mini\n",
+       "        AODv2_NanoAODv9_GT36-v1/2520000/F6E44EA5-F4C6-E746-AD43-7A263F1E316E.root\n",
+       "
\n" + ], + "text/plain": [ + "Replicas for \u001b[32m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\u001b[0m\n", + "├── \u001b[32mT2_CH_CERN\u001b[0m\n", + "│ ├── \u001b[36mroot://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + "│ │ \u001b[36m520000/0144EC47-BFA3-EA43-BF05-BD4248ED6031.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + "│ │ \u001b[36m520000/1DD0FAC6-3087-E44E-ABCB-8AF812C1310D.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + "│ │ \u001b[36m520000/2747DEFE-A247-1F42-B0EF-E7B7F1D3FCD6.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + "│ │ \u001b[36m520000/2DA9130E-8423-304C-9902-1E42CD72E658.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + "│ │ \u001b[36m520000/39D52C69-2035-A24B-A413-40976993651D.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + "│ │ \u001b[36m520000/69ABD79C-C684-8244-9F0D-153C6B8C2D9C.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + "│ │ \u001b[36m520000/7CCCB2C3-F210-2C42-85DF-AA00293FACFB.root\u001b[0m\n", + "│ └── \u001b[36mroot://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + "│ \u001b[36m520000/F34F4F00-3370-EF4D-AF44-39E474E6530F.root\u001b[0m\n", + "├── \u001b[32mT3_FR_IPNL\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/0C9615C1-7EE6-CD44-8FC0-04F63B2C16FD.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/30A3A1AB-2F27-C84E-9437-6BB3881F6856.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/410C32AB-DEB5-404F-BC6B-92E8F560563F.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/42DC0F42-82E8-BE47-B04D-544B67274829.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/62789325-3C0B-FC4D-B578-B41A396399E4.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/6809B5E3-6DE6-1541-AE4C-E1804C877EDE.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", 
+ "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/78AC6A39-C303-EB44-9264-71819CC70FCC.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/A350E2E4-705C-2C4D-9B11-3436056EEBE7.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/FCAF4145-8E3F-2142-BDCB-5E276523B592.root\u001b[0m\n", + "│ └── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/FE3D79A6-27D4-8948-A89B-2F966C5B29D4.root\u001b[0m\n", + "├── \u001b[32mT2_UK_London_IC\u001b[0m\n", + "│ ├── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ │ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/12FAE9F1-7139-924C-A8DE-9699A00FC994.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ │ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/63047CC0-38C6-F74C-9A00-0DF9050F7CF1.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ │ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/8369B0EA-E4CC-AC4D-BD3F-0679B3310E09.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ │ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/AE014F55-84BE-E84E-B447-0B614070CD17.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ │ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/F16A9138-7563-E540-B6AD-8A8A688B3830.root\u001b[0m\n", + "│ └── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/FAF0C67B-A8B4-8A4F-83B1-E43675CE9630.root\u001b[0m\n", + "├── \u001b[32mT1_FR_CCIN2P3_Disk\u001b[0m\n", + "│ ├── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ │ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/152C304A-97AD-1649-BCB6-3EA0CCD0DD33.root\u001b[0m\n", + "│ ├── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ │ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/37312354-59AB-E44B-BC94-CF424D4B7DDB.root\u001b[0m\n", + "│ ├── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ │ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/7B14228A-5331-DF4E-B677-7B8AA281D460.root\u001b[0m\n", + "│ ├── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ │ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/7B181B92-AA2C-1E44-86FE-B074D359BBB3.root\u001b[0m\n", + "│ ├── 
\u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ │ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/C4F476DA-3D00-334B-867C-7E12F94EE3AB.root\u001b[0m\n", + "│ ├── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ │ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/D8D41BBC-D514-D342-A514-CCF48575D184.root\u001b[0m\n", + "│ └── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/FE5EEFA5-C07A-5C44-B66D-5B31BE02C7D3.root\u001b[0m\n", + "├── \u001b[32mT2_FR_IPHC\u001b[0m\n", + "│ └── \u001b[36mroot://sbgdcache.in2p3.fr///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/25200\u001b[0m\n", + "│ \u001b[36m00/1CEB718A-7DC1-C74A-A7BE-A3C8D9FA785A.root\u001b[0m\n", + "├── \u001b[32mT2_DE_DESY\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/26FC8C40-EA29-804C-B17D-84FB1C6BC505.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/2D58C3FE-512A-1F48-9AEB-6F80379B8F4A.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/459261DD-4441-6047-9FF2-1EDE468452C9.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/51515E3C-C640-3A4C-A16C-DC267FD142BF.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/648ECD9C-8AAA-BB46-8683-C8987CCC73B9.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/74A75B73-E5B8-C942-BBC9-1DDDD7F752FB.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/81CEA7BA-9E66-BC4F-A96F-32642D59B653.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/8223C4A3-D4BD-6A4B-A513-54B6668C7122.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/8C8690F8-4FEE-1047-85F4-29E414B3D12C.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/B78A9B75-3B32-CF4E-A144-375189CF48AE.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/BAAA6E00-7AC3-9947-9262-D9833D3A8B19.root\u001b[0m\n", + "│ ├── 
\u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/BCBF89A2-329C-744B-A38F-139EA8F94007.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/CBD43A1E-AE2F-0B4D-A642-29FB2E9EB33B.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/D40D1285-B075-D446-B1BF-86A463EF6993.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/DA47C0B6-BCAB-C54C-A6BF-B0A64E88E3D4.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/ECD4877E-707B-EA43-A38B-D1B700FBDE79.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/ED95384D-9D3D-AE45-8425-C4C080E691C5.root\u001b[0m\n", + "│ └── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ \u001b[36m36-v1/2520000/F1B3977A-E777-EC4D-8FC7-981FE4ED5E0C.root\u001b[0m\n", + "├── \u001b[32mT1_DE_KIT_Disk\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsxrootd-kit-disk.gridka.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv\u001b[0m\n", + "│ │ \u001b[36m9_GT36-v1/2520000/365F32F6-F971-1B4D-8E9D-C0ACD74FFB03.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsxrootd-kit-disk.gridka.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv\u001b[0m\n", + "│ │ \u001b[36m9_GT36-v1/2520000/3FE5B677-9AB3-0245-A1CF-4B320592F18F.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsxrootd-kit-disk.gridka.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv\u001b[0m\n", + "│ │ \u001b[36m9_GT36-v1/2520000/6DDF448B-4605-5C41-9711-1C73EC5F01D3.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsxrootd-kit-disk.gridka.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv\u001b[0m\n", + "│ │ \u001b[36m9_GT36-v1/2520000/6EAA5EDB-0DB3-6E40-87DC-7AB582295D29.root\u001b[0m\n", + "│ └── \u001b[36mroot://cmsxrootd-kit-disk.gridka.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv\u001b[0m\n", + "│ \u001b[36m9_GT36-v1/2520000/7DEA3718-B7BC-EE42-A8BE-11C62BB8536D.root\u001b[0m\n", + "├── \u001b[32mT2_DE_RWTH\u001b[0m\n", + "│ ├── \u001b[36mroot://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\u001b[0m\n", + "│ │ \u001b[36m_NanoAODv9_GT36-v1/2520000/59DA0585-BD57-CE49-A15E-CDBAC5473EDE.root\u001b[0m\n", + "│ ├── \u001b[36mroot://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\u001b[0m\n", + "│ │ \u001b[36m_NanoAODv9_GT36-v1/2520000/A59D511A-A419-714F-8EE1-8B8BAFEC04D5.root\u001b[0m\n", + "│ └── \u001b[36mroot://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\u001b[0m\n", + "│ \u001b[36m_NanoAODv9_GT36-v1/2520000/B9E9087C-255C-C24D-A733-FB9291DC7C3C.root\u001b[0m\n", + "├── 
\u001b[32mT1_IT_CNAF_Disk\u001b[0m\n", + "│ ├── \u001b[36mroot://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ \u001b[36m2520000/A74EFE57-BAD2-C143-B8DC-817CE4F96FD7.root\u001b[0m\n", + "│ ├── \u001b[36mroot://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ \u001b[36m2520000/AB8DD69D-A522-D44C-BB9C-209623F7D41A.root\u001b[0m\n", + "│ ├── \u001b[36mroot://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ \u001b[36m2520000/B3487FE0-B172-AD47-A13A-388C0A9BF93F.root\u001b[0m\n", + "│ ├── \u001b[36mroot://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ \u001b[36m2520000/CDD2CDF9-72D0-4045-B28F-89002077FB89.root\u001b[0m\n", + "│ ├── \u001b[36mroot://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ \u001b[36m2520000/D7875684-9F26-084E-9B2B-5E9BB5D353E8.root\u001b[0m\n", + "│ └── \u001b[36mroot://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ \u001b[36m2520000/F09135D8-FCBE-AF40-BCE8-03A529C5C87F.root\u001b[0m\n", + "└── \u001b[32mT2_UK_SGrid_RALPP\u001b[0m\n", + " ├── \u001b[36mroot://mover.pp.rl.ac.uk:1094/pnfs/pp.rl.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_Mini\u001b[0m\n", + " │ \u001b[36mAODv2_NanoAODv9_GT36-v1/2520000/B1B449CE-5952-8347-A9A7-35FE231D0C72.root\u001b[0m\n", + " ├── \u001b[36mroot://mover.pp.rl.ac.uk:1094/pnfs/pp.rl.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_Mini\u001b[0m\n", + " │ \u001b[36mAODv2_NanoAODv9_GT36-v1/2520000/BA02D468-A8CE-4F49-884F-F836BB481AD5.root\u001b[0m\n", + " └── \u001b[36mroot://mover.pp.rl.ac.uk:1094/pnfs/pp.rl.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_Mini\u001b[0m\n", + " \u001b[36mAODv2_NanoAODv9_GT36-v1/2520000/F6E44EA5-F4C6-E746-AD43-7A263F1E316E.root\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Selected datasets:\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[36mSelected datasets:\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                                                 Selected datasets                                                 \n",
+       "┏━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳┳┓\n",
+       "┃ Dataset                                                                                                   ┃\n",
+       "┡━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇╇┩\n",
+       "│ 1  /DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realisti… │││\n",
+       "│ 2  /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD                                           │││\n",
+       "└───┴───────────────────────────────────────────────────────────────────────────────────────────────────────────┴┴┘\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Selected datasets \u001b[0m\n", + "┏━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳┳┓\n", + "┃\u001b[1m \u001b[0m\u001b[1m…\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mDataset \u001b[0m\u001b[1m \u001b[0m┃┃┃\n", + "┡━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇╇┩\n", + "│\u001b[36m \u001b[0m\u001b[36m1\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realisti…\u001b[0m\u001b[35m \u001b[0m│││\n", + "│\u001b[36m \u001b[0m\u001b[36m2\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD \u001b[0m\u001b[35m \u001b[0m│││\n", + "└───┴───────────────────────────────────────────────────────────────────────────────────────────────────────────┴┴┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ddc = DataDiscoveryCLI()\n", + "ddc.do_regex_sites(r\"T[123]_(CH|IT|UK|FR|DE)_\\w+\")\n", + "ddc.load_dataset_definition(dataset_definition, \n", + " query_results_strategy=\"all\",\n", + " replicas_strategy=\"round-robin\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "dd9ca4ea-039d-4ebb-bbf2-79092ba6e7d0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Selected datasets:\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[36mSelected datasets:\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                                                 Selected datasets                                                 \n",
+       "┏━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳┳┓\n",
+       "┃ Dataset                                                                                                   ┃\n",
+       "┡━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇╇┩\n",
+       "│ 1  /DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realisti… │││\n",
+       "│ 2  /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD                                           │││\n",
+       "└───┴───────────────────────────────────────────────────────────────────────────────────────────────────────────┴┴┘\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Selected datasets \u001b[0m\n", + "┏━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳┳┓\n", + "┃\u001b[1m \u001b[0m\u001b[1m…\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mDataset \u001b[0m\u001b[1m \u001b[0m┃┃┃\n", + "┡━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇╇┩\n", "│\u001b[36m \u001b[0m\u001b[36m1\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realisti…\u001b[0m\u001b[35m \u001b[0m│││\n", "│\u001b[36m \u001b[0m\u001b[36m2\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD \u001b[0m\u001b[35m \u001b[0m│││\n", "└───┴───────────────────────────────────────────────────────────────────────────────────────────────────────────┴┴┘\n" @@ -1315,7 +1749,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 20, "id": "b0e3e4b8-34d4-4558-988a-edacd1df9b37", "metadata": {}, "outputs": [ @@ -1339,57 +1773,152 @@ }, { "cell_type": "markdown", - "id": "f7d52663-c5e3-4abe-9c2f-4bf8f08d8919", + "id": "f9f6a70b-0194-4b00-ab79-4fdb0b4fa0cf", "metadata": {}, "source": [ - "## Preprocess the fileset with dask" + "## DataDiscoveryCLI from shell" + ] + }, + { + "cell_type": "markdown", + "id": "7237fc9e-50b8-4cc4-9c51-9674fbf4358a", + "metadata": {}, + "source": [ + "The DataDiscoveryCLI can be used directly from CLI" ] }, { "cell_type": "code", - "execution_count": 8, - "id": "125cd0ea-ff05-414a-9177-2be98eb88362", + "execution_count": 35, + "id": "2c075f2e-a06e-4c97-b5b6-6a6806571a9a", "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "\u001b[0;31mSignature:\u001b[0m\n", - "\u001b[0mddc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdo_preprocess\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0moutput_file\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mstep_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0malign_to_clusters\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mdask_cluster\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mDocstring:\u001b[0m\n", - "Perform preprocessing for concrete fileset extraction.\n", - "Args: output_file [step_size] [align to file cluster boundaries] [dask cluster url]\n", - "\u001b[0;31mFile:\u001b[0m /work/dvalsecc/coffea/src/coffea/dataset_tools/dataset_query.py\n", - "\u001b[0;31mType:\u001b[0m method" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stdout", + "output_type": "stream", + "text": [ + "usage: dataset_query.py [-h] [--cli] [-d DATASET_DEFINITION] [-o OUTPUT]\n", + " [-fo FILESET_OUTPUT] [-p] [--step-size STEP_SIZE]\n", + " [--dask-cluster DASK_CLUSTER]\n", + " [-as ALLOW_SITES [ALLOW_SITES ...]]\n", + " [-bs BLOCK_SITES [BLOCK_SITES ...]] [-rs REGEX_SITES]\n", + " [--query-results-strategy QUERY_RESULTS_STRATEGY]\n", + " [--replicas-strategy REPLICAS_STRATEGY]\n", + "\n", + 
"options:\n", + " -h, --help show this help message and exit\n", + " --cli Start the dataset discovery CLI\n", + " -d DATASET_DEFINITION, --dataset-definition DATASET_DEFINITION\n", + " Dataset definition file\n", + " -o OUTPUT, --output OUTPUT\n", + " Output name for dataset discovery output (no fileset\n", + " preprocessing)\n", + " -fo FILESET_OUTPUT, --fileset-output FILESET_OUTPUT\n", + " Output name for fileset\n", + " -p, --preprocess Preprocess with dask\n", + " --step-size STEP_SIZE\n", + " Step size for preprocessing\n", + " --dask-cluster DASK_CLUSTER\n", + " Dask cluster url\n", + " -as ALLOW_SITES [ALLOW_SITES ...], --allow-sites ALLOW_SITES [ALLOW_SITES ...]\n", + " List of sites to be allowlisted\n", + " -bs BLOCK_SITES [BLOCK_SITES ...], --block-sites BLOCK_SITES [BLOCK_SITES ...]\n", + " List of sites to be blocklisted\n", + " -rs REGEX_SITES, --regex-sites REGEX_SITES\n", + " Regex string to be used to filter the sites\n", + " --query-results-strategy QUERY_RESULTS_STRATEGY\n", + " Mode for query results selection: [all|manual]\n", + " --replicas-strategy REPLICAS_STRATEGY\n", + " Mode for selecting replicas for datasets:\n", + " [manual|round-robin|choose]\n" + ] } ], "source": [ - "ddc.do_preprocess?" + "!python -m coffea.dataset_tools.dataset_query --help" ] }, { "cell_type": "code", "execution_count": null, + "id": "e93cb24c-44ed-43f1-8aae-0f6b03c88de0", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m coffea.dataset_tools.dataset_query --cli -d dataset_definition.json" + ] + }, + { + "cell_type": "markdown", + "id": "f7d52663-c5e3-4abe-9c2f-4bf8f08d8919", + "metadata": {}, + "source": [ + "## Preprocess the fileset with dask" + ] + }, + { + "cell_type": "markdown", + "id": "046a0c99-6500-41b5-9954-fa7b78061800", + "metadata": {}, + "source": [ + "The replicas metadata contain the file location in the CMS grid. \n", + "This info can be **preprocessed** with uproot and dask-awkward to extract the **fileset**. Practically a fileset is a collection of metadata about the file location, file name, chunks splitting, that can be used directly to configure the uproot reading. \n", + "\n", + "This step replaces the preprocessing step in coffea 0.7.x. The output of the preprocessing can be used directly to start an analysis with dask-awkward.\n", + "\n", + "The preprocessing is performed locally with multiple processes if `dask_cluster==None`, but a pre-existing dask cluster url can be passed." + ] + }, + { + "cell_type": "code", + "execution_count": 22, "id": "04a2aeca-9c9f-4baf-b33b-b4f1b5ba4d4a", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
  Preprocessing files to extract available chunks with dask\n",
+       "
  Preprocessing files to extract available chunks with dask\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[32m⠙\u001b[0m \u001b[31m Preprocessing files to extract available chunks with dask\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
Saved available fileset chunks to fileset_available.json.gz\n",
        "
\n" ], "text/plain": [ - "\u001b[32m⠧\u001b[0m \u001b[31m Preprocessing files to extract available chunks with dask\u001b[0m\n" + "Saved available fileset chunks to fileset_available.json.gz\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Saved all fileset chunks to fileset_all.json.gz\n",
+       "
\n" + ], + "text/plain": [ + "Saved all fileset chunks to fileset_all.json.gz\n" ] }, "metadata": {}, @@ -1397,18 +1926,56 @@ } ], "source": [ - "ddc.do_preprocess(output_file=\"fileset\", \n", - " step_size=10000,\n", + "fileset_total = ddc.do_preprocess(output_file=\"fileset\", \n", + " step_size=10000, #chunk size for files splitting\n", " align_to_clusters=False,\n", " dask_cluster=None)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "id": "d1206bce-b726-43cc-b217-d74fd5516147", "metadata": {}, "outputs": [], + "source": [ + "import gzip\n", + "import json\n", + "with gzip.open(\"fileset_available.json.gz\", \"rt\") as file:\n", + " fileset_available = json.load(file)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "957ea9c6-783a-4932-960f-cbec5f2f0656", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root://cmsxrd.ts.infn.it:1094///store/mc/RunIISummer20UL18NanoAODv9/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/NANOAODSIM/106X_upgrade2018_realistic_v16_L1v1-v2/100000/13D0AD97-6B32-CB4C-BA87-5E37BA4CF20E.root {'object_path': 'Events', 'steps': [[0, 10000], [10000, 20000], [20000, 30000], [30000, 40000], [40000, 50000], [50000, 59081]], 'uuid': 'fbe50b00-1f7e-11ec-97b8-2bbee183beef'}\n", + "root://cmsxrd.ts.infn.it:1094///store/mc/RunIISummer20UL18NanoAODv9/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/NANOAODSIM/106X_upgrade2018_realistic_v16_L1v1-v2/230000/00C9792D-ACD2-2547-BB04-097F0C4E47E3.root {'object_path': 'Events', 'steps': [[0, 10000], [10000, 20000], [20000, 30000], [30000, 40000], [40000, 50000], [50000, 60000], [60000, 70000], [70000, 80000], [80000, 90000], [90000, 100000], [100000, 110000], [110000, 120000], [120000, 130000], [130000, 138192]], 'uuid': '938a4fe2-1d77-11ec-bddf-59319e86beef'}\n", + "root://dcache-cms-xrootd.desy.de:1094//store/mc/RunIISummer20UL18NanoAODv9/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/NANOAODSIM/106X_upgrade2018_realistic_v16_L1v1-v2/230000/00EA9563-5449-D24E-9566-98AE8E2A61AE.root {'object_path': 'Events', 'steps': [[0, 10000], [10000, 20000], [20000, 30000], [30000, 40000], [40000, 50000], [50000, 60000], [60000, 70000], [70000, 80000], [80000, 90000], [90000, 100000], [100000, 110000], [110000, 120000], [120000, 130000], [130000, 140000], [140000, 150000], [150000, 160000], [160000, 170000], [170000, 180000], [180000, 190000], [190000, 200000], [200000, 210000], [210000, 220000], [220000, 230000], [230000, 240000], [240000, 250000], [250000, 260000], [260000, 270000], [270000, 280000], [280000, 290000], [290000, 300000], [300000, 310000], [310000, 320000], [320000, 330000], [330000, 340000], [340000, 350000], [350000, 360000], [360000, 370000], [370000, 380000], [380000, 390000], [390000, 400000], [400000, 410000], [410000, 420000], [420000, 430000], [430000, 440000], [440000, 450000], [450000, 460000], [460000, 470000], [470000, 480000], [480000, 490000], [490000, 500000], [500000, 510000], [510000, 520000], [520000, 530000], [530000, 540000], [540000, 550000], [550000, 560000], [560000, 570000], [570000, 580000], [580000, 590000], [590000, 600000], [600000, 610000], [610000, 620000], [620000, 630000], [630000, 640000], [640000, 650000], [650000, 660000], [660000, 670000], [670000, 680000], [680000, 690000], [690000, 700000], [700000, 710000], [710000, 720000], [720000, 730000], [730000, 740000], [740000, 750000], [750000, 760000], [760000, 770000], [770000, 780000], [780000, 790000], [790000, 800000], [800000, 
810000], [810000, 820000], [820000, 830000], [830000, 840000], [840000, 850000], [850000, 860000], [860000, 870000], [870000, 880000], [880000, 890000], [890000, 900000], [900000, 910000], [910000, 920000], [920000, 930000], [930000, 940000], [940000, 950000], [950000, 960000], [960000, 970000], [970000, 980000], [980000, 990000], [990000, 1000000], [1000000, 1010000], [1010000, 1020000], [1020000, 1030000], [1030000, 1040000], [1040000, 1050000], [1050000, 1060000], [1060000, 1070000], [1070000, 1080000], [1080000, 1090000], [1090000, 1100000], [1100000, 1110000], [1110000, 1120000], [1120000, 1130000], [1130000, 1140000], [1140000, 1150000], [1150000, 1160000], [1160000, 1170000], [1170000, 1180000], [1180000, 1190000], [1190000, 1200000], [1200000, 1210000], [1210000, 1220000], [1220000, 1230000], [1230000, 1240000], [1240000, 1250000], [1250000, 1260000], [1260000, 1270000], [1270000, 1280000], [1280000, 1290000], [1290000, 1300000], [1300000, 1310000], [1310000, 1320000], [1320000, 1330000], [1330000, 1340000], [1340000, 1350000], [1350000, 1360000], [1360000, 1370000], [1370000, 1380000], [1380000, 1390000], [1390000, 1400000], [1400000, 1410000], [1410000, 1420000], [1420000, 1430000], [1430000, 1440000], [1440000, 1450000], [1450000, 1460000], [1460000, 1470000], [1470000, 1480000], [1480000, 1490000], [1490000, 1500000], [1500000, 1510000], [1510000, 1520000], [1520000, 1530000], [1530000, 1540000], [1540000, 1550000], [1550000, 1551326]], 'uuid': 'ced110a0-1b0f-11ec-b2e9-09c08e80beef'}\n", + "root://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/mc/RunIISummer20UL18NanoAODv9/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/NANOAODSIM/106X_upgrade2018_realistic_v16_L1v1-v2/230000/068B0797-DEF5-9341-BBBE-EDBE50EBC6A1.root {'object_path': 'Events', 'steps': [[0, 10000], [10000, 20000], [20000, 30000], [30000, 40000], [40000, 50000], [50000, 60000], [60000, 70000], [70000, 80000], [80000, 90000], [90000, 100000], [100000, 110000], [110000, 120000], [120000, 130000], [130000, 140000], [140000, 150000], [150000, 160000], [160000, 170000], [170000, 180000], [180000, 190000], [190000, 200000], [200000, 210000], [210000, 220000], [220000, 230000], [230000, 240000], [240000, 250000], [250000, 260000], [260000, 270000], [270000, 280000], [280000, 290000], [290000, 300000], [300000, 310000], [310000, 320000], [320000, 330000], [330000, 340000], [340000, 350000], [350000, 360000], [360000, 370000], [370000, 380000], [380000, 390000], [390000, 400000], [400000, 410000], [410000, 420000], [420000, 430000], [430000, 440000], [440000, 450000], [450000, 460000], [460000, 470000], [470000, 480000], [480000, 490000], [490000, 500000], [500000, 510000], [510000, 520000], [520000, 530000], [530000, 540000], [540000, 550000], [550000, 560000], [560000, 570000], [570000, 580000], [580000, 590000], [590000, 600000], [600000, 610000], [610000, 620000], [620000, 630000], [630000, 640000], [640000, 650000], [650000, 660000], [660000, 670000], [670000, 680000], [680000, 690000], [690000, 700000], [700000, 710000], [710000, 720000], [720000, 730000], [730000, 740000], [740000, 750000], [750000, 760000], [760000, 770000], [770000, 780000], [780000, 790000], [790000, 800000], [800000, 810000], [810000, 820000], [820000, 830000], [830000, 840000], [840000, 850000], [850000, 860000], [860000, 870000], [870000, 880000], [880000, 890000], [890000, 900000], [900000, 910000], [910000, 920000], [920000, 930000], [930000, 940000], [940000, 950000], [950000, 960000], [960000, 970000], [970000, 980000], [980000, 
990000], [990000, 1000000], [1000000, 1010000], [1010000, 1020000], [1020000, 1030000], [1030000, 1040000], [1040000, 1050000], [1050000, 1060000], [1060000, 1070000], [1070000, 1080000], [1080000, 1090000], [1090000, 1100000], [1100000, 1110000], [1110000, 1120000], [1120000, 1130000], [1130000, 1138724]], 'uuid': 'd86ab2e2-1b28-11ec-8504-738a8e80beef'}\n", + "root://cmsxrd.ts.infn.it:1094///store/mc/RunIISummer20UL18NanoAODv9/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/NANOAODSIM/106X_upgrade2018_realistic_v16_L1v1-v2/230000/0CFD79EF-41AB-4B4A-8F62-06393273EEDE.root {'object_path': 'Events', 'steps': [[0, 10000], [10000, 20000], [20000, 30000], [30000, 40000], [40000, 50000], [50000, 60000], [60000, 70000], [70000, 80000], [80000, 90000], [90000, 100000], [100000, 110000], [110000, 120000], [120000, 130000], [130000, 140000], [140000, 150000], [150000, 160000], [160000, 170000], [170000, 180000], [180000, 190000], [190000, 200000], [200000, 210000], [210000, 220000], [220000, 230000], [230000, 240000], [240000, 250000], [250000, 260000], [260000, 270000], [270000, 280000], [280000, 290000], [290000, 300000], [300000, 310000], [310000, 320000], [320000, 330000], [330000, 340000], [340000, 350000], [350000, 360000], [360000, 370000], [370000, 380000], [380000, 390000], [390000, 400000], [400000, 410000], [410000, 420000], [420000, 430000], [430000, 440000], [440000, 450000], [450000, 460000], [460000, 470000], [470000, 480000], [480000, 490000], [490000, 500000], [500000, 510000], [510000, 520000], [520000, 530000], [530000, 540000], [540000, 550000], [550000, 560000], [560000, 570000], [570000, 580000], [580000, 590000], [590000, 600000], [600000, 610000], [610000, 620000], [620000, 630000], [630000, 640000], [640000, 650000], [650000, 660000], [660000, 670000], [670000, 680000], [680000, 690000], [690000, 700000], [700000, 710000], [710000, 720000], [720000, 730000], [730000, 740000], [740000, 750000], [750000, 760000], [760000, 770000], [770000, 780000], [780000, 790000], [790000, 800000], [800000, 810000], [810000, 820000], [820000, 830000], [830000, 840000], [840000, 850000], [850000, 860000], [860000, 870000], [870000, 880000], [880000, 890000], [890000, 900000], [900000, 910000], [910000, 911868]], 'uuid': '9d799986-1ad9-11ec-9257-fc1b1e0abeef'}\n" + ] + } + ], + "source": [ + "dataset = '/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/NANOAODSIM'\n", + "for i, (file, meta) in enumerate(fileset_available[dataset][\"files\"].items()):\n", + " print(file, meta) \n", + " if i>3: break" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f23bae95-8a2e-46a9-a884-714474a8ff12", + "metadata": {}, + "outputs": [], "source": [] } ], diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py index 2dd7ca171..cf4f328f9 100644 --- a/src/coffea/dataset_tools/dataset_query.py +++ b/src/coffea/dataset_tools/dataset_query.py @@ -548,6 +548,13 @@ def load_dataset_definition( query_results_strategy="all", replicas_strategy="round-robin", ): + """ + Initialize the DataDiscoverCLI by querying a set of datasets defined in `dataset_definitions` + and selected results and replicas following the options. 
+ + - query_results_strategy: "all" or "manual" to be prompted for selection + - replicas_strategy: "round-robin", "choose" (to manually choose the sites), "manual": to be prompted for a manual decision case by case + """ for dataset_query, dataset_meta in dataset_definition.items(): print(f"\nProcessing query: {dataset_query}") # Adding queries @@ -617,7 +624,9 @@ def load_dataset_definition( parser.add_argument( "--step-size", help="Step size for preprocessing", type=int, default=500000 ) - parser.add_argument("--dask-cluster", help="Dask cluster url", type=str, default="") + parser.add_argument( + "--dask-cluster", help="Dask cluster url", type=str, default=None + ) parser.add_argument( "-as", "--allow-sites", From 02001b3b8641fc88cccc6077aad1018eb9f3499d Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 12 Dec 2023 05:48:22 -0600 Subject: [PATCH 76/80] actually pass uproot_options down --- src/coffea/dataset_tools/apply_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coffea/dataset_tools/apply_processor.py b/src/coffea/dataset_tools/apply_processor.py index 69994b127..79386852e 100644 --- a/src/coffea/dataset_tools/apply_processor.py +++ b/src/coffea/dataset_tools/apply_processor.py @@ -73,7 +73,7 @@ def apply_to_fileset( metadata = copy.deepcopy(dataset.get("metadata", {})) metadata.setdefault("dataset", name) dataset_out = apply_to_dataset( - data_manipulation, dataset, schemaclass, metadata + data_manipulation, dataset, schemaclass, metadata, uproot_options ) if isinstance(out, tuple): out[name], report[name] = dataset_out From ba88fe4074f974de9c2c024d36d2539912c713e8 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 12 Dec 2023 08:27:57 -0400 Subject: [PATCH 77/80] fix get_failed_steps_for_fileset/dataset --- src/coffea/dataset_tools/__init__.py | 7 ++++++- src/coffea/dataset_tools/apply_processor.py | 2 +- src/coffea/dataset_tools/manipulations.py | 22 ++++++++++++--------- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/src/coffea/dataset_tools/__init__.py b/src/coffea/dataset_tools/__init__.py index 1895bf722..8dd444189 100644 --- a/src/coffea/dataset_tools/__init__.py +++ b/src/coffea/dataset_tools/__init__.py @@ -1,5 +1,10 @@ from coffea.dataset_tools.apply_processor import apply_to_dataset, apply_to_fileset -from coffea.dataset_tools.manipulations import max_chunks, slice_chunks +from coffea.dataset_tools.manipulations import ( + get_failed_steps_for_dataset, + get_failed_steps_for_fileset, + max_chunks, + slice_chunks, +) from coffea.dataset_tools.preprocess import preprocess __all__ = [ diff --git a/src/coffea/dataset_tools/apply_processor.py b/src/coffea/dataset_tools/apply_processor.py index 79386852e..698cd916a 100644 --- a/src/coffea/dataset_tools/apply_processor.py +++ b/src/coffea/dataset_tools/apply_processor.py @@ -75,7 +75,7 @@ def apply_to_fileset( dataset_out = apply_to_dataset( data_manipulation, dataset, schemaclass, metadata, uproot_options ) - if isinstance(out, tuple): + if isinstance(dataset_out, tuple): out[name], report[name] = dataset_out else: out[name] = dataset_out diff --git a/src/coffea/dataset_tools/manipulations.py b/src/coffea/dataset_tools/manipulations.py index a749561e0..3386adbe5 100644 --- a/src/coffea/dataset_tools/manipulations.py +++ b/src/coffea/dataset_tools/manipulations.py @@ -21,7 +21,8 @@ def slice_chunks(fileset, theslice=slice(None)): def get_failed_steps_for_dataset(dataset, report): - failed_dataset = {} + failed_dataset = copy.deepcopy(dataset) + failed_dataset["files"] =
{} failures = report[~awkward.is_none(report.exception)] if not awkward.all(report.args[:, 4] == "True"): @@ -30,13 +31,16 @@ def get_failed_steps_for_dataset(dataset, report): ) for fdesc in dataset.values(): - if "steps" not in fdesc: + config = list(fdesc.values())[0] + if "steps" not in config: raise RuntimeError( "steps specification not found in dataset, please specify steps in input dataset." ) - fnames = set(dataset.keys()) - rnames = set(numpy.unique(report.args[:, 0][:, 1:-1:])) + fnames = set(dataset["files"].keys()) + rnames = ( + set(numpy.unique(failures.args[:, 0][:, 1:-1:])) if len(failures) > 0 else set() + ) if not rnames.issubset(fnames): raise RuntimeError( f"Files: {rnames - fnames} are not in input dataset, please ensure report corresponds to input dataset!" @@ -47,11 +51,11 @@ def get_failed_steps_for_dataset(dataset, report): fname, object_path, start, stop, is_step = args_as_types - if fname in failed_dataset: - failed_dataset[fname]["steps"].append([start, stop]) + if fname in failed_dataset["files"]: + failed_dataset["files"][fname]["steps"].append([start, stop]) else: - failed_dataset[fname] = copy.deepcopy(dataset[fname]) - failed_dataset[fname]["steps"] = [[start, stop]] + failed_dataset["files"][fname] = copy.deepcopy(dataset["files"][fname]) + failed_dataset["files"][fname]["steps"] = [[start, stop]] return failed_dataset @@ -60,6 +64,6 @@ def get_failed_steps_for_fileset(fileset, report_dict): failed_fileset = {} for name, dataset in fileset.items(): failed_dataset = get_failed_steps_for_dataset(dataset, report_dict[name]) - if len(failed_dataset) > 0: + if len(failed_dataset["files"]) > 0: failed_fileset[name] = failed_dataset return failed_fileset From 6e22866db03753faa98e7dabc23ed71d6f7cef61 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 12 Dec 2023 08:40:10 -0400 Subject: [PATCH 78/80] properly check each input file for steps --- src/coffea/dataset_tools/manipulations.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/coffea/dataset_tools/manipulations.py b/src/coffea/dataset_tools/manipulations.py index 3386adbe5..e18e4ec4a 100644 --- a/src/coffea/dataset_tools/manipulations.py +++ b/src/coffea/dataset_tools/manipulations.py @@ -30,11 +30,11 @@ def get_failed_steps_for_dataset(dataset, report): "step specification is not completely in starts/stops form, failed-step extraction is not available for steps_per_file." ) - for fdesc in dataset.values(): - config = list(fdesc.values())[0] - if "steps" not in config: + for fname, fdesc in dataset["files"].items(): + if "steps" not in fdesc: raise RuntimeError( - "steps specification not found in dataset, please specify steps in input dataset." + f"steps specification not found in file description for {fname}, " + "please specify steps consistently in input dataset." 
) fnames = set(dataset["files"].keys()) From 6d0722cbdb7e0ccea3ac8a4f28fb83df2a237996 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 12 Dec 2023 11:37:22 -0400 Subject: [PATCH 79/80] adjust uproot pins, add test --- pyproject.toml | 2 +- tests/test_dataset_tools.py | 90 ++++++++++++++++++++++++++++++++++++- 2 files changed, 90 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d3eaa6d95..f35d69e26 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,7 @@ classifiers = [ ] dependencies = [ "awkward>=2.5.1rc1", - "uproot>=5.2.0rc3", + "uproot>=5.2.0rc4", "dask[array]>=2023.4.0", "dask-awkward>=2023.12.1", "dask-histogram>=2023.10.0", diff --git a/tests/test_dataset_tools.py b/tests/test_dataset_tools.py index ca46c0060..3500fdc3f 100644 --- a/tests/test_dataset_tools.py +++ b/tests/test_dataset_tools.py @@ -2,7 +2,13 @@ import pytest from distributed import Client -from coffea.dataset_tools import apply_to_fileset, max_chunks, preprocess, slice_chunks +from coffea.dataset_tools import ( + apply_to_fileset, + get_failed_steps_for_fileset, + max_chunks, + preprocess, + slice_chunks, +) from coffea.nanoevents import BaseSchema, NanoAODSchema from coffea.processor.test_items import NanoEventsProcessor, NanoTestProcessor @@ -32,6 +38,56 @@ }, } +_starting_fileset_with_steps = { + "ZJets": { + "files": { + "tests/samples/nano_dy.root": { + "object_path": "Events", + "steps": [ + [0, 5], + [5, 10], + [10, 15], + [15, 20], + [20, 25], + [25, 30], + [30, 35], + [35, 40], + ], + } + } + }, + "Data": { + "files": { + "tests/samples/nano_dimuon.root": { + "object_path": "Events", + "steps": [ + [0, 5], + [5, 10], + [10, 15], + [15, 20], + [20, 25], + [25, 30], + [30, 35], + [35, 40], + ], + }, + "tests/samples/nano_dimuon_not_there.root": { + "object_path": "Events", + "steps": [ + [0, 5], + [5, 10], + [10, 15], + [15, 20], + [20, 25], + [25, 30], + [30, 35], + [35, 40], + ], + }, + } + }, +} + _runnable_result = { "ZJets": { "files": { @@ -218,3 +274,35 @@ def test_slicechunks(): } }, } + + +def test_recover_failed_chunks(): + with Client() as _: + to_compute = apply_to_fileset( + NanoEventsProcessor(), + _starting_fileset_with_steps, + schemaclass=NanoAODSchema, + uproot_options={"allow_read_errors_with_report": True}, + ) + out, reports = dask.compute(*to_compute) + + failed_fset = get_failed_steps_for_fileset(_starting_fileset_with_steps, reports) + assert failed_fset == { + "Data": { + "files": { + "tests/samples/nano_dimuon_not_there.root": { + "object_path": "Events", + "steps": [ + [0, 5], + [5, 10], + [10, 15], + [15, 20], + [20, 25], + [25, 30], + [30, 35], + [35, 40], + ], + } + } + } + } From 994ed4a463d191c7b8e60fa4e801465a05e79509 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Tue, 12 Dec 2023 16:08:50 -0400 Subject: [PATCH 80/80] typing and docs --- src/coffea/dataset_tools/apply_processor.py | 46 ++++++++++++- src/coffea/dataset_tools/manipulations.py | 73 +++++++++++++++++++-- src/coffea/dataset_tools/preprocess.py | 56 +++++++++++++++- 3 files changed, 167 insertions(+), 8 deletions(-) diff --git a/src/coffea/dataset_tools/apply_processor.py b/src/coffea/dataset_tools/apply_processor.py index 698cd916a..324dfd908 100644 --- a/src/coffea/dataset_tools/apply_processor.py +++ b/src/coffea/dataset_tools/apply_processor.py @@ -35,7 +35,29 @@ def apply_to_dataset( schemaclass: BaseSchema = NanoAODSchema, metadata: dict[Hashable, Any] = {}, uproot_options: dict[str, Any] = {}, -) -> DaskOutputType: +) -> DaskOutputType | tuple[DaskOutputType, 
dask_awkward.Array]: + """ + Apply the supplied function or processor to the supplied dataset. + Parameters + ---------- + data_manipulation : ProcessorABC or GenericHEPAnalysis + The user analysis code to run on the input dataset + dataset: DatasetSpec | DatasetSpecOptional + The data to be acted upon by the data manipulation passed in. + schemaclass: BaseSchema, default NanoAODSchema + The nanoevents schema to interpret the input dataset with. + metadata: dict[Hashable, Any], default {} + Metadata for the dataset that is accessible by the input analysis. Should also be dask-serializable. + uproot_options: dict[str, Any], default {} + Options to pass to uproot. Pass at least {"allow_read_errors_with_report": True} to turn on file access reports. + + Returns + ------- + out : DaskOutputType + The output of the analysis workflow applied to the dataset + report : dask_awkward.Array, optional + The file access report for running the analysis on the input dataset. Needs to be computed simultaneously with the analysis to be accurate. + """ files = dataset["files"] events = NanoEventsFactory.from_root( files, @@ -66,7 +88,27 @@ def apply_to_fileset( fileset: FilesetSpec | FilesetSpecOptional, schemaclass: BaseSchema = NanoAODSchema, uproot_options: dict[str, Any] = {}, -) -> dict[str, DaskOutputType]: +) -> dict[str, DaskOutputType] | tuple[dict[str, DaskOutputType], dask_awkward.Array]: + """ + Apply the supplied function or processor to the supplied fileset (set of datasets). + Parameters + ---------- + data_manipulation : ProcessorABC or GenericHEPAnalysis + The user analysis code to run on the input dataset + fileset: FilesetSpec | FilesetSpecOptional + The data to be acted upon by the data manipulation passed in. Metadata within the fileset should be dask-serializable. + schemaclass: BaseSchema, default NanoAODSchema + The nanoevents schema to interpret the input dataset with. + uproot_options: dict[str, Any], default {} + Options to pass to uproot. Pass at least {"allow_read_errors_with_report": True} to turn on file access reports. + + Returns + ------- + out : dict[str, DaskOutputType] + The output of the analysis workflow applied to the datasets, keyed by dataset name. + report : dask_awkward.Array, optional + The file access report for running the analysis on the input fileset. Needs to be computed simultaneously with the analysis to be accurate. + """ out = {} report = {} for name, dataset in fileset.items(): diff --git a/src/coffea/dataset_tools/manipulations.py b/src/coffea/dataset_tools/manipulations.py index e18e4ec4a..081e1d97d 100644 --- a/src/coffea/dataset_tools/manipulations.py +++ b/src/coffea/dataset_tools/manipulations.py @@ -1,14 +1,47 @@ +from __future__ import annotations + import copy +from typing import Any import awkward import numpy +from coffea.dataset_tools.preprocess import DatasetSpec, FilesetSpec + + +def max_chunks(fileset: FilesetSpec, maxchunks: int | None = None) -> FilesetSpec: + """ + Modify the input fileset so that only the first "maxchunks" chunks of each file will be processed. + Parameters + ---------- + fileset: FilesetSpec + The set of datasets to reduce to max-chunks row-ranges. + maxchunks: int | None, default None + How many chunks to keep for each file. -def max_chunks(fileset, maxchunks=None): + Returns + ------- + out : FilesetSpec + The reduced fileset with only the first maxchunks event ranges left in.
+ """ return slice_chunks(fileset, slice(maxchunks)) -def slice_chunks(fileset, theslice=slice(None)): +def slice_chunks(fileset: FilesetSpec, theslice: Any = slice(None)) -> FilesetSpec: + """ + Modify the input fileset so that only the chunks of each file specified by the input slice are processed. + Parameters + ---------- + fileset: FilesetSpec + The set of datasets to be sliced. + theslice: Any, default slice(None) + How to slice the array of row-ranges (steps) in the input fileset. + + Returns + ------- + out : FilesetSpec + The reduced fileset with only the row-ranges specified by theslice left. + """ if not isinstance(theslice, slice): theslice = slice(theslice) @@ -20,7 +53,23 @@ def slice_chunks(fileset, theslice=slice(None)): return out -def get_failed_steps_for_dataset(dataset, report): +def get_failed_steps_for_dataset( + dataset: DatasetSpec, report: awkward.Array +) -> DatasetSpec: + """ + Modify an input dataset to only contain the files and row-ranges for *failed* processing jobs as specified in the supplied report. + Parameters + ---------- + dataset: DatasetSpec + The dataset to be reduced to only contain files and row-ranges that have previously encountered failed file access. + report: awkward.Array + The computed file-access error report from dask-awkward. + + Returns + ------- + out : DatasetSpec + The reduced dataset with only the row-ranges and files that failed processing, according to the input report. + """ failed_dataset = copy.deepcopy(dataset) failed_dataset["files"] = {} failures = report[~awkward.is_none(report.exception)] if not awkward.all(report.args[:, 4] == "True"): @@ -60,7 +109,23 @@ def get_failed_steps_for_dataset(dataset, report): -def get_failed_steps_for_fileset(fileset, report_dict): +def get_failed_steps_for_fileset( + fileset: FilesetSpec, report_dict: dict[str, awkward.Array] +): + """ + Modify an input fileset to only contain the files and row-ranges for *failed* processing jobs as specified in the supplied reports. + Parameters + ---------- + fileset: FilesetSpec + The set of datasets to be reduced to only contain files and row-ranges that have previously encountered failed file access. + report_dict: dict[str, awkward.Array] + The computed file-access error reports from dask-awkward, indexed by dataset name. + + Returns + ------- + out : FilesetSpec + The reduced fileset with only the row-ranges and files that failed processing, according to the input reports. + """ failed_fileset = {} for name, dataset in fileset.items(): failed_dataset = get_failed_steps_for_dataset(dataset, report_dict[name]) diff --git a/src/coffea/dataset_tools/preprocess.py b/src/coffea/dataset_tools/preprocess.py index da4ca7d6a..94ddf2507 100644 --- a/src/coffea/dataset_tools/preprocess.py +++ b/src/coffea/dataset_tools/preprocess.py @@ -14,12 +14,38 @@ def _get_steps( normed_files: awkward.Array | dask_awkward.Array, - maybe_step_size: None | int = None, + maybe_step_size: int | None = None, align_clusters: bool = False, recalculate_seen_steps: bool = False, skip_bad_files: bool = False, - file_exceptions: Exception | Warning = (FileNotFoundError, OSError), + file_exceptions: Exception + | Warning + | tuple[Exception | Warning] = (FileNotFoundError, OSError), ) -> awkward.Array | dask_awkward.Array: + """ + Given a list of normalized file and object paths (defined in uproot), determine the steps for each file according to the supplied processing options. + Parameters + ---------- + normed_files: awkward.Array | dask_awkward.Array + The list of normalized file descriptions to process for steps.
+ + maybe_step_size: int | None, default None + If specified, the size of the steps to make when analyzing the input files. + align_clusters: bool, default False + Round to the cluster size in a root file, when chunks are specified. Reduces data transfer in + analysis. + recalculate_seen_steps: bool, default False + If steps are present in the input normed files, force the recalculation of those steps, instead + of only recalculating the steps if the uuid has changed. + skip_bad_files: bool, default False + Instead of failing, catch exceptions specified by file_exceptions and return null data. + file_exceptions: Exception | Warning | tuple[Exception | Warning], default (FileNotFoundError, OSError) + What exceptions to catch when skipping bad files. + + Returns + ------- + array : awkward.Array | dask_awkward.Array + The normalized file descriptions, appended with the calculated steps for those files. + """ nf_backend = awkward.backend(normed_files) lz_or_nf = awkward.typetracer.length_zero_if_typetracer(normed_files) @@ -144,6 +170,32 @@ def preprocess( skip_bad_files: bool = False, file_exceptions: Exception | Warning = (FileNotFoundError, OSError), ) -> tuple[FilesetSpec, FilesetSpecOptional]: + """ + Given a fileset of datasets, determine the steps (row-ranges) for each file in each dataset according to the supplied processing options. + Parameters + ---------- + fileset: FilesetSpecOptional + The set of datasets whose files will be preprocessed. + maybe_step_size: int | None, default None + If specified, the size of the steps to make when analyzing the input files. + align_clusters: bool, default False + Round to the cluster size in a root file, when chunks are specified. Reduces data transfer in + analysis. + recalculate_seen_steps: bool, default False + If steps are present in the input normed files, force the recalculation of those steps, + instead of only recalculating the steps if the uuid has changed. + skip_bad_files: bool, default False + Instead of failing, catch exceptions specified by file_exceptions and return null data. + file_exceptions: Exception | Warning | tuple[Exception | Warning], default (FileNotFoundError, OSError) + What exceptions to catch when skipping bad files. + + Returns + ------- + out_available : FilesetSpec + The subset of files in each dataset that were successfully preprocessed, organized by dataset. + out_updated : FilesetSpecOptional + The original set of datasets including files that were not accessible, updated to include the result of preprocessing where available. + """ out_updated = copy.deepcopy(fileset) out_available = copy.deepcopy(fileset) all_ak_norm_files = {}
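
Taken together, PATCH 76-80 wire up a preprocess / apply / recover loop for the dataset_tools API. A minimal usage sketch follows, modeled on the test added in PATCH 79/80 (test_recover_failed_chunks); the example fileset contents, the use of the test-suite NanoEventsProcessor, and the preprocess keyword arguments are illustrative assumptions rather than part of the patches themselves.

import dask

from coffea.dataset_tools import (
    apply_to_fileset,
    get_failed_steps_for_fileset,
    max_chunks,
    preprocess,
)
from coffea.nanoevents import NanoAODSchema
from coffea.processor.test_items import NanoEventsProcessor

# A fileset in the uproot-normalized form used throughout these patches
# (placeholder path; dataset name -> {"files": {path: {"object_path": ...}}}).
fileset = {
    "ZJets": {
        "files": {"tests/samples/nano_dy.root": {"object_path": "Events"}},
    },
}

# 1. Determine the per-file row-ranges ("steps"), skipping files that cannot be opened.
#    Returns the runnable subset and the full updated fileset specification.
available, updated = preprocess(fileset, skip_bad_files=True)

# Optionally keep only the first two steps of each file, e.g. for a quick test run.
available = max_chunks(available, 2)

# 2. Build the task graph, with per-step access reports enabled so failures can be recovered.
to_compute = apply_to_fileset(
    NanoEventsProcessor(),
    available,
    schemaclass=NanoAODSchema,
    uproot_options={"allow_read_errors_with_report": True},
)
out, reports = dask.compute(*to_compute)

# 3. Collect the steps that failed, in the same fileset format, ready to be resubmitted.
failed_fileset = get_failed_steps_for_fileset(available, reports)

Because the failed steps come back as an ordinary fileset, they can be passed through apply_to_fileset again to retry only the row-ranges that did not complete.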