diff --git a/binder/dataset_discovery.ipynb b/binder/dataset_discovery.ipynb new file mode 100644 index 000000000..2b19cf1c0 --- /dev/null +++ b/binder/dataset_discovery.ipynb @@ -0,0 +1,2003 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c5754206-f41b-4e08-bc4d-496df85e8194", + "metadata": {}, + "source": [ + "# Dataset discovery tools\n", + "\n", + "This notebook shows some features that make dataset discovery for CMS analyses easier.\n", + "The Rucio system is queried to look for datasets and to access the list of all available file replicas.\n", + "\n", + "Users can exploit these tools at two different levels:\n", + "- low level: use the `rucio_utils` module directly to query Rucio\n", + "- high level: use the `DataDiscoveryCLI` class to simplify dataset queries, replica filtering, and uproot preprocessing with dask" + ] + }, + { + "cell_type": "markdown", + "id": "42242097-c04e-459e-9f3a-1d746df4e9dd", + "metadata": {}, + "source": [ + "# Using Rucio utils directly" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "09103c77-b8e6-4d61-920b-b1ff8fba8791", + "metadata": {}, + "outputs": [], + "source": [ + "from coffea.dataset_tools import rucio_utils\n", + "from coffea.dataset_tools.dataset_query import print_dataset_query\n", + "from rich.console import Console\n", + "from rich.table import Table" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d62b43cb-53c0-4e2d-b571-1a0683e34dc5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client = rucio_utils.get_rucio_client()\n", + "client" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "0359afc0-fc98-4aa8-acf4-288ef19ac7db", + "metadata": {}, + "outputs": [], + "source": [ + "query = \"/TTToSemiLeptonic_*_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9*/NANOAODSIM\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "659bee88-9fb0-4d1a-9544-a97372595f18", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['/TTToSemiLeptonic_TuneCP5CR1_erdON_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM',\n", + " '/TTToSemiLeptonic_TuneCP5CR2_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/NANOAODSIM',\n", + " '/TTToSemiLeptonic_TuneCP5_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM',\n", + " '/TTToSemiLeptonic_TuneCP5_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-20UL18JMENano_106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "outlist, outtree = rucio_utils.query_dataset(\n", + " query,\n", + " client=client,\n", + " tree=True,\n", + " scope=\"cms\", \n", + " )\n", + "\n", + "outlist[1:5]" + ] + }, + { + "cell_type": "markdown", + "id": "9bc2a454-4915-4366-9c02-2e389e9eb6fb", + "metadata": {}, + "source": [ + "Let's now pretty-print the results in a table using a utility function from the `dataset_query` module." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "4487d997-dc22-4a47-87df-4da14fa5b35a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
              Query: /TTToSemiLeptonic_*_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9*/NANOAODSIM               \n",
+       "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳┓\n",
+       "┃ Name                               Tag                                                                        ┃\n",
+       "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇┩\n",
+       "│ TTToSemiLeptonic_TuneCP5CR1_13Te…  (1) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/NAN… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_TuneCP5CR1_erdO…  (2) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NAN… ││\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_TuneCP5CR2_13Te…  (3) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/NAN… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_TuneCP5_13TeV-p…  (4) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NAN… ││\n",
+       "│                                    (5) RunIISummer20UL18NanoAODv9-20UL18JMENano_106X_upgrade2018_realistic_v… │\n",
+       "│                                    (6) RunIISummer20UL18NanoAODv9-PUForMUOVal_106X_upgrade2018_realistic_v16… ││\n",
+       "│                                    (7) RunIISummer20UL18NanoAODv9-PUForTRK_TRK_106X_upgrade2018_realistic_v1… │\n",
+       "│                                    (8) RunIISummer20UL18NanoAODv9-PUForTRKv2_TRKv2_106X_upgrade2018_realisti… ││\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_TuneCP5_erdON_1…  (9) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NAN… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_TuneCP5down_13T…  (10) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… ││\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_TuneCP5up_13TeV…  (11) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_Vcb_TuneCP5_13T…  (12) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/NA… ││\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_hdampDOWN_TuneC…  (13) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_hdampUP_TuneCP5…  (14) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… ││\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_mtop166p5_TuneC…  (15) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_mtop169p5_TuneC…  (16) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… ││\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_mtop171p5_TuneC…  (17) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_mtop173p5_TuneC…  (18) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… ││\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_mtop175p5_TuneC…  (19) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_mtop178p5_TuneC…  (20) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… ││\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_widthx0p55_Tune…  (21) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_widthx0p7_TuneC…  (22) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… ││\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_widthx0p85_Tune…  (23) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_widthx1p15_Tune…  (24) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… ││\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_widthx1p3_TuneC…  (25) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… │\n",
+       "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n",
+       "│ TTToSemiLeptonic_widthx1p45_Tune…  (26) RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA… ││\n",
+       "└───────────────────────────────────┴────────────────────────────────────────────────────────────────────────────┴┘\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Query: \u001b[0m\u001b[1;3;31m/TTToSemiLeptonic_*_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9*/NANOAODSIM\u001b[0m\u001b[3m \u001b[0m\n", + "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mName \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mTag \u001b[0m\u001b[1m \u001b[0m┃┃\n", + "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇┩\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_TuneCP5CR1_13Te…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(1)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/NAN…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_TuneCP5CR1_erdO…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(2)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NAN…\u001b[0m\u001b[35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_TuneCP5CR2_13Te…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(3)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/NAN…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_TuneCP5_13TeV-p…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(4)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NAN…\u001b[0m\u001b[35m \u001b[0m││\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(5)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-20UL18JMENano_106X_upgrade2018_realistic_v…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "│\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(6)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-PUForMUOVal_106X_upgrade2018_realistic_v16…\u001b[0m\u001b[35m \u001b[0m││\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(7)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-PUForTRK_TRK_106X_upgrade2018_realistic_v1…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "│\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(8)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-PUForTRKv2_TRKv2_106X_upgrade2018_realisti…\u001b[0m\u001b[35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_TuneCP5_erdON_1…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(9)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NAN…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_TuneCP5down_13T…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m 
\u001b[0m\u001b[1;35m(10)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_TuneCP5up_13TeV…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(11)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_Vcb_TuneCP5_13T…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(12)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/NA…\u001b[0m\u001b[35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_hdampDOWN_TuneC…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(13)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_hdampUP_TuneCP5…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(14)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_mtop166p5_TuneC…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(15)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_mtop169p5_TuneC…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(16)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_mtop171p5_TuneC…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(17)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_mtop173p5_TuneC…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(18)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_mtop175p5_TuneC…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(19)\u001b[0m\u001b[2;35m 
RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_mtop178p5_TuneC…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(20)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_widthx0p55_Tune…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(21)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_widthx0p7_TuneC…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(22)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_widthx0p85_Tune…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(23)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_widthx1p15_Tune…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(24)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[2;36m \u001b[0m\u001b[2;36mTTToSemiLeptonic_widthx1p3_TuneC…\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[1;2;35m(25)\u001b[0m\u001b[2;35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[2;35m \u001b[0m││\n", + "├───────────────────────────────────┼────────────────────────────────────────────────────────────────────────────┼┤\n", + "│\u001b[36m \u001b[0m\u001b[36mTTToSemiLeptonic_widthx1p45_Tune…\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[1;35m(26)\u001b[0m\u001b[35m RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NA…\u001b[0m\u001b[35m \u001b[0m││\n", + "└───────────────────────────────────┴────────────────────────────────────────────────────────────────────────────┴┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "console = Console()\n", + "print_dataset_query(query, outtree, console)" + ] + }, + { + "cell_type": "markdown", + "id": "c213d5fc-6424-4cdf-8751-88ced7987a59", + "metadata": {}, + "source": [ + "### Dataset replicas" + ] + }, + { + "cell_type": "markdown", + "id": "961b4ad8-e3d6-49b1-a2ce-7cad49b46f06", + "metadata": {}, + "source": [ + "Let's select one dataset and look for available replicas" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d08fd6ed-4b3a-4e9f-994a-d1bd529421a7", + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/plain": [ + "'/TTToSemiLeptonic_TuneCP5CR1_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/NANOAODSIM'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = outlist[0]\n", + "dataset" + ] + }, + { + "cell_type": "markdown", + "id": "a605fb64-6e0b-4fbe-8807-84b9d75f2d53", + "metadata": {}, + "source": [ + "Using the option `mode='full'` in the function `rucio_utils.get_dataset_file_replicas()` one gets all the available replicas. " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "2d64069e-ea8f-48c2-bd33-43fc555f6ec8", + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " (\n", + " outfiles,\n", + " outsites,\n", + " sites_counts,\n", + " ) = rucio_utils.get_dataset_files_replicas(\n", + " dataset,\n", + " allowlist_sites=[],\n", + " blocklist_sites=[],\n", + " regex_sites=[],\n", + " mode=\"full\", # full or first. \"full\"==all the available replicas\n", + " client=client,\n", + " )\n", + "except Exception as e:\n", + " print(f\"\\n[red bold] Exception: {e}[/]\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "3e4fc6c2-f378-40d2-a4ea-f265b6c18887", + "metadata": {}, + "outputs": [], + "source": [ + "def print_replicas(sites_counts):\n", + " console.print(f\"[cyan]Sites availability for dataset: [red]{dataset}\")\n", + " table = Table(title=\"Available replicas\")\n", + " table.add_column(\"Index\", justify=\"center\")\n", + " table.add_column(\"Site\", justify=\"left\", style=\"cyan\", no_wrap=True)\n", + " table.add_column(\"Files\", style=\"magenta\", no_wrap=True)\n", + " table.add_column(\"Availability\", justify=\"center\")\n", + " table.row_styles = [\"dim\", \"none\"]\n", + " Nfiles = len(outfiles)\n", + " \n", + " sorted_sites = dict(\n", + " sorted(sites_counts.items(), key=lambda x: x[1], reverse=True)\n", + " )\n", + " for i, (site, stat) in enumerate(sorted_sites.items()):\n", + " table.add_row(\n", + " str(i), site, f\"{stat} / {Nfiles}\", f\"{stat*100/Nfiles:.1f}%\"\n", + " )\n", + " console.print(table)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "79c68044-dc3b-4dd5-a0d3-c3f6ddd0bea1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Sites availability for dataset: \n",
+       "/TTToSemiLeptonic_TuneCP5CR1_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2\n",
+       "/NANOAODSIM\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[36mSites availability for dataset: \u001b[0m\n", + "\u001b[31m/TTToSemiLeptonic_TuneCP5CR1_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2\u001b[0m\n", + "\u001b[31m/\u001b[0m\u001b[31mNANOAODSIM\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                    Available replicas                    \n",
+       "┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n",
+       "┃ Index  Site                 Files      Availability ┃\n",
+       "┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n",
+       "│   0    T2_DE_DESY           294 / 294     100.0%    │\n",
+       "│   1   │ T1_DE_KIT_Disk       294 / 294 │    100.0%    │\n",
+       "│   2    T1_UK_RAL_Disk       294 / 294     100.0%    │\n",
+       "│   3   │ T1_RU_JINR_Disk      294 / 294 │    100.0%    │\n",
+       "│   4    T3_CH_PSI            294 / 294     100.0%    │\n",
+       "│   5   │ T3_KR_UOS            294 / 294 │    100.0%    │\n",
+       "│   6    T1_US_FNAL_Disk      193 / 294     65.6%     │\n",
+       "│   7   │ T2_US_Nebraska       99 / 294  │    33.7%     │\n",
+       "│   8    T1_IT_CNAF_Disk      58 / 294      19.7%     │\n",
+       "│   9   │ T2_US_Purdue         53 / 294  │    18.0%     │\n",
+       "│  10    T2_BE_IIHE           50 / 294      17.0%     │\n",
+       "│  11   │ T2_US_MIT            50 / 294  │    17.0%     │\n",
+       "│  12    T1_ES_PIC_Disk       43 / 294      14.6%     │\n",
+       "│  13   │ T2_US_Vanderbilt     40 / 294  │    13.6%     │\n",
+       "│  14    T2_BR_SPRACE         39 / 294      13.3%     │\n",
+       "│  15   │ T2_US_Florida        33 / 294  │    11.2%     │\n",
+       "│  16    T2_IT_Legnaro        28 / 294       9.5%     │\n",
+       "│  17   │ T2_US_UCSD           28 / 294  │     9.5%     │\n",
+       "│  18    T2_UA_KIPT           26 / 294       8.8%     │\n",
+       "│  19   │ T2_US_Caltech        24 / 294  │     8.2%     │\n",
+       "│  20    T2_US_Wisconsin      22 / 294       7.5%     │\n",
+       "│  21   │ T2_TR_METU           18 / 294  │     6.1%     │\n",
+       "│  22    T2_ES_CIEMAT         17 / 294       5.8%     │\n",
+       "│  23   │ T2_DE_RWTH           11 / 294  │     3.7%     │\n",
+       "│  24    T2_BR_UERJ           7 / 294        2.4%     │\n",
+       "│  25   │ T2_UK_SGrid_Bristol  3 / 294   │     1.0%     │\n",
+       "│  26    T2_ES_IFCA           2 / 294        0.7%     │\n",
+       "└───────┴─────────────────────┴───────────┴──────────────┘\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Available replicas \u001b[0m\n", + "┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mIndex\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mSite \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mFiles \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mAvailability\u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n", + "│\u001b[2m \u001b[0m\u001b[2m 0 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_DE_DESY \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m294 / 294\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 1 │\u001b[36m \u001b[0m\u001b[36mT1_DE_KIT_Disk \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m294 / 294\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 2 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_UK_RAL_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m294 / 294\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 3 │\u001b[36m \u001b[0m\u001b[36mT1_RU_JINR_Disk \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m294 / 294\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 4 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT3_CH_PSI \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m294 / 294\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 5 │\u001b[36m \u001b[0m\u001b[36mT3_KR_UOS \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m294 / 294\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 6 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_US_FNAL_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m193 / 294\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 65.6% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 7 │\u001b[36m \u001b[0m\u001b[36mT2_US_Nebraska \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m99 / 294 \u001b[0m\u001b[35m \u001b[0m│ 33.7% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 8 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_IT_CNAF_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m58 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 19.7% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 9 │\u001b[36m \u001b[0m\u001b[36mT2_US_Purdue \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m53 / 294 \u001b[0m\u001b[35m \u001b[0m│ 18.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 10 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_BE_IIHE \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m50 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 17.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 11 │\u001b[36m \u001b[0m\u001b[36mT2_US_MIT \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m50 / 294 \u001b[0m\u001b[35m \u001b[0m│ 17.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 12 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_ES_PIC_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m43 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 14.6% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 13 │\u001b[36m \u001b[0m\u001b[36mT2_US_Vanderbilt \u001b[0m\u001b[36m 
\u001b[0m│\u001b[35m \u001b[0m\u001b[35m40 / 294 \u001b[0m\u001b[35m \u001b[0m│ 13.6% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 14 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_BR_SPRACE \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m39 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 13.3% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 15 │\u001b[36m \u001b[0m\u001b[36mT2_US_Florida \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m33 / 294 \u001b[0m\u001b[35m \u001b[0m│ 11.2% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 16 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_IT_Legnaro \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m28 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 9.5% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 17 │\u001b[36m \u001b[0m\u001b[36mT2_US_UCSD \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m28 / 294 \u001b[0m\u001b[35m \u001b[0m│ 9.5% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 18 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_UA_KIPT \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m26 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 8.8% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 19 │\u001b[36m \u001b[0m\u001b[36mT2_US_Caltech \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m24 / 294 \u001b[0m\u001b[35m \u001b[0m│ 8.2% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 20 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_US_Wisconsin \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m22 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 7.5% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 21 │\u001b[36m \u001b[0m\u001b[36mT2_TR_METU \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m18 / 294 \u001b[0m\u001b[35m \u001b[0m│ 6.1% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 22 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_ES_CIEMAT \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m17 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 5.8% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 23 │\u001b[36m \u001b[0m\u001b[36mT2_DE_RWTH \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m11 / 294 \u001b[0m\u001b[35m \u001b[0m│ 3.7% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 24 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_BR_UERJ \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m7 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 2.4% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 25 │\u001b[36m \u001b[0m\u001b[36mT2_UK_SGrid_Bristol\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m3 / 294 \u001b[0m\u001b[35m \u001b[0m│ 1.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 26 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_ES_IFCA \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m2 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 0.7% \u001b[0m\u001b[2m \u001b[0m│\n", + "└───────┴─────────────────────┴───────────┴──────────────┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "print_replicas(sites_counts)" + ] + }, + { + "cell_type": "markdown", + "id": "c9544ceb-5949-4bd3-b997-14da4aa2d956", + "metadata": {}, + "source": [ + "### Filtering sites\n", + "Grid sites can be filtered in three different ways:\n", + "- **allowlist**: if this list is specified, only the sites in the list are considered;
the blocklist and regex filters are ignored\n", + "- **blocklist**: if this list is specified, those sites are excluded from the replicas\n", + "- **regex_sites**: a regex used to filter the sites to be considered, applied on top of the blocklist" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "1f6b586c-a8b7-40d8-a25a-b02e94f4a892", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Sites availability for dataset: \n",
+       "/TTToSemiLeptonic_TuneCP5CR1_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2\n",
+       "/NANOAODSIM\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[36mSites availability for dataset: \u001b[0m\n", + "\u001b[31m/TTToSemiLeptonic_TuneCP5CR1_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2\u001b[0m\n", + "\u001b[31m/\u001b[0m\u001b[31mNANOAODSIM\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                  Available replicas                  \n",
+       "┏━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n",
+       "┃ Index  Site             Files      Availability ┃\n",
+       "┡━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n",
+       "│   0    T2_DE_DESY       294 / 294     100.0%    │\n",
+       "│   1   │ T1_US_FNAL_Disk  193 / 294 │    65.6%     │\n",
+       "└───────┴─────────────────┴───────────┴──────────────┘\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Available replicas \u001b[0m\n", + "┏━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mIndex\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mSite \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mFiles \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mAvailability\u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n", + "│\u001b[2m \u001b[0m\u001b[2m 0 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_DE_DESY \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m294 / 294\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 1 │\u001b[36m \u001b[0m\u001b[36mT1_US_FNAL_Disk\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m193 / 294\u001b[0m\u001b[35m \u001b[0m│ 65.6% │\n", + "└───────┴─────────────────┴───────────┴──────────────┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Example with allowlist\n", + "try:\n", + " (\n", + " outfiles,\n", + " outsites,\n", + " sites_counts,\n", + " ) = rucio_utils.get_dataset_files_replicas(\n", + " dataset,\n", + " allowlist_sites=[\"T2_DE_DESY\", \"T1_US_FNAL_Disk\"],\n", + " blocklist_sites=[],\n", + " regex_sites=None,\n", + " mode=\"full\", # full or first. \"full\"==all the available replicas\n", + " client=client,\n", + " )\n", + "except Exception as e:\n", + " print(f\"\\n[red bold] Exception: {e}[/]\")\n", + "\n", + "print_replicas(sites_counts)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "12f7e403-67fe-42c0-a3ee-a668006b1836", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Sites availability for dataset: \n",
+       "/TTToSemiLeptonic_TuneCP5CR1_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2\n",
+       "/NANOAODSIM\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[36mSites availability for dataset: \u001b[0m\n", + "\u001b[31m/TTToSemiLeptonic_TuneCP5CR1_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2\u001b[0m\n", + "\u001b[31m/\u001b[0m\u001b[31mNANOAODSIM\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                    Available replicas                    \n",
+       "┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n",
+       "┃ Index  Site                 Files      Availability ┃\n",
+       "┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n",
+       "│   0    T1_DE_KIT_Disk       294 / 294     100.0%    │\n",
+       "│   1   │ T1_UK_RAL_Disk       294 / 294 │    100.0%    │\n",
+       "│   2    T1_RU_JINR_Disk      294 / 294     100.0%    │\n",
+       "│   3   │ T3_KR_UOS            294 / 294 │    100.0%    │\n",
+       "│   4    T1_US_FNAL_Disk      193 / 294     65.6%     │\n",
+       "│   5   │ T2_US_Nebraska       99 / 294  │    33.7%     │\n",
+       "│   6    T1_IT_CNAF_Disk      58 / 294      19.7%     │\n",
+       "│   7   │ T2_US_Purdue         53 / 294  │    18.0%     │\n",
+       "│   8    T2_BE_IIHE           50 / 294      17.0%     │\n",
+       "│   9   │ T2_US_MIT            50 / 294  │    17.0%     │\n",
+       "│  10    T1_ES_PIC_Disk       43 / 294      14.6%     │\n",
+       "│  11   │ T2_US_Vanderbilt     40 / 294  │    13.6%     │\n",
+       "│  12    T2_BR_SPRACE         39 / 294      13.3%     │\n",
+       "│  13   │ T2_US_Florida        33 / 294  │    11.2%     │\n",
+       "│  14    T2_IT_Legnaro        28 / 294       9.5%     │\n",
+       "│  15   │ T2_US_UCSD           28 / 294  │     9.5%     │\n",
+       "│  16    T2_UA_KIPT           26 / 294       8.8%     │\n",
+       "│  17   │ T2_US_Caltech        24 / 294  │     8.2%     │\n",
+       "│  18    T2_US_Wisconsin      22 / 294       7.5%     │\n",
+       "│  19   │ T2_TR_METU           18 / 294  │     6.1%     │\n",
+       "│  20    T2_ES_CIEMAT         17 / 294       5.8%     │\n",
+       "│  21   │ T2_DE_RWTH           11 / 294  │     3.7%     │\n",
+       "│  22    T2_BR_UERJ           7 / 294        2.4%     │\n",
+       "│  23   │ T2_UK_SGrid_Bristol  3 / 294   │     1.0%     │\n",
+       "│  24    T2_ES_IFCA           2 / 294        0.7%     │\n",
+       "└───────┴─────────────────────┴───────────┴──────────────┘\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Available replicas \u001b[0m\n", + "┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mIndex\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mSite \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mFiles \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mAvailability\u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n", + "│\u001b[2m \u001b[0m\u001b[2m 0 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_DE_KIT_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m294 / 294\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 1 │\u001b[36m \u001b[0m\u001b[36mT1_UK_RAL_Disk \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m294 / 294\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 2 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_RU_JINR_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m294 / 294\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 3 │\u001b[36m \u001b[0m\u001b[36mT3_KR_UOS \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m294 / 294\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 4 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_US_FNAL_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m193 / 294\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 65.6% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 5 │\u001b[36m \u001b[0m\u001b[36mT2_US_Nebraska \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m99 / 294 \u001b[0m\u001b[35m \u001b[0m│ 33.7% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 6 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_IT_CNAF_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m58 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 19.7% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 7 │\u001b[36m \u001b[0m\u001b[36mT2_US_Purdue \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m53 / 294 \u001b[0m\u001b[35m \u001b[0m│ 18.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 8 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_BE_IIHE \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m50 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 17.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 9 │\u001b[36m \u001b[0m\u001b[36mT2_US_MIT \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m50 / 294 \u001b[0m\u001b[35m \u001b[0m│ 17.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 10 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_ES_PIC_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m43 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 14.6% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 11 │\u001b[36m \u001b[0m\u001b[36mT2_US_Vanderbilt \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m40 / 294 \u001b[0m\u001b[35m \u001b[0m│ 13.6% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 12 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_BR_SPRACE \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m39 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 13.3% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 13 │\u001b[36m \u001b[0m\u001b[36mT2_US_Florida \u001b[0m\u001b[36m 
\u001b[0m│\u001b[35m \u001b[0m\u001b[35m33 / 294 \u001b[0m\u001b[35m \u001b[0m│ 11.2% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 14 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_IT_Legnaro \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m28 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 9.5% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 15 │\u001b[36m \u001b[0m\u001b[36mT2_US_UCSD \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m28 / 294 \u001b[0m\u001b[35m \u001b[0m│ 9.5% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 16 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_UA_KIPT \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m26 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 8.8% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 17 │\u001b[36m \u001b[0m\u001b[36mT2_US_Caltech \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m24 / 294 \u001b[0m\u001b[35m \u001b[0m│ 8.2% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 18 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_US_Wisconsin \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m22 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 7.5% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 19 │\u001b[36m \u001b[0m\u001b[36mT2_TR_METU \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m18 / 294 \u001b[0m\u001b[35m \u001b[0m│ 6.1% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 20 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_ES_CIEMAT \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m17 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 5.8% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 21 │\u001b[36m \u001b[0m\u001b[36mT2_DE_RWTH \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m11 / 294 \u001b[0m\u001b[35m \u001b[0m│ 3.7% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 22 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_BR_UERJ \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m7 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 2.4% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 23 │\u001b[36m \u001b[0m\u001b[36mT2_UK_SGrid_Bristol\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m3 / 294 \u001b[0m\u001b[35m \u001b[0m│ 1.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 24 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_ES_IFCA \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m2 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 0.7% \u001b[0m\u001b[2m \u001b[0m│\n", + "└───────┴─────────────────────┴───────────┴──────────────┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Example with blocklist\n", + "try:\n", + " (\n", + " outfiles,\n", + " outsites,\n", + " sites_counts,\n", + " ) = rucio_utils.get_dataset_files_replicas(\n", + " dataset,\n", + " allowlist_sites=[],\n", + " blocklist_sites=[\"T2_DE_DESY\", \"T3_CH_PSI\"],\n", + " regex_sites=None,\n", + " mode=\"full\", # full or first. \"full\"==all the available replicas\n", + " client=client,\n", + " )\n", + "except Exception as e:\n", + " print(f\"\\n[red bold] Exception: {e}[/]\")\n", + "\n", + "print_replicas(sites_counts)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "f5dafcc2-c32e-4e33-9878-183a8e476b73", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Sites availability for dataset: \n",
+       "/TTToSemiLeptonic_TuneCP5CR1_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2\n",
+       "/NANOAODSIM\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[36mSites availability for dataset: \u001b[0m\n", + "\u001b[31m/TTToSemiLeptonic_TuneCP5CR1_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2\u001b[0m\n", + "\u001b[31m/\u001b[0m\u001b[31mNANOAODSIM\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                    Available replicas                    \n",
+       "┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n",
+       "┃ Index  Site                 Files      Availability ┃\n",
+       "┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n",
+       "│   0    T2_DE_DESY           294 / 294     100.0%    │\n",
+       "│   1   │ T1_DE_KIT_Disk       294 / 294 │    100.0%    │\n",
+       "│   2    T1_UK_RAL_Disk       294 / 294     100.0%    │\n",
+       "│   3   │ T3_CH_PSI            294 / 294 │    100.0%    │\n",
+       "│   4    T1_IT_CNAF_Disk      58 / 294      19.7%     │\n",
+       "│   5   │ T2_BE_IIHE           50 / 294  │    17.0%     │\n",
+       "│   6    T1_ES_PIC_Disk       43 / 294      14.6%     │\n",
+       "│   7   │ T2_IT_Legnaro        28 / 294  │     9.5%     │\n",
+       "│   8    T2_ES_CIEMAT         17 / 294       5.8%     │\n",
+       "│   9   │ T2_DE_RWTH           11 / 294  │     3.7%     │\n",
+       "│  10    T2_UK_SGrid_Bristol  3 / 294        1.0%     │\n",
+       "│  11   │ T2_ES_IFCA           2 / 294   │     0.7%     │\n",
+       "└───────┴─────────────────────┴───────────┴──────────────┘\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Available replicas \u001b[0m\n", + "┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mIndex\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mSite \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mFiles \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mAvailability\u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n", + "│\u001b[2m \u001b[0m\u001b[2m 0 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_DE_DESY \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m294 / 294\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 1 │\u001b[36m \u001b[0m\u001b[36mT1_DE_KIT_Disk \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m294 / 294\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 2 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_UK_RAL_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m294 / 294\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 3 │\u001b[36m \u001b[0m\u001b[36mT3_CH_PSI \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m294 / 294\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 4 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_IT_CNAF_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m58 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 19.7% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 5 │\u001b[36m \u001b[0m\u001b[36mT2_BE_IIHE \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m50 / 294 \u001b[0m\u001b[35m \u001b[0m│ 17.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 6 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_ES_PIC_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m43 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 14.6% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 7 │\u001b[36m \u001b[0m\u001b[36mT2_IT_Legnaro \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m28 / 294 \u001b[0m\u001b[35m \u001b[0m│ 9.5% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 8 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_ES_CIEMAT \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m17 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 5.8% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 9 │\u001b[36m \u001b[0m\u001b[36mT2_DE_RWTH \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m11 / 294 \u001b[0m\u001b[35m \u001b[0m│ 3.7% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 10 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_UK_SGrid_Bristol\u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m3 / 294 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 1.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 11 │\u001b[36m \u001b[0m\u001b[36mT2_ES_IFCA \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m2 / 294 \u001b[0m\u001b[35m \u001b[0m│ 0.7% │\n", + "└───────┴─────────────────────┴───────────┴──────────────┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Example with regex\n", + "try:\n", + " (\n", + " outfiles,\n", + " outsites,\n", + " sites_counts,\n", + " ) = rucio_utils.get_dataset_files_replicas(\n", + " dataset,\n", + " allowlist_sites=[],\n", + " blocklist_sites=[],\n", 
+ " regex_sites= r\"T[123]_(FR|IT|BE|CH|DE|ES|UK)_\\w+\",\n", + " mode=\"full\", # full or first. \"full\"==all the available replicas\n", + " client=client,\n", + " )\n", + "except Exception as e:\n", + " print(f\"\\n[red bold] Exception: {e}[/]\")\n", + "\n", + "print_replicas(sites_counts)" + ] + }, + { + "cell_type": "markdown", + "id": "0b805dde-dd38-46a4-92ad-55ab2e4a4876", + "metadata": {}, + "source": [ + "# Using the DataDiscoveryCLI\n", + "Manipulating the dataset query and replicas is simplified by the `DataDiscoveryCLI` class in `dataset_query` module." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "39846193-d6f2-4de5-ba42-a089d1b0786d", + "metadata": {}, + "outputs": [], + "source": [ + "from coffea.dataset_tools import rucio_utils\n", + "from coffea.dataset_tools.dataset_query import print_dataset_query\n", + "from rich.console import Console\n", + "from rich.table import Table\n", + "from coffea.dataset_tools.dataset_query import DataDiscoveryCLI" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "eaba3e39-c95a-4282-83e2-3aadf748adca", + "metadata": {}, + "outputs": [], + "source": [ + "dataset_definition = {\n", + " \"/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18NanoAODv9-106X*/NANOAODSIM\": {\"short_name\": \"ZJets\",\n", + " \"metadata\": {\"xsec\": 100.0,\"isMC\":True}},\n", + " \"/SingleMuon/Run2018C-UL20*_MiniAODv2_NanoAODv9_GT36*/NANOAOD\": {\"short_name\": \"SingleMuon\", \"metadata\": {\"isMC\":False}}\n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "id": "ecb84b02-b85f-4037-a08d-cce001bc35c7", + "metadata": {}, + "source": [ + "The dataset definition is passed to a `DataDiscoveryCLI` to automatically query rucio and get replicas" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "716a6c0c-ea07-498a-a010-f9e7f87ba3a3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
 Querying rucio for replicas: /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[32m⠇\u001b[0m Querying rucio for replicas: \u001b[1;31m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
Sites availability for dataset: /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[36mSites availability for dataset: \u001b[0m\u001b[31m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\u001b[31mNANOAOD\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                   Available replicas                   \n",
+       "┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━┓\n",
+       "┃ Index  Site                 Files    Availability ┃\n",
+       "┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━┩\n",
+       "│   0    T2_DE_DESY           67 / 67     100.0%    │\n",
+       "│   1   │ T3_KR_KISTI          67 / 67 │    100.0%    │\n",
+       "│   2    T2_TW_NCHC           67 / 67     100.0%    │\n",
+       "│   3   │ T2_BE_IIHE           67 / 67 │    100.0%    │\n",
+       "│   4    T2_US_Purdue         67 / 67     100.0%    │\n",
+       "│   5   │ T2_ES_CIEMAT         67 / 67 │    100.0%    │\n",
+       "│   6    T3_FR_IPNL           67 / 67     100.0%    │\n",
+       "│   7   │ T1_US_FNAL_Disk      61 / 67 │    91.0%     │\n",
+       "│   8    T2_UK_London_IC      39 / 67     58.2%     │\n",
+       "│   9   │ T1_FR_CCIN2P3_Disk   38 / 67 │    56.7%     │\n",
+       "│  10    T2_US_Caltech        26 / 67     38.8%     │\n",
+       "│  11   │ T2_CH_CERN           25 / 67 │    37.3%     │\n",
+       "│  12    T2_DE_RWTH           22 / 67     32.8%     │\n",
+       "│  13   │ T1_IT_CNAF_Disk      20 / 67 │    29.9%     │\n",
+       "│  14    T2_US_Wisconsin      16 / 67     23.9%     │\n",
+       "│  15   │ T2_US_Florida        16 / 67 │    23.9%     │\n",
+       "│  16    T2_US_Nebraska       13 / 67     19.4%     │\n",
+       "│  17   │ T2_TR_METU           11 / 67 │    16.4%     │\n",
+       "│  18    T1_DE_KIT_Disk       11 / 67     16.4%     │\n",
+       "│  19   │ T2_UK_SGrid_RALPP    6 / 67  │     9.0%     │\n",
+       "│  20    T2_IT_Legnaro        6 / 67       9.0%     │\n",
+       "│  21   │ T2_ES_IFCA           4 / 67  │     6.0%     │\n",
+       "│  22    T2_FR_IPHC           2 / 67       3.0%     │\n",
+       "│  23   │ T2_UK_London_Brunel  1 / 67  │     1.5%     │\n",
+       "└───────┴─────────────────────┴─────────┴──────────────┘\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Available replicas \u001b[0m\n", + "┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mIndex\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mSite \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mFiles \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mAvailability\u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━┩\n", + "│\u001b[2m \u001b[0m\u001b[2m 0 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_DE_DESY \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m67 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 1 │\u001b[36m \u001b[0m\u001b[36mT3_KR_KISTI \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m67 / 67\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 2 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_TW_NCHC \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m67 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 3 │\u001b[36m \u001b[0m\u001b[36mT2_BE_IIHE \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m67 / 67\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 4 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_US_Purdue \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m67 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 5 │\u001b[36m \u001b[0m\u001b[36mT2_ES_CIEMAT \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m67 / 67\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 6 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT3_FR_IPNL \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m67 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 7 │\u001b[36m \u001b[0m\u001b[36mT1_US_FNAL_Disk \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m61 / 67\u001b[0m\u001b[35m \u001b[0m│ 91.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 8 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_UK_London_IC \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m39 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 58.2% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 9 │\u001b[36m \u001b[0m\u001b[36mT1_FR_CCIN2P3_Disk \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m38 / 67\u001b[0m\u001b[35m \u001b[0m│ 56.7% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 10 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_US_Caltech \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m26 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 38.8% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 11 │\u001b[36m \u001b[0m\u001b[36mT2_CH_CERN \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m25 / 67\u001b[0m\u001b[35m \u001b[0m│ 37.3% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 12 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_DE_RWTH \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m22 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 32.8% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 13 │\u001b[36m \u001b[0m\u001b[36mT1_IT_CNAF_Disk \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m20 
/ 67\u001b[0m\u001b[35m \u001b[0m│ 29.9% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 14 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_US_Wisconsin \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m16 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 23.9% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 15 │\u001b[36m \u001b[0m\u001b[36mT2_US_Florida \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m16 / 67\u001b[0m\u001b[35m \u001b[0m│ 23.9% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 16 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_US_Nebraska \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m13 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 19.4% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 17 │\u001b[36m \u001b[0m\u001b[36mT2_TR_METU \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m11 / 67\u001b[0m\u001b[35m \u001b[0m│ 16.4% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 18 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_DE_KIT_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m11 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 16.4% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 19 │\u001b[36m \u001b[0m\u001b[36mT2_UK_SGrid_RALPP \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m6 / 67 \u001b[0m\u001b[35m \u001b[0m│ 9.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 20 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_IT_Legnaro \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m6 / 67 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 9.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 21 │\u001b[36m \u001b[0m\u001b[36mT2_ES_IFCA \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m4 / 67 \u001b[0m\u001b[35m \u001b[0m│ 6.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 22 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_FR_IPHC \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m2 / 67 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 3.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 23 │\u001b[36m \u001b[0m\u001b[36mT2_UK_London_Brunel\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m1 / 67 \u001b[0m\u001b[35m \u001b[0m│ 1.5% │\n", + "└───────┴─────────────────────┴─────────┴──────────────┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Replicas for /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\n",
+       "├── T2_DE_DESY\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/0144EC47-BFA3-EA43-BF05-BD4248ED6031.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/2747DEFE-A247-1F42-B0EF-E7B7F1D3FCD6.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/2DA9130E-8423-304C-9902-1E42CD72E658.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/63047CC0-38C6-F74C-9A00-0DF9050F7CF1.root\n",
+       "│   └── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│       36-v1/2520000/8369B0EA-E4CC-AC4D-BD3F-0679B3310E09.root\n",
+       "├── T3_KR_KISTI\n",
+       "│   ├── root://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
+       "│   │   -v1/2520000/0C9615C1-7EE6-CD44-8FC0-04F63B2C16FD.root\n",
+       "│   ├── root://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
+       "│   │   -v1/2520000/152C304A-97AD-1649-BCB6-3EA0CCD0DD33.root\n",
+       "│   ├── root://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
+       "│   │   -v1/2520000/1CEB718A-7DC1-C74A-A7BE-A3C8D9FA785A.root\n",
+       "│   ├── root://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
+       "│   │   -v1/2520000/51515E3C-C640-3A4C-A16C-DC267FD142BF.root\n",
+       "│   ├── root://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
+       "│   │   -v1/2520000/7DEA3718-B7BC-EE42-A8BE-11C62BB8536D.root\n",
+       "│   ├── root://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
+       "│   │   -v1/2520000/81CEA7BA-9E66-BC4F-A96F-32642D59B653.root\n",
+       "│   └── root://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
+       "│       -v1/2520000/C4F476DA-3D00-334B-867C-7E12F94EE3AB.root\n",
+       "├── T2_ES_CIEMAT\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/12FAE9F1-7139-924C-A8DE-9699A00FC994.root\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/1DD0FAC6-3087-E44E-ABCB-8AF812C1310D.root\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/3FE5B677-9AB3-0245-A1CF-4B320592F18F.root\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/74A75B73-E5B8-C942-BBC9-1DDDD7F752FB.root\n",
+       "│   ├── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│   │   /2520000/8C8690F8-4FEE-1047-85F4-29E414B3D12C.root\n",
+       "│   └── root://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\n",
+       "│       /2520000/DA47C0B6-BCAB-C54C-A6BF-B0A64E88E3D4.root\n",
+       "├── T1_FR_CCIN2P3_Disk\n",
+       "│   ├── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│   │   18_MiniAODv2_NanoAODv9_GT36-v1/2520000/26FC8C40-EA29-804C-B17D-84FB1C6BC505.root\n",
+       "│   ├── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│   │   18_MiniAODv2_NanoAODv9_GT36-v1/2520000/2D58C3FE-512A-1F48-9AEB-6F80379B8F4A.root\n",
+       "│   ├── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│   │   18_MiniAODv2_NanoAODv9_GT36-v1/2520000/30A3A1AB-2F27-C84E-9437-6BB3881F6856.root\n",
+       "│   └── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│       18_MiniAODv2_NanoAODv9_GT36-v1/2520000/A350E2E4-705C-2C4D-9B11-3436056EEBE7.root\n",
+       "├── T2_BE_IIHE\n",
+       "│   ├── root://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\n",
+       "│   │   0000/365F32F6-F971-1B4D-8E9D-C0ACD74FFB03.root\n",
+       "│   ├── root://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\n",
+       "│   │   0000/410C32AB-DEB5-404F-BC6B-92E8F560563F.root\n",
+       "│   ├── root://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\n",
+       "│   │   0000/6809B5E3-6DE6-1541-AE4C-E1804C877EDE.root\n",
+       "│   ├── root://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\n",
+       "│   │   0000/78AC6A39-C303-EB44-9264-71819CC70FCC.root\n",
+       "│   └── root://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\n",
+       "│       0000/7CCCB2C3-F210-2C42-85DF-AA00293FACFB.root\n",
+       "├── T2_US_Purdue\n",
+       "│   ├── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/37312354-59AB-E44B-BC94-CF424D4B7DDB.root\n",
+       "│   ├── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/42DC0F42-82E8-BE47-B04D-544B67274829.root\n",
+       "│   ├── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/D7875684-9F26-084E-9B2B-5E9BB5D353E8.root\n",
+       "│   ├── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/FAF0C67B-A8B4-8A4F-83B1-E43675CE9630.root\n",
+       "│   └── root://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│       2520000/FE5EEFA5-C07A-5C44-B66D-5B31BE02C7D3.root\n",
+       "├── T2_US_Wisconsin\n",
+       "│   ├── root://cmsxrootd.hep.wisc.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
+       "│   │   -v1/2520000/39D52C69-2035-A24B-A413-40976993651D.root\n",
+       "│   └── root://cmsxrootd.hep.wisc.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\n",
+       "│       -v1/2520000/FCAF4145-8E3F-2142-BDCB-5E276523B592.root\n",
+       "├── T2_TW_NCHC\n",
+       "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/459261DD-4441-6047-9FF2-1EDE468452C9.root\n",
+       "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/6DDF448B-4605-5C41-9711-1C73EC5F01D3.root\n",
+       "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/7B14228A-5331-DF4E-B677-7B8AA281D460.root\n",
+       "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/7B181B92-AA2C-1E44-86FE-B074D359BBB3.root\n",
+       "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/8223C4A3-D4BD-6A4B-A513-54B6668C7122.root\n",
+       "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/A74EFE57-BAD2-C143-B8DC-817CE4F96FD7.root\n",
+       "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/AE014F55-84BE-E84E-B447-0B614070CD17.root\n",
+       "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/BCBF89A2-329C-744B-A38F-139EA8F94007.root\n",
+       "│   ├── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/D8D41BBC-D514-D342-A514-CCF48575D184.root\n",
+       "│   └── root://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│       1/2520000/F1B3977A-E777-EC4D-8FC7-981FE4ED5E0C.root\n",
+       "├── T2_UK_London_IC\n",
+       "│   ├── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│   │   OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/59DA0585-BD57-CE49-A15E-CDBAC5473EDE.root\n",
+       "│   ├── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│   │   OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/F16A9138-7563-E540-B6AD-8A8A688B3830.root\n",
+       "│   └── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│       OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/FE3D79A6-27D4-8948-A89B-2F966C5B29D4.root\n",
+       "├── T1_US_FNAL_Disk\n",
+       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│   │   Dv9_GT36-v1/2520000/62789325-3C0B-FC4D-B578-B41A396399E4.root\n",
+       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│   │   Dv9_GT36-v1/2520000/6EAA5EDB-0DB3-6E40-87DC-7AB582295D29.root\n",
+       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│   │   Dv9_GT36-v1/2520000/A59D511A-A419-714F-8EE1-8B8BAFEC04D5.root\n",
+       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│   │   Dv9_GT36-v1/2520000/B78A9B75-3B32-CF4E-A144-375189CF48AE.root\n",
+       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│   │   Dv9_GT36-v1/2520000/B9E9087C-255C-C24D-A733-FB9291DC7C3C.root\n",
+       "│   ├── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│   │   Dv9_GT36-v1/2520000/CDD2CDF9-72D0-4045-B28F-89002077FB89.root\n",
+       "│   └── root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\n",
+       "│       Dv9_GT36-v1/2520000/ED95384D-9D3D-AE45-8425-C4C080E691C5.root\n",
+       "├── T1_IT_CNAF_Disk\n",
+       "│   └── root://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│       2520000/648ECD9C-8AAA-BB46-8683-C8987CCC73B9.root\n",
+       "├── T2_US_Nebraska\n",
+       "│   ├── root://xrootd-local.unl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/69ABD79C-C684-8244-9F0D-153C6B8C2D9C.root\n",
+       "│   ├── root://xrootd-local.unl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│   │   1/2520000/AB8DD69D-A522-D44C-BB9C-209623F7D41A.root\n",
+       "│   └── root://xrootd-local.unl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\n",
+       "│       1/2520000/B3487FE0-B172-AD47-A13A-388C0A9BF93F.root\n",
+       "├── T2_IT_Legnaro\n",
+       "│   └── root://t2-xrdcms.lnl.infn.it:7070///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-\n",
+       "│       v1/2520000/B1B449CE-5952-8347-A9A7-35FE231D0C72.root\n",
+       "├── T3_FR_IPNL\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/BA02D468-A8CE-4F49-884F-F836BB481AD5.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/BAAA6E00-7AC3-9947-9262-D9833D3A8B19.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/CBD43A1E-AE2F-0B4D-A642-29FB2E9EB33B.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/ECD4877E-707B-EA43-A38B-D1B700FBDE79.root\n",
+       "│   └── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│       Dv2_NanoAODv9_GT36-v1/2520000/F09135D8-FCBE-AF40-BCE8-03A529C5C87F.root\n",
+       "├── T2_DE_RWTH\n",
+       "│   └── root://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\n",
+       "│       _NanoAODv9_GT36-v1/2520000/D40D1285-B075-D446-B1BF-86A463EF6993.root\n",
+       "├── T2_TR_METU\n",
+       "│   └── root://eymir.grid.metu.edu.tr//dpm/grid.metu.edu.tr/home/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018\n",
+       "│       _MiniAODv2_NanoAODv9_GT36-v1/2520000/F34F4F00-3370-EF4D-AF44-39E474E6530F.root\n",
+       "└── T2_US_Florida\n",
+       "    └── root://cmsio2.rc.ufl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "        520000/F6E44EA5-F4C6-E746-AD43-7A263F1E316E.root\n",
+       "
\n" + ], + "text/plain": [ + "Replicas for \u001b[32m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\u001b[0m\n", + "├── \u001b[32mT2_DE_DESY\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/0144EC47-BFA3-EA43-BF05-BD4248ED6031.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/2747DEFE-A247-1F42-B0EF-E7B7F1D3FCD6.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/2DA9130E-8423-304C-9902-1E42CD72E658.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/63047CC0-38C6-F74C-9A00-0DF9050F7CF1.root\u001b[0m\n", + "│ └── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ \u001b[36m36-v1/2520000/8369B0EA-E4CC-AC4D-BD3F-0679B3310E09.root\u001b[0m\n", + "├── \u001b[32mT3_KR_KISTI\u001b[0m\n", + "│ ├── \u001b[36mroot://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", + "│ │ \u001b[36m-v1/2520000/0C9615C1-7EE6-CD44-8FC0-04F63B2C16FD.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", + "│ │ \u001b[36m-v1/2520000/152C304A-97AD-1649-BCB6-3EA0CCD0DD33.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", + "│ │ \u001b[36m-v1/2520000/1CEB718A-7DC1-C74A-A7BE-A3C8D9FA785A.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", + "│ │ \u001b[36m-v1/2520000/51515E3C-C640-3A4C-A16C-DC267FD142BF.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", + "│ │ \u001b[36m-v1/2520000/7DEA3718-B7BC-EE42-A8BE-11C62BB8536D.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", + "│ │ \u001b[36m-v1/2520000/81CEA7BA-9E66-BC4F-A96F-32642D59B653.root\u001b[0m\n", + "│ └── \u001b[36mroot://cms-xrdr.sdfarm.kr:1094//xrd//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", + "│ \u001b[36m-v1/2520000/C4F476DA-3D00-334B-867C-7E12F94EE3AB.root\u001b[0m\n", + "├── \u001b[32mT2_ES_CIEMAT\u001b[0m\n", + "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ │ \u001b[36m/2520000/12FAE9F1-7139-924C-A8DE-9699A00FC994.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ │ \u001b[36m/2520000/1DD0FAC6-3087-E44E-ABCB-8AF812C1310D.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ │ 
\u001b[36m/2520000/3FE5B677-9AB3-0245-A1CF-4B320592F18F.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ │ \u001b[36m/2520000/74A75B73-E5B8-C942-BBC9-1DDDD7F752FB.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ │ \u001b[36m/2520000/8C8690F8-4FEE-1047-85F4-29E414B3D12C.root\u001b[0m\n", + "│ └── \u001b[36mroot://gaexrdoor.ciemat.es:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1\u001b[0m\n", + "│ \u001b[36m/2520000/DA47C0B6-BCAB-C54C-A6BF-B0A64E88E3D4.root\u001b[0m\n", + "├── \u001b[32mT1_FR_CCIN2P3_Disk\u001b[0m\n", + "│ ├── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ │ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/26FC8C40-EA29-804C-B17D-84FB1C6BC505.root\u001b[0m\n", + "│ ├── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ │ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/2D58C3FE-512A-1F48-9AEB-6F80379B8F4A.root\u001b[0m\n", + "│ ├── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ │ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/30A3A1AB-2F27-C84E-9437-6BB3881F6856.root\u001b[0m\n", + "│ └── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/A350E2E4-705C-2C4D-9B11-3436056EEBE7.root\u001b[0m\n", + "├── \u001b[32mT2_BE_IIHE\u001b[0m\n", + "│ ├── \u001b[36mroot://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\u001b[0m\n", + "│ │ \u001b[36m0000/365F32F6-F971-1B4D-8E9D-C0ACD74FFB03.root\u001b[0m\n", + "│ ├── \u001b[36mroot://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\u001b[0m\n", + "│ │ \u001b[36m0000/410C32AB-DEB5-404F-BC6B-92E8F560563F.root\u001b[0m\n", + "│ ├── \u001b[36mroot://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\u001b[0m\n", + "│ │ \u001b[36m0000/6809B5E3-6DE6-1541-AE4C-E1804C877EDE.root\u001b[0m\n", + "│ ├── \u001b[36mroot://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\u001b[0m\n", + "│ │ \u001b[36m0000/78AC6A39-C303-EB44-9264-71819CC70FCC.root\u001b[0m\n", + "│ └── \u001b[36mroot://maite.iihe.ac.be:1095//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/252\u001b[0m\n", + "│ \u001b[36m0000/7CCCB2C3-F210-2C42-85DF-AA00293FACFB.root\u001b[0m\n", + "├── \u001b[32mT2_US_Purdue\u001b[0m\n", + "│ ├── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ \u001b[36m2520000/37312354-59AB-E44B-BC94-CF424D4B7DDB.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ \u001b[36m2520000/42DC0F42-82E8-BE47-B04D-544B67274829.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ 
\u001b[36m2520000/D7875684-9F26-084E-9B2B-5E9BB5D353E8.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ \u001b[36m2520000/FAF0C67B-A8B4-8A4F-83B1-E43675CE9630.root\u001b[0m\n", + "│ └── \u001b[36mroot://eos.cms.rcac.purdue.edu///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ \u001b[36m2520000/FE5EEFA5-C07A-5C44-B66D-5B31BE02C7D3.root\u001b[0m\n", + "├── \u001b[32mT2_US_Wisconsin\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsxrootd.hep.wisc.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", + "│ │ \u001b[36m-v1/2520000/39D52C69-2035-A24B-A413-40976993651D.root\u001b[0m\n", + "│ └── \u001b[36mroot://cmsxrootd.hep.wisc.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36\u001b[0m\n", + "│ \u001b[36m-v1/2520000/FCAF4145-8E3F-2142-BDCB-5E276523B592.root\u001b[0m\n", + "├── \u001b[32mT2_TW_NCHC\u001b[0m\n", + "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/459261DD-4441-6047-9FF2-1EDE468452C9.root\u001b[0m\n", + "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/6DDF448B-4605-5C41-9711-1C73EC5F01D3.root\u001b[0m\n", + "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/7B14228A-5331-DF4E-B677-7B8AA281D460.root\u001b[0m\n", + "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/7B181B92-AA2C-1E44-86FE-B074D359BBB3.root\u001b[0m\n", + "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/8223C4A3-D4BD-6A4B-A513-54B6668C7122.root\u001b[0m\n", + "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/A74EFE57-BAD2-C143-B8DC-817CE4F96FD7.root\u001b[0m\n", + "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/AE014F55-84BE-E84E-B447-0B614070CD17.root\u001b[0m\n", + "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/BCBF89A2-329C-744B-A38F-139EA8F94007.root\u001b[0m\n", + "│ ├── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/D8D41BBC-D514-D342-A514-CCF48575D184.root\u001b[0m\n", + "│ └── \u001b[36mroot://se01.grid.nchc.org.tw//cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ \u001b[36m1/2520000/F1B3977A-E777-EC4D-8FC7-981FE4ED5E0C.root\u001b[0m\n", + "├── \u001b[32mT2_UK_London_IC\u001b[0m\n", + "│ ├── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ │ 
\u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/59DA0585-BD57-CE49-A15E-CDBAC5473EDE.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ │ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/F16A9138-7563-E540-B6AD-8A8A688B3830.root\u001b[0m\n", + "│ └── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/FE3D79A6-27D4-8948-A89B-2F966C5B29D4.root\u001b[0m\n", + "├── \u001b[32mT1_US_FNAL_Disk\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ │ \u001b[36mDv9_GT36-v1/2520000/62789325-3C0B-FC4D-B578-B41A396399E4.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ │ \u001b[36mDv9_GT36-v1/2520000/6EAA5EDB-0DB3-6E40-87DC-7AB582295D29.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ │ \u001b[36mDv9_GT36-v1/2520000/A59D511A-A419-714F-8EE1-8B8BAFEC04D5.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ │ \u001b[36mDv9_GT36-v1/2520000/B78A9B75-3B32-CF4E-A144-375189CF48AE.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ │ \u001b[36mDv9_GT36-v1/2520000/B9E9087C-255C-C24D-A733-FB9291DC7C3C.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ │ \u001b[36mDv9_GT36-v1/2520000/CDD2CDF9-72D0-4045-B28F-89002077FB89.root\u001b[0m\n", + "│ └── \u001b[36mroot://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAO\u001b[0m\n", + "│ \u001b[36mDv9_GT36-v1/2520000/ED95384D-9D3D-AE45-8425-C4C080E691C5.root\u001b[0m\n", + "├── \u001b[32mT1_IT_CNAF_Disk\u001b[0m\n", + "│ └── \u001b[36mroot://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ \u001b[36m2520000/648ECD9C-8AAA-BB46-8683-C8987CCC73B9.root\u001b[0m\n", + "├── \u001b[32mT2_US_Nebraska\u001b[0m\n", + "│ ├── \u001b[36mroot://xrootd-local.unl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/69ABD79C-C684-8244-9F0D-153C6B8C2D9C.root\u001b[0m\n", + "│ ├── \u001b[36mroot://xrootd-local.unl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ │ \u001b[36m1/2520000/AB8DD69D-A522-D44C-BB9C-209623F7D41A.root\u001b[0m\n", + "│ └── \u001b[36mroot://xrootd-local.unl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v\u001b[0m\n", + "│ \u001b[36m1/2520000/B3487FE0-B172-AD47-A13A-388C0A9BF93F.root\u001b[0m\n", + "├── \u001b[32mT2_IT_Legnaro\u001b[0m\n", + "│ └── \u001b[36mroot://t2-xrdcms.lnl.infn.it:7070///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-\u001b[0m\n", + "│ \u001b[36mv1/2520000/B1B449CE-5952-8347-A9A7-35FE231D0C72.root\u001b[0m\n", + "├── 
\u001b[32mT3_FR_IPNL\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/BA02D468-A8CE-4F49-884F-F836BB481AD5.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/BAAA6E00-7AC3-9947-9262-D9833D3A8B19.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/CBD43A1E-AE2F-0B4D-A642-29FB2E9EB33B.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/ECD4877E-707B-EA43-A38B-D1B700FBDE79.root\u001b[0m\n", + "│ └── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/F09135D8-FCBE-AF40-BCE8-03A529C5C87F.root\u001b[0m\n", + "├── \u001b[32mT2_DE_RWTH\u001b[0m\n", + "│ └── \u001b[36mroot://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\u001b[0m\n", + "│ \u001b[36m_NanoAODv9_GT36-v1/2520000/D40D1285-B075-D446-B1BF-86A463EF6993.root\u001b[0m\n", + "├── \u001b[32mT2_TR_METU\u001b[0m\n", + "│ └── \u001b[36mroot://eymir.grid.metu.edu.tr//dpm/grid.metu.edu.tr/home/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018\u001b[0m\n", + "│ \u001b[36m_MiniAODv2_NanoAODv9_GT36-v1/2520000/F34F4F00-3370-EF4D-AF44-39E474E6530F.root\u001b[0m\n", + "└── \u001b[32mT2_US_Florida\u001b[0m\n", + " └── \u001b[36mroot://cmsio2.rc.ufl.edu:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + " \u001b[36m520000/F6E44EA5-F4C6-E746-AD43-7A263F1E316E.root\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Selected datasets:\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[36mSelected datasets:\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                                                 Selected datasets                                                 \n",
+       "┏━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳┳┓\n",
+       "┃ Dataset                                                                                                   ┃\n",
+       "┡━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇╇┩\n",
+       "│ 1  /DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realisti… │││\n",
+       "│ 2  /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD                                           │││\n",
+       "└───┴───────────────────────────────────────────────────────────────────────────────────────────────────────────┴┴┘\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Selected datasets \u001b[0m\n", + "┏━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳┳┓\n", + "┃\u001b[1m \u001b[0m\u001b[1m…\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mDataset \u001b[0m\u001b[1m \u001b[0m┃┃┃\n", + "┡━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇╇┩\n", + "│\u001b[36m \u001b[0m\u001b[36m1\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realisti…\u001b[0m\u001b[35m \u001b[0m│││\n", + "│\u001b[36m \u001b[0m\u001b[36m2\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD \u001b[0m\u001b[35m \u001b[0m│││\n", + "└───┴───────────────────────────────────────────────────────────────────────────────────────────────────────────┴┴┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ddc = DataDiscoveryCLI()\n", + "ddc.load_dataset_definition(dataset_definition, \n", + " query_results_strategy=\"all\",\n", + " replicas_strategy=\"round-robin\")" + ] + }, + { + "cell_type": "markdown", + "id": "db7798eb-eb9f-47e5-9239-92cdea20600f", + "metadata": {}, + "source": [ + "### Filtering sites" + ] + }, + { + "cell_type": "markdown", + "id": "bd57fe7b-0642-48b8-9f9f-cd209e50d867", + "metadata": {}, + "source": [ + "Sites filtering works in a very similar way for `DataDiscoveryCLI`" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "d85ca119-0a56-4c67-bb21-ebbca8164728", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
 Querying rucio for replicas: /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[32m⠇\u001b[0m Querying rucio for replicas: \u001b[1;31m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
Sites availability for dataset: /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[36mSites availability for dataset: \u001b[0m\u001b[31m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\u001b[31mNANOAOD\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                   Available replicas                   \n",
+       "┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━┓\n",
+       "┃ Index  Site                 Files    Availability ┃\n",
+       "┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━┩\n",
+       "│   0    T2_DE_DESY           67 / 67     100.0%    │\n",
+       "│   1   │ T3_FR_IPNL           67 / 67 │    100.0%    │\n",
+       "│   2    T2_UK_London_IC      39 / 67     58.2%     │\n",
+       "│   3   │ T1_FR_CCIN2P3_Disk   38 / 67 │    56.7%     │\n",
+       "│   4    T2_CH_CERN           25 / 67     37.3%     │\n",
+       "│   5   │ T2_DE_RWTH           22 / 67 │    32.8%     │\n",
+       "│   6    T1_IT_CNAF_Disk      20 / 67     29.9%     │\n",
+       "│   7   │ T1_DE_KIT_Disk       11 / 67 │    16.4%     │\n",
+       "│   8    T2_UK_SGrid_RALPP    6 / 67       9.0%     │\n",
+       "│   9   │ T2_IT_Legnaro        6 / 67  │     9.0%     │\n",
+       "│  10    T2_FR_IPHC           2 / 67       3.0%     │\n",
+       "│  11   │ T2_UK_London_Brunel  1 / 67  │     1.5%     │\n",
+       "└───────┴─────────────────────┴─────────┴──────────────┘\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Available replicas \u001b[0m\n", + "┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mIndex\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mSite \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mFiles \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mAvailability\u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━┩\n", + "│\u001b[2m \u001b[0m\u001b[2m 0 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_DE_DESY \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m67 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 100.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 1 │\u001b[36m \u001b[0m\u001b[36mT3_FR_IPNL \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m67 / 67\u001b[0m\u001b[35m \u001b[0m│ 100.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 2 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_UK_London_IC \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m39 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 58.2% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 3 │\u001b[36m \u001b[0m\u001b[36mT1_FR_CCIN2P3_Disk \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m38 / 67\u001b[0m\u001b[35m \u001b[0m│ 56.7% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 4 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_CH_CERN \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m25 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 37.3% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 5 │\u001b[36m \u001b[0m\u001b[36mT2_DE_RWTH \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m22 / 67\u001b[0m\u001b[35m \u001b[0m│ 32.8% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 6 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT1_IT_CNAF_Disk \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m20 / 67\u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 29.9% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 7 │\u001b[36m \u001b[0m\u001b[36mT1_DE_KIT_Disk \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m11 / 67\u001b[0m\u001b[35m \u001b[0m│ 16.4% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 8 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_UK_SGrid_RALPP \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m6 / 67 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 9.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 9 │\u001b[36m \u001b[0m\u001b[36mT2_IT_Legnaro \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m6 / 67 \u001b[0m\u001b[35m \u001b[0m│ 9.0% │\n", + "│\u001b[2m \u001b[0m\u001b[2m 10 \u001b[0m\u001b[2m \u001b[0m│\u001b[2;36m \u001b[0m\u001b[2;36mT2_FR_IPHC \u001b[0m\u001b[2;36m \u001b[0m│\u001b[2;35m \u001b[0m\u001b[2;35m2 / 67 \u001b[0m\u001b[2;35m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m 3.0% \u001b[0m\u001b[2m \u001b[0m│\n", + "│ 11 │\u001b[36m \u001b[0m\u001b[36mT2_UK_London_Brunel\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m1 / 67 \u001b[0m\u001b[35m \u001b[0m│ 1.5% │\n", + "└───────┴─────────────────────┴─────────┴──────────────┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Replicas for /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\n",
+       "├── T2_CH_CERN\n",
+       "│   ├── root://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "│   │   520000/0144EC47-BFA3-EA43-BF05-BD4248ED6031.root\n",
+       "│   ├── root://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "│   │   520000/1DD0FAC6-3087-E44E-ABCB-8AF812C1310D.root\n",
+       "│   ├── root://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "│   │   520000/2747DEFE-A247-1F42-B0EF-E7B7F1D3FCD6.root\n",
+       "│   ├── root://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "│   │   520000/2DA9130E-8423-304C-9902-1E42CD72E658.root\n",
+       "│   ├── root://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "│   │   520000/39D52C69-2035-A24B-A413-40976993651D.root\n",
+       "│   ├── root://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "│   │   520000/69ABD79C-C684-8244-9F0D-153C6B8C2D9C.root\n",
+       "│   ├── root://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "│   │   520000/7CCCB2C3-F210-2C42-85DF-AA00293FACFB.root\n",
+       "│   └── root://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\n",
+       "│       520000/F34F4F00-3370-EF4D-AF44-39E474E6530F.root\n",
+       "├── T3_FR_IPNL\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/0C9615C1-7EE6-CD44-8FC0-04F63B2C16FD.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/30A3A1AB-2F27-C84E-9437-6BB3881F6856.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/410C32AB-DEB5-404F-BC6B-92E8F560563F.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/42DC0F42-82E8-BE47-B04D-544B67274829.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/62789325-3C0B-FC4D-B578-B41A396399E4.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/6809B5E3-6DE6-1541-AE4C-E1804C877EDE.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/78AC6A39-C303-EB44-9264-71819CC70FCC.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/A350E2E4-705C-2C4D-9B11-3436056EEBE7.root\n",
+       "│   ├── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│   │   Dv2_NanoAODv9_GT36-v1/2520000/FCAF4145-8E3F-2142-BDCB-5E276523B592.root\n",
+       "│   └── root://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\n",
+       "│       Dv2_NanoAODv9_GT36-v1/2520000/FE3D79A6-27D4-8948-A89B-2F966C5B29D4.root\n",
+       "├── T2_UK_London_IC\n",
+       "│   ├── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│   │   OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/12FAE9F1-7139-924C-A8DE-9699A00FC994.root\n",
+       "│   ├── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│   │   OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/63047CC0-38C6-F74C-9A00-0DF9050F7CF1.root\n",
+       "│   ├── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│   │   OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/8369B0EA-E4CC-AC4D-BD3F-0679B3310E09.root\n",
+       "│   ├── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│   │   OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/AE014F55-84BE-E84E-B447-0B614070CD17.root\n",
+       "│   ├── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│   │   OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/F16A9138-7563-E540-B6AD-8A8A688B3830.root\n",
+       "│   └── root://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\n",
+       "│       OD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/FAF0C67B-A8B4-8A4F-83B1-E43675CE9630.root\n",
+       "├── T1_FR_CCIN2P3_Disk\n",
+       "│   ├── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│   │   18_MiniAODv2_NanoAODv9_GT36-v1/2520000/152C304A-97AD-1649-BCB6-3EA0CCD0DD33.root\n",
+       "│   ├── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│   │   18_MiniAODv2_NanoAODv9_GT36-v1/2520000/37312354-59AB-E44B-BC94-CF424D4B7DDB.root\n",
+       "│   ├── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│   │   18_MiniAODv2_NanoAODv9_GT36-v1/2520000/7B14228A-5331-DF4E-B677-7B8AA281D460.root\n",
+       "│   ├── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│   │   18_MiniAODv2_NanoAODv9_GT36-v1/2520000/7B181B92-AA2C-1E44-86FE-B074D359BBB3.root\n",
+       "│   ├── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│   │   18_MiniAODv2_NanoAODv9_GT36-v1/2520000/C4F476DA-3D00-334B-867C-7E12F94EE3AB.root\n",
+       "│   ├── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│   │   18_MiniAODv2_NanoAODv9_GT36-v1/2520000/D8D41BBC-D514-D342-A514-CCF48575D184.root\n",
+       "│   └── root://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\n",
+       "│       18_MiniAODv2_NanoAODv9_GT36-v1/2520000/FE5EEFA5-C07A-5C44-B66D-5B31BE02C7D3.root\n",
+       "├── T2_FR_IPHC\n",
+       "│   └── root://sbgdcache.in2p3.fr///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/25200\n",
+       "│       00/1CEB718A-7DC1-C74A-A7BE-A3C8D9FA785A.root\n",
+       "├── T2_DE_DESY\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/26FC8C40-EA29-804C-B17D-84FB1C6BC505.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/2D58C3FE-512A-1F48-9AEB-6F80379B8F4A.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/459261DD-4441-6047-9FF2-1EDE468452C9.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/51515E3C-C640-3A4C-A16C-DC267FD142BF.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/648ECD9C-8AAA-BB46-8683-C8987CCC73B9.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/74A75B73-E5B8-C942-BBC9-1DDDD7F752FB.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/81CEA7BA-9E66-BC4F-A96F-32642D59B653.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/8223C4A3-D4BD-6A4B-A513-54B6668C7122.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/8C8690F8-4FEE-1047-85F4-29E414B3D12C.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/B78A9B75-3B32-CF4E-A144-375189CF48AE.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/BAAA6E00-7AC3-9947-9262-D9833D3A8B19.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/BCBF89A2-329C-744B-A38F-139EA8F94007.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/CBD43A1E-AE2F-0B4D-A642-29FB2E9EB33B.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/D40D1285-B075-D446-B1BF-86A463EF6993.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/DA47C0B6-BCAB-C54C-A6BF-B0A64E88E3D4.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/ECD4877E-707B-EA43-A38B-D1B700FBDE79.root\n",
+       "│   ├── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│   │   36-v1/2520000/ED95384D-9D3D-AE45-8425-C4C080E691C5.root\n",
+       "│   └── root://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\n",
+       "│       36-v1/2520000/F1B3977A-E777-EC4D-8FC7-981FE4ED5E0C.root\n",
+       "├── T1_DE_KIT_Disk\n",
+       "│   ├── root://cmsxrootd-kit-disk.gridka.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv\n",
+       "│   │   9_GT36-v1/2520000/365F32F6-F971-1B4D-8E9D-C0ACD74FFB03.root\n",
+       "│   ├── root://cmsxrootd-kit-disk.gridka.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv\n",
+       "│   │   9_GT36-v1/2520000/3FE5B677-9AB3-0245-A1CF-4B320592F18F.root\n",
+       "│   ├── root://cmsxrootd-kit-disk.gridka.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv\n",
+       "│   │   9_GT36-v1/2520000/6DDF448B-4605-5C41-9711-1C73EC5F01D3.root\n",
+       "│   ├── root://cmsxrootd-kit-disk.gridka.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv\n",
+       "│   │   9_GT36-v1/2520000/6EAA5EDB-0DB3-6E40-87DC-7AB582295D29.root\n",
+       "│   └── root://cmsxrootd-kit-disk.gridka.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv\n",
+       "│       9_GT36-v1/2520000/7DEA3718-B7BC-EE42-A8BE-11C62BB8536D.root\n",
+       "├── T2_DE_RWTH\n",
+       "│   ├── root://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\n",
+       "│   │   _NanoAODv9_GT36-v1/2520000/59DA0585-BD57-CE49-A15E-CDBAC5473EDE.root\n",
+       "│   ├── root://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\n",
+       "│   │   _NanoAODv9_GT36-v1/2520000/A59D511A-A419-714F-8EE1-8B8BAFEC04D5.root\n",
+       "│   └── root://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\n",
+       "│       _NanoAODv9_GT36-v1/2520000/B9E9087C-255C-C24D-A733-FB9291DC7C3C.root\n",
+       "├── T1_IT_CNAF_Disk\n",
+       "│   ├── root://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/A74EFE57-BAD2-C143-B8DC-817CE4F96FD7.root\n",
+       "│   ├── root://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/AB8DD69D-A522-D44C-BB9C-209623F7D41A.root\n",
+       "│   ├── root://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/B3487FE0-B172-AD47-A13A-388C0A9BF93F.root\n",
+       "│   ├── root://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/CDD2CDF9-72D0-4045-B28F-89002077FB89.root\n",
+       "│   ├── root://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│   │   2520000/D7875684-9F26-084E-9B2B-5E9BB5D353E8.root\n",
+       "│   └── root://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\n",
+       "│       2520000/F09135D8-FCBE-AF40-BCE8-03A529C5C87F.root\n",
+       "└── T2_UK_SGrid_RALPP\n",
+       "    ├── root://mover.pp.rl.ac.uk:1094/pnfs/pp.rl.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_Mini\n",
+       "    │   AODv2_NanoAODv9_GT36-v1/2520000/B1B449CE-5952-8347-A9A7-35FE231D0C72.root\n",
+       "    ├── root://mover.pp.rl.ac.uk:1094/pnfs/pp.rl.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_Mini\n",
+       "    │   AODv2_NanoAODv9_GT36-v1/2520000/BA02D468-A8CE-4F49-884F-F836BB481AD5.root\n",
+       "    └── root://mover.pp.rl.ac.uk:1094/pnfs/pp.rl.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_Mini\n",
+       "        AODv2_NanoAODv9_GT36-v1/2520000/F6E44EA5-F4C6-E746-AD43-7A263F1E316E.root\n",
+       "
\n" + ], + "text/plain": [ + "Replicas for \u001b[32m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD\u001b[0m\n", + "├── \u001b[32mT2_CH_CERN\u001b[0m\n", + "│ ├── \u001b[36mroot://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + "│ │ \u001b[36m520000/0144EC47-BFA3-EA43-BF05-BD4248ED6031.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + "│ │ \u001b[36m520000/1DD0FAC6-3087-E44E-ABCB-8AF812C1310D.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + "│ │ \u001b[36m520000/2747DEFE-A247-1F42-B0EF-E7B7F1D3FCD6.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + "│ │ \u001b[36m520000/2DA9130E-8423-304C-9902-1E42CD72E658.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + "│ │ \u001b[36m520000/39D52C69-2035-A24B-A413-40976993651D.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + "│ │ \u001b[36m520000/69ABD79C-C684-8244-9F0D-153C6B8C2D9C.root\u001b[0m\n", + "│ ├── \u001b[36mroot://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + "│ │ \u001b[36m520000/7CCCB2C3-F210-2C42-85DF-AA00293FACFB.root\u001b[0m\n", + "│ └── \u001b[36mroot://eoscms.cern.ch//eos/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2\u001b[0m\n", + "│ \u001b[36m520000/F34F4F00-3370-EF4D-AF44-39E474E6530F.root\u001b[0m\n", + "├── \u001b[32mT3_FR_IPNL\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/0C9615C1-7EE6-CD44-8FC0-04F63B2C16FD.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/30A3A1AB-2F27-C84E-9437-6BB3881F6856.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/410C32AB-DEB5-404F-BC6B-92E8F560563F.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/42DC0F42-82E8-BE47-B04D-544B67274829.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/62789325-3C0B-FC4D-B578-B41A396399E4.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/6809B5E3-6DE6-1541-AE4C-E1804C877EDE.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", 
+ "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/78AC6A39-C303-EB44-9264-71819CC70FCC.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/A350E2E4-705C-2C4D-9B11-3436056EEBE7.root\u001b[0m\n", + "│ ├── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ │ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/FCAF4145-8E3F-2142-BDCB-5E276523B592.root\u001b[0m\n", + "│ └── \u001b[36mroot://lyogrid06.in2p3.fr//dpm/in2p3.fr/home/cms/data//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAO\u001b[0m\n", + "│ \u001b[36mDv2_NanoAODv9_GT36-v1/2520000/FE3D79A6-27D4-8948-A89B-2F966C5B29D4.root\u001b[0m\n", + "├── \u001b[32mT2_UK_London_IC\u001b[0m\n", + "│ ├── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ │ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/12FAE9F1-7139-924C-A8DE-9699A00FC994.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ │ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/63047CC0-38C6-F74C-9A00-0DF9050F7CF1.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ │ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/8369B0EA-E4CC-AC4D-BD3F-0679B3310E09.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ │ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/AE014F55-84BE-E84E-B447-0B614070CD17.root\u001b[0m\n", + "│ ├── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ │ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/F16A9138-7563-E540-B6AD-8A8A688B3830.root\u001b[0m\n", + "│ └── \u001b[36mroot://gfe02.grid.hep.ph.ic.ac.uk:1094//pnfs/hep.ph.ic.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOA\u001b[0m\n", + "│ \u001b[36mOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/2520000/FAF0C67B-A8B4-8A4F-83B1-E43675CE9630.root\u001b[0m\n", + "├── \u001b[32mT1_FR_CCIN2P3_Disk\u001b[0m\n", + "│ ├── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ │ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/152C304A-97AD-1649-BCB6-3EA0CCD0DD33.root\u001b[0m\n", + "│ ├── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ │ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/37312354-59AB-E44B-BC94-CF424D4B7DDB.root\u001b[0m\n", + "│ ├── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ │ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/7B14228A-5331-DF4E-B677-7B8AA281D460.root\u001b[0m\n", + "│ ├── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ │ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/7B181B92-AA2C-1E44-86FE-B074D359BBB3.root\u001b[0m\n", + "│ ├── 
\u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ │ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/C4F476DA-3D00-334B-867C-7E12F94EE3AB.root\u001b[0m\n", + "│ ├── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ │ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/D8D41BBC-D514-D342-A514-CCF48575D184.root\u001b[0m\n", + "│ └── \u001b[36mroot://ccxrdcms.in2p3.fr:1094/pnfs/in2p3.fr/data/cms/disk/data//store/data/Run2018C/SingleMuon/NANOAOD/UL20\u001b[0m\n", + "│ \u001b[36m18_MiniAODv2_NanoAODv9_GT36-v1/2520000/FE5EEFA5-C07A-5C44-B66D-5B31BE02C7D3.root\u001b[0m\n", + "├── \u001b[32mT2_FR_IPHC\u001b[0m\n", + "│ └── \u001b[36mroot://sbgdcache.in2p3.fr///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/25200\u001b[0m\n", + "│ \u001b[36m00/1CEB718A-7DC1-C74A-A7BE-A3C8D9FA785A.root\u001b[0m\n", + "├── \u001b[32mT2_DE_DESY\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/26FC8C40-EA29-804C-B17D-84FB1C6BC505.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/2D58C3FE-512A-1F48-9AEB-6F80379B8F4A.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/459261DD-4441-6047-9FF2-1EDE468452C9.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/51515E3C-C640-3A4C-A16C-DC267FD142BF.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/648ECD9C-8AAA-BB46-8683-C8987CCC73B9.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/74A75B73-E5B8-C942-BBC9-1DDDD7F752FB.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/81CEA7BA-9E66-BC4F-A96F-32642D59B653.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/8223C4A3-D4BD-6A4B-A513-54B6668C7122.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/8C8690F8-4FEE-1047-85F4-29E414B3D12C.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/B78A9B75-3B32-CF4E-A144-375189CF48AE.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/BAAA6E00-7AC3-9947-9262-D9833D3A8B19.root\u001b[0m\n", + "│ ├── 
\u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/BCBF89A2-329C-744B-A38F-139EA8F94007.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/CBD43A1E-AE2F-0B4D-A642-29FB2E9EB33B.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/D40D1285-B075-D446-B1BF-86A463EF6993.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/DA47C0B6-BCAB-C54C-A6BF-B0A64E88E3D4.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/ECD4877E-707B-EA43-A38B-D1B700FBDE79.root\u001b[0m\n", + "│ ├── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ │ \u001b[36m36-v1/2520000/ED95384D-9D3D-AE45-8425-C4C080E691C5.root\u001b[0m\n", + "│ └── \u001b[36mroot://dcache-cms-xrootd.desy.de:1094//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT\u001b[0m\n", + "│ \u001b[36m36-v1/2520000/F1B3977A-E777-EC4D-8FC7-981FE4ED5E0C.root\u001b[0m\n", + "├── \u001b[32mT1_DE_KIT_Disk\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsxrootd-kit-disk.gridka.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv\u001b[0m\n", + "│ │ \u001b[36m9_GT36-v1/2520000/365F32F6-F971-1B4D-8E9D-C0ACD74FFB03.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsxrootd-kit-disk.gridka.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv\u001b[0m\n", + "│ │ \u001b[36m9_GT36-v1/2520000/3FE5B677-9AB3-0245-A1CF-4B320592F18F.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsxrootd-kit-disk.gridka.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv\u001b[0m\n", + "│ │ \u001b[36m9_GT36-v1/2520000/6DDF448B-4605-5C41-9711-1C73EC5F01D3.root\u001b[0m\n", + "│ ├── \u001b[36mroot://cmsxrootd-kit-disk.gridka.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv\u001b[0m\n", + "│ │ \u001b[36m9_GT36-v1/2520000/6EAA5EDB-0DB3-6E40-87DC-7AB582295D29.root\u001b[0m\n", + "│ └── \u001b[36mroot://cmsxrootd-kit-disk.gridka.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv\u001b[0m\n", + "│ \u001b[36m9_GT36-v1/2520000/7DEA3718-B7BC-EE42-A8BE-11C62BB8536D.root\u001b[0m\n", + "├── \u001b[32mT2_DE_RWTH\u001b[0m\n", + "│ ├── \u001b[36mroot://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\u001b[0m\n", + "│ │ \u001b[36m_NanoAODv9_GT36-v1/2520000/59DA0585-BD57-CE49-A15E-CDBAC5473EDE.root\u001b[0m\n", + "│ ├── \u001b[36mroot://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\u001b[0m\n", + "│ │ \u001b[36m_NanoAODv9_GT36-v1/2520000/A59D511A-A419-714F-8EE1-8B8BAFEC04D5.root\u001b[0m\n", + "│ └── \u001b[36mroot://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2\u001b[0m\n", + "│ \u001b[36m_NanoAODv9_GT36-v1/2520000/B9E9087C-255C-C24D-A733-FB9291DC7C3C.root\u001b[0m\n", + "├── 
\u001b[32mT1_IT_CNAF_Disk\u001b[0m\n", + "│ ├── \u001b[36mroot://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ \u001b[36m2520000/A74EFE57-BAD2-C143-B8DC-817CE4F96FD7.root\u001b[0m\n", + "│ ├── \u001b[36mroot://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ \u001b[36m2520000/AB8DD69D-A522-D44C-BB9C-209623F7D41A.root\u001b[0m\n", + "│ ├── \u001b[36mroot://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ \u001b[36m2520000/B3487FE0-B172-AD47-A13A-388C0A9BF93F.root\u001b[0m\n", + "│ ├── \u001b[36mroot://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ \u001b[36m2520000/CDD2CDF9-72D0-4045-B28F-89002077FB89.root\u001b[0m\n", + "│ ├── \u001b[36mroot://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ │ \u001b[36m2520000/D7875684-9F26-084E-9B2B-5E9BB5D353E8.root\u001b[0m\n", + "│ └── \u001b[36mroot://xrootd-cms.infn.it:1194///store/data/Run2018C/SingleMuon/NANOAOD/UL2018_MiniAODv2_NanoAODv9_GT36-v1/\u001b[0m\n", + "│ \u001b[36m2520000/F09135D8-FCBE-AF40-BCE8-03A529C5C87F.root\u001b[0m\n", + "└── \u001b[32mT2_UK_SGrid_RALPP\u001b[0m\n", + " ├── \u001b[36mroot://mover.pp.rl.ac.uk:1094/pnfs/pp.rl.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_Mini\u001b[0m\n", + " │ \u001b[36mAODv2_NanoAODv9_GT36-v1/2520000/B1B449CE-5952-8347-A9A7-35FE231D0C72.root\u001b[0m\n", + " ├── \u001b[36mroot://mover.pp.rl.ac.uk:1094/pnfs/pp.rl.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_Mini\u001b[0m\n", + " │ \u001b[36mAODv2_NanoAODv9_GT36-v1/2520000/BA02D468-A8CE-4F49-884F-F836BB481AD5.root\u001b[0m\n", + " └── \u001b[36mroot://mover.pp.rl.ac.uk:1094/pnfs/pp.rl.ac.uk/data/cms//store/data/Run2018C/SingleMuon/NANOAOD/UL2018_Mini\u001b[0m\n", + " \u001b[36mAODv2_NanoAODv9_GT36-v1/2520000/F6E44EA5-F4C6-E746-AD43-7A263F1E316E.root\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Selected datasets:\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[36mSelected datasets:\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                                                 Selected datasets                                                 \n",
+       "┏━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳┳┓\n",
+       "┃ Dataset                                                                                                   ┃\n",
+       "┡━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇╇┩\n",
+       "│ 1  /DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realisti… │││\n",
+       "│ 2  /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD                                           │││\n",
+       "└───┴───────────────────────────────────────────────────────────────────────────────────────────────────────────┴┴┘\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Selected datasets \u001b[0m\n", + "┏━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳┳┓\n", + "┃\u001b[1m \u001b[0m\u001b[1m…\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mDataset \u001b[0m\u001b[1m \u001b[0m┃┃┃\n", + "┡━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇╇┩\n", + "│\u001b[36m \u001b[0m\u001b[36m1\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realisti…\u001b[0m\u001b[35m \u001b[0m│││\n", + "│\u001b[36m \u001b[0m\u001b[36m2\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD \u001b[0m\u001b[35m \u001b[0m│││\n", + "└───┴───────────────────────────────────────────────────────────────────────────────────────────────────────────┴┴┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ddc = DataDiscoveryCLI()\n", + "ddc.do_regex_sites(r\"T[123]_(CH|IT|UK|FR|DE)_\\w+\")\n", + "ddc.load_dataset_definition(dataset_definition, \n", + " query_results_strategy=\"all\",\n", + " replicas_strategy=\"round-robin\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "dd9ca4ea-039d-4ebb-bbf2-79092ba6e7d0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Selected datasets:\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[36mSelected datasets:\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                                                 Selected datasets                                                 \n",
+       "┏━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳┳┓\n",
+       "┃ Dataset                                                                                                   ┃\n",
+       "┡━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇╇┩\n",
+       "│ 1  /DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realisti… │││\n",
+       "│ 2  /SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD                                           │││\n",
+       "└───┴───────────────────────────────────────────────────────────────────────────────────────────────────────────┴┴┘\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m Selected datasets \u001b[0m\n", + "┏━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳┳┓\n", + "┃\u001b[1m \u001b[0m\u001b[1m…\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mDataset \u001b[0m\u001b[1m \u001b[0m┃┃┃\n", + "┡━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇╇┩\n", + "│\u001b[36m \u001b[0m\u001b[36m1\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realisti…\u001b[0m\u001b[35m \u001b[0m│││\n", + "│\u001b[36m \u001b[0m\u001b[36m2\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m/SingleMuon/Run2018C-UL2018_MiniAODv2_NanoAODv9_GT36-v1/NANOAOD \u001b[0m\u001b[35m \u001b[0m│││\n", + "└───┴───────────────────────────────────────────────────────────────────────────────────────────────────────────┴┴┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ddc.do_list_selected()" + ] + }, + { + "cell_type": "markdown", + "id": "a6ffbefb-8276-4733-aedb-cc12898f4ed8", + "metadata": {}, + "source": [ + "### Save the replicas metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "b0e3e4b8-34d4-4558-988a-edacd1df9b37", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
File replicas_info.json saved!\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[32mFile replicas_info.json saved!\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ddc.do_save(\"replicas_info.json\")" + ] + }, + { + "cell_type": "markdown", + "id": "f9f6a70b-0194-4b00-ab79-4fdb0b4fa0cf", + "metadata": {}, + "source": [ + "## DataDiscoveryCLI from shell" + ] + }, + { + "cell_type": "markdown", + "id": "7237fc9e-50b8-4cc4-9c51-9674fbf4358a", + "metadata": {}, + "source": [ + "The DataDiscoveryCLI can be used directly from CLI" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "2c075f2e-a06e-4c97-b5b6-6a6806571a9a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "usage: dataset_query.py [-h] [--cli] [-d DATASET_DEFINITION] [-o OUTPUT]\n", + " [-fo FILESET_OUTPUT] [-p] [--step-size STEP_SIZE]\n", + " [--dask-cluster DASK_CLUSTER]\n", + " [-as ALLOW_SITES [ALLOW_SITES ...]]\n", + " [-bs BLOCK_SITES [BLOCK_SITES ...]] [-rs REGEX_SITES]\n", + " [--query-results-strategy QUERY_RESULTS_STRATEGY]\n", + " [--replicas-strategy REPLICAS_STRATEGY]\n", + "\n", + "options:\n", + " -h, --help show this help message and exit\n", + " --cli Start the dataset discovery CLI\n", + " -d DATASET_DEFINITION, --dataset-definition DATASET_DEFINITION\n", + " Dataset definition file\n", + " -o OUTPUT, --output OUTPUT\n", + " Output name for dataset discovery output (no fileset\n", + " preprocessing)\n", + " -fo FILESET_OUTPUT, --fileset-output FILESET_OUTPUT\n", + " Output name for fileset\n", + " -p, --preprocess Preprocess with dask\n", + " --step-size STEP_SIZE\n", + " Step size for preprocessing\n", + " --dask-cluster DASK_CLUSTER\n", + " Dask cluster url\n", + " -as ALLOW_SITES [ALLOW_SITES ...], --allow-sites ALLOW_SITES [ALLOW_SITES ...]\n", + " List of sites to be allowlisted\n", + " -bs BLOCK_SITES [BLOCK_SITES ...], --block-sites BLOCK_SITES [BLOCK_SITES ...]\n", + " List of sites to be blocklisted\n", + " -rs REGEX_SITES, --regex-sites REGEX_SITES\n", + " Regex string to be used to filter the sites\n", + " --query-results-strategy QUERY_RESULTS_STRATEGY\n", + " Mode for query results selection: [all|manual]\n", + " --replicas-strategy REPLICAS_STRATEGY\n", + " Mode for selecting replicas for datasets:\n", + " [manual|round-robin|choose]\n" + ] + } + ], + "source": [ + "!python -m coffea.dataset_tools.dataset_query --help" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e93cb24c-44ed-43f1-8aae-0f6b03c88de0", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m coffea.dataset_tools.dataset_query --cli -d dataset_definition.json" + ] + }, + { + "cell_type": "markdown", + "id": "f7d52663-c5e3-4abe-9c2f-4bf8f08d8919", + "metadata": {}, + "source": [ + "## Preprocess the fileset with dask" + ] + }, + { + "cell_type": "markdown", + "id": "046a0c99-6500-41b5-9954-fa7b78061800", + "metadata": {}, + "source": [ + "The replicas metadata contain the file location in the CMS grid. \n", + "This info can be **preprocessed** with uproot and dask-awkward to extract the **fileset**. Practically a fileset is a collection of metadata about the file location, file name, chunks splitting, that can be used directly to configure the uproot reading. \n", + "\n", + "This step replaces the preprocessing step in coffea 0.7.x. 
The output of the preprocessing can be used directly to start an analysis with dask-awkward.\n", + "\n", + "The preprocessing is performed locally with multiple processes if `dask_cluster==None`, but a pre-existing dask cluster url can be passed." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "04a2aeca-9c9f-4baf-b33b-b4f1b5ba4d4a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
  Preprocessing files to extract available chunks with dask\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[32m⠙\u001b[0m \u001b[31m Preprocessing files to extract available chunks with dask\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
Saved available fileset chunks to fileset_available.json.gz\n",
+       "
\n" + ], + "text/plain": [ + "Saved available fileset chunks to fileset_available.json.gz\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Saved all fileset chunks to fileset_all.json.gz\n",
+       "
\n" + ], + "text/plain": [ + "Saved all fileset chunks to fileset_all.json.gz\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fileset_total = ddc.do_preprocess(output_file=\"fileset\", \n", + " step_size=10000, #chunk size for files splitting\n", + " align_to_clusters=False,\n", + " dask_cluster=None)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "d1206bce-b726-43cc-b217-d74fd5516147", + "metadata": {}, + "outputs": [], + "source": [ + "import gzip\n", + "import json\n", + "with gzip.open(\"fileset_available.json.gz\", \"rt\") as file:\n", + " fileset_available = json.load(file)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "957ea9c6-783a-4932-960f-cbec5f2f0656", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root://cmsxrd.ts.infn.it:1094///store/mc/RunIISummer20UL18NanoAODv9/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/NANOAODSIM/106X_upgrade2018_realistic_v16_L1v1-v2/100000/13D0AD97-6B32-CB4C-BA87-5E37BA4CF20E.root {'object_path': 'Events', 'steps': [[0, 10000], [10000, 20000], [20000, 30000], [30000, 40000], [40000, 50000], [50000, 59081]], 'uuid': 'fbe50b00-1f7e-11ec-97b8-2bbee183beef'}\n", + "root://cmsxrd.ts.infn.it:1094///store/mc/RunIISummer20UL18NanoAODv9/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/NANOAODSIM/106X_upgrade2018_realistic_v16_L1v1-v2/230000/00C9792D-ACD2-2547-BB04-097F0C4E47E3.root {'object_path': 'Events', 'steps': [[0, 10000], [10000, 20000], [20000, 30000], [30000, 40000], [40000, 50000], [50000, 60000], [60000, 70000], [70000, 80000], [80000, 90000], [90000, 100000], [100000, 110000], [110000, 120000], [120000, 130000], [130000, 138192]], 'uuid': '938a4fe2-1d77-11ec-bddf-59319e86beef'}\n", + "root://dcache-cms-xrootd.desy.de:1094//store/mc/RunIISummer20UL18NanoAODv9/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/NANOAODSIM/106X_upgrade2018_realistic_v16_L1v1-v2/230000/00EA9563-5449-D24E-9566-98AE8E2A61AE.root {'object_path': 'Events', 'steps': [[0, 10000], [10000, 20000], [20000, 30000], [30000, 40000], [40000, 50000], [50000, 60000], [60000, 70000], [70000, 80000], [80000, 90000], [90000, 100000], [100000, 110000], [110000, 120000], [120000, 130000], [130000, 140000], [140000, 150000], [150000, 160000], [160000, 170000], [170000, 180000], [180000, 190000], [190000, 200000], [200000, 210000], [210000, 220000], [220000, 230000], [230000, 240000], [240000, 250000], [250000, 260000], [260000, 270000], [270000, 280000], [280000, 290000], [290000, 300000], [300000, 310000], [310000, 320000], [320000, 330000], [330000, 340000], [340000, 350000], [350000, 360000], [360000, 370000], [370000, 380000], [380000, 390000], [390000, 400000], [400000, 410000], [410000, 420000], [420000, 430000], [430000, 440000], [440000, 450000], [450000, 460000], [460000, 470000], [470000, 480000], [480000, 490000], [490000, 500000], [500000, 510000], [510000, 520000], [520000, 530000], [530000, 540000], [540000, 550000], [550000, 560000], [560000, 570000], [570000, 580000], [580000, 590000], [590000, 600000], [600000, 610000], [610000, 620000], [620000, 630000], [630000, 640000], [640000, 650000], [650000, 660000], [660000, 670000], [670000, 680000], [680000, 690000], [690000, 700000], [700000, 710000], [710000, 720000], [720000, 730000], [730000, 740000], [740000, 750000], [750000, 760000], [760000, 770000], [770000, 780000], [780000, 790000], [790000, 800000], [800000, 810000], [810000, 820000], [820000, 830000], [830000, 840000], 
[840000, 850000], [850000, 860000], [860000, 870000], [870000, 880000], [880000, 890000], [890000, 900000], [900000, 910000], [910000, 920000], [920000, 930000], [930000, 940000], [940000, 950000], [950000, 960000], [960000, 970000], [970000, 980000], [980000, 990000], [990000, 1000000], [1000000, 1010000], [1010000, 1020000], [1020000, 1030000], [1030000, 1040000], [1040000, 1050000], [1050000, 1060000], [1060000, 1070000], [1070000, 1080000], [1080000, 1090000], [1090000, 1100000], [1100000, 1110000], [1110000, 1120000], [1120000, 1130000], [1130000, 1140000], [1140000, 1150000], [1150000, 1160000], [1160000, 1170000], [1170000, 1180000], [1180000, 1190000], [1190000, 1200000], [1200000, 1210000], [1210000, 1220000], [1220000, 1230000], [1230000, 1240000], [1240000, 1250000], [1250000, 1260000], [1260000, 1270000], [1270000, 1280000], [1280000, 1290000], [1290000, 1300000], [1300000, 1310000], [1310000, 1320000], [1320000, 1330000], [1330000, 1340000], [1340000, 1350000], [1350000, 1360000], [1360000, 1370000], [1370000, 1380000], [1380000, 1390000], [1390000, 1400000], [1400000, 1410000], [1410000, 1420000], [1420000, 1430000], [1430000, 1440000], [1440000, 1450000], [1450000, 1460000], [1460000, 1470000], [1470000, 1480000], [1480000, 1490000], [1490000, 1500000], [1500000, 1510000], [1510000, 1520000], [1520000, 1530000], [1530000, 1540000], [1540000, 1550000], [1550000, 1551326]], 'uuid': 'ced110a0-1b0f-11ec-b2e9-09c08e80beef'}\n", + "root://grid-cms-xrootd.physik.rwth-aachen.de:1094///store/mc/RunIISummer20UL18NanoAODv9/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/NANOAODSIM/106X_upgrade2018_realistic_v16_L1v1-v2/230000/068B0797-DEF5-9341-BBBE-EDBE50EBC6A1.root {'object_path': 'Events', 'steps': [[0, 10000], [10000, 20000], [20000, 30000], [30000, 40000], [40000, 50000], [50000, 60000], [60000, 70000], [70000, 80000], [80000, 90000], [90000, 100000], [100000, 110000], [110000, 120000], [120000, 130000], [130000, 140000], [140000, 150000], [150000, 160000], [160000, 170000], [170000, 180000], [180000, 190000], [190000, 200000], [200000, 210000], [210000, 220000], [220000, 230000], [230000, 240000], [240000, 250000], [250000, 260000], [260000, 270000], [270000, 280000], [280000, 290000], [290000, 300000], [300000, 310000], [310000, 320000], [320000, 330000], [330000, 340000], [340000, 350000], [350000, 360000], [360000, 370000], [370000, 380000], [380000, 390000], [390000, 400000], [400000, 410000], [410000, 420000], [420000, 430000], [430000, 440000], [440000, 450000], [450000, 460000], [460000, 470000], [470000, 480000], [480000, 490000], [490000, 500000], [500000, 510000], [510000, 520000], [520000, 530000], [530000, 540000], [540000, 550000], [550000, 560000], [560000, 570000], [570000, 580000], [580000, 590000], [590000, 600000], [600000, 610000], [610000, 620000], [620000, 630000], [630000, 640000], [640000, 650000], [650000, 660000], [660000, 670000], [670000, 680000], [680000, 690000], [690000, 700000], [700000, 710000], [710000, 720000], [720000, 730000], [730000, 740000], [740000, 750000], [750000, 760000], [760000, 770000], [770000, 780000], [780000, 790000], [790000, 800000], [800000, 810000], [810000, 820000], [820000, 830000], [830000, 840000], [840000, 850000], [850000, 860000], [860000, 870000], [870000, 880000], [880000, 890000], [890000, 900000], [900000, 910000], [910000, 920000], [920000, 930000], [930000, 940000], [940000, 950000], [950000, 960000], [960000, 970000], [970000, 980000], [980000, 990000], [990000, 1000000], [1000000, 1010000], [1010000, 
1020000], [1020000, 1030000], [1030000, 1040000], [1040000, 1050000], [1050000, 1060000], [1060000, 1070000], [1070000, 1080000], [1080000, 1090000], [1090000, 1100000], [1100000, 1110000], [1110000, 1120000], [1120000, 1130000], [1130000, 1138724]], 'uuid': 'd86ab2e2-1b28-11ec-8504-738a8e80beef'}\n", + "root://cmsxrd.ts.infn.it:1094///store/mc/RunIISummer20UL18NanoAODv9/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/NANOAODSIM/106X_upgrade2018_realistic_v16_L1v1-v2/230000/0CFD79EF-41AB-4B4A-8F62-06393273EEDE.root {'object_path': 'Events', 'steps': [[0, 10000], [10000, 20000], [20000, 30000], [30000, 40000], [40000, 50000], [50000, 60000], [60000, 70000], [70000, 80000], [80000, 90000], [90000, 100000], [100000, 110000], [110000, 120000], [120000, 130000], [130000, 140000], [140000, 150000], [150000, 160000], [160000, 170000], [170000, 180000], [180000, 190000], [190000, 200000], [200000, 210000], [210000, 220000], [220000, 230000], [230000, 240000], [240000, 250000], [250000, 260000], [260000, 270000], [270000, 280000], [280000, 290000], [290000, 300000], [300000, 310000], [310000, 320000], [320000, 330000], [330000, 340000], [340000, 350000], [350000, 360000], [360000, 370000], [370000, 380000], [380000, 390000], [390000, 400000], [400000, 410000], [410000, 420000], [420000, 430000], [430000, 440000], [440000, 450000], [450000, 460000], [460000, 470000], [470000, 480000], [480000, 490000], [490000, 500000], [500000, 510000], [510000, 520000], [520000, 530000], [530000, 540000], [540000, 550000], [550000, 560000], [560000, 570000], [570000, 580000], [580000, 590000], [590000, 600000], [600000, 610000], [610000, 620000], [620000, 630000], [630000, 640000], [640000, 650000], [650000, 660000], [660000, 670000], [670000, 680000], [680000, 690000], [690000, 700000], [700000, 710000], [710000, 720000], [720000, 730000], [730000, 740000], [740000, 750000], [750000, 760000], [760000, 770000], [770000, 780000], [780000, 790000], [790000, 800000], [800000, 810000], [810000, 820000], [820000, 830000], [830000, 840000], [840000, 850000], [850000, 860000], [860000, 870000], [870000, 880000], [880000, 890000], [890000, 900000], [900000, 910000], [910000, 911868]], 'uuid': '9d799986-1ad9-11ec-9257-fc1b1e0abeef'}\n" + ] + } + ], + "source": [ + "dataset = '/DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/NANOAODSIM'\n", + "for i, (file, meta) in enumerate(fileset_available[dataset][\"files\"].items()):\n", + " print(file, meta) \n", + " if i>3: break" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f23bae95-8a2e-46a9-a884-714474a8ff12", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pyproject.toml b/pyproject.toml index 15d53869d..f35d69e26 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,7 @@ classifiers = [ ] dependencies = [ "awkward>=2.5.1rc1", - "uproot>=5.2.0rc3", + "uproot>=5.2.0rc4", "dask[array]>=2023.4.0", "dask-awkward>=2023.12.1", "dask-histogram>=2023.10.0", @@ -85,6 +85,10 @@ servicex = [ "servicex>=2.5.3", "func-adl_servicex", ] +rucio = 
[ + "rucio-clients>=32;python_version>'3.8'", + "rucio-clients<32;python_version<'3.9'", +] dev = [ "pre-commit", "flake8", diff --git a/src/coffea/analysis_tools.py b/src/coffea/analysis_tools.py index facf14e97..0f88cab27 100644 --- a/src/coffea/analysis_tools.py +++ b/src/coffea/analysis_tools.py @@ -19,7 +19,7 @@ import coffea.util -class WeightStatistics(coffea.processor.AccumulatorABC): +class WeightStatistics: def __init__(self, sumw=0.0, sumw2=0.0, minw=numpy.inf, maxw=-numpy.inf, n=0): self.sumw = sumw self.sumw2 = sumw2 @@ -40,6 +40,13 @@ def add(self, other): self.maxw = max(self.maxw, other.maxw) self.n += other.n + def __add__(self, other): + temp = WeightStatistics(self.sumw, self.sumw2, self.minw, self.maxw, self.n) + return temp.add(other) + + def __iadd__(self, other): + return self.add(other) + class Weights: """Container for event weights and associated systematic shifts @@ -62,7 +69,7 @@ def __init__(self, size, storeIndividual=False): self._weight = None if size is None else numpy.ones(size) self._weights = {} self._modifiers = {} - self._weightStats = coffea.processor.dict_accumulator() + self._weightStats = {} self._storeIndividual = storeIndividual @property @@ -102,8 +109,6 @@ def __add_delayed(self, name, weight, weightUp, weightDown, shift): if self._storeIndividual: self._weights[name] = weight self.__add_variation(name, weight, weightUp, weightDown, shift) - if isinstance(self._weightStats, coffea.processor.dict_accumulator): - self._weightStats = {} self._weightStats[name] = { "sumw": dask_awkward.to_dask_array(weight).sum(), "sumw2": dask_awkward.to_dask_array(weight**2).sum(), diff --git a/src/coffea/dataset_tools/__init__.py b/src/coffea/dataset_tools/__init__.py new file mode 100644 index 000000000..8dd444189 --- /dev/null +++ b/src/coffea/dataset_tools/__init__.py @@ -0,0 +1,18 @@ +from coffea.dataset_tools.apply_processor import apply_to_dataset, apply_to_fileset +from coffea.dataset_tools.manipulations import ( + get_failed_steps_for_dataset, + get_failed_steps_for_fileset, + max_chunks, + slice_chunks, +) +from coffea.dataset_tools.preprocess import preprocess + +__all__ = [ + "preprocess", + "apply_to_dataset", + "apply_to_fileset", + "max_chunks", + "slice_chunks", + "get_failed_steps_for_dataset", + "get_failed_steps_for_fileset", +] diff --git a/src/coffea/dataset_tools/apply_processor.py b/src/coffea/dataset_tools/apply_processor.py new file mode 100644 index 000000000..324dfd908 --- /dev/null +++ b/src/coffea/dataset_tools/apply_processor.py @@ -0,0 +1,126 @@ +from __future__ import annotations + +import copy +from typing import Any, Callable, Dict, Hashable, List, Set, Tuple, Union + +import dask.base +import dask_awkward + +from coffea.dataset_tools.preprocess import ( + DatasetSpec, + DatasetSpecOptional, + FilesetSpec, + FilesetSpecOptional, +) +from coffea.nanoevents import BaseSchema, NanoAODSchema, NanoEventsFactory +from coffea.processor import ProcessorABC + +DaskOutputBaseType = Union[ + dask.base.DaskMethodsMixin, + Dict[Hashable, dask.base.DaskMethodsMixin], + Set[dask.base.DaskMethodsMixin], + List[dask.base.DaskMethodsMixin], + Tuple[dask.base.DaskMethodsMixin], +] + +# NOTE TO USERS: You can use nested python containers as arguments to dask.compute! 
+DaskOutputType = Union[DaskOutputBaseType, Tuple[DaskOutputBaseType, ...]] + +GenericHEPAnalysis = Callable[[dask_awkward.Array], DaskOutputType] + + +def apply_to_dataset( + data_manipulation: ProcessorABC | GenericHEPAnalysis, + dataset: DatasetSpec | DatasetSpecOptional, + schemaclass: BaseSchema = NanoAODSchema, + metadata: dict[Hashable, Any] = {}, + uproot_options: dict[str, Any] = {}, +) -> DaskOutputType | tuple[DaskOutputType, dask_awkward.Array]: + """ + Apply the supplied function or processor to the supplied dataset. + Parameters + ---------- + data_manipulation : ProcessorABC or GenericHEPAnalysis + The user analysis code to run on the input dataset + dataset: DatasetSpec | DatasetSpecOptional + The data to be acted upon by the data manipulation passed in. + schemaclass: BaseSchema, default NanoAODSchema + The nanoevents schema to interpret the input dataset with. + metadata: dict[Hashable, Any], default {} + Metadata for the dataset that is accessible by the input analysis. Should also be dask-serializable. + uproot_options: dict[str, Any], default {} + Options to pass to uproot. Pass at least {"allow_read_errors_with_report": True} to turn on file access reports. + + Returns + ------- + out : DaskOutputType + The output of the analysis workflow applied to the dataset + report : dask_awkward.Array, optional + The file access report for running the analysis on the input dataset. Needs to be computed simultaneously with the analysis to be accurate. + """ + files = dataset["files"] + events = NanoEventsFactory.from_root( + files, + metadata=metadata, + schemaclass=schemaclass, + uproot_options=uproot_options, + ).events() + + report = None + if isinstance(events, tuple): + events, report = events + + out = None + if isinstance(data_manipulation, ProcessorABC): + out = data_manipulation.process(events) + elif isinstance(data_manipulation, Callable): + out = data_manipulation(events) + else: + raise ValueError("data_manipulation must either be a ProcessorABC or Callable") + + if report is not None: + return out, report + return out + + +def apply_to_fileset( + data_manipulation: ProcessorABC | GenericHEPAnalysis, + fileset: FilesetSpec | FilesetSpecOptional, + schemaclass: BaseSchema = NanoAODSchema, + uproot_options: dict[str, Any] = {}, +) -> dict[str, DaskOutputType] | tuple[dict[str, DaskOutputType], dask_awkward.Array]: + """ + Apply the supplied function or processor to the supplied fileset (set of datasets). + Parameters + ---------- + data_manipulation : ProcessorABC or GenericHEPAnalysis + The user analysis code to run on the input dataset + fileset: FilesetSpec | FilesetSpecOptional + The data to be acted upon by the data manipulation passed in. Metadata within the fileset should be dask-serializable. + schemaclass: BaseSchema, default NanoAODSchema + The nanoevents schema to interpret the input dataset with. + uproot_options: dict[str, Any], default {} + Options to pass to uproot. Pass at least {"allow_read_errors_with_report": True} to turn on file access reports. + + Returns + ------- + out : dict[str, DaskOutputType] + The output of the analysis workflow applied to the datasets, keyed by dataset name. + report : dask_awkward.Array, optional + The file access report for running the analysis on the input dataset. Needs to be computed simultaneously with the analysis to be accurate. 
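+
+    Example
+    -------
+    A minimal sketch; ``my_processor`` (a ProcessorABC instance) and ``my_fileset``
+    are hypothetical placeholders for user-provided inputs::
+
+        import dask
+        outs, reports = apply_to_fileset(
+            my_processor,
+            my_fileset,
+            uproot_options={"allow_read_errors_with_report": True},
+        )
+        computed_outs, computed_reports = dask.compute(outs, reports)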
+ """ + out = {} + report = {} + for name, dataset in fileset.items(): + metadata = copy.deepcopy(dataset.get("metadata", {})) + metadata.setdefault("dataset", name) + dataset_out = apply_to_dataset( + data_manipulation, dataset, schemaclass, metadata, uproot_options + ) + if isinstance(dataset_out, tuple): + out[name], report[name] = dataset_out + else: + out[name] = dataset_out + if len(report) > 0: + return out, report + return out diff --git a/src/coffea/dataset_tools/dataset_query.py b/src/coffea/dataset_tools/dataset_query.py new file mode 100644 index 000000000..cf4f328f9 --- /dev/null +++ b/src/coffea/dataset_tools/dataset_query.py @@ -0,0 +1,693 @@ +import argparse +import gzip +import json +import os +import random +from collections import defaultdict +from typing import List + +import yaml +from dask.distributed import Client +from rich import print +from rich.console import Console +from rich.prompt import Confirm, IntPrompt, Prompt +from rich.table import Table +from rich.tree import Tree + +from . import rucio_utils +from .preprocess import preprocess + + +def print_dataset_query(query, dataset_list, console, selected=[]): + table = Table(title=f"Query: [bold red]{query}") + table.add_column("Name", justify="left", style="cyan", no_wrap=True) + table.add_column("Tag", style="magenta", no_wrap=True) + table.add_column("Selected", justify="center") + table.row_styles = ["dim", "none"] + j = 1 + for name, conds in dataset_list.items(): + ic = 0 + ncond = len(conds) + for c, tiers in conds.items(): + dataset = f"/{name}/{c}/{tiers[0]}" + sel = dataset in selected + if ic == 0: + table.add_row( + name, + f"[bold]({j})[/bold] {c}/{'-'.join(tiers)}", + "[green bold]Y" if sel else "[red]N", + end_section=ic == ncond - 1, + ) + else: + table.add_row( + "", + f"[bold]({j})[/bold] {c}/{'-'.join(tiers)}", + "[green bold]Y" if sel else "[red]N", + end_section=ic == ncond - 1, + ) + ic += 1 + j += 1 + + console.print(table) + + +def get_indices_query(input_str: str, maxN: int) -> List[int]: + tokens = input_str.strip().split(" ") + final_tokens = [] + for t in tokens: + if t.isdigit(): + if int(t) > maxN: + print( + f"[red bold]Requested index {t} larger than available elements {maxN}" + ) + return False + final_tokens.append(int(t) - 1) # index 0 + elif "-" in t: + rng = t.split("-") + try: + for i in range( + int(rng[0]), int(rng[1]) + 1 + ): # including the last index + if i > maxN: + print( + f"[red bold]Requested index {t} larger than available elements {maxN}" + ) + return False + final_tokens.append(i - 1) + except Exception: + print( + "[red]Error! Bad formatting for selection string. Use e.g. 1 4 5-9" + ) + return False + elif t == "all": + final_tokens = list(range(0, maxN)) + else: + print("[red]Error! Bad formatting for selection string. Use e.g. 
1 4 5-9") + return False + return final_tokens + + +class DataDiscoveryCLI: + def __init__(self): + self.console = Console() + self.rucio_client = None + self.selected_datasets = [] + self.selected_datasets_metadata = [] + self.last_query = "" + self.last_query_tree = None + self.last_query_list = None + self.sites_allowlist = None + self.sites_blocklist = None + self.sites_regex = None + self.last_replicas_results = None + + self.replica_results = defaultdict(list) + self.replica_results_metadata = {} + self.replica_results_bysite = {} + + self.commands = [ + "help", + "login", + "query", + "query-results", + "select", + "list-selected", + "replicas", + "list-replicas", + "save", + "preprocess", + "allow-sites", + "block-sites", + "regex-sites", + "sites-filters", + "quit", + ] + + def start_cli(self): + while True: + command = Prompt.ask(">", choices=self.commands) + if command == "help": + print( + r"""[bold yellow]Welcome to the datasets discovery coffea CLI![/bold yellow] +Use this CLI tool to query the CMS datasets and to select interactively the grid sites to use for reading the files in your analysis. +Some basic commands: + - [bold cyan]query[/]: Look for datasets with * wildcards (like in DAS) + - [bold cyan]select[/]: Select datasets to process further from query results + - [bold cyan]replicas[/]: Query rucio to look for files replica and then select the preferred sites + - [bold cyan]query-results[/]: List the results of the last dataset query + - [bold cyan]list-selected[/]: Print a list of the selected datasets + - [bold cyan]list-replicas[/]: Print the selected files replicas for the selected dataset + - [bold cyan]sites-filters[/]: show the active sites filters and ask to clear them + - [bold cyan]allow-sites[/]: Restrict the grid sites available for replicas query only to the requested list + - [bold cyan]block-sites[/]: Exclude grid sites from the available sites for replicas query + - [bold cyan]regex-sites[/]: Select sites with a regex for replica queries: e.g. "T[123]_(FR|IT|BE|CH|DE)_\w+" + - [bold cyan]save[/]: Save the replicas query results to file (json or yaml) for further processing + - [bold cyan]preprocess[/]: Preprocess the replicas with dask and save the fileset for further processing with uproot/coffea + - [bold cyan]help[/]: Print this help message + """ + ) + elif command == "login": + self.do_login() + elif command == "quit": + print("Bye!") + break + elif command == "query": + self.do_query() + elif command == "query-results": + self.do_query_results() + elif command == "select": + self.do_select() + elif command == "list-selected": + self.do_list_selected() + elif command == "replicas": + self.do_replicas() + elif command == "list-replicas": + self.do_list_replicas() + elif command == "save": + self.do_save() + elif command == "preprocess": + self.do_preprocess() + elif command == "allow-sites": + self.do_allowlist_sites() + elif command == "block-sites": + self.do_blocklist_sites() + elif command == "regex-sites": + self.do_regex_sites() + elif command == "sites-filters": + self.do_sites_filters() + else: + break + + def do_login(self, proxy=None): + """Login to the rucio client. Optionally a specific proxy file can be passed to the command. 
+ If the proxy file is not specified, `voms-proxy-info` is used""" + if proxy: + self.rucio_client = rucio_utils.get_rucio_client(proxy) + else: + self.rucio_client = rucio_utils.get_rucio_client() + print(self.rucio_client) + + def do_whoami(self): + if not self.rucio_client: + print("First [bold]login (L)[/] to the rucio server") + return + print(self.rucio_client.whoami()) + + def do_query(self, query=None): + if query is None: + query = Prompt.ask( + "[yellow bold]Query for[/]", + ) + with self.console.status(f"Querying rucio for: [bold red]{query}[/]"): + outlist, outtree = rucio_utils.query_dataset( + query, + client=self.rucio_client, + tree=True, + scope="cms", # TODO configure scope + ) + # Now let's print the results as a tree + print_dataset_query(query, outtree, self.console, self.selected_datasets) + self.last_query = query + self.last_query_list = outlist + self.last_query_tree = outtree + print("Use the command [bold red]select[/] to select the datasets") + + def do_query_results(self): + if self.last_query_list: + print_dataset_query( + self.last_query, + self.last_query_tree, + self.console, + self.selected_datasets, + ) + else: + print("First [bold red]query (Q)[/] for a dataset") + + def do_select(self, selection=None, metadata=None): + """Select the datasets from the list of query results. Input a list of indices + also with range 4-6 or "all".""" + if not self.last_query_list: + print("First [bold red]query (Q)[/] for a dataset") + return + + if selection is None: + selection = Prompt.ask( + "[yellow bold]Select datasets indices[/] (e.g 1 4 6-10)", default="all" + ) + final_tokens = get_indices_query(selection, len(self.last_query_list)) + if not final_tokens: + return + + Nresults = len(self.last_query_list) + print("[cyan]Selected datasets:") + + for s in final_tokens: + if s < Nresults: + self.selected_datasets.append(self.last_query_list[s]) + if metadata: + self.selected_datasets_metadata.append(metadata) + else: + self.selected_datasets_metadata.append({}) + print(f"- ({s+1}) {self.last_query_list[s]}") + else: + print( + f"[red]The requested dataset is not in the list. Please insert a position <={Nresults}" + ) + + def do_list_selected(self): + print("[cyan]Selected datasets:") + table = Table(title="Selected datasets") + table.add_column("Index", justify="left", style="cyan", no_wrap=True) + table.add_column("Dataset", style="magenta", no_wrap=True) + table.add_column("Replicas selected", justify="center") + table.add_column("N. of files", justify="center") + for i, ds in enumerate(self.selected_datasets): + table.add_row( + str(i + 1), + ds, + "[green bold]Y" if ds in self.replica_results else "[red]N", + str(len(self.replica_results[ds])) + if ds in self.replica_results + else "-", + ) + self.console.print(table) + + def do_replicas(self, mode=None, selection=None): + """Query Rucio for replicas. 
+ Mode: - None: ask the user about the mode + - round-robin (take files randomly from available sites), + - choose: ask the user to choose the specific site + """ + if selection is None: + selection = Prompt.ask( + "[yellow bold]Select datasets indices[/] (e.g 1 4 6-10)", default="all" + ) + indices = get_indices_query(selection, len(self.selected_datasets)) + if not indices: + return + datasets = [ + (self.selected_datasets[ind], self.selected_datasets_metadata[ind]) + for ind in indices + ] + + for dataset, dataset_metadata in datasets: + with self.console.status( + f"Querying rucio for replicas: [bold red]{dataset}[/]" + ): + try: + ( + outfiles, + outsites, + sites_counts, + ) = rucio_utils.get_dataset_files_replicas( + dataset, + allowlist_sites=self.sites_allowlist, + blocklist_sites=self.sites_blocklist, + regex_sites=self.sites_regex, + mode="full", + client=self.rucio_client, + ) + except Exception as e: + print(f"\n[red bold] Exception: {e}[/]") + return + self.last_replicas_results = (outfiles, outsites, sites_counts) + + print(f"[cyan]Sites availability for dataset: [red]{dataset}") + table = Table(title="Available replicas") + table.add_column("Index", justify="center") + table.add_column("Site", justify="left", style="cyan", no_wrap=True) + table.add_column("Files", style="magenta", no_wrap=True) + table.add_column("Availability", justify="center") + table.row_styles = ["dim", "none"] + Nfiles = len(outfiles) + + sorted_sites = dict( + sorted(sites_counts.items(), key=lambda x: x[1], reverse=True) + ) + for i, (site, stat) in enumerate(sorted_sites.items()): + table.add_row( + str(i), site, f"{stat} / {Nfiles}", f"{stat*100/Nfiles:.1f}%" + ) + + self.console.print(table) + if mode is None: + mode = Prompt.ask( + "Select sites", + choices=["round-robin", "choose", "quit"], + default="round-robin", + ) + + files_by_site = defaultdict(list) + + if mode == "choose": + ind = list( + map( + int, + Prompt.ask("Enter list of sites index to be used").split(" "), + ) + ) + sites_to_use = [list(sorted_sites.keys())[i] for i in ind] + print(f"Filtering replicas with [green]: {' '.join(sites_to_use)}") + + output = [] + for ifile, (files, sites) in enumerate(zip(outfiles, outsites)): + random.shuffle(sites_to_use) + found = False + # loop on shuffled selected sites until one is found + for site in sites_to_use: + try: + iS = sites.index(site) + output.append(files[iS]) + files_by_site[sites[iS]].append(files[iS]) + found = True + break # keep only one replica + except ValueError: + # if the index is not found just go to the next site + pass + + if not found: + print( + f"[bold red]No replica found compatible with sites selection for file #{ifile}. 
The available sites are:" + ) + for f, s in zip(files, sites): + print(f"\t- [green]{s} [cyan]{f}") + return + + self.replica_results[dataset] = output + self.replica_results_metadata[dataset] = dataset_metadata + + elif mode == "round-robin": + output = [] + for ifile, (files, sites) in enumerate(zip(outfiles, outsites)): + # selecting randomly from the sites + iS = random.randint(0, len(sites) - 1) + output.append(files[iS]) + files_by_site[sites[iS]].append(files[iS]) + self.replica_results[dataset] = output + self.replica_results_metadata[dataset] = dataset_metadata + + elif mode == "quit": + print("[orange]Doing nothing...") + return + + self.replica_results_bysite[dataset] = files_by_site + + # Now let's print the results + tree = Tree(label=f"[bold orange]Replicas for [green]{dataset}") + for site, files in files_by_site.items(): + T = tree.add(f"[green]{site}") + for f in files: + T.add(f"[cyan]{f}") + self.console.print(tree) + + def do_allowlist_sites(self, sites=None): + if sites is None: + sites = Prompt.ask( + "[yellow]Restrict the available sites to (comma-separated list)" + ).split(",") + if self.sites_allowlist is None: + self.sites_allowlist = sites + else: + self.sites_allowlist += sites + print("[green]Allowlisted sites:") + for s in self.sites_allowlist: + print(f"- {s}") + + def do_blocklist_sites(self, sites=None): + if sites is None: + sites = Prompt.ask( + "[yellow]Exclude the sites (comma-separated list)" + ).split(",") + if self.sites_blocklist is None: + self.sites_blocklist = sites + else: + self.sites_blocklist += sites + print("[red]Blocklisted sites:") + for s in self.sites_blocklist: + print(f"- {s}") + + def do_regex_sites(self, regex=None): + if regex is None: + regex = Prompt.ask("[yellow]Regex to restrict the available sites") + if len(regex): + self.sites_regex = rf"{regex}" + print(f"New sites regex: [cyan]{self.sites_regex}") + + def do_sites_filters(self, ask_clear=True): + print("[green bold]Allow-listed sites:") + if self.sites_allowlist: + for s in self.sites_allowlist: + print(f"- {s}") + + print("[bold red]Block-listed sites:") + if self.sites_blocklist: + for s in self.sites_blocklist: + print(f"- {s}") + + print(f"[bold cyan]Sites regex: [italics]{self.sites_regex}") + + if ask_clear: + if Confirm.ask("Clear sites restrinction?", default=False): + self.sites_allowlist = None + self.sites_blocklist = None + self.sites_regex = None + print("[bold green]Sites filters cleared") + + def do_list_replicas(self): + selection = Prompt.ask( + "[yellow bold]Select datasets indices[/] (e.g 1 4 6-10)", default="all" + ) + indices = get_indices_query(selection, len(self.selected_datasets)) + datasets = [self.selected_datasets[ind] for ind in indices] + + for dataset in datasets: + if dataset not in self.replica_results: + print( + f"[red bold]No replica info for dataset {dataset}. 
You need to select the replicas with the [cyan]replicas[/cyan] command[/]" + ) + return + tree = Tree(label=f"[bold orange]Replicas for [/][green]{dataset}[/]") + for site, files in self.replica_results_bysite[dataset].items(): + T = tree.add(f"[green]{site}") + for f in files: + T.add(f"[cyan]{f}") + + self.console.print(tree) + + def do_save(self, filename=None): + """Save the replica information in JSON or YAML format""" + if not filename: + filename = Prompt.ask( + "[yellow bold]Output file name (.yaml or .json)", default="output.json" + ) + format = os.path.splitext(filename)[1] + output = {} + for fileset, files in self.replica_results.items(): + output[fileset] = { + "files": files, + "metadata": self.replica_results_metadata[fileset], + } + with open(filename, "w") as file: + if format == ".yaml": + yaml.dump(output, file, default_flow_style=False) + elif format == ".json": + json.dump(output, file, indent=2) + print(f"[green]File {filename} saved!") + + def do_preprocess( + self, + output_file=None, + step_size=None, + align_to_clusters=None, + dask_cluster=None, + ): + """Perform preprocessing for concrete fileset extraction. + Args: output_file [step_size] [align to file cluster boundaries] [dask cluster url] + """ + if not output_file: + output_file = Prompt.ask( + "[yellow bold]Output name", default="output_preprocessing" + ) + if step_size is None: + step_size = IntPrompt.ask("[yellow bold]Step size", default=None) + if align_to_clusters is None: + align_to_clusters = Confirm.ask( + "[yellow bold]Align to clusters", default=True + ) + + replicas = {} + for fileset, files in self.replica_results.items(): + replicas[fileset] = { + "files": {f: "Events" for f in files}, + "metadata": self.replica_results_metadata[fileset], + } + # init a local Dask cluster + with self.console.status( + "[red] Preprocessing files to extract available chunks with dask[/]" + ): + with Client(dask_cluster) as _: + out_available, out_updated = preprocess( + replicas, + maybe_step_size=step_size, + align_clusters=align_to_clusters, + skip_bad_files=True, + ) + with gzip.open(f"{output_file}_available.json.gz", "wt") as file: + print(f"Saved available fileset chunks to {output_file}_available.json.gz") + json.dump(out_available, file, indent=2) + with gzip.open(f"{output_file}_all.json.gz", "wt") as file: + print(f"Saved all fileset chunks to {output_file}_all.json.gz") + json.dump(out_updated, file, indent=2) + return out_updated + + def load_dataset_definition( + self, + dataset_definition, + query_results_strategy="all", + replicas_strategy="round-robin", + ): + """ + Initialize the DataDiscoveryCLI by querying a set of datasets defined in `dataset_definition` + and select results and replicas following the options. + + - query_results_strategy: "all" or "manual" to be prompted for selection + - replicas_strategy: "round-robin", "choose" (to manually choose the sites), "manual": to be prompted for a manual decision case by case + """ + for dataset_query, dataset_meta in dataset_definition.items(): + print(f"\nProcessing query: {dataset_query}") + # Adding queries + self.do_query(dataset_query) + # Now selecting the results depending on the interactive mode or not. + # Metadata are passed to the selection function to associate them with the selected dataset. 
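+            # For reference, `dataset_definition` maps a DAS-style query string to the metadata
+            # dict attached to every dataset selected from that query, e.g. (hypothetical):
+            #     {"/SingleMuon/Run2018*/NANOAOD": {"year": "2018", "isMC": False}}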
+    def do_preprocess(
+        self,
+        output_file=None,
+        step_size=None,
+        align_to_clusters=None,
+        dask_cluster=None,
+    ):
+        """Perform preprocessing for concrete fileset extraction.
+        Args: output_file [step_size] [align to file cluster boundaries] [dask cluster url]
+        """
+        if not output_file:
+            output_file = Prompt.ask(
+                "[yellow bold]Output name", default="output_preprocessing"
+            )
+        if step_size is None:
+            step_size = IntPrompt.ask("[yellow bold]Step size", default=None)
+        if align_to_clusters is None:
+            align_to_clusters = Confirm.ask(
+                "[yellow bold]Align to clusters", default=True
+            )
+
+        replicas = {}
+        for fileset, files in self.replica_results.items():
+            replicas[fileset] = {
+                "files": {f: "Events" for f in files},
+                "metadata": self.replica_results_metadata[fileset],
+            }
+        # run the preprocessing on the given Dask cluster, or a temporary local one
+        with self.console.status(
+            "[red] Preprocessing files to extract available chunks with dask[/]"
+        ):
+            with Client(dask_cluster) as _:
+                out_available, out_updated = preprocess(
+                    replicas,
+                    maybe_step_size=step_size,
+                    align_clusters=align_to_clusters,
+                    skip_bad_files=True,
+                )
+        with gzip.open(f"{output_file}_available.json.gz", "wt") as file:
+            print(f"Saved available fileset chunks to {output_file}_available.json.gz")
+            json.dump(out_available, file, indent=2)
+        with gzip.open(f"{output_file}_all.json.gz", "wt") as file:
+            print(f"Saved all fileset chunks to {output_file}_all.json.gz")
+            json.dump(out_updated, file, indent=2)
+        return out_updated
+
+    def load_dataset_definition(
+        self,
+        dataset_definition,
+        query_results_strategy="all",
+        replicas_strategy="round-robin",
+    ):
+        """
+        Initialize the DataDiscoveryCLI by querying a set of datasets defined in `dataset_definition`
+        and selecting results and replicas following the options.
+
+        - query_results_strategy: "all", or "manual" to be prompted for selection
+        - replicas_strategy: "round-robin", "choose" (to manually choose the sites), "manual" (to be prompted for a decision case by case)
+        """
+        for dataset_query, dataset_meta in dataset_definition.items():
+            print(f"\nProcessing query: {dataset_query}")
+            # Adding queries
+            self.do_query(dataset_query)
+            # Now selecting the results depending on the interactive mode or not.
+            # Metadata are passed to the selection function to associate them with the selected dataset.
+            if query_results_strategy not in ["all", "manual"]:
+                print(
+                    "Invalid query-results-strategy option: please choose between: manual|all"
+                )
+                exit(1)
+            elif query_results_strategy == "manual":
+                self.do_select(selection=None, metadata=dataset_meta)
+            else:
+                self.do_select(selection="all", metadata=dataset_meta)
+
+        # Now list all
+        self.do_list_selected()
+
+        # selecting replicas
+        self.do_sites_filters(ask_clear=False)
+        print("Getting replicas")
+        if replicas_strategy == "manual":
+            self.do_replicas(mode=None, selection="all")
+        else:
+            if replicas_strategy not in ["round-robin", "choose"]:
+                print(
+                    "Invalid replicas-strategy: please choose between manual|round-robin|choose"
+                )
+                exit(1)
+            self.do_replicas(mode=replicas_strategy, selection="all")
+        # Now list all
+        self.do_list_selected()
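+    # A dataset definition maps a rucio query string to the metadata that should
+    # be attached to the selected datasets. A minimal sketch (the query string
+    # and metadata keys below are hypothetical):
+    #
+    #     dataset_definition = {
+    #         "/MyPrimaryDataset/MyCampaign*/NANOAODSIM": {"sample": "my_sample", "year": "2018"}
+    #     }
+    #     cli = DataDiscoveryCLI()
+    #     cli.load_dataset_definition(dataset_definition)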
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--cli", help="Start the dataset discovery CLI", action="store_true"
+    )
+    parser.add_argument(
+        "-d",
+        "--dataset-definition",
+        help="Dataset definition file",
+        type=str,
+        required=False,
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        help="Output name for dataset discovery output (no fileset preprocessing)",
+        type=str,
+        required=False,
+        default="output_dataset",
+    )
+    parser.add_argument(
+        "-fo",
+        "--fileset-output",
+        help="Output name for fileset",
+        type=str,
+        required=False,
+        default="output_fileset",
+    )
+    parser.add_argument(
+        "-p", "--preprocess", help="Preprocess with dask", action="store_true"
+    )
+    parser.add_argument(
+        "--step-size", help="Step size for preprocessing", type=int, default=500000
+    )
+    parser.add_argument(
+        "--dask-cluster", help="Dask cluster url", type=str, default=None
+    )
+    parser.add_argument(
+        "-as",
+        "--allow-sites",
+        help="List of sites to be allowlisted",
+        nargs="+",
+        type=str,
+    )
+    parser.add_argument(
+        "-bs",
+        "--block-sites",
+        help="List of sites to be blocklisted",
+        nargs="+",
+        type=str,
+    )
+    parser.add_argument(
+        "-rs",
+        "--regex-sites",
+        help="Regex string to be used to filter the sites",
+        type=str,
+    )
+    parser.add_argument(
+        "--query-results-strategy",
+        help="Mode for query results selection: [all|manual]",
+        type=str,
+        default="all",
+    )
+    parser.add_argument(
+        "--replicas-strategy",
+        help="Mode for selecting replicas for datasets: [manual|round-robin|choose]",
+        default="round-robin",
+        required=False,
+    )
+    args = parser.parse_args()
+
+    cli = DataDiscoveryCLI()
+
+    if args.allow_sites:
+        cli.sites_allowlist = args.allow_sites
+    if args.block_sites:
+        cli.sites_blocklist = args.block_sites
+    if args.regex_sites:
+        cli.sites_regex = args.regex_sites
+
+    if args.dataset_definition:
+        with open(args.dataset_definition) as file:
+            dd = json.load(file)
+        cli.load_dataset_definition(
+            dd,
+            query_results_strategy=args.query_results_strategy,
+            replicas_strategy=args.replicas_strategy,
+        )
+        # Save
+        if args.output:
+            cli.do_save(filename=args.output)
+        if args.preprocess:
+            cli.do_preprocess(
+                output_file=args.fileset_output,
+                step_size=args.step_size,
+                dask_cluster=args.dask_cluster,
+                align_to_clusters=False,
+            )
+
+    if args.cli:
+        cli.start_cli()
diff --git a/src/coffea/dataset_tools/manipulations.py b/src/coffea/dataset_tools/manipulations.py
new file mode 100644
index 000000000..081e1d97d
--- /dev/null
+++ b/src/coffea/dataset_tools/manipulations.py
@@ -0,0 +1,134 @@
+from __future__ import annotations
+
+import copy
+from typing import Any
+
+import awkward
+import numpy
+
+from coffea.dataset_tools.preprocess import DatasetSpec, FilesetSpec
+
+
+def max_chunks(fileset: FilesetSpec, maxchunks: int | None = None) -> FilesetSpec:
+    """
+    Modify the input dataset so that only the first "maxchunks" chunks of each file will be processed.
+
+    Parameters
+    ----------
+    fileset: FilesetSpec
+        The set of datasets to reduce to max-chunks row-ranges.
+    maxchunks: int | None, default None
+        How many chunks to keep for each file.
+
+    Returns
+    -------
+    out : FilesetSpec
+        The reduced fileset with only the first maxchunks event ranges left in.
+    """
+    return slice_chunks(fileset, slice(maxchunks))
+
+
+def slice_chunks(fileset: FilesetSpec, theslice: Any = slice(None)) -> FilesetSpec:
+    """
+    Modify the input dataset so that only the chunks of each file specified by the input slice are processed.
+
+    Parameters
+    ----------
+    fileset: FilesetSpec
+        The set of datasets to be sliced.
+    theslice: Any, default slice(None)
+        How to slice the array of row-ranges (steps) in the input fileset.
+
+    Returns
+    -------
+    out : FilesetSpec
+        The reduced fileset with only the row-ranges specified by theslice left.
+    """
+    if not isinstance(theslice, slice):
+        theslice = slice(theslice)
+
+    out = copy.deepcopy(fileset)
+    for name, entry in fileset.items():
+        for fname, finfo in entry["files"].items():
+            out[name]["files"][fname]["steps"] = finfo["steps"][theslice]
+
+    return out
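+# Usage sketch (the fileset literal is hypothetical; any FilesetSpec works):
+#
+#     fileset = {"my_dataset": {"files": {"file.root": {
+#         "object_path": "Events",
+#         "steps": [[0, 100], [100, 200], [200, 300]],
+#         "uuid": "...",
+#     }}, "metadata": None}}
+#     max_chunks(fileset, 2)                        # keeps [[0, 100], [100, 200]]
+#     slice_chunks(fileset, slice(None, None, 2))   # keeps [[0, 100], [200, 300]]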
+
+
+def get_failed_steps_for_dataset(
+    dataset: DatasetSpec, report: awkward.Array
+) -> DatasetSpec:
+    """
+    Modify an input dataset to only contain the files and row-ranges for *failed* processing jobs as specified in the supplied report.
+
+    Parameters
+    ----------
+    dataset: DatasetSpec
+        The dataset to be reduced to only contain files and row-ranges that have previously encountered failed file access.
+    report: awkward.Array
+        The computed file-access error report from dask-awkward.
+
+    Returns
+    -------
+    out : DatasetSpec
+        The reduced dataset with only the row-ranges and files that failed processing, according to the input report.
+    """
+    failed_dataset = copy.deepcopy(dataset)
+    failed_dataset["files"] = {}
+    failures = report[~awkward.is_none(report.exception)]
+
+    if not awkward.all(report.args[:, 4] == "True"):
+        raise RuntimeError(
+            "step specification is not completely in starts/stops form, failed-step extraction is not available for steps_per_file."
+        )
+
+    for fname, fdesc in dataset["files"].items():
+        if "steps" not in fdesc:
+            raise RuntimeError(
+                f"steps specification not found in file description for {fname}, "
+                "please specify steps consistently in input dataset."
+            )
+
+    fnames = set(dataset["files"].keys())
+    rnames = (
+        set(numpy.unique(failures.args[:, 0][:, 1:-1:])) if len(failures) > 0 else set()
+    )
+    if not rnames.issubset(fnames):
+        raise RuntimeError(
+            f"Files: {rnames - fnames} are not in input dataset, please ensure report corresponds to input dataset!"
+        )
+
+    for failure in failures:
+        args_as_types = tuple(eval(arg) for arg in failure.args)
+
+        fname, object_path, start, stop, is_step = args_as_types
+
+        if fname in failed_dataset["files"]:
+            failed_dataset["files"][fname]["steps"].append([start, stop])
+        else:
+            failed_dataset["files"][fname] = copy.deepcopy(dataset["files"][fname])
+            failed_dataset["files"][fname]["steps"] = [[start, stop]]
+
+    return failed_dataset
+
+
+def get_failed_steps_for_fileset(
+    fileset: FilesetSpec, report_dict: dict[str, awkward.Array]
+):
+    """
+    Modify an input fileset to only contain the files and row-ranges for *failed* processing jobs as specified in the supplied reports.
+
+    Parameters
+    ----------
+    fileset: FilesetSpec
+        The set of datasets to be reduced to only contain files and row-ranges that have previously encountered failed file access.
+    report_dict: dict[str, awkward.Array]
+        The computed file-access error reports from dask-awkward, indexed by dataset name.
+
+    Returns
+    -------
+    out : FilesetSpec
+        The reduced fileset with only the row-ranges and files that failed processing, according to the input reports.
+    """
+    failed_fileset = {}
+    for name, dataset in fileset.items():
+        failed_dataset = get_failed_steps_for_dataset(dataset, report_dict[name])
+        if len(failed_dataset["files"]) > 0:
+            failed_fileset[name] = failed_dataset
+    return failed_fileset
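+# Recovery sketch: assuming `fileset` was processed with dask-awkward and the
+# per-dataset error reports were collected in `report_dict` (both names are
+# hypothetical), a retry fileset with only the failed row-ranges could be built as:
+#
+#     retry_fileset = get_failed_steps_for_fileset(fileset, report_dict)
+#     # rerun the analysis on `retry_fileset`; it is empty if nothing failed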
+ """ + nf_backend = awkward.backend(normed_files) + lz_or_nf = awkward.typetracer.length_zero_if_typetracer(normed_files) + + array = [] if nf_backend != "typetracer" else lz_or_nf + for arg in lz_or_nf: + try: + the_file = uproot.open({arg.file: None}) + except file_exceptions as e: + if skip_bad_files: + array.append(None) + continue + else: + raise e + + tree = the_file[arg.object_path] + num_entries = tree.num_entries + + target_step_size = num_entries if maybe_step_size is None else maybe_step_size + + file_uuid = str(the_file.file.uuid) + + out_uuid = arg.uuid + out_steps = arg.steps + + if out_uuid != file_uuid or recalculate_seen_steps: + if align_clusters: + clusters = tree.common_entry_offsets() + out = [0] + for c in clusters: + if c >= out[-1] + target_step_size: + out.append(c) + if clusters[-1] != out[-1]: + out.append(clusters[-1]) + out = numpy.array(out, dtype="int64") + out = numpy.stack((out[:-1], out[1:]), axis=1) + else: + n_steps = math.ceil(num_entries / target_step_size) + out = numpy.array( + [ + [ + i * target_step_size, + min((i + 1) * target_step_size, num_entries), + ] + for i in range(n_steps) + ], + dtype="int64", + ) + + out_uuid = file_uuid + out_steps = out.tolist() + + array.append( + { + "file": arg.file, + "object_path": arg.object_path, + "steps": out_steps, + "uuid": out_uuid, + } + ) + + if len(array) == 0: + array = awkward.Array( + [ + {"file": "junk", "object_path": "junk", "steps": [[]], "uuid": "junk"}, + None, + ] + ) + array = awkward.Array(array.layout.form.length_zero_array(highlevel=False)) + else: + array = awkward.Array(array) + + if nf_backend == "typetracer": + array = awkward.Array( + array.layout.to_typetracer(forget_length=True), + ) + + return array + + +@dataclass +class UprootFileSpec: + object_path: str + steps: list[list[int]] | list[int] | None + + +@dataclass +class CoffeaFileSpec: + object_path: str + steps: list[list[int]] + uuid: str + + +@dataclass +class CoffeaFileSpecOptional(UprootFileSpec): + uuid: str | None + + +@dataclass +class DatasetSpecOptional: + files: ( + dict[str, str] | list[str] | dict[str, UprootFileSpec | CoffeaFileSpecOptional] + ) + metadata: dict[Hashable, Any] | None + + +@dataclass +class DatasetSpec: + files: dict[str, CoffeaFileSpec] + metadata: dict[Hashable, Any] | None + + +FilesetSpecOptional = Dict[str, DatasetSpecOptional] +FilesetSpec = Dict[str, DatasetSpec] + + +def preprocess( + fileset: FilesetSpecOptional, + maybe_step_size: None | int = None, + align_clusters: bool = False, + recalculate_seen_steps: bool = False, + files_per_batch: int = 1, + skip_bad_files: bool = False, + file_exceptions: Exception | Warning = (FileNotFoundError, OSError), +) -> tuple[FilesetSpec, FilesetSpecOptional]: + """ + Given a list of normalized file and object paths (defined in uproot), determine the steps for each file according to the supplied processing options. + Parameters + ---------- + fileset: FilesetSpecOptional + The set of datasets whose files will be preprocessed. + maybe_step_sizes: int | None, default None + If specified, the size of the steps to make when analyzing the input files. + align_clusters: bool, default False + Round to the cluster size in a root file, when chunks are specified. Reduces data transfer in + analysis. + recalculate_seen_steps: bool, default False + If steps are present in the input normed files, force the recalculation of those steps, + instead of only recalculating the steps if the uuid has changed. 
+
+
+@dataclass
+class UprootFileSpec:
+    object_path: str
+    steps: list[list[int]] | list[int] | None
+
+
+@dataclass
+class CoffeaFileSpec:
+    object_path: str
+    steps: list[list[int]]
+    uuid: str
+
+
+@dataclass
+class CoffeaFileSpecOptional(UprootFileSpec):
+    uuid: str | None
+
+
+@dataclass
+class DatasetSpecOptional:
+    files: (
+        dict[str, str] | list[str] | dict[str, UprootFileSpec | CoffeaFileSpecOptional]
+    )
+    metadata: dict[Hashable, Any] | None
+
+
+@dataclass
+class DatasetSpec:
+    files: dict[str, CoffeaFileSpec]
+    metadata: dict[Hashable, Any] | None
+
+
+FilesetSpecOptional = Dict[str, DatasetSpecOptional]
+FilesetSpec = Dict[str, DatasetSpec]
+
+
+def preprocess(
+    fileset: FilesetSpecOptional,
+    maybe_step_size: None | int = None,
+    align_clusters: bool = False,
+    recalculate_seen_steps: bool = False,
+    files_per_batch: int = 1,
+    skip_bad_files: bool = False,
+    file_exceptions: Exception
+    | Warning
+    | tuple[Exception | Warning] = (FileNotFoundError, OSError),
+) -> tuple[FilesetSpec, FilesetSpecOptional]:
+    """
+    Given a list of normalized file and object paths (defined in uproot), determine the steps for each file according to the supplied processing options.
+
+    Parameters
+    ----------
+    fileset: FilesetSpecOptional
+        The set of datasets whose files will be preprocessed.
+    maybe_step_size: int | None, default None
+        If specified, the size of the steps to make when analyzing the input files.
+    align_clusters: bool, default False
+        Round to the cluster size in a root file, when chunks are specified. Reduces data transfer in
+        analysis.
+    recalculate_seen_steps: bool, default False
+        If steps are present in the input normed files, force the recalculation of those steps,
+        instead of only recalculating the steps if the uuid has changed.
+    skip_bad_files: bool, default False
+        Instead of failing, catch exceptions specified by file_exceptions and return null data.
+    file_exceptions: Exception | Warning | tuple[Exception | Warning], default (FileNotFoundError, OSError)
+        What exceptions to catch when skipping bad files.
+
+    Returns
+    -------
+    out_available : FilesetSpec
+        The subset of files in each dataset that were successfully preprocessed, organized by dataset.
+    out_updated : FilesetSpecOptional
+        The original set of datasets including files that were not accessible, updated to include the result of preprocessing where available.
+    """
+    out_updated = copy.deepcopy(fileset)
+    out_available = copy.deepcopy(fileset)
+    all_ak_norm_files = {}
+    files_to_preprocess = {}
+    for name, info in fileset.items():
+        norm_files = uproot._util.regularize_files(info["files"], steps_allowed=True)
+        for ifile in range(len(norm_files)):
+            the_file_info = norm_files[ifile]
+            maybe_finfo = info["files"].get(the_file_info[0], None)
+            maybe_uuid = (
+                None
+                if not isinstance(maybe_finfo, dict)
+                else maybe_finfo.get("uuid", None)
+            )
+            norm_files[ifile] += (3 - len(norm_files[ifile])) * (None,) + (maybe_uuid,)
+        fields = ["file", "object_path", "steps", "uuid"]
+        ak_norm_files = awkward.from_iter(norm_files)
+        ak_norm_files = awkward.Array(
+            {field: ak_norm_files[str(ifield)] for ifield, field in enumerate(fields)}
+        )
+        all_ak_norm_files[name] = ak_norm_files
+
+        dak_norm_files = dask_awkward.from_awkward(
+            ak_norm_files, math.ceil(len(ak_norm_files) / files_per_batch)
+        )
+
+        files_to_preprocess[name] = dask_awkward.map_partitions(
+            _get_steps,
+            dak_norm_files,
+            maybe_step_size=maybe_step_size,
+            align_clusters=align_clusters,
+            recalculate_seen_steps=recalculate_seen_steps,
+            skip_bad_files=skip_bad_files,
+            file_exceptions=file_exceptions,
+        )
+
+    all_processed_files = dask.compute(files_to_preprocess)[0]
+
+    for name, processed_files in all_processed_files.items():
+        files_available = {
+            item["file"]: {
+                "object_path": item["object_path"],
+                "steps": item["steps"],
+                "uuid": item["uuid"],
+            }
+            for item in awkward.drop_none(processed_files).to_list()
+        }
+
+        files_out = {}
+        for proc_item, orig_item in zip(
+            processed_files.to_list(), all_ak_norm_files[name].to_list()
+        ):
+            item = orig_item if proc_item is None else proc_item
+            files_out[item["file"]] = {
+                "object_path": item["object_path"],
+                "steps": item["steps"],
+                "uuid": item["uuid"],
+            }
+
+        out_updated[name]["files"] = files_out
+        out_available[name]["files"] = files_available
+
+    return out_available, out_updated
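+# Minimal usage sketch (the file path is hypothetical; "Events" is the NanoAOD
+# tree-name convention used by this module's callers):
+#
+#     fileset = {
+#         "my_dataset": {
+#             "files": {"root://site.example//store/file1.root": "Events"},
+#             "metadata": {"year": "2018"},
+#         }
+#     }
+#     available, updated = preprocess(fileset, maybe_step_size=100_000, skip_bad_files=True)
+#     # `available` holds only readable files; `updated` also keeps the failed ones.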
diff --git a/src/coffea/dataset_tools/rucio_utils.py b/src/coffea/dataset_tools/rucio_utils.py
new file mode 100644
index 000000000..b626f518b
--- /dev/null
+++ b/src/coffea/dataset_tools/rucio_utils.py
@@ -0,0 +1,311 @@
+import json
+import os
+import re
+import subprocess
+from collections import defaultdict
+
+from rucio.client import Client
+
+# Rucio needs the default configuration --> taken from CMS cvmfs defaults
+if "RUCIO_HOME" not in os.environ:
+    os.environ["RUCIO_HOME"] = "/cvmfs/cms.cern.ch/rucio/current"
+
+
+def get_proxy_path() -> str:
+    """
+    Checks if the VOMS proxy exists and if it is valid
+    for at least 1 hour.
+    If it exists, returns its path.
+    """
+    try:
+        subprocess.run("voms-proxy-info -exists -hours 1", shell=True, check=True)
+    except subprocess.CalledProcessError:
+        raise Exception(
+            "VOMS proxy expired or missing: please run `voms-proxy-init -voms cms -rfc --valid 168:0`"
+        )
+
+    # Now get the path of the certificate
+    proxy = subprocess.check_output(
+        "voms-proxy-info -path", shell=True, text=True
+    ).strip()
+    return proxy
+
+
+def get_rucio_client(proxy=None) -> Client:
+    """
+    Open a client to the CMS rucio server using the x509 proxy.
+
+    Parameters
+    ----------
+    proxy : str, optional
+        Use the provided proxy file if given, if not use `voms-proxy-info` to get the current active one.
+
+    Returns
+    -------
+    nativeClient: rucio.Client
+        Rucio client
+    """
+    try:
+        if not proxy:
+            proxy = get_proxy_path()
+        nativeClient = Client()
+        return nativeClient
+
+    except Exception as e:
+        print("Wrong Rucio configuration, impossible to create client")
+        raise e
+
+
+def get_xrootd_sites_map():
+    """
+    The mapping between RSEs (sites) and the xrootd prefix rules is read
+    from `/cvmfs/cms.cern.ch/SITECONF/<site>/storage.json`.
+
+    This function returns the list of xrootd prefix rules for each site.
+    """
+    sites_xrootd_access = defaultdict(dict)
+    # TODO Do not rely on the local sites_map cache. Just reload it?
+    if not os.path.exists(".sites_map.json"):
+        print("Loading SITECONF info")
+        sites = [
+            (s, "/cvmfs/cms.cern.ch/SITECONF/" + s + "/storage.json")
+            for s in os.listdir("/cvmfs/cms.cern.ch/SITECONF/")
+            if s.startswith("T")
+        ]
+        for site_name, conf in sites:
+            if not os.path.exists(conf):
+                continue
+            try:
+                data = json.load(open(conf))
+            except Exception:
+                continue
+            for site in data:
+                if site["type"] != "DISK":
+                    continue
+                if site["rse"] is None:
+                    continue
+                for proc in site["protocols"]:
+                    if proc["protocol"] == "XRootD":
+                        if proc["access"] not in ["global-ro", "global-rw"]:
+                            continue
+                        if "prefix" not in proc:
+                            if "rules" in proc:
+                                for rule in proc["rules"]:
+                                    sites_xrootd_access[site["rse"]][
+                                        rule["lfn"]
+                                    ] = rule["pfn"]
+                        else:
+                            sites_xrootd_access[site["rse"]] = proc["prefix"]
+        json.dump(sites_xrootd_access, open(".sites_map.json", "w"))
+
+    return json.load(open(".sites_map.json"))
+
+
+def _get_pfn_for_site(path, rules):
+    """
+    Utility function that converts the file path to a valid pfn matching
+    the file path with the site rules (regexes).
+    """
+    if isinstance(rules, dict):
+        for rule, pfn in rules.items():
+            if m := re.match(rule, path):
+                grs = m.groups()
+                for i in range(len(grs)):
+                    pfn = pfn.replace(f"${i+1}", grs[i])
+                return pfn
+    else:
+        return rules + "/" + path
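+# Worked example of the rule substitution above (the rule, pfn template, and
+# host are hypothetical): with
+#
+#     rules = {"/+store/(.*)": "root://xrootd.example.org//store/$1"}
+#
+# a call to _get_pfn_for_site("/store/mc/sample/file.root", rules) matches the
+# regex, captures "mc/sample/file.root", and returns
+# "root://xrootd.example.org//store/mc/sample/file.root". With a plain string
+# prefix instead of a dict of rules, the path is simply appended to the prefix.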
+ - "roundrobin": try to distribute the replicas over different sites + + Parameters + ---------- + + dataset: str + allowlist_sites: list + blocklist_sites: list + regex_sites: list + mode: str, default "full" + client: rucio Client, optional + scope: rucio scope, "cms" + + Returns + ------- + files: list + depending on the `mode` option. + - If `mode=="full"`, returns the complete list of replicas for each file in the dataset + - If `mode=="first"`, returns only the first replica for each file. + + sites: list + depending on the `mode` option. + - If `mode=="full"`, returns the list of sites where the file replica is available for each file in the dataset + - If `mode=="first"`, returns a list of sites for the first replica of each file. + + sites_counts: dict + Metadata counting the coverage of the dataset by site + + """ + sites_xrootd_prefix = get_xrootd_sites_map() + client = client if client else get_rucio_client() + outsites = [] + outfiles = [] + for filedata in client.list_replicas([{"scope": scope, "name": dataset}]): + outfile = [] + outsite = [] + rses = filedata["rses"] + found = False + if allowlist_sites: + for site in allowlist_sites: + if site in rses: + # Check actual availability + meta = filedata["pfns"][rses[site][0]] + if ( + meta["type"] != "DISK" + or meta["volatile"] + or filedata["states"][site] != "AVAILABLE" + or site not in sites_xrootd_prefix + ): + continue + outfile.append( + _get_pfn_for_site(filedata["name"], sites_xrootd_prefix[site]) + ) + outsite.append(site) + found = True + + if not found: + raise Exception( + f"No SITE available in the allowlist for file {filedata['name']}" + ) + else: + possible_sites = list(rses.keys()) + if blocklist_sites: + possible_sites = list( + filter(lambda key: key not in blocklist_sites, possible_sites) + ) + + if len(possible_sites) == 0: + raise Exception(f"No SITE available for file {filedata['name']}") + + # now check for regex + for site in possible_sites: + if regex_sites: + if re.search(regex_sites, site): + # Check actual availability + meta = filedata["pfns"][rses[site][0]] + if ( + meta["type"] != "DISK" + or meta["volatile"] + or filedata["states"][site] != "AVAILABLE" + or site not in sites_xrootd_prefix + ): + continue + outfile.append( + _get_pfn_for_site( + filedata["name"], sites_xrootd_prefix[site] + ) + ) + outsite.append(site) + found = True + else: + # Just take the first one + # Check actual availability + meta = filedata["pfns"][rses[site][0]] + if ( + meta["type"] != "DISK" + or meta["volatile"] + or filedata["states"][site] != "AVAILABLE" + or site not in sites_xrootd_prefix + ): + continue + outfile.append( + _get_pfn_for_site(filedata["name"], sites_xrootd_prefix[site]) + ) + outsite.append(site) + found = True + + if not found: + raise Exception(f"No SITE available for file {filedata['name']}") + else: + if mode == "full": + outfiles.append(outfile) + outsites.append(outsite) + elif mode == "first": + outfiles.append(outfile[0]) + outsites.append(outsite[0]) + else: + raise NotImplementedError(f"Mode {mode} not yet implemented!") + + # Computing replicas by site: + sites_counts = defaultdict(int) + if mode == "full": + for sites_by_file in outsites: + for site in sites_by_file: + sites_counts[site] += 1 + elif mode == "first": + for site_by_file in outsites: + sites_counts[site] += 1 + + return outfiles, outsites, sites_counts + + +def query_dataset( + query: str, client=None, tree: bool = False, datatype="container", scope="cms" +): + """ + This function uses the rucio client to query for 
+
+
+def query_dataset(
+    query: str, client=None, tree: bool = False, datatype="container", scope="cms"
+):
+    """
+    This function uses the rucio client to query for containers or datasets.
+
+    Parameters
+    ----------
+    query: str = query to filter datasets / containers with the rucio list_dids functions
+    client: rucio client
+    tree: bool = if True return the results splitting the dataset name in parts
+    datatype: "container/dataset": rucio terminology. "Container"==CMS dataset. "Dataset"==CMS block.
+    scope: rucio scope, default "cms"
+
+    Returns
+    -------
+    list of containers/datasets
+
+    if tree==True, returns the list of datasets and also a dictionary decomposing the dataset
+    names into their first part, with the available second and third parts grouped under it.
+    """
+    client = client if client else get_rucio_client()
+    out = list(
+        client.list_dids(
+            scope=scope, filters={"name": query, "type": datatype}, long=False
+        )
+    )
+    if tree:
+        outdict = {}
+        for dataset in out:
+            split = dataset[1:].split("/")
+            if split[0] not in outdict:
+                outdict[split[0]] = defaultdict(list)
+            outdict[split[0]][split[1]].append(split[2])
+        return out, outdict
+    else:
+        return out
diff --git a/src/coffea/nanoevents/factory.py b/src/coffea/nanoevents/factory.py
index 09fca91e7..ba00da642 100644
--- a/src/coffea/nanoevents/factory.py
+++ b/src/coffea/nanoevents/factory.py
@@ -330,6 +330,7 @@ def from_root(
             steps_per_file=steps_per_file,
             **uproot_options,
         )
+        return cls(map_schema, opener, None, cache=None, is_dask=True)
     elif delayed and not schemaclass.__dask_capable__:
         warnings.warn(
diff --git a/src/coffea/processor/__init__.py b/src/coffea/processor/__init__.py
index 54aea102d..888ed0cf2 100644
--- a/src/coffea/processor/__init__.py
+++ b/src/coffea/processor/__init__.py
@@ -2,146 +2,8 @@
 """
 
-# deprecated run_uproot_job & executor usage:
-from functools import partial
-
-from coffea.nanoevents.schemas import NanoAODSchema, TreeMakerSchema
-
-from .accumulator import (
-    Accumulatable,
-    AccumulatorABC,
-    accumulate,
-    column_accumulator,
-    defaultdict_accumulator,
-    dict_accumulator,
-    list_accumulator,
-    set_accumulator,
-    value_accumulator,
-)
-from .dataframe import LazyDataFrame
-from .executor import (
-    DaskExecutor,
-    FuturesExecutor,
-    IterativeExecutor,
-    ParslExecutor,
-    Runner,
-    WorkQueueExecutor,
-    run_spark_job,
-)
-from .helpers import PackedSelection, Weights
 from .processor import ProcessorABC
 
-
-def _run_x_job(
-    fileset,
-    treename,
-    processor_instance,
-    executor,
-    executor_args={},
-    pre_executor=None,
-    pre_args=None,
-    chunksize=100000,
-    maxchunks=None,
-    metadata_cache=None,
-    dynamic_chunksize=None,
-    format="root",
-):
-    """
-    Please use instead, e.g.:
-
-        executor = IterativeExecutor()
-        run = processor.Runner(
-            executor=executor,
-            schema=processor.NanoAODSchema,
-        )
-        hists = run(filelist, "Events", processor_instance=processor_instance)
-    """
-
-    # turn this deprecation warning on from coffea.__version__ >= 0.8 on
-    # from coffea.util import deprecate
-    # deprecate(
-    #     RuntimeError(f"This method is deprecated, please use directly the new: {executor} and {Runner} classes.\n {_run_x_job.__doc__}"),  # noqa: E501
-    #     0.9,
-    # )
-
-    # extract executor kwargs
-    exe_args = {}
-    exe_fields = executor.__dataclass_fields__.keys()
-    exe_keys = list(executor_args.keys())
-    for k in exe_keys:
-        if k in exe_fields:
-            exe_args[k] = executor_args.pop(k)
-
-    executor = executor(**exe_args)
-
-    # extract preexecutor kwargs
-    if pre_executor is not None and pre_args is not None:
-        pre_exe_args = {}
-        pre_exe_fields = pre_executor.__dataclass_fields__.keys()
-        pre_exe_keys = list(pre_args.keys())
-        for k in pre_exe_keys:
-            if k in pre_exe_fields:
-                pre_exe_args[k] 
= pre_args.pop(k) - - pre_executor = pre_executor(**pre_exe_args) - - # make Runner instance, assume other args are for _work_function & co. - run = Runner( - executor=executor, - chunksize=chunksize, - maxchunks=maxchunks, - metadata_cache=metadata_cache, - dynamic_chunksize=dynamic_chunksize, - format=format, - **executor_args, - ) - - return run( - fileset, - treename, - processor_instance=processor_instance, - ) - - -run_uproot_job = partial(_run_x_job, format="root") -run_parquet_job = partial(_run_x_job, format="parquet") - -iterative_executor = IterativeExecutor -futures_executor = FuturesExecutor -dask_executor = DaskExecutor -parsl_executor = ParslExecutor -work_queue_executor = WorkQueueExecutor - - __all__ = [ "ProcessorABC", - "LazyDataFrame", - "Weights", - "PackedSelection", - "IterativeExecutor", - "FuturesExecutor", - "DaskExecutor", - "ParslExecutor", - "WorkQueueExecutor", - "Runner", - "run_spark_job", - "accumulate", - "Accumulatable", - "AccumulatorABC", - "value_accumulator", - "list_accumulator", - "set_accumulator", - "dict_accumulator", - "defaultdict_accumulator", - "column_accumulator", - "NanoAODSchema", - "TreeMakerSchema", - # following methods are deprecated - "run_uproot_job", - "run_parquet_job", - "iterative_executor", - "futures_executor", - "dask_executor", - "parsl_executor", - "work_queue_executor", ] diff --git a/src/coffea/processor/accumulator.py b/src/coffea/processor/accumulator.py deleted file mode 100644 index 8ad12dab1..000000000 --- a/src/coffea/processor/accumulator.py +++ /dev/null @@ -1,380 +0,0 @@ -import copy -import operator -from abc import ABCMeta, abstractmethod -from collections import defaultdict -from collections.abc import MutableMapping, MutableSet -from typing import Iterable, Optional, TypeVar, Union - -from dask.base import DaskMethodsMixin - -try: - from typing import Protocol, runtime_checkable # type: ignore -except ImportError: - from typing_extensions import Protocol # type: ignore - from typing import runtime_checkable - -import numpy - -T = TypeVar("T") - - -@runtime_checkable -class Addable(Protocol): - def __add__(self: T, other: T) -> T: - ... - - -Accumulatable = Union[Addable, MutableSet, MutableMapping] - - -def add(a: Accumulatable, b: Accumulatable) -> Accumulatable: - """Add two accumulatables together, without altering inputs - - This may make copies in certain situations - """ - if isinstance(a, Addable) and isinstance(b, Addable): - return operator.add(a, b) - if isinstance(a, MutableSet) and isinstance(b, MutableSet): - return operator.or_(a, b) - elif isinstance(a, MutableMapping) and isinstance(b, MutableMapping): - # capture type(X) by shallow copy and clear - # since we don't know the signature of type(X).__init__ - if isinstance(b, type(a)): - out = copy.copy(a) - elif isinstance(a, type(b)): - out = copy.copy(b) - else: - raise ValueError( - f"Cannot add two mappings of incompatible type ({type(a)} vs. {type(b)})" - ) - out.clear() - lhs, rhs = set(a), set(b) - # Keep the order of elements as far as possible - for key in a: - if key in rhs: - out[key] = add(a[key], b[key]) - else: - out[key] = ( - copy.deepcopy(a[key]) - if not isinstance(a[key], DaskMethodsMixin) - else copy.copy(a[key]) - ) - for key in b: - if key not in lhs: - out[key] = ( - copy.deepcopy(b[key]) - if not isinstance(b[key], DaskMethodsMixin) - else copy.copy(b[key]) - ) - return out - raise ValueError( - f"Cannot add accumulators of incompatible type ({type(a)} vs. 
{type(b)})" - ) - - -def iadd(a: Accumulatable, b: Accumulatable) -> Accumulatable: - """Add two accumulatables together, assuming the first is mutable""" - if isinstance(a, Addable) and isinstance(b, Addable): - return operator.iadd(a, b) - elif isinstance(a, MutableSet) and isinstance(b, MutableSet): - return operator.ior(a, b) - elif isinstance(a, MutableMapping) and isinstance(b, MutableMapping): - if not isinstance(b, type(a)): - raise ValueError( - f"Cannot add two mappings of incompatible type ({type(a)} vs. {type(b)})" - ) - lhs, rhs = set(a), set(b) - # Keep the order of elements as far as possible - for key in a: - if key in rhs: - a[key] = iadd(a[key], b[key]) - for key in b: - if key not in lhs: - a[key] = ( - copy.deepcopy(b[key]) - if not isinstance(b[key], DaskMethodsMixin) - else copy.copy(b[key]) - ) - return a - raise ValueError( - f"Cannot add accumulators of incompatible type ({type(a)} vs. {type(b)})" - ) - - -def accumulate( - items: Iterable[Optional[Accumulatable]], accum: Optional[Accumulatable] = None -) -> Optional[Accumulatable]: - gen = (x for x in items if x is not None) - try: - if accum is None: - accum = next(gen) - # we want to produce a new object so that the input is not mutated - accum = add(accum, next(gen)) - while True: - # subsequent additions can happen in-place, which may be more performant - accum = iadd(accum, next(gen)) - except StopIteration: - pass - return accum - - -async def async_accumulate(result_stream): - output = None - async for results in result_stream: - if output: - output = iadd(output, results) - else: - output = results - yield output - - -class AccumulatorABC(metaclass=ABCMeta): - """Abstract base class for an accumulator - - Accumulators are abstract objects that enable the reduce stage of the typical map-reduce - scaleout that we do in Coffea. One concrete example is a histogram. The idea is that an - accumulator definition holds enough information to be able to create an empty accumulator - (the ``identity()`` method) and add two compatible accumulators together (the ``add()`` method). - The former is not strictly necessary, but helps with book-keeping. Here we show an example usage - of a few accumulator types. An arbitrary-depth nesting of dictionary accumulators is supported, much - like the behavior of directories in ROOT hadd. - - After defining an accumulator:: - - from coffea.processor import dict_accumulator, column_accumulator, defaultdict_accumulator - from coffea.hist import Hist, Bin - import numpy as np - - adef = dict_accumulator({ - 'cutflow': defaultdict_accumulator(int), - 'pt': Hist("counts", Bin("pt", "$p_T$", 100, 0, 100)), - 'final_pt': column_accumulator(np.zeros(shape=(0,))), - }) - - Notice that this function does not mutate ``adef``:: - - def fill(n): - ptvals = np.random.exponential(scale=30, size=n) - cut = ptvals > 200. 
- acc = adef.identity() - acc['cutflow']['pt>200'] += cut.sum() - acc['pt'].fill(pt=ptvals) - acc['final_pt'] += column_accumulator(ptvals[cut]) - return acc - - As such, we can execute it several times in parallel and reduce the result:: - - import concurrent.futures - with concurrent.futures.ThreadPoolExecutor() as executor: - outputs = executor.map(fill, [2000, 2000]) - - combined = sum(outputs, adef.identity()) - - - Derived classes must implement - - ``identity()``: returns a new object of same type as self, - such that ``self + self.identity() == self`` - - ``add(other)``: adds an object of same type as self to self - - Concrete implementations are then provided for ``__add__``, ``__radd__``, and ``__iadd__``. - """ - - @abstractmethod - def identity(self): - """Identity of the accumulator - - A value such that any other value added to it will return - the other value - """ - pass - - @abstractmethod - def add(self, other): - """Add another accumulator to this one in-place""" - pass - - def __add__(self, other): - ret = self.identity() - ret.add(self) - ret.add(other) - return ret - - def __radd__(self, other): - ret = self.identity() - ret.add(other) - ret.add(self) - return ret - - def __iadd__(self, other): - self.add(other) - return self - - -class value_accumulator(AccumulatorABC): - """Holds a value of arbitrary type - - Parameters - ---------- - default_factory : callable - a function that returns an instance of the desired identity value - initial : bool, optional - an initial value, if the identity is not the desired initial value - """ - - def __init__(self, default_factory, initial=None): - self.value = default_factory() if initial is None else initial - self.default_factory = default_factory - - def __repr__(self): - if type(self.default_factory) is type: - defrepr = self.default_factory.__name__ - else: - defrepr = repr(self.default_factory) - return f"value_accumulator({defrepr}, {self.value!r})" - - def identity(self): - return value_accumulator(self.default_factory) - - def add(self, other): - if isinstance(other, value_accumulator): - self.value = self.value + other.value - else: - self.value = self.value + other - - -class list_accumulator(list, AccumulatorABC): - """A list with accumulator semantics - - See `list` for further info - """ - - def identity(self): - return list() - - def add(self, other): - """Add another accumulator to this one in-place""" - if isinstance(other, list): - list.extend(self, other) - else: - raise ValueError - - -class set_accumulator(set, AccumulatorABC): - """A set with accumulator semantics - - See `set` for further info - """ - - def identity(self): - return set_accumulator() - - def add(self, other): - """Add another accumulator to this one in-place - - Note - ---- - This replaces `set.add` behavior, unfortunately. - A workaround is to use `set.update`, e.g. ``a.update({'val'})`` - """ - if isinstance(other, MutableSet): - set.update(self, other) - else: - set.add(self, other) - - -class dict_accumulator(dict, AccumulatorABC): - """A dictionary with accumulator semantics - - See `dict` for further info. - It is assumed that the contents of the dict have accumulator semantics. 
- """ - - def identity(self): - ret = dict_accumulator() - for key, value in self.items(): - ret[key] = value.identity() - return ret - - def add(self, other): - if isinstance(other, MutableMapping): - for key, value in other.items(): - if key not in self: - if isinstance(value, AccumulatorABC): - self[key] = value.identity() - else: - raise ValueError - self[key] += value - else: - raise ValueError - - -class defaultdict_accumulator(defaultdict, AccumulatorABC): - """A defaultdict with accumulator semantics - - See `collections.defaultdict` for further info. - It is assumed that the contents of the dict have accumulator semantics - """ - - def identity(self): - return defaultdict_accumulator(self.default_factory) - - def add(self, other): - for key, value in other.items(): - self[key] += value - - -class column_accumulator(AccumulatorABC): - """An appendable numpy ndarray - - Parameters - ---------- - value : numpy.ndarray - The identity value array, which should be an empty ndarray - with the desired row shape. The column dimension will correspond to - the first index of `value` shape. - - Examples - -------- - If a set of accumulators is defined as:: - - a = column_accumulator(np.array([])) - b = column_accumulator(np.array([1., 2., 3.])) - c = column_accumulator(np.array([4., 5., 6.])) - - then: - - >>> a + b - column_accumulator(array([1., 2., 3.])) - >>> c + b + a - column_accumulator(array([4., 5., 6., 1., 2., 3.])) - """ - - def __init__(self, value): - if not isinstance(value, numpy.ndarray): - raise ValueError("column_accumulator only works with numpy arrays") - self._empty = numpy.zeros(dtype=value.dtype, shape=(0,) + value.shape[1:]) - self._value = value - - def __repr__(self): - return "column_accumulator(%r)" % self.value - - def identity(self): - return column_accumulator(self._empty) - - def add(self, other): - if not isinstance(other, column_accumulator): - raise ValueError("column_accumulator cannot be added to %r" % type(other)) - if other._empty.shape != self._empty.shape: - raise ValueError( - "Cannot add two column_accumulator objects of dissimilar shape (%r vs %r)" - % (self._empty.shape, other._empty.shape) - ) - self._value = numpy.concatenate((self._value, other._value)) - - @property - def value(self): - """The current value of the column - - Returns a numpy array where the first dimension is the column dimension - """ - return self._value diff --git a/src/coffea/processor/dask/__init__.py b/src/coffea/processor/dask/__init__.py deleted file mode 100644 index fc73f0d8e..000000000 --- a/src/coffea/processor/dask/__init__.py +++ /dev/null @@ -1,77 +0,0 @@ -import os -from collections.abc import MutableMapping -from threading import Lock - -import blosc -from distributed import WorkerPlugin, get_worker -from zict import LRU, Buffer, File, Func - - -class ColumnCache(WorkerPlugin, MutableMapping): - name = "columncache" - - def __init__(self, maxmem=5e8, maxcompressed=2e9, maxdisk=1e10): - self._maxmem = maxmem - self._maxcompressed = maxcompressed - self._maxdisk = maxdisk - - def setup(self, worker): - self.cache = Buffer( - fast={}, - slow=Func( - dump=blosc.pack_array, - load=blosc.unpack_array, - d=Buffer( - fast={}, - slow=LRU( - n=self._maxdisk, - d=File(os.path.join(worker.local_directory, "cache")), - weight=lambda k, v: len(v), - ), - n=self._maxcompressed, - weight=lambda k, v: len(v), - ), - ), - n=self._maxmem, - weight=lambda k, v: v.nbytes, - ) - self.lock = Lock() - self.hits = 0 - self.misses = 0 - - def teardown(self, worker): - pass - - def 
__getitem__(self, key): - with self.lock: - try: - out = self.cache[key] - self.hits += 1 - return out - except KeyError: - self.misses += 1 - raise - - def __setitem__(self, key, value): - with self.lock: - self.cache[key] = value - - def __delitem__(self, key): - with self.lock: - del self.cache[key] - - def __iter__(self): - with self.lock: - return iter(self.cache) - - def __len__(self): - with self.lock: - return len(self.cache) - - -def register_columncache(client): - plugins = set() - for p in client.run(lambda: set(get_worker().plugins)).values(): - plugins |= p - if ColumnCache.name not in plugins: - client.register_worker_plugin(ColumnCache()) diff --git a/src/coffea/processor/dataframe.py b/src/coffea/processor/dataframe.py deleted file mode 100644 index 20fa48445..000000000 --- a/src/coffea/processor/dataframe.py +++ /dev/null @@ -1,117 +0,0 @@ -from collections.abc import MutableMapping - -import uproot - - -class LazyDataFrame(MutableMapping): - """Simple delayed uproot reader (a la lazyarrays) - - One can access branches either through ``df["bname"]`` or ``df.bname``, although - the latter is restricted to branches that do not start with a leading underscore. - Keeps track of values accessed, in the `materialized` attribute. - - Parameters - ---------- - tree : uproot.TTree - Tree to read - entrystart : int, optional - First entry to read, default: 0 - entrystop : int, optional - Last entry to read, default None (read to end) - preload_items : iterable - Force preloading of a set of columns from the tree - metadata : Mapping - Additional metadata for the dataframe - """ - - def __init__( - self, tree, entrystart=None, entrystop=None, preload_items=None, metadata=None - ): - self._tree = tree - self._branchargs = { - "decompression_executor": uproot.source.futures.TrivialExecutor(), - "interpretation_executor": uproot.source.futures.TrivialExecutor(), - } - if entrystart is None or entrystart < 0: - entrystart = 0 - if entrystop is None or entrystop > tree.num_entries: - entrystop = tree.num_entries - self._branchargs["entry_start"] = entrystart - self._branchargs["entry_stop"] = entrystop - self._available = {k for k in self._tree.keys()} - self._dict = {} - self._materialized = set() - if preload_items: - self.preload(preload_items) - self._metadata = metadata - - def __delitem__(self, key): - del self._dict[key] - - def __getitem__(self, key): - if key in self._dict: - return self._dict[key] - elif key in self._tree: - self._materialized.add(key) - array = self._tree[key].array(**self._branchargs) - self._dict[key] = array - return self._dict[key] - else: - raise KeyError(key) - - def __getattr__(self, key): - if key.startswith("_"): - raise AttributeError(key) - try: - return self.__getitem__(key) - except KeyError: - raise AttributeError(key) - - def __iter__(self): - yield from self._available - - def __len__(self): - return len(self._dict) - - def __setitem__(self, key, value): - self._dict[key] = value - - def __contains__(self, key): - # by default, MutableMapping uses __getitem__ to test, but we want to avoid materialization - return key in self._dict or key in self._tree - - @property - def available(self): - """Set of available columns""" - return self._available - - @property - def columns(self): - """Set of available columns""" - return self._available - - @property - def materialized(self): - """Set of columns read from tree""" - return self._materialized - - @property - def size(self): - """Length of column vector""" - return self._branchargs["entry_stop"] - 
self._branchargs["entry_start"] - - @property - def metadata(self): - return self._metadata - - def preload(self, columns): - """Force loading of several columns - - Parameters - ---------- - columns : iterable - A list of columns to load - """ - for name in columns: - if name in self._tree: - _ = self[name] diff --git a/src/coffea/processor/executor.py b/src/coffea/processor/executor.py deleted file mode 100644 index 76080e007..000000000 --- a/src/coffea/processor/executor.py +++ /dev/null @@ -1,2091 +0,0 @@ -import concurrent.futures -import json -import math -import os -import pickle -import shutil -import sys -import time -import traceback -import uuid -import warnings -from collections import defaultdict -from collections.abc import Mapping, MutableMapping -from contextlib import ExitStack -from dataclasses import asdict, dataclass, field -from functools import partial -from io import BytesIO -from itertools import repeat -from typing import ( - Awaitable, - Callable, - Dict, - Generator, - Iterable, - List, - Optional, - Set, - Tuple, - Union, -) - -import cloudpickle -import lz4.frame as lz4f -import toml -import uproot -from cachetools import LRUCache - -from ..nanoevents import NanoEventsFactory, schemas -from ..util import _exception_chain, _hash, rich_bar -from .accumulator import Accumulatable, accumulate, set_accumulator -from .dataframe import LazyDataFrame -from .processor import ProcessorABC - -try: - from typing import Literal -except ImportError: - from typing_extensions import Literal - - -try: - from functools import cached_property -except ImportError: - cached_property = property - - -_PICKLE_PROTOCOL = pickle.HIGHEST_PROTOCOL -DEFAULT_METADATA_CACHE: MutableMapping = LRUCache(100000) - -_PROTECTED_NAMES = { - "dataset", - "filename", - "treename", - "metadata", - "entrystart", - "entrystop", - "fileuuid", - "numentries", - "uuid", - "clusters", -} - - -class UprootMissTreeError(uproot.exceptions.KeyInFileError): - pass - - -class FileMeta: - __slots__ = ["dataset", "filename", "treename", "metadata"] - - def __init__(self, dataset, filename, treename, metadata=None): - self.dataset = dataset - self.filename = filename - self.treename = treename - self.metadata = metadata - - def __str__(self): - return f"FileMeta({self.filename}:{self.treename})" - - def __hash__(self): - # As used to lookup metadata, no need for dataset - return _hash((self.filename, self.treename)) - - def __eq__(self, other): - # In case of hash collisions - return self.filename == other.filename and self.treename == other.treename - - def maybe_populate(self, cache): - if cache and self in cache: - self.metadata = cache[self] - - def populated(self, clusters=False): - """Return true if metadata is populated - - By default, only require bare minimum metadata (numentries, uuid) - If clusters is True, then require cluster metadata to be populated - """ - if self.metadata is None: - return False - elif "numentries" not in self.metadata or "uuid" not in self.metadata: - return False - elif clusters and "clusters" not in self.metadata: - return False - return True - - def chunks(self, target_chunksize, align_clusters): - if not self.populated(clusters=align_clusters): - raise RuntimeError - user_keys = set(self.metadata.keys()) - _PROTECTED_NAMES - user_meta = {k: self.metadata[k] for k in user_keys} - if align_clusters: - chunks = [0] - for c in self.metadata["clusters"]: - if c >= chunks[-1] + target_chunksize: - chunks.append(c) - if self.metadata["clusters"][-1] != chunks[-1]: - 
chunks.append(self.metadata["clusters"][-1]) - for start, stop in zip(chunks[:-1], chunks[1:]): - yield WorkItem( - self.dataset, - self.filename, - self.treename, - start, - stop, - self.metadata["uuid"], - user_meta, - ) - return target_chunksize - else: - numentries = self.metadata["numentries"] - update = True - start = 0 - while start < numentries: - if update: - n = max(round((numentries - start) / target_chunksize), 1) - actual_chunksize = math.ceil((numentries - start) / n) - stop = min(numentries, start + actual_chunksize) - next_chunksize = yield WorkItem( - self.dataset, - self.filename, - self.treename, - start, - stop, - self.metadata["uuid"], - user_meta, - ) - start = stop - if next_chunksize and next_chunksize != target_chunksize: - target_chunksize = next_chunksize - update = True - else: - update = False - return target_chunksize - - -@dataclass(unsafe_hash=True, frozen=True) -class WorkItem: - dataset: str - filename: str - treename: str - entrystart: int - entrystop: int - fileuuid: str - usermeta: Optional[Dict] = field(default=None, compare=False) - - def __len__(self) -> int: - return self.entrystop - self.entrystart - - -def _compress(item, compression): - if item is None or compression is None: - return item - else: - with BytesIO() as bf: - with lz4f.open(bf, mode="wb", compression_level=compression) as f: - pickle.dump(item, f, protocol=_PICKLE_PROTOCOL) - result = bf.getvalue() - return result - - -def _decompress(item): - if isinstance(item, bytes): - # warning: if item is not exactly of type bytes, BytesIO(item) will - # make a copy of it, increasing the memory usage. - with BytesIO(item) as bf: - with lz4f.open(bf, mode="rb") as f: - return pickle.load(f) - else: - return item - - -class _compression_wrapper: - def __init__(self, level, function, name=None): - self.level = level - self.function = function - self.name = name - - def __str__(self): - if self.name is not None: - return self.name - try: - name = self.function.__name__ - if name == "": - return "lambda" - return name - except AttributeError: - return str(self.function) - - # no @wraps due to pickle - def __call__(self, *args, **kwargs): - out = self.function(*args, **kwargs) - return _compress(out, self.level) - - -class _reduce: - def __init__(self, compression): - self.compression = compression - - def __str__(self): - return "reduce" - - def __call__(self, items): - items = list(it for it in items if it is not None) - if len(items) == 0: - raise ValueError("Empty list provided to reduction") - if self.compression is not None: - out = _decompress(items.pop()) - out = accumulate(map(_decompress, items), out) - return _compress(out, self.compression) - return accumulate(items) - - -class _FuturesHolder: - def __init__(self, futures: Set[Awaitable], refresh=2): - self.futures = set(futures) - self.merges = set() - self.completed = set() - self.done = {"futures": 0, "merges": 0} - self.running = len(self.futures) - self.refresh = refresh - - def update(self, refresh: int = None): - if refresh is None: - refresh = self.refresh - if self.futures: - completed, self.futures = concurrent.futures.wait( - self.futures, - timeout=refresh, - return_when=concurrent.futures.FIRST_COMPLETED, - ) - self.completed.update(completed) - self.done["futures"] += len(completed) - - if self.merges: - completed, self.merges = concurrent.futures.wait( - self.merges, - timeout=refresh, - return_when=concurrent.futures.FIRST_COMPLETED, - ) - self.completed.update(completed) - self.done["merges"] += len(completed) - 
self.running = len(self.futures) + len(self.merges) - - def add_merge(self, merges: Awaitable[Accumulatable]): - self.merges.add(merges) - self.running = len(self.futures) + len(self.merges) - - def fetch(self, N: int) -> List[Accumulatable]: - _completed = [self.completed.pop() for _ in range(min(N, len(self.completed)))] - if all(_good_future(future) for future in _completed): - return [future.result() for future in _completed if _good_future(future)] - else: # Make recoverable - good_futures = [future for future in _completed if _good_future(future)] - bad_futures = [future for future in _completed if not _good_future(future)] - self.completed.update(good_futures) - raise bad_futures[0].exception() - - -def _good_future(future: Awaitable) -> bool: - return future.done() and not future.cancelled() and future.exception() is None - - -def _futures_handler(futures, timeout): - """Essentially the same as concurrent.futures.as_completed - but makes sure not to hold references to futures any longer than strictly necessary, - which is important if the future holds a large result. - """ - futures = set(futures) - try: - while futures: - try: - done, futures = concurrent.futures.wait( - futures, - timeout=timeout, - return_when=concurrent.futures.FIRST_COMPLETED, - ) - if len(done) == 0: - warnings.warn( - f"No finished jobs after {timeout}s, stopping remaining {len(futures)} jobs early" - ) - break - while done: - try: - yield done.pop().result() - except concurrent.futures.CancelledError: - pass - except KeyboardInterrupt as e: - for job in futures: - try: - job.cancel() - # this is not implemented with parsl AppFutures - except NotImplementedError: - raise e from None - running = sum(job.running() for job in futures) - warnings.warn( - f"Early stop: cancelled {len(futures) - running} jobs, will wait for {running} running jobs to complete" - ) - finally: - running = sum(job.running() for job in futures) - if running: - warnings.warn( - f"Cancelling {running} running jobs (likely due to an exception)" - ) - try: - while futures: - futures.pop().cancel() - except NotImplementedError: - pass - - -@dataclass -class ExecutorBase: - # shared by all executors - status: bool = True - unit: str = "items" - desc: str = "Processing" - compression: Optional[int] = 1 - function_name: Optional[str] = None - - def __call__( - self, - items: Iterable, - function: Callable, - accumulator: Accumulatable, - ): - raise NotImplementedError( - "This class serves as a base class for executors, do not instantiate it!" - ) - - def copy(self, **kwargs): - tmp = self.__dict__.copy() - tmp.update(kwargs) - return type(self)(**tmp) - - -def _watcher( - FH: _FuturesHolder, - executor: ExecutorBase, - merge_fcn: Callable, - pool: Optional[Callable] = None, -) -> Accumulatable: - with rich_bar() as progress: - p_id = progress.add_task(executor.desc, total=FH.running, unit=executor.unit) - desc_m = "Merging" if executor.merging else "Merging (local)" - p_idm = progress.add_task(desc_m, total=0, unit="merges") - - merged = None - while FH.running > 0: - FH.update() - progress.update(p_id, completed=FH.done["futures"], refresh=True) - - if executor.merging: # Merge jobs - merge_size = executor._merge_size(len(FH.completed)) - progress.update(p_idm, completed=FH.done["merges"]) - while len(FH.completed) > 1: - if FH.running > 0 and len(FH.completed) < executor.merging[1]: - break - batch = FH.fetch(merge_size) - # Add debug for batch mem size? TODO with logging? 
- if isinstance(executor, FuturesExecutor) and pool is not None: - FH.add_merge(pool.submit(merge_fcn, batch)) - elif isinstance(executor, ParslExecutor): - FH.add_merge(merge_fcn(batch)) - else: - raise RuntimeError("Invalid executor") - progress.update( - p_idm, - total=progress._tasks[p_idm].total + 1, - refresh=True, - ) - else: # Merge within process - batch = FH.fetch(len(FH.completed)) - merged = _compress( - accumulate( - progress.track( - map(_decompress, (c for c in batch)), - task_id=p_idm, - total=progress._tasks[p_idm].total + len(batch), - ), - _decompress(merged), - ), - executor.compression, - ) - # Add checkpointing - - if executor.merging: - progress.refresh() - merged = FH.completed.pop().result() - if len(FH.completed) > 0 or len(FH.futures) > 0 or len(FH.merges) > 0: - raise RuntimeError("Not all futures are added.") - return merged - - -def _wait_for_merges(FH: _FuturesHolder, executor: ExecutorBase) -> Accumulatable: - with rich_bar() as progress: - if executor.merging: - to_finish = len(FH.merges) - p_id_w = progress.add_task( - "Waiting for merge jobs", - total=to_finish, - unit=executor.unit, - ) - while len(FH.merges) > 0: - FH.update() - progress.update( - p_id_w, - completed=(to_finish - len(FH.merges)), - refresh=True, - ) - - FH.update() - recovered = [future.result() for future in FH.completed if _good_future(future)] - p_id_m = progress.add_task("Merging finished jobs", unit="merges") - return _compress( - accumulate( - progress.track( - map(_decompress, (c for c in recovered)), - task_id=p_id_m, - total=len(recovered), - ) - ), - executor.compression, - ) - - -@dataclass -class WorkQueueExecutor(ExecutorBase): - """Execute using Work Queue - - For more information, see :ref:`intro-coffea-wq` - - Parameters - ---------- - items : sequence or generator - Sequence of input arguments - function : callable - A function to be called on each input, which returns an accumulator instance - accumulator : Accumulatable - An accumulator to collect the output of the function - status : bool - If true (default), enable progress bar - unit : str - Label of progress bar unit - desc : str - Label of progress bar description - compression : int, optional - Compress accumulator outputs in flight with LZ4, at level specified (default 9) - `None`` sets level to 1 (minimal compression) - # work queue specific options: - cores : int - Maximum number of cores for work queue task. If unset, use a whole worker. - memory : int - Maximum amount of memory (in MB) for work queue task. If unset, use a whole worker. - disk : int - Maximum amount of disk space (in MB) for work queue task. If unset, use a whole worker. - gpus : int - Number of GPUs to allocate to each task. If unset, use zero. - resource_monitor : str - If given, one of 'off', 'measure', or 'watchdog'. Default is 'off'. - - 'off': turns off resource monitoring. Overridden to 'watchdog' if resources_mode - is not set to 'fixed'. - - 'measure': turns on resource monitoring for Work Queue. The - resources used per task are measured. - - 'watchdog': in addition to measuring resources, tasks are terminated if they - go above the cores, memory, or disk specified. - resources_mode : str - one of 'fixed', 'max-seen', or 'max-throughput'. Default is 'max-seen'. - Sets the strategy to automatically allocate resources to tasks. - - 'fixed': allocate cores, memory, and disk specified for each task. 
- - 'max-seen' or 'auto': use the cores, memory, and disk given as maximum values to allocate, - but first try each task by allocating the maximum values seen. Leads - to a good compromise between parallelism and number of retries. - - 'max-throughput': Like max-seen, but first tries the task with an - allocation that maximizes overall throughput. - If resources_mode is other than 'fixed', preprocessing and - accumulation tasks always use the 'max-seen' strategy, as the - former tasks always use the same resources, while the latter have a - distribution of resources that increases over time. - split_on_exhaustion: bool - Whether to split a processing task in half according to its chunksize when it exhausts - the cores, memory, or disk allocated to it. If False, a task that exhausts resources - permanently fails. Default is True. - fast_terminate_workers: int - Terminate workers on which tasks have been running longer than average. - The time limit is computed by multiplying the average runtime of tasks - by the value of 'fast_terminate_workers'. Since there are - legitimately slow tasks, no task may trigger fast termination on - two distinct workers. Values less than 1 disable it. - - manager_name : str - Name to refer to this work queue manager. - Sets port to 0 (any available port) if port not given. - port : int or tuple(int, int) - Port number or range (inclusive) for the work queue manager program. - Defaults to 9123 if manager_name not given. - password_file: str - Location of a file containing a password used to authenticate workers. - ssl: bool or tuple(str, str) - Enable SSL encryption between manager and workers. If a tuple, then it - should be of the form (key, cert), where key and cert are paths to the files - containing the key and certificate in pem format. If True, a self-signed temporary - key and cert are generated for the session. - - extra_input_files: list - A list of files in the current working directory to send along with each task. - Useful for small custom libraries and configuration files needed by the processor. - x509_proxy : str - Path to the X509 user proxy. If None (the default), use the value of the - environment variable X509_USER_PROXY, or fall back to the file /tmp/x509up_u${UID} if it - exists. If False, disables the default behavior and no proxy is sent. - - environment_file : optional, str - Conda python environment tarball to use. If not given, assume that - the python environment is already set up at the execution site. - wrapper : str - Wrapper script to run/open the python environment tarball. Defaults to poncho_package_run found in PATH. - - treereduction : int - Number of processed chunks per accumulation task. Default is 20. - - verbose : bool - If true, emit a message on each task submission and completion. - Default is false. - print_stdout : bool - If true, print the standard output of each work queue task on completion. - Default is false. - - debug_log : str - Filename for debug output - stats_log : str - Filename for tasks statistics output - transactions_log : str - Filename for tasks lifetime reports output - tasks_accum_log : str - Filename for the log of tasks that have been processed and accumulated. - - filepath: str - Path to the parent directory in which to create the staging directory. - Default is "." (current working directory). - - custom_init : function, optional - A function that takes as an argument the queue's WorkQueue object. - The function is called just before the first work unit is submitted - to the queue. 
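(Editor's aside at the end of the parameter list above: a minimal sketch of how this now-removed executor was typically configured. The field names match the dataclass below; the resource numbers are hypothetical, and it assumes a work_queue installation with workers launched separately, e.g. via HTCondor.)

```python
# Editor's sketch of a WorkQueueExecutor configuration; values are placeholders.
from coffea import processor

executor = processor.WorkQueueExecutor(
    manager_name="my-wq-coffea",  # workers locate the manager by this name
    cores=1,                      # per-task core cap
    memory=2000,                  # per-task memory cap, in MB
    disk=2000,                    # per-task disk cap, in MB
    resources_mode="max-seen",    # retry with the largest allocation seen so far
    resource_monitor="measure",   # record actual per-task resource usage
    treereduction=20,             # chunks accumulated per merge task
)
```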
- """ - - # Standard executor options: - compression: Optional[int] = 9 # as recommended by lz4 - retries: int = 2 # task executes at most 3 times - # wq executor options: - manager_name: Optional[str] = None - port: Optional[Union[int, Tuple[int, int]]] = None - filepath: str = "." - events_total: Optional[int] = None - x509_proxy: Optional[str] = None - verbose: bool = False - print_stdout: bool = False - status_display_interval: Optional[int] = 10 - debug_log: Optional[str] = None - stats_log: Optional[str] = None - transactions_log: Optional[str] = None - tasks_accum_log: Optional[str] = None - password_file: Optional[str] = None - ssl: Union[bool, Tuple[str, str]] = False - environment_file: Optional[str] = None - extra_input_files: List = field(default_factory=list) - wrapper: Optional[str] = shutil.which("poncho_package_run") - resource_monitor: Optional[str] = "off" - resources_mode: Optional[str] = "max-seen" - split_on_exhaustion: Optional[bool] = True - fast_terminate_workers: Optional[int] = None - cores: Optional[int] = None - memory: Optional[int] = None - disk: Optional[int] = None - gpus: Optional[int] = None - treereduction: int = 20 - chunksize: int = 100000 - dynamic_chunksize: Optional[Dict] = None - custom_init: Optional[Callable] = None - - # deprecated - bar_format: Optional[str] = None - chunks_accum_in_mem: Optional[int] = None - master_name: Optional[str] = None - chunks_per_accum: Optional[int] = None - - def __call__( - self, - items: Iterable, - function: Callable, - accumulator: Accumulatable, - ): - from .work_queue_tools import run - - return ( - run( - self, - items, - function, - accumulator, - ), - 0, - ) - - -@dataclass -class IterativeExecutor(ExecutorBase): - """Execute in one thread iteratively - - Parameters - ---------- - items : list - List of input arguments - function : callable - A function to be called on each input, which returns an accumulator instance - accumulator : Accumulatable - An accumulator to collect the output of the function - status : bool - If true (default), enable progress bar - unit : str - Label of progress bar unit - desc : str - Label of progress bar description - compression : int, optional - Ignored for iterative executor - """ - - workers: int = 1 - - def __call__( - self, - items: Iterable, - function: Callable, - accumulator: Accumulatable, - ): - if len(items) == 0: - return accumulator - with rich_bar() as progress: - p_id = progress.add_task( - self.desc, total=len(items), unit=self.unit, disable=not self.status - ) - return ( - accumulate( - progress.track( - map(function, (c for c in items)), - total=len(items), - task_id=p_id, - ), - accumulator, - ), - 0, - ) - - -@dataclass -class FuturesExecutor(ExecutorBase): - """Execute using multiple local cores using python futures - - Parameters - ---------- - items : list - List of input arguments - function : callable - A function to be called on each input, which returns an accumulator instance - accumulator : Accumulatable - An accumulator to collect the output of the function - pool : concurrent.futures.Executor class or instance, optional - The type of futures executor to use, defaults to ProcessPoolExecutor. 
- You can pass an instance instead of a class to reuse an executor - workers : int, optional - Number of parallel processes for futures (default 1) - status : bool, optional - If true (default), enable progress bar - desc : str, optional - Label of progress description (default: 'Processing') - unit : str, optional - Label of progress bar unit (default: 'items') - compression : int, optional - Compress accumulator outputs in flight with LZ4, at level specified (default 1) - Set to ``None`` for no compression. - recoverable : bool, optional - Instead of raising an Exception right away, the exception is captured and returned - up for custom parsing. Already completed items will be returned as well. - checkpoints : bool - To do - merging : bool | tuple(int, int, int), optional - Enables submitting intermediate merge jobs to the executor. Format is - (n_batches, min_batch_size, max_batch_size). Passing ``True`` will use the default: (5, 4, 100), - i.e. as jobs complete, try to split them into 5 merge batches, each of at least 4 and at most 100 items. - Default is ``False`` - results get merged as they finish in the main process. - nparts : int, optional - Number of merge jobs to create at a time. Can also be passed via ``merging=(X, ..., ...)`` - minred : int, optional - Minimum number of items to merge in one job. Can also be passed via ``merging=(..., X, ...)`` - maxred : int, optional - Maximum number of items to merge in one job. Can also be passed via ``merging=(..., ..., X)`` - mergepool : concurrent.futures.Executor class or instance | int, optional - Supply an additional executor to process merge jobs independently. - An ``int`` will be interpreted as ``ProcessPoolExecutor(max_workers=int)``. - tailtimeout : int, optional - Timeout requirement on job tails. Cancel all remaining jobs if none have finished - in the timeout window. - """ - - pool: Union[ - Callable[..., concurrent.futures.Executor], concurrent.futures.Executor - ] = concurrent.futures.ProcessPoolExecutor # fmt: skip - mergepool: Optional[ - Union[ - Callable[..., concurrent.futures.Executor], - concurrent.futures.Executor, - bool, - ] - ] = None - recoverable: bool = False - merging: Union[bool, Tuple[int, int, int]] = False - workers: int = 1 - tailtimeout: Optional[int] = None - - def __post_init__(self): - if not ( - isinstance(self.merging, bool) - or (isinstance(self.merging, tuple) and len(self.merging) == 3) - ): - raise ValueError( - f"merging={self.merging} not understood. 
Required format is " - "(n_batches, min_batch_size, max_batch_size)" - ) - elif self.merging is True: - self.merging = (5, 4, 100) - - def _merge_size(self, size: int): - return min(self.merging[2], max(size // self.merging[0] + 1, self.merging[1])) - - def __getstate__(self): - return dict(self.__dict__, pool=None) - - def __call__( - self, - items: Iterable, - function: Callable, - accumulator: Accumulatable, - ): - if len(items) == 0: - return accumulator - if self.compression is not None: - function = _compression_wrapper(self.compression, function) - reducer = _reduce(self.compression) - - def _processwith(pool, mergepool): - FH = _FuturesHolder( - {pool.submit(function, item) for item in items}, refresh=2 - ) - - try: - if mergepool is None: - merged = _watcher(FH, self, reducer, pool) - else: - merged = _watcher(FH, self, reducer, mergepool) - return accumulate([_decompress(merged), accumulator]), 0 - - except Exception as e: - traceback.print_exc() - if self.recoverable: - print("Exception occurred, recovering progress...") - for job in FH.futures: - job.cancel() - - merged = _wait_for_merges(FH, self) - return accumulate([_decompress(merged), accumulator]), e - else: - raise e from None - - if isinstance(self.pool, concurrent.futures.Executor): - return _processwith(pool=self.pool, mergepool=self.mergepool) - else: - # assume its a class then - with ExitStack() as stack: - poolinstance = stack.enter_context(self.pool(max_workers=self.workers)) - if self.mergepool is not None: - if isinstance(self.mergepool, int): - self.mergepool = concurrent.futures.ProcessPoolExecutor( - max_workers=self.mergepool - ) - mergepoolinstance = stack.enter_context(self.mergepool) - else: - mergepoolinstance = None - return _processwith(pool=poolinstance, mergepool=mergepoolinstance) - - -@dataclass -class DaskExecutor(ExecutorBase): - """Execute using dask futures - - Parameters - ---------- - items : list - List of input arguments - function : callable - A function to be called on each input, which returns an accumulator instance - accumulator : Accumulatable - An accumulator to collect the output of the function - client : distributed.client.Client - A dask distributed client instance - treereduction : int, optional - Tree reduction factor for output accumulators (default: 20) - status : bool, optional - If true (default), enable progress bar - compression : int, optional - Compress accumulator outputs in flight with LZ4, at level specified (default 1) - Set to ``None`` for no compression. - priority : int, optional - Task priority, default 0 - retries : int, optional - Number of retries for failed tasks (default: 3) - heavy_input : serializable, optional - Any value placed here will be broadcast to workers and joined to input - items in a tuple (item, heavy_input) that is passed to function. - function_name : str, optional - Name of the function being passed - use_dataframes: bool, optional - Retrieve output as a distributed Dask DataFrame (default: False). - The outputs of individual tasks must be Pandas DataFrames. - - .. note:: If ``heavy_input`` is set, ``function`` is assumed to be pure. 
- """ - - client: Optional["dask.distributed.Client"] = None # noqa - treereduction: int = 20 - priority: int = 0 - retries: int = 3 - heavy_input: Optional[bytes] = None - use_dataframes: bool = False - # secret options - worker_affinity: bool = False - - def __getstate__(self): - return dict(self.__dict__, client=None) - - def __call__( - self, - items: Iterable, - function: Callable, - accumulator: Accumulatable, - ): - if len(items) == 0: - return accumulator - - import dask.dataframe as dd - from dask.distributed import Client - from distributed.scheduler import KilledWorker - - if self.client is None: - self.client = Client(threads_per_worker=1) - - if self.use_dataframes: - self.compression = None - - reducer = _reduce(self.compression) - if self.compression is not None: - function = _compression_wrapper( - self.compression, function, name=self.function_name - ) - - if self.heavy_input is not None: - # client.scatter is not robust against adaptive clusters - # https://github.com/CoffeaTeam/coffea/issues/465 - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "Large object of size") - items = list( - zip( - items, repeat(self.client.submit(lambda x: x, self.heavy_input)) - ) - ) - - work = [] - key_to_item = {} - if self.worker_affinity: - workers = list(self.client.run(lambda: 0)) - - def belongsto(heavy_input, workerindex, item): - if heavy_input is not None: - item = item[0] - hashed = _hash( - (item.fileuuid, item.treename, item.entrystart, item.entrystop) - ) - return hashed % len(workers) == workerindex - - for workerindex, worker in enumerate(workers): - items_worker = [ - item - for item in items - if belongsto(self.heavy_input, workerindex, item) - ] - work_worker = self.client.map( - function, - items_worker, - pure=(self.heavy_input is not None), - priority=self.priority, - retries=self.retries, - workers={worker}, - allow_other_workers=False, - ) - work.extend(work_worker) - key_to_item.update( - { - future.key: item - for future, item in zip(work_worker, items_worker) - } - ) - else: - work = self.client.map( - function, - items, - pure=(self.heavy_input is not None), - priority=self.priority, - retries=self.retries, - ) - key_to_item.update({future.key: item for future, item in zip(work, items)}) - if (self.function_name == "get_metadata") or not self.use_dataframes: - while len(work) > 1: - work = self.client.map( - reducer, - [ - work[i : i + self.treereduction] - for i in range(0, len(work), self.treereduction) - ], - pure=True, - priority=self.priority, - retries=self.retries, - ) - key_to_item.update({future.key: "(output reducer)" for future in work}) - work = work[0] - try: - if self.status: - from distributed import progress - - # FIXME: fancy widget doesn't appear, have to live with boring pbar - progress(work, multi=True, notebook=False) - return ( - accumulate( - [ - work.result() - if self.compression is None - else _decompress(work.result()) - ], - accumulator, - ), - 0, - ) - except KilledWorker as ex: - baditem = key_to_item[ex.task] - if self.heavy_input is not None and isinstance(baditem, tuple): - baditem = baditem[0] - raise RuntimeError( - f"Work item {baditem} caused a KilledWorker exception (likely a segfault or out-of-memory issue)" - ) - else: - if self.status: - from distributed import progress - - progress(work, multi=True, notebook=False) - return {"out": dd.from_delayed(work)}, 0 - - -@dataclass -class ParslExecutor(ExecutorBase): - """Execute using parsl pyapp wrapper - - Parameters - ---------- - items : list - List of 
input arguments - function : callable - A function to be called on each input, which returns an accumulator instance - accumulator : Accumulatable - An accumulator to collect the output of the function - config : parsl.config.Config, optional - A parsl DataFlow configuration object. Necessary if there is no active DataFlowKernel - - .. note:: In general, it is safer to construct the DFK with ``parsl.load(config)`` prior to calling this function - status : bool - If true (default), enable progress bar - unit : str - Label of progress bar unit - desc : str - Label of progress bar description - compression : int, optional - Compress accumulator outputs in flight with LZ4, at level specified (default 1) - Set to ``None`` for no compression. - recoverable : bool, optional - Instead of raising an Exception right away, the exception is captured and returned - up for custom parsing. Already completed items will be returned as well. - merging : bool | tuple(int, int, int), optional - Enables submitting intermediate merge jobs to the executor. Format is - (n_batches, min_batch_size, max_batch_size). Passing ``True`` will use the default: (5, 4, 100), - i.e. as jobs complete, try to split them into 5 merge batches, each of at least 4 and at most 100 items. - Default is ``False`` - results get merged as they finish in the main process. - jobs_executors : list | "all", optional - Labels of the executors (from dfk.config.executors) that will process main jobs. - Default is 'all'. Recommended is ``['jobs']``, while passing ``label='jobs'`` to the primary executor. - merges_executors : list | "all", optional - Labels of the executors (from dfk.config.executors) that will process merge jobs. - Default is 'all'. Recommended is ``['merges']``, while passing ``label='merges'`` to the executor dedicated to merge jobs. - tailtimeout : int, optional - Timeout requirement on job tails. Cancel all remaining jobs if none have finished - in the timeout window. - """ - - tailtimeout: Optional[int] = None - config: Optional["parsl.config.Config"] = None # noqa - recoverable: bool = False - merging: Optional[Union[bool, Tuple[int, int, int]]] = False - jobs_executors: Union[str, List] = "all" - merges_executors: Union[str, List] = "all" - - def __post_init__(self): - if not ( - isinstance(self.merging, bool) - or (isinstance(self.merging, tuple) and len(self.merging) == 3) - ): - raise ValueError( - f"merging={self.merging} not understood. 
Required format is " - "(n_batches, min_batch_size, max_batch_size)" - ) - elif self.merging is True: - self.merging = (5, 4, 100) - - def _merge_size(self, size: int): - return min(self.merging[2], max(size // self.merging[0] + 1, self.merging[1])) - - def __call__( - self, - items: Iterable, - function: Callable, - accumulator: Accumulatable, - ): - if len(items) == 0: - return accumulator - import parsl - from parsl.app.app import python_app - - from .parsl.timeout import timeout - - if self.compression is not None: - function = _compression_wrapper(self.compression, function) - - # Parse config if passed - cleanup = False - try: - parsl.dfk() - except RuntimeError: - cleanup = True - pass - if cleanup and self.config is None: - raise RuntimeError( - "No active parsl DataFlowKernel, must specify a config to construct one" - ) - elif not cleanup and self.config is not None: - raise RuntimeError("An active parsl DataFlowKernel already exists") - elif self.config is not None: - parsl.clear() - parsl.load(self.config) - - # Check config/executors - _exec_avail = [exe.label for exe in parsl.dfk().config.executors] - _execs_tried = ( - [] if self.jobs_executors == "all" else [e for e in self.jobs_executors] - ) - _execs_tried += ( - [] if self.merges_executors == "all" else [e for e in self.merges_executors] - ) - if not all([_e in _exec_avail for _e in _execs_tried]): - raise RuntimeError( - f"Executors: [{','.join(_e for _e in _execs_tried if _e not in _exec_avail)}] not available in the config." - ) - - # Apps - app = timeout(python_app(function, executors=self.jobs_executors)) - reducer = timeout( - python_app(_reduce(self.compression), executors=self.merges_executors) - ) - - FH = _FuturesHolder(set(map(app, items)), refresh=2) - try: - merged = _watcher(FH, self, reducer) - return accumulate([_decompress(merged), accumulator]), 0 - - except Exception as e: - traceback.print_exc() - if self.recoverable: - print("Exception occurred, recovering progress...") - # for job in FH.futures: # NotImplemented in parsl - # job.cancel() - - merged = _wait_for_merges(FH, self) - return accumulate([_decompress(merged), accumulator]), e - else: - raise e from None - finally: - if cleanup: - parsl.dfk().cleanup() - parsl.clear() - - -class ParquetFileUprootShim: - def __init__(self, table, name): - self.table = table - self.name = name - - def array(self, **kwargs): - import awkward - - return awkward.Array(self.table[self.name]) - - -class ParquetFileContext: - def __init__(self, filename): - self.filename = filename - self.filehandle = None - self.branchnames = None - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, exc_traceback): - pass - - def _get_handle(self): - import pyarrow.parquet as pq - - if self.filehandle is None: - self.filehandle = pq.ParquetFile(self.filename) - self.branchnames = { - item.path.split(".")[0] for item in self.filehandle.schema - } - - @property - def num_entries(self): - self._get_handle() - return self.filehandle.metadata.num_rows - - def keys(self): - self._get_handle() - return self.branchnames - - def __iter__(self): - self._get_handle() - return iter(self.branchnames) - - def __getitem__(self, name): - self._get_handle() - if name in self.branchnames: - return ParquetFileUprootShim( - self.filehandle.read([name], use_threads=False), name - ) - else: - return KeyError(name) - - def __contains__(self, name): - self._get_handle() - return name in self.branchnames - - -@dataclass -class Runner: - """A tool to run a processor using uproot 
for data delivery - - A convenience wrapper to submit jobs for a file set, which is a - dictionary of dataset: [file list] entries. Supports only uproot TTree - reading, via NanoEvents or LazyDataFrame. For more customized processing, - e.g. to read other objects from the files and pass them into data frames, - one can write a similar function in their user code. - - Parameters - ---------- - executor : ExecutorBase instance - Executor, which implements a callable with inputs: items, function, accumulator - and performs some action equivalent to: - ``for item in items: accumulator += function(item)`` - pre_executor : ExecutorBase instance - Executor, used to calculate fileset metadata - Defaults to executor - chunksize : int, optional - Maximum number of entries to process at a time in the data frame, default: 100k - maxchunks : int, optional - Maximum number of chunks to process per dataset - Defaults to processing the whole dataset - metadata_cache : mapping, optional - A dict-like object to use as a cache for (file, tree) metadata that is used to - determine chunking. Defaults to an in-memory LRU cache that holds 100k entries - (about 1MB depending on the length of filenames, etc.). If you edit an input file - (please don't) during a session, the session can be restarted to clear the cache. - dynamic_chunksize : dict, optional - Adapt the chunksize for units of work so that they run within the targets given. - Currently supported are 'wall_time' (in seconds) and 'memory' (in MB). - E.g., with {"wall_time": 120, "memory": 2048}, the chunksize will - be dynamically adapted so that processing jobs each run in about - two minutes, using two GB of memory. (Currently only for the WorkQueueExecutor.) - """ - - executor: ExecutorBase - pre_executor: Optional[ExecutorBase] = None - chunksize: int = 100000 - maxchunks: Optional[int] = None - metadata_cache: Optional[MutableMapping] = None - dynamic_chunksize: Optional[Dict] = None - skipbadfiles: bool = False - xrootdtimeout: Optional[int] = 60 - align_clusters: bool = False - savemetrics: bool = False - mmap: bool = False - schema: Optional[schemas.BaseSchema] = schemas.BaseSchema - cachestrategy: Optional[ - Union[Literal["dask-worker"], Callable[..., MutableMapping]] - ] = None # fmt: skip - processor_compression: int = 1 - use_skyhook: Optional[bool] = False - skyhook_options: Optional[Dict] = field(default_factory=dict) - format: str = "root" - - @staticmethod - def read_coffea_config(): - config_path = None - if "HOME" in os.environ: - config_path = os.path.join(os.environ["HOME"], ".coffea.toml") - elif "_CONDOR_SCRATCH_DIR" in os.environ: - config_path = os.path.join( - os.environ["_CONDOR_SCRATCH_DIR"], ".coffea.toml" - ) - - if config_path is not None and os.path.exists(config_path): - with open(config_path) as f: - return toml.loads(f.read()) - else: - return dict() - - def __post_init__(self): - if self.pre_executor is None: - self.pre_executor = self.executor - - assert isinstance( - self.executor, ExecutorBase - ), "Expected executor to derive from ExecutorBase" - assert isinstance( - self.pre_executor, ExecutorBase - ), "Expected pre_executor to derive from ExecutorBase" - - if self.metadata_cache is None: - self.metadata_cache = DEFAULT_METADATA_CACHE - - if self.align_clusters and self.dynamic_chunksize: - raise RuntimeError( - "align_clusters and dynamic_chunksize cannot be used simultaneously" - ) - if self.maxchunks and self.dynamic_chunksize: - raise RuntimeError( - "maxchunks and dynamic_chunksize cannot be used simultaneously" 
- ) - if self.dynamic_chunksize and not isinstance(self.executor, WorkQueueExecutor): - raise RuntimeError( - "dynamic_chunksize currently only supported by the WorkQueueExecutor" - ) - - assert self.format in ("root", "parquet") - - @property - def retries(self): - if isinstance(self.executor, DaskExecutor): - retries = 0 - else: - retries = getattr(self.executor, "retries", 0) - assert retries >= 0 - return retries - - @property - def use_dataframes(self): - if isinstance(self.executor, DaskExecutor): - return self.executor.use_dataframes - else: - return False - - @staticmethod - def get_cache(cachestrategy): - cache = None - if cachestrategy == "dask-worker": - from distributed import get_worker - - from coffea.processor.dask import ColumnCache - - worker = get_worker() - try: - cache = worker.plugins[ColumnCache.name] - except KeyError: - # emit warning if not found? - pass - elif callable(cachestrategy): - cache = cachestrategy() - return cache - - @staticmethod - def automatic_retries(retries: int, skipbadfiles: bool, func, *args, **kwargs): - """This should probably defined on Executor-level.""" - import warnings - - retry_count = 0 - while retry_count <= retries: - try: - return func(*args, **kwargs) - # catch xrootd errors and optionally skip - # or retry to read the file - except Exception as e: - chain = _exception_chain(e) - if skipbadfiles and any( - isinstance(c, (FileNotFoundError, UprootMissTreeError)) - for c in chain - ): - warnings.warn(str(e)) - break - if ( - skipbadfiles - and (retries == retry_count) - and any( - e in str(c) - for c in chain - for e in [ - "Invalid redirect URL", - "Operation expired", - "Socket timeout", - ] - ) - ): - warnings.warn(str(e)) - break - if ( - not skipbadfiles - or any("Auth failed" in str(c) for c in chain) - or retries == retry_count - ): - raise e - warnings.warn("Attempt %d of %d." % (retry_count + 1, retries + 1)) - retry_count += 1 - - @staticmethod - def _normalize_fileset( - fileset: Dict, - treename: str, - ) -> Generator[FileMeta, None, None]: - if isinstance(fileset, str): - with open(fileset) as fin: - fileset = json.load(fin) - elif not isinstance(fileset, Mapping): - raise ValueError("Expected fileset to be a path string or mapping") - reserved_metakeys = _PROTECTED_NAMES - for dataset, filelist in fileset.items(): - user_meta = None - if isinstance(filelist, dict): - user_meta = filelist["metadata"] if "metadata" in filelist else None - if user_meta is not None: - for rkey in reserved_metakeys: - if rkey in user_meta.keys(): - raise ValueError( - f'Reserved word "{rkey}" in metadata section of fileset dictionary, please rename this entry!' 
- ) - if "treename" not in filelist and treename is None: - raise ValueError( - "treename must be specified if the fileset does not contain tree names" - ) - local_treename = ( - filelist["treename"] if "treename" in filelist else treename - ) - filelist = filelist["files"] - elif isinstance(filelist, list): - if treename is None: - raise ValueError( - "treename must be specified if the fileset does not contain tree names" - ) - local_treename = treename - else: - raise ValueError( - "list of filenames in fileset must be a list or a dict" - ) - for filename in filelist: - yield FileMeta(dataset, filename, local_treename, user_meta) - - @staticmethod - def metadata_fetcher_root( - xrootdtimeout: int, align_clusters: bool, item: FileMeta - ) -> Accumulatable: - with uproot.open({item.filename: None}, timeout=xrootdtimeout) as file: - try: - tree = file[item.treename] - except uproot.exceptions.KeyInFileError as e: - raise UprootMissTreeError(str(e)) from e - - metadata = {} - if item.metadata: - metadata.update(item.metadata) - metadata.update({"numentries": tree.num_entries, "uuid": file.file.fUUID}) - if align_clusters: - metadata["clusters"] = tree.common_entry_offsets() - out = set_accumulator( - [FileMeta(item.dataset, item.filename, item.treename, metadata)] - ) - return out - - @staticmethod - def metadata_fetcher_parquet(item: FileMeta): - with ParquetFileContext(item.filename) as file: - metadata = {} - if item.metadata: - metadata.update(item.metadata) - metadata.update( - {"numentries": file.num_entries, "uuid": b"NO_UUID_0000_000"} - ) - out = set_accumulator( - [FileMeta(item.dataset, item.filename, item.treename, metadata)] - ) - return out - - def _preprocess_fileset_root(self, fileset: Dict) -> None: - # this is a bit of an abuse of map-reduce but ok - to_get = { - filemeta - for filemeta in fileset - if not filemeta.populated(clusters=self.align_clusters) - } - if len(to_get) > 0: - out = set_accumulator() - pre_arg_override = { - "function_name": "get_metadata", - "desc": "Preprocessing", - "unit": "file", - "compression": None, - } - if isinstance(self.pre_executor, (FuturesExecutor, ParslExecutor)): - pre_arg_override.update({"tailtimeout": None}) - if isinstance(self.pre_executor, (DaskExecutor)): - self.pre_executor.heavy_input = None - pre_arg_override.update({"worker_affinity": False}) - pre_executor = self.pre_executor.copy(**pre_arg_override) - closure = partial( - self.automatic_retries, - self.retries, - self.skipbadfiles, - partial( - self.metadata_fetcher_root, self.xrootdtimeout, self.align_clusters - ), - ) - out, _ = pre_executor(to_get, closure, out) - while out: - item = out.pop() - self.metadata_cache[item] = item.metadata - for filemeta in fileset: - filemeta.maybe_populate(self.metadata_cache) - - def _preprocess_fileset_parquet(self, fileset: Dict) -> None: - # this is a bit of an abuse of map-reduce but ok - to_get = { - filemeta - for filemeta in fileset - if not filemeta.populated(clusters=self.align_clusters) - } - if len(to_get) > 0: - out = set_accumulator() - pre_arg_override = { - "function_name": "get_metadata", - "desc": "Preprocessing", - "unit": "file", - "compression": None, - } - if isinstance(self.pre_executor, (FuturesExecutor, ParslExecutor)): - pre_arg_override.update({"tailtimeout": None}) - if isinstance(self.pre_executor, (DaskExecutor)): - self.pre_executor.heavy_input = None - pre_arg_override.update({"worker_affinity": False}) - pre_executor = self.pre_executor.copy(**pre_arg_override) - closure = partial( - 
self.automatic_retries, - self.retries, - self.skipbadfiles, - self.metadata_fetcher_parquet, - ) - out, _ = pre_executor(to_get, closure, out) - while out: - item = out.pop() - self.metadata_cache[item] = item.metadata - for filemeta in fileset: - filemeta.maybe_populate(self.metadata_cache) - - def _filter_badfiles(self, fileset: Dict) -> List: - final_fileset = [] - for filemeta in fileset: - if filemeta.populated(clusters=self.align_clusters): - final_fileset.append(filemeta) - elif not self.skipbadfiles: - raise RuntimeError( - f"Metadata for file {filemeta.filename} could not be accessed." - ) - return final_fileset - - def _chunk_generator(self, fileset: Dict, treename: str) -> Generator: - config = None - if self.use_skyhook: - config = Runner.read_coffea_config() - if not self.use_skyhook and (self.format == "root" or self.format == "parquet"): - if self.maxchunks is None: - last_chunksize = self.chunksize - for filemeta in fileset: - last_chunksize = yield from filemeta.chunks( - last_chunksize, - self.align_clusters, - ) - else: - # get just enough file info to compute chunking - nchunks = defaultdict(int) - chunks = [] - for filemeta in fileset: - if nchunks[filemeta.dataset] >= self.maxchunks: - continue - for chunk in filemeta.chunks(self.chunksize, self.align_clusters): - chunks.append(chunk) - nchunks[filemeta.dataset] += 1 - if nchunks[filemeta.dataset] >= self.maxchunks: - break - yield from (c for c in chunks) - else: - if self.use_skyhook and not config.get("skyhook", None): - print("No skyhook config found, using defaults") - config["skyhook"] = dict() - - dataset_filelist_map = {} - if self.use_skyhook: - import pyarrow.dataset as ds - - for dataset, basedir in fileset.items(): - ds_ = ds.dataset(basedir, format="parquet") - dataset_filelist_map[dataset] = ds_.files - else: - for dataset, maybe_filelist in fileset.items(): - if isinstance(maybe_filelist, list): - dataset_filelist_map[dataset] = maybe_filelist - elif isinstance(maybe_filelist, dict): - if "files" not in maybe_filelist: - raise ValueError( - "Dataset definition must have key 'files' defined!" 
- ) - dataset_filelist_map[dataset] = maybe_filelist["files"] - else: - raise ValueError( - "Dataset definition in fileset must be dict[str: list[str]] or dict[str: dict[str: Any]]" - ) - chunks = [] - for dataset, filelist in dataset_filelist_map.items(): - for filename in filelist: - # If skyhook config is provided and is not empty, - if self.use_skyhook: - ceph_config_path = config["skyhook"].get( - "ceph_config_path", "/etc/ceph/ceph.conf" - ) - ceph_data_pool = config["skyhook"].get( - "ceph_data_pool", "cephfs_data" - ) - filename = f"{ceph_config_path}:{ceph_data_pool}:{filename}" - chunks.append( - WorkItem( - dataset, - filename, - treename, - 0, - 0, - "", - fileset[dataset]["metadata"] - if "metadata" in fileset[dataset] - else None, - ) - ) - yield from iter(chunks) - - @staticmethod - def _work_function( - format: str, - xrootdtimeout: int, - mmap: bool, - schema: schemas.BaseSchema, - cache_function: Callable[[], MutableMapping], - use_dataframes: bool, - savemetrics: bool, - item: WorkItem, - processor_instance: ProcessorABC, - ) -> Dict: - if processor_instance == "heavy": - item, processor_instance = item - if not isinstance(processor_instance, ProcessorABC): - processor_instance = cloudpickle.loads(lz4f.decompress(processor_instance)) - - if format == "root": - filecontext = uproot.open( - {item.filename: None}, - timeout=xrootdtimeout, - file_handler=uproot.MemmapSource - if mmap - else uproot.MultithreadedFileSource, - ) - elif format == "parquet": - filecontext = ParquetFileContext(item.filename) - - metadata = { - "dataset": item.dataset, - "filename": item.filename, - "treename": item.treename, - "entrystart": item.entrystart, - "entrystop": item.entrystop, - "fileuuid": str(uuid.UUID(bytes=item.fileuuid)) - if len(item.fileuuid) > 0 - else "", - } - if item.usermeta is not None: - metadata.update(item.usermeta) - - with filecontext as file: - if schema is None: - # To deprecate - tree = None - events = None - if format == "root": - tree = file[item.treename] - events = uproot.dask(tree, ak_add_doc=True)[ - item.entrystart : item.entrystop - ] - setattr(events, "metadata", metadata) - elif format == "parquet": - import dask_awkward - - tree = file - events = dask_awkward.from_parquet(item.filename)[ - item.entrystart : item.entrystop - ] - setattr(events, "metadata", metadata) - else: - raise ValueError("Format can only be root or parquet!") - elif issubclass(schema, schemas.BaseSchema): - # change here - if format == "root": - materialized = [] - factory = NanoEventsFactory.from_root( - file=file, - treepath=item.treename, - persistent_cache=cache_function(), - schemaclass=schema, - metadata=metadata, - access_log=materialized, - delayed=True, - ) - events = factory.events()[item.entrystart : item.entrystop] - elif format == "parquet": - skyhook_options = {} - if ":" in item.filename: - ( - ceph_config_path, - ceph_data_pool, - filename, - ) = item.filename.split(":") - # patch back filename into item - item = WorkItem(**dict(asdict(item), filename=filename)) - skyhook_options["ceph_config_path"] = ceph_config_path - skyhook_options["ceph_data_pool"] = ceph_data_pool - - factory = NanoEventsFactory.from_parquet( - file=item.filename, - treepath=item.treename, - schemaclass=schema, - metadata=metadata, - skyhook_options=skyhook_options, - permit_dask=True, - ) - events = factory.events()[item.entrystart : item.entrystop] - else: - raise ValueError( - "Expected schema to derive from nanoevents.BaseSchema, instead got %r" - % schema - ) - tic = time.time() - try: - out 
= None - if isinstance(events, LazyDataFrame): - out = processor_instance.process(events) - else: - import dask - import dask_awkward - - to_compute = processor_instance.process(events) - # materialized = dask_awkward.report_necessary_buffers(to_compute) - out = dask.compute(to_compute, scheduler="single-threaded")[0] - except Exception as e: - raise Exception(f"Failed processing file: {item!r}") from e - if out is None: - raise ValueError( - "Output of process() should not be None. Make sure your processor's process() function returns an accumulator." - ) - toc = time.time() - if use_dataframes: - return out - else: - if savemetrics: - metrics = {} - if isinstance(file, uproot.ReadOnlyDirectory): - metrics["bytesread"] = file.file.source.num_requested_bytes - # metrics["data_and_shape_buffers"] = set(materialized) - # metrics["shape_only_buffers"] = set(materialized) - if schema is not None and issubclass(schema, schemas.BaseSchema): - metrics["entries"] = len(events) - else: - metrics["entries"] = events.size - metrics["processtime"] = toc - tic - return {"out": out, "metrics": metrics, "processed": {item}} - return {"out": out, "processed": {item}} - - def __call__( - self, - fileset: Dict, - treename: str, - processor_instance: ProcessorABC, - ) -> Accumulatable: - """Run the processor_instance on a given fileset - - Parameters - ---------- - fileset : dict - A dictionary ``{dataset: [file, file], }`` - Optionally, if some files' tree name differ, the dictionary can be specified: - ``{dataset: {'treename': 'name', 'files': [file, file]}, }`` - treename : str - name of tree inside each root file, can be ``None``; - treename can also be defined in fileset, which will override the passed treename - processor_instance : ProcessorABC - An instance of a class deriving from ProcessorABC - """ - - wrapped_out = self.run(fileset, processor_instance, treename) - if self.use_dataframes: - return wrapped_out # not wrapped anymore - if self.savemetrics: - return wrapped_out["out"], wrapped_out["metrics"] - return wrapped_out["out"] - - def preprocess( - self, - fileset: Dict, - treename: str, - ) -> Generator: - """Run the processor_instance on a given fileset - - Parameters - ---------- - fileset : dict - A dictionary ``{dataset: [file, file], }`` - Optionally, if some files' tree name differ, the dictionary can be specified: - ``{dataset: {'treename': 'name', 'files': [file, file]}, }`` - treename : str - name of tree inside each root file, can be ``None``; - treename can also be defined in fileset, which will override the passed treename - """ - - if not isinstance(fileset, (Mapping, str)): - raise ValueError( - "Expected fileset to be a mapping dataset: list(files) or filename" - ) - if self.format == "root": - fileset = list(self._normalize_fileset(fileset, treename)) - for filemeta in fileset: - filemeta.maybe_populate(self.metadata_cache) - - self._preprocess_fileset_root(fileset) - fileset = self._filter_badfiles(fileset) - - # reverse fileset list to match the order of files as presented in version - # v0.7.4. This fixes tests using maxchunks. - fileset.reverse() - elif self.format == "parquet": - fileset = list(self._normalize_fileset(fileset, treename)) - for filemeta in fileset: - filemeta.maybe_populate(self.metadata_cache) - - self._preprocess_fileset_parquet(fileset) - fileset = self._filter_badfiles(fileset) - - # reverse fileset list to match the order of files as presented in version - # v0.7.4. This fixes tests using maxchunks. 
- fileset.reverse() - - return self._chunk_generator(fileset, treename) - - def run( - self, - fileset: Union[Dict, str, List[WorkItem], Generator], - processor_instance: ProcessorABC, - treename: str = None, - ) -> Accumulatable: - """Run the processor_instance on a given fileset - - Parameters - ---------- - fileset : dict | str | List[WorkItem] | Generator - - A dictionary ``{dataset: [file, file], }`` - Optionally, if some files' tree name differ, the dictionary can be specified: - ``{dataset: {'treename': 'name', 'files': [file, file]}, }`` - - A single file name - - File chunks for self.preprocess() - - Chunk generator - treename : str, optional - name of tree inside each root file, can be ``None``; - treename can also be defined in fileset, which will override the passed treename - Not needed if processing premade chunks - processor_instance : ProcessorABC - An instance of a class deriving from ProcessorABC - """ - - meta = False - if not isinstance(fileset, (Mapping, str)): - if isinstance(fileset, Generator) or isinstance(fileset[0], WorkItem): - meta = True - else: - raise ValueError( - "Expected fileset to be a mapping dataset: list(files) or filename" - ) - if not isinstance(processor_instance, ProcessorABC): - raise ValueError("Expected processor_instance to derive from ProcessorABC") - - if meta: - chunks = fileset - else: - chunks = self.preprocess(fileset, treename) - - if self.processor_compression is None: - pi_to_send = processor_instance - else: - pi_to_send = lz4f.compress( - cloudpickle.dumps(processor_instance), - compression_level=self.processor_compression, - ) - # hack around dask/dask#5503 which is really a silly request but here we are - if isinstance(self.executor, DaskExecutor): - self.executor.heavy_input = pi_to_send - closure = partial( - self._work_function, - self.format, - self.xrootdtimeout, - self.mmap, - self.schema, - partial(self.get_cache, self.cachestrategy), - self.use_dataframes, - self.savemetrics, - processor_instance="heavy", - ) - else: - closure = partial( - self._work_function, - self.format, - self.xrootdtimeout, - self.mmap, - self.schema, - partial(self.get_cache, self.cachestrategy), - self.use_dataframes, - self.savemetrics, - processor_instance=pi_to_send, - ) - - if self.format == "root" and isinstance(self.executor, WorkQueueExecutor): - # keep chunks in generator, use a copy to count number of events - # this is cheap, as we are reading from the cache - chunks_to_count = self.preprocess(fileset, treename) - else: - # materialize chunks to list, then count that list - chunks = list(chunks) - chunks_to_count = chunks - - events_total = sum(len(c) for c in chunks_to_count) - - exe_args = { - "unit": "chunk", - "function_name": type(processor_instance).__name__, - } - if isinstance(self.executor, WorkQueueExecutor): - exe_args.update( - { - "unit": "event", - "events_total": events_total, - "dynamic_chunksize": self.dynamic_chunksize, - "chunksize": self.chunksize, - } - ) - - closure = partial( - self.automatic_retries, self.retries, self.skipbadfiles, closure - ) - - executor = self.executor.copy(**exe_args) - wrapped_out, e = executor(chunks, closure, None) - if wrapped_out is None: - raise ValueError( - "No chunks returned results, verify ``processor`` instance structure.\n\ - if you used skipbadfiles=True, it is possible all your files are bad." 
- ) - wrapped_out["exception"] = e - - if not self.use_dataframes: - processor_instance.postprocess(wrapped_out["out"]) - - if "metrics" in wrapped_out.keys(): - wrapped_out["metrics"]["chunks"] = len(chunks) - for k, v in wrapped_out["metrics"].items(): - if isinstance(v, set): - wrapped_out["metrics"][k] = list(v) - if self.use_dataframes: - return wrapped_out["out"] - else: - return wrapped_out - - -def run_spark_job( - fileset, - processor_instance, - executor, - executor_args={}, - spark=None, - partitionsize=200000, - thread_workers=16, -): - """A wrapper to submit spark jobs - - A convenience wrapper to submit jobs for spark datasets, which is a - dictionary of dataset: [file list] entries. Presently supports reading of - parquet files converted from root. For more customized processing, - e.g. to read other objects from the files and pass them into data frames, - one can write a similar function in their user code. - - Parameters - ---------- - fileset : dict - dictionary {dataset: [file, file], } - processor_instance : ProcessorABC - An instance of a class deriving from ProcessorABC - - .. note:: The processor instance must define all the columns in data and MC that it reads as ``.columns`` - executor: - anything that inherits from `SparkExecutor` like `spark_executor` - - In general, a function that takes 3 arguments: items, function, accumulator - and performs some action equivalent to: - for item in items: accumulator += function(item) - executor_args: - arguments to send to the creation of a spark session - spark: - an optional, already-created spark instance - - if ``None`` then we create an ephemeral spark instance using a config - partitionsize: - partition size to try to aim for (coalesce only, repartition too expensive) - thread_workers: - how many spark jobs to let fly in parallel during processing steps - """ - - try: - import pyspark - except ImportError as e: - print( - "you must have pyspark installed to call run_spark_job()!", file=sys.stderr - ) - raise e - - import warnings - - import pyarrow as pa - from packaging import version - - arrow_env = ("ARROW_PRE_0_15_IPC_FORMAT", "1") - if version.parse(pa.__version__) >= version.parse("0.15.0") and version.parse( - pyspark.__version__ - ) < version.parse("3.0.0"): - import os - - if arrow_env[0] not in os.environ or os.environ[arrow_env[0]] != arrow_env[1]: - warnings.warn( - "If you are using pyarrow >= 0.15.0, make sure to set %s=%s in your environment!" 
- % arrow_env - ) - - import pyspark.sql - - from .spark.detail import _spark_initialize, _spark_make_dfs, _spark_stop - from .spark.spark_executor import SparkExecutor - - if not isinstance(fileset, Mapping): - raise ValueError("Expected fileset to be a mapping dataset: list(files)") - if not isinstance(processor_instance, ProcessorABC): - raise ValueError("Expected processor_instance to derive from ProcessorABC") - if not isinstance(executor, SparkExecutor): - raise ValueError("Expected executor to derive from SparkExecutor") - - executor_args.setdefault("config", None) - executor_args.setdefault("file_type", "parquet") - executor_args.setdefault("laurelin_version", "1.1.1") - executor_args.setdefault("treeName", "Events") - executor_args.setdefault("schema", None) - executor_args.setdefault("cache", True) - executor_args.setdefault("skipbadfiles", False) - executor_args.setdefault("retries", 0) - executor_args.setdefault("xrootdtimeout", None) - file_type = executor_args["file_type"] - treeName = executor_args["treeName"] - schema = executor_args["schema"] - if "flatten" in executor_args: - raise ValueError( - "Executor argument 'flatten' is deprecated, please refactor your processor to accept awkward arrays" - ) - if "nano" in executor_args: - raise ValueError( - "Awkward0 NanoEvents no longer supported.\n" - "Please use 'schema': processor.NanoAODSchema to enable awkward NanoEvents processing." - ) - use_cache = executor_args["cache"] - - if executor_args["config"] is None: - executor_args.pop("config") - - # initialize spark if we need to - # if we initialize, then we deconstruct - # when we're done - killSpark = False - if spark is None: - spark = _spark_initialize(**executor_args) - killSpark = True - use_cache = False # if we always kill spark then we cannot use the cache - else: - if not isinstance(spark, pyspark.sql.session.SparkSession): - raise ValueError( - "Expected 'spark' to be a pyspark.sql.session.SparkSession" - ) - - dfslist = {} - if executor._cacheddfs is None: - dfslist = _spark_make_dfs( - spark, - fileset, - partitionsize, - processor_instance.columns, - thread_workers, - file_type, - treeName, - ) - - output = executor( - spark, dfslist, processor_instance, None, thread_workers, use_cache, schema - ) - processor_instance.postprocess(output) - - if killSpark: - _spark_stop(spark) - del spark - spark = None - - return output diff --git a/src/coffea/processor/helpers.py b/src/coffea/processor/helpers.py deleted file mode 100644 index dcdf03ba2..000000000 --- a/src/coffea/processor/helpers.py +++ /dev/null @@ -1,273 +0,0 @@ -import numpy - -from coffea.util import deprecate - - -class Weights: - """Container for event weights and associated systematic shifts - - This container keeps track of correction factors and systematic - effects that can be encoded as multiplicative modifiers to the event weight. - All weights are stored in vector form. - - Parameters - ---------- - size : int - size of the weight arrays to be handled (i.e. the number of events / instances). - storeIndividual : bool, optional - store not only the total weight + variations, but also each individual weight. - Default is false. - """ - - def __init__(self, size, storeIndividual=False): - deprecate( - RuntimeError( - "This utility has moved to the `coffea.analysis_tools` subpackage and has new features, check it out!" 
- ), - 0.8, - ) - self._weight = numpy.ones(size) - self._weights = {} - self._modifiers = {} - self._weightStats = {} - self._storeIndividual = storeIndividual - - def add(self, name, weight, weightUp=None, weightDown=None, shift=False): - """Add a new weight - - Adds a named correction to the event weight, and optionally also associated - systematic uncertainties. - - Parameters - ---------- - name : str - name of correction - weight : numpy.ndarray - the nominal event weight associated with the correction - weightUp : numpy.ndarray, optional - weight with correction uncertainty shifted up (if available) - weightDown : numpy.ndarray, optional - weight with correction uncertainty shifted down. If ``weightUp`` is supplied, and - the correction uncertainty is symmetric, this can be set to None to auto-calculate - the down shift as ``1 / weightUp``. - shift : bool, optional - if True, interpret weightUp and weightDown as a relative difference (additive) to the - nominal value - - .. note:: ``weightUp`` and ``weightDown`` are assumed to be rvalue-like and may be modified in-place by this function - """ - if name.endswith("Up") or name.endswith("Down"): - raise ValueError( - "Avoid using 'Up' and 'Down' in weight names, instead pass appropriate shifts to add() call" - ) - weight = numpy.array(weight) - self._weight = self._weight * weight - if self._storeIndividual: - self._weights[name] = weight - if weightUp is not None: - weightUp = numpy.array(weightUp) - if shift: - weightUp += weight - weightUp[weight != 0.0] /= weight[weight != 0.0] - self._modifiers[name + "Up"] = weightUp - if weightDown is not None: - weightDown = numpy.array(weightDown) - if shift: - weightDown = weight - weightDown - weightDown[weight != 0.0] /= weight[weight != 0.0] - self._modifiers[name + "Down"] = weightDown - self._weightStats[name] = { - "sumw": weight.sum(), - "sumw2": (weight**2).sum(), - "min": weight.min(), - "max": weight.max(), - "n": weight.size, - } - - def weight(self, modifier=None): - """Current event weight vector - - Parameters - ---------- - modifier : str, optional - if supplied, provide event weight corresponding to a particular - systematic uncertainty shift, of form ``str(name + 'Up')`` or (Down) - - Returns - ------- - weight : numpy.ndarray - The weight vector, possibly modified by the effect of a given systematic variation. - """ - if modifier is None: - return self._weight - elif "Down" in modifier and modifier not in self._modifiers: - return self._weight / self._modifiers[modifier.replace("Down", "Up")] - return self._weight * self._modifiers[modifier] - - def partial_weight(self, include=[], exclude=[]): - """Partial event weight vector - - Return a partial weight by multiplying a subset of all weights. - Can be operated either by specifying weights to include or - weights to exclude, but not both at the same time. The method - can only be used if the individual weights are stored via the - ``storeIndividual`` argument in the `Weights` initializer. - - Parameters - ---------- - include : list | set - Weight names to include, defaults to [] - exclude : list | set - Weight names to exclude, defaults to [] - Returns - ------- - weight : numpy.ndarray - The weight vector, corresponding to only the effect of the - corrections specified. - """ - if not self._storeIndividual: - raise ValueError( - "To be able to request weight exclusion, use storeIndividual=True when creating Weights object." 
- ) - if (include and exclude) or not (include or exclude): - raise ValueError( - "Need to specify exactly one of the 'exclude' or 'include' arguments." - ) - if include and not isinstance(include, (list, set)): - raise ValueError("'include' should be a list or set of weight names") - if exclude and not isinstance(exclude, (list, set)): - raise ValueError("'exclude' should be a list or set of weight names") - - names = set(self._weights.keys()) - if include: - names = names & set(include) - if exclude: - names = names - set(exclude) - - w = numpy.ones(self._weight.size) - for name in names: - w = w * self._weights[name] - - return w - - @property - def variations(self): - """List of available modifiers""" - keys = set(self._modifiers.keys()) - # add any missing 'Down' variation - for k in self._modifiers.keys(): - keys.add(k.replace("Up", "Down")) - return keys - - -class PackedSelection: - """Store boolean mask vectors in a compact manner - - This class can store several boolean masks (cuts, selections) and - evaluate arbitrary combinations of the requirements in an CPU-efficient way - - Parameters - ---------- - dtype : str - internal bitwidth of mask vector, which governs the maximum - number of boolean masks storable in this object. - By default, up to 64 masks can be stored, but smaller values - for the `numpy.dtype` may be more efficient. - """ - - def __init__(self, dtype="uint64"): - """ - TODO: extend to multi-column for arbitrary bit depth - """ - deprecate( - RuntimeError( - "This utility has moved to the `coffea.analysis_tools` subpackage and has new features, check it out!" - ), - 0.8, - ) - self._dtype = numpy.dtype(dtype) - self._names = [] - self._mask = None - - @property - def names(self): - """Current list of mask names available""" - return self._names - - def add(self, name, selection): - """Add a named mask - - Parameters - ---------- - name : str - name of the mask - selection : numpy.ndarray - a flat array of dtype bool. - If not the first mask added, it must also have - the same shape as previously added masks. - """ - if isinstance(selection, numpy.ndarray) and selection.dtype == numpy.dtype( - "bool" - ): - if len(self._names) == 0: - self._mask = numpy.zeros(shape=selection.shape, dtype=self._dtype) - elif len(self._names) == 64: - raise RuntimeError( - "Exhausted all slots for %r, consider a larger dtype or fewer selections" - % self._dtype - ) - elif self._mask.shape != selection.shape: - raise ValueError( - "New selection '%s' has different shape than existing ones (%r vs. %r)" - % (name, selection.shape, self._mask.shape) - ) - self._mask |= selection.astype(self._dtype) << len(self._names) - self._names.append(name) - else: - raise ValueError( - "PackedSelection only understands numpy boolean arrays, got %r" - % selection - ) - - def require(self, **names): - """Return a mask vector corresponding to specific requirements - - Specify an exact requirement on an arbitrary subset of the masks - - Parameters - ---------- - ``**names`` : kwargs - Each argument to require specific value for, in form ``arg=True`` - or ``arg=False``. - - Examples - -------- - If - - >>> selection.names - ['cut1', 'cut2', 'cut3'] - - then - - >>> selection.require(cut1=True, cut2=False) - array([True, False, True, ...]) - - returns a boolean array where each entry passes if the corresponding entry has - ``cut1 == True``, ``cut2 == False``, and ``cut3`` arbitrary. 
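(Editor's aside, closing out the ``require`` example above: a self-contained version with made-up masks, exercising the deprecated API end to end.)

```python
# Editor's sketch: the masks and expected outputs are illustrative only.
import numpy

selection = PackedSelection()
selection.add("cut1", numpy.array([True, False, True]))
selection.add("cut2", numpy.array([False, False, True]))

# Entry passes iff cut1 is True and cut2 is False; cut3-style extras are ignored.
print(selection.require(cut1=True, cut2=False))  # -> [ True False False]

# all() is shorthand for require() with every named cut set to True.
print(selection.all("cut1", "cut2"))             # -> [False False  True]
```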
- """ - mask = 0 - require = 0 - for name, val in names.items(): - if not isinstance(val, bool): - raise ValueError( - "Please use only booleans in PackedSelection.require(), received %r for %s" - % (val, name) - ) - idx = self._names.index(name) - mask |= 1 << idx - require |= int(val) << idx - return (self._mask & mask) == require - - def all(self, *names): - """Shorthand for `require`, where all the values are True""" - return self.require(**{name: True for name in names}) diff --git a/src/coffea/processor/parsl/__init__.py b/src/coffea/processor/parsl/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/coffea/processor/parsl/condor_config.py b/src/coffea/processor/parsl/condor_config.py deleted file mode 100644 index 49c066c81..000000000 --- a/src/coffea/processor/parsl/condor_config.py +++ /dev/null @@ -1,77 +0,0 @@ -import os -import os.path as osp - -from parsl.addresses import address_by_hostname -from parsl.channels import LocalChannel -from parsl.config import Config -from parsl.executors import HighThroughputExecutor -from parsl.providers import CondorProvider - -x509_proxy = f"x509up_u{os.getuid()}" - - -def condor_config( - cores_per_job=4, - mem_per_core=2048, - total_workers=24, - max_workers=200, - pyenv_dir="{}/.local".format(os.environ["HOME"]), - grid_proxy_dir="/tmp", - htex_label="coffea_parsl_condor_htex", - wrk_init=None, - condor_cfg=None, -): - pyenv_relpath = pyenv_dir.split("/")[-1] - - if wrk_init is None: - wrk_init = """ - source /cvmfs/sft.cern.ch/lcg/views/LCG_95apython3/x86_64-centos7-gcc7-opt/setup.sh - export PATH=`pwd`/{}:$PATH - export PYTHONPATH=`pwd`/{}:$PYTHONPATH - - export X509_USER_PROXY=`pwd`/{} - mkdir -p ./{} - """.format( - "%s/bin" % pyenv_relpath, - "%s/lib/python3.6/site-packages" % pyenv_relpath, - x509_proxy, - htex_label, - ) - - if condor_cfg is None: - condor_cfg = """ - transfer_output_files = %s - RequestMemory = %d - RequestCpus = %d - """ % ( - htex_label, - mem_per_core * cores_per_job, - cores_per_job, - ) - - xfer_files = [pyenv_dir, osp.join(grid_proxy_dir, x509_proxy)] - - condor_htex = Config( - executors=[ - HighThroughputExecutor( - label=htex_label, - address=address_by_hostname(), - prefetch_capacity=0, - cores_per_worker=1, - max_workers=cores_per_job, - worker_logdir_root="./", - provider=CondorProvider( - channel=LocalChannel(), - init_blocks=total_workers, - max_blocks=max_workers, - nodes_per_block=1, - worker_init=wrk_init, - transfer_input_files=xfer_files, - scheduler_options=condor_cfg, - ), - ) - ], - strategy=None, - ) - - return condor_htex diff --git a/src/coffea/processor/parsl/detail.py b/src/coffea/processor/parsl/detail.py deleted file mode 100644 index a618d99f5..000000000 --- a/src/coffea/processor/parsl/detail.py +++ /dev/null @@ -1,89 +0,0 @@ -import parsl -from parsl.app.app import python_app -from parsl.channels import LocalChannel -from parsl.config import Config -from parsl.executors import HighThroughputExecutor -from parsl.providers import LocalProvider - -from ..executor import _futures_handler -from .timeout import timeout - -_default_cfg = Config( - executors=[ - HighThroughputExecutor( - label="coffea_parsl_default", - cores_per_worker=1, - provider=LocalProvider( - channel=LocalChannel(), - init_blocks=1, - max_blocks=1, - ), - ) - ], - strategy=None, -) - - -def _parsl_initialize(config=None): - parsl.clear() - parsl.load(config) - - -def _parsl_stop(): - parsl.dfk().cleanup() - parsl.clear() - - -@timeout -@python_app -def derive_chunks(filename, treename, 
chunksize, ds, timeout=10): - from collections.abc import Sequence - - import uproot - - uproot.XRootDSource.defaults["parallel"] = False - - a_file = uproot.open({filename: None}) - - tree = None - if isinstance(treename, str): - tree = a_file[treename] - elif isinstance(treename, Sequence): - for name in reversed(treename): - if name in a_file: - tree = a_file[name] - else: - raise Exception( - "treename must be a str or Sequence but is a %s!" % repr(type(treename)) - ) - - if tree is None: - raise Exception( - "No tree found, out of possible tree names: %s" % repr(treename) - ) - - nentries = tree.numentries - return ( - ds, - treename, - [(filename, chunksize, index) for index in range(nentries // chunksize + 1)], - ) - - -def _parsl_get_chunking(filelist, chunksize, status=True, timeout=10): - futures = { - derive_chunks(fn, tn, chunksize, ds, timeout=timeout) for ds, fn, tn in filelist - } - - items = [] - - def chunk_accumulator(total, result): - ds, treename, chunks = result - for chunk in chunks: - total.append((ds, chunk[0], treename, chunk[1], chunk[2])) - - _futures_handler( - futures, items, status, "files", "Preprocessing", chunk_accumulator, None - ) - - return items diff --git a/src/coffea/processor/parsl/slurm_config.py b/src/coffea/processor/parsl/slurm_config.py deleted file mode 100644 index 48c33ae01..000000000 --- a/src/coffea/processor/parsl/slurm_config.py +++ /dev/null @@ -1,67 +0,0 @@ -import os -import os.path as osp -import shutil - -from parsl.addresses import address_by_hostname -from parsl.channels import LocalChannel -from parsl.config import Config -from parsl.executors import HighThroughputExecutor -from parsl.launchers import SrunLauncher -from parsl.providers import SlurmProvider - -x509_proxy = "x509up_u%s" % (os.getuid()) - - -def slurm_config( - cores_per_job=16, - mem_per_core=2048, - jobs_per_worker=1, - initial_workers=4, - max_workers=8, - work_dir="./", - grid_proxy_dir="/tmp", - partition="", - walltime="02:00:00", - htex_label="coffea_parsl_slurm_htex", -): - shutil.copy2(osp.join(grid_proxy_dir, x509_proxy), osp.join(work_dir, x509_proxy)) - - wrk_init = """ - export XRD_RUNFORKHANDLER=1 - export X509_USER_PROXY=%s - """ % ( - osp.join(work_dir, x509_proxy) - ) - - sched_opts = """ - #SBATCH --cpus-per-task=%d - #SBATCH --mem-per-cpu=%d - """ % ( - cores_per_job, - mem_per_core, - ) - - slurm_htex = Config( - executors=[ - HighThroughputExecutor( - label=htex_label, - address=address_by_hostname(), - prefetch_capacity=0, - max_workers=cores_per_job, - provider=SlurmProvider( - channel=LocalChannel(), - launcher=SrunLauncher(), - init_blocks=initial_workers, - max_blocks=max_workers, - nodes_per_block=jobs_per_worker, - partition=partition, - scheduler_options=sched_opts, # Enter scheduler_options if needed - worker_init=wrk_init, # Enter worker_init if needed - walltime=walltime, - ), - ) - ], - strategy=None, - ) - - return slurm_htex diff --git a/src/coffea/processor/parsl/timeout.py b/src/coffea/processor/parsl/timeout.py deleted file mode 100644 index 35c7b42dc..000000000 --- a/src/coffea/processor/parsl/timeout.py +++ /dev/null @@ -1,21 +0,0 @@ -from functools import wraps - - -def timeout(func): - @wraps(func) - def wrapper(*args, **kwargs): - import signal - - def _timeout_handler(signum, frame): - raise Exception("Timeout hit") - - signal.signal(signal.SIGALRM, _timeout_handler) - if kwargs.get("timeout"): - signal.alarm(max(1, int(kwargs["timeout"]))) - try: - result = func(*args, **kwargs) - finally: - signal.alarm(0) - return result 
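# --- Added example (not from the source): how the decorator above was applied.
# The wrapped function must accept a ``timeout`` keyword; if it does not return
# within that many seconds, the SIGALRM handler raises Exception("Timeout hit").
# The function below is a hypothetical stand-in.
@timeout
def slow_open(path, timeout=None):
    import time

    time.sleep(2)  # stand-in for slow remote I/O
    return path

slow_open("root://host//file.root", timeout=5)  # returns; would raise after 5 s if stuck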
- - return wrapper diff --git a/src/coffea/processor/processor.py b/src/coffea/processor/processor.py index edc2d6162..6cbc427a8 100644 --- a/src/coffea/processor/processor.py +++ b/src/coffea/processor/processor.py @@ -22,7 +22,7 @@ def __init__(self, flag=False): self._flag = flag def process(self, events): - out = {"sumw": len(events)} + out = {"sumw": ak.num(events, axis=0)} # ... diff --git a/src/coffea/processor/servicex/__init__.py b/src/coffea/processor/servicex/__init__.py deleted file mode 100644 index 12d26e635..000000000 --- a/src/coffea/processor/servicex/__init__.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2019, IRIS-HEP -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from .analysis import * -from .dask_executor import * -from .data_source import * -from .local_executor import * - -__all__ = [ - "DataSource", - "Analysis", - "LocalExecutor", - "DaskExecutor", -] diff --git a/src/coffea/processor/servicex/analysis.py b/src/coffea/processor/servicex/analysis.py deleted file mode 100644 index 669ba8662..000000000 --- a/src/coffea/processor/servicex/analysis.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) 2019, IRIS-HEP -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
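# --- Added note on the processor.py hunk above (a sketch, not from the source):
# switching from ``len(events)`` to ``ak.num(events, axis=0)`` keeps "sumw" lazy
# when ``events`` is a dask-awkward collection, where an eager length is not
# generally available. This assumes dask-awkward's dispatch of ak.* functions.
import awkward as ak
import dask_awkward as dak

events = dak.from_awkward(ak.Array([{"x": 1.0}, {"x": 2.0}, {"x": 3.0}]), npartitions=1)
sumw = ak.num(events, axis=0)  # lazy count, evaluated with the rest of the graph
print(sumw.compute())          # -> 3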
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from abc import ABC, abstractmethod - -from coffea.nanoevents.methods.base import NanoEvents - - -class Analysis(ABC): - @staticmethod - @abstractmethod - def process(events: NanoEvents) -> dict: - """ - Implement this abstract method to perform the actual analysis operations. The - executor will wrap this in code to construct a NanoEvents instance and will pass - in the analysis instance's accumulator. - :param events: NanoEvents - :return: dict[str, Accumulatable] - Filled with the results from this analysis - """ - raise NotImplementedError diff --git a/src/coffea/processor/servicex/dask_executor.py b/src/coffea/processor/servicex/dask_executor.py deleted file mode 100644 index f9dcc4851..000000000 --- a/src/coffea/processor/servicex/dask_executor.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2019, IRIS-HEP -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
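# --- Added sketch (not from the source): the smallest concrete Analysis one
# could have written against the ABC deleted above, assuming that class is
# importable and that events are NanoAOD-like with a Muon collection.
import awkward as ak


class DimuonCounts(Analysis):  # ``Analysis`` as defined in the deleted module
    @staticmethod
    def process(events) -> dict:
        # count events and tally muons per event; both are accumulatable values
        return {
            "nevents": ak.num(events, axis=0),
            "nmuons": ak.sum(ak.num(events.Muon, axis=1)),
        }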
-from typing import Callable, Dict, Optional - -from dask.distributed import Client - -from .executor import Executor, run_coffea_processor - - -class DaskExecutor(Executor): - def __init__( - self, - client_addr: Optional[str] = None, - provided_dask_client: Optional[Client] = None, - ): - """Create a Dask executor to process the analysis - - Args: - client_addr (Optional[str]): If `None` then create a local cluster that runs - in-process. Otherwise connect to an already - existing cluster. - provided_dask_client (Optional[Client]): Pass in an initialized Dask Client. - This client must have asynchronous=True. - """ - if not provided_dask_client: - self.is_local = not client_addr - - self.dask = ( - Client(threads_per_worker=10, asynchronous=True) - if self.is_local - else Client(client_addr, asynchronous=True) - ) - else: - assert provided_dask_client.asynchronous - self.dask = provided_dask_client - self.is_local = False - - def get_result_file_stream(self, datasource, title): - if self.is_local: - return datasource.stream_result_files(title) - else: - return datasource.stream_result_file_uris(title) - - def run_async_analysis( - self, - file_url: str, - tree_name: Optional[str], - data_type: str, - meta_data: Dict[str, str], - process_func: Callable, - schema, - ): - """Create a dask future for a dask task to run the analysis.""" - data_result = self.dask.submit( - run_coffea_processor, - events_url=file_url, - tree_name=tree_name, - data_type=data_type, - meta_data=meta_data, - proc=process_func, - schema=schema, - ) - - return data_result diff --git a/src/coffea/processor/servicex/data_source.py b/src/coffea/processor/servicex/data_source.py deleted file mode 100644 index 741363e67..000000000 --- a/src/coffea/processor/servicex/data_source.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright (c) 2021, IRIS-HEP -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
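# --- Added sketch (not from the source): the three construction modes the
# DaskExecutor docstring above describes. The scheduler address is hypothetical.
executor = DaskExecutor()  # no args: spin up a local, in-process cluster
executor = DaskExecutor(client_addr="tcp://scheduler:8786")  # attach to an existing cluster

# or hand over a pre-built client, which must be asynchronous
# (inside an async context, per dask's documented pattern):
#     client = await Client(asynchronous=True)
#     executor = DaskExecutor(provided_dask_client=client)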
- - -from typing import AsyncGenerator, Dict, List, Optional, Tuple - -from func_adl import ObjectStream, find_EventDataset -from servicex import ServiceXDataset, StreamInfoPath, StreamInfoUrl - - -class DataSource: - def __init__( - self, - query: ObjectStream, - metadata: Dict[str, str] = {}, - datasets: List[ServiceXDataset] = [], - ): - self.query = query - self.metadata = metadata - self.schema = None - self.datasets = datasets - - async def _get_query(self) -> str: - """Return the qastle query. - - Note: To do this we have to forward-cast the object: by design, not all `func_adl` - queries are `ServiceX` queries. But this library only works with datasets that are - based in `ServiceX`. Thus some duck typing occurs in this method. - """ - event_dataset_ast = find_EventDataset(self.query.query_ast) - event_dataset = event_dataset_ast._eds_object # type: ignore - if not hasattr(event_dataset, "return_qastle"): - raise Exception( - f"Base func_adl query {str(event_dataset)} does not have a way to generate qastle!" - ) - event_dataset.return_qastle = True # type: ignore - return await self.query.value_async() - - async def stream_result_file_uris( - self, title: Optional[str] = None - ) -> AsyncGenerator[Tuple[str, str, StreamInfoUrl], None]: - """Launch all datasources off to servicex - - Yields: - Tuple[str, StreamInfoUrl]: List of data types and url's to process - """ - qastle = await self._get_query() - - # TODO: Make this for loop parallel - for dataset in self.datasets: - data_type = dataset.first_supported_datatype(["parquet", "root"]) - if data_type == "root": - async for file in dataset.get_data_rootfiles_uri_stream( - qastle, title=title, as_signed_url=True - ): - yield (data_type, dataset.dataset_as_name, file) - elif data_type == "parquet": - async for file in dataset.get_data_parquet_uri_stream( - qastle, title=title, as_signed_url=True - ): - yield (data_type, dataset.dataset_as_name, file) - else: - raise Exception( - f"This dataset ({str(dataset)}) supports unknown datatypes" - ) - - async def stream_result_files( - self, title: Optional[str] = None - ) -> AsyncGenerator[Tuple[str, str, StreamInfoPath], None]: - """Launch all datasources at once off to servicex - - Yields: - Tuple[str, StreamInfoPath]: List of data types and file paths to process - """ - qastle = await self._get_query() - - # TODO: Make this for loop parallel - for dataset in self.datasets: - data_type = dataset.first_supported_datatype(["parquet", "root"]) - if data_type == "root": - async for file in dataset.get_data_rootfiles_stream( - qastle, title=title - ): - yield (data_type, dataset.dataset_as_name, file) - elif data_type == "parquet": - async for file in dataset.get_data_parquet_stream(qastle, title=title): - yield (data_type, dataset.dataset_as_name, file) - else: - raise Exception( - f"This dataset ({str(dataset)}) supports unknown datatypes" - ) diff --git a/src/coffea/processor/servicex/executor.py b/src/coffea/processor/servicex/executor.py deleted file mode 100644 index 704f3da8a..000000000 --- a/src/coffea/processor/servicex/executor.py +++ /dev/null @@ -1,183 +0,0 @@ -# Copyright (c) 2019, IRIS-HEP -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. 
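# --- Added sketch (not from the source): wiring up the DataSource defined
# above. The func_adl query object and the rucio DID are hypothetical
# placeholders; only the constructor signature is taken from the deleted code.
from servicex import ServiceXDataset

source = DataSource(
    query=my_func_adl_query,  # a func_adl ObjectStream built elsewhere
    metadata={"campaign": "UL18"},
    datasets=[ServiceXDataset("hypothetical.rucio:dataset/NANOAODSIM")],
)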
-# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from abc import ABC, abstractmethod -from typing import Any, AsyncGenerator, Callable, Dict, Optional, Tuple - -import aiostream -import uproot -from servicex import StreamInfoUrl - -from ..accumulator import async_accumulate - -# from urllib.parse import urlparse, unquote -# from urllib.request import url2pathname - - -class Executor(ABC): - @abstractmethod - def run_async_analysis( - self, - file_url: str, - tree_name: Optional[str], - data_type: str, - meta_data: Dict[str, str], - process_func: Callable, - ): - raise NotImplementedError - - def get_result_file_stream(self, datasource, title: Optional[str] = None): - return datasource.stream_result_file_uris(title) - - async def execute( - self, analysis, datasource, title: Optional[str] = None, schema=None - ): - """ - Launch an analysis against the given dataset on the implementation's task framework - :param analysis: - The analysis to run - :param datasource: - The datasource to run against - :param schema: - The schema to apply to data, defaults to None (will then use auto_schema). - :return: - Stream of up to date histograms. Grows as each result is received - """ - # Stream transformed file references from ServiceX - result_file_stream = self.get_result_file_stream(datasource, title=title) - - # Launch a task against this file - func_results = self.launch_analysis_tasks_from_stream( - result_file_stream, datasource.metadata, analysis.process, schema=schema - ) - - # Wait for all the data to show up - async def inline_wait(r): - "This could be inline, but python 3.6" - x = await r - return x - - finished_events = aiostream.stream.map(func_results, inline_wait, ordered=False) - # Finally, accumulate! - # There is an accumulate pattern in the aiostream lib - async with finished_events.stream() as streamer: - async for results in async_accumulate(streamer): - yield results - - async def launch_analysis_tasks_from_stream( - self, - result_file_stream: AsyncGenerator[Tuple[str, str, StreamInfoUrl], None], - meta_data: Dict[str, str], - process_func: Callable, - schema, - ) -> AsyncGenerator[Any, None]: - """ - Invoke the implementation's task runner on each file from the serviceX stream. 
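# --- Added sketch (not from the source): the ``execute`` coroutine above
# yields a result that grows as each task finishes, via aiostream. The same
# accumulate-as-completed idea in plain asyncio, with hypothetical names:
import asyncio


async def accumulate_as_completed(tasks):
    total = {}
    for fut in asyncio.as_completed(tasks):
        partial = await fut  # one finished analysis task's dict output
        for key, val in partial.items():
            total[key] = total.get(key, 0) + val
        yield dict(total)  # snapshot of the up-to-date accumulation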
- We don't know the file's tree name in advance, so grab a sample the first time - around to inspect the tree name - :param result_file_stream: - :param accumulator: - :param process_func: - :param schema: - The schema to apply to data. - :return: - """ - tree_name = None - async for sx_data in result_file_stream: - file_url = sx_data[2].url - sample_md = dict(meta_data, dataset=sx_data[1]) - data_type = sx_data[0] - - # Determine the tree name if we've not gotten it already - if data_type == "root": - if tree_name is None: - with uproot.open({file_url: None}) as sample: - tree_name = sample.keys()[0] - - # Invoke the implementation's task launcher - data_result = self.run_async_analysis( - file_url=file_url, - tree_name=tree_name, - data_type=data_type, - meta_data=sample_md, - process_func=process_func, - schema=schema, - ) - - # Pass this down to the next item in the stream. - yield data_result - - -def run_coffea_processor( - events_url: str, tree_name: Optional[str], proc, data_type, meta_data, schema -): - """ - Process a single file from a tree via a coffea processor on the remote node - :param events_url: - a URL to a ROOT file that uproot4 can open - :param tree_name: - The tree in the ROOT file to use for our data. Can be null if the data isn't a root - tree! - :param accumulator: - Accumulator to store the results - :param proc: - Analysis function to execute. Must have signature - :param data_type: - What datatype is the data (root, parquet?) - :param schema: - The schema to apply to data (if None, will use auto_schema). - :return: - Populated accumulator - """ - # Since we execute remotely, explicitly include everything we need. - from coffea.nanoevents import NanoEventsFactory - - if schema is None: - from coffea.nanoevents.schemas.schema import auto_schema - - schema = auto_schema - - if data_type == "root": - # Use NanoEvents to build a 4-vector - assert tree_name is not None - events = NanoEventsFactory.from_root( - file=str(events_url), - treepath=f"/{tree_name}", - schemaclass=schema, - metadata=dict(meta_data, filename=str(events_url)), - ).events() - elif data_type == "parquet": - events = NanoEventsFactory.from_parquet( - file=str(events_url), - treepath="/", - schemaclass=schema, - metadata=dict(meta_data, filename=str(events_url)), - ).events() - else: - raise Exception(f"Unknown stream data type of {data_type} - cannot process.") - - return proc(events) diff --git a/src/coffea/processor/servicex/local_executor.py b/src/coffea/processor/servicex/local_executor.py deleted file mode 100644 index fd8670c0c..000000000 --- a/src/coffea/processor/servicex/local_executor.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) 2019, IRIS-HEP -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
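# --- Added sketch (not from the source): the core of run_coffea_processor
# above, as one would call it by hand. File and tree names are hypothetical,
# and the keyword names mirror the (old) API used in the deleted code, which
# differs from current coffea.
from coffea.nanoevents import NanoEventsFactory
from coffea.nanoevents.schemas.schema import auto_schema

events = NanoEventsFactory.from_root(
    file="transformed_servicex_output.root",
    treepath="/servicex",
    schemaclass=auto_schema,
    metadata={"dataset": "my_sample", "filename": "transformed_servicex_output.root"},
).events()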
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from typing import Callable, Dict, Optional - -from .executor import Executor, run_coffea_processor - - -class LocalExecutor(Executor): - def __init__(self): - pass - - def get_result_file_stream(self, datasource, title): - return datasource.stream_result_files(title) - - def run_async_analysis( - self, - file_url: str, - tree_name: Optional[str], - data_type: str, - meta_data: Dict[str, str], - process_func: Callable, - schema, - ): - # TODO: Do we need a second routine here? Can we just use this one? - return self._async_analysis( - events_url=file_url, - tree_name=tree_name, - data_type=data_type, - meta_data=meta_data, - process_func=process_func, - schema=schema, - ) - - async def _async_analysis( - self, events_url, tree_name, data_type, meta_data, process_func, schema - ): - return run_coffea_processor( - events_url=events_url, - tree_name=tree_name, - data_type=data_type, - meta_data=meta_data, - proc=process_func, - schema=schema, - ) diff --git a/src/coffea/processor/spark/__init__.py b/src/coffea/processor/spark/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/coffea/processor/spark/detail.py b/src/coffea/processor/spark/detail.py deleted file mode 100644 index 6e372bd2b..000000000 --- a/src/coffea/processor/spark/detail.py +++ /dev/null @@ -1,133 +0,0 @@ -from concurrent.futures import ThreadPoolExecutor - -import pyspark.sql -import pyspark.sql.functions as fn -from pyarrow.util import guid -from tqdm import tqdm - -try: - from collections.abc import Sequence -except ImportError: - from collections.abc import Sequence - -from coffea.processor.executor import _futures_handler - -# this is a reasonable local spark configuration -_default_config = ( - pyspark.sql.SparkSession.builder.appName("coffea-analysis-%s" % guid()) - .master("local[*]") - .config("spark.sql.execution.arrow.enabled", "true") - .config("spark.sql.execution.arrow.maxRecordsPerBatch", 200000) -) - - -def _spark_initialize(config=_default_config, **kwargs): - spark_progress = False - if "spark_progress" in kwargs.keys(): - spark_progress = kwargs["spark_progress"] - - cfg_actual = config - # get spark to not complain about missing log configs - cfg_actual = cfg_actual.config( - "spark.driver.extraJavaOptions", "-Dlog4jspark.root.logger=ERROR,console" - ) - if not spark_progress: - cfg_actual = cfg_actual.config("spark.ui.showConsoleProgress", "false") - - kwargs.setdefault("bindAddress", None) - if kwargs["bindAddress"] is not None: - cfg_actual = cfg_actual.config( - "spark.driver.bindAddress", kwargs["bindAddress"] - ) - kwargs.setdefault("host", None) - if kwargs["host"] is not None: - cfg_actual = cfg_actual.config("spark.driver.host", kwargs["host"]) - - session = cfg_actual.getOrCreate() - sc = 
session.sparkContext - - if "log_level" in kwargs.keys(): - sc.setLogLevel(kwargs["log_level"]) - else: - sc.setLogLevel("ERROR") - - return session - - -def _read_df( - spark, dataset, files_or_dirs, ana_cols, partitionsize, file_type, treeName -): - flist = files_or_dirs - tname = treeName - if isinstance(files_or_dirs, dict): - tname = files_or_dirs["treename"] - flist = files_or_dirs["files"] - if not isinstance(flist, Sequence): - raise ValueError("spark dataset file list must be a Sequence (like list())") - df = ( - spark.read.format(file_type) - .option("tree", tname) - .option("threadCount", "-1") - .load(flist) - ) - count = df.count() - - df_cols = set(df.columns) - cols_in_df = ana_cols.intersection(df_cols) - df = df.select(*cols_in_df) - missing_cols = ana_cols - cols_in_df - for missing in missing_cols: - df = df.withColumn(missing, fn.lit(0.0)) - # compatibility with older pyarrow which doesn't understand array - for col, dtype in df.dtypes: - if dtype == "array": - tempcol = col + "tempbool" - df = df.withColumnRenamed(col, tempcol) - df = df.withColumn(col, df[tempcol].cast("array")).drop(tempcol) - df = df.withColumn("dataset", fn.lit(dataset)) - npartitions = (count // partitionsize) + 1 - actual_partitions = df.rdd.getNumPartitions() - avg_counts = count / actual_partitions - if actual_partitions > 1.50 * npartitions or avg_counts > partitionsize: - df = df.repartition(npartitions) - - return df, dataset, count - - -def _spark_make_dfs( - spark, - fileset, - partitionsize, - columns, - thread_workers, - file_type, - treeName, - status=True, -): - dfs = {} - ana_cols = set(columns) - - with ThreadPoolExecutor(max_workers=thread_workers) as executor: - futures = { - executor.submit( - _read_df, spark, ds, files, ana_cols, partitionsize, file_type, treeName - ) - for ds, files in fileset.items() - } - - for df, ds, count in tqdm( - _futures_handler(futures, timeout=None), - disable=not status, - unit="dataset", - total=len(fileset), - desc="loading", - ): - dfs[ds] = (df, count) - - return dfs - - -def _spark_stop(spark): - # this may do more later? 
- spark._jvm.SparkSession.clearActiveSession() - spark.stop() diff --git a/src/coffea/processor/spark/spark_executor.py b/src/coffea/processor/spark/spark_executor.py deleted file mode 100644 index 0db32f475..000000000 --- a/src/coffea/processor/spark/spark_executor.py +++ /dev/null @@ -1,195 +0,0 @@ -import pickle # noqa: F401 -from concurrent.futures import ThreadPoolExecutor - -import awkward # noqa: F401 -import lz4.frame # noqa: F401 - -# must preload these for exec calls -import numpy # noqa: F401 -import pandas # noqa: F401 -import pyspark.sql.functions as fn -from jinja2 import Environment, PackageLoader, select_autoescape -from pyspark.sql.types import StringType # noqa: F401 -from pyspark.sql.types import BinaryType, StructField, StructType -from tqdm import tqdm - -from coffea.nanoevents import NanoEventsFactory, schemas # noqa: F401 -from coffea.nanoevents.mapping import SimplePreloadedColumnSource # noqa: F401 -from coffea.processor.accumulator import accumulate -from coffea.processor.executor import _decompress, _futures_handler, _reduce - -lz4_clevel = 1 - - -# this is a UDF that takes care of summing histograms across -# various spark results where the outputs are histogram blobs -def agg_histos_raw(series, lz4_clevel): - goodlines = series[series.str.len() > 0] - if goodlines.size == 1: # short-circuit trivial aggregations - return goodlines[0] - return _reduce(lz4_clevel)(goodlines) - - -@fn.pandas_udf(BinaryType()) -def agg_histos(series: pandas.Series) -> bytes: - global lz4_clevel - return agg_histos_raw(series, lz4_clevel) - - -def reduce_histos_raw(df, lz4_clevel): - histos = df["histos"] - outhist = _reduce(lz4_clevel)(histos[histos.str.len() > 0]) - return pandas.DataFrame(data={"histos": numpy.array([outhist], dtype="O")}) - - -@fn.pandas_udf( - StructType([StructField("histos", BinaryType(), True)]), -) -def reduce_histos(df: pandas.DataFrame) -> pandas.DataFrame: - global lz4_clevel - return reduce_histos_raw(df, lz4_clevel) - - -def _get_ds_bistream(item): - global lz4_clevel - ds, bitstream = item - if bitstream is None: - raise Exception( - "No pandas dataframe returned from spark in dataset: %s, something went wrong!" - % ds - ) - if bitstream.empty: - raise Exception( - "The histogram list returned from spark is empty in dataset: %s, something went wrong!" 
- % ds - ) - out = bitstream[bitstream.columns[0]][0] - if lz4_clevel is not None: - return _decompress(out) - return out - - -class SparkExecutor: - _template_name = "spark.py.tmpl" - - def __init__(self): - self._cacheddfs = None - self._counts = None - self._env = Environment( - loader=PackageLoader("coffea.processor", "templates"), - autoescape=select_autoescape(["py"]), - ) - - @property - def counts(self): - return self._counts - - def __call__( - self, - spark, - dfslist, - theprocessor, - output, - thread_workers, - use_df_cache, - schema, - status=True, - unit="datasets", - desc="Processing", - ): - # processor needs to be a global - global processor_instance, coffea_udf, nano_schema - processor_instance = theprocessor - if schema is None: - schema = schemas.BaseSchema - if not issubclass(schema, schemas.BaseSchema): - raise ValueError( - "Expected schema to derive from BaseSchema (%s)" - % (str(schema.__name__)) - ) - nano_schema = schema - # get columns from processor - columns = processor_instance.columns - cols_w_ds = ["dataset"] + columns - # make our udf - tmpl = self._env.get_template(self._template_name) - render = tmpl.render(cols=columns) - print(render) - exec(render) - - # cache the input datasets if it's not already done - if self._counts is None: - self._counts = {} - # go through each dataset and thin down to the columns we want - for ds, (df, counts) in dfslist.items(): - self._counts[ds] = counts - - if self._cacheddfs is None: - self._cacheddfs = {} - cachedesc = "caching" if use_df_cache else "pruning" - with ThreadPoolExecutor(max_workers=thread_workers) as executor: - futures = set() - for ds, (df, counts) in dfslist.items(): - futures.add( - executor.submit( - self._pruneandcache_data, ds, df, cols_w_ds, use_df_cache - ) - ) - gen = _futures_handler(futures, timeout=None) - try: - for ds, df in tqdm( - gen, - disable=not status, - unit=unit, - total=len(dfslist), - desc=cachedesc, - ): - self._cacheddfs[ds] = df - finally: - gen.close() - - with ThreadPoolExecutor(max_workers=thread_workers) as executor: - futures = set() - for ds, df in self._cacheddfs.items(): - co_udf = coffea_udf - futures.add( - executor.submit(self._launch_analysis, ds, df, co_udf, cols_w_ds) - ) - gen = _futures_handler(futures, timeout=None) - try: - output = accumulate( - tqdm( - map(_get_ds_bistream, gen), - disable=not status, - unit=unit, - total=len(self._cacheddfs), - desc=desc, - ), - output, - ) - finally: - gen.close() - - return output - - def _pruneandcache_data(self, ds, df, columns, cacheit): - if cacheit: - return ds, df.select(*columns).cache() - return ds, df.select(*columns) - - def _launch_analysis(self, ds, df, udf, columns): - histo_map_parts = (df.rdd.getNumPartitions() // 20) + 1 - return ( - ds, - df.select(udf(*columns).alias("histos")) - .withColumn("hpid", fn.spark_partition_id() % histo_map_parts) - .repartition(histo_map_parts, "hpid") - .groupBy("hpid") - .apply(reduce_histos) - .groupBy() - .agg(agg_histos("histos")) - .toPandas(), - ) - - -spark_executor = SparkExecutor() diff --git a/src/coffea/processor/templates/__init__.py b/src/coffea/processor/templates/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/coffea/processor/templates/spark.py.tmpl b/src/coffea/processor/templates/spark.py.tmpl deleted file mode 100644 index 5e6e5f818..000000000 --- a/src/coffea/processor/templates/spark.py.tmpl +++ /dev/null @@ -1,24 +0,0 @@ -global coffea_udf - - -def coffea_udf(dataset: pd.Series, {% for col in cols %}{{col + ": 
pd.Series"}}{{ "," if not loop.last }}{% endfor %}): - global processor_instance, lz4_clevel, nano_schema - - columns = [{% for col in cols %}awkward.Array({{col}}){{ "," if not loop.last }}{% endfor %}] - names = [{% for col in cols %}{{"'"|safe+col+"'"|safe}}{{ "," if not loop.last }}{% endfor %}] - - size = len(dataset) - src = SimplePreloadedColumnSource(dict(zip(names, columns)), None, size, object_path='/Events') - - events = NanoEventsFactory \ - .from_preloaded(src, metadata={'dataset': dataset[0]}, schemaclass=nano_schema) \ - .events() - - vals = processor_instance.process(events) - - valsblob = lz4.frame.compress(pickle.dumps(vals), compression_level=lz4_clevel) - - outs = numpy.full(shape=(size, ), fill_value=b'', dtype='O') - outs[0] = valsblob - - return pandas.Series(outs) diff --git a/tests/test_accumulators.py b/tests/test_accumulators.py deleted file mode 100644 index ccf5d1d11..000000000 --- a/tests/test_accumulators.py +++ /dev/null @@ -1,184 +0,0 @@ -from collections import defaultdict -from functools import partial - -import numpy as np -import pytest - -from coffea import processor - - -def test_accumulators(): - a = processor.value_accumulator(float) - a += 3.0 - assert a.value == 3.0 - assert a.identity().value == 0.0 - - a = processor.value_accumulator(partial(np.array, [2.0])) - a += 3.0 - assert np.array_equal(a.value, np.array([5.0])) - assert np.array_equal(a.identity().value, np.array([2.0])) - - lacc = processor.list_accumulator(range(4)) - lacc += [3] - lacc += processor.list_accumulator([1, 2]) - assert lacc == [0, 1, 2, 3, 3, 1, 2] - - b = processor.set_accumulator({"apples", "oranges"}) - b += {"pears"} - b += "grapes" - assert b == {"apples", "oranges", "pears", "grapes"} - - c = processor.dict_accumulator({"num": a, "fruit": b}) - c["num"] += 2.0 - c += processor.dict_accumulator( - { - "num2": processor.value_accumulator(int), - "fruit": processor.set_accumulator({"apples", "cherries"}), - } - ) - assert c["num2"].value == 0 - assert np.array_equal(c["num"].value, np.array([7.0])) - assert c["fruit"] == {"apples", "oranges", "pears", "grapes", "cherries"} - - d = processor.defaultdict_accumulator(float) - d["x"] = 0.0 - d["x"] += 4.0 - d["y"] += 5.0 - d["z"] += d["x"] - d["x"] += d["y"] - assert d["x"] == 9.0 - assert d["y"] == 5.0 - assert d["z"] == 4.0 - assert d["w"] == 0.0 - - f = processor.defaultdict_accumulator(lambda: 2.0) - f["x"] += 4.0 - assert f["x"] == 6.0 - - f += f - assert f["x"] == 12.0 - assert f["y"] == 2.0 - - a = processor.column_accumulator(np.arange(6).reshape(2, 3)) - b = processor.column_accumulator(np.arange(12).reshape(4, 3)) - a += b - assert a.value.sum() == 81 - - -def test_new_accumulators(): - a = processor.accumulate((0.0, 3.0)) - assert a == 3.0 - - a = processor.accumulate( - ( - np.array([2.0]), - 3.0, - ) - ) - assert np.array_equal(a, np.array([5.0])) - - lacc = processor.accumulate( - ( - list(range(4)), - [3], - [1, 2], - ) - ) - assert lacc == [0, 1, 2, 3, 3, 1, 2] - - b = processor.accumulate( - ( - {"apples", "oranges"}, - {"pears"}, - {"grapes"}, - ) - ) - assert b == {"apples", "oranges", "pears", "grapes"} - - c = processor.accumulate( - ( - {"num": a, "fruit": b}, - {"num": 2.0}, - { - "num2": 0, - "fruit": {"apples", "cherries"}, - }, - ) - ) - assert c["num2"] == 0 - assert np.array_equal(c["num"], np.array([7.0])) - assert c["fruit"] == {"apples", "oranges", "pears", "grapes", "cherries"} - - d = processor.accumulate( - ( - defaultdict(float), - {"x": 4.0, "y": 5.0}, - {"z": 4.0, "x": 5.0}, - ) - ) - 
assert d["x"] == 9.0 - assert d["y"] == 5.0 - assert d["z"] == 4.0 - # this is different than old style! - with pytest.raises(KeyError): - d["w"] - - f = processor.accumulate( - ( - defaultdict(lambda: 2.0), - defaultdict(lambda: 2, {"x": 4.0}), - ) - ) - assert f["x"] == 4.0 - assert f["y"] == 2.0 - - # this is different than old style! - f = processor.accumulate([f], f) - assert f["x"] == 8.0 - assert f["y"] == 4.0 - assert f["z"] == 2.0 - - a = processor.accumulate( - ( - processor.column_accumulator(np.arange(6).reshape(2, 3)), - processor.column_accumulator(np.arange(12).reshape(4, 3)), - ) - ) - assert a.value.sum() == 81 - - -def test_accumulator_types(): - class MyDict(dict): - pass - - out = processor.accumulate( - ( - {"x": 2}, - MyDict({"x": 3}), - ) - ) - assert type(out) is dict - - with pytest.raises(ValueError): - processor.accumulate( - ( - defaultdict(lambda: 2), - MyDict({"x": 3}), - ) - ) - - out = processor.accumulate( - ( - MyDict({"x": 3}), - {"x": 2}, - ) - ) - assert type(out) is dict - - with pytest.raises(ValueError): - processor.accumulate( - ( - MyDict({"x": 3}), - defaultdict(lambda: 2), - ) - ) diff --git a/tests/test_dataset_tools.py b/tests/test_dataset_tools.py new file mode 100644 index 000000000..3500fdc3f --- /dev/null +++ b/tests/test_dataset_tools.py @@ -0,0 +1,308 @@ +import dask +import pytest +from distributed import Client + +from coffea.dataset_tools import ( + apply_to_fileset, + get_failed_steps_for_fileset, + max_chunks, + preprocess, + slice_chunks, +) +from coffea.nanoevents import BaseSchema, NanoAODSchema +from coffea.processor.test_items import NanoEventsProcessor, NanoTestProcessor + +_starting_fileset = { + "ZJets": { + "files": { + "tests/samples/nano_dy.root": { + "object_path": "Events", + "steps": [ + [0, 5], + [5, 10], + [10, 15], + [15, 20], + [20, 25], + [25, 30], + [30, 35], + [35, 40], + ], + } + } + }, + "Data": { + "files": { + "tests/samples/nano_dimuon.root": "Events", + "tests/samples/nano_dimuon_not_there.root": "Events", + } + }, +} + +_starting_fileset_with_steps = { + "ZJets": { + "files": { + "tests/samples/nano_dy.root": { + "object_path": "Events", + "steps": [ + [0, 5], + [5, 10], + [10, 15], + [15, 20], + [20, 25], + [25, 30], + [30, 35], + [35, 40], + ], + } + } + }, + "Data": { + "files": { + "tests/samples/nano_dimuon.root": { + "object_path": "Events", + "steps": [ + [0, 5], + [5, 10], + [10, 15], + [15, 20], + [20, 25], + [25, 30], + [30, 35], + [35, 40], + ], + }, + "tests/samples/nano_dimuon_not_there.root": { + "object_path": "Events", + "steps": [ + [0, 5], + [5, 10], + [10, 15], + [15, 20], + [20, 25], + [25, 30], + [30, 35], + [35, 40], + ], + }, + } + }, +} + +_runnable_result = { + "ZJets": { + "files": { + "tests/samples/nano_dy.root": { + "object_path": "Events", + "steps": [ + [0, 7], + [7, 14], + [14, 21], + [21, 28], + [28, 35], + [35, 40], + ], + "uuid": "a9490124-3648-11ea-89e9-f5b55c90beef", + } + } + }, + "Data": { + "files": { + "tests/samples/nano_dimuon.root": { + "object_path": "Events", + "steps": [ + [0, 7], + [7, 14], + [14, 21], + [21, 28], + [28, 35], + [35, 40], + ], + "uuid": "a210a3f8-3648-11ea-a29f-f5b55c90beef", + } + } + }, +} + +_updated_result = { + "ZJets": { + "files": { + "tests/samples/nano_dy.root": { + "object_path": "Events", + "steps": [ + [0, 7], + [7, 14], + [14, 21], + [21, 28], + [28, 35], + [35, 40], + ], + "uuid": "a9490124-3648-11ea-89e9-f5b55c90beef", + } + } + }, + "Data": { + "files": { + "tests/samples/nano_dimuon.root": { + "object_path": "Events", + 
"steps": [ + [0, 7], + [7, 14], + [14, 21], + [21, 28], + [28, 35], + [35, 40], + ], + "uuid": "a210a3f8-3648-11ea-a29f-f5b55c90beef", + }, + "tests/samples/nano_dimuon_not_there.root": { + "object_path": "Events", + "steps": None, + "uuid": None, + }, + } + }, +} + + +@pytest.mark.parametrize( + "proc_and_schema", + [(NanoTestProcessor, BaseSchema), (NanoEventsProcessor, NanoAODSchema)], +) +def test_apply_to_fileset(proc_and_schema): + proc, schemaclass = proc_and_schema + + with Client() as _: + to_compute = apply_to_fileset( + proc(), + _runnable_result, + schemaclass=schemaclass, + ) + out = dask.compute(to_compute)[0] + + assert out["ZJets"]["cutflow"]["ZJets_pt"] == 18 + assert out["ZJets"]["cutflow"]["ZJets_mass"] == 6 + assert out["Data"]["cutflow"]["Data_pt"] == 84 + assert out["Data"]["cutflow"]["Data_mass"] == 66 + + to_compute = apply_to_fileset( + proc(), + max_chunks(_runnable_result, 1), + schemaclass=schemaclass, + ) + out = dask.compute(to_compute)[0] + + assert out["ZJets"]["cutflow"]["ZJets_pt"] == 5 + assert out["ZJets"]["cutflow"]["ZJets_mass"] == 2 + assert out["Data"]["cutflow"]["Data_pt"] == 17 + assert out["Data"]["cutflow"]["Data_mass"] == 14 + + +def test_preprocess(): + with Client() as _: + starting_fileset = _starting_fileset + + dataset_runnable, dataset_updated = preprocess( + starting_fileset, + maybe_step_size=7, + align_clusters=False, + files_per_batch=10, + skip_bad_files=True, + ) + + assert dataset_runnable == _runnable_result + assert dataset_updated == _updated_result + + +def test_preprocess_failed_file(): + with Client() as _, pytest.raises(FileNotFoundError): + starting_fileset = _starting_fileset + + dataset_runnable, dataset_updated = preprocess( + starting_fileset, + maybe_step_size=7, + align_clusters=False, + files_per_batch=10, + skip_bad_files=False, + ) + + +def test_maxchunks(): + max_chunked = max_chunks(_runnable_result, 3) + + assert max_chunked == { + "ZJets": { + "files": { + "tests/samples/nano_dy.root": { + "object_path": "Events", + "steps": [[0, 7], [7, 14], [14, 21]], + "uuid": "a9490124-3648-11ea-89e9-f5b55c90beef", + } + } + }, + "Data": { + "files": { + "tests/samples/nano_dimuon.root": { + "object_path": "Events", + "steps": [[0, 7], [7, 14], [14, 21]], + "uuid": "a210a3f8-3648-11ea-a29f-f5b55c90beef", + } + } + }, + } + + +def test_slicechunks(): + slice_chunked = slice_chunks(_runnable_result, slice(None, None, 2)) + + assert slice_chunked == { + "ZJets": { + "files": { + "tests/samples/nano_dy.root": { + "object_path": "Events", + "steps": [[0, 7], [14, 21], [28, 35]], + "uuid": "a9490124-3648-11ea-89e9-f5b55c90beef", + } + } + }, + "Data": { + "files": { + "tests/samples/nano_dimuon.root": { + "object_path": "Events", + "steps": [[0, 7], [14, 21], [28, 35]], + "uuid": "a210a3f8-3648-11ea-a29f-f5b55c90beef", + } + } + }, + } + + +def test_recover_failed_chunks(): + with Client() as _: + to_compute = apply_to_fileset( + NanoEventsProcessor(), + _starting_fileset_with_steps, + schemaclass=NanoAODSchema, + uproot_options={"allow_read_errors_with_report": True}, + ) + out, reports = dask.compute(*to_compute) + + failed_fset = get_failed_steps_for_fileset(_starting_fileset_with_steps, reports) + assert failed_fset == { + "Data": { + "files": { + "tests/samples/nano_dimuon_not_there.root": { + "object_path": "Events", + "steps": [ + [0, 5], + [5, 10], + [10, 15], + [15, 20], + [20, 25], + [25, 30], + [30, 35], + [35, 40], + ], + } + } + } + } diff --git a/tests/test_local_executors.py b/tests/test_local_executors.py deleted 
file mode 100644 index ccc35f5f4..000000000 --- a/tests/test_local_executors.py +++ /dev/null @@ -1,125 +0,0 @@ -import os.path as osp -import sys - -import pytest - -from coffea import processor -from coffea.nanoevents import schemas -from coffea.processor.executor import UprootMissTreeError - -if sys.platform.startswith("win"): - pytest.skip("skipping tests that only function in linux", allow_module_level=True) - - -@pytest.mark.parametrize("filetype", ["root", "parquet"]) -@pytest.mark.parametrize("skipbadfiles", [True, False]) -@pytest.mark.parametrize("maxchunks", [1, None]) -@pytest.mark.parametrize("chunksize", [100000, 5]) -@pytest.mark.parametrize("schema", [None, schemas.BaseSchema]) -@pytest.mark.parametrize( - "executor", [processor.IterativeExecutor] # , processor.FuturesExecutor -) -def test_dataframe_analysis( - executor, schema, chunksize, maxchunks, skipbadfiles, filetype -): - from coffea.processor.test_items import NanoTestProcessor - - if schema is not None and filetype == "parquet": - pytest.xfail("parquet nanoevents not supported yet") - - filelist = { - "ZJets": {"files": [osp.abspath(f"tests/samples/nano_dy.{filetype}")]}, - "Data": {"files": [osp.abspath(f"tests/samples/nano_dimuon.{filetype}")]}, - } - - executor = executor() - run = processor.Runner( - executor=executor, - schema=schema, - chunksize=chunksize, - maxchunks=maxchunks, - skipbadfiles=skipbadfiles, - format=filetype, - ) - - hists = run(filelist, "Events", processor_instance=NanoTestProcessor()) - - if maxchunks is None: - assert hists["cutflow"]["ZJets_pt"] == 18 - assert hists["cutflow"]["ZJets_mass"] == 6 - assert hists["cutflow"]["Data_pt"] == 84 - assert hists["cutflow"]["Data_mass"] == 66 - else: - assert maxchunks == 1 - print(hists["cutflow"]["ZJets_pt"]) - assert hists["cutflow"]["ZJets_pt"] == (18 if chunksize == 100_000 else 2) - assert hists["cutflow"]["ZJets_mass"] == (6 if chunksize == 100_000 else 1) - assert hists["cutflow"]["Data_pt"] == (84 if chunksize == 100_000 else 13) - assert hists["cutflow"]["Data_mass"] == (66 if chunksize == 100_000 else 12) - - -@pytest.mark.parametrize("filetype", ["root", "parquet"]) -@pytest.mark.parametrize("skipbadfiles", [True, False]) -@pytest.mark.parametrize("maxchunks", [None, 1000]) -@pytest.mark.parametrize("compression", [None, 0, 2]) -@pytest.mark.parametrize( - "executor", [processor.IterativeExecutor] # , processor.FuturesExecutor -) -def test_nanoevents_analysis(executor, compression, maxchunks, skipbadfiles, filetype): - from coffea.processor.test_items import NanoEventsProcessor - - if filetype == "parquet": - pytest.xfail("parquet nanoevents not supported yet") - - filelist = { - "DummyBadMissingFile": { - "treename": "Events", - "files": [osp.abspath(f"tests/samples/non_existent.{filetype}")], - }, - "ZJetsBadMissingTree": { - "treename": "NotEvents", - "files": [ - osp.abspath(f"tests/samples/nano_dy.{filetype}"), - osp.abspath(f"tests/samples/nano_dy_SpecialTree.{filetype}"), - ], - }, - "ZJetsBadMissingTreeAllFiles": { - "treename": "NotEvents", - "files": [osp.abspath(f"tests/samples/nano_dy.{filetype}")], - }, - "ZJets": { - "treename": "Events", - "files": [osp.abspath(f"tests/samples/nano_dy.{filetype}")], - "metadata": {"checkusermeta": True, "someusermeta": "hello"}, - }, - "Data": { - "treename": "Events", - "files": [osp.abspath(f"tests/samples/nano_dimuon.{filetype}")], - "metadata": {"checkusermeta": True, "someusermeta2": "world"}, - }, - } - - executor = executor(compression=compression) - run = processor.Runner( - 
executor=executor, - skipbadfiles=skipbadfiles, - schema=processor.NanoAODSchema, - maxchunks=maxchunks, - format=filetype, - ) - - if skipbadfiles: - hists = run(filelist, "Events", processor_instance=NanoEventsProcessor()) - assert hists["cutflow"]["ZJets_pt"] == 18 - assert hists["cutflow"]["ZJets_mass"] == 6 - assert hists["cutflow"]["ZJetsBadMissingTree_pt"] == 18 - assert hists["cutflow"]["ZJetsBadMissingTree_mass"] == 6 - assert hists["cutflow"]["Data_pt"] == 84 - assert hists["cutflow"]["Data_mass"] == 66 - - else: - LookForError = (FileNotFoundError, UprootMissTreeError) - with pytest.raises(LookForError): - hists = run(filelist, "Events", processor_instance=NanoEventsProcessor()) - with pytest.raises(LookForError): - hists = run(filelist, "NotEvents", processor_instance=NanoEventsProcessor()) diff --git a/tests/test_parsl.py b/tests/test_parsl.py deleted file mode 100644 index a720383da..000000000 --- a/tests/test_parsl.py +++ /dev/null @@ -1,145 +0,0 @@ -import multiprocessing -import sys - -import pytest - -from coffea import processor - - -def test_parsl_start_stop(): - pytest.importorskip("parsl", minversion="0.7.2") - - from coffea.processor.parsl.detail import ( - _default_cfg, - _parsl_initialize, - _parsl_stop, - ) - - _parsl_initialize(config=_default_cfg) - - _parsl_stop() - - -def do_parsl_job(filelist, flatten=False, compression=0, config=None): - from coffea.processor.test_items import NanoTestProcessor - - executor = processor.ParslExecutor(compression=compression, config=config) - run = processor.Runner(executor=executor) - - hists = run(filelist, "Events", processor_instance=NanoTestProcessor()) - - assert hists["cutflow"]["ZJets_pt"] == 18 - assert hists["cutflow"]["ZJets_mass"] == 6 - assert hists["cutflow"]["Data_pt"] == 84 - assert hists["cutflow"]["Data_mass"] == 66 - - -# @pytest.mark.skipif(sys.platform.startswith('darwin'), reason='parsl htex not working on osx again') -def test_parsl_htex_executor(): - pytest.importorskip("parsl", minversion="0.7.2") - import os - import os.path as osp - - import parsl - from parsl.channels import LocalChannel - from parsl.config import Config - from parsl.executors import HighThroughputExecutor - from parsl.providers import LocalProvider - - parsl_config = Config( - executors=[ - HighThroughputExecutor( - label="coffea_parsl_default", - address="127.0.0.1", - cores_per_worker=max(multiprocessing.cpu_count() // 2, 1), - max_workers=1, - provider=LocalProvider( - channel=LocalChannel(), - init_blocks=1, - max_blocks=1, - nodes_per_block=1, - ), - ) - ], - strategy=None, - ) - parsl.load(parsl_config) - - filelist = { - "ZJets": [osp.join(os.getcwd(), "tests/samples/nano_dy.root")], - "Data": [osp.join(os.getcwd(), "tests/samples/nano_dimuon.root")], - } - - do_parsl_job(filelist) - do_parsl_job(filelist, compression=1) - - filelist = { - "ZJets": { - "treename": "Events", - "files": [osp.join(os.getcwd(), "tests/samples/nano_dy.root")], - }, - "Data": { - "treename": "Events", - "files": [osp.join(os.getcwd(), "tests/samples/nano_dimuon.root")], - }, - } - - do_parsl_job(filelist) - - -@pytest.mark.skipif( - sys.platform.startswith("win"), reason="signals are different on windows" -) -def test_timeout(): - import signal - - from coffea.processor.parsl.timeout import timeout - - @timeout - def too_long(timeout=None): - import time - - time.sleep(20) - - @timeout - def make_except(timeout=None): - import time - - time.sleep(1) - raise Exception("oops!") - - try: - too_long(timeout=5) - except Exception as e: - assert 
e.args[0] == "Timeout hit" - - try: - make_except(timeout=20) - except Exception as e: - assert e.args[0] == "oops!" - - # reset alarms for other tests, this is suspicious - signal.alarm(0) - - -def test_parsl_condor_cfg(): - pytest.importorskip("parsl", minversion="0.7.2") - - from coffea.processor.parsl.condor_config import condor_config - - print(condor_config()) - - -@pytest.mark.skip("broken and soon to be removed") -def test_parsl_slurm_cfg(): - pytest.importorskip("parsl", minversion="0.7.2") - import os - - x509_proxy = "x509up_u%s" % (os.getuid()) - fname = "/tmp/%s" % x509_proxy - with open(fname, "w+"): - os.utime(fname, None) - - from coffea.processor.parsl.slurm_config import slurm_config - - print(slurm_config()) diff --git a/tests/test_processor.py b/tests/test_processor.py index 732762b08..b5d836b67 100644 --- a/tests/test_processor.py +++ b/tests/test_processor.py @@ -1,6 +1,3 @@ -import os.path as osp -import sys - import pytest @@ -28,117 +25,3 @@ def postprocess(self, accumulator): acc = None super(test, proc).postprocess(acc) - - -@pytest.mark.skipif( - sys.platform.startswith("win"), reason="problems with paths on windows" -) -def test_lazy_dataframe(): - import uproot - - from coffea.processor import LazyDataFrame - - tree = uproot.open(osp.abspath("tests/samples/nano_dy.root"))["Events"] - entrystart = 0 - entrystop = 100 - - df = LazyDataFrame(tree, entrystart, entrystop, preload_items=["nMuon"]) - - assert len(df) == 1 - - pt = df["Muon_pt"] - assert len(df) == 2 - df["Muon_pt_up"] = pt * 1.05 - assert len(df) == 3 - assert "Muon_pt" in df.materialized - - assert "Muon_eta" in df.available - - assert df.size == tree.num_entries - - with pytest.raises(KeyError): - df["notthere"] - - -@pytest.mark.skipif( - sys.platform.startswith("win"), reason="problems with paths on windows" -) -def test_lazy_dataframe_getattr(): - import uproot - - from coffea.processor import LazyDataFrame - - tree = uproot.open(osp.abspath("tests/samples/nano_dy.root"))["Events"] - entrystart = 0 - entrystop = 100 - - df = LazyDataFrame(tree, entrystart, entrystop, preload_items=["nMuon"]) - - assert len(df) == 1 - - df.Muon_pt - assert len(df) == 2 - assert "Muon_pt" in df.materialized - - assert "Muon_eta" in df.available - - assert df.size == tree.num_entries - - with pytest.raises(AttributeError): - df.notthere - - import copy - - df2 = copy.copy(df) - df2.Muon_pt - - with pytest.raises(AttributeError): - df2.notthere - - -def test_processor_newaccumulator(): - from coffea.processor import ( - IterativeExecutor, - ProcessorABC, - defaultdict_accumulator, - ) - - class Test(ProcessorABC): - def process(self, item): - return {"itemsum": item} - - def postprocess(self, accumulator): - pass - - proc = Test() - - exe = IterativeExecutor() - out = exe( - range(10), - proc.process, - None, - ) - assert out == ({"itemsum": 45}, 0) - - class TestOldStyle(ProcessorABC): - @property - def accumulator(self): - return defaultdict_accumulator(int) - - def process(self, item): - out = self.accumulator.identity() - out["itemsum"] += item - return out - - def postprocess(self, accumulator): - pass - - proc = TestOldStyle() - - exe = IterativeExecutor() - out = exe( - range(10), - proc.process, - proc.accumulator, - ) - assert out[0]["itemsum"] == 45 diff --git a/tests/test_spark.py b/tests/test_spark.py deleted file mode 100644 index 25581213a..000000000 --- a/tests/test_spark.py +++ /dev/null @@ -1,136 +0,0 @@ -import pytest - - -def test_spark_imports(): - pytest.importorskip("pyspark", 
minversion="3.3.0") - - from coffea.processor.spark.detail import _spark_initialize, _spark_stop - - spark = _spark_initialize(bindAddress="127.0.0.1", host="127.0.0.1") - _spark_stop(spark) - - -@pytest.mark.skip(reason="pyspark executor work currently in progress") -def test_spark_executor(): - pyspark = pytest.importorskip("pyspark", minversion="3.3.0") - import os - import os.path as osp - - import pyspark.sql - from pyarrow.util import guid - - from coffea.nanoevents import schemas - from coffea.processor import run_spark_job - from coffea.processor.spark.detail import _spark_initialize, _spark_stop - - spark_config = ( - pyspark.sql.SparkSession.builder.appName("spark-executor-test-%s" % guid()) - .master("local[*]") - .config("spark.sql.execution.arrow.enabled", "true") - .config("spark.driver.host", "127.0.0.1") - .config("spark.driver.bindAddress", "127.0.0.1") - .config("spark.executor.x509proxyname", "x509_u12409") - .config("spark.sql.execution.arrow.maxRecordsPerBatch", 200000) - ) - - spark = _spark_initialize( - config=spark_config, log_level="ERROR", spark_progress=False - ) - - filelist = { - "ZJets": { - "files": ["file:" + osp.join(os.getcwd(), "tests/samples/nano_dy.root")], - "treename": "Events", - }, - "Data": { - "files": [ - "file:" + osp.join(os.getcwd(), "tests/samples/nano_dimuon.root") - ], - "treename": "Events", - }, - } - - from coffea.processor.spark.spark_executor import spark_executor - from coffea.processor.test_items import NanoEventsProcessor, NanoTestProcessor - - columns = ["nMuon", "Muon_pt", "Muon_eta", "Muon_phi", "Muon_mass", "Muon_charge"] - proc = NanoTestProcessor(columns=columns) - - hists = run_spark_job( - filelist, - processor_instance=proc, - executor=spark_executor, - spark=spark, - thread_workers=1, - executor_args={"file_type": "root"}, - ) - - assert sum(spark_executor.counts.values()) == 80 - assert hists["cutflow"]["ZJets_pt"] == 18 - assert hists["cutflow"]["ZJets_mass"] == 6 - assert hists["cutflow"]["Data_pt"] == 84 - assert hists["cutflow"]["Data_mass"] == 66 - - hists = run_spark_job( - filelist, - processor_instance=proc, - executor=spark_executor, - spark=spark, - thread_workers=1, - executor_args={"file_type": "root"}, - ) - - assert sum(spark_executor.counts.values()) == 80 - assert hists["cutflow"]["ZJets_pt"] == 18 - assert hists["cutflow"]["ZJets_mass"] == 6 - assert hists["cutflow"]["Data_pt"] == 84 - assert hists["cutflow"]["Data_mass"] == 66 - - proc = NanoEventsProcessor(columns=columns) - hists = run_spark_job( - filelist, - processor_instance=proc, - executor=spark_executor, - spark=spark, - thread_workers=1, - executor_args={"file_type": "root", "schema": schemas.NanoAODSchema}, - ) - - _spark_stop(spark) - - assert sum(spark_executor.counts.values()) == 80 - assert hists["cutflow"]["ZJets_pt"] == 18 - assert hists["cutflow"]["ZJets_mass"] == 6 - assert hists["cutflow"]["Data_pt"] == 84 - assert hists["cutflow"]["Data_mass"] == 66 - - -def test_spark_hist_adders(): - pytest.importorskip("pyspark", minversion="3.3.0") - - import pickle as pkl - - import lz4.frame as lz4f - import pandas as pd - - from coffea.processor.spark.spark_executor import agg_histos_raw, reduce_histos_raw - from coffea.processor.test_items import NanoTestProcessor - from coffea.util import numpy as np - - proc = NanoTestProcessor() - - one = proc.accumulator - two = proc.accumulator - hlist1 = [lz4f.compress(pkl.dumps(one))] - hlist2 = [lz4f.compress(pkl.dumps(one)), lz4f.compress(pkl.dumps(two))] - harray1 = np.array(hlist1, dtype="O") - 
diff --git a/tests/test_taskvine.py b/tests/test_taskvine.py
index 022a152ba..cea9e4a4c 100755
--- a/tests/test_taskvine.py
+++ b/tests/test_taskvine.py
@@ -4,13 +4,12 @@
 import hist.dask as hda
 import pytest
 
-from coffea import processor
-from coffea.nanoevents import NanoEventsFactory
+from coffea.nanoevents import NanoAODSchema, NanoEventsFactory
 
 
 def histogram_common():
     # The opendata files are non-standard NanoAOD, so some optional data columns are missing
-    processor.NanoAODSchema.warn_missing_crossrefs = False
+    NanoAODSchema.warn_missing_crossrefs = False
 
     # "file:/tmp/Run2012B_SingleMu.root",
     events = NanoEventsFactory.from_root(
diff --git a/tests/test_workitem.py b/tests/test_workitem.py
deleted file mode 100644
index 205c4d161..000000000
--- a/tests/test_workitem.py
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/usr/bin/env python3
-
-from coffea.processor.executor import WorkItem
-
-
-def test_work_item():
-    item1 = WorkItem("TestDataSet", "/a/b/c.root", "Events", 500, 670, "abc", {})
-    item2 = WorkItem(
-        "TestDataSet", "/a/b/c.root", "Events", 500, 670, "abc", {"meta": "data"}
-    )
-    item3 = WorkItem("TestDataSet", "/a/b/c.root", "Events", 500, 760, "abc", {})
-
-    assert item1 == item1
-    assert item1 == item2
-    assert item1 != item3
-    assert item1.dataset == "TestDataSet"
-    assert item1.filename == "/a/b/c.root"
-    assert item1.treename == "Events"
-    assert item1.entrystart == 500
-    assert item1.entrystop == 670
-    assert item1.fileuuid == "abc"
-    assert len(item1) == 670 - 500
-    assert len(item3) == 760 - 500
-
-    # Test if hashable
-    hash(item2)
-
-    # Test if usermeta is mutable
-    item1.usermeta["user"] = "meta"
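The `tests/test_taskvine.py` hunk above is the migration pattern the rest of this PR follows: schema classes now come straight from `coffea.nanoevents` instead of via `coffea.processor`. A hedged sketch of the updated call site; the file path is the one mentioned in the test's own comment, and the dict-style `from_root` argument form is an assumption about the dask-based API, not copied from the diff:

```python
# Post-migration imports: NanoAODSchema is no longer reached through
# coffea.processor.
from coffea.nanoevents import NanoAODSchema, NanoEventsFactory

# The opendata files are non-standard NanoAOD, so silence cross-ref warnings.
NanoAODSchema.warn_missing_crossrefs = False

events = NanoEventsFactory.from_root(
    {"file:/tmp/Run2012B_SingleMu.root": "Events"},  # path from the test's comment
    schemaclass=NanoAODSchema,
).events()
```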