From 76fc9b99f17139e29f9515bfb514aab70c1d3b52 Mon Sep 17 00:00:00 2001
From: garciagenrique <enrique.garcia.garcia@cern.ch>
Date: Thu, 20 Jun 2024 17:42:01 +0200
Subject: [PATCH] add docs and script to fetch rucio dataset files

---
 tutorials/data-lake/pull-dataset/README.md    | 25 +++++++++++++-
 .../pull-dataset/rucio_dataset_files.sh       | 34 +++++++++++++++++++
 2 files changed, 58 insertions(+), 1 deletion(-)
 create mode 100755 tutorials/data-lake/pull-dataset/rucio_dataset_files.sh
diff --git a/tutorials/data-lake/pull-dataset/README.md b/tutorials/data-lake/pull-dataset/README.md
index 1d1b6277..06cd8ccb 100644
--- a/tutorials/data-lake/pull-dataset/README.md
+++ b/tutorials/data-lake/pull-dataset/README.md
@@ -1 +1,24 @@
-# Pull dataset from Rucio data lake
+# Interact with Rucio dataset files
+
+The following script assumes that all the files within a "Rucio Dataset" (identified by its `DID` - see below) are present in an RSE (Rucio Storage Element), and that this RSE is accessible locally.
+ - `DIDs` (or Data Identifiers - see Rucio [documentation](https://rucio.github.io/documentation/started/concepts/file_dataset_container/)) are composed of a scope plus a dataset name in the `SCOPE:Name` format.
+ - If the files are not present in the RSE, replicate the dataset on the desired RSE before running the script.
+
+Run the following bash script
+
+```bash
+> ./rucio_dataset_files.sh <SCOPE:DataSet> <output_filename> <output_dirname>
+```
+where
+ - `SCOPE:DataSet` is the Rucio DID of the dataset. You can list all the scopes with the command `rucio list-scopes`, and the dataset names within a scope with `rucio list-dids <SCOPE>:` (note the colon).
+ - `output_filename` is the output file that contains the "filepath" of all the files in the dataset.
+ - `output_dirname` is the output directory (it must not already exist) that will contain all the dataset files in the form of symbolic links, which avoids duplicating files on disk. It also spares users from having to search the storage for the files, which could get complicated depending on the storage kind and model.
+
+```bash
+# Example
+> ./rucio_dataset_files.sh calorimeter:training_data_hdf5 calorimeter_files.txt calorimeter_symlink_dir
+
+# And check the output file and the directory with the symlinks
+> cat calorimeter_files.txt
+> ls -l calorimeter_symlink_dir
+```
diff --git a/tutorials/data-lake/pull-dataset/rucio_dataset_files.sh b/tutorials/data-lake/pull-dataset/rucio_dataset_files.sh
new file mode 100755
index 00000000..769f7027
--- /dev/null
+++ b/tutorials/data-lake/pull-dataset/rucio_dataset_files.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+#
+# G. Guerrieri & E. Garcia (CERN) - Jun 2024
+#
+# Symlinks the VEGA-DCACHE replicas of a Rucio dataset into a new directory and
+# writes the path of every link to <output_file>. This script runs only on VEGA.
+# Usage: ./rucio_dataset_files.sh <SCOPE:DataSet> <output_file> <output_symlink_dir>
+
+set -euo pipefail
+
+if [[ $# -ne 3 ]]; then echo "Usage: $0 <SCOPE:DataSet> <output_file> <output_symlink_dir>" >&2; exit 2; fi
+ds=$1
+name=$2
+location=$3
+pw=$(pwd -P)
+
+# Refuse an existing directory before touching the output file, so a refused run keeps any previous listing.
+if [[ -d "${location}" ]]; then echo -e "Directory exists. Exiting\n${pw}/${location}"; exit 1; fi
+
+# Query Rucio first: under pipefail a failing pipeline aborts before anything is created.
+# awk emits an empty line for rows with fewer than 12 fields; those and bare "|" entries are skipped below.
+replicas=$(rucio list-file-replicas --rse VEGA-DCACHE "${ds}" | awk '{ print $12 }' | sed 's|https://dcache.sling.si:2880|/dcache/sling.si|g')
+
+mkdir -- "${location}"
+: > "${name}"  # the listing goes to exactly <output_file>; truncate any previous one
+while IFS= read -r file; do
+  if [[ -z "${file}" || "${file}" == "|" ]]; then continue; fi
+  fileReduced=$(basename -- "${file}")
+  link="${location}/${ds/:/.}.${fileReduced}"
+  echo linking "${fileReduced}" "..."
+  ln -s -- "${file}" "${link}"
+  echo "${pw}/${link}" >> "${name}"
+done <<< "${replicas}"
+chmod -R 777 -- "${location}"  # open the link directory to all users, as before