diff --git a/.gitignore b/.gitignore
index 0643fa9cb2..211462e6ac 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,8 +10,8 @@ tmp/runtime/*
# python virtualenv
.venv/
# ci directory
-build/
-workdir/
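+# leading slash anchors these to the repo root, so nested dirs like crab-spark's workdir/ are no longer ignored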
+/build/
+/workdir/
# direnv
.envrc
# pylint config
diff --git a/cicd/monit_spark/Dockerfile b/cicd/monit_spark/Dockerfile
index 7372722116..fc4499cee7 100644
--- a/cicd/monit_spark/Dockerfile
+++ b/cicd/monit_spark/Dockerfile
@@ -1,4 +1,4 @@
-FROM registry.cern.ch/cmsmonitoring/cmsmon-spark:v0.5.0.1
+FROM registry.cern.ch/cmsmonitoring/cmsmon-spark:v0.5.0.1
## build from the dmwm/CRABServer root directory
# docker buildx build -t registry.cern.ch/cmscrab/crabspark:(date +%s) -f cicd/monit_spark/Dockerfile .
@@ -9,17 +9,9 @@ RUN yum install -y \
&& rm -rf /var/cache/yum
RUN mkdir -p /data/srv/spark/
-COPY ./src/script/Monitor/crab-spark/workdir/osearch.py \
- ./src/script/Monitor/crab-spark/workdir/bootstrap.sh \
- ./src/script/Monitor/crab-spark/cronjobs/crab_condor_daily.py \
- ./src/script/Monitor/crab-spark/cronjobs/crab_tape_recall_rules_history_daily.py \
- ./src/script/Monitor/crab-spark/cronjobs/run_spark.sh \
- ./src/script/Monitor/crab-spark/cronjobs/crab_data_daily.py \
- ./src/script/Monitor/crab-spark/cronjobs/crab_tape_recall_updated_rules_daily.py \
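+# copy the whole crab-spark tree (cronjobs, notebooks, workdir) instead of listing individual scripts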
+COPY ./src/script/Monitor/crab-spark \
/data/srv/spark
ENTRYPOINT ["tini", "--"]
CMD ["echo", "no default script for spark docker image"]
-
-
diff --git a/src/script/Monitor/crab-spark/cronjobs/crab_condor_daily.py b/src/script/Monitor/crab-spark/cronjobs/crab_condor_daily.py
deleted file mode 100644
index 66d998334a..0000000000
--- a/src/script/Monitor/crab-spark/cronjobs/crab_condor_daily.py
+++ /dev/null
@@ -1,250 +0,0 @@
-import os
-import sys
-
-os.environ['PYSPARK_PYTHON'] = sys.executable
-os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
-
-import time
-import numpy as np
-import pandas as pd
-
-from datetime import datetime, date, timedelta
-
-import osearch
-
-import argparse
-parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument("-s", "--start-date", default=None,
- help="process data starting from this day, inclusive (YYYY-MM-DD)",)
-parser.add_argument("-e", "--end-date", default=None,
- help="process data until this day, not included (YYYY-MM-DD)",)
-args = parser.parse_args()
-print(f"timerange: [{args.start_date} {args.end_date})" )
-
-
-from pyspark.sql.functions import (
- col,
- lit,
- when,
- sum as _sum,
- count as _count,
- first,
- date_format,
- from_unixtime
-)
-from pyspark.sql.types import (
- StructType,
- LongType,
- StringType,
- StructField,
- DoubleType,
- IntegerType,
-)
-
-from pyspark.sql import SparkSession
-
-spark = SparkSession\
- .builder\
- .appName("crab_tape_recall")\
- .getOrCreate()
-
-# CRAB table date
-
-# condor data and query date
-# if args.end_date:
-# end_date = datetime.strptime(args.end_date, '%Y-%m-%d')
-# else:
-# end_date = datetime.now()
-# end_date = end_date.replace(minute=0, hour=0, second=0, microsecond=0)
-#
-# if args.start_date:
-# start_date = datetime.strptime(args.start_date, '%Y-%m-%d')
-# else:
-# start_date = end_date - timedelta(days=1)
-
-if args.end_date:
- end_date = datetime.strptime(args.end_date, '%Y-%m-%d')
-else:
- end_date = datetime.now()
- end_date = end_date.replace(minute=0, hour=0, second=0, microsecond=0)
-
-if args.start_date:
- start_date = datetime.strptime(args.start_date, '%Y-%m-%d')
-else:
- start_date = end_date - timedelta(days=1)
-
-date_list = pd.date_range(
- start=start_date,
- end=end_date,
- ).to_pydatetime().tolist()
-
-# Import condor data
-
-def process_single_day(day):
-
- start_date = day
- end_date = day + timedelta(days=1)
- print(f"START PROCESSING: from {start_date} to {end_date}")
-
- _DEFAULT_HDFS_FOLDER = "/project/monitoring/archive/condor/raw/metric"
-
- def _get_schema():
- return StructType(
- [
- StructField(
- "data",
- StructType(
- [
- StructField("RecordTime", LongType(), nullable=False),
- StructField("CMSPrimaryDataTier", StringType(), nullable=True),
- StructField("Status", StringType(), nullable=True),
- StructField("WallClockHr", DoubleType(), nullable=True),
- StructField("CoreHr", DoubleType(), nullable=True),
- StructField("CpuTimeHr", DoubleType(), nullable=True),
- StructField("Type", StringType(), nullable=True),
- StructField("CRAB_DataBlock", StringType(), nullable=True),
- StructField("GlobalJobId", StringType(), nullable=False),
- StructField("ExitCode", LongType(), nullable=True),
- StructField("CRAB_Workflow", StringType(), nullable=True),
- StructField("CommittedCoreHr", StringType(), nullable=True),
- StructField("CommittedWallClockHr", StringType(), nullable=True),
- ]
- ),
- ),
- ]
- )
-
- def get_candidate_files(start_date, end_date, spark, base=_DEFAULT_HDFS_FOLDER):
- st_date = start_date - timedelta(days=0)
- ed_date = end_date + timedelta(days=0)
- days = (ed_date - st_date).days
-
- sc = spark.sparkContext
- FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
- URI = sc._gateway.jvm.java.net.URI
- Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
- fs = FileSystem.get(URI("hdfs:///"), sc._jsc.hadoopConfiguration())
- candidate_files = [
- f"{base}/{(st_date + timedelta(days=i)).strftime('%Y/%m/%d')}"
- for i in range(0, days)
- ]
- candidate_files = [url for url in candidate_files if fs.globStatus(Path(url))]
- print("No. of Compacted files:", len(candidate_files))
-
- pre_candidate_files = [
- f"{base}/{(st_date + timedelta(days=i)).strftime('%Y/%m/%d')}.tmp"
- for i in range(0, days)
- ]
- pre_candidate_files = [url for url in pre_candidate_files if fs.globStatus(Path(url))]
- print("No. of uncompacted files:", len(pre_candidate_files))
-
- return candidate_files + pre_candidate_files
-
-
- schema = _get_schema()
-
- condor_df = (
- spark.read.option("basePath", _DEFAULT_HDFS_FOLDER)
- .json(
- get_candidate_files(start_date, end_date, spark, base=_DEFAULT_HDFS_FOLDER),
- schema=schema,
- ).select("data.*")
- .filter(
- f"""Status IN ('Completed')
- AND Type IN ('analysis')
- AND RecordTime >= {start_date.timestamp() * 1000}
- AND RecordTime < {end_date.timestamp() * 1000}
- """
- )
- .drop_duplicates(["GlobalJobId"])
- # .cache()
- )
-
- # Convert file type by saving and recall it again (.json too complex for spark)
-
- crab_username = os.getenv("CRAB_KRB5_USERNAME", "cmscrab")
- condor_df.write.mode('overwrite').parquet(f"/cms/users/{crab_username}/condor_vir_data" ,compression='zstd')
- condor_df = spark.read.format('parquet').load(f"/cms/users/{crab_username}/condor_vir_data")
-
- # Import CRAB data
- wa_date = day.strftime("%Y-%m-%d")
- HDFS_CRAB_part = f'/project/awg/cms/crab/tasks/{wa_date}/'
- crab_df = spark.read.format('avro').load(HDFS_CRAB_part)
- crab_df = crab_df.select('TM_TASKNAME', 'TM_IGNORE_LOCALITY')
-
- print("==============================================="
- , "Condor Matrix and CRAB Table"
- , "==============================================="
- , "File Directory:", HDFS_CRAB_part, get_candidate_files(start_date, end_date, spark, base=_DEFAULT_HDFS_FOLDER)
- , "Work Directory:", os.getcwd()
- , "==============================================="
- , "===============================================", sep='\n')
-
- # Join condor job with CRAB data
-
- result_df = condor_df.join(crab_df, crab_df["TM_TASKNAME"] == condor_df["CRAB_Workflow"])\
- .select('RecordTime', 'CMSPrimaryDataTier', 'WallClockHr', 'CoreHr', 'CpuTimeHr', 'ExitCode'
- , "CRAB_DataBlock", "TM_IGNORE_LOCALITY", "GlobalJobId", "CommittedCoreHr", "CommittedWallClockHr")
-
- # Convert database to dictionary
-
- docs = result_df.toPandas()
- docs["CRAB_Type"] = docs.apply(lambda row: "PrivateMC" if row["CRAB_DataBlock"] == "MCFakeBlock" else "Analysis", axis=1)
- print(f"pandas dataframe size: {docs.memory_usage(deep=True).apply(lambda x: x / 1024 / 1024).sum()} MB")
-
-
- def get_index_schema():
- return {
- "settings": {"index": {"number_of_shards": "1", "number_of_replicas": "1"}},
- "mappings": {
- "properties": {
- "RecordTime": {"format": "epoch_millis", "type": "date"},
- "CMSPrimaryDataTier": {"ignore_above": 2048, "type": "keyword"},
- "GlobalJobId": {"ignore_above": 2048, "type": "keyword"},
- "WallClockHr": {"type": "long"},
- "CoreHr": {"type": "long"},
- "CpuTimeHr": {"type": "long"},
- "ExitCode": {"ignore_above": 2048, "type": "keyword"},
- "TM_IGNORE_LOCALITY": {"ignore_above": 2048, "type": "keyword"},
- "CRAB_Type": {"ignore_above": 2048, "type": "keyword"},
- "CRAB_DataBlock": {"ignore_above": 2048, "type": "keyword"},
- "CommittedCoreHr": {"type": "long"},
- "CommittedWallClockHr": {"type": "long"},
- }
- }
- }
-
- # Send data to Opensearch
-
- _index_template = 'crab-condor-taskdb'
- client = osearch.get_es_client("os-cms.cern.ch/es", '/data/certs/monit.d/monit_spark_crab.txt', get_index_schema())
- idx = client.get_or_create_index(timestamp=day.strftime("%s"), index_template=_index_template, index_mod="M")
- docs_rows = len(docs)
- sent = 0
- batch = 50000
- import gc
- while sent < docs_rows:
- gc.collect()
- start = sent
- end = start + batch if start + batch < docs_rows else docs_rows
- docs_tmp = docs.iloc[start:end]
- # the following line requires a lot of RAM, better do it 50_000
- # items at a time only. Keep in mind that the pandas datafram usually
- # contains about 1_000_000 rows
- docs_tmp = docs_tmp.to_dict('records')
- no_of_fail_saved = client.send(idx, docs_tmp, metadata=None, batch_size=10000, drop_nulls=False)
- sent = end
-
- print("=================================== Condor Matrix and CRAB Table =====================================",
- "FINISHED : ",
- f"start {start}, end {end}",
- len(docs_tmp), "ROWS ARE SENT",
- no_of_fail_saved, "ROWS ARE FAILED",
- "=================================== Condor Matrix and CRAB Table =====================================",
- sep='\n')
-
-
-for day in date_list:
- process_single_day(day)
-
-
diff --git a/src/script/Monitor/crab-spark/cronjobs/crab_data_daily.py b/src/script/Monitor/crab-spark/cronjobs/crab_data_daily.py
deleted file mode 100644
index 4fb3f4d45d..0000000000
--- a/src/script/Monitor/crab-spark/cronjobs/crab_data_daily.py
+++ /dev/null
@@ -1,154 +0,0 @@
-# import pickle
-from datetime import datetime, timedelta
-
-# import click
-import os
-import pandas as pd
-# import pprint
-import time
-# from dateutil.relativedelta import relativedelta
-
-import numpy as np
-import json
-import osearch
-
-import argparse
-
-parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument("-s", "--start-date", default=None,
- help="process data starting from this day, inclusive (YYYY-MM-DD)",)
-parser.add_argument("-e", "--end-date", default=None,
- help="process data until this day, not included (YYYY-MM-DD)",)
-args = parser.parse_args()
-print(f"timerange: [{args.start_date} {args.end_date})" )
-
-
-from pyspark import SparkContext, StorageLevel
-from pyspark.sql import SparkSession
-from pyspark.sql.functions import (
- col, collect_list, concat_ws, greatest, lit, lower, when,
- avg as _avg,
- count as _count,
- hex as _hex,
- max as _max,
- min as _min,
- round as _round,
- sum as _sum,
-)
-from pyspark.sql.types import (
- LongType,
-)
-from pyspark.sql import SparkSession
-spark = SparkSession\
- .builder\
- .appName("crab_tape_recall")\
- .getOrCreate()
-
-# Query date
-
-#TODAY = str(datetime.now())[:10]
-#YESTERDAY = str(datetime.now()-timedelta(days=1))[:10]
-#wa_date = TODAY
-
-if args.end_date:
- end_date = datetime.strptime(args.end_date, '%Y-%m-%d')
-else:
- end_date = datetime.now()
- end_date = end_date.replace(minute=0, hour=0, second=0, microsecond=0)
-
-if args.start_date:
- start_date = datetime.strptime(args.start_date, '%Y-%m-%d')
-else:
- start_date = end_date - timedelta(days=1)
-
-date_list = pd.date_range(
- start=start_date,
- end=end_date,
- ).to_pydatetime().tolist()
-
-# Import data into database form
-
-def process_single_day(day):
-
- wa_date = day.strftime("%Y-%m-%d")
- TODAY = wa_date
- YESTERDAY = (day-timedelta(days=1)).strftime("%Y-%m-%d")
-
- HDFS_CRAB_part = f'/project/awg/cms/crab/tasks/{wa_date}/'
- print("==============================================="
- , "CRAB Table"
- , "==============================================="
- , "File Directory:", HDFS_CRAB_part
- , "Work Directory:", os.getcwd()
- , "==============================================="
- , "===============================================", sep='\n')
-
- crab_part = spark.read.format('avro').load(HDFS_CRAB_part)
- df = crab_part.select("TM_TASKNAME","TM_START_TIME","TM_TASK_STATUS","TM_SPLIT_ALGO","TM_USERNAME","TM_USER_ROLE","TM_JOB_TYPE","TM_IGNORE_LOCALITY","TM_SCRIPTEXE","TM_USER_CONFIG")
- df.createOrReplaceTempView("crab_algo")
-
- # Query daily data
-
- query = f"""\
- SELECT *
- FROM crab_algo
- WHERE 1=1
- AND TM_START_TIME >= unix_timestamp("{YESTERDAY} 00:00:00", "yyyy-MM-dd HH:mm:ss")*1000
- AND TM_START_TIME < unix_timestamp("{TODAY} 00:00:00", "yyyy-MM-dd HH:mm:ss")*1000
- """
-
- tmpdf = spark.sql(query)
- tmpdf.show(10)
-
- # Convert database to dictionary
-
- docs = tmpdf.toPandas().to_dict('records')
-
- # Extract 'REQUIRE_ACCELERATOR' from 'TM_USER_CONFIG'
-
- for i in range(len(docs)):
- if docs[i]['TM_USER_CONFIG'] is not None:
- data = json.loads(docs[i]['TM_USER_CONFIG'])
- if "requireaccelerator" in data:
- docs[i]['REQUIRE_ACCELERATOR'] = data["requireaccelerator"]
- else:
- docs[i]['REQUIRE_ACCELERATOR'] = None
- else:
- docs[i]['REQUIRE_ACCELERATOR'] = None
-
- # Define type of each schema
-
- def get_index_schema():
- return {
- "settings": {"index": {"number_of_shards": "1", "number_of_replicas": "1"}},
- "mappings": {
- "properties": {
- "TM_TASKNAME": {"ignore_above": 2048, "type": "keyword"},
- "TM_START_TIME": {"format": "epoch_millis", "type": "date"},
- 'TM_TASK_STATUS': {"ignore_above": 2048, "type": "keyword"},
- "TM_SPLIT_ALGO": {"ignore_above": 2048, "type": "keyword"},
- "TM_USERNAME": {"ignore_above": 2048, "type": "keyword"},
- "TM_USER_ROLE": {"ignore_above": 2048, "type": "keyword"},
- "TM_JOB_TYPE": {"ignore_above": 2048, "type": "keyword"},
- "TM_IGNORE_LOCALITY": {"ignore_above": 2048, "type": "keyword"},
- "TM_SCRIPTEXE": {"ignore_above": 2048, "type": "keyword"},
- "REQUIRE_ACCELERATOR": {"ignore_above": 2048, "type": "keyword"},
- }
- }
- }
-
- # Send data to Opensearch
-
- _index_template = 'crab-taskdb'
- client = osearch.get_es_client("os-cms.cern.ch/es", '/data/certs/monit.d/monit_spark_crab.txt', get_index_schema())
- idx = client.get_or_create_index(timestamp=day.strftime("%s"), index_template=_index_template, index_mod="M")
- no_of_fail_saved = client.send(idx, docs, metadata=None, batch_size=10000, drop_nulls=False)
-
- print("================================= CRAB Table ======================================="
- , "FINISHED : ", len(docs), "ROWS ARE SENT", no_of_fail_saved, "ROWS ARE FAILED"
- , "================================= CRAB Table =======================================", sep='\n')
-
-
-for day in date_list:
- process_single_day(day)
-
diff --git a/src/script/Monitor/crab-spark/cronjobs/crab_tape_recall_rules_history_daily.py b/src/script/Monitor/crab-spark/cronjobs/crab_tape_recall_rules_history_daily.py
deleted file mode 100644
index 5fa5ddcac5..0000000000
--- a/src/script/Monitor/crab-spark/cronjobs/crab_tape_recall_rules_history_daily.py
+++ /dev/null
@@ -1,164 +0,0 @@
-# import pickle
-from datetime import datetime, timedelta
-
-# import click
-import os
-import pandas as pd
-# import pprint
-import time
-# from dateutil.relativedelta import relativedelta
-
-import argparse
-parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument("-s", "--start-date", default=None,
- help="process data starting from this day, inclusive (YYYY-MM-DD)",)
-parser.add_argument("-e", "--end-date", default=None,
- help="process data until this day, not included (YYYY-MM-DD)",)
-args = parser.parse_args()
-print(f"timerange: [{args.start_date} {args.end_date})" )
-
-
-from pyspark import SparkContext, StorageLevel
-from pyspark.sql import SparkSession
-from pyspark.sql.functions import (
- col, collect_list, concat_ws, greatest, lit, lower, when,
- avg as _avg,
- count as _count,
- hex as _hex,
- max as _max,
- min as _min,
- round as _round,
- sum as _sum,
-)
-
-from pyspark.sql.types import (
- LongType,
-)
-
-import numpy as np
-# import math
-import osearch
-from pyspark.sql import SparkSession
-
-spark = SparkSession\
- .builder\
- .appName("crab_tape_recall")\
- .getOrCreate()
-
-# Data date
-
-if args.end_date:
- end_date = datetime.strptime(args.end_date, '%Y-%m-%d')
-else:
- end_date = datetime.now()
- end_date = end_date.replace(minute=0, hour=0, second=0, microsecond=0)
-
-if args.start_date:
- start_date = datetime.strptime(args.start_date, '%Y-%m-%d')
-else:
- start_date = end_date - timedelta(days=1)
-
-date_list = pd.date_range(
- start=start_date,
- end=end_date,
- ).to_pydatetime().tolist()
-
-def process_single_day(day):
-
- # Query date
-
- wa_date = day.strftime("%Y-%m-%d")
- TODAY = wa_date
- YESTERDAY = (day-timedelta(days=1)).strftime("%Y-%m-$d")
- TOYEAR = day.strftime("%Y")
-
- # Import data into database form
-
- HDFS_RUCIO_RULES_HISTORY = f'/project/awg/cms/rucio/{wa_date}/rules_history/'
-
- print("==============================================="
- , "RUCIO : Rules History"
- , "==============================================="
- , "File Directory:", HDFS_RUCIO_RULES_HISTORY
- , "Work Directory:", os.getcwd()
- , "==============================================="
- , "===============================================", sep='\n')
-
- rucio_rules_history = spark.read.format('avro').load(HDFS_RUCIO_RULES_HISTORY).withColumn('ID', lower(_hex(col('ID'))))
-
- # Query data in daily
-
- rucio_rules_history = rucio_rules_history.select("ID", "NAME", "STATE", "EXPIRES_AT", "UPDATED_AT", "CREATED_AT", "ACCOUNT").filter(f"""ACCOUNT IN ('crab_tape_recall')""").cache()
- rucio_rules_history.createOrReplaceTempView("rules_history")
-
- query = query = f"""\
- WITH filter_t AS (
- SELECT ID, NAME, STATE, EXPIRES_AT, UPDATED_AT, CREATED_AT
- FROM rules_history
- WHERE 1=1
- AND CREATED_AT >= unix_timestamp("{TOYEAR}-01-01 00:00:00", "yyyy-MM-dd HH:mm:ss")*1000
- ),
- rn_t AS (
- SELECT ID, NAME, STATE, EXPIRES_AT, UPDATED_AT, CREATED_AT,
- row_number() over(partition by ID order by UPDATED_AT desc) as rn
- FROM filter_t
- ),
- calc_days_t AS (
- SELECT ID, NAME, STATE, EXPIRES_AT, UPDATED_AT, CREATED_AT,
- CASE
- WHEN STATE = 'O' THEN ceil((UPDATED_AT-CREATED_AT)/86400000)
- WHEN STATE != 'O' AND EXPIRES_AT < unix_timestamp("{wa_date} 00:00:00", "yyyy-MM-dd HH:mm:ss")*1000 THEN ceil((EXPIRES_AT-CREATED_AT)/86400000)
- ELSE 0
- END AS DAYS
- FROM rn_t
- WHERE rn = 1
- )
- SELECT *
- FROM calc_days_t
- WHERE 1=1
- AND EXPIRES_AT >= unix_timestamp("{YESTERDAY} 00:00:00", "yyyy-MM-dd HH:mm:ss")*1000
- AND EXPIRES_AT < unix_timestamp("{TODAY} 00:00:00", "yyyy-MM-dd HH:mm:ss")*1000
- """
-
- tmpdf = spark.sql(query)
- tmpdf.show()
-
- # Convert database to dictionary
-
- docs = tmpdf.toPandas().to_dict('records')
-
- # Define type of each schema
-
- def get_index_schema():
- return {
- "settings": {"index": {"number_of_shards": "1", "number_of_replicas": "1"}},
- "mappings": {
- "properties": {
- "timestamp": {"format": "epoch_second", "type": "date"},
- "ID": {"ignore_above": 1024, "type": "keyword"},
- "NAME": {"ignore_above": 2048, "type": "keyword"},
- "STATE": {"ignore_above": 1024, "type": "keyword"},
- "EXPIRES_AT": {"format": "epoch_millis", "type": "date"},
- "UPDATED_AT": {"format": "epoch_millis", "type": "date"},
- "CREATED_AT": {"format": "epoch_millis", "type": "date"},
- "DAYS": {"type": "long"},
- }
- }
- }
-
- # Send data to Opensearch
-
- _index_template = 'crab-tape-recall-daily'
- client = osearch.get_es_client("os-cms.cern.ch/es", '/data/certs/monit.d/monit_spark_crab.txt', get_index_schema())
- idx = client.get_or_create_index(timestamp=day.strftime("%s"), index_template=_index_template, index_mod="M")
- no_of_fail_saved = client.send(idx, docs, metadata=None, batch_size=10000, drop_nulls=False)
-
- print("=================================== RUCIO : Rules History ====================================="
- , "FINISHED : "
- , len(docs), "ROWS ARE SENT"
- , no_of_fail_saved, "ROWS ARE FAILED"
- , "=================================== RUCIO : Rules History =====================================", sep='\n')
-
-
-for day in date_list:
- process_single_day(day)
diff --git a/src/script/Monitor/crab-spark/cronjobs/crab_tape_recall_updated_rules_daily.py b/src/script/Monitor/crab-spark/cronjobs/crab_tape_recall_updated_rules_daily.py
deleted file mode 100644
index 79c4bc019e..0000000000
--- a/src/script/Monitor/crab-spark/cronjobs/crab_tape_recall_updated_rules_daily.py
+++ /dev/null
@@ -1,163 +0,0 @@
-
-from datetime import datetime, timedelta
-import os
-import pandas as pd
-import time
-
-import argparse
-
-parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument("-s", "--start-date", default=None,
- help="process data starting from this day, inclusive (YYYY-MM-DD)",)
-parser.add_argument("-e", "--end-date", default=None,
- help="process data until this day, not included (YYYY-MM-DD)",)
-args = parser.parse_args()
-print(f"timerange: [{args.start_date} {args.end_date})" )
-
-from pyspark import SparkContext, StorageLevel
-from pyspark.sql import SparkSession
-from pyspark.sql.functions import (
- col, collect_list, concat_ws, greatest, lit, lower, when,
- avg as _avg,
- count as _count,
- hex as _hex,
- max as _max,
- min as _min,
- round as _round,
- sum as _sum,
-)
-from pyspark.sql.types import (
- LongType,
-)
-import numpy as np
-import osearch
-from pyspark.sql import SparkSession
-
-spark = SparkSession\
- .builder\
- .appName("crab_tape_recall")\
- .getOrCreate()
-
-# Data date
-
-if args.end_date:
- end_date = datetime.strptime(args.end_date, '%Y-%m-%d')
-else:
- end_date = datetime.now()
- end_date = end_date.replace(minute=0, hour=0, second=0, microsecond=0)
-
-if args.start_date:
- start_date = datetime.strptime(args.start_date, '%Y-%m-%d')
-else:
- start_date = end_date - timedelta(days=1)
-
-date_list = pd.date_range(
- start=start_date,
- end=end_date,
- ).to_pydatetime().tolist()
-
-def process_single_day(day):
-
- wa_date = day.strftime("%Y-%m-%d")
-
- # Import data into database form
-
- HDFS_RUCIO_DATASET_LOCKS = f'/project/awg/cms/rucio/{wa_date}/dataset_locks/part*.avro'
- HDFS_RUCIO_RSES = f'/project/awg/cms/rucio/{wa_date}/rses/part*.avro'
- HDFS_RUCIO_RULES = f'/project/awg/cms/rucio/{wa_date}/rules'
- print("===============================================", "File Directory:", HDFS_RUCIO_DATASET_LOCKS, "Work Directory:", os.getcwd(), "===============================================", sep='\n')
-
- print("==============================================="
- , "RUCIO : Rules, RSEs, Dataset"
- , "==============================================="
- , "File Directory:", HDFS_RUCIO_DATASET_LOCKS
- , "Work Directory:", os.getcwd()
- , "==============================================="
- , "===============================================", sep='\n')
- rucio_dataset_locks = spark.read.format('avro').load(HDFS_RUCIO_DATASET_LOCKS)\
- .withColumn('BYTES', col('BYTES').cast(LongType()))\
- .withColumn('RULE_ID', lower(_hex(col('RULE_ID'))))\
- .withColumn('RSE_ID', lower(_hex(col('RSE_ID'))))
- rucio_dataset_locks.createOrReplaceTempView("dataset_locks")
-
- rucio_rses = spark.read.format('avro').load(HDFS_RUCIO_RSES)\
- .withColumn('ID', lower(_hex(col('ID'))))
- rucio_rses.createOrReplaceTempView("rses")
-
- rucio_rules = spark.read.format('avro').load(HDFS_RUCIO_RULES)\
- .withColumn('ID', lower(_hex(col('ID'))))
- rucio_rules.createOrReplaceTempView("rules")
-
- # filter and query
-
- rucio_dataset_locks = rucio_dataset_locks.filter(f"""ACCOUNT IN ('crab_tape_recall')""").cache()
- rucio_rses = rucio_rses.select('ID', 'RSE', 'RSE_TYPE').cache()
- rucio_rules = rucio_rules.select('ID', 'ACCOUNT', 'DID_TYPE', 'EXPIRES_AT').cache()
-
- result_df = rucio_dataset_locks.join(rucio_rses, rucio_rses["ID"] == rucio_dataset_locks["RSE_ID"])\
- .join(rucio_rules, rucio_rules["ID"] == rucio_dataset_locks["RULE_ID"]).drop('ID', 'RULE_ID', 'RSE_ID', 'ACCESSED_AT', 'ACCOUNT')
-
- # Convert database to dictionary
-
- docs = result_df.toPandas().to_dict('records')
-
- # Add TIMESTAMP column and convert TiB
- TIME = datetime.strptime(f"""{wa_date} 00:00:00""", "%Y-%m-%d %H:%M:%S").timestamp()*1000
- for i in range(len(docs)):
- docs[i]['TIMESTAMP'] = TIME
- docs[i]['SIZE_TiB'] = docs[i]["BYTES"]/1099511627776
- del docs[i]["BYTES"]
-
- # break down the name
- NAME_i = docs[i]['NAME']
- split_NAME = NAME_i.split('#')[0]
- docs[i]['NAME_'] = NAME_i.split('#')[0]
- split_NAME = docs[i]['NAME_'].split('/')
- if len(split_NAME) != 4:
- print("YO HOO !!, something wrong.", NAME_i)
- docs[i]['PriDataset'] = split_NAME[1]
- docs[i]['DataTier'] = split_NAME[-1]
-
- # Define type of each schema
-
- def get_index_schema():
- return {
- "settings": {"index": {"number_of_shards": "1", "number_of_replicas": "1"}},
- "mappings": {
- "properties": {
- 'SCOPE': {"ignore_above": 2048, "type": "keyword"},
- 'NAME': {"ignore_above": 2048, "type": "keyword"},
- 'STATE': {"ignore_above": 1024, "type": "keyword"},
- 'LENGTH': {"ignore_above": 1024, "type": "keyword"},
- 'SIZE_TiB': {"type": "long"},
- 'UPDATED_AT': {"format": "epoch_millis", "type": "date"},
- 'CREATED_AT': {"format": "epoch_millis", "type": "date"},
- 'RSE': {"ignore_above": 2048, "type": "keyword"},
- 'RSE_TYPE': {"ignore_above": 2048, "type": "keyword"},
- 'DID_TYPE': {"ignore_above": 1024, "type": "keyword"},
- 'EXPIRES_AT': {"format": "epoch_millis", "type": "date"},
- 'TIMESTAMP': {"format": "epoch_millis", "type": "date"},
- 'NAME_': {"ignore_above": 2048, "type": "keyword"},
- 'PriDataset': {"ignore_above": 2048, "type": "keyword"},
- 'DataTier': {"ignore_above": 2048, "type": "keyword"},
- }
- }
- }
-
- # Send data to Opensearch
-
- _index_template = 'crab-tape-recall-rules'
- client = osearch.get_es_client("os-cms.cern.ch/es", '/data/certs/monit.d/monit_spark_crab.txt', get_index_schema())
- idx = client.get_or_create_index(timestamp=day.strftime("%s"), index_template=_index_template, index_mod="M")
- no_of_fail_saved = client.send(idx, docs, metadata=None, batch_size=10000, drop_nulls=False)
-
- print("==================================== RUCIO : Rules, RSEs, Dataset ===================================="
- , "FINISHED : "
- , len(docs), "ROWS ARE SENT"
- , no_of_fail_saved, "ROWS ARE FAILED"
- , "==================================== RUCIO : Rules, RSEs, Dataset ====================================", sep='\n')
-
-
-for day in date_list:
- process_single_day(day)
-
diff --git a/src/script/Monitor/crab-spark/cronjobs/cron_daily.sh b/src/script/Monitor/crab-spark/cronjobs/cron_daily.sh
deleted file mode 100644
index 7665fa59b8..0000000000
--- a/src/script/Monitor/crab-spark/cronjobs/cron_daily.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/bash
-
-TAG=latest
-if [[ -n $1 ]]; then
- TAG=$1
-fi
-
-docker run --rm --net=host -v /cvmfs:/cvmfs:shared \
- -v /data/certs/monit.d/monit_spark_crab.txt:/data/certs/monit.d/monit_spark_crab.txt \
- -v /data/certs/keytabs.d/cmscrab.keytab:/data/certs/keytabs.d/cmscrab.keytab \
- registry.cern.ch/cmscrab/crabspark:${TAG} \
- bash /data/srv/spark/run_spark.sh /data/srv/spark/crab_data_daily.py \
-
-docker run --rm --net=host -v /cvmfs:/cvmfs:shared \
- -v /data/certs/monit.d/monit_spark_crab.txt:/data/certs/monit.d/monit_spark_crab.txt \
- -v /data/certs/keytabs.d/cmscrab.keytab:/data/certs/keytabs.d/cmscrab.keytab \
- registry.cern.ch/cmscrab/crabspark:${TAG} \
- bash /data/srv/spark/run_spark.sh /data/srv/spark/crab_condor_daily.py
-
- docker run --rm --net=host -v /cvmfs:/cvmfs:shared \
- -v /data/certs/monit.d/monit_spark_crab.txt:/data/certs/monit.d/monit_spark_crab.txt \
- -v /data/certs/keytabs.d/cmscrab.keytab:/data/certs/keytabs.d/cmscrab.keytab \
- registry.cern.ch/cmscrab/crabspark:${TAG} \
- bash /data/srv/spark/run_spark.sh /data/srv/spark/crab_tape_recall_rules_history_daily.py
-
-docker run --rm --net=host -v /cvmfs:/cvmfs:shared \
- -v /data/certs/monit.d/monit_spark_crab.txt:/data/certs/monit.d/monit_spark_crab.txt \
- -v /data/certs/keytabs.d/cmscrab.keytab:/data/certs/keytabs.d/cmscrab.keytab \
- registry.cern.ch/cmscrab/crabspark:${TAG} \
- bash /data/srv/spark/run_spark.sh /data/srv/spark/crab_tape_recall_updated_rules_daily.py
-
diff --git a/src/script/Monitor/crab-spark/cronjobs/run_spark.sh b/src/script/Monitor/crab-spark/cronjobs/run_spark.sh
index fb5b13d3ab..5412f4119b 100644
--- a/src/script/Monitor/crab-spark/cronjobs/run_spark.sh
+++ b/src/script/Monitor/crab-spark/cronjobs/run_spark.sh
@@ -1,10 +1,15 @@
#!/bin/bash
+set -euo pipefail
+
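+# resolve the directory this script lives in, so it can be invoked from any working directory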
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# work directory
-cd /data/srv/spark
+pushd "${SCRIPT_DIR}"
# source the environment for spark submit
-source ./bootstrap.sh
+source ../workdir/bootstrap.sh
# submit $1 to spark, where $1 is expected to be a data-pulling script (.py)
spark-submit --master yarn --packages org.apache.spark:spark-avro_2.12:3.5.0 $@
+
+popd
diff --git a/src/script/Monitor/crab-spark/notebooks/crab_condor.ipynb b/src/script/Monitor/crab-spark/notebooks/crab_condor.ipynb
index 82f3f81eb2..65855e1112 100644
--- a/src/script/Monitor/crab-spark/notebooks/crab_condor.ipynb
+++ b/src/script/Monitor/crab-spark/notebooks/crab_condor.ipynb
@@ -1,85 +1,43 @@
{
"cells": [
{
- "cell_type": "code",
- "execution_count": 1,
- "id": "cf212bba",
+ "cell_type": "markdown",
+ "id": "aed9b54a",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "
\n",
- "
SparkSession - in-memory
\n",
- " \n",
- "
\n",
- "
SparkContext
\n",
- "\n",
- "
Spark UI
\n",
- "\n",
- "
\n",
- " - Version
\n",
- " v3.3.2
\n",
- " - Master
\n",
- " yarn
\n",
- " - AppName
\n",
- " pyspark_shell_swan
\n",
- "
\n",
- "
\n",
- " \n",
- "
\n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 1,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
- "spark"
+ "# CRAB Spark condor job\n",
+ "\n",
+ "This join info between the condor job metrics and crab taskdb, to answer these questions:\n",
+ "- How many jobs use ignorelocality?\n",
+ "- What is wall clock time spent by each CMS data tier and each job type?\n",
+ "- What is the success rate of the Analysis job type?\n"
]
},
{
"cell_type": "code",
- "execution_count": 2,
- "id": "77d4d561",
+ "execution_count": null,
+ "id": "5e9af689",
"metadata": {},
"outputs": [],
"source": [
+ "from datetime import datetime, timedelta, timezone\n",
"import os\n",
- "import sys\n",
- "\n",
- "os.environ['PYSPARK_PYTHON'] = sys.executable\n",
- "os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable\n",
- "\n",
"import time\n",
- "# from utils import (\n",
- "# _to_dict,\n",
- "# _donut,\n",
- "# _pie,\n",
- "# _line_graph,\n",
- "# _other_fields,\n",
- "# _exitcode_info,\n",
- "# _better_label\n",
- "# )\n",
- "from datetime import datetime, date, timedelta\n",
+ "import pandas as pd\n",
+ "\n",
+ "from pyspark import SparkContext, StorageLevel\n",
+ "from pyspark.sql import SparkSession\n",
"from pyspark.sql.functions import (\n",
- " col,\n",
- " lit,\n",
- " when,\n",
- " sum as _sum,\n",
+ " current_user,\n",
+ " col, collect_list, concat_ws, greatest, lit, lower, when,\n",
+ " avg as _avg,\n",
" count as _count,\n",
- " first,\n",
- " date_format,\n",
- " from_unixtime\n",
+ " hex as _hex,\n",
+ " max as _max,\n",
+ " min as _min,\n",
+ " round as _round,\n",
+ " sum as _sum,\n",
")\n",
- "import numpy as np\n",
- "import pandas as pd\n",
"from pyspark.sql.types import (\n",
" StructType,\n",
" LongType,\n",
@@ -87,615 +45,407 @@
" StructField,\n",
" DoubleType,\n",
" IntegerType,\n",
- ")\n",
- "# spark.conf.set(\"spark.sql.execution.arrow.pyspark.enabled\", \"true\")\n"
+ ")"
]
},
{
- "cell_type": "markdown",
- "id": "6b14b465",
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "51b2f1c7",
"metadata": {},
+ "outputs": [],
"source": [
- "### Prepare condor file name/configuration"
+ "# try to import libs from current directory, fallback to $PWD/../workdir if not found\n",
+ "try:\n",
+ " from crabspark_utils import get_candidate_files, send_os, send_os_parallel\n",
+ "except ModuleNotFoundError:\n",
+ " import sys\n",
+ " sys.path.insert(0, f'{os.getcwd()}/../workdir')\n",
+ " from crabspark_utils import get_candidate_files, send_os, send_os_parallel\n"
]
},
{
"cell_type": "code",
- "execution_count": 3,
- "id": "65a21e3a",
+ "execution_count": null,
+ "id": "22946659",
"metadata": {},
"outputs": [],
"source": [
- "def _get_schema():\n",
- " return StructType(\n",
- " [\n",
- " StructField(\n",
- " \"data\",\n",
- " StructType(\n",
- " [\n",
- " StructField(\"RecordTime\", LongType(), nullable=False),\n",
- " StructField(\"CMSPrimaryDataTier\", StringType(), nullable=True),\n",
- " StructField(\"Status\", StringType(), nullable=True),\n",
- " StructField(\"WallClockHr\", DoubleType(), nullable=True),\n",
- " StructField(\"CoreHr\", DoubleType(), nullable=True),\n",
- " StructField(\"CpuTimeHr\", DoubleType(), nullable=True),\n",
- " StructField(\"Type\", StringType(), nullable=True),\n",
- " StructField(\"CRAB_DataBlock\", StringType(), nullable=True),\n",
- " StructField(\"GlobalJobId\", StringType(), nullable=False),\n",
- " StructField(\"ExitCode\", LongType(), nullable=True),\n",
- " StructField(\"CRAB_Workflow\", StringType(), nullable=True),\n",
- " StructField(\"CommittedCoreHr\", StringType(), nullable=True),\n",
- " StructField(\"CommittedWallClockHr\", StringType(), nullable=True),\n",
- " ]\n",
- " ),\n",
- " ),\n",
- " ]\n",
- " )"
+ "spark = SparkSession\\\n",
+ " .builder\\\n",
+ " .appName('condor-job')\\\n",
+ " .getOrCreate()\n",
+ "spark"
]
},
{
"cell_type": "code",
- "execution_count": 4,
- "id": "5344e275",
+ "execution_count": null,
+ "id": "d37c4539",
"metadata": {},
"outputs": [],
"source": [
- "_DEFAULT_HDFS_FOLDER = \"/project/monitoring/archive/condor/raw/metric\""
+ "# clear any cache left, for working with notebook\n",
+ "# it safe to run everytime cronjob start\n",
+ "spark.catalog.clearCache()"
]
},
{
"cell_type": "code",
- "execution_count": 5,
- "id": "c20d8d62",
+ "execution_count": null,
+ "id": "31c19eb0",
"metadata": {},
"outputs": [],
"source": [
- "# # Check available files \n",
- "# !hdfs dfs -ls /project/monitoring/archive/condor/raw/metric/2023/07/08"
+ "# secret path, also check if file exists\n",
+ "secretpath = os.environ.get('OPENSEARCH_SECRET_PATH', f'{os.getcwd()}/../workdir/secret_opensearch.txt')\n",
+ "if not os.path.isfile(secretpath): \n",
+ " raise Exception(f'OS secrets file {secretpath} does not exists')\n",
+ "# if PROD, index prefix will be `crab-*`, otherwise `crab-test-*`\n",
+ "PROD = os.environ.get('PROD', 'false').lower() in ('true', '1', 't')\n",
+ "# FROM_DATE, in strptime(\"%Y-%m-%d\")\n",
+ "START = os.environ.get('START_DATE', None) \n",
+ "END = os.environ.get('END_DATE', None)"
]
},
{
"cell_type": "code",
- "execution_count": 6,
- "id": "8d821f8f",
+ "execution_count": null,
+ "id": "e843eb6d",
"metadata": {},
"outputs": [],
"source": [
- "def get_candidate_files(start_date, end_date, spark, base=_DEFAULT_HDFS_FOLDER):\n",
- " st_date = start_date - timedelta(days=0)\n",
- " ed_date = end_date + timedelta(days=0)\n",
- " days = (ed_date - st_date).days\n",
- " pre_candidate_files = [\n",
- " \"{base}/{day}{{,.tmp}}\".format(\n",
- " base=base, day=(st_date + timedelta(days=i)).strftime(\"%Y/%m/%d\")\n",
- " )\n",
- " for i in range(0, days)\n",
- " ]\n",
- " sc = spark.sparkContext\n",
- " \n",
- " candidate_files = [\n",
- " f\"{base}/{(st_date + timedelta(days=i)).strftime('%Y/%m/%d')}\"\n",
- " for i in range(0, days)\n",
- " ]\n",
- " FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem\n",
- " URI = sc._gateway.jvm.java.net.URI\n",
- " Path = sc._gateway.jvm.org.apache.hadoop.fs.Path\n",
- " fs = FileSystem.get(URI(\"hdfs:///\"), sc._jsc.hadoopConfiguration())\n",
- " # FIXME\n",
- " candidate_files = [url for url in candidate_files if fs.globStatus(Path(url))]\n",
- " print(\"No. of Consisted files:\", len(candidate_files))\n",
- " return candidate_files\n",
- "\n",
- "# all_candidate_files = []\n",
- "# candidate_files = [\n",
- "# f\"{base}/{(st_date + timedelta(days=i)).strftime('%Y/%m/%d')}\"\n",
- "# for i in range(0, days)\n",
- "# ]\n",
- " \n",
- "# URI = sc._gateway.jvm.java.net.URI\n",
- "# Path = sc._gateway.jvm.org.apache.hadoop.fs.Path\n",
- "# FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem\n",
- "# Configuration = sc._gateway.jvm.org.apache.hadoop.conf.Configuration\n",
- "# fs = FileSystem.get(URI(\"hdfs:///\"), Configuration())\n",
- "\n",
- "# for fileNames in candidate_files:\n",
- "# status = fs.listStatus(Path(fileNames))\n",
- "# candidate_files_day_i = [\n",
- "# str(fileStatus.getPath()).replace('hdfs://analytix', '')\n",
- "# for fileStatus in status\n",
- "# ]\n",
- "# all_candidate_files.extend(candidate_files_day_i)\n",
- "# print(\"Files Directory:\", candidate_files, \"\\nNo. of Consisted files:\", len(all_candidate_files))\n",
- "# return all_candidate_files\n",
- "\n",
- "def group_files(files, n=16):\n",
- " # Yield successive n-sized\n",
- " # chunks from files\n",
- " all_group = []\n",
- " for i in range(0, len(files), n):\n",
- " all_group.append(files[i:i+n])\n",
- " print(\"There are\", len(all_group), \"chunks of files\")\n",
- " return all_group"
+ "# For run playbook manually, set start/end date here\n",
+ "START_DATE = \"2024-10-01\"\n",
+ "END_DATE = \"2024-10-02\"\n",
+ "# if cronjob, replace constant with value from env\n",
+ "if START and END:\n",
+ " START_DATE = START\n",
+ " END_DATE = END"
]
},
{
- "cell_type": "markdown",
- "id": "9a57477b",
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "430146eb",
"metadata": {},
+ "outputs": [],
"source": [
- "## load dataset"
+ "# index name\n",
+ "index_name = 'condor-taskdb'\n",
+ "# use prod index pattern if this execution is for production\n",
+ "if PROD:\n",
+ " index_name = f'crab-prod-{index_name}'\n",
+ "else:\n",
+ " index_name = f'crab-test-{index_name}'"
]
},
{
"cell_type": "code",
- "execution_count": 7,
- "id": "28bcc686",
+ "execution_count": null,
+ "id": "2a3b6697",
"metadata": {},
"outputs": [],
"source": [
- "schema = _get_schema()\n",
- "start_date = datetime(2023, 8, 10)\n",
- "end_date = datetime(2023, 8, 11)"
+ "# datetime object\n",
+ "start_datetime = datetime.strptime(START_DATE, \"%Y-%m-%d\").replace(tzinfo=timezone.utc)\n",
+ "end_datetime = datetime.strptime(END_DATE, \"%Y-%m-%d\").replace(tzinfo=timezone.utc)\n",
+ "# sanity check\n",
+ "if end_datetime < start_datetime: \n",
+ " raise Exception(f\"end date ({END_DATE}) is less than start date ({START_DATE})\")\n",
+ "start_epochmilis = int(start_datetime.timestamp()) * 1000\n",
+ "end_epochmilis = int(end_datetime.timestamp()) * 1000\n",
+ "yesterday_epoch = int((end_datetime-timedelta(days=1)).timestamp())"
]
},
{
"cell_type": "code",
- "execution_count": 8,
- "id": "bec66775",
+ "execution_count": null,
+ "id": "9404c437",
"metadata": {
"scrolled": true
},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "No. of Consisted files: 1\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "['/project/monitoring/archive/condor/raw/metric/2023/08/10']"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "candidate_files = get_candidate_files(start_date, end_date, spark, base=_DEFAULT_HDFS_FOLDER)\n",
- "candidate_files"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "894bdcf0",
- "metadata": {},
+ "outputs": [],
"source": [
- "### Prepare CRAB data file name"
+ "# debug\n",
+ "print(START_DATE, \n",
+ " END_DATE, \n",
+ " index_name,\n",
+ " sep='\\n')"
]
},
{
"cell_type": "code",
- "execution_count": 9,
- "id": "b4120002",
+ "execution_count": null,
+ "id": "9d4bb4d0",
"metadata": {},
"outputs": [],
"source": [
- "TODAY = str(end_date)[:10]\n",
- "wa_date = TODAY\n",
- "HDFS_CRAB_part = f'/project/awg/cms/crab/tasks/{wa_date}/'"
+ "# read crab data\n",
+ "HDFS_CRAB_part = f'/project/awg/cms/crab/tasks/{END_DATE}/' \n",
+ "crab_df = spark.read.format('avro').load(HDFS_CRAB_part)\n",
+ "# we did not filter the task here because most jobs was created from older tasks.\n",
+ "# if there are too many crab tasks, it might be safe to filter out the tasks older than 30+7 days ago.\n",
+ "crab_df = crab_df.select('TM_TASKNAME', 'TM_IGNORE_LOCALITY').cache()\n",
+ "crab_df.createOrReplaceTempView(\"tasks\")"
]
},
{
- "cell_type": "markdown",
- "id": "de4d8e96",
- "metadata": {},
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f15887f4",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
"source": [
- "### Get raw data from condor raw"
+ "# read condor data\n",
+ "# reading file 2 days before start date and 1 days after end date inclusive\n",
+ "# sometime flume (condor log aggregator) process the metrics is delay for 2 days, sometime it has timestamp from the future.\n",
+ "# so we do this to make sure we get all metrics from the date we want. (all of these suggested by CMSMONIT)\n",
+ "# Note that we read all files, compact or not, even it has the same content, we will dedup it in the next step.\n",
+ "_DEFAULT_HDFS_FOLDER = \"/project/monitoring/archive/condor/raw/metric\"\n",
+ "candidate_files = get_candidate_files(start_datetime, end_datetime, spark=spark, base=_DEFAULT_HDFS_FOLDER, day_delta=2)\n",
+ "\n",
+ "# this is map json doc to spark schema\n",
+ "read_schema = StructType(\n",
+ " [\n",
+ " StructField(\n",
+ " \"data\",\n",
+ " StructType(\n",
+ " [\n",
+ " StructField(\"RecordTime\", LongType(), nullable=False),\n",
+ " StructField(\"CMSPrimaryDataTier\", StringType(), nullable=True),\n",
+ " StructField(\"Status\", StringType(), nullable=True),\n",
+ " StructField(\"WallClockHr\", DoubleType(), nullable=True),\n",
+ " StructField(\"CoreHr\", DoubleType(), nullable=True),\n",
+ " StructField(\"CpuTimeHr\", DoubleType(), nullable=True),\n",
+ " StructField(\"Type\", StringType(), nullable=True),\n",
+ " StructField(\"CRAB_DataBlock\", StringType(), nullable=True),\n",
+ " StructField(\"GlobalJobId\", StringType(), nullable=False),\n",
+ " StructField(\"ExitCode\", LongType(), nullable=True),\n",
+ " StructField(\"CRAB_Workflow\", StringType(), nullable=True),\n",
+ " StructField(\"CommittedCoreHr\", StringType(), nullable=True),\n",
+ " StructField(\"CommittedWallClockHr\", StringType(), nullable=True),\n",
+ " ]\n",
+ " ),\n",
+ " ),\n",
+ " ]\n",
+ " )\n",
+ "print(\"===============================================\"\n",
+ " , \"Condor Matrix and CRAB Table\"\n",
+ " , \"===============================================\"\n",
+ " , \"File Directory:\", _DEFAULT_HDFS_FOLDER, candidate_files\n",
+ " , \"Work Directory:\", os.getcwd()\n",
+ " , \"===============================================\"\n",
+ " , \"===============================================\", sep='\\n')"
]
},
{
"cell_type": "code",
- "execution_count": 10,
- "id": "0aa94c64",
+ "execution_count": null,
+ "id": "fd3bcb00",
"metadata": {},
"outputs": [],
"source": [
- "spark.conf.set(\"spark.sql.session.timeZone\", \"UTC\")\n",
- "\n",
- "crab_df = spark.read.format('avro').load(HDFS_CRAB_part)\n",
- "crab_df = crab_df.select('TM_TASKNAME', 'TM_IGNORE_LOCALITY')"
+ "crab_username = spark.sql(\"\"\"SELECT current_user() AS user\"\"\").toPandas().to_dict('records')[0]['user']"
]
},
{
"cell_type": "code",
- "execution_count": 25,
- "id": "b35668ef",
+ "execution_count": null,
+ "id": "515aefbc",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "23/08/16 13:48:02 WARN CacheManager: Asked to cache already cached data.\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
- "condor_df = (\n",
- " spark.read.option(\"basePath\", _DEFAULT_HDFS_FOLDER)\n",
- " .json(\n",
+ "# extract only \"interested data\" from condor metrics and save into temporary area\n",
+ "# need to do this because we do not have enough memory to compute all data at once.\n",
+ "# (1 days is ok, 1 month got spark OOM)\n",
+ "# \"interested data\" is\n",
+ "# - selected column (see `read_schema` above)\n",
+ "# - date range from START_DATE inclusive to END_DATE exclusive\n",
+ "# - only status Complete and type analysis\n",
+ "# job will got dedup by `.drop_duplicates([\"GlobalJobId\"])` in later step\n",
+ "( \n",
+ " spark.read.option(\"basePath\", _DEFAULT_HDFS_FOLDER)\n",
+ " .json(\n",
" candidate_files,\n",
- " schema=schema,\n",
- " ).select(\"data.*\")\n",
- " .filter(\n",
+ " schema=read_schema,\n",
+ " )\n",
+ " .select(\"data.*\")\n",
+ " .filter(\n",
" f\"\"\"Status IN ('Completed')\n",
" AND Type IN ('analysis')\n",
- " AND RecordTime >= {start_date.timestamp() * 1000}\n",
- " AND RecordTime < {end_date.timestamp() * 1000}\n",
+ " AND RecordTime >= {start_epochmilis}\n",
+ " AND RecordTime < {end_epochmilis}\n",
" \"\"\"\n",
- " )\n",
- " .drop_duplicates([\"GlobalJobId\"]).cache()\n",
- " ) \n",
- "condor_df.write.mode('overwrite').parquet(\"/cms/users/eatthaph/condor_vir_data\" ,compression='zstd')\n",
- "condor_df = spark.read.format('parquet').load('/cms/users/eatthaph/condor_vir_data')\n",
- "# condor_df.count()"
+ " )\n",
+ " .drop_duplicates([\"GlobalJobId\"])\n",
+ " .write.mode('overwrite').parquet(f\"/cms/users/{crab_username}/condor_vir_data\" ,compression='zstd') # overriding the same path to cleanup old data. However, we could not run it parallel\n",
+ ")\n",
+ "spark.catalog.clearCache()"
]
},
{
"cell_type": "code",
- "execution_count": 27,
- "id": "7656d1f3",
+ "execution_count": null,
+ "id": "957ac50a",
"metadata": {},
"outputs": [],
"source": [
- "result_df = condor_df.join(crab_df, crab_df[\"TM_TASKNAME\"] == condor_df[\"CRAB_Workflow\"])\\\n",
- " .select('RecordTime', 'CMSPrimaryDataTier', 'WallClockHr', 'CoreHr', 'CpuTimeHr', 'ExitCode'\n",
- " , \"CRAB_DataBlock\", \"TM_IGNORE_LOCALITY\", \"GlobalJobId\", \"CommittedCoreHr\", \"CommittedWallClockHr\")\n",
- "docs = result_df.toPandas()"
+ "condor_df = spark.read.format('parquet').load(f\"/cms/users/{crab_username}/condor_vir_data\").cache()\n",
+ "condor_df.createOrReplaceTempView(\"condor\")"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "2b04b914",
- "metadata": {},
+ "id": "e271b1c8",
+ "metadata": {
+ "scrolled": true
+ },
"outputs": [],
"source": [
- "len(docs)"
+ "# query\n",
+ "query = f\"\"\"\\\n",
+ "WITH filter_tb AS (\n",
+ "SELECT *\n",
+ "FROM condor\n",
+ "WHERE 1=1\n",
+ "AND RecordTime >= {start_epochmilis}\n",
+ "AND RecordTime < {end_epochmilis}\n",
+ "),\n",
+ "join_tb AS (\n",
+ "SELECT RecordTime, CMSPrimaryDataTier, WallClockHr, CoreHr, CpuTimeHr, ExitCode, CRAB_DataBlock, TM_IGNORE_LOCALITY, GlobalJobId, CommittedCoreHr, CommittedWallClockHr\n",
+ "FROM filter_tb\n",
+ "INNER JOIN tasks \n",
+ "ON filter_tb.CRAB_Workflow = tasks.TM_TASKNAME \n",
+ "), \n",
+ "finalize_tb AS (\n",
+ "SELECT RecordTime, CMSPrimaryDataTier, WallClockHr, CoreHr, CpuTimeHr, ExitCode, CRAB_DataBlock, TM_IGNORE_LOCALITY, GlobalJobId, CommittedCoreHr, CommittedWallClockHr, \n",
+ " CASE \n",
+ " WHEN CRAB_DataBlock = 'MCFakeBlock' THEN 'PrivateMC' \n",
+ " ELSE 'Analysis'\n",
+ " END AS CRAB_Type, --- to differentiate between analysis and mc\n",
+ " 'condor' AS type, --- use to match specific data when use wildcard index pattern on grafana side\n",
+ " RecordTime AS timestamp --- use `RecordTime` as timestamp\n",
+ "FROM join_tb\n",
+ ")\n",
+ "SELECT * \n",
+ "FROM finalize_tb \n",
+ "\"\"\"\n",
+ "tmpdf = spark.sql(query)\n",
+ "tmpdf.show(10)\n",
+ "\n"
]
},
{
"cell_type": "code",
- "execution_count": 19,
- "id": "fa3f9917",
+ "execution_count": null,
+ "id": "75c6a964",
"metadata": {},
"outputs": [],
"source": [
- "# def spark_exec(candidate_files):\n",
- "# condor_df = (\n",
- "# spark.read.option(\"basePath\", _DEFAULT_HDFS_FOLDER)\n",
- "# .json(\n",
- "# candidate_files,\n",
- "# schema=schema,\n",
- "# ).select(\"data.*\")\n",
- "# .filter(\n",
- "# f\"\"\"Status IN ('Completed')\n",
- "# AND Type IN ('analysis')\n",
- "# AND RecordTime >= {start_date.timestamp() * 1000}\n",
- "# AND RecordTime < {end_date.timestamp() * 1000}\n",
- "# \"\"\"\n",
- "# )\n",
- "# .drop_duplicates([\"GlobalJobId\"]).cache()\n",
- "# ) \n",
- "# condor_df.write.mode('overwrite').parquet(\"/cms/users/eatthaph/condor_vir_data\" ,compression='zstd')\n",
- "# condor_df = spark.read.format('parquet').load('/cms/users/eatthaph/condor_vir_data')\n",
- "# result_df = condor_df.join(crab_df, crab_df[\"TM_TASKNAME\"] == condor_df[\"CRAB_Workflow\"])\\\n",
- "# .select('RecordTime', 'CMSPrimaryDataTier', 'WallClockHr', 'CoreHr', 'CpuTimeHr', 'ExitCode'\n",
- "# , \"CRAB_DataBlock\", \"TM_IGNORE_LOCALITY\", \"GlobalJobId\", \"CommittedCoreHr\", \"CommittedWallClockHr\")\n",
- "# sub_docs = result_df.toPandas()\n",
- "# return sub_docs\n",
- "\n",
- "# def loop_excute(candidate_files, initial_n=len(candidate_files)):\n",
- "# r = 0\n",
- "# n = initial_n\n",
- "# df_list = []\n",
- "# file_chunk = group_files(candidate_files, n)\n",
- "# while len(file_chunk)!=0 and r<10:\n",
- "# print(\"=================================\\n round :\", r+1, \"\\n=================================\")\n",
- "# df_err_list = []\n",
- "# for i, chunk in enumerate(file_chunk):\n",
- "# print(\"=================================\\n\", i+1, \"out of\", len(file_chunk), \"\\n=================================\")\n",
- "# try:\n",
- "# df_list.append(spark_exec(chunk))\n",
- "# except Exception as ex:\n",
- "# print(\"=====\", ex)\n",
- "# df_err_list.extend(chunk)\n",
- "# # if n != 1:\n",
- "# # n = n//2\n",
- "# file_chunk = group_files(df_err_list, n)\n",
- "# r += 1\n",
- "# print(\"\")\n",
- "# print(\"Fail excuted files :\", df_err_list)\n",
- "# return df_list"
+ "tmpdf.count()"
]
},
{
"cell_type": "code",
- "execution_count": 1,
- "id": "af6d5e17",
+ "execution_count": null,
+ "id": "eee4a1f3",
"metadata": {},
"outputs": [],
"source": [
- "# useful_df = loop_excute(candidate_files)\n",
- "# df_list = spark_exec(candidate_files)"
+ "schema = {\n",
+ " \"settings\": {\"index\": {\"number_of_shards\": \"1\", \"number_of_replicas\": \"1\"}},\n",
+ " \"mappings\": {\n",
+ " \"properties\": {\n",
+ " \"RecordTime\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
+ " \"CMSPrimaryDataTier\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"GlobalJobId\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"WallClockHr\": {\"type\": \"long\"},\n",
+ " \"CoreHr\": {\"type\": \"long\"},\n",
+ " \"CpuTimeHr\": {\"type\": \"long\"},\n",
+ " \"ExitCode\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"TM_IGNORE_LOCALITY\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"CRAB_Type\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"CRAB_DataBlock\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"CommittedCoreHr\": {\"type\": \"long\"}, \n",
+ " \"CommittedWallClockHr\": {\"type\": \"long\"},\n",
+ " \"type\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"timestamp\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
+ " }\n",
+ " }\n",
+ " }"
]
},
{
"cell_type": "code",
- "execution_count": 15,
- "id": "18908dab",
- "metadata": {},
+ "execution_count": null,
+ "id": "5d0506d4",
+ "metadata": {
+ "scrolled": true
+ },
"outputs": [],
"source": [
- "docs = docs.to_dict('records')"
+ "# this is simple workaround osearch bug when work in notebook because\n",
+ "# - it load the secret once and use forever\n",
+ "# - get_or_create_index() create index+schema only the first time it execute\n",
+ "# it is safe to run again even in cronjobs \n",
+ "import importlib\n",
+ "import osearch\n",
+ "importlib.reload(osearch)"
]
},
{
"cell_type": "code",
- "execution_count": 16,
- "id": "c912b217",
+ "execution_count": null,
+ "id": "47a4f569",
"metadata": {},
"outputs": [],
"source": [
- "for i in range(len(docs)):\n",
- " if docs[i]['CRAB_DataBlock'] == 'MCFakeBlock':\n",
- " docs[i]['CRAB_Type'] = 'PrivateMC'\n",
- " else:\n",
- " docs[i]['CRAB_Type'] = 'Analysis'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "id": "0e3ae57b",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[{'RecordTime': 1692101192000,\n",
- " 'CMSPrimaryDataTier': 'MINIAODSIM',\n",
- " 'WallClockHr': 0.12361111111111112,\n",
- " 'CoreHr': 0.12361111111111112,\n",
- " 'CpuTimeHr': 0.0022222222222222222,\n",
- " 'ExitCode': 8020,\n",
- " 'CRAB_DataBlock': '/WWTo4Q_4f_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18MiniAODv2-106X_upgrade2018_realistic_v16_L1v1-v3/MINIAODSIM#eb5a0cbd-6c43-492c-ac21-4318775aee3b',\n",
- " 'TM_IGNORE_LOCALITY': 'F',\n",
- " 'GlobalJobId': 'crab3@vocms0155.cern.ch#98631116.0#1692100543',\n",
- " 'CommittedCoreHr': '0.12361111111111112',\n",
- " 'CommittedWallClockHr': '0.12361111111111112',\n",
- " 'CRAB_Type': 'Analysis'},\n",
- " {'RecordTime': 1692099933000,\n",
- " 'CMSPrimaryDataTier': 'MINIAODSIM',\n",
- " 'WallClockHr': 0.12166666666666667,\n",
- " 'CoreHr': 0.12166666666666667,\n",
- " 'CpuTimeHr': 0.004722222222222222,\n",
- " 'ExitCode': 8020,\n",
- " 'CRAB_DataBlock': '/WWTo4Q_4f_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18MiniAODv2-106X_upgrade2018_realistic_v16_L1v1-v3/MINIAODSIM#eb5a0cbd-6c43-492c-ac21-4318775aee3b',\n",
- " 'TM_IGNORE_LOCALITY': 'F',\n",
- " 'GlobalJobId': 'crab3@vocms0155.cern.ch#98629759.0#1692099393',\n",
- " 'CommittedCoreHr': '0.1213888888888889',\n",
- " 'CommittedWallClockHr': '0.1213888888888889',\n",
- " 'CRAB_Type': 'Analysis'},\n",
- " {'RecordTime': 1692121300000,\n",
- " 'CMSPrimaryDataTier': 'MINIAODSIM',\n",
- " 'WallClockHr': 5.698333333333333,\n",
- " 'CoreHr': 5.698333333333333,\n",
- " 'CpuTimeHr': 5.501388888888889,\n",
- " 'ExitCode': 0,\n",
- " 'CRAB_DataBlock': '/QCD_HT50to100_TuneCP5_13TeV-madgraphMLM-pythia8/RunIISummer20UL17MiniAODv2-106X_mc2017_realistic_v9-v2/MINIAODSIM#4c355926-ea17-4285-bd2b-2c7692c48a87',\n",
- " 'TM_IGNORE_LOCALITY': 'F',\n",
- " 'GlobalJobId': 'crab3@vocms0107.cern.ch#96720630.0#1691795011',\n",
- " 'CommittedCoreHr': '5.698333333333333',\n",
- " 'CommittedWallClockHr': '5.698333333333333',\n",
- " 'CRAB_Type': 'Analysis'},\n",
- " {'RecordTime': 1692121556000,\n",
- " 'CMSPrimaryDataTier': 'MINIAODSIM',\n",
- " 'WallClockHr': 5.769722222222223,\n",
- " 'CoreHr': 5.769722222222223,\n",
- " 'CpuTimeHr': 5.543055555555555,\n",
- " 'ExitCode': 0,\n",
- " 'CRAB_DataBlock': '/QCD_HT50to100_TuneCP5_13TeV-madgraphMLM-pythia8/RunIISummer20UL17MiniAODv2-106X_mc2017_realistic_v9-v2/MINIAODSIM#4c355926-ea17-4285-bd2b-2c7692c48a87',\n",
- " 'TM_IGNORE_LOCALITY': 'F',\n",
- " 'GlobalJobId': 'crab3@vocms0107.cern.ch#96720628.0#1691795011',\n",
- " 'CommittedCoreHr': '5.769722222222223',\n",
- " 'CommittedWallClockHr': '5.769722222222223',\n",
- " 'CRAB_Type': 'Analysis'},\n",
- " {'RecordTime': 1692123756000,\n",
- " 'CMSPrimaryDataTier': 'MINIAODSIM',\n",
- " 'WallClockHr': 3.2091666666666665,\n",
- " 'CoreHr': 3.2091666666666665,\n",
- " 'CpuTimeHr': 3.1125,\n",
- " 'ExitCode': 0,\n",
- " 'CRAB_DataBlock': '/QCD_HT50to100_TuneCP5_13TeV-madgraphMLM-pythia8/RunIISummer20UL17MiniAODv2-106X_mc2017_realistic_v9-v2/MINIAODSIM#4c355926-ea17-4285-bd2b-2c7692c48a87',\n",
- " 'TM_IGNORE_LOCALITY': 'F',\n",
- " 'GlobalJobId': 'crab3@vocms0107.cern.ch#96720658.0#1691795012',\n",
- " 'CommittedCoreHr': '3.2091666666666665',\n",
- " 'CommittedWallClockHr': '3.2091666666666665',\n",
- " 'CRAB_Type': 'Analysis'}]"
- ]
- },
- "execution_count": 17,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "docs[:5]"
+ "# repartition rdd to make each partition small enough to load back to python kernel, serialize to dict, and send to os.\n",
+ "# for 12M rows, number of from 27 days of data is 51, around 250k per partition.\n",
+ "# try reducing partition to 20 once but make python kernel out-of-memory. \n",
+ "# so, try to keep it around 200k per partition instead.\n",
+ "partition_num = tmpdf.count() // 200000\n",
+ "tmpdf = tmpdf.repartition(partition_num, 'RecordTime')\n",
+ "total_part = tmpdf.rdd.getNumPartitions()\n",
+ "\n",
+ "print(f\"Number of partition: {total_part}\")"
]
},
{
"cell_type": "code",
- "execution_count": 18,
- "id": "bcdfb65c",
- "metadata": {},
+ "execution_count": null,
+ "id": "3e1f7a3f",
+ "metadata": {
+ "scrolled": false
+ },
"outputs": [],
"source": [
- "import osearch"
+ "# send to os, serialize df one rdd partition at a time\n",
+ "part = 0\n",
+ "for docs in tmpdf.rdd.mapPartitions(lambda p: [[x.asDict() for x in p]]).toLocalIterator():\n",
+ " part += 1\n",
+ " print(f\"Partition: {part}/{total_part}, Length of partition: {len(docs)}\")\n",
+ " send_os_parallel(docs, index_name, schema, secretpath, yesterday_epoch, 20000) # batch_size is just arbitrary number"
]
},
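The cell above relies on a pull-one-partition-at-a-time pattern so that only a single partition's worth of documents lives in driver memory at once. A small self-contained sketch of that pattern, with a hypothetical handle_batch callable standing in for send_os_parallel:

    def stream_partitions(df, handle_batch):
        # Each executor turns its partition into one list of plain dicts...
        rdd = df.rdd.mapPartitions(lambda rows: [[r.asDict() for r in rows]])
        # ...and toLocalIterator() pulls those lists back one partition at a time.
        for i, docs in enumerate(rdd.toLocalIterator(), start=1):
            print(f"partition {i}: {len(docs)} docs")
            handle_batch(docs)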
{
"cell_type": "code",
- "execution_count": 19,
- "id": "4666acef",
+ "execution_count": null,
+ "id": "52b2fc9f",
"metadata": {},
"outputs": [],
"source": [
- "def get_index_schema():\n",
- " return {\n",
- " \"settings\": {\"index\": {\"number_of_shards\": \"1\", \"number_of_replicas\": \"1\"}},\n",
- " \"mappings\": {\n",
- " \"properties\": {\n",
- " \"RecordTime\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
- " \"CMSPrimaryDataTier\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " \"GlobalJobId\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " \"WallClockHr\": {\"type\": \"long\"},\n",
- " \"CoreHr\": {\"type\": \"long\"},\n",
- " \"CpuTimeHr\": {\"type\": \"long\"},\n",
- " \"ExitCode\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " \"TM_IGNORE_LOCALITY\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " \"CRAB_Type\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " \"CRAB_DataBlock\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " \"CommittedCoreHr\": {\"type\": \"long\"}, \n",
- " \"CommittedWallClockHr\": {\"type\": \"long\"},\n",
- " }\n",
- " }\n",
- " }"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "id": "d6e4107b",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/eos/user/e/eatthaph/.local/lib/python3.9/site-packages/opensearchpy/connection/http_urllib3.py:199: UserWarning: Connecting to https://es-cms1.cern.ch:443 using SSL with verify_certs=False is insecure.\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n"
- ]
- },
- ],
- "source": [
- "_index_template = 'crab-condor-ekong'\n",
- "client = osearch.get_es_client(\"es-cms1.cern.ch/es\", 'secret_opensearch.txt', get_index_schema())\n",
- "# index_mod=\"\": 'test-foo', index_mod=\"Y\": 'test-foo-YYYY', index_mod=\"M\": 'test-foo-YYYY-MM', index_mod=\"D\": 'test-foo-YYYY-MM-DD',\n",
- "idx = client.get_or_create_index(timestamp=time.time(), index_template=_index_template, index_mod=\"M\")\n",
- "client.send(idx, docs, metadata=None, batch_size=10000, drop_nulls=False)"
+ "print(\"Done!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "d7274886",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "1b4484a3",
+ "id": "1dc69a5c",
"metadata": {},
"outputs": [],
"source": []
@@ -728,14 +478,13 @@
"list_of_options": [
{
"name": "spark.jars.packages",
- "value": "org.apache.spark:spark-avro_2.12:3.3.1"
+ "value": "org.apache.spark:spark-avro_2.12:3.5.0"
+ },
+ {
+ "name": "spark.executor.instances",
+ "value": "20"
}
]
- },
- "vscode": {
- "interpreter": {
- "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
- }
}
},
"nbformat": 4,
diff --git a/src/script/Monitor/crab-spark/notebooks/crab_data.ipynb b/src/script/Monitor/crab-spark/notebooks/crab_data.ipynb
deleted file mode 100644
index be60f3eb84..0000000000
--- a/src/script/Monitor/crab-spark/notebooks/crab_data.ipynb
+++ /dev/null
@@ -1,832 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "66b56403",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "795d491e",
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- "
SparkSession - in-memory
\n",
- " \n",
- "
\n",
- "
SparkContext
\n",
- "\n",
- "
Spark UI
\n",
- "\n",
- "
\n",
- " - Version
\n",
- " v3.3.2
\n",
- " - Master
\n",
- " yarn
\n",
- " - AppName
\n",
- " pyspark_shell_swan
\n",
- "
\n",
- "
\n",
- " \n",
- "
\n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 1,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "31b02b1c",
- "metadata": {},
- "outputs": [],
- "source": [
- "# !hdfs -h"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "7a7ad1c3",
- "metadata": {},
- "outputs": [],
- "source": [
- "# !hdfs dfs -ls /cms/users/eatthaph"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "8a170ced",
- "metadata": {},
- "outputs": [],
- "source": [
- "# !hdfs dfs -ls /cms/users/eatthaph/"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "id": "17520cda",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "23/07/25 16:06:21 WARN ipc.Client: Exception encountered while connecting to the server \n",
- "org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.ipc.StandbyException): Operation category READ is not supported in state standby. Visit https://s.apache.org/sbnn-error\n",
- "\tat org.apache.hadoop.security.SaslRpcClient.saslConnect(SaslRpcClient.java:376)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection.setupSaslConnection(Client.java:622)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection.access$2300(Client.java:413)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection$2.run(Client.java:822)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection$2.run(Client.java:818)\n",
- "\tat java.base/java.security.AccessController.doPrivileged(Native Method)\n",
- "\tat java.base/javax.security.auth.Subject.doAs(Subject.java:423)\n",
- "\tat org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1845)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection.setupIOstreams(Client.java:818)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection.access$3800(Client.java:413)\n",
- "\tat org.apache.hadoop.ipc.Client.getConnection(Client.java:1636)\n",
- "\tat org.apache.hadoop.ipc.Client.call(Client.java:1452)\n",
- "\tat org.apache.hadoop.ipc.Client.call(Client.java:1405)\n",
- "\tat org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:234)\n",
- "\tat org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:119)\n",
- "\tat com.sun.proxy.$Proxy12.getFileInfo(Unknown Source)\n",
- "\tat org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.getFileInfo(ClientNamenodeProtocolTranslatorPB.java:964)\n",
- "\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n",
- "\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n",
- "\tat java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n",
- "\tat java.base/java.lang.reflect.Method.invoke(Method.java:566)\n",
- "\tat org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:422)\n",
- "\tat org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeMethod(RetryInvocationHandler.java:165)\n",
- "\tat org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invoke(RetryInvocationHandler.java:157)\n",
- "\tat org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeOnce(RetryInvocationHandler.java:95)\n",
- "\tat org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:359)\n",
- "\tat com.sun.proxy.$Proxy13.getFileInfo(Unknown Source)\n",
- "\tat org.apache.hadoop.hdfs.DFSClient.getFileInfo(DFSClient.java:1731)\n",
- "\tat org.apache.hadoop.hdfs.DistributedFileSystem$29.doCall(DistributedFileSystem.java:1725)\n",
- "\tat org.apache.hadoop.hdfs.DistributedFileSystem$29.doCall(DistributedFileSystem.java:1722)\n",
- "\tat org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)\n",
- "\tat org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:1737)\n",
- "\tat org.apache.hadoop.fs.Globber.getFileStatus(Globber.java:115)\n",
- "\tat org.apache.hadoop.fs.Globber.doGlob(Globber.java:349)\n",
- "\tat org.apache.hadoop.fs.Globber.glob(Globber.java:202)\n",
- "\tat org.apache.hadoop.fs.FileSystem.globStatus(FileSystem.java:2093)\n",
- "\tat org.apache.hadoop.fs.shell.PathData.expandAsGlob(PathData.java:353)\n",
- "\tat org.apache.hadoop.fs.shell.Command.expandArgument(Command.java:250)\n",
- "\tat org.apache.hadoop.fs.shell.Command.expandArguments(Command.java:233)\n",
- "\tat org.apache.hadoop.fs.shell.FsCommand.processRawArguments(FsCommand.java:104)\n",
- "\tat org.apache.hadoop.fs.shell.Command.run(Command.java:177)\n",
- "\tat org.apache.hadoop.fs.FsShell.run(FsShell.java:327)\n",
- "\tat org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:76)\n",
- "\tat org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:90)\n",
- "\tat org.apache.hadoop.fs.FsShell.main(FsShell.java:390)\n",
- "Found 9 items\n",
- "-rw-r--r-x+ 3 cmssqoop c3 0 2023-07-19 02:14 /project/awg/cms/crab/tasks/2023-07-19/_SUCCESS\n",
- "-rw-r--r-x+ 3 cmssqoop c3 85991835 2023-07-19 02:01 /project/awg/cms/crab/tasks/2023-07-19/part-m-00000.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 837565156 2023-07-19 02:14 /project/awg/cms/crab/tasks/2023-07-19/part-m-00001.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 605874324 2023-07-19 02:10 /project/awg/cms/crab/tasks/2023-07-19/part-m-00002.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 602365393 2023-07-19 02:09 /project/awg/cms/crab/tasks/2023-07-19/part-m-00003.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 761072727 2023-07-19 02:13 /project/awg/cms/crab/tasks/2023-07-19/part-m-00004.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 462585036 2023-07-19 02:07 /project/awg/cms/crab/tasks/2023-07-19/part-m-00005.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 394767237 2023-07-19 02:06 /project/awg/cms/crab/tasks/2023-07-19/part-m-00006.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 358041401 2023-07-19 02:04 /project/awg/cms/crab/tasks/2023-07-19/part-m-00007.avro\n"
- ]
- }
- ],
- "source": [
- "!hdfs dfs -ls /project/awg/cms/crab/tasks/2023-07-19"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "2a7b2463",
- "metadata": {},
- "outputs": [],
- "source": [
- "# import pickle\n",
- "from datetime import datetime, timedelta\n",
- "\n",
- "# import click\n",
- "import os\n",
- "import pandas as pd\n",
- "# import pprint\n",
- "import time\n",
- "# from dateutil.relativedelta import relativedelta\n",
- "from pyspark import SparkContext, StorageLevel\n",
- "from pyspark.sql import SparkSession\n",
- "from pyspark.sql.functions import (\n",
- " col, collect_list, concat_ws, greatest, lit, lower, when, unix_timestamp, to_timestamp,\n",
- " avg as _avg,\n",
- " count as _count,\n",
- " hex as _hex,\n",
- " max as _max,\n",
- " min as _min,\n",
- " round as _round,\n",
- " sum as _sum,\n",
- ")\n",
- "\n",
- "from pyspark.sql.types import (\n",
- " LongType,\n",
- ")\n",
- "\n",
- "# import matplotlib.pyplot as plt\n",
- "import numpy as np\n",
- "import math\n",
- "import json\n",
- "#from CMSSpark.src.python.CMSSpark import schemas as cms_schemas"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f2904198",
- "metadata": {},
- "source": [
- "## load dataset"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 34,
- "id": "aa0d181a",
- "metadata": {},
- "outputs": [],
- "source": [
- "# end_date = str(datetime.now())[:10]\n",
- "# start_date = str(datetime.now()-timedelta(days=1))[:10]\n",
- "\n",
- "start_date = '2023-07-20'\n",
- "end_date = '2023-07-25'\n",
- "\n",
- "wa_date = end_date\n",
- "HDFS_CRAB_part = f'/project/awg/cms/crab/tasks/{wa_date}/'\n",
- "# HDFS_CRAB_part = f'/project/awg/cms/crab/{wa_date}/tasks/'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "id": "532ec9ac",
- "metadata": {},
- "outputs": [],
- "source": [
- "crab_part = spark.read.format('avro').load(HDFS_CRAB_part)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "3ad81af6",
- "metadata": {},
- "source": [
- "## Query"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 37,
- "id": "41cf761f",
- "metadata": {},
- "outputs": [],
- "source": [
- "df = crab_part.select(\"TM_TASKNAME\",\"TM_START_TIME\",\"TM_TASK_STATUS\",\"TM_SPLIT_ALGO\",\"TM_USERNAME\",\"TM_USER_ROLE\",\"TM_JOB_TYPE\",\"TM_IGNORE_LOCALITY\",\"TM_SCRIPTEXE\",\"TM_USER_CONFIG\")\n",
- "df.createOrReplaceTempView(\"crab_algo\")\n",
- "# df.show(10)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "id": "e41c5fc6",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "6147"
- ]
- },
- "execution_count": 38,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "query = f\"\"\"\\\n",
- "SELECT *\n",
- "FROM crab_algo \n",
- "WHERE 1=1\n",
- "AND TM_START_TIME >= unix_timestamp(\"{start_date} 00:00:00\", \"yyyy-MM-dd HH:mm:ss\")*1000 \n",
- "AND TM_START_TIME < unix_timestamp(\"{end_date} 00:00:00\", \"yyyy-MM-dd HH:mm:ss\")*1000 \n",
- "\"\"\"\n",
- "\n",
- "tmpdf = spark.sql(query)\n",
- "tmpdf.count()"
- ]
- },
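A hedged sketch of the same time-window selection written with the DataFrame API instead of Spark SQL; it reuses the start_date and end_date strings defined above and the fact that TM_START_TIME is stored in epoch milliseconds:

    from pyspark.sql.functions import col, lit, unix_timestamp

    start_ms = unix_timestamp(lit(f"{start_date} 00:00:00"), "yyyy-MM-dd HH:mm:ss") * 1000
    end_ms = unix_timestamp(lit(f"{end_date} 00:00:00"), "yyyy-MM-dd HH:mm:ss") * 1000
    tmpdf = df.where((col("TM_START_TIME") >= start_ms) & (col("TM_START_TIME") < end_ms))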
- {
- "cell_type": "code",
- "execution_count": 21,
- "id": "25033524",
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "root\n",
- " |-- TM_TASKNAME: string (nullable = true)\n",
- " |-- TM_START_TIME: long (nullable = true)\n",
- " |-- TM_TASK_STATUS: string (nullable = true)\n",
- " |-- TM_SPLIT_ALGO: string (nullable = true)\n",
- " |-- TM_USERNAME: string (nullable = true)\n",
- " |-- TM_USER_ROLE: string (nullable = true)\n",
- " |-- TM_JOB_TYPE: string (nullable = true)\n",
- " |-- TM_IGNORE_LOCALITY: string (nullable = true)\n",
- " |-- TM_SCRIPTEXE: string (nullable = true)\n",
- " |-- TM_USER_CONFIG: string (nullable = true)\n",
- "\n"
- ]
- }
- ],
- "source": [
- "tmpdf.printSchema()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "id": "ff188450",
- "metadata": {},
- "outputs": [],
- "source": [
- "docs = tmpdf.toPandas().to_dict('records')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "id": "fad5ca52",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "6147"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(docs)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "id": "c454d0c4",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Extract 'REQUIRE_ACCELERATOR' from 'TM_USER_CONFIG'\n",
- "\n",
- "for i in range(len(docs)):\n",
- " if docs[i]['TM_USER_CONFIG'] is not None:\n",
- " data = json.loads(docs[i]['TM_USER_CONFIG'])\n",
- " if \"requireaccelerator\" in data:\n",
- " docs[i]['REQUIRE_ACCELERATOR'] = data[\"requireaccelerator\"]\n",
- " else:\n",
- " docs[i]['REQUIRE_ACCELERATOR'] = None\n",
- " else:\n",
- " docs[i]['REQUIRE_ACCELERATOR'] = None"
- ]
- },
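The loop above parses TM_USER_CONFIG row by row in the driver after toPandas(). A sketch of the same extraction kept inside Spark, using get_json_object, which returns the value as a string or NULL when TM_USER_CONFIG is NULL or has no "requireaccelerator" key:

    from pyspark.sql.functions import col, get_json_object

    tmpdf = tmpdf.withColumn(
        "REQUIRE_ACCELERATOR",
        get_json_object(col("TM_USER_CONFIG"), "$.requireaccelerator"),
    )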
- {
- "cell_type": "code",
- "execution_count": 25,
- "id": "d2e914f6",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[{'TM_TASKNAME': '160406_111833:sciaba_HC-163-AnySite-26725-20160406125703-T1_UK_RAL',\n",
- " 'TM_START_TIME': 1459934313843,\n",
- " 'TM_TASK_STATUS': 'SUBMITTED',\n",
- " 'TM_SPLIT_ALGO': 'FileBased',\n",
- " 'TM_USERNAME': 'sciaba',\n",
- " 'TM_USER_ROLE': 'production',\n",
- " 'TM_JOB_TYPE': 'Analysis',\n",
- " 'TM_IGNORE_LOCALITY': 'T',\n",
- " 'TM_SCRIPTEXE': None,\n",
- " 'TM_USER_CONFIG': None,\n",
- " 'REQUIRE_ACCELERATOR': None},\n",
- " {'TM_TASKNAME': '160406_111914:sciaba_HC-148-AnySite-26727-20160406131903-T2_UK_SGrid_Bristol',\n",
- " 'TM_START_TIME': 1459934354531,\n",
- " 'TM_TASK_STATUS': 'SUBMITTED',\n",
- " 'TM_SPLIT_ALGO': 'FileBased',\n",
- " 'TM_USERNAME': 'sciaba',\n",
- " 'TM_USER_ROLE': 'production',\n",
- " 'TM_JOB_TYPE': 'Analysis',\n",
- " 'TM_IGNORE_LOCALITY': 'T',\n",
- " 'TM_SCRIPTEXE': None,\n",
- " 'TM_USER_CONFIG': None,\n",
- " 'REQUIRE_ACCELERATOR': None},\n",
- " {'TM_TASKNAME': '160319_180958:sciaba_HC-138-AnySite-26052-20160319011302-T2_RU_IHEP',\n",
- " 'TM_START_TIME': 1458407398241,\n",
- " 'TM_TASK_STATUS': 'SUBMITTED',\n",
- " 'TM_SPLIT_ALGO': 'FileBased',\n",
- " 'TM_USERNAME': 'sciaba',\n",
- " 'TM_USER_ROLE': 'production',\n",
- " 'TM_JOB_TYPE': 'Analysis',\n",
- " 'TM_IGNORE_LOCALITY': 'T',\n",
- " 'TM_SCRIPTEXE': None,\n",
- " 'TM_USER_CONFIG': None,\n",
- " 'REQUIRE_ACCELERATOR': None}]"
- ]
- },
- "execution_count": 25,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "docs[:3]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "id": "cf696d7f",
- "metadata": {},
- "outputs": [],
- "source": [
- "import osearch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "id": "e47490bd",
- "metadata": {},
- "outputs": [],
- "source": [
- "def get_index_schema():\n",
- " return {\n",
- " \"settings\": {\"index\": {\"number_of_shards\": \"1\", \"number_of_replicas\": \"1\"}},\n",
- " \"mappings\": {\n",
- " \"properties\": {\n",
- " \"TM_TASKNAME\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " \"TM_START_TIME\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
- " 'TM_TASK_STATUS': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " \"TM_SPLIT_ALGO\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " \"TM_USERNAME\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " \"TM_USER_ROLE\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " \"TM_JOB_TYPE\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " \"TM_IGNORE_LOCALITY\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " \"TM_SCRIPTEXE\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " \"REQUIRE_ACCELERATOR\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " }\n",
- " }\n",
- " }\n",
- "\n",
- "# def get_index_schema():\n",
- "# return {\n",
- "# \"settings\": {\"index\": {\"number_of_shards\": \"1\", \"number_of_replicas\": \"1\"}},\n",
- "# \"mappings\": {\n",
- "# \"properties\": {\n",
- "# \"TM_TASKNAME\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- "# \"TM_SPLIT_ALGO\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- "# \"TM_START_TIME\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
- "# \"TM_END_INJECTION\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
- "# }\n",
- "# }\n",
- "# }"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "id": "6bcfc801",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "0"
- ]
- },
- "execution_count": 28,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "_index_template = 'crab-data-ekong'\n",
- "client = osearch.get_es_client(\"es-cms1.cern.ch/es\", 'secret_opensearch.txt', get_index_schema())\n",
- "# index_mod=\"\": 'test-foo', index_mod=\"Y\": 'test-foo-YYYY', index_mod=\"M\": 'test-foo-YYYY-MM', index_mod=\"D\": 'test-foo-YYYY-MM-DD',\n",
- "idx = client.get_or_create_index(timestamp=time.time(), index_template=_index_template, index_mod=\"M\")\n",
- "client.send(idx, docs, metadata=None, batch_size=10000, drop_nulls=False)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "bcac057e",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a5f62789",
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "@webio": {
- "lastCommId": null,
- "lastKernelId": null
- },
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.9.12"
- },
- "sparkconnect": {
- "bundled_options": [],
- "list_of_options": [
- {
- "name": "spark.jars.packages",
- "value": "org.apache.spark:spark-avro_2.12:3.3.1"
- }
- ]
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/src/script/Monitor/crab-spark/notebooks/crab_rucio_rules_poc.ipynb b/src/script/Monitor/crab-spark/notebooks/crab_rucio_rules_poc.ipynb
new file mode 100644
index 0000000000..8aa5ac31a6
--- /dev/null
+++ b/src/script/Monitor/crab-spark/notebooks/crab_rucio_rules_poc.ipynb
@@ -0,0 +1,340 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5e9af689",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from datetime import datetime, timedelta, timezone\n",
+ "import os\n",
+ "import time\n",
+ "import pandas as pd\n",
+ "\n",
+ "from pyspark import SparkContext, StorageLevel\n",
+ "from pyspark.sql import SparkSession\n",
+ "from pyspark.sql.functions import (\n",
+ " current_user,\n",
+ " col, collect_list, concat_ws, greatest, lit, lower, when,\n",
+ " avg as _avg,\n",
+ " count as _count,\n",
+ " hex as _hex,\n",
+ " max as _max,\n",
+ " min as _min,\n",
+ " round as _round,\n",
+ " sum as _sum,\n",
+ ")\n",
+ "from pyspark.sql.types import (\n",
+ " StructType,\n",
+ " LongType,\n",
+ " StringType,\n",
+ " StructField,\n",
+ " DoubleType,\n",
+ " IntegerType,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "91309756",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# try to import libs from current directory, fallback to $PWD/../workdir if not found\n",
+ "try:\n",
+ " from crabspark_utils import get_candidate_files, send_os, send_os_parallel\n",
+ "except ModuleNotFoundError:\n",
+ " import sys\n",
+ " sys.path.insert(0, f'{os.getcwd()}/../workdir')\n",
+ " from crabspark_utils import get_candidate_files, send_os, send_os_parallel\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "22946659",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "spark = SparkSession\\\n",
+ " .builder\\\n",
+ " .appName('crab-taskdb')\\\n",
+ " .getOrCreate()\n",
+ "spark"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e9013878",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# clear any cache left, for working with notebook\n",
+ "# it safe to run everytime cronjob start\n",
+ "spark.catalog.clearCache()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "31c19eb0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# secret path, also check if file exists\n",
+ "secretpath = os.environ.get('OPENSEARCH_SECRET_PATH', f'{os.getcwd()}/../workdir/secret_opensearch.txt')\n",
+ "if not os.path.isfile(secretpath): \n",
+ " raise Exception(f'OS secrets file {secretpath} does not exists')\n",
+ "# if PROD, index prefix will be `crab-*`, otherwise `crab-test-*`\n",
+ "PROD = os.environ.get('PROD', 'false').lower() in ('true', '1', 't')\n",
+ "# FROM_DATE, in strptime(\"%Y-%m-%d\")\n",
+ "START = os.environ.get('START_DATE', None) \n",
+ "END = os.environ.get('END_DATE', None)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e843eb6d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# For run playbook manually, set start/end date here\n",
+ "START_DATE = \"2020-01-01\"\n",
+ "END_DATE = \"2024-10-01\"\n",
+ "# if cronjob, replace constant with value from env\n",
+ "if START and END:\n",
+ " START_DATE = START\n",
+ " END_DATE = END"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b17ed53f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# index name\n",
+ "index_name = 'taskdb'\n",
+ "# use prod index pattern if this execution is for production\n",
+ "if PROD:\n",
+ " index_name = f'crab-prod-{index_name}'\n",
+ "else:\n",
+ " index_name = f'crab-test-{index_name}'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8417ab47",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# datetime object\n",
+ "start_datetime = datetime.strptime(START_DATE, \"%Y-%m-%d\").replace(tzinfo=timezone.utc)\n",
+ "end_datetime = datetime.strptime(END_DATE, \"%Y-%m-%d\").replace(tzinfo=timezone.utc)\n",
+ "# sanity check\n",
+ "if end_datetime < start_datetime: \n",
+ " raise Exception(f\"end date ({END_DATE}) is less than start date ({START_DATE})\")\n",
+ "start_epochmilis = int(start_datetime.timestamp()) * 1000\n",
+ "end_epochmilis = int(end_datetime.timestamp()) * 1000\n",
+ "yesterday_epoch = int((end_datetime-timedelta(days=1)).timestamp())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9404c437",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# debug\n",
+ "print(START_DATE, \n",
+ " END_DATE, \n",
+ " index_name,\n",
+ " sep='\\n')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3e85c2f0",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# This code block and following block is copied from Panos's script.\n",
+ "# https://gitlab.cern.ch/cmsdmops/cmsdmops/-/blob/8da699db49097d7a58440e6058f022c3f93992e2/monitoring/kubernetes/src/rucio_activity_account_usage.py\n",
+ "# see more in https://github.com/dmwm/CRABServer/issues/7798#issuecomment-2389265249\n",
+ "def get_df_rses(spark):\n",
+ " \"\"\"Get Spark dataframe of RSES\n",
+ " \"\"\"\n",
+ " hdfs_rses_path = '/project/awg/cms/rucio/{}/rses/part*.avro'.format(datetime.today().strftime('%Y-%m-%d'))\n",
+ " df_rses = spark.read.format(\"avro\").load(hdfs_rses_path) \\\n",
+ " .filter(col('DELETED_AT').isNull()) \\\n",
+ " .withColumn('rse_id', lower(_hex(col('ID')))) \\\n",
+ " .withColumn('rse_tier', _split(col('RSE'), '_').getItem(0)) \\\n",
+ " .withColumn('rse_country', _split(col('RSE'), '_').getItem(1)) \\\n",
+ " .withColumn('rse_kind',\n",
+ " when((col(\"rse\").endswith('Temp') | col(\"rse\").endswith('temp') | col(\"rse\").endswith('TEMP')),\n",
+ " 'temp')\n",
+ " .when((col(\"rse\").endswith('Test') | col(\"rse\").endswith('test') | col(\"rse\").endswith('TEST')),\n",
+ " 'test')\n",
+ " .otherwise('prod')\n",
+ " ) \\\n",
+ " .select(['rse_id', 'RSE', 'RSE_TYPE', 'rse_tier', 'rse_country', 'rse_kind'])\n",
+ " return df_rses\n",
+ "def get_df_locks(spark):\n",
+ " \"\"\"Get Spark dataframe of Locks\n",
+ " \"\"\"\n",
+ " today = datetime.today().strftime('%Y-%m-%d')\n",
+ " locks_path = f'/project/awg/cms/rucio/{today}/locks/part*.avro'\n",
+ " locks = spark.read.format('avro').load(locks_path) \\\n",
+ " .filter(col('SCOPE') == 'cms') \\\n",
+ " .filter(col('STATE').isin(['O', 'R'])) \\\n",
+ " .withColumn('rse_id', lower(_hex(col('RSE_ID')))) \\\n",
+ " .withColumnRenamed('NAME', 'f_name') \\\n",
+ " .withColumnRenamed('ACCOUNT', 'account_name') \\\n",
+ " .withColumnRenamed('BYTES', 'f_size') \\\n",
+ " .withColumn('r_id', lower(_hex(col('RULE_ID')))) \\\n",
+ " .select(['rse_id', 'f_name', 'f_size', 'r_id', 'account_name'])\n",
+ " return locks\n",
+ "def get_df_accounts(spark):\n",
+ " \"\"\"Get Spark dataframe of Accounts\n",
+ " \"\"\"\n",
+ " today = datetime.today().strftime('%Y-%m-%d')\n",
+ " hdfs_rucio_accounts = f'/project/awg/cms/rucio/{today}/accounts/part*.avro'\n",
+ " df_accounts = spark.read.format(\"avro\").load(hdfs_rucio_accounts) \\\n",
+ " .filter(col('DELETED_AT').isNull()) \\\n",
+ " .withColumnRenamed('ACCOUNT', 'account_name') \\\n",
+ " .withColumnRenamed('ACCOUNT_TYPE', 'account_type') \\\n",
+ " .select(['account_name', 'account_type'])\n",
+ " return df_accounts\n",
+ "def get_df_rules(spark):\n",
+ " \"\"\"Get Spark dataframe of rules\n",
+ " \"\"\"\n",
+ " hdfs_rules_path = '/project/awg/cms/rucio/{}/rules/part*.avro'.format(datetime.today().strftime('%Y-%m-%d'))\n",
+ " return spark.read.format('avro').load(hdfs_rules_path) \\\n",
+ " .filter(col('SCOPE') == 'cms') \\\n",
+ " .withColumnRenamed('name', 'r_name') \\\n",
+ " .withColumn('r_id', lower(_hex(col('ID')))) \\\n",
+ " .withColumn('s_id', lower(_hex(col('SUBSCRIPTION_ID')))) \\\n",
+ " .withColumnRenamed('ACTIVITY', 'activity') \\\n",
+ " .withColumnRenamed('STATE', 'rule_state') \\\n",
+ " .withColumnRenamed('RSE_EXPRESSION', 'rse_expression') \\\n",
+ " .select(['r_name','r_id', 's_id', 'activity', 'rule_state', 'rse_expression']) \n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e271b1c8",
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "# add data_tier field\n",
+ "df_rses = get_df_rses(spark)\n",
+ "df_locks = get_df_locks(spark)\n",
+ "df_accounts = get_df_accounts(spark)\n",
+ "df_rules = get_df_rules(spark)\n",
+ "tb_denominator = 10 ** 12\n",
+ "locks = df_locks.join(df_rses, ['rse_id'], how='left') \\\n",
+ " .filter(col('rse_kind') == 'prod') \\\n",
+ " .select(['f_name', 'f_size', 'RSE', 'rse_type', 'account_name', 'r_id']) \n",
+ "\n",
+ "locks_with_activity = (\n",
+ " locks.join(df_rules, ['r_id'], how='leftouter')\n",
+ " .select(['f_name', 'account_name', 'RSE', 'rse_type', 'f_size', 'activity', 'r_name'])\n",
+ " .withColumn('data_tier', regexp_extract('r_name', r'^\\/([\\w-]+)\\/([\\w-]+)\\/([\\w-]+)(#[\\w-]+)?', 3))\n",
+ " .select(['f_name', 'account_name', 'RSE', 'rse_type', 'f_size', 'activity', 'data_tier'])\n",
+ ")\n",
+ "\n",
+ "timestamp = int(time.time())\n",
+ "\n",
+ "# A File locked by the user for two activities is accounted to both activities\n",
+ "# A File locked by two users for the same activity is accounted to both Users\n",
+ "user_aggreagated = locks_with_activity \\\n",
+ " .select(['f_name', 'f_size', 'RSE', 'rse_type', 'account_name', 'activity', 'data_tier']) \\\n",
+ " .distinct() \\\n",
+ " .groupby(['RSE', 'rse_type', 'account_name', 'activity', 'data_tier']) \\\n",
+ " .agg(_round(_sum(col('f_size')) / tb_denominator, 5).alias('total_locked')) \\\n",
+ " .join(df_accounts, ['account_name'], how='left') \\\n",
+ " .withColumnRenamed('RSE', 'rse_name') \\\n",
+ " .withColumn('timestamp', lit(timestamp)) \\\n",
+ " .select(['total_locked', 'rse_name', 'rse_type', 'account_name', 'account_type', 'activity', 'data_tier', 'timestamp']) \\\n",
+ " .cache()\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "15c3ff28",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "user_aggreagated.show(10, False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f7e98534",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "user_aggreagated.count()"
+ ]
+ }
+ ],
+ "metadata": {
+ "@webio": {
+ "lastCommId": null,
+ "lastKernelId": null
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.12"
+ },
+ "sparkconnect": {
+ "bundled_options": [],
+ "list_of_options": [
+ {
+ "name": "spark.jars.packages",
+ "value": "org.apache.spark:spark-avro_2.12:3.5.0"
+ },
+ {
+ "name": "spark.executor.instances",
+ "value": "20"
+ }
+ ]
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/src/script/Monitor/crab-spark/notebooks/crab_tape_recall_history.ipynb b/src/script/Monitor/crab-spark/notebooks/crab_tape_recall_history.ipynb
new file mode 100644
index 0000000000..3ed0a6e890
--- /dev/null
+++ b/src/script/Monitor/crab-spark/notebooks/crab_tape_recall_history.ipynb
@@ -0,0 +1,572 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "2ecefbb5",
+ "metadata": {},
+ "source": [
+ "# CRAB Spark tape recall history\n",
+ "\n",
+ "This jobs is querying `rules_history` table of cmsrucio to answer theses questions:\n",
+ "- How long do tasks stay in “taperecall”?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5e9af689",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from datetime import datetime, timedelta, timezone\n",
+ "import os\n",
+ "import time\n",
+ "import pandas as pd\n",
+ "\n",
+ "from pyspark import SparkContext, StorageLevel\n",
+ "from pyspark.sql import SparkSession\n",
+ "from pyspark.sql.functions import (\n",
+ " current_user,\n",
+ " col, collect_list, concat_ws, greatest, lit, lower, when,\n",
+ " avg as _avg,\n",
+ " count as _count,\n",
+ " hex as _hex,\n",
+ " max as _max,\n",
+ " min as _min,\n",
+ " round as _round,\n",
+ " sum as _sum,\n",
+ ")\n",
+ "from pyspark.sql.types import (\n",
+ " StructType,\n",
+ " LongType,\n",
+ " StringType,\n",
+ " StructField,\n",
+ " DoubleType,\n",
+ " IntegerType,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "22946659",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "spark = SparkSession\\\n",
+ " .builder\\\n",
+ " .appName('tape-recall-history')\\\n",
+ " .getOrCreate()\n",
+ "spark"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "014b13c8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "spark.catalog.clearCache()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "31c19eb0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# arguments\n",
+ "# secret path, also check if file exists\n",
+ "secretpath = os.environ.get('OPENSEARCH_SECRET_PATH', f'{os.getcwd()}/../workdir/secret_opensearch.txt')\n",
+ "if not os.path.isfile(secretpath): \n",
+ " raise Exception(f'OS secrets file {secretpath} does not exists')\n",
+ "# if PROD, index prefix will be `crab-*`, otherwise `crab-test-*`\n",
+ "PROD = os.environ.get('PROD', 'false').lower() in ('true', '1', 't')\n",
+ "# FROM_DATE, in strptime(\"%Y-%m-%d\")\n",
+ "START = os.environ.get('START_DATE', None) \n",
+ "END = os.environ.get('END_DATE', None)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e843eb6d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# try to import libs from current directory, fallback to $PWD/../workdir if not found\n",
+ "try:\n",
+ " from crabspark_utils import get_candidate_files, send_os, send_os_parallel\n",
+ "except ModuleNotFoundError:\n",
+ " import sys\n",
+ " sys.path.insert(0, f'{os.getcwd()}/../workdir')\n",
+ " from crabspark_utils import get_candidate_files, send_os, send_os_parallel\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2c644790",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# variables for run inside notebook\n",
+ "START_DATE = \"2020-01-01\"\n",
+ "END_DATE = \"2024-10-01\"\n",
+ "# if cronjob, replace constant with value from env\n",
+ "if START and END:\n",
+ " START_DATE = START\n",
+ " END_DATE = END"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d608eab0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# index name\n",
+ "index_name = 'tape-recall-history' # always put test index prefix\n",
+ "# use prod index pattern if this execution is for production\n",
+ "if PROD:\n",
+ " index_name = f'crab-prod-{index_name}'\n",
+ "else:\n",
+ " index_name = f'crab-test-{index_name}'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b17ed53f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# datetime object\n",
+ "start_datetime = datetime.strptime(START_DATE, \"%Y-%m-%d\").replace(tzinfo=timezone.utc)\n",
+ "end_datetime = datetime.strptime(END_DATE, \"%Y-%m-%d\").replace(tzinfo=timezone.utc)\n",
+ "# sanity check\n",
+ "if end_datetime < start_datetime: \n",
+ " raise Exception(f\"end date ({END_DATE}) is less than start date ({START_DATE})\")\n",
+ "start_epochmilis = int(start_datetime.timestamp()) * 1000\n",
+ "end_epochmilis = int(end_datetime.timestamp()) * 1000\n",
+ "yesterday_epoch = int((end_datetime-timedelta(days=1)).timestamp())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9404c437",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# debug\n",
+ "print(START_DATE, \n",
+ " END_DATE, \n",
+ " index_name,\n",
+ " sep='\\n')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3e85c2f0",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# Import data into spark\n",
+ "\n",
+ "HDFS_RUCIO_RULES_HISTORY = f'/project/awg/cms/rucio/{END_DATE}/rules_history/'\n",
+ "\n",
+ "print(\"===============================================\"\n",
+ " , \"RUCIO : Rules History\"\n",
+ " , \"===============================================\"\n",
+ " , \"File Directory:\", HDFS_RUCIO_RULES_HISTORY\n",
+ " , \"Work Directory:\", os.getcwd()\n",
+ " , \"===============================================\"\n",
+ " , \"===============================================\", sep='\\n')\n",
+ "\n",
+ "# we only interest in the rules where state does not change anymore.\n",
+ "# which means, only the rules that already expired.\n",
+ "rucio_rules_history = (\n",
+ " spark.read.format('avro').load(HDFS_RUCIO_RULES_HISTORY).withColumn('ID', lower(_hex(col('ID'))))\n",
+ " .select(\"ID\", \"ACCOUNT\", \"NAME\", \"STATE\", \"EXPIRES_AT\", \"UPDATED_AT\", \"CREATED_AT\")\n",
+ " .filter(f\"\"\"\\\n",
+ " 1=1\n",
+ " AND ACTIVITY = 'Analysis TapeRecall'\n",
+ " AND EXPIRES_AT >= {start_epochmilis}\n",
+ " AND EXPIRES_AT < {end_epochmilis}\n",
+ " \"\"\")\n",
+ " .cache()\n",
+ ")\n",
+ "rucio_rules_history.createOrReplaceTempView(\"rules_history\")\n",
+ "\n",
+ "HDFS_CRAB_part = f'/project/awg/cms/crab/tasks/{END_DATE}/'\n",
+ "print(\"===============================================\"\n",
+ " , \"CRAB Table\"\n",
+ " , \"===============================================\"\n",
+ " , \"File Directory:\", HDFS_CRAB_part\n",
+ " , \"Work Directory:\", os.getcwd()\n",
+ " , \"===============================================\"\n",
+ " , \"===============================================\", sep='\\n')\n",
+ "\n",
+ "# do not filter taskdb by create time (TM_START_TIME) because it is possible that rules are created 6 months ago\n",
+ "tasks_df = (\n",
+ " spark.read.format('avro').load(HDFS_CRAB_part)\n",
+ " .select(\"TM_TASKNAME\",\"TM_START_TIME\",\"TM_TASK_STATUS\", 'TM_TASKNAME', 'TM_START_TIME', 'TM_TASK_STATUS' , 'TM_DDM_REQID')\n",
+ " .cache()\n",
+ ")\n",
+ "tasks_df.createOrReplaceTempView(\"tasks\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d0ad6c09",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# rucio append new row to rules_history when the content rules table change (not sure the exact condition)\n",
+ "# We need to get \"the latest\" row for each rules by:\n",
+ "# - If rule has state \"O\", select the earliest UPDATED_AT row.\n",
+ "# For the OK rule, we can calculate number of days using UPDATED_AT-CREATED_AT. \n",
+ "# However, there are some posiblility that rucio append new entry with newer UPDATED_AT (For exmple 37fcada73f14439b88558ef792e10276)\n",
+ "# - If not, select the latest UPDATED_AT row.\n",
+ "# This because the rules still in temporary state, and the rules will go to the end state \n",
+ "# (not the real state, but rules_history will not getting new row anymore) after rules is expired \n",
+ "# So, we can calculate number of day by EXPIRES_AT-CREATED_AT\n",
+ "#\n",
+ "# Here is the step to translate above condition to SQL (in the buttom-up manner)\n",
+ "# 1. count number of row where the state is 'O'.\n",
+ "# 2. left join the rule history by ID, so each row will have number of state O \n",
+ "# New table look like this:\n",
+ "# +--------------------------------+-----+-------------+-------+\n",
+ "# |ID |STATE|EXPIRES_AT |state_o|\n",
+ "# +--------------------------------+-----+-------------+-------+\n",
+ "# |6d275222b43d431abc568dd83313118f|R |1727244523000|1 |\n",
+ "# |875a388ca374407ea761689511078956|R |1727339056000|1 |\n",
+ "# |dfe4012bcb9c448f98f940f01302ae6e|R |1727234937000|0 |\n",
+ "# |dfe4012bcb9c448f98f940f01302ae6e|R |1725402537000|0 |\n",
+ "# |c6859b18a771440ab906733e2bebf78a|R |1727235038000|1 |\n",
+ "# \n",
+ "# 3. select the earliest row for \"the rule that have state O\" (where clause). this can be done by windows function, sort by UPDATED_AT ascending for each ID, then filter only row_number \"1\"\n",
+ "# 4. select the latest row for \"the rule that does not have state O at all\". \n",
+ "# This is a bit tricky but can be done by filter out the rule that have number of state O more than zero.\n",
+ "# which this column already availabe from left join in step 2.\n",
+ "# For the \"select latest row\" we do the same way as 4. but sort by UPDATED_AT descending instead.\n",
+ "# 5. merge result from 3. and 4 with UNION ALL.\n",
+ "# 6. Then, we will calculate number of date in the next step\n",
+ "#\n",
+ "# We are selecting the rules for each condition and join later, to avoid large broadcasthashjoin internally\n",
+ "# I (Wa) tried this before and it cause above issue, but I might be wrong here though.\n",
+ "# ```\n",
+ "# SELECT * FROM rhistinfo_t \n",
+ "# WHERE (state_o > 0) \n",
+ "# OR (ID NOT IN (SELECT ID FROM (SELECT * FROM rhistinfo_t WHERE state_o > 0)))\n",
+ "# ```\n",
+ "# \n",
+ "\n",
+ "query = f\"\"\"\\\n",
+ "WITH \n",
+ "count_t AS (\n",
+ "SELECT ID, \n",
+ " SUM(CASE WHEN state = 'O' THEN 1 ELSE 0 END) AS state_o\n",
+ "FROM rules_history\n",
+ "GROUP BY ID\n",
+ "),\n",
+ "rhistinfo_t AS (\n",
+ "SELECT rules_history.ID AS ID, \n",
+ " rules_history.ACCOUNT AS ACCOUNT, \n",
+ " rules_history.NAME AS NAME, \n",
+ " rules_history.STATE AS STATE, \n",
+ " rules_history.EXPIRES_AT AS EXPIRES_AT, \n",
+ " rules_history.UPDATED_AT AS UPDATED_AT, \n",
+ " rules_history.CREATED_AT AS CREATED_AT,\n",
+ " count_t.state_o AS state_o\n",
+ "FROM rules_history\n",
+ "LEFT JOIN count_t ON rules_history.ID = count_t.ID\n",
+ "),\n",
+ "tmpwindow_1 AS (\n",
+ "SELECT *, row_number() over(partition by ID order by UPDATED_AT) as row_num\n",
+ "FROM rhistinfo_t\n",
+ "WHERE STATE = 'O'\n",
+ "), \n",
+ "r1 AS (\n",
+ "SELECT * FROM tmpwindow_1\n",
+ "WHERE row_num = 1\n",
+ "),\n",
+ "tmpwindow_2 AS (\n",
+ "SELECT *, row_number() over(partition by ID order by UPDATED_AT DESC) as row_num\n",
+ "FROM rhistinfo_t\n",
+ "WHERE STATE != 'O' AND state_o = 0\n",
+ "),\n",
+ "r2 AS (\n",
+ "SELECT * FROM tmpwindow_2\n",
+ "WHERE row_num = 1\n",
+ "),\n",
+ "r_all AS (\n",
+ "SELECT * FROM r1\n",
+ "UNION ALL\n",
+ "SELECT * FROM r2\n",
+ ")\n",
+ "SELECT * \n",
+ "FROM r_all\n",
+ "ORDER BY ID\n",
+ "\"\"\"\n",
+ "\n",
+ "tmprules = spark.sql(query)\n",
+ "tmprules.show(10, False)\n",
+ "tmprules.createOrReplaceTempView(\"tmprules\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "32dd41b1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Calculate number of days, for state O, UPDATED_AT-CREATED_AT, otherwise EXPIRES_AT-CREATED_AT\n",
+ "# then enrich the data with the crab taskdb table by join rule ID with TM_DDM_REQID column\n",
+ "# need to apply windows function again to select only the rule id with the latest crab tasks\n",
+ "\n",
+ "query = f\"\"\"\\\n",
+ "WITH \n",
+ "calc_days_t AS (\n",
+ "SELECT ID, ACCOUNT, NAME, STATE, EXPIRES_AT, UPDATED_AT, CREATED_AT,\n",
+ " CASE \n",
+ " WHEN STATE = 'O' THEN ceil((UPDATED_AT-CREATED_AT)/86400000) \n",
+ " ELSE ceil((EXPIRES_AT-CREATED_AT)/86400000)\n",
+ " END AS DAYS\n",
+ "FROM tmprules\n",
+ "),\n",
+ "join_t AS (\n",
+ "SELECT \n",
+ " calc_days_t.ID AS ID, \n",
+ " calc_days_t.ACCOUNT AS ACCOUNT, \n",
+ " calc_days_t.NAME AS NAME, \n",
+ " calc_days_t.STATE AS STATE, \n",
+ " calc_days_t.DAYS AS DAYS, \n",
+ " calc_days_t.EXPIRES_AT AS EXPIRES_AT, \n",
+ " calc_days_t.UPDATED_AT AS UPDATED_AT, \n",
+ " calc_days_t.CREATED_AT AS CREATED_AT, \n",
+ " tasks.TM_TASKNAME AS TM_TASKNAME,\n",
+ " IFNULL(tasks.TM_START_TIME, 0) AS TM_START_TIME, \n",
+ " tasks.TM_TASK_STATUS AS TM_TASK_STATUS\n",
+ "FROM calc_days_t\n",
+ "LEFT JOIN tasks ON calc_days_t.ID = tasks.TM_DDM_REQID\n",
+ "),\n",
+ "window_t AS (\n",
+ "SELECT ID, ACCOUNT, NAME, STATE, DAYS, EXPIRES_AT, UPDATED_AT, CREATED_AT, TM_TASKNAME, TM_START_TIME, TM_TASK_STATUS, \n",
+ " row_number() OVER (PARTITION BY ID ORDER BY TM_START_TIME DESC) AS row_num\n",
+ "FROM join_t \n",
+ "),\n",
+ "uniqueid_t AS (\n",
+ "SELECT *\n",
+ "FROM window_t \n",
+ "WHERE row_num = 1\n",
+ "), \n",
+ "finalize_t AS (\n",
+ "SELECT ID, ACCOUNT, NAME, STATE, DAYS, EXPIRES_AT, UPDATED_AT, CREATED_AT, TM_TASKNAME, IFNULL(TM_START_TIME, 0) as TM_START_TIME, TM_TASK_STATUS, \n",
+ " EXPIRES_AT AS timestamp,\n",
+ " 'tape_recall_history' AS type\n",
+ "FROM uniqueid_t \n",
+ ")\n",
+ "SELECT *\n",
+ "FROM finalize_t\n",
+ "\"\"\"\n",
+ "\n",
+ "tmpdf = spark.sql(query)\n",
+ "tmpdf.show(10, False)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "df979012",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tmpdf.count()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c33dfce3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "docs = tmpdf.toPandas().to_dict('records')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "eee4a1f3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "schema = {\n",
+ " \"settings\": {\"index\": {\"number_of_shards\": \"1\", \"number_of_replicas\": \"1\"}},\n",
+ " \"mappings\": {\n",
+ " \"properties\": {\n",
+ " \"ID\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"ACCOUNT\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"NAME\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"STATE\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"DAYS\": {\"type\": \"long\"},\n",
+ " \"EXPIRES_AT\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
+ " \"UPDATED_AT\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
+ " \"CREATED_AT\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
+ " \"TM_TASKNAME\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"TM_START_TIME\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
+ " \"TM_TASK_STATUS\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"type\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"timestamp\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
+ " }\n",
+ "\n",
+ " }\n",
+ "\n",
+ " }\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9ec824ee",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# this is simple workaround osearch bug when work in notebook because\n",
+ "# - it load the secret once and use forever\n",
+ "# - get_or_create_index() create index+schema only the first time it execute\n",
+ "# it is safe to run again even in cronjobs \n",
+ "import importlib\n",
+ "import osearch\n",
+ "importlib.reload(osearch)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6cdc83dd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "osearch.send_os(docs, index_name, schema, secretpath, yesterday_epoch)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "22747a3f",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# Add a single doc to es everyday to check if pipeline is running successfully.\n",
+ "# This is need because we did not have rule that expires everyday\n",
+ "# Remember to filter it out in grafana (For example `NOT ID:00000000000000000` in lucene query)\n",
+ "day = start_datetime\n",
+ "monitoring_docs = []\n",
+ "while day < end_datetime:\n",
+ " milisec = int(day.timestamp())*1000\n",
+ " doc = {\n",
+ " \"ID\": '00000000000000000',\n",
+ " \"ACCOUNT\": 'cmscrab',\n",
+ " \"NAME\": '/Pipeline/Monitoring/AOD',\n",
+ " \"STATE\": 'P',\n",
+ " \"DAYS\": -1,\n",
+ " \"EXPIRES_AT\": milisec,\n",
+ " \"UPDATED_AT\": milisec,\n",
+ " \"CREATED_AT\": milisec,\n",
+ " \"TM_TASKNAME\": '240000_000000:cmscrab_crab_20240000_000000',\n",
+ " \"TM_START_TIME\": milisec,\n",
+ " \"TM_TASK_STATUS\": 'PLACEHOLDER',\n",
+ " \"type\": 'tape_recall_history',\n",
+ " \"timestamp\": milisec,\n",
+ "\n",
+ " }\n",
+ " monitoring_docs.append(doc)\n",
+ " day += timedelta(days=1)\n",
+ "send_os(monitoring_docs, index_name, schema, secretpath, yesterday_epoch)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4a24e4ff",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Useful query to get only the rules that gave\n",
+ "#query = f\"\"\"\\\n",
+ "#repeated_ids AS (\n",
+ "# SELECT ID\n",
+ "# FROM rules_history\n",
+ "# GROUP BY ID\n",
+ "# HAVING COUNT(*) > 2\n",
+ "#),\n",
+ "#tba_t AS (\n",
+ "#SELECT *\n",
+ "#FROM rules_history\n",
+ "#)\n",
+ "#SELECT * FROM tba_t\n",
+ "#\"\"\"\n",
+ "#\n",
+ "#testdf = spark.sql(query)\n",
+ "#testdf.show(100, False)\n",
+ "#\n",
+ "# rule 37fc where latest UPDATED_AT is 43 days after the first OK state\n",
+ "#spark.sql(\"\"\"\\\n",
+ "#SELECT * FROM rules_history\n",
+ "#WHERE ID = '37fcada73f14439b88558ef792e10276'\n",
+ "#\"\"\").show(10, False)"
+ ]
+ }
+ ],
+ "metadata": {
+ "@webio": {
+ "lastCommId": null,
+ "lastKernelId": null
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.12"
+ },
+ "sparkconnect": {
+ "bundled_options": [],
+ "list_of_options": [
+ {
+ "name": "spark.jars.packages",
+ "value": "org.apache.spark:spark-avro_2.12:3.5.0"
+ },
+ {
+ "name": "spark.executor.instances",
+ "value": "20"
+ }
+ ]
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/src/script/Monitor/crab-spark/notebooks/crab_tape_recall_rules_history.ipynb b/src/script/Monitor/crab-spark/notebooks/crab_tape_recall_rules_history.ipynb
deleted file mode 100644
index 20f441f4ce..0000000000
--- a/src/script/Monitor/crab-spark/notebooks/crab_tape_recall_rules_history.ipynb
+++ /dev/null
@@ -1,726 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "2fe94c82",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "9f91521a",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
-      "SparkSession - in-memory\n",
-      "SparkContext: Spark UI, Version v3.3.2, Master yarn, AppName pyspark_shell_swan\n"
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 1,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "666f70d9",
- "metadata": {},
- "outputs": [],
- "source": [
- "# !hdfs dfs -stat /project/awg/cms/rucio/2023-07-31/"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "bd6751a6",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "23/08/01 17:05:11 WARN ipc.Client: Exception encountered while connecting to the server \n",
- "org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.ipc.StandbyException): Operation category READ is not supported in state standby. Visit https://s.apache.org/sbnn-error\n",
- "\tat org.apache.hadoop.security.SaslRpcClient.saslConnect(SaslRpcClient.java:376)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection.setupSaslConnection(Client.java:622)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection.access$2300(Client.java:413)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection$2.run(Client.java:822)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection$2.run(Client.java:818)\n",
- "\tat java.base/java.security.AccessController.doPrivileged(Native Method)\n",
- "\tat java.base/javax.security.auth.Subject.doAs(Subject.java:423)\n",
- "\tat org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1845)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection.setupIOstreams(Client.java:818)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection.access$3800(Client.java:413)\n",
- "\tat org.apache.hadoop.ipc.Client.getConnection(Client.java:1636)\n",
- "\tat org.apache.hadoop.ipc.Client.call(Client.java:1452)\n",
- "\tat org.apache.hadoop.ipc.Client.call(Client.java:1405)\n",
- "\tat org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:234)\n",
- "\tat org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:119)\n",
- "\tat com.sun.proxy.$Proxy12.getFileInfo(Unknown Source)\n",
- "\tat org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.getFileInfo(ClientNamenodeProtocolTranslatorPB.java:964)\n",
- "\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n",
- "\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n",
- "\tat java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n",
- "\tat java.base/java.lang.reflect.Method.invoke(Method.java:566)\n",
- "\tat org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:422)\n",
- "\tat org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeMethod(RetryInvocationHandler.java:165)\n",
- "\tat org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invoke(RetryInvocationHandler.java:157)\n",
- "\tat org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeOnce(RetryInvocationHandler.java:95)\n",
- "\tat org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:359)\n",
- "\tat com.sun.proxy.$Proxy13.getFileInfo(Unknown Source)\n",
- "\tat org.apache.hadoop.hdfs.DFSClient.getFileInfo(DFSClient.java:1731)\n",
- "\tat org.apache.hadoop.hdfs.DistributedFileSystem$29.doCall(DistributedFileSystem.java:1725)\n",
- "\tat org.apache.hadoop.hdfs.DistributedFileSystem$29.doCall(DistributedFileSystem.java:1722)\n",
- "\tat org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)\n",
- "\tat org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:1737)\n",
- "\tat org.apache.hadoop.fs.Globber.getFileStatus(Globber.java:115)\n",
- "\tat org.apache.hadoop.fs.Globber.doGlob(Globber.java:349)\n",
- "\tat org.apache.hadoop.fs.Globber.glob(Globber.java:202)\n",
- "\tat org.apache.hadoop.fs.FileSystem.globStatus(FileSystem.java:2093)\n",
- "\tat org.apache.hadoop.fs.shell.PathData.expandAsGlob(PathData.java:353)\n",
- "\tat org.apache.hadoop.fs.shell.Command.expandArgument(Command.java:250)\n",
- "\tat org.apache.hadoop.fs.shell.Command.expandArguments(Command.java:233)\n",
- "\tat org.apache.hadoop.fs.shell.FsCommand.processRawArguments(FsCommand.java:104)\n",
- "\tat org.apache.hadoop.fs.shell.Command.run(Command.java:177)\n",
- "\tat org.apache.hadoop.fs.FsShell.run(FsShell.java:327)\n",
- "\tat org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:76)\n",
- "\tat org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:90)\n",
- "\tat org.apache.hadoop.fs.FsShell.main(FsShell.java:390)\n",
- "Found 41 items\n",
- "-rw-r--r-x+ 3 cmssqoop c3 0 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/_SUCCESS\n",
- "-rw-r--r-x+ 3 cmssqoop c3 88187830 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00000.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 78573788 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00001.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 89288020 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00002.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 87120186 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00003.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 84145506 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00004.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 77023084 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00005.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 82231949 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00006.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 90427579 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00007.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 83505019 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00008.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 81737327 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00009.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 89063315 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00010.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 87547076 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00011.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 76025866 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00012.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 86124517 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00013.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 84209698 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00014.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 87883924 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00015.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 84024611 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00016.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 88549765 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00017.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 78591247 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00018.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 88304711 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00019.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 84004574 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00020.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 84661738 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00021.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 78502498 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00022.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 91523366 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00023.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 77450183 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00024.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 92852942 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00025.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 85201132 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00026.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 83220428 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00027.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 72640822 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00028.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 74597749 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00029.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 83142949 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00030.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 86601475 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00031.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 90497549 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00032.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 88555030 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00033.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 78799199 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00034.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 80642314 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00035.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 85967465 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00036.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 92843317 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00037.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 83861741 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00038.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 91545885 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00039.avro\n"
- ]
- }
- ],
- "source": [
- "!hdfs dfs -ls /project/awg/cms/rucio/2023-07-24/rules_history #02:54:14"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "800a2f9e",
- "metadata": {},
- "outputs": [],
- "source": [
- "import pickle\n",
- "from datetime import datetime, timedelta\n",
- "\n",
- "import click\n",
- "import os\n",
- "import pandas as pd\n",
- "import pprint\n",
- "import time\n",
- "from dateutil.relativedelta import relativedelta\n",
- "from pyspark import SparkContext, StorageLevel\n",
- "from pyspark.sql import SparkSession\n",
- "from pyspark.sql.functions import (\n",
- " col, collect_list, concat_ws, greatest, lit, lower, when,\n",
- " avg as _avg,\n",
- " count as _count,\n",
- " hex as _hex,\n",
- " max as _max,\n",
- " min as _min,\n",
- " round as _round,\n",
- " sum as _sum,\n",
- ")\n",
- "\n",
- "from pyspark.sql.types import (\n",
- " LongType,\n",
- ")\n",
- "\n",
- "#from CMSSpark.src.python.CMSSpark import schemas as cms_schemas"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "6951caed",
- "metadata": {},
- "outputs": [],
- "source": [
- "#from CMSSpark import schemas as cms_schemas"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "4e78c524",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "markdown",
- "id": "e597820f",
- "metadata": {},
- "source": [
- "## load dataset"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "id": "2c100a92",
- "metadata": {},
- "outputs": [],
- "source": [
- "# end_date = str(datetime.now())[:10]\n",
- "# start_date = str(datetime.now()-timedelta(days=1))[:10]\n",
- "\n",
- "end_date = '2023-07-31'\n",
- "start_date = '2023-07-01'\n",
- "\n",
- "TOYEAR = end_date[:4]\n",
- "\n",
- "wa_date = end_date\n",
- "HDFS_RUCIO_DATASET_LOCKS = f'/project/awg/cms/rucio/{wa_date}/dataset_locks/part*.avro'\n",
- "HDFS_RUCIO_LOCKS = f'/project/awg/cms/rucio/{wa_date}/locks'\n",
- "HDFS_RUCIO_RSES = f'/project/awg/cms/rucio/{wa_date}/rses/part*.avro'\n",
- "HDFS_RUCIO_RULES = f'/project/awg/cms/rucio/{wa_date}/rules'\n",
- "HDFS_RUCIO_RULES_HISTORY = f'/project/awg/cms/rucio/{wa_date}/rules_history'\n",
- "HDFS_RUCIO_REPLICAS = f'/project/awg/cms/rucio/{wa_date}/replicas'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "fe62d431",
- "metadata": {},
- "outputs": [],
- "source": [
- "# rucio_dataset_locks = spark.read.format('avro').load(HDFS_RUCIO_DATASET_LOCKS)\\\n",
- "# .withColumn('BYTES', col('BYTES').cast(LongType()))\\\n",
- "# .withColumn('RULE_ID', lower(_hex(col('RULE_ID'))))\\\n",
- "# .withColumn('RSE_ID', lower(_hex(col('RSE_ID'))))\n",
- "# rucio_dataset_locks.createOrReplaceTempView(\"dataset_locks\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "b2e4fcfa",
- "metadata": {},
- "outputs": [],
- "source": [
- "# rucio_rses = spark.read.format('avro').load(HDFS_RUCIO_RSES)\\\n",
- "# .withColumn('ID', lower(_hex(col('ID'))))\n",
- "# rucio_rses.createOrReplaceTempView(\"rses\")\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "id": "3893197e",
- "metadata": {},
- "outputs": [],
- "source": [
- "# rucio_rules = spark.read.format('avro').load(HDFS_RUCIO_RULES)\\\n",
- "# .withColumn('ID', lower(_hex(col('ID'))))\n",
- "# rucio_rules.createOrReplaceTempView(\"rules\")\n",
- "# #spark.sql(\"SELECT * FROM rules\").count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "id": "f9f2ba4e",
- "metadata": {},
- "outputs": [],
- "source": [
- "# rucio_locks = spark.read.format('avro').load(HDFS_RUCIO_LOCKS)\\\n",
- "# .withColumn('BYTES', col('BYTES').cast(LongType()))\\\n",
- "# .withColumn('RULE_ID', lower(_hex(col('RULE_ID'))))\\\n",
- "# .withColumn('RSE_ID', lower(_hex(col('RSE_ID'))))\n",
- "# rucio_locks.createOrReplaceTempView(\"locks\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "id": "7771b12d",
- "metadata": {},
- "outputs": [],
- "source": [
- "rucio_rules_history = spark.read.format('avro').load(HDFS_RUCIO_RULES_HISTORY)\\\n",
- " .withColumn('ID', lower(_hex(col('ID'))))\n",
- " #.persist(StorageLevel.DISK_ONLY)\n",
- "rucio_rules_history = rucio_rules_history.select(\"ID\", \"NAME\", \"STATE\", \"EXPIRES_AT\", \"UPDATED_AT\", \"CREATED_AT\", \"ACCOUNT\")\n",
- "rucio_rules_history.createOrReplaceTempView(\"rules_history\")\n",
- "#spark.sql(\"SELECT * FROM rules_history\").count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "id": "274421b8",
- "metadata": {},
- "outputs": [],
- "source": [
- "# rucio_replicas = spark.read.format('avro').load(HDFS_RUCIO_REPLICAS)\\\n",
- "# .withColumn('RSE_ID', lower(_hex(col('RSE_ID'))))\n",
- "# rucio_replicas.createOrReplaceTempView(\"replicas\")\n",
- "# #spark.sql(\"SELECT * FROM replicas\").count()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "5c84635f",
- "metadata": {},
- "source": [
- "## Query"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "ee99f580",
- "metadata": {},
- "source": [
- "# how long does it take ?"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "id": "26120cd9",
- "metadata": {
- "scrolled": false
- },
- "outputs": [],
- "source": [
- "# # NOTE: days is ceiling\n",
- "\n",
- "# spark.sql(\"\"\"\n",
- "# WITH filter_t AS (\n",
- "# SELECT ID, NAME, STATE, EXPIRES_AT, UPDATED_AT, CREATED_AT\n",
- "# FROM rules_history \n",
- "# WHERE 1=1\n",
- "# AND ACCOUNT = \"crab_tape_recall\"\n",
- "# --- we look at the rule created this year (2023)\n",
- "# AND CREATED_AT >= unix_timestamp(\"2023-01-01 00:00:00\", \"yyyy-MM-dd HH:mm:ss\")*1000 \n",
- "# ),\n",
- "# rn_t AS (\n",
- "# SELECT ID, NAME, STATE, EXPIRES_AT, UPDATED_AT, CREATED_AT,\n",
- "# row_number() over(partition by ID order by UPDATED_AT desc) as rn --- to get only latest state for each id\n",
- "# FROM filter_t\n",
- "# ),\n",
- "# calc_days_t AS (\n",
- "# SELECT ID, NAME, STATE, \n",
- "# from_unixtime(EXPIRES_AT/1000, 'yyyy-MM-dd HH:mm:ss') AS EXPIRES_AT, \n",
- "# from_unixtime(UPDATED_AT/1000, 'yyyy-MM-dd HH:mm:ss') AS UPDATED_AT, \n",
- "# from_unixtime(CREATED_AT/1000, 'yyyy-MM-dd HH:mm:ss') AS CREATED_AT,\n",
- "# --- if state is O we calculate from update_at when state change (assumed that there is only single row for O state)\n",
- "# --- but if state is not O, we calculate from expired time, it usually 14 days but it is possible that rules somehow got extend\n",
- "# --- other wise days = 0 for filter the rules that not expire \n",
- "# CASE \n",
- "# WHEN STATE = 'O' THEN ceil((UPDATED_AT-CREATED_AT)/86400000) \n",
- "# WHEN STATE != 'O' AND EXPIRES_AT < unix_timestamp(\"2023-05-22 00:00:00\", \"yyyy-MM-dd HH:mm:ss\")*1000 THEN ceil((EXPIRES_AT-CREATED_AT)/86400000)\n",
- "# ELSE 0\n",
- "# END AS DAYS\n",
- "# FROM rn_t\n",
- "# WHERE rn = 1\n",
- "# )\n",
- "# SELECT * \n",
- "# FROM calc_days_t\n",
- "# ---AND STATE == 'O'\n",
- "# \"\"\"\n",
- "# ).show(50,truncate=False)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "id": "fadde59c",
- "metadata": {},
- "outputs": [],
- "source": [
- "## query use to produce data to elasticsearch\n",
- "\n",
- "query = f\"\"\"\\\n",
- "WITH filter_t AS (\n",
- "SELECT ID, NAME, STATE, EXPIRES_AT, UPDATED_AT, CREATED_AT\n",
- "FROM rules_history \n",
- "WHERE 1=1\n",
- "AND ACCOUNT = \"crab_tape_recall\"\n",
- "AND CREATED_AT >= unix_timestamp(\"{TOYEAR}-01-01 00:00:00\", \"yyyy-MM-dd HH:mm:ss\")*1000\n",
- "),\n",
- "rn_t AS (\n",
- "SELECT ID, NAME, STATE, EXPIRES_AT, UPDATED_AT, CREATED_AT,\n",
- "row_number() over(partition by ID order by UPDATED_AT desc) as rn\n",
- "FROM filter_t\n",
- "),\n",
- "calc_days_t AS (\n",
- "SELECT ID, NAME, STATE, EXPIRES_AT, UPDATED_AT, CREATED_AT,\n",
- " CASE \n",
- " WHEN STATE = 'O' THEN ceil((UPDATED_AT-CREATED_AT)/86400000) \n",
- " WHEN STATE != 'O' AND EXPIRES_AT < unix_timestamp(\"{wa_date} 00:00:00\", \"yyyy-MM-dd HH:mm:ss\")*1000 THEN ceil((EXPIRES_AT-CREATED_AT)/86400000)\n",
- " ELSE 0\n",
- " END AS DAYS\n",
- "FROM rn_t\n",
- "WHERE rn = 1\n",
- ")\n",
- "SELECT * \n",
- "FROM calc_days_t\n",
- "WHERE 1=1\n",
- "AND EXPIRES_AT >= unix_timestamp(\"{start_date} 00:00:00\", \"yyyy-MM-dd HH:mm:ss\")*1000 \n",
- "AND EXPIRES_AT < unix_timestamp(\"{end_date} 00:00:00\", \"yyyy-MM-dd HH:mm:ss\")*1000 \n",
- "\"\"\"\n",
- "\n",
- "tmpdf = spark.sql(query)\n",
- "# str(datetime.now()-timedelta(days=1))[:10]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "id": "b44548ef",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "+--------------------+--------------------+-----+-------------+-------------+-------------+----+\n",
- "| ID| NAME|STATE| EXPIRES_AT| UPDATED_AT| CREATED_AT|DAYS|\n",
- "+--------------------+--------------------+-----+-------------+-------------+-------------+----+\n",
- "|16e7eeb0a6c447839...|/DYJetsToLL_LHEFi...| O|1689496342000|1689130290000|1686566168000| 30|\n",
- "|27aea75d1d364b219...|/WJetsToLNu_HT-20...| O|1689568449000|1689086563000|1686931142000| 25|\n",
- "|3f2d7fcff69d49079...|/ParkingBPH1/Run2...| R|1689522386000|1687621610000|1687621586000| 22|\n",
- "|67d9f565492b4dec9...|/DYJetsToLL_M-10t...| R|1689519133000|1687618376000|1687618333000| 22|\n",
- "|c2cbad3267e84ba18...|/TapeRecall/23061...| O|1689554004000|1689117261000|1686940766000| 26|\n",
- "|d23ee08f6aac4d5db...|/QCD_HT300to500_T...| O|1689525153000|1689048723000|1686900417000| 25|\n",
- "|ddfdfed2239940298...|/W2JetsToLNu_Tune...| R|1689517301000|1687616515000|1687616501000| 22|\n",
- "|dee8dbd0a82b48b59...|/TapeRecall/23060...| O|1689525153000|1689127918000|1685747740000| 40|\n",
- "+--------------------+--------------------+-----+-------------+-------------+-------------+----+\n",
- "\n"
- ]
- }
- ],
- "source": [
- "tmpdf.show(50)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "91db6a20",
- "metadata": {},
- "outputs": [],
- "source": [
- "tmpdf.printSchema()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 47,
- "id": "e734d507",
- "metadata": {},
- "outputs": [],
- "source": [
- "docs = tmpdf.toPandas().to_dict('records')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 48,
- "id": "ac8524e0",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "8260"
- ]
- },
- "execution_count": 48,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(docs)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 49,
- "id": "fa51e74c",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[{'ID': '00049b4efb3e4dd091dbfed2012069df',\n",
- " 'NAME': '/TapeRecall/221110_230609.dshmygol_crab_Bfinder_2018_MC_Bc_in_JpsiPI_v0_1/USER',\n",
- " 'STATE': 'O',\n",
- " 'EXPIRES_AT': 1669331191000,\n",
- " 'UPDATED_AT': 1668133187000,\n",
- " 'CREATED_AT': 1668121591000,\n",
- " 'DAYS': 1},\n",
- " {'ID': '0007a18199834a2ca720f088d96a3c9c',\n",
- " 'NAME': '/TapeRecall/220427_065307.youying_crab_DiphoVtxUL2016_DoubleMuon_Run2016B-21Feb2020_ver2_UL2016_HIPM-v1/USER',\n",
- " 'STATE': 'O',\n",
- " 'EXPIRES_AT': 1652252233000,\n",
- " 'UPDATED_AT': 1651048717000,\n",
- " 'CREATED_AT': 1651042633000,\n",
- " 'DAYS': 1},\n",
- " {'ID': '00d4ba364b89477e888e8797a33092d2',\n",
- " 'NAME': '/TapeRecall/210810_035101.jingqing_crab_BPHSkimOfficialChib06900-2016-v5/USER',\n",
- " 'STATE': 'O',\n",
- " 'EXPIRES_AT': 1629777107000,\n",
- " 'UPDATED_AT': 1628847357000,\n",
- " 'CREATED_AT': 1628567507000,\n",
- " 'DAYS': 4},\n",
- " {'ID': '00fb74e1bafc40aba0736216b798a80c',\n",
- " 'NAME': '/TapeRecall/230220_091052.shiyi_crab_RUN3_2022Dv2mass3_SKIM_E_newV2/USER',\n",
- " 'STATE': 'R',\n",
- " 'EXPIRES_AT': 1678093905000,\n",
- " 'UPDATED_AT': 1678093224000,\n",
- " 'CREATED_AT': 1676884305000,\n",
- " 'DAYS': 14},\n",
- " {'ID': '0116d88feb0842f29f78c78f2e7a4ce4',\n",
- " 'NAME': '/TapeRecall/230113_215556.wjang_crab_NanoAODv9_v1_ST_t-channel_antitop_4f_InclusiveDecays_TuneCP5_13TeV-powheg-madspin-pythia8_postVFP/USER',\n",
- " 'STATE': 'O',\n",
- " 'EXPIRES_AT': 1674856682000,\n",
- " 'UPDATED_AT': 1673966996000,\n",
- " 'CREATED_AT': 1673647082000,\n",
- " 'DAYS': 4}]"
- ]
- },
- "execution_count": 49,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "docs[:5]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "c052b072",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": 52,
- "id": "86f3a742",
- "metadata": {},
- "outputs": [],
- "source": [
- "import osearch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 53,
- "id": "6d29e62d",
- "metadata": {},
- "outputs": [],
- "source": [
- "def get_index_schema():\n",
- " return {\n",
- " \"settings\": {\"index\": {\"number_of_shards\": \"1\", \"number_of_replicas\": \"1\"}},\n",
- " \"mappings\": {\n",
- " \"properties\": {\n",
- " \"timestamp\": {\"format\": \"epoch_second\", \"type\": \"date\"},\n",
- " \"ID\": {\"ignore_above\": 1024, \"type\": \"keyword\"},\n",
- " \"NAME\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " \"STATE\": {\"ignore_above\": 1024, \"type\": \"keyword\"},\n",
- " \"EXPIRES_AT\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
- " \"UPDATED_AT\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
- " \"CREATED_AT\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
- " \"DAYS\": {\"type\": \"long\"},\n",
- " }\n",
- " }\n",
- " }"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 54,
- "id": "b479eeb7",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/eos/user/e/eatthaph/.local/lib/python3.9/site-packages/opensearchpy/connection/http_urllib3.py:199: UserWarning: Connecting to https://es-cms1.cern.ch:443 using SSL with verify_certs=False is insecure.\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "0"
- ]
- },
- "execution_count": 54,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "_index_template = 'crab-tape-recall-daily-ekong'\n",
- "client = osearch.get_es_client(\"es-cms1.cern.ch/es\", 'secret_opensearch.txt', get_index_schema())\n",
- "# index_mod=\"\": 'test-foo', index_mod=\"Y\": 'test-foo-YYYY', index_mod=\"M\": 'test-foo-YYYY-MM', index_mod=\"D\": 'test-foo-YYYY-MM-DD',\n",
- "idx = client.get_or_create_index(timestamp=time.time(), index_template=_index_template, index_mod=\"M\")\n",
- "client.send(idx, docs, metadata=None, batch_size=10000, drop_nulls=False)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "0af51d3a",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "12ece939",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "f4567c46",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "546e9d4f",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "496e681c",
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "@webio": {
- "lastCommId": null,
- "lastKernelId": null
- },
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.9.12"
- },
- "sparkconnect": {
- "bundled_options": [],
- "list_of_options": [
- {
- "name": "spark.jars.packages",
- "value": "org.apache.spark:spark-avro_2.12:3.3.1"
- }
- ]
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/src/script/Monitor/crab-spark/notebooks/crab_tape_recall_updated_rules.ipynb b/src/script/Monitor/crab-spark/notebooks/crab_tape_recall_updated_rules.ipynb
deleted file mode 100644
index 5311eb9dd5..0000000000
--- a/src/script/Monitor/crab-spark/notebooks/crab_tape_recall_updated_rules.ipynb
+++ /dev/null
@@ -1,889 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "9f91521a",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "SparkSession - in-memory\n",
- "SparkContext\n",
- "Spark UI\n",
- "Version: v3.3.2\n",
- "Master: yarn\n",
- "AppName: pyspark_shell_swan"
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 1,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "666f70d9",
- "metadata": {},
- "outputs": [],
- "source": [
- "# !hdfs dfs -stat /project/awg/cms/rucio/2023-07-24/"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "bd6751a6",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "23/08/09 12:12:50 WARN ipc.Client: Exception encountered while connecting to the server \n",
- "org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.ipc.StandbyException): Operation category READ is not supported in state standby. Visit https://s.apache.org/sbnn-error\n",
- "\tat org.apache.hadoop.security.SaslRpcClient.saslConnect(SaslRpcClient.java:376)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection.setupSaslConnection(Client.java:622)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection.access$2300(Client.java:413)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection$2.run(Client.java:822)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection$2.run(Client.java:818)\n",
- "\tat java.base/java.security.AccessController.doPrivileged(Native Method)\n",
- "\tat java.base/javax.security.auth.Subject.doAs(Subject.java:423)\n",
- "\tat org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1845)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection.setupIOstreams(Client.java:818)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection.access$3800(Client.java:413)\n",
- "\tat org.apache.hadoop.ipc.Client.getConnection(Client.java:1636)\n",
- "\tat org.apache.hadoop.ipc.Client.call(Client.java:1452)\n",
- "\tat org.apache.hadoop.ipc.Client.call(Client.java:1405)\n",
- "\tat org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:234)\n",
- "\tat org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:119)\n",
- "\tat com.sun.proxy.$Proxy12.getFileInfo(Unknown Source)\n",
- "\tat org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.getFileInfo(ClientNamenodeProtocolTranslatorPB.java:964)\n",
- "\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n",
- "\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n",
- "\tat java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n",
- "\tat java.base/java.lang.reflect.Method.invoke(Method.java:566)\n",
- "\tat org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:422)\n",
- "\tat org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeMethod(RetryInvocationHandler.java:165)\n",
- "\tat org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invoke(RetryInvocationHandler.java:157)\n",
- "\tat org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeOnce(RetryInvocationHandler.java:95)\n",
- "\tat org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:359)\n",
- "\tat com.sun.proxy.$Proxy13.getFileInfo(Unknown Source)\n",
- "\tat org.apache.hadoop.hdfs.DFSClient.getFileInfo(DFSClient.java:1731)\n",
- "\tat org.apache.hadoop.hdfs.DistributedFileSystem$29.doCall(DistributedFileSystem.java:1725)\n",
- "\tat org.apache.hadoop.hdfs.DistributedFileSystem$29.doCall(DistributedFileSystem.java:1722)\n",
- "\tat org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)\n",
- "\tat org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:1737)\n",
- "\tat org.apache.hadoop.fs.Globber.getFileStatus(Globber.java:115)\n",
- "\tat org.apache.hadoop.fs.Globber.doGlob(Globber.java:349)\n",
- "\tat org.apache.hadoop.fs.Globber.glob(Globber.java:202)\n",
- "\tat org.apache.hadoop.fs.FileSystem.globStatus(FileSystem.java:2093)\n",
- "\tat org.apache.hadoop.fs.shell.PathData.expandAsGlob(PathData.java:353)\n",
- "\tat org.apache.hadoop.fs.shell.Command.expandArgument(Command.java:250)\n",
- "\tat org.apache.hadoop.fs.shell.Command.expandArguments(Command.java:233)\n",
- "\tat org.apache.hadoop.fs.shell.FsCommand.processRawArguments(FsCommand.java:104)\n",
- "\tat org.apache.hadoop.fs.shell.Command.run(Command.java:177)\n",
- "\tat org.apache.hadoop.fs.FsShell.run(FsShell.java:327)\n",
- "\tat org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:76)\n",
- "\tat org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:90)\n",
- "\tat org.apache.hadoop.fs.FsShell.main(FsShell.java:390)\n",
- "Found 10 items\n",
- "drwxrwxr-x+ - cmssqoop c3 0 2023-07-25 04:18 /project/awg/cms/rucio/2023-07-25/contents\n",
- "drwxrwxr-x+ - cmssqoop c3 0 2023-07-25 04:22 /project/awg/cms/rucio/2023-07-25/dataset_locks\n",
- "drwxrwxr-x+ - cmssqoop c3 0 2023-07-25 04:13 /project/awg/cms/rucio/2023-07-25/dids\n",
- "drwxrwxr-x+ - cmssqoop c3 0 2023-07-25 04:28 /project/awg/cms/rucio/2023-07-25/locks\n",
- "drwxrwxr-x+ - cmssqoop c3 0 2023-07-25 04:06 /project/awg/cms/rucio/2023-07-25/replicas\n",
- "drwxrwxr-x+ - cmssqoop c3 0 2023-07-25 04:46 /project/awg/cms/rucio/2023-07-25/requests_history\n",
- "drwxrwxr-x+ - cmssqoop c3 0 2023-07-25 04:53 /project/awg/cms/rucio/2023-07-25/rses\n",
- "drwxrwxr-x+ - cmssqoop c3 0 2023-07-25 04:33 /project/awg/cms/rucio/2023-07-25/rules\n",
- "drwxrwxr-x+ - cmssqoop c3 0 2023-07-25 04:38 /project/awg/cms/rucio/2023-07-25/rules_history\n",
- "drwxrwxr-x+ - cmssqoop c3 0 2023-07-25 04:50 /project/awg/cms/rucio/2023-07-25/subscriptions\n"
- ]
- }
- ],
- "source": [
- "# check available files\n",
- "!hdfs dfs -ls /project/awg/cms/rucio/2023-07-25"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "800a2f9e",
- "metadata": {},
- "outputs": [],
- "source": [
- "import pickle\n",
- "from datetime import datetime, timedelta\n",
- "\n",
- "import click\n",
- "import os\n",
- "import pandas as pd\n",
- "import pprint\n",
- "import time\n",
- "from dateutil.relativedelta import relativedelta\n",
- "from pyspark import SparkContext, StorageLevel\n",
- "from pyspark.sql import SparkSession\n",
- "from pyspark.sql.functions import (\n",
- " col, collect_list, concat_ws, greatest, lit, lower, when,\n",
- " avg as _avg,\n",
- " count as _count,\n",
- " hex as _hex,\n",
- " max as _max,\n",
- " min as _min,\n",
- " round as _round,\n",
- " sum as _sum,\n",
- ")\n",
- "\n",
- "from pyspark.sql.types import (\n",
- " LongType,\n",
- ")\n",
- "\n",
- "#from CMSSpark.src.python.CMSSpark import schemas as cms_schemas"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "e597820f",
- "metadata": {},
- "source": [
- "## load dataset"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "2c100a92",
- "metadata": {},
- "outputs": [],
- "source": [
- "wa_date = str(datetime.now())[:10]\n",
- "# wa_date = \"2023-08-08\"\n",
- "\n",
- "HDFS_RUCIO_DATASET_LOCKS = f'/project/awg/cms/rucio/{wa_date}/dataset_locks/part*.avro'\n",
- "# HDFS_RUCIO_LOCKS = f'/project/awg/cms/rucio/{wa_date}/locks'\n",
- "HDFS_RUCIO_RSES = f'/project/awg/cms/rucio/{wa_date}/rses/part*.avro'\n",
- "HDFS_RUCIO_RULES = f'/project/awg/cms/rucio/{wa_date}/rules'\n",
- "# HDFS_RUCIO_RULES_HISTORY = f'/project/awg/cms/rucio/{wa_date}/rules_history'\n",
- "# HDFS_RUCIO_REPLICAS = f'/project/awg/cms/rucio/{wa_date}/replicas'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "fe62d431",
- "metadata": {},
- "outputs": [],
- "source": [
- "rucio_dataset_locks = spark.read.format('avro').load(HDFS_RUCIO_DATASET_LOCKS)\\\n",
- " .withColumn('BYTES', col('BYTES').cast(LongType()))\\\n",
- " .withColumn('RULE_ID', lower(_hex(col('RULE_ID'))))\\\n",
- " .withColumn('RSE_ID', lower(_hex(col('RSE_ID')))).filter(f\"\"\"ACCOUNT IN ('crab_tape_recall')\"\"\").cache()\n",
- "rucio_dataset_locks.createOrReplaceTempView(\"dataset_locks\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "b2e4fcfa",
- "metadata": {},
- "outputs": [],
- "source": [
- "rucio_rses = spark.read.format('avro').load(HDFS_RUCIO_RSES)\\\n",
- " .withColumn('ID', lower(_hex(col('ID'))))\n",
- "rucio_rses.createOrReplaceTempView(\"rses\")\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "id": "3893197e",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "23/08/09 12:37:11 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n"
- ]
- }
- ],
- "source": [
- "rucio_rules = spark.read.format('avro').load(HDFS_RUCIO_RULES)\\\n",
- " .withColumn('ID', lower(_hex(col('ID'))))\n",
- "rucio_rules.createOrReplaceTempView(\"rules\")\n",
- "#spark.sql(\"SELECT * FROM rules\").count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "f9f2ba4e",
- "metadata": {},
- "outputs": [],
- "source": [
- "# rucio_locks = spark.read.format('avro').load(HDFS_RUCIO_LOCKS)\\\n",
- "# .withColumn('BYTES', col('BYTES').cast(LongType()))\\\n",
- "# .withColumn('RULE_ID', lower(_hex(col('RULE_ID'))))\\\n",
- "# .withColumn('RSE_ID', lower(_hex(col('RSE_ID'))))\n",
- "# rucio_locks.createOrReplaceTempView(\"locks\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "7771b12d",
- "metadata": {},
- "outputs": [],
- "source": [
- "# rucio_rules_history = spark.read.format('avro').load(HDFS_RUCIO_RULES_HISTORY)\\\n",
- "# .withColumn('ID', lower(_hex(col('ID'))))\n",
- "# #.persist(StorageLevel.DISK_ONLY)\n",
- "# rucio_rules_history.createOrReplaceTempView(\"rules_history\")\n",
- "# #spark.sql(\"SELECT * FROM rules_history\").count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "id": "274421b8",
- "metadata": {},
- "outputs": [],
- "source": [
- "# rucio_replicas = spark.read.format('avro').load(HDFS_RUCIO_REPLICAS)\\\n",
- "# .withColumn('RSE_ID', lower(_hex(col('RSE_ID'))))\n",
- "# rucio_replicas.createOrReplaceTempView(\"replicas\")\n",
- "# #spark.sql(\"SELECT * FROM replicas\").count()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "5c84635f",
- "metadata": {},
- "source": [
- "## Query"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "id": "9be915ed",
- "metadata": {},
- "outputs": [],
- "source": [
- "# rucio_dataset_locks.count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "id": "8648794b",
- "metadata": {},
- "outputs": [],
- "source": [
- "# rucio_dataset_locks.printSchema()\n",
- "# rucio_rses.printSchema()\n",
- "# rucio_rules.printSchema()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "id": "3aed55c6",
- "metadata": {},
- "outputs": [],
- "source": [
- "# rucio_dataset_locks = rucio_dataset_locks.select('')\n",
- "rucio_rses = rucio_rses.select('ID', 'RSE', 'RSE_TYPE').cache()\n",
- "rucio_rules = rucio_rules.select('ID', 'ACCOUNT', 'DID_TYPE', 'EXPIRES_AT').cache()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "id": "929705b6",
- "metadata": {},
- "outputs": [],
- "source": [
- "result_df = rucio_dataset_locks.join(rucio_rses, rucio_rses[\"ID\"] == rucio_dataset_locks[\"RSE_ID\"])\\\n",
- " .join(rucio_rules, rucio_rules[\"ID\"] == rucio_dataset_locks[\"RULE_ID\"]).drop('ID', 'RULE_ID', 'RSE_ID', 'ACCESSED_AT', 'ACCOUNT')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "id": "49af7fee",
- "metadata": {},
- "outputs": [],
- "source": [
- "# result_df.show(100)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "id": "91db6a20",
- "metadata": {},
- "outputs": [],
- "source": [
- "# result_df.printSchema()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "id": "7cbdf730",
- "metadata": {},
- "outputs": [],
- "source": [
- "# result_df.count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "id": "e734d507",
- "metadata": {},
- "outputs": [],
- "source": [
- "docs = result_df.toPandas().to_dict('records')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "id": "ac8524e0",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "17770"
- ]
- },
- "execution_count": 38,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(docs)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "id": "6d047c66",
- "metadata": {},
- "outputs": [],
- "source": [
- "for i in range(len(docs)):\n",
- " docs[i]['SIZE_TiB'] = docs[i][\"BYTES\"]/1099511627776\n",
- " del docs[i][\"BYTES\"]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "id": "c052b072",
- "metadata": {},
- "outputs": [],
- "source": [
- "TIME = datetime.strptime(f\"\"\"{wa_date} 00:00:00\"\"\", \"%Y-%m-%d %H:%M:%S\").timestamp()*1000\n",
- "for i in range(len(docs)):\n",
- " docs[i]['TIMESTAMP'] = TIME"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 41,
- "id": "836a7743",
- "metadata": {},
- "outputs": [],
- "source": [
- "for i in range(len(docs)):\n",
- " NAME_i = docs[i]['NAME']\n",
- " split_NAME = NAME_i.split('#')[0]\n",
- " docs[i]['NAME_'] = NAME_i.split('#')[0]\n",
- " split_NAME = docs[i]['NAME_'].split('/')\n",
- " if len(split_NAME) != 4:\n",
- " print(\"YO HOO !!, something wrong.\", NAME_i)\n",
- " docs[i]['PriDataset'] = split_NAME[1]\n",
- " docs[i]['DataTier'] = split_NAME[-1] "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 42,
- "id": "51bf031e",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[{'SCOPE': 'cms',\n",
- " 'NAME': '/ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8/RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3/MINIAODSIM#c7b37e2d-77d8-40b9-b8c9-cdf7658406bd',\n",
- " 'STATE': 'O',\n",
- " 'LENGTH': '1',\n",
- " 'UPDATED_AT': 1689164433000,\n",
- " 'CREATED_AT': 1689096938000,\n",
- " 'RSE': 'T2_UK_SGrid_RALPP',\n",
- " 'RSE_TYPE': 'DISK',\n",
- " 'DID_TYPE': 'C',\n",
- " 'EXPIRES_AT': 1691719252000,\n",
- " 'SIZE_TiB': 0.0003293267427579849,\n",
- " 'TIMESTAMP': 1691532000000.0,\n",
- " 'NAME_': '/ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8/RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3/MINIAODSIM',\n",
- " 'PriDataset': 'ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8',\n",
- " 'DataTier': 'MINIAODSIM'},\n",
- " {'SCOPE': 'cms',\n",
- " 'NAME': '/ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8/RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3/MINIAODSIM#4e06c095-6b19-46a1-a6a6-321e6692a086',\n",
- " 'STATE': 'O',\n",
- " 'LENGTH': '1',\n",
- " 'UPDATED_AT': 1689164433000,\n",
- " 'CREATED_AT': 1689096938000,\n",
- " 'RSE': 'T2_UK_SGrid_RALPP',\n",
- " 'RSE_TYPE': 'DISK',\n",
- " 'DID_TYPE': 'C',\n",
- " 'EXPIRES_AT': 1691719252000,\n",
- " 'SIZE_TiB': 0.00011089865711255698,\n",
- " 'TIMESTAMP': 1691532000000.0,\n",
- " 'NAME_': '/ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8/RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3/MINIAODSIM',\n",
- " 'PriDataset': 'ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8',\n",
- " 'DataTier': 'MINIAODSIM'},\n",
- " {'SCOPE': 'cms',\n",
- " 'NAME': '/ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8/RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3/MINIAODSIM#1a79fa1f-9f97-4f0f-9716-523e29e57c32',\n",
- " 'STATE': 'O',\n",
- " 'LENGTH': '1',\n",
- " 'UPDATED_AT': 1689164433000,\n",
- " 'CREATED_AT': 1689096938000,\n",
- " 'RSE': 'T2_UK_SGrid_RALPP',\n",
- " 'RSE_TYPE': 'DISK',\n",
- " 'DID_TYPE': 'C',\n",
- " 'EXPIRES_AT': 1691719252000,\n",
- " 'SIZE_TiB': 0.001415386764165305,\n",
- " 'TIMESTAMP': 1691532000000.0,\n",
- " 'NAME_': '/ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8/RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3/MINIAODSIM',\n",
- " 'PriDataset': 'ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8',\n",
- " 'DataTier': 'MINIAODSIM'},\n",
- " {'SCOPE': 'cms',\n",
- " 'NAME': '/ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8/RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3/MINIAODSIM#18958704-f8f5-4ab4-8d26-0875a74714c4',\n",
- " 'STATE': 'O',\n",
- " 'LENGTH': '1',\n",
- " 'UPDATED_AT': 1689164433000,\n",
- " 'CREATED_AT': 1689096938000,\n",
- " 'RSE': 'T2_UK_SGrid_RALPP',\n",
- " 'RSE_TYPE': 'DISK',\n",
- " 'DID_TYPE': 'C',\n",
- " 'EXPIRES_AT': 1691719252000,\n",
- " 'SIZE_TiB': 0.0008716376141819637,\n",
- " 'TIMESTAMP': 1691532000000.0,\n",
- " 'NAME_': '/ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8/RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3/MINIAODSIM',\n",
- " 'PriDataset': 'ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8',\n",
- " 'DataTier': 'MINIAODSIM'},\n",
- " {'SCOPE': 'cms',\n",
- " 'NAME': '/ParkingDoubleMuonLowMass1/Run2023C-PromptReco-v3/AOD#ef5c7b53-7002-4b16-bd94-c9e6cbd1ddc6',\n",
- " 'STATE': 'O',\n",
- " 'LENGTH': '1',\n",
- " 'UPDATED_AT': 1689903482000,\n",
- " 'CREATED_AT': 1689587082000,\n",
- " 'RSE': 'T2_BE_UCL',\n",
- " 'RSE_TYPE': 'DISK',\n",
- " 'DID_TYPE': 'C',\n",
- " 'EXPIRES_AT': 1692496353000,\n",
- " 'SIZE_TiB': 5.84150075155776e-06,\n",
- " 'TIMESTAMP': 1691532000000.0,\n",
- " 'NAME_': '/ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8/RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3/MINIAODSIM',\n",
- " 'PriDataset': 'ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8',\n",
- " 'DataTier': 'MINIAODSIM'}]"
- ]
- },
- "execution_count": 42,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "docs[:5]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "id": "5c770068",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['',\n",
- " 'ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8',\n",
- " 'RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3',\n",
- " 'MINIAODSIM#c7b37e2d-77d8-40b9-b8c9-cdf7658406bd']"
- ]
- },
- "execution_count": 27,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "split_str = test_str.split('/')\n",
- "split_str"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "id": "2a2868f7",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['MINIAODSIM', 'c7b37e2d-77d8-40b9-b8c9-cdf7658406bd']"
- ]
- },
- "execution_count": 29,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "split_str[3].split('#')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "id": "86f3a742",
- "metadata": {},
- "outputs": [],
- "source": [
- "import osearch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "id": "6d29e62d",
- "metadata": {},
- "outputs": [],
- "source": [
- "def get_index_schema():\n",
- " return {\n",
- " \"settings\": {\"index\": {\"number_of_shards\": \"1\", \"number_of_replicas\": \"1\"}},\n",
- " \"mappings\": {\n",
- " \"properties\": {\n",
- " 'SCOPE': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " 'NAME': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " 'STATE': {\"ignore_above\": 1024, \"type\": \"keyword\"},\n",
- " 'LENGTH': {\"ignore_above\": 1024, \"type\": \"keyword\"},\n",
- " 'SIZE_TiB': {\"type\": \"long\"},\n",
- " 'UPDATED_AT': {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
- " 'CREATED_AT': {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
- " 'RSE': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " 'RSE_TYPE': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " 'DID_TYPE': {\"ignore_above\": 1024, \"type\": \"keyword\"},\n",
- " 'EXPIRES_AT': {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
- " 'TIMESTAMP': {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
- " 'NAME_': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " 'PriDataset': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " 'DataTier': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " }\n",
- " }\n",
- " }"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "b479eeb7",
- "metadata": {},
- "outputs": [],
- "source": [
- "# _index_template = 'crab-tape-recall-rules-ekong'\n",
- "# client = osearch.get_es_client(\"es-cms1.cern.ch/es\", 'secret_opensearch.txt', get_index_schema())\n",
- "# # index_mod=\"\": 'test-foo', index_mod=\"Y\": 'test-foo-YYYY', index_mod=\"M\": 'test-foo-YYYY-MM', index_mod=\"D\": 'test-foo-YYYY-MM-DD',\n",
- "# idx = client.get_or_create_index(timestamp=time.time(), index_template=_index_template, index_mod=\"M\")\n",
- "# client.send(idx, docs, metadata=None, batch_size=10000, drop_nulls=False)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "0af51d3a",
- "metadata": {},
- "outputs": [],
- "source": [
- "from datetime import datetime, timedelta\n",
- "import os\n",
- "import pandas as pd\n",
- "import time\n",
- "from pyspark import SparkContext, StorageLevel\n",
- "from pyspark.sql import SparkSession\n",
- "from pyspark.sql.functions import (\n",
- " col, collect_list, concat_ws, greatest, lit, lower, when,\n",
- " avg as _avg,\n",
- " count as _count,\n",
- " hex as _hex,\n",
- " max as _max,\n",
- " min as _min,\n",
- " round as _round,\n",
- " sum as _sum,\n",
- ")\n",
- "\n",
- "from pyspark.sql.types import (\n",
- " LongType,\n",
- ")\n",
- "\n",
- "import numpy as np\n",
- "import osearch\n",
- "from pyspark.sql import SparkSession"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "035e6ecf",
- "metadata": {},
- "source": [
- "## Multiple Day Upload"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "12ece939",
- "metadata": {},
- "outputs": [],
- "source": [
- "def multi_upload(start_date, end_date):\n",
- " # change to the date of collected data\n",
- " start_date = start_date + timedelta(days=1)\n",
- " end_date = end_date + timedelta(days=1)\n",
- " \n",
- " days = (end_date - start_date).days\n",
- " for i in range(days):\n",
- " TODAY = start_date + timedelta(days=i)\n",
- " TODAY = str(TODAY)[:10]\n",
- " \n",
- " print(TODAY)\n",
- " # Import data into database form\n",
- "\n",
- " wa_date = TODAY\n",
- " HDFS_RUCIO_DATASET_LOCKS = f'/project/awg/cms/rucio/{wa_date}/dataset_locks/part*.avro'\n",
- " HDFS_RUCIO_RSES = f'/project/awg/cms/rucio/{wa_date}/rses/part*.avro'\n",
- " HDFS_RUCIO_RULES = f'/project/awg/cms/rucio/{wa_date}/rules'\n",
- "\n",
- " rucio_dataset_locks = spark.read.format('avro').load(HDFS_RUCIO_DATASET_LOCKS)\\\n",
- " .withColumn('BYTES', col('BYTES').cast(LongType()))\\\n",
- " .withColumn('RULE_ID', lower(_hex(col('RULE_ID'))))\\\n",
- " .withColumn('RSE_ID', lower(_hex(col('RSE_ID')))).filter(f\"\"\"ACCOUNT IN ('crab_tape_recall')\"\"\").cache()\n",
- " rucio_dataset_locks.createOrReplaceTempView(\"dataset_locks\")\n",
- "\n",
- " rucio_rses = spark.read.format('avro').load(HDFS_RUCIO_RSES)\\\n",
- " .withColumn('ID', lower(_hex(col('ID'))))\n",
- " rucio_rses.createOrReplaceTempView(\"rses\")\n",
- "\n",
- " rucio_rules = spark.read.format('avro').load(HDFS_RUCIO_RULES)\\\n",
- " .withColumn('ID', lower(_hex(col('ID'))))\n",
- " rucio_rules.createOrReplaceTempView(\"rules\")\n",
- "\n",
- " # filter and query\n",
- "\n",
- " rucio_rses = rucio_rses.select('ID', 'RSE', 'RSE_TYPE').cache()\n",
- " rucio_rules = rucio_rules.select('ID', 'ACCOUNT', 'DID_TYPE', 'EXPIRES_AT').cache()\n",
- "\n",
- " result_df = rucio_dataset_locks.join(rucio_rses, rucio_rses[\"ID\"] == rucio_dataset_locks[\"RSE_ID\"])\\\n",
- " .join(rucio_rules, rucio_rules[\"ID\"] == rucio_dataset_locks[\"RULE_ID\"]).drop('ID', 'RULE_ID', 'RSE_ID', 'ACCESSED_AT', 'ACCOUNT')\n",
- "\n",
- " # Convert database to dictionary\n",
- "\n",
- " docs = result_df.toPandas().to_dict('records')\n",
- " \n",
- " # Add TIMESTAMP column and convert TiB\n",
- " TIME = datetime.strptime(f\"\"\"{wa_date} 00:00:00\"\"\", \"%Y-%m-%d %H:%M:%S\").timestamp()*1000\n",
- " for i in range(len(docs)):\n",
- " docs[i]['TIMESTAMP'] = TIME\n",
- " docs[i]['SIZE_TiB'] = docs[i][\"BYTES\"]/1099511627776\n",
- " del docs[i][\"BYTES\"]\n",
- " \n",
- " # break down the name\n",
- " NAME_i = docs[i]['NAME']\n",
- " split_NAME = NAME_i.split('#')[0]\n",
- " docs[i]['NAME_'] = NAME_i.split('#')[0]\n",
- " split_NAME = docs[i]['NAME_'].split('/')\n",
- " if len(split_NAME) != 4:\n",
- " print(\"YO HOO !!, something wrong.\", NAME_i)\n",
- " docs[i]['PriDataset'] = split_NAME[1]\n",
- " docs[i]['DataTier'] = split_NAME[-1]\n",
- "\n",
- " # Define type of each schema\n",
- "\n",
- " def get_index_schema():\n",
- " return {\n",
- " \"settings\": {\"index\": {\"number_of_shards\": \"1\", \"number_of_replicas\": \"1\"}},\n",
- " \"mappings\": {\n",
- " \"properties\": {\n",
- " 'SCOPE': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " 'NAME': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " 'STATE': {\"ignore_above\": 1024, \"type\": \"keyword\"},\n",
- " 'LENGTH': {\"ignore_above\": 1024, \"type\": \"keyword\"},\n",
- " 'BYTES': {\"type\": \"long\"},\n",
- " 'UPDATED_AT': {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
- " 'CREATED_AT': {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
- " 'RSE': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " 'RSE_TYPE': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " 'DID_TYPE': {\"ignore_above\": 1024, \"type\": \"keyword\"},\n",
- " 'EXPIRES_AT': {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
- " 'TIMESTAMP': {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
- " 'NAME_': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " 'PriDataset': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " 'DataTier': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " }\n",
- " }\n",
- " }\n",
- "\n",
- " # Send data to Opensearch\n",
- "\n",
- " _index_template = 'crab-tape-recall-rules-ekong'\n",
- " client = osearch.get_es_client(\"es-cms1.cern.ch/es\", 'secret_opensearch.txt', get_index_schema())\n",
- " idx = client.get_or_create_index(timestamp=time.time(), index_template=_index_template, index_mod=\"M\")\n",
- " no_of_fail_saved = client.send(idx, docs, metadata=None, batch_size=10000, drop_nulls=False)\n",
- "\n",
- " print(\"========================================================================\", \"FINISHED : \", len(docs), \"ROWS ARE SENT\", no_of_fail_saved, \"ROWS ARE FAILED\", \"========================================================================\", sep='\\n')\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "id": "f4567c46",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "2023-07-23\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "========================================================================\n",
- "FINISHED : \n",
- "40190\n",
- "ROWS ARE SENT\n",
- "0\n",
- "ROWS ARE FAILED\n",
- "========================================================================\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n"
- ]
- }
- ],
- "source": [
- "# upload the data of start_date day to end_date-1d\n",
- "start_date = datetime(2023, 7, 23)\n",
- "end_date = datetime(2023, 7, 24)\n",
- "\n",
- "multi_upload(start_date, end_date)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "546e9d4f",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "496e681c",
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "@webio": {
- "lastCommId": null,
- "lastKernelId": null
- },
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.9.12"
- },
- "sparkconnect": {
- "bundled_options": [],
- "list_of_options": [
- {
- "name": "spark.jars.packages",
- "value": "org.apache.spark:spark-avro_2.12:3.3.1"
- }
- ]
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/src/script/Monitor/crab-spark/notebooks/crab_taskdb.ipynb b/src/script/Monitor/crab-spark/notebooks/crab_taskdb.ipynb
new file mode 100644
index 0000000000..a491927996
--- /dev/null
+++ b/src/script/Monitor/crab-spark/notebooks/crab_taskdb.ipynb
@@ -0,0 +1,416 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "bcae07ec",
+ "metadata": {},
+ "source": [
+ "# CRAB Spark taskdb\n",
+ "\n",
+    "This job \"copies\" selected columns from the TaskDB table to OpenSearch to answer these questions (a rough aggregation example is sketched in the cell just below):\n",
+    "- How many tasks use each CRAB feature? (split algorithm, Ignorelocality, ScriptExe, GPU)\n",
+    "- How many tasks does each user submit?\n",
+    "- How many tasks use Ignorelocality?\n"
+ ]
+ },
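+  {
+   "cell_type": "markdown",
+   "id": "aa11bb22",
+   "metadata": {},
+   "source": [
+    "The next cell is an illustrative sketch only, not part of the production flow: it shows the kind of Spark aggregation the questions above translate into. The dataframe and the column names (`TM_SPLIT_ALGO`, `TM_USERNAME`) are hypothetical placeholders and may differ from what this job actually reads from TaskDB."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cc33dd44",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Illustrative sketch only: the aggregation style behind the questions above.\n",
+    "# Column names and demo values are hypothetical placeholders.\n",
+    "from pyspark.sql import SparkSession\n",
+    "from pyspark.sql.functions import count as n_count\n",
+    "\n",
+    "_spark = SparkSession.builder.getOrCreate()\n",
+    "_demo = _spark.createDataFrame(\n",
+    "    [('Automatic', 'alice'), ('FileBased', 'bob'), ('Automatic', 'alice')],\n",
+    "    ['TM_SPLIT_ALGO', 'TM_USERNAME'],\n",
+    ")\n",
+    "# tasks per CRAB feature (here: split algorithm)\n",
+    "_demo.groupBy('TM_SPLIT_ALGO').agg(n_count('*').alias('n_tasks')).show()\n",
+    "# tasks per user\n",
+    "_demo.groupBy('TM_USERNAME').agg(n_count('*').alias('n_tasks')).show()"
+   ]
+  },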
+ {
+ "cell_type": "markdown",
+ "id": "6d41c8e6",
+ "metadata": {},
+ "source": [
+ "## Import lib"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5e9af689",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from datetime import datetime, timedelta, timezone\n",
+ "import os\n",
+ "import time\n",
+ "import pandas as pd\n",
+ "\n",
+ "from pyspark import SparkContext, StorageLevel\n",
+ "from pyspark.sql import SparkSession\n",
+ "from pyspark.sql.functions import (\n",
+ " current_user,\n",
+ " col, collect_list, concat_ws, greatest, lit, lower, when,\n",
+ " avg as _avg,\n",
+ " count as _count,\n",
+ " hex as _hex,\n",
+ " max as _max,\n",
+ " min as _min,\n",
+ " round as _round,\n",
+ " sum as _sum,\n",
+ ")\n",
+ "from pyspark.sql.types import (\n",
+ " StructType,\n",
+ " LongType,\n",
+ " StringType,\n",
+ " StructField,\n",
+ " DoubleType,\n",
+ " IntegerType,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "07a5e399",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# try to import libs from current directory, fallback to $PWD/../workdir if not found\n",
+ "try:\n",
+ " from crabspark_utils import get_candidate_files, send_os, send_os_parallel\n",
+ "except ModuleNotFoundError:\n",
+ " import sys\n",
+ " sys.path.insert(0, f'{os.getcwd()}/../workdir')\n",
+ " from crabspark_utils import get_candidate_files, send_os, send_os_parallel\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "22946659",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "spark = SparkSession\\\n",
+ " .builder\\\n",
+ " .appName('crab-taskdb')\\\n",
+ " .getOrCreate()\n",
+ "spark"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e9013878",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# clear any cache left, for working with notebook\n",
+ "# it safe to run everytime cronjob start\n",
+ "spark.catalog.clearCache()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "17a6078f",
+ "metadata": {},
+ "source": [
+ "## Arguments\n",
+ "\n",
+ "We provide arguments to this script via env var. \n",
+ "- `OPENSEARCH_SECRET_PATH`: path to secretfile, contain a line of : of opensearch that we send the data to\n",
+ "- `PROD`: if true index prefix will be `crab-prod-`, otherwise `crab-test-`\n",
+ "- `START`: start date (YYYY-MM-dd)\n",
+ "- `END`: end date (YYYY-MM-dd)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "31c19eb0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# secret path, also check if file exists\n",
+ "secretpath = os.environ.get('OPENSEARCH_SECRET_PATH', f'{os.getcwd()}/../workdir/secret_opensearch.txt')\n",
+ "if not os.path.isfile(secretpath): \n",
+ " raise Exception(f'OS secrets file {secretpath} does not exists')\n",
+ "# if PROD, index prefix will be `crab-*`, otherwise `crab-test-*`\n",
+ "PROD = os.environ.get('PROD', 'false').lower() in ('true', '1', 't')\n",
+ "# FROM_DATE, in strptime(\"%Y-%m-%d\")\n",
+ "START = os.environ.get('START_DATE', None) \n",
+ "END = os.environ.get('END_DATE', None)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f15e62ea",
+ "metadata": {},
+ "source": [
+ "## Variables \n",
+ "Will be used throughout notebook"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e843eb6d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# For run playbook manually, set start/end date here\n",
+ "START_DATE = \"2024-01-03\"\n",
+ "END_DATE = \"2024-10-04\"\n",
+ "# if cronjob, replace constant with value from env\n",
+ "if START and END:\n",
+ " START_DATE = START\n",
+ " END_DATE = END"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b17ed53f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# index name\n",
+ "index_name = 'taskdb'\n",
+ "# use prod index pattern if this execution is for production\n",
+ "if PROD:\n",
+ " index_name = f'crab-prod-{index_name}'\n",
+ "else:\n",
+ " index_name = f'crab-test-{index_name}'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "430146eb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# datetime object\n",
+ "start_datetime = datetime.strptime(START_DATE, \"%Y-%m-%d\").replace(tzinfo=timezone.utc)\n",
+ "end_datetime = datetime.strptime(END_DATE, \"%Y-%m-%d\").replace(tzinfo=timezone.utc)\n",
+ "# sanity check\n",
+ "if end_datetime < start_datetime: \n",
+ " raise Exception(f\"end date ({END_DATE}) is less than start date ({START_DATE})\")\n",
+ "start_epochmilis = int(start_datetime.timestamp()) * 1000\n",
+ "end_epochmilis = int(end_datetime.timestamp()) * 1000\n",
+ "yesterday_epoch = int((end_datetime-timedelta(days=1)).timestamp())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9404c437",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# debug\n",
+ "print(START_DATE, \n",
+ " END_DATE, \n",
+ " index_name,\n",
+ " sep='\\n')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9b33ec96",
+ "metadata": {},
+ "source": [
+ "## Loading data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0cf35868",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Note that \"today\" file, for example, today=2024-10-04, should be in directory /project/awg/cms/crab/tasks/2024-10-04 \n",
+ "# which contain contents from the begining of table until the time of dump job run\n",
+ "# which mean data before 2024-10-04 will be available, but not 2024-10-04 itself!\n",
+ "\n",
+ "HDFS_CRAB_part = f'/project/awg/cms/crab/tasks/{END_DATE}/' # data each day in hdfs contain whole table\n",
+ "print(\"===============================================\"\n",
+ " , \"CRAB Table\"\n",
+ " , \"===============================================\"\n",
+ " , \"File Directory:\", HDFS_CRAB_part\n",
+ " , \"Work Directory:\", os.getcwd()\n",
+ " , \"===============================================\"\n",
+ " , \"===============================================\", sep='\\n')\n",
+ "\n",
+ "tasks_df = spark.read.format('avro').load(HDFS_CRAB_part).cache()\n",
+ "tasks_df = ( \n",
+ " tasks_df.select(\"TM_TASKNAME\",\"TM_START_TIME\",\"TM_TASK_STATUS\",\"TM_SPLIT_ALGO\",\"TM_USERNAME\",\"TM_USER_ROLE\",\"TM_JOB_TYPE\",\"TM_IGNORE_LOCALITY\",\"TM_SCRIPTEXE\",\"TM_USER_CONFIG\")\n",
+ " .filter(f\"\"\"\\\n",
+ " 1=1\n",
+ " AND TM_START_TIME >= {start_epochmilis}\n",
+ " AND TM_START_TIME < {end_epochmilis}\"\"\")\n",
+ " .cache()\n",
+ ")\n",
+ "tasks_df.createOrReplaceTempView(\"tasks\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "86c634fe",
+ "metadata": {},
+ "source": [
+ "## Query"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e271b1c8",
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "query = f\"\"\"\\\n",
+ "WITH reqacc_tb AS ( \n",
+ "SELECT TM_TASKNAME, TM_START_TIME, TM_TASK_STATUS, TM_SPLIT_ALGO, TM_USERNAME, TM_USER_ROLE, TM_JOB_TYPE, TM_IGNORE_LOCALITY, TM_SCRIPTEXE,\n",
+ " CASE \n",
+ " WHEN get_json_object(TM_USER_CONFIG, '$.requireaccelerator') = true THEN 'T'\n",
+ " ELSE 'F'\n",
+ " END AS REQUIRE_ACCELERATOR\n",
+ "FROM tasks\n",
+ "),\n",
+ "finalize_tb AS (\n",
+ "SELECT TM_TASKNAME, TM_START_TIME, TM_TASK_STATUS, TM_SPLIT_ALGO, TM_USERNAME, TM_USER_ROLE, TM_JOB_TYPE, TM_IGNORE_LOCALITY, TM_SCRIPTEXE, REQUIRE_ACCELERATOR,\n",
+ " TM_START_TIME AS timestamp,\n",
+ " 'taskdb' AS type\n",
+ "FROM reqacc_tb\n",
+ ")\n",
+ "SELECT * FROM finalize_tb\n",
+ "\"\"\"\n",
+ "\n",
+ "tmpdf = spark.sql(query)\n",
+ "tmpdf.show(10, False)\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6561ada6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tmpdf.count()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3c7fc2e5",
+ "metadata": {},
+ "source": [
+ "## Sending result to OpenSearch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c33dfce3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# convert spark df to dicts\n",
+ "docs = tmpdf.toPandas().to_dict('records')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "eee4a1f3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "schema = {\n",
+ " \"settings\": {\"index\": {\"number_of_shards\": \"1\", \"number_of_replicas\": \"1\"}},\n",
+ " \"mappings\": {\n",
+ " \"properties\": {\n",
+ " \"TM_TASKNAME\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"TM_START_TIME\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
+ " 'TM_TASK_STATUS': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"TM_SPLIT_ALGO\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"TM_USERNAME\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"TM_USER_ROLE\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"TM_JOB_TYPE\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"TM_IGNORE_LOCALITY\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"TM_SCRIPTEXE\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"REQUIRE_ACCELERATOR\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"type\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"timestamp\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
+ " }\n",
+ " }\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9ec824ee",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# this is simple workaround osearch bug when work in notebook because\n",
+ "# - it load the secret once and use forever\n",
+ "# - get_or_create_index() create index+schema only the first time it execute\n",
+ "# it is safe to run again even in cronjobs \n",
+ "import importlib\n",
+ "import osearch\n",
+ "importlib.reload(osearch)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "64bcf06e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "send_os(docs, index_name, schema, secretpath, yesterday_epoch)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "032d03e0",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "@webio": {
+ "lastCommId": null,
+ "lastKernelId": null
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.12"
+ },
+ "sparkconnect": {
+ "bundled_options": [],
+ "list_of_options": [
+ {
+ "name": "spark.jars.packages",
+ "value": "org.apache.spark:spark-avro_2.12:3.5.0"
+ }
+ ]
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
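For reference, a single document that this taskdb notebook ships to OpenSearch has roughly the shape sketched below; the field set follows the `finalize_tb` select and the index mapping in the notebook, while the values are purely illustrative.

# Illustrative shape of one document sent to the crab-{prod,test}-taskdb index
# (values are made up; the fields match the notebook's schema cell).
doc = {
    "TM_TASKNAME": "240103_120000:username_crab_mytask",  # keyword
    "TM_START_TIME": 1704283200000,                        # date, epoch_millis
    "TM_TASK_STATUS": "SUBMITTED",                         # keyword
    "TM_SPLIT_ALGO": "Automatic",                          # keyword
    "TM_USERNAME": "username",                             # keyword
    "TM_USER_ROLE": "",                                    # keyword
    "TM_JOB_TYPE": "Analysis",                             # keyword
    "TM_IGNORE_LOCALITY": "F",                             # keyword
    "TM_SCRIPTEXE": "F",                                   # keyword
    "REQUIRE_ACCELERATOR": "F",                            # 'T' only when requireaccelerator is set in TM_USER_CONFIG
    "timestamp": 1704283200000,                            # copy of TM_START_TIME, epoch_millis
    "type": "taskdb",                                      # constant added by the query
}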
diff --git a/src/script/Monitor/crab-spark/workdir/bootstrap.sh b/src/script/Monitor/crab-spark/workdir/bootstrap.sh
index 29e8d7f00e..9390cfac87 100644
--- a/src/script/Monitor/crab-spark/workdir/bootstrap.sh
+++ b/src/script/Monitor/crab-spark/workdir/bootstrap.sh
@@ -1,16 +1,10 @@
# source the environment for spark submit
kinit cmscrab@CERN.CH -k -t /data/certs/keytabs.d/cmscrab.keytab
-source hadoop-setconf.sh analytix
+source hadoop-setconf.sh analytix
LCG_VER=/cvmfs/sft.cern.ch/lcg/views/LCG_105a_swan/x86_64-el9-gcc13-opt
source $LCG_VER/setup.sh
export PYSPARK_PYTHON=$LCG_VER/bin/python3
-# i know, ugly, we should install software in the dockerfile
-# however, we really need an environment from cvmfs, and i am not sure we
-# can have access to cvmfs at build time in gitlab
-python3 -m pip install --user opensearch-py
-
# finish the environment
export CRAB_KRB5_USERNAME=$(klist | grep -i Default | cut -d":" -f2 | cut -d"@" -f"1" | awk '{$1=$1};1')
-
diff --git a/src/script/Monitor/crab-spark/workdir/crabspark_utils.py b/src/script/Monitor/crab-spark/workdir/crabspark_utils.py
new file mode 100644
index 0000000000..6d45b2b130
--- /dev/null
+++ b/src/script/Monitor/crab-spark/workdir/crabspark_utils.py
@@ -0,0 +1,94 @@
+"""
+Utility functions for spark scripts
+"""
+# pylint: disable=protected-access
+
+import concurrent.futures
+
+from datetime import timedelta
+from osearch import get_es_client, OpenSearchInterface
+
+def get_candidate_files(start_date, end_date, spark, base, day_delta=1):
+ """
+ Returns a list of hdfs folders that can contain data for the given dates.
+ Copy from CMSMONIT CMSSpark:
+ https://github.com/dmwm/CMSSpark/blob/b8efa0ac5cb57b617ee8d1ea9bb26d53fb0443b0/src/python/CMSSpark/spark_utils.py#L768
+ """
+ st_date = start_date - timedelta(days=day_delta)
+ ed_date = end_date + timedelta(days=day_delta)
+ days = (ed_date - st_date).days
+
+ sc = spark.sparkContext
+ # The candidate files are the folders to the specific dates,
+ # but if we are looking at recent days the compaction procedure may
+ # not have run yet, so we also consider the .tmp folders.
+
+ candidate_files = [
+ f"{base}/{(st_date + timedelta(days=i)).strftime('%Y/%m/%d')}{{,.tmp}}"
+ for i in range(0, days)
+ ]
+ fsystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
+ uri = sc._gateway.jvm.java.net.URI
+ path = sc._gateway.jvm.org.apache.hadoop.fs.Path
+ fs = fsystem.get(uri("hdfs:///"), sc._jsc.hadoopConfiguration())
+ candidate_files = [url for url in candidate_files if fs.globStatus(path(url))]
+ return candidate_files
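A minimal usage sketch of get_candidate_files follows; the HDFS base path below is hypothetical and only illustrates the {,.tmp} glob expansion described in the comment above.

# Usage sketch (hypothetical base path); only globs that actually exist on HDFS are returned.
from datetime import datetime
from pyspark.sql import SparkSession
from crabspark_utils import get_candidate_files

spark = SparkSession.builder.appName("candidate-files-demo").getOrCreate()
start = datetime(2024, 1, 3)
end = datetime(2024, 1, 4)
base = "/project/monitoring/archive/condor/raw/metric"  # hypothetical HDFS base path
# candidate days are 2024/01/02 .. 2024/01/04 (start-1d .. end+1d, end excluded), then filtered by existence
print(get_candidate_files(start, end, spark, base, day_delta=1))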
+
+
+def send_os(docs, index_name, schema, secretpath, timestamp, batch_size=10000, printsummary=True):
+ """
+ Convenience one-liner to send data to opensearch using the osearch lib
+
+ :param docs: documents to send to opensearch
+ :type docs: list of dict
+ :param index_name: opensearch index name
+ :type index_name: str
+ :param schema: opensearch index schema
+ :type schema: str
+ :param secretpath: path to secret file which contains "username:password"
+ :type secretpath: str
+ :param timestamp: timestamp in seconds, used to pick the (monthly) index to write to
+ :type timestamp: int
+ :param batch_size: how many docs we send to os in a single request
+ :type batch_size: int
+ :param printsummary: if yes, print summary text
+ :type printsummary: bool
+
+ :return: number of total docs and number of fail-to-send docs.
+ :rtype: (int, int)
+ """
+ client = get_es_client("os-cms.cern.ch/os", secretpath, schema)
+ idx = client.get_or_create_index(timestamp=timestamp, index_template=index_name, index_mod="M")
+ no_of_fail_saved = client.send(idx, docs, metadata=None, batch_size=batch_size, drop_nulls=False)
+ if printsummary:
+ print("========================================================================"
+ , "FINISHED : "
+ , len(docs), "ROWS ARE SENT"
+ , no_of_fail_saved, "ROWS ARE FAILED"
+ , "========================================================================", sep='\n')
+ return len(docs), no_of_fail_saved
+
+def send_os_parallel(docs, index_name, schema, secretpath, timestamp, batch_size=10000):
+ """
+ Convenience one-liner to send data to opensearch using the osearch lib,
+ in parallel.
+
+ Note that it takes the same params as send_os() except `printsummary`, and
+ returns None.
+ """
+ with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
+ futures = []
+ for chunk in OpenSearchInterface.to_chunks(docs, batch_size):
+ future = executor.submit(send_os, chunk, index_name, schema, secretpath, timestamp, batch_size+1, False)
+ futures.append(future)
+ total_docs = 0
+ total_fails = 0
+ for f in futures:
+ ndocs, nfails = f.result()
+ total_docs += ndocs
+ total_fails += nfails
+ print("========================================================================"
+ , "FINISHED : "
+ , total_docs, "ROWS ARE SENT"
+ , total_fails, "ROWS ARE FAILED"
+ , "========================================================================", sep='\n')
diff --git a/src/script/Monitor/crab-spark/workdir/osearch.py b/src/script/Monitor/crab-spark/workdir/osearch.py
index 68f2f5b1ad..cd5787aec0 100644
--- a/src/script/Monitor/crab-spark/workdir/osearch.py
+++ b/src/script/Monitor/crab-spark/workdir/osearch.py
@@ -55,6 +55,7 @@ def get_index_schema():
import json
import logging
import time
+import concurrent.futures
from collections import Counter as collectionsCounter
from datetime import datetime
@@ -96,6 +97,7 @@ def __init__(self, host, secret_file, index_mapping_and_settings):
url = 'https://' + username + ':' + password + '@' + host
self.handle = OpenSearch(
[url],
+ http_compress=True,
verify_certs=False,
use_ssl=True,
ca_certs='/etc/pki/tls/certs/ca-bundle.trust.crt',
@@ -215,3 +217,35 @@ def send(self, idx, data, metadata=None, batch_size=10000, drop_nulls=False):
logging.error("OpenSearch send failed count: ", result_n_failed)
logging.debug("OpenSearch send", len(data) - result_n_failed, "documents successfully")
return result_n_failed
+
+def send_os(docs, index_name, schema, secretpath, timestamp, batch_size=10000, printsummary=True):
+
+ client = get_es_client("os-cms.cern.ch/os", secretpath, schema)
+ idx = client.get_or_create_index(timestamp=timestamp, index_template=index_name, index_mod="M")
+ no_of_fail_saved = client.send(idx, docs, metadata=None, batch_size=batch_size, drop_nulls=False)
+ if printsummary:
+ print("========================================================================"
+ , "FINISHED : "
+ , len(docs), "ROWS ARE SENT"
+ , no_of_fail_saved, "ROWS ARE FAILED"
+ , "========================================================================", sep='\n')
+ else:
+ return len(docs), no_of_fail_saved
+
+def send_os_parallel(docs, index_name, schema, secretpath, timestamp, batch_size=10000):
+ with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
+ futures = []
+ for chunk in OpenSearchInterface.to_chunks(docs, batch_size):
+ future = executor.submit(send_os, chunk, index_name, schema, secretpath, timestamp, batch_size+1, False)
+ futures.append(future)
+ total_docs = 0
+ total_fails = 0
+ for f in futures:
+ ndocs, nfails = f.result()
+ total_docs += ndocs
+ total_fails += nfails
+ print("========================================================================"
+ , "FINISHED : "
+ , total_docs, "ROWS ARE SENT"
+ , total_fails, "ROWS ARE FAILED"
+ , "========================================================================", sep='\n')
diff --git a/src/script/Monitor/crab-spark/workdir/run.py b/src/script/Monitor/crab-spark/workdir/run.py
new file mode 100755
index 0000000000..eb8224c8b6
--- /dev/null
+++ b/src/script/Monitor/crab-spark/workdir/run.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python3
+
+"""
+This file converts a spark notebook into a python file and runs spark-submit.
+It requires the shell to source "bootstrap.sh" first, to provide the commands
+and python libraries needed by this script.
+
+It parses the command-line arguments with argparse and passes them to the spark
+script via environment variables.
+
+Examples:
+- To extract data from the whole September 2024
+ ./run.py --secretpath secret.txt --start 2024-09-01 --end 2024-10-01 crab_taskdb.ipynb
+
+- To extract data from n days ago (in case you need to wait until the data settles)
+ For example, today is 2024-10-01 but you want to process data on 2024-09-30
+ ./run.py --secretpath secret.txt --ndaysago 2 crab_condor.ipynb
+
+- To push result docs to the production index (otherwise, the index name will be prefixed with `crab-test-`)
+ ./run.py --secretpath secret.txt --today --prod crab_taskdb.ipynb
+
+- To run in crontab daily, use "run_spark.sh" to prepare a new shell and source bootstrap.sh
+ ./run_spark.sh ./run.py --secretpath secret.txt --today --prod crab_taskdb.ipynb
+
+- To check the env vars that will be passed to the spark script
+ ./run.py --secretpath secret.txt --today --dryrun crab_taskdb.ipynb
+"""
+import argparse
+import os
+import subprocess
+import pathlib
+from pprint import pprint
+from datetime import datetime, timedelta, timezone
+
+def valid_date(s):
+ """
+ Check if the date format is correct and return the arg `s`.
+ The function serves as the `type` of an argument in argparse.
+
+ >>> valid_date('2024-01-01')
+ '2024-01-01'
+
+ :param s: date in format YYYY-mm-dd
+ :type s: str
+
+ :return: s argument
+ :rtype: str
+ """
+ try:
+ datetime.strptime(s, '%Y-%m-%d')
+ return s
+ except ValueError as e:
+ raise argparse.ArgumentTypeError(f"not a valid date: {s!r}") from e
+
+parser = argparse.ArgumentParser(description='Converting spark ipynb and run spark-submit')
+parser.add_argument('path', help='path of script (.ipynb)')
+parser.add_argument('--start', type=valid_date, dest='start_date', help='Start date of interest (YYYY-mm-dd)')
+parser.add_argument('--end', type=valid_date, dest='end_date', help='End date of interest (YYYY-mm-dd).')
+parser.add_argument('--ndaysago', type=int, default=-1, help='set start date to n-1 days ago, and end date to n days ago')
+parser.add_argument('--today', action='store_true', help='shortcut --ndaysago 0')
+parser.add_argument('--prod', action='store_true', help='set opensearch index prefix to prod "crab-prod-". Default is "crab-test-"')
+parser.add_argument('--secretpath', help='secret file path')
+parser.add_argument('--dryrun', action='store_true', help='print env that will pass to spark script')
+args = parser.parse_args()
+
+sparkjob_env = {}
+if args.today:
+ args.ndaysago = 0
+if args.ndaysago >= 0:
+ day = datetime.now().replace(tzinfo=timezone.utc)
+ ed = args.ndaysago
+ sd = args.ndaysago + 1 # start date is "yesterday" of n days ago
+ sparkjob_env['START_DATE'] = (day-timedelta(days=sd)).strftime("%Y-%m-%d")
+ sparkjob_env['END_DATE'] = (day-timedelta(days=ed)).strftime("%Y-%m-%d")
+if args.start_date and args.end_date:
+ sparkjob_env['START_DATE'] = args.start_date
+ sparkjob_env['END_DATE'] = args.end_date
+if 'START_DATE' not in sparkjob_env and 'END_DATE' not in sparkjob_env:
+ raise Exception("Need --today or --ndaysago or --start/--end.")
+if args.secretpath:
+ sparkjob_env['OPENSEARCH_SECRET_PATH'] = args.secretpath
+if args.prod:
+ sparkjob_env['PROD'] = 't'
+else:
+ sparkjob_env['PROD'] = 'f'
+
+runenv = os.environ.copy()
+runenv.update(sparkjob_env)
+
+# convert the notebook to a .py file
+path = pathlib.Path(args.path)
+pathpy = path.with_suffix('.py')
+cmd = f"jupyter nbconvert --to python {path}"
+print(f'Running: {cmd}')
+if not args.dryrun:
+ subprocess.run(cmd, shell=True, timeout=3600, check=True)
+
+# spark-submit
+cmd = f'spark-submit --master yarn --packages org.apache.spark:spark-avro_2.12:3.5.0 {pathpy}'
+print(f'Running: {cmd}')
+print('With env: ')
+pprint(sparkjob_env)
+if not args.dryrun:
+ subprocess.run(cmd, shell=True, timeout=3600, check=True, env=runenv)
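The --ndaysago handling above is the only non-obvious date arithmetic in run.py; a small stand-alone sketch of what it exports follows (the run date 2024-10-01 is purely illustrative).

# Mirrors run.py's --ndaysago date arithmetic (illustrative; pretend today is 2024-10-01).
from datetime import datetime, timedelta, timezone

day = datetime(2024, 10, 1, tzinfo=timezone.utc)  # stand-in for datetime.now()
ndaysago = 1
start = (day - timedelta(days=ndaysago + 1)).strftime("%Y-%m-%d")  # 2024-09-29
end = (day - timedelta(days=ndaysago)).strftime("%Y-%m-%d")        # 2024-09-30
# START_DATE is inclusive and END_DATE exclusive in the notebooks,
# so this run processes tasks whose TM_START_TIME falls on 2024-09-29.
print({"START_DATE": start, "END_DATE": end})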
diff --git a/src/script/Monitor/crab-spark/workdir/run_spark.sh b/src/script/Monitor/crab-spark/workdir/run_spark.sh
new file mode 100755
index 0000000000..3bd92a5bfc
--- /dev/null
+++ b/src/script/Monitor/crab-spark/workdir/run_spark.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+set -euo pipefail
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+pushd "$SCRIPT_DIR"
+
+# source the environment for spark submit
+set +euo pipefail
+source ./bootstrap.sh
+set -euo pipefail
+
+"$@"
+
+popd