diff --git a/.gitignore b/.gitignore
index 0643fa9cb2..211462e6ac 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,8 +10,8 @@ tmp/runtime/*
# python virtualenv
.venv/
# ci directory
-build/
-workdir/
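+# leading slash anchors these to the repo root, so nested dirs like crab-spark's workdir/ are no longer ignored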
+/build/
+/workdir/
# direnv
.envrc
# pylint config
diff --git a/cicd/monit_spark/Dockerfile b/cicd/monit_spark/Dockerfile
index 7372722116..fc4499cee7 100644
--- a/cicd/monit_spark/Dockerfile
+++ b/cicd/monit_spark/Dockerfile
@@ -1,4 +1,4 @@
-FROM registry.cern.ch/cmsmonitoring/cmsmon-spark:v0.5.0.1
+FROM registry.cern.ch/cmsmonitoring/cmsmon-spark:v0.5.0.1
## build from the dmwm/CRABServer root directory
# docker buildx build -t registry.cern.ch/cmscrab/crabspark:(date +%s) -f cicd/monit_spark/Dockerfile .
@@ -9,17 +9,9 @@ RUN yum install -y \
&& rm -rf /var/cache/yum
RUN mkdir -p /data/srv/spark/
-COPY ./src/script/Monitor/crab-spark/workdir/osearch.py \
- ./src/script/Monitor/crab-spark/workdir/bootstrap.sh \
- ./src/script/Monitor/crab-spark/cronjobs/crab_condor_daily.py \
- ./src/script/Monitor/crab-spark/cronjobs/crab_tape_recall_rules_history_daily.py \
- ./src/script/Monitor/crab-spark/cronjobs/run_spark.sh \
- ./src/script/Monitor/crab-spark/cronjobs/crab_data_daily.py \
- ./src/script/Monitor/crab-spark/cronjobs/crab_tape_recall_updated_rules_daily.py \
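+# copy the whole crab-spark tree (cronjobs, notebooks, workdir) instead of listing individual scripts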
+COPY ./src/script/Monitor/crab-spark \
/data/srv/spark
ENTRYPOINT ["tini", "--"]
CMD ["echo", "no default script for spark docker image"]
-
-
diff --git a/src/script/Monitor/crab-spark/cronjobs/crab_condor_daily.py b/src/script/Monitor/crab-spark/cronjobs/crab_condor_daily.py
deleted file mode 100644
index 66d998334a..0000000000
--- a/src/script/Monitor/crab-spark/cronjobs/crab_condor_daily.py
+++ /dev/null
@@ -1,250 +0,0 @@
-import os
-import sys
-
-os.environ['PYSPARK_PYTHON'] = sys.executable
-os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
-
-import time
-import numpy as np
-import pandas as pd
-
-from datetime import datetime, date, timedelta
-
-import osearch
-
-import argparse
-parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument("-s", "--start-date", default=None,
- help="process data starting from this day, inclusive (YYYY-MM-DD)",)
-parser.add_argument("-e", "--end-date", default=None,
- help="process data until this day, not included (YYYY-MM-DD)",)
-args = parser.parse_args()
-print(f"timerange: [{args.start_date} {args.end_date})" )
-
-
-from pyspark.sql.functions import (
- col,
- lit,
- when,
- sum as _sum,
- count as _count,
- first,
- date_format,
- from_unixtime
-)
-from pyspark.sql.types import (
- StructType,
- LongType,
- StringType,
- StructField,
- DoubleType,
- IntegerType,
-)
-
-from pyspark.sql import SparkSession
-
-spark = SparkSession\
- .builder\
- .appName("crab_tape_recall")\
- .getOrCreate()
-
-# CRAB table date
-
-# condor data and query date
-# if args.end_date:
-# end_date = datetime.strptime(args.end_date, '%Y-%m-%d')
-# else:
-# end_date = datetime.now()
-# end_date = end_date.replace(minute=0, hour=0, second=0, microsecond=0)
-#
-# if args.start_date:
-# start_date = datetime.strptime(args.start_date, '%Y-%m-%d')
-# else:
-# start_date = end_date - timedelta(days=1)
-
-if args.end_date:
- end_date = datetime.strptime(args.end_date, '%Y-%m-%d')
-else:
- end_date = datetime.now()
- end_date = end_date.replace(minute=0, hour=0, second=0, microsecond=0)
-
-if args.start_date:
- start_date = datetime.strptime(args.start_date, '%Y-%m-%d')
-else:
- start_date = end_date - timedelta(days=1)
-
-date_list = pd.date_range(
- start=start_date,
- end=end_date,
- ).to_pydatetime().tolist()
-
-# Import condor data
-
-def process_single_day(day):
-
- start_date = day
- end_date = day + timedelta(days=1)
- print(f"START PROCESSING: from {start_date} to {end_date}")
-
- _DEFAULT_HDFS_FOLDER = "/project/monitoring/archive/condor/raw/metric"
-
- def _get_schema():
- return StructType(
- [
- StructField(
- "data",
- StructType(
- [
- StructField("RecordTime", LongType(), nullable=False),
- StructField("CMSPrimaryDataTier", StringType(), nullable=True),
- StructField("Status", StringType(), nullable=True),
- StructField("WallClockHr", DoubleType(), nullable=True),
- StructField("CoreHr", DoubleType(), nullable=True),
- StructField("CpuTimeHr", DoubleType(), nullable=True),
- StructField("Type", StringType(), nullable=True),
- StructField("CRAB_DataBlock", StringType(), nullable=True),
- StructField("GlobalJobId", StringType(), nullable=False),
- StructField("ExitCode", LongType(), nullable=True),
- StructField("CRAB_Workflow", StringType(), nullable=True),
- StructField("CommittedCoreHr", StringType(), nullable=True),
- StructField("CommittedWallClockHr", StringType(), nullable=True),
- ]
- ),
- ),
- ]
- )
-
- def get_candidate_files(start_date, end_date, spark, base=_DEFAULT_HDFS_FOLDER):
- st_date = start_date - timedelta(days=0)
- ed_date = end_date + timedelta(days=0)
- days = (ed_date - st_date).days
-
- sc = spark.sparkContext
- FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
- URI = sc._gateway.jvm.java.net.URI
- Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
- fs = FileSystem.get(URI("hdfs:///"), sc._jsc.hadoopConfiguration())
- candidate_files = [
- f"{base}/{(st_date + timedelta(days=i)).strftime('%Y/%m/%d')}"
- for i in range(0, days)
- ]
- candidate_files = [url for url in candidate_files if fs.globStatus(Path(url))]
- print("No. of Compacted files:", len(candidate_files))
-
- pre_candidate_files = [
- f"{base}/{(st_date + timedelta(days=i)).strftime('%Y/%m/%d')}.tmp"
- for i in range(0, days)
- ]
- pre_candidate_files = [url for url in pre_candidate_files if fs.globStatus(Path(url))]
- print("No. of uncompacted files:", len(pre_candidate_files))
-
- return candidate_files + pre_candidate_files
-
-
- schema = _get_schema()
-
- condor_df = (
- spark.read.option("basePath", _DEFAULT_HDFS_FOLDER)
- .json(
- get_candidate_files(start_date, end_date, spark, base=_DEFAULT_HDFS_FOLDER),
- schema=schema,
- ).select("data.*")
- .filter(
- f"""Status IN ('Completed')
- AND Type IN ('analysis')
- AND RecordTime >= {start_date.timestamp() * 1000}
- AND RecordTime < {end_date.timestamp() * 1000}
- """
- )
- .drop_duplicates(["GlobalJobId"])
- # .cache()
- )
-
- # Convert file type by saving and recall it again (.json too complex for spark)
-
- crab_username = os.getenv("CRAB_KRB5_USERNAME", "cmscrab")
- condor_df.write.mode('overwrite').parquet(f"/cms/users/{crab_username}/condor_vir_data" ,compression='zstd')
- condor_df = spark.read.format('parquet').load(f"/cms/users/{crab_username}/condor_vir_data")
-
- # Import CRAB data
- wa_date = day.strftime("%Y-%m-%d")
- HDFS_CRAB_part = f'/project/awg/cms/crab/tasks/{wa_date}/'
- crab_df = spark.read.format('avro').load(HDFS_CRAB_part)
- crab_df = crab_df.select('TM_TASKNAME', 'TM_IGNORE_LOCALITY')
-
- print("==============================================="
- , "Condor Matrix and CRAB Table"
- , "==============================================="
- , "File Directory:", HDFS_CRAB_part, get_candidate_files(start_date, end_date, spark, base=_DEFAULT_HDFS_FOLDER)
- , "Work Directory:", os.getcwd()
- , "==============================================="
- , "===============================================", sep='\n')
-
- # Join condor job with CRAB data
-
- result_df = condor_df.join(crab_df, crab_df["TM_TASKNAME"] == condor_df["CRAB_Workflow"])\
- .select('RecordTime', 'CMSPrimaryDataTier', 'WallClockHr', 'CoreHr', 'CpuTimeHr', 'ExitCode'
- , "CRAB_DataBlock", "TM_IGNORE_LOCALITY", "GlobalJobId", "CommittedCoreHr", "CommittedWallClockHr")
-
- # Convert database to dictionary
-
- docs = result_df.toPandas()
- docs["CRAB_Type"] = docs.apply(lambda row: "PrivateMC" if row["CRAB_DataBlock"] == "MCFakeBlock" else "Analysis", axis=1)
- print(f"pandas dataframe size: {docs.memory_usage(deep=True).apply(lambda x: x / 1024 / 1024).sum()} MB")
-
-
- def get_index_schema():
- return {
- "settings": {"index": {"number_of_shards": "1", "number_of_replicas": "1"}},
- "mappings": {
- "properties": {
- "RecordTime": {"format": "epoch_millis", "type": "date"},
- "CMSPrimaryDataTier": {"ignore_above": 2048, "type": "keyword"},
- "GlobalJobId": {"ignore_above": 2048, "type": "keyword"},
- "WallClockHr": {"type": "long"},
- "CoreHr": {"type": "long"},
- "CpuTimeHr": {"type": "long"},
- "ExitCode": {"ignore_above": 2048, "type": "keyword"},
- "TM_IGNORE_LOCALITY": {"ignore_above": 2048, "type": "keyword"},
- "CRAB_Type": {"ignore_above": 2048, "type": "keyword"},
- "CRAB_DataBlock": {"ignore_above": 2048, "type": "keyword"},
- "CommittedCoreHr": {"type": "long"},
- "CommittedWallClockHr": {"type": "long"},
- }
- }
- }
-
- # Send data to Opensearch
-
- _index_template = 'crab-condor-taskdb'
- client = osearch.get_es_client("os-cms.cern.ch/es", '/data/certs/monit.d/monit_spark_crab.txt', get_index_schema())
- idx = client.get_or_create_index(timestamp=day.strftime("%s"), index_template=_index_template, index_mod="M")
- docs_rows = len(docs)
- sent = 0
- batch = 50000
- import gc
- while sent < docs_rows:
- gc.collect()
- start = sent
- end = start + batch if start + batch < docs_rows else docs_rows
- docs_tmp = docs.iloc[start:end]
- # the following line requires a lot of RAM, better do it 50_000
- # items at a time only. Keep in mind that the pandas datafram usually
- # contains about 1_000_000 rows
- docs_tmp = docs_tmp.to_dict('records')
- no_of_fail_saved = client.send(idx, docs_tmp, metadata=None, batch_size=10000, drop_nulls=False)
- sent = end
-
- print("=================================== Condor Matrix and CRAB Table =====================================",
- "FINISHED : ",
- f"start {start}, end {end}",
- len(docs_tmp), "ROWS ARE SENT",
- no_of_fail_saved, "ROWS ARE FAILED",
- "=================================== Condor Matrix and CRAB Table =====================================",
- sep='\n')
-
-
-for day in date_list:
- process_single_day(day)
-
-
diff --git a/src/script/Monitor/crab-spark/cronjobs/crab_data_daily.py b/src/script/Monitor/crab-spark/cronjobs/crab_data_daily.py
deleted file mode 100644
index 4fb3f4d45d..0000000000
--- a/src/script/Monitor/crab-spark/cronjobs/crab_data_daily.py
+++ /dev/null
@@ -1,154 +0,0 @@
-# import pickle
-from datetime import datetime, timedelta
-
-# import click
-import os
-import pandas as pd
-# import pprint
-import time
-# from dateutil.relativedelta import relativedelta
-
-import numpy as np
-import json
-import osearch
-
-import argparse
-
-parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument("-s", "--start-date", default=None,
- help="process data starting from this day, inclusive (YYYY-MM-DD)",)
-parser.add_argument("-e", "--end-date", default=None,
- help="process data until this day, not included (YYYY-MM-DD)",)
-args = parser.parse_args()
-print(f"timerange: [{args.start_date} {args.end_date})" )
-
-
-from pyspark import SparkContext, StorageLevel
-from pyspark.sql import SparkSession
-from pyspark.sql.functions import (
- col, collect_list, concat_ws, greatest, lit, lower, when,
- avg as _avg,
- count as _count,
- hex as _hex,
- max as _max,
- min as _min,
- round as _round,
- sum as _sum,
-)
-from pyspark.sql.types import (
- LongType,
-)
-from pyspark.sql import SparkSession
-spark = SparkSession\
- .builder\
- .appName("crab_tape_recall")\
- .getOrCreate()
-
-# Query date
-
-#TODAY = str(datetime.now())[:10]
-#YESTERDAY = str(datetime.now()-timedelta(days=1))[:10]
-#wa_date = TODAY
-
-if args.end_date:
- end_date = datetime.strptime(args.end_date, '%Y-%m-%d')
-else:
- end_date = datetime.now()
- end_date = end_date.replace(minute=0, hour=0, second=0, microsecond=0)
-
-if args.start_date:
- start_date = datetime.strptime(args.start_date, '%Y-%m-%d')
-else:
- start_date = end_date - timedelta(days=1)
-
-date_list = pd.date_range(
- start=start_date,
- end=end_date,
- ).to_pydatetime().tolist()
-
-# Import data into database form
-
-def process_single_day(day):
-
- wa_date = day.strftime("%Y-%m-%d")
- TODAY = wa_date
- YESTERDAY = (day-timedelta(days=1)).strftime("%Y-%m-%d")
-
- HDFS_CRAB_part = f'/project/awg/cms/crab/tasks/{wa_date}/'
- print("==============================================="
- , "CRAB Table"
- , "==============================================="
- , "File Directory:", HDFS_CRAB_part
- , "Work Directory:", os.getcwd()
- , "==============================================="
- , "===============================================", sep='\n')
-
- crab_part = spark.read.format('avro').load(HDFS_CRAB_part)
- df = crab_part.select("TM_TASKNAME","TM_START_TIME","TM_TASK_STATUS","TM_SPLIT_ALGO","TM_USERNAME","TM_USER_ROLE","TM_JOB_TYPE","TM_IGNORE_LOCALITY","TM_SCRIPTEXE","TM_USER_CONFIG")
- df.createOrReplaceTempView("crab_algo")
-
- # Query daily data
-
- query = f"""\
- SELECT *
- FROM crab_algo
- WHERE 1=1
- AND TM_START_TIME >= unix_timestamp("{YESTERDAY} 00:00:00", "yyyy-MM-dd HH:mm:ss")*1000
- AND TM_START_TIME < unix_timestamp("{TODAY} 00:00:00", "yyyy-MM-dd HH:mm:ss")*1000
- """
-
- tmpdf = spark.sql(query)
- tmpdf.show(10)
-
- # Convert database to dictionary
-
- docs = tmpdf.toPandas().to_dict('records')
-
- # Extract 'REQUIRE_ACCELERATOR' from 'TM_USER_CONFIG'
-
- for i in range(len(docs)):
- if docs[i]['TM_USER_CONFIG'] is not None:
- data = json.loads(docs[i]['TM_USER_CONFIG'])
- if "requireaccelerator" in data:
- docs[i]['REQUIRE_ACCELERATOR'] = data["requireaccelerator"]
- else:
- docs[i]['REQUIRE_ACCELERATOR'] = None
- else:
- docs[i]['REQUIRE_ACCELERATOR'] = None
-
- # Define type of each schema
-
- def get_index_schema():
- return {
- "settings": {"index": {"number_of_shards": "1", "number_of_replicas": "1"}},
- "mappings": {
- "properties": {
- "TM_TASKNAME": {"ignore_above": 2048, "type": "keyword"},
- "TM_START_TIME": {"format": "epoch_millis", "type": "date"},
- 'TM_TASK_STATUS': {"ignore_above": 2048, "type": "keyword"},
- "TM_SPLIT_ALGO": {"ignore_above": 2048, "type": "keyword"},
- "TM_USERNAME": {"ignore_above": 2048, "type": "keyword"},
- "TM_USER_ROLE": {"ignore_above": 2048, "type": "keyword"},
- "TM_JOB_TYPE": {"ignore_above": 2048, "type": "keyword"},
- "TM_IGNORE_LOCALITY": {"ignore_above": 2048, "type": "keyword"},
- "TM_SCRIPTEXE": {"ignore_above": 2048, "type": "keyword"},
- "REQUIRE_ACCELERATOR": {"ignore_above": 2048, "type": "keyword"},
- }
- }
- }
-
- # Send data to Opensearch
-
- _index_template = 'crab-taskdb'
- client = osearch.get_es_client("os-cms.cern.ch/es", '/data/certs/monit.d/monit_spark_crab.txt', get_index_schema())
- idx = client.get_or_create_index(timestamp=day.strftime("%s"), index_template=_index_template, index_mod="M")
- no_of_fail_saved = client.send(idx, docs, metadata=None, batch_size=10000, drop_nulls=False)
-
- print("================================= CRAB Table ======================================="
- , "FINISHED : ", len(docs), "ROWS ARE SENT", no_of_fail_saved, "ROWS ARE FAILED"
- , "================================= CRAB Table =======================================", sep='\n')
-
-
-for day in date_list:
- process_single_day(day)
-
diff --git a/src/script/Monitor/crab-spark/cronjobs/crab_tape_recall_rules_history_daily.py b/src/script/Monitor/crab-spark/cronjobs/crab_tape_recall_rules_history_daily.py
deleted file mode 100644
index 5fa5ddcac5..0000000000
--- a/src/script/Monitor/crab-spark/cronjobs/crab_tape_recall_rules_history_daily.py
+++ /dev/null
@@ -1,164 +0,0 @@
-# import pickle
-from datetime import datetime, timedelta
-
-# import click
-import os
-import pandas as pd
-# import pprint
-import time
-# from dateutil.relativedelta import relativedelta
-
-import argparse
-parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument("-s", "--start-date", default=None,
- help="process data starting from this day, inclusive (YYYY-MM-DD)",)
-parser.add_argument("-e", "--end-date", default=None,
- help="process data until this day, not included (YYYY-MM-DD)",)
-args = parser.parse_args()
-print(f"timerange: [{args.start_date} {args.end_date})" )
-
-
-from pyspark import SparkContext, StorageLevel
-from pyspark.sql import SparkSession
-from pyspark.sql.functions import (
- col, collect_list, concat_ws, greatest, lit, lower, when,
- avg as _avg,
- count as _count,
- hex as _hex,
- max as _max,
- min as _min,
- round as _round,
- sum as _sum,
-)
-
-from pyspark.sql.types import (
- LongType,
-)
-
-import numpy as np
-# import math
-import osearch
-from pyspark.sql import SparkSession
-
-spark = SparkSession\
- .builder\
- .appName("crab_tape_recall")\
- .getOrCreate()
-
-# Data date
-
-if args.end_date:
- end_date = datetime.strptime(args.end_date, '%Y-%m-%d')
-else:
- end_date = datetime.now()
- end_date = end_date.replace(minute=0, hour=0, second=0, microsecond=0)
-
-if args.start_date:
- start_date = datetime.strptime(args.start_date, '%Y-%m-%d')
-else:
- start_date = end_date - timedelta(days=1)
-
-date_list = pd.date_range(
- start=start_date,
- end=end_date,
- ).to_pydatetime().tolist()
-
-def process_single_day(day):
-
- # Query date
-
- wa_date = day.strftime("%Y-%m-%d")
- TODAY = wa_date
- YESTERDAY = (day-timedelta(days=1)).strftime("%Y-%m-$d")
- TOYEAR = day.strftime("%Y")
-
- # Import data into database form
-
- HDFS_RUCIO_RULES_HISTORY = f'/project/awg/cms/rucio/{wa_date}/rules_history/'
-
- print("==============================================="
- , "RUCIO : Rules History"
- , "==============================================="
- , "File Directory:", HDFS_RUCIO_RULES_HISTORY
- , "Work Directory:", os.getcwd()
- , "==============================================="
- , "===============================================", sep='\n')
-
- rucio_rules_history = spark.read.format('avro').load(HDFS_RUCIO_RULES_HISTORY).withColumn('ID', lower(_hex(col('ID'))))
-
- # Query data in daily
-
- rucio_rules_history = rucio_rules_history.select("ID", "NAME", "STATE", "EXPIRES_AT", "UPDATED_AT", "CREATED_AT", "ACCOUNT").filter(f"""ACCOUNT IN ('crab_tape_recall')""").cache()
- rucio_rules_history.createOrReplaceTempView("rules_history")
-
- query = query = f"""\
- WITH filter_t AS (
- SELECT ID, NAME, STATE, EXPIRES_AT, UPDATED_AT, CREATED_AT
- FROM rules_history
- WHERE 1=1
- AND CREATED_AT >= unix_timestamp("{TOYEAR}-01-01 00:00:00", "yyyy-MM-dd HH:mm:ss")*1000
- ),
- rn_t AS (
- SELECT ID, NAME, STATE, EXPIRES_AT, UPDATED_AT, CREATED_AT,
- row_number() over(partition by ID order by UPDATED_AT desc) as rn
- FROM filter_t
- ),
- calc_days_t AS (
- SELECT ID, NAME, STATE, EXPIRES_AT, UPDATED_AT, CREATED_AT,
- CASE
- WHEN STATE = 'O' THEN ceil((UPDATED_AT-CREATED_AT)/86400000)
- WHEN STATE != 'O' AND EXPIRES_AT < unix_timestamp("{wa_date} 00:00:00", "yyyy-MM-dd HH:mm:ss")*1000 THEN ceil((EXPIRES_AT-CREATED_AT)/86400000)
- ELSE 0
- END AS DAYS
- FROM rn_t
- WHERE rn = 1
- )
- SELECT *
- FROM calc_days_t
- WHERE 1=1
- AND EXPIRES_AT >= unix_timestamp("{YESTERDAY} 00:00:00", "yyyy-MM-dd HH:mm:ss")*1000
- AND EXPIRES_AT < unix_timestamp("{TODAY} 00:00:00", "yyyy-MM-dd HH:mm:ss")*1000
- """
-
- tmpdf = spark.sql(query)
- tmpdf.show()
-
- # Convert database to dictionary
-
- docs = tmpdf.toPandas().to_dict('records')
-
- # Define type of each schema
-
- def get_index_schema():
- return {
- "settings": {"index": {"number_of_shards": "1", "number_of_replicas": "1"}},
- "mappings": {
- "properties": {
- "timestamp": {"format": "epoch_second", "type": "date"},
- "ID": {"ignore_above": 1024, "type": "keyword"},
- "NAME": {"ignore_above": 2048, "type": "keyword"},
- "STATE": {"ignore_above": 1024, "type": "keyword"},
- "EXPIRES_AT": {"format": "epoch_millis", "type": "date"},
- "UPDATED_AT": {"format": "epoch_millis", "type": "date"},
- "CREATED_AT": {"format": "epoch_millis", "type": "date"},
- "DAYS": {"type": "long"},
- }
- }
- }
-
- # Send data to Opensearch
-
- _index_template = 'crab-tape-recall-daily'
- client = osearch.get_es_client("os-cms.cern.ch/es", '/data/certs/monit.d/monit_spark_crab.txt', get_index_schema())
- idx = client.get_or_create_index(timestamp=day.strftime("%s"), index_template=_index_template, index_mod="M")
- no_of_fail_saved = client.send(idx, docs, metadata=None, batch_size=10000, drop_nulls=False)
-
- print("=================================== RUCIO : Rules History ====================================="
- , "FINISHED : "
- , len(docs), "ROWS ARE SENT"
- , no_of_fail_saved, "ROWS ARE FAILED"
- , "=================================== RUCIO : Rules History =====================================", sep='\n')
-
-
-for day in date_list:
- process_single_day(day)
diff --git a/src/script/Monitor/crab-spark/cronjobs/crab_tape_recall_updated_rules_daily.py b/src/script/Monitor/crab-spark/cronjobs/crab_tape_recall_updated_rules_daily.py
deleted file mode 100644
index 79c4bc019e..0000000000
--- a/src/script/Monitor/crab-spark/cronjobs/crab_tape_recall_updated_rules_daily.py
+++ /dev/null
@@ -1,163 +0,0 @@
-
-from datetime import datetime, timedelta
-import os
-import pandas as pd
-import time
-
-import argparse
-
-parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument("-s", "--start-date", default=None,
- help="process data starting from this day, inclusive (YYYY-MM-DD)",)
-parser.add_argument("-e", "--end-date", default=None,
- help="process data until this day, not included (YYYY-MM-DD)",)
-args = parser.parse_args()
-print(f"timerange: [{args.start_date} {args.end_date})" )
-
-from pyspark import SparkContext, StorageLevel
-from pyspark.sql import SparkSession
-from pyspark.sql.functions import (
- col, collect_list, concat_ws, greatest, lit, lower, when,
- avg as _avg,
- count as _count,
- hex as _hex,
- max as _max,
- min as _min,
- round as _round,
- sum as _sum,
-)
-from pyspark.sql.types import (
- LongType,
-)
-import numpy as np
-import osearch
-from pyspark.sql import SparkSession
-
-spark = SparkSession\
- .builder\
- .appName("crab_tape_recall")\
- .getOrCreate()
-
-# Data date
-
-if args.end_date:
- end_date = datetime.strptime(args.end_date, '%Y-%m-%d')
-else:
- end_date = datetime.now()
- end_date = end_date.replace(minute=0, hour=0, second=0, microsecond=0)
-
-if args.start_date:
- start_date = datetime.strptime(args.start_date, '%Y-%m-%d')
-else:
- start_date = end_date - timedelta(days=1)
-
-date_list = pd.date_range(
- start=start_date,
- end=end_date,
- ).to_pydatetime().tolist()
-
-def process_single_day(day):
-
- wa_date = day.strftime("%Y-%m-%d")
-
- # Import data into database form
-
- HDFS_RUCIO_DATASET_LOCKS = f'/project/awg/cms/rucio/{wa_date}/dataset_locks/part*.avro'
- HDFS_RUCIO_RSES = f'/project/awg/cms/rucio/{wa_date}/rses/part*.avro'
- HDFS_RUCIO_RULES = f'/project/awg/cms/rucio/{wa_date}/rules'
- print("===============================================", "File Directory:", HDFS_RUCIO_DATASET_LOCKS, "Work Directory:", os.getcwd(), "===============================================", sep='\n')
-
- print("==============================================="
- , "RUCIO : Rules, RSEs, Dataset"
- , "==============================================="
- , "File Directory:", HDFS_RUCIO_DATASET_LOCKS
- , "Work Directory:", os.getcwd()
- , "==============================================="
- , "===============================================", sep='\n')
- rucio_dataset_locks = spark.read.format('avro').load(HDFS_RUCIO_DATASET_LOCKS)\
- .withColumn('BYTES', col('BYTES').cast(LongType()))\
- .withColumn('RULE_ID', lower(_hex(col('RULE_ID'))))\
- .withColumn('RSE_ID', lower(_hex(col('RSE_ID'))))
- rucio_dataset_locks.createOrReplaceTempView("dataset_locks")
-
- rucio_rses = spark.read.format('avro').load(HDFS_RUCIO_RSES)\
- .withColumn('ID', lower(_hex(col('ID'))))
- rucio_rses.createOrReplaceTempView("rses")
-
- rucio_rules = spark.read.format('avro').load(HDFS_RUCIO_RULES)\
- .withColumn('ID', lower(_hex(col('ID'))))
- rucio_rules.createOrReplaceTempView("rules")
-
- # filter and query
-
- rucio_dataset_locks = rucio_dataset_locks.filter(f"""ACCOUNT IN ('crab_tape_recall')""").cache()
- rucio_rses = rucio_rses.select('ID', 'RSE', 'RSE_TYPE').cache()
- rucio_rules = rucio_rules.select('ID', 'ACCOUNT', 'DID_TYPE', 'EXPIRES_AT').cache()
-
- result_df = rucio_dataset_locks.join(rucio_rses, rucio_rses["ID"] == rucio_dataset_locks["RSE_ID"])\
- .join(rucio_rules, rucio_rules["ID"] == rucio_dataset_locks["RULE_ID"]).drop('ID', 'RULE_ID', 'RSE_ID', 'ACCESSED_AT', 'ACCOUNT')
-
- # Convert database to dictionary
-
- docs = result_df.toPandas().to_dict('records')
-
- # Add TIMESTAMP column and convert TiB
- TIME = datetime.strptime(f"""{wa_date} 00:00:00""", "%Y-%m-%d %H:%M:%S").timestamp()*1000
- for i in range(len(docs)):
- docs[i]['TIMESTAMP'] = TIME
- docs[i]['SIZE_TiB'] = docs[i]["BYTES"]/1099511627776
- del docs[i]["BYTES"]
-
- # break down the name
- NAME_i = docs[i]['NAME']
- split_NAME = NAME_i.split('#')[0]
- docs[i]['NAME_'] = NAME_i.split('#')[0]
- split_NAME = docs[i]['NAME_'].split('/')
- if len(split_NAME) != 4:
- print("YO HOO !!, something wrong.", NAME_i)
- docs[i]['PriDataset'] = split_NAME[1]
- docs[i]['DataTier'] = split_NAME[-1]
-
- # Define type of each schema
-
- def get_index_schema():
- return {
- "settings": {"index": {"number_of_shards": "1", "number_of_replicas": "1"}},
- "mappings": {
- "properties": {
- 'SCOPE': {"ignore_above": 2048, "type": "keyword"},
- 'NAME': {"ignore_above": 2048, "type": "keyword"},
- 'STATE': {"ignore_above": 1024, "type": "keyword"},
- 'LENGTH': {"ignore_above": 1024, "type": "keyword"},
- 'SIZE_TiB': {"type": "long"},
- 'UPDATED_AT': {"format": "epoch_millis", "type": "date"},
- 'CREATED_AT': {"format": "epoch_millis", "type": "date"},
- 'RSE': {"ignore_above": 2048, "type": "keyword"},
- 'RSE_TYPE': {"ignore_above": 2048, "type": "keyword"},
- 'DID_TYPE': {"ignore_above": 1024, "type": "keyword"},
- 'EXPIRES_AT': {"format": "epoch_millis", "type": "date"},
- 'TIMESTAMP': {"format": "epoch_millis", "type": "date"},
- 'NAME_': {"ignore_above": 2048, "type": "keyword"},
- 'PriDataset': {"ignore_above": 2048, "type": "keyword"},
- 'DataTier': {"ignore_above": 2048, "type": "keyword"},
- }
- }
- }
-
- # Send data to Opensearch
-
- _index_template = 'crab-tape-recall-rules'
- client = osearch.get_es_client("os-cms.cern.ch/es", '/data/certs/monit.d/monit_spark_crab.txt', get_index_schema())
- idx = client.get_or_create_index(timestamp=day.strftime("%s"), index_template=_index_template, index_mod="M")
- no_of_fail_saved = client.send(idx, docs, metadata=None, batch_size=10000, drop_nulls=False)
-
- print("==================================== RUCIO : Rules, RSEs, Dataset ===================================="
- , "FINISHED : "
- , len(docs), "ROWS ARE SENT"
- , no_of_fail_saved, "ROWS ARE FAILED"
- , "==================================== RUCIO : Rules, RSEs, Dataset ====================================", sep='\n')
-
-
-for day in date_list:
- process_single_day(day)
-
diff --git a/src/script/Monitor/crab-spark/cronjobs/cron_daily.sh b/src/script/Monitor/crab-spark/cronjobs/cron_daily.sh
deleted file mode 100644
index 7665fa59b8..0000000000
--- a/src/script/Monitor/crab-spark/cronjobs/cron_daily.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/bash
-
-TAG=latest
-if [[ -n $1 ]]; then
- TAG=$1
-fi
-
-docker run --rm --net=host -v /cvmfs:/cvmfs:shared \
- -v /data/certs/monit.d/monit_spark_crab.txt:/data/certs/monit.d/monit_spark_crab.txt \
- -v /data/certs/keytabs.d/cmscrab.keytab:/data/certs/keytabs.d/cmscrab.keytab \
- registry.cern.ch/cmscrab/crabspark:${TAG} \
- bash /data/srv/spark/run_spark.sh /data/srv/spark/crab_data_daily.py \
-
-docker run --rm --net=host -v /cvmfs:/cvmfs:shared \
- -v /data/certs/monit.d/monit_spark_crab.txt:/data/certs/monit.d/monit_spark_crab.txt \
- -v /data/certs/keytabs.d/cmscrab.keytab:/data/certs/keytabs.d/cmscrab.keytab \
- registry.cern.ch/cmscrab/crabspark:${TAG} \
- bash /data/srv/spark/run_spark.sh /data/srv/spark/crab_condor_daily.py
-
- docker run --rm --net=host -v /cvmfs:/cvmfs:shared \
- -v /data/certs/monit.d/monit_spark_crab.txt:/data/certs/monit.d/monit_spark_crab.txt \
- -v /data/certs/keytabs.d/cmscrab.keytab:/data/certs/keytabs.d/cmscrab.keytab \
- registry.cern.ch/cmscrab/crabspark:${TAG} \
- bash /data/srv/spark/run_spark.sh /data/srv/spark/crab_tape_recall_rules_history_daily.py
-
-docker run --rm --net=host -v /cvmfs:/cvmfs:shared \
- -v /data/certs/monit.d/monit_spark_crab.txt:/data/certs/monit.d/monit_spark_crab.txt \
- -v /data/certs/keytabs.d/cmscrab.keytab:/data/certs/keytabs.d/cmscrab.keytab \
- registry.cern.ch/cmscrab/crabspark:${TAG} \
- bash /data/srv/spark/run_spark.sh /data/srv/spark/crab_tape_recall_updated_rules_daily.py
-
diff --git a/src/script/Monitor/crab-spark/cronjobs/run_spark.sh b/src/script/Monitor/crab-spark/cronjobs/run_spark.sh
index fb5b13d3ab..5412f4119b 100644
--- a/src/script/Monitor/crab-spark/cronjobs/run_spark.sh
+++ b/src/script/Monitor/crab-spark/cronjobs/run_spark.sh
@@ -1,10 +1,15 @@
#!/bin/bash
+set -euo pipefail
+
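+# resolve the directory this script lives in, so it can be invoked from any working directory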
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# work directory
-cd /data/srv/spark
+pushd "${SCRIPT_DIR}"
# source the environment for spark submit
-source ./bootstrap.sh
+source ../workdir/bootstrap.sh
# submit $1 to spark, where $1 is expected to be a data-pulling script (.py)
spark-submit --master yarn --packages org.apache.spark:spark-avro_2.12:3.5.0 $@
+
+popd
diff --git a/src/script/Monitor/crab-spark/notebooks/crab_condor.ipynb b/src/script/Monitor/crab-spark/notebooks/crab_condor.ipynb
index 82f3f81eb2..65855e1112 100644
--- a/src/script/Monitor/crab-spark/notebooks/crab_condor.ipynb
+++ b/src/script/Monitor/crab-spark/notebooks/crab_condor.ipynb
@@ -1,85 +1,43 @@
{
"cells": [
{
- "cell_type": "code",
- "execution_count": 1,
- "id": "cf212bba",
+ "cell_type": "markdown",
+ "id": "aed9b54a",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "
\n",
- "
SparkSession - in-memory
\n",
- " \n",
- "
\n",
- "
SparkContext
\n",
- "\n",
- "
Spark UI
\n",
- "\n",
- "
\n",
- " - Version
\n",
- " v3.3.2
\n",
- " - Master
\n",
- " yarn
\n",
- " - AppName
\n",
- " pyspark_shell_swan
\n",
- "
\n",
- "
\n",
- " \n",
- "
\n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 1,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
- "spark"
+ "# CRAB Spark condor job\n",
+ "\n",
+ "This join info between the condor job metrics and crab taskdb, to answer these questions:\n",
+ "- How many jobs use ignorelocality?\n",
+ "- What is wall clock time spent by each CMS data tier and each job type?\n",
+ "- What is the success rate of the Analysis job type?\n"
]
},
{
"cell_type": "code",
- "execution_count": 2,
- "id": "77d4d561",
+ "execution_count": null,
+ "id": "5e9af689",
"metadata": {},
"outputs": [],
"source": [
+ "from datetime import datetime, timedelta, timezone\n",
"import os\n",
- "import sys\n",
- "\n",
- "os.environ['PYSPARK_PYTHON'] = sys.executable\n",
- "os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable\n",
- "\n",
"import time\n",
- "# from utils import (\n",
- "# _to_dict,\n",
- "# _donut,\n",
- "# _pie,\n",
- "# _line_graph,\n",
- "# _other_fields,\n",
- "# _exitcode_info,\n",
- "# _better_label\n",
- "# )\n",
- "from datetime import datetime, date, timedelta\n",
+ "import pandas as pd\n",
+ "\n",
+ "from pyspark import SparkContext, StorageLevel\n",
+ "from pyspark.sql import SparkSession\n",
"from pyspark.sql.functions import (\n",
- " col,\n",
- " lit,\n",
- " when,\n",
- " sum as _sum,\n",
+ " current_user,\n",
+ " col, collect_list, concat_ws, greatest, lit, lower, when,\n",
+ " avg as _avg,\n",
" count as _count,\n",
- " first,\n",
- " date_format,\n",
- " from_unixtime\n",
+ " hex as _hex,\n",
+ " max as _max,\n",
+ " min as _min,\n",
+ " round as _round,\n",
+ " sum as _sum,\n",
")\n",
- "import numpy as np\n",
- "import pandas as pd\n",
"from pyspark.sql.types import (\n",
" StructType,\n",
" LongType,\n",
@@ -87,615 +45,407 @@
" StructField,\n",
" DoubleType,\n",
" IntegerType,\n",
- ")\n",
- "# spark.conf.set(\"spark.sql.execution.arrow.pyspark.enabled\", \"true\")\n"
+ ")"
]
},
{
- "cell_type": "markdown",
- "id": "6b14b465",
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "51b2f1c7",
"metadata": {},
+ "outputs": [],
"source": [
- "### Prepare condor file name/configuration"
+ "# try to import libs from current directory, fallback to $PWD/../workdir if not found\n",
+ "try:\n",
+ " from crabspark_utils import get_candidate_files, send_os, send_os_parallel\n",
+ "except ModuleNotFoundError:\n",
+ " import sys\n",
+ " sys.path.insert(0, f'{os.getcwd()}/../workdir')\n",
+ " from crabspark_utils import get_candidate_files, send_os, send_os_parallel\n"
]
},
{
"cell_type": "code",
- "execution_count": 3,
- "id": "65a21e3a",
+ "execution_count": null,
+ "id": "22946659",
"metadata": {},
"outputs": [],
"source": [
- "def _get_schema():\n",
- " return StructType(\n",
- " [\n",
- " StructField(\n",
- " \"data\",\n",
- " StructType(\n",
- " [\n",
- " StructField(\"RecordTime\", LongType(), nullable=False),\n",
- " StructField(\"CMSPrimaryDataTier\", StringType(), nullable=True),\n",
- " StructField(\"Status\", StringType(), nullable=True),\n",
- " StructField(\"WallClockHr\", DoubleType(), nullable=True),\n",
- " StructField(\"CoreHr\", DoubleType(), nullable=True),\n",
- " StructField(\"CpuTimeHr\", DoubleType(), nullable=True),\n",
- " StructField(\"Type\", StringType(), nullable=True),\n",
- " StructField(\"CRAB_DataBlock\", StringType(), nullable=True),\n",
- " StructField(\"GlobalJobId\", StringType(), nullable=False),\n",
- " StructField(\"ExitCode\", LongType(), nullable=True),\n",
- " StructField(\"CRAB_Workflow\", StringType(), nullable=True),\n",
- " StructField(\"CommittedCoreHr\", StringType(), nullable=True),\n",
- " StructField(\"CommittedWallClockHr\", StringType(), nullable=True),\n",
- " ]\n",
- " ),\n",
- " ),\n",
- " ]\n",
- " )"
+ "spark = SparkSession\\\n",
+ " .builder\\\n",
+ " .appName('condor-job')\\\n",
+ " .getOrCreate()\n",
+ "spark"
]
},
{
"cell_type": "code",
- "execution_count": 4,
- "id": "5344e275",
+ "execution_count": null,
+ "id": "d37c4539",
"metadata": {},
"outputs": [],
"source": [
- "_DEFAULT_HDFS_FOLDER = \"/project/monitoring/archive/condor/raw/metric\""
+ "# clear any cache left, for working with notebook\n",
+ "# it safe to run everytime cronjob start\n",
+ "spark.catalog.clearCache()"
]
},
{
"cell_type": "code",
- "execution_count": 5,
- "id": "c20d8d62",
+ "execution_count": null,
+ "id": "31c19eb0",
"metadata": {},
"outputs": [],
"source": [
- "# # Check available files \n",
- "# !hdfs dfs -ls /project/monitoring/archive/condor/raw/metric/2023/07/08"
+ "# secret path, also check if file exists\n",
+ "secretpath = os.environ.get('OPENSEARCH_SECRET_PATH', f'{os.getcwd()}/../workdir/secret_opensearch.txt')\n",
+ "if not os.path.isfile(secretpath): \n",
+ " raise Exception(f'OS secrets file {secretpath} does not exists')\n",
+ "# if PROD, index prefix will be `crab-*`, otherwise `crab-test-*`\n",
+ "PROD = os.environ.get('PROD', 'false').lower() in ('true', '1', 't')\n",
+ "# FROM_DATE, in strptime(\"%Y-%m-%d\")\n",
+ "START = os.environ.get('START_DATE', None) \n",
+ "END = os.environ.get('END_DATE', None)"
]
},
{
"cell_type": "code",
- "execution_count": 6,
- "id": "8d821f8f",
+ "execution_count": null,
+ "id": "e843eb6d",
"metadata": {},
"outputs": [],
"source": [
- "def get_candidate_files(start_date, end_date, spark, base=_DEFAULT_HDFS_FOLDER):\n",
- " st_date = start_date - timedelta(days=0)\n",
- " ed_date = end_date + timedelta(days=0)\n",
- " days = (ed_date - st_date).days\n",
- " pre_candidate_files = [\n",
- " \"{base}/{day}{{,.tmp}}\".format(\n",
- " base=base, day=(st_date + timedelta(days=i)).strftime(\"%Y/%m/%d\")\n",
- " )\n",
- " for i in range(0, days)\n",
- " ]\n",
- " sc = spark.sparkContext\n",
- " \n",
- " candidate_files = [\n",
- " f\"{base}/{(st_date + timedelta(days=i)).strftime('%Y/%m/%d')}\"\n",
- " for i in range(0, days)\n",
- " ]\n",
- " FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem\n",
- " URI = sc._gateway.jvm.java.net.URI\n",
- " Path = sc._gateway.jvm.org.apache.hadoop.fs.Path\n",
- " fs = FileSystem.get(URI(\"hdfs:///\"), sc._jsc.hadoopConfiguration())\n",
- " # FIXME\n",
- " candidate_files = [url for url in candidate_files if fs.globStatus(Path(url))]\n",
- " print(\"No. of Consisted files:\", len(candidate_files))\n",
- " return candidate_files\n",
- "\n",
- "# all_candidate_files = []\n",
- "# candidate_files = [\n",
- "# f\"{base}/{(st_date + timedelta(days=i)).strftime('%Y/%m/%d')}\"\n",
- "# for i in range(0, days)\n",
- "# ]\n",
- " \n",
- "# URI = sc._gateway.jvm.java.net.URI\n",
- "# Path = sc._gateway.jvm.org.apache.hadoop.fs.Path\n",
- "# FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem\n",
- "# Configuration = sc._gateway.jvm.org.apache.hadoop.conf.Configuration\n",
- "# fs = FileSystem.get(URI(\"hdfs:///\"), Configuration())\n",
- "\n",
- "# for fileNames in candidate_files:\n",
- "# status = fs.listStatus(Path(fileNames))\n",
- "# candidate_files_day_i = [\n",
- "# str(fileStatus.getPath()).replace('hdfs://analytix', '')\n",
- "# for fileStatus in status\n",
- "# ]\n",
- "# all_candidate_files.extend(candidate_files_day_i)\n",
- "# print(\"Files Directory:\", candidate_files, \"\\nNo. of Consisted files:\", len(all_candidate_files))\n",
- "# return all_candidate_files\n",
- "\n",
- "def group_files(files, n=16):\n",
- " # Yield successive n-sized\n",
- " # chunks from files\n",
- " all_group = []\n",
- " for i in range(0, len(files), n):\n",
- " all_group.append(files[i:i+n])\n",
- " print(\"There are\", len(all_group), \"chunks of files\")\n",
- " return all_group"
+ "# For run playbook manually, set start/end date here\n",
+ "START_DATE = \"2024-10-01\"\n",
+ "END_DATE = \"2024-10-02\"\n",
+ "# if cronjob, replace constant with value from env\n",
+ "if START and END:\n",
+ " START_DATE = START\n",
+ " END_DATE = END"
]
},
{
- "cell_type": "markdown",
- "id": "9a57477b",
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "430146eb",
"metadata": {},
+ "outputs": [],
"source": [
- "## load dataset"
+ "# index name\n",
+ "index_name = 'condor-taskdb'\n",
+ "# use prod index pattern if this execution is for production\n",
+ "if PROD:\n",
+ " index_name = f'crab-prod-{index_name}'\n",
+ "else:\n",
+ " index_name = f'crab-test-{index_name}'"
]
},
{
"cell_type": "code",
- "execution_count": 7,
- "id": "28bcc686",
+ "execution_count": null,
+ "id": "2a3b6697",
"metadata": {},
"outputs": [],
"source": [
- "schema = _get_schema()\n",
- "start_date = datetime(2023, 8, 10)\n",
- "end_date = datetime(2023, 8, 11)"
+ "# datetime object\n",
+ "start_datetime = datetime.strptime(START_DATE, \"%Y-%m-%d\").replace(tzinfo=timezone.utc)\n",
+ "end_datetime = datetime.strptime(END_DATE, \"%Y-%m-%d\").replace(tzinfo=timezone.utc)\n",
+ "# sanity check\n",
+ "if end_datetime < start_datetime: \n",
+ " raise Exception(f\"end date ({END_DATE}) is less than start date ({START_DATE})\")\n",
+ "start_epochmilis = int(start_datetime.timestamp()) * 1000\n",
+ "end_epochmilis = int(end_datetime.timestamp()) * 1000\n",
+ "yesterday_epoch = int((end_datetime-timedelta(days=1)).timestamp())"
]
},
{
"cell_type": "code",
- "execution_count": 8,
- "id": "bec66775",
+ "execution_count": null,
+ "id": "9404c437",
"metadata": {
"scrolled": true
},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "No. of Consisted files: 1\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "['/project/monitoring/archive/condor/raw/metric/2023/08/10']"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "candidate_files = get_candidate_files(start_date, end_date, spark, base=_DEFAULT_HDFS_FOLDER)\n",
- "candidate_files"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "894bdcf0",
- "metadata": {},
+ "outputs": [],
"source": [
- "### Prepare CRAB data file name"
+ "# debug\n",
+ "print(START_DATE, \n",
+ " END_DATE, \n",
+ " index_name,\n",
+ " sep='\\n')"
]
},
{
"cell_type": "code",
- "execution_count": 9,
- "id": "b4120002",
+ "execution_count": null,
+ "id": "9d4bb4d0",
"metadata": {},
"outputs": [],
"source": [
- "TODAY = str(end_date)[:10]\n",
- "wa_date = TODAY\n",
- "HDFS_CRAB_part = f'/project/awg/cms/crab/tasks/{wa_date}/'"
+ "# read crab data\n",
+ "HDFS_CRAB_part = f'/project/awg/cms/crab/tasks/{END_DATE}/' \n",
+ "crab_df = spark.read.format('avro').load(HDFS_CRAB_part)\n",
+ "# we did not filter the task here because most jobs was created from older tasks.\n",
+ "# if there are too many crab tasks, it might be safe to filter out the tasks older than 30+7 days ago.\n",
+ "crab_df = crab_df.select('TM_TASKNAME', 'TM_IGNORE_LOCALITY').cache()\n",
+ "crab_df.createOrReplaceTempView(\"tasks\")"
]
},
{
- "cell_type": "markdown",
- "id": "de4d8e96",
- "metadata": {},
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f15887f4",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
"source": [
- "### Get raw data from condor raw"
+ "# read condor data\n",
+ "# reading file 2 days before start date and 1 days after end date inclusive\n",
+ "# sometime flume (condor log aggregator) process the metrics is delay for 2 days, sometime it has timestamp from the future.\n",
+ "# so we do this to make sure we get all metrics from the date we want. (all of these suggested by CMSMONIT)\n",
+ "# Note that we read all files, compact or not, even it has the same content, we will dedup it in the next step.\n",
+ "_DEFAULT_HDFS_FOLDER = \"/project/monitoring/archive/condor/raw/metric\"\n",
+ "candidate_files = get_candidate_files(start_datetime, end_datetime, spark=spark, base=_DEFAULT_HDFS_FOLDER, day_delta=2)\n",
+ "\n",
+ "# this is map json doc to spark schema\n",
+ "read_schema = StructType(\n",
+ " [\n",
+ " StructField(\n",
+ " \"data\",\n",
+ " StructType(\n",
+ " [\n",
+ " StructField(\"RecordTime\", LongType(), nullable=False),\n",
+ " StructField(\"CMSPrimaryDataTier\", StringType(), nullable=True),\n",
+ " StructField(\"Status\", StringType(), nullable=True),\n",
+ " StructField(\"WallClockHr\", DoubleType(), nullable=True),\n",
+ " StructField(\"CoreHr\", DoubleType(), nullable=True),\n",
+ " StructField(\"CpuTimeHr\", DoubleType(), nullable=True),\n",
+ " StructField(\"Type\", StringType(), nullable=True),\n",
+ " StructField(\"CRAB_DataBlock\", StringType(), nullable=True),\n",
+ " StructField(\"GlobalJobId\", StringType(), nullable=False),\n",
+ " StructField(\"ExitCode\", LongType(), nullable=True),\n",
+ " StructField(\"CRAB_Workflow\", StringType(), nullable=True),\n",
+ " StructField(\"CommittedCoreHr\", StringType(), nullable=True),\n",
+ " StructField(\"CommittedWallClockHr\", StringType(), nullable=True),\n",
+ " ]\n",
+ " ),\n",
+ " ),\n",
+ " ]\n",
+ " )\n",
+ "print(\"===============================================\"\n",
+ " , \"Condor Matrix and CRAB Table\"\n",
+ " , \"===============================================\"\n",
+ " , \"File Directory:\", _DEFAULT_HDFS_FOLDER, candidate_files\n",
+ " , \"Work Directory:\", os.getcwd()\n",
+ " , \"===============================================\"\n",
+ " , \"===============================================\", sep='\\n')"
]
},
{
"cell_type": "code",
- "execution_count": 10,
- "id": "0aa94c64",
+ "execution_count": null,
+ "id": "fd3bcb00",
"metadata": {},
"outputs": [],
"source": [
- "spark.conf.set(\"spark.sql.session.timeZone\", \"UTC\")\n",
- "\n",
- "crab_df = spark.read.format('avro').load(HDFS_CRAB_part)\n",
- "crab_df = crab_df.select('TM_TASKNAME', 'TM_IGNORE_LOCALITY')"
+ "crab_username = spark.sql(\"\"\"SELECT current_user() AS user\"\"\").toPandas().to_dict('records')[0]['user']"
]
},
{
"cell_type": "code",
- "execution_count": 25,
- "id": "b35668ef",
+ "execution_count": null,
+ "id": "515aefbc",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "23/08/16 13:48:02 WARN CacheManager: Asked to cache already cached data.\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
- "condor_df = (\n",
- " spark.read.option(\"basePath\", _DEFAULT_HDFS_FOLDER)\n",
- " .json(\n",
+ "# extract only \"interested data\" from condor metrics and save into temporary area\n",
+ "# need to do this because we do not have enough memory to compute all data at once.\n",
+ "# (1 days is ok, 1 month got spark OOM)\n",
+ "# \"interested data\" is\n",
+ "# - selected column (see `read_schema` above)\n",
+ "# - date range from START_DATE inclusive to END_DATE exclusive\n",
+ "# - only status Complete and type analysis\n",
+ "# job will got dedup by `.drop_duplicates([\"GlobalJobId\"])` in later step\n",
+ "( \n",
+ " spark.read.option(\"basePath\", _DEFAULT_HDFS_FOLDER)\n",
+ " .json(\n",
" candidate_files,\n",
- " schema=schema,\n",
- " ).select(\"data.*\")\n",
- " .filter(\n",
+ " schema=read_schema,\n",
+ " )\n",
+ " .select(\"data.*\")\n",
+ " .filter(\n",
" f\"\"\"Status IN ('Completed')\n",
" AND Type IN ('analysis')\n",
- " AND RecordTime >= {start_date.timestamp() * 1000}\n",
- " AND RecordTime < {end_date.timestamp() * 1000}\n",
+ " AND RecordTime >= {start_epochmilis}\n",
+ " AND RecordTime < {end_epochmilis}\n",
" \"\"\"\n",
- " )\n",
- " .drop_duplicates([\"GlobalJobId\"]).cache()\n",
- " ) \n",
- "condor_df.write.mode('overwrite').parquet(\"/cms/users/eatthaph/condor_vir_data\" ,compression='zstd')\n",
- "condor_df = spark.read.format('parquet').load('/cms/users/eatthaph/condor_vir_data')\n",
- "# condor_df.count()"
+ " )\n",
+ " .drop_duplicates([\"GlobalJobId\"])\n",
+ " .write.mode('overwrite').parquet(f\"/cms/users/{crab_username}/condor_vir_data\" ,compression='zstd') # overriding the same path to cleanup old data. However, we could not run it parallel\n",
+ ")\n",
+ "spark.catalog.clearCache()"
]
},
{
"cell_type": "code",
- "execution_count": 27,
- "id": "7656d1f3",
+ "execution_count": null,
+ "id": "957ac50a",
"metadata": {},
"outputs": [],
"source": [
- "result_df = condor_df.join(crab_df, crab_df[\"TM_TASKNAME\"] == condor_df[\"CRAB_Workflow\"])\\\n",
- " .select('RecordTime', 'CMSPrimaryDataTier', 'WallClockHr', 'CoreHr', 'CpuTimeHr', 'ExitCode'\n",
- " , \"CRAB_DataBlock\", \"TM_IGNORE_LOCALITY\", \"GlobalJobId\", \"CommittedCoreHr\", \"CommittedWallClockHr\")\n",
- "docs = result_df.toPandas()"
+ "condor_df = spark.read.format('parquet').load(f\"/cms/users/{crab_username}/condor_vir_data\").cache()\n",
+ "condor_df.createOrReplaceTempView(\"condor\")"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "2b04b914",
- "metadata": {},
+ "id": "e271b1c8",
+ "metadata": {
+ "scrolled": true
+ },
"outputs": [],
"source": [
- "len(docs)"
+ "# query\n",
+ "query = f\"\"\"\\\n",
+ "WITH filter_tb AS (\n",
+ "SELECT *\n",
+ "FROM condor\n",
+ "WHERE 1=1\n",
+ "AND RecordTime >= {start_epochmilis}\n",
+ "AND RecordTime < {end_epochmilis}\n",
+ "),\n",
+ "join_tb AS (\n",
+ "SELECT RecordTime, CMSPrimaryDataTier, WallClockHr, CoreHr, CpuTimeHr, ExitCode, CRAB_DataBlock, TM_IGNORE_LOCALITY, GlobalJobId, CommittedCoreHr, CommittedWallClockHr\n",
+ "FROM filter_tb\n",
+ "INNER JOIN tasks \n",
+ "ON filter_tb.CRAB_Workflow = tasks.TM_TASKNAME \n",
+ "), \n",
+ "finalize_tb AS (\n",
+ "SELECT RecordTime, CMSPrimaryDataTier, WallClockHr, CoreHr, CpuTimeHr, ExitCode, CRAB_DataBlock, TM_IGNORE_LOCALITY, GlobalJobId, CommittedCoreHr, CommittedWallClockHr, \n",
+ " CASE \n",
+ " WHEN CRAB_DataBlock = 'MCFakeBlock' THEN 'PrivateMC' \n",
+ " ELSE 'Analysis'\n",
+ " END AS CRAB_Type, --- to differentiate between analysis and mc\n",
+ " 'condor' AS type, --- use to match specific data when use wildcard index pattern on grafana side\n",
+ " RecordTime AS timestamp --- use `RecordTime` as timestamp\n",
+ "FROM join_tb\n",
+ ")\n",
+ "SELECT * \n",
+ "FROM finalize_tb \n",
+ "\"\"\"\n",
+ "tmpdf = spark.sql(query)\n",
+ "tmpdf.show(10)\n",
+ "\n"
]
},
{
"cell_type": "code",
- "execution_count": 19,
- "id": "fa3f9917",
+ "execution_count": null,
+ "id": "75c6a964",
"metadata": {},
"outputs": [],
"source": [
- "# def spark_exec(candidate_files):\n",
- "# condor_df = (\n",
- "# spark.read.option(\"basePath\", _DEFAULT_HDFS_FOLDER)\n",
- "# .json(\n",
- "# candidate_files,\n",
- "# schema=schema,\n",
- "# ).select(\"data.*\")\n",
- "# .filter(\n",
- "# f\"\"\"Status IN ('Completed')\n",
- "# AND Type IN ('analysis')\n",
- "# AND RecordTime >= {start_date.timestamp() * 1000}\n",
- "# AND RecordTime < {end_date.timestamp() * 1000}\n",
- "# \"\"\"\n",
- "# )\n",
- "# .drop_duplicates([\"GlobalJobId\"]).cache()\n",
- "# ) \n",
- "# condor_df.write.mode('overwrite').parquet(\"/cms/users/eatthaph/condor_vir_data\" ,compression='zstd')\n",
- "# condor_df = spark.read.format('parquet').load('/cms/users/eatthaph/condor_vir_data')\n",
- "# result_df = condor_df.join(crab_df, crab_df[\"TM_TASKNAME\"] == condor_df[\"CRAB_Workflow\"])\\\n",
- "# .select('RecordTime', 'CMSPrimaryDataTier', 'WallClockHr', 'CoreHr', 'CpuTimeHr', 'ExitCode'\n",
- "# , \"CRAB_DataBlock\", \"TM_IGNORE_LOCALITY\", \"GlobalJobId\", \"CommittedCoreHr\", \"CommittedWallClockHr\")\n",
- "# sub_docs = result_df.toPandas()\n",
- "# return sub_docs\n",
- "\n",
- "# def loop_excute(candidate_files, initial_n=len(candidate_files)):\n",
- "# r = 0\n",
- "# n = initial_n\n",
- "# df_list = []\n",
- "# file_chunk = group_files(candidate_files, n)\n",
- "# while len(file_chunk)!=0 and r<10:\n",
- "# print(\"=================================\\n round :\", r+1, \"\\n=================================\")\n",
- "# df_err_list = []\n",
- "# for i, chunk in enumerate(file_chunk):\n",
- "# print(\"=================================\\n\", i+1, \"out of\", len(file_chunk), \"\\n=================================\")\n",
- "# try:\n",
- "# df_list.append(spark_exec(chunk))\n",
- "# except Exception as ex:\n",
- "# print(\"=====\", ex)\n",
- "# df_err_list.extend(chunk)\n",
- "# # if n != 1:\n",
- "# # n = n//2\n",
- "# file_chunk = group_files(df_err_list, n)\n",
- "# r += 1\n",
- "# print(\"\")\n",
- "# print(\"Fail excuted files :\", df_err_list)\n",
- "# return df_list"
+ "tmpdf.count()"
]
},
{
"cell_type": "code",
- "execution_count": 1,
- "id": "af6d5e17",
+ "execution_count": null,
+ "id": "eee4a1f3",
"metadata": {},
"outputs": [],
"source": [
- "# useful_df = loop_excute(candidate_files)\n",
- "# df_list = spark_exec(candidate_files)"
+ "schema = {\n",
+ " \"settings\": {\"index\": {\"number_of_shards\": \"1\", \"number_of_replicas\": \"1\"}},\n",
+ " \"mappings\": {\n",
+ " \"properties\": {\n",
+ " \"RecordTime\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
+ " \"CMSPrimaryDataTier\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"GlobalJobId\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"WallClockHr\": {\"type\": \"long\"},\n",
+ " \"CoreHr\": {\"type\": \"long\"},\n",
+ " \"CpuTimeHr\": {\"type\": \"long\"},\n",
+ " \"ExitCode\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"TM_IGNORE_LOCALITY\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"CRAB_Type\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"CRAB_DataBlock\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"CommittedCoreHr\": {\"type\": \"long\"}, \n",
+ " \"CommittedWallClockHr\": {\"type\": \"long\"},\n",
+ " \"type\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"timestamp\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
+ " }\n",
+ " }\n",
+ " }"
]
},
{
"cell_type": "code",
- "execution_count": 15,
- "id": "18908dab",
- "metadata": {},
+ "execution_count": null,
+ "id": "5d0506d4",
+ "metadata": {
+ "scrolled": true
+ },
"outputs": [],
"source": [
- "docs = docs.to_dict('records')"
+ "# this is simple workaround osearch bug when work in notebook because\n",
+ "# - it load the secret once and use forever\n",
+ "# - get_or_create_index() create index+schema only the first time it execute\n",
+ "# it is safe to run again even in cronjobs \n",
+ "import importlib\n",
+ "import osearch\n",
+ "importlib.reload(osearch)"
]
},
{
"cell_type": "code",
- "execution_count": 16,
- "id": "c912b217",
+ "execution_count": null,
+ "id": "47a4f569",
"metadata": {},
"outputs": [],
"source": [
- "for i in range(len(docs)):\n",
- " if docs[i]['CRAB_DataBlock'] == 'MCFakeBlock':\n",
- " docs[i]['CRAB_Type'] = 'PrivateMC'\n",
- " else:\n",
- " docs[i]['CRAB_Type'] = 'Analysis'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "id": "0e3ae57b",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[{'RecordTime': 1692101192000,\n",
- " 'CMSPrimaryDataTier': 'MINIAODSIM',\n",
- " 'WallClockHr': 0.12361111111111112,\n",
- " 'CoreHr': 0.12361111111111112,\n",
- " 'CpuTimeHr': 0.0022222222222222222,\n",
- " 'ExitCode': 8020,\n",
- " 'CRAB_DataBlock': '/WWTo4Q_4f_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18MiniAODv2-106X_upgrade2018_realistic_v16_L1v1-v3/MINIAODSIM#eb5a0cbd-6c43-492c-ac21-4318775aee3b',\n",
- " 'TM_IGNORE_LOCALITY': 'F',\n",
- " 'GlobalJobId': 'crab3@vocms0155.cern.ch#98631116.0#1692100543',\n",
- " 'CommittedCoreHr': '0.12361111111111112',\n",
- " 'CommittedWallClockHr': '0.12361111111111112',\n",
- " 'CRAB_Type': 'Analysis'},\n",
- " {'RecordTime': 1692099933000,\n",
- " 'CMSPrimaryDataTier': 'MINIAODSIM',\n",
- " 'WallClockHr': 0.12166666666666667,\n",
- " 'CoreHr': 0.12166666666666667,\n",
- " 'CpuTimeHr': 0.004722222222222222,\n",
- " 'ExitCode': 8020,\n",
- " 'CRAB_DataBlock': '/WWTo4Q_4f_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18MiniAODv2-106X_upgrade2018_realistic_v16_L1v1-v3/MINIAODSIM#eb5a0cbd-6c43-492c-ac21-4318775aee3b',\n",
- " 'TM_IGNORE_LOCALITY': 'F',\n",
- " 'GlobalJobId': 'crab3@vocms0155.cern.ch#98629759.0#1692099393',\n",
- " 'CommittedCoreHr': '0.1213888888888889',\n",
- " 'CommittedWallClockHr': '0.1213888888888889',\n",
- " 'CRAB_Type': 'Analysis'},\n",
- " {'RecordTime': 1692121300000,\n",
- " 'CMSPrimaryDataTier': 'MINIAODSIM',\n",
- " 'WallClockHr': 5.698333333333333,\n",
- " 'CoreHr': 5.698333333333333,\n",
- " 'CpuTimeHr': 5.501388888888889,\n",
- " 'ExitCode': 0,\n",
- " 'CRAB_DataBlock': '/QCD_HT50to100_TuneCP5_13TeV-madgraphMLM-pythia8/RunIISummer20UL17MiniAODv2-106X_mc2017_realistic_v9-v2/MINIAODSIM#4c355926-ea17-4285-bd2b-2c7692c48a87',\n",
- " 'TM_IGNORE_LOCALITY': 'F',\n",
- " 'GlobalJobId': 'crab3@vocms0107.cern.ch#96720630.0#1691795011',\n",
- " 'CommittedCoreHr': '5.698333333333333',\n",
- " 'CommittedWallClockHr': '5.698333333333333',\n",
- " 'CRAB_Type': 'Analysis'},\n",
- " {'RecordTime': 1692121556000,\n",
- " 'CMSPrimaryDataTier': 'MINIAODSIM',\n",
- " 'WallClockHr': 5.769722222222223,\n",
- " 'CoreHr': 5.769722222222223,\n",
- " 'CpuTimeHr': 5.543055555555555,\n",
- " 'ExitCode': 0,\n",
- " 'CRAB_DataBlock': '/QCD_HT50to100_TuneCP5_13TeV-madgraphMLM-pythia8/RunIISummer20UL17MiniAODv2-106X_mc2017_realistic_v9-v2/MINIAODSIM#4c355926-ea17-4285-bd2b-2c7692c48a87',\n",
- " 'TM_IGNORE_LOCALITY': 'F',\n",
- " 'GlobalJobId': 'crab3@vocms0107.cern.ch#96720628.0#1691795011',\n",
- " 'CommittedCoreHr': '5.769722222222223',\n",
- " 'CommittedWallClockHr': '5.769722222222223',\n",
- " 'CRAB_Type': 'Analysis'},\n",
- " {'RecordTime': 1692123756000,\n",
- " 'CMSPrimaryDataTier': 'MINIAODSIM',\n",
- " 'WallClockHr': 3.2091666666666665,\n",
- " 'CoreHr': 3.2091666666666665,\n",
- " 'CpuTimeHr': 3.1125,\n",
- " 'ExitCode': 0,\n",
- " 'CRAB_DataBlock': '/QCD_HT50to100_TuneCP5_13TeV-madgraphMLM-pythia8/RunIISummer20UL17MiniAODv2-106X_mc2017_realistic_v9-v2/MINIAODSIM#4c355926-ea17-4285-bd2b-2c7692c48a87',\n",
- " 'TM_IGNORE_LOCALITY': 'F',\n",
- " 'GlobalJobId': 'crab3@vocms0107.cern.ch#96720658.0#1691795012',\n",
- " 'CommittedCoreHr': '3.2091666666666665',\n",
- " 'CommittedWallClockHr': '3.2091666666666665',\n",
- " 'CRAB_Type': 'Analysis'}]"
- ]
- },
- "execution_count": 17,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "docs[:5]"
+ "# repartition rdd to make each partition small enough to load back to python kernel, serialize to dict, and send to os.\n",
+ "# for 12M rows, number of from 27 days of data is 51, around 250k per partition.\n",
+ "# try reducing partition to 20 once but make python kernel out-of-memory. \n",
+ "# so, try to keep it around 200k per partition instead.\n",
+ "partition_num = tmpdf.count() // 200000\n",
+ "tmpdf = tmpdf.repartition(partition_num, 'RecordTime')\n",
+ "total_part = tmpdf.rdd.getNumPartitions()\n",
+ "\n",
+ "print(f\"Number of partition: {total_part}\")"
]
},
{
"cell_type": "code",
- "execution_count": 18,
- "id": "bcdfb65c",
- "metadata": {},
+ "execution_count": null,
+ "id": "3e1f7a3f",
+ "metadata": {
+ "scrolled": false
+ },
"outputs": [],
"source": [
- "import osearch"
+ "# send to os, serialize df one rdd partition at a time\n",
+ "part = 0\n",
+ "for docs in tmpdf.rdd.mapPartitions(lambda p: [[x.asDict() for x in p]]).toLocalIterator():\n",
+ " part += 1\n",
+ " print(f\"Partition: {part}/{total_part}, Length of partition: {len(docs)}\")\n",
+ " send_os_parallel(docs, index_name, schema, secretpath, yesterday_epoch, 20000) # batch_size is just arbitrary number"
]
},
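The cell above relies on a pull-one-partition-at-a-time pattern so that only a single partition's worth of documents lives in driver memory at once. A small self-contained sketch of that pattern, with a hypothetical handle_batch callable standing in for send_os_parallel:

    def stream_partitions(df, handle_batch):
        # Each executor turns its partition into one list of plain dicts...
        rdd = df.rdd.mapPartitions(lambda rows: [[r.asDict() for r in rows]])
        # ...and toLocalIterator() pulls those lists back one partition at a time.
        for i, docs in enumerate(rdd.toLocalIterator(), start=1):
            print(f"partition {i}: {len(docs)} docs")
            handle_batch(docs)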
{
"cell_type": "code",
- "execution_count": 19,
- "id": "4666acef",
+ "execution_count": null,
+ "id": "52b2fc9f",
"metadata": {},
"outputs": [],
"source": [
- "def get_index_schema():\n",
- " return {\n",
- " \"settings\": {\"index\": {\"number_of_shards\": \"1\", \"number_of_replicas\": \"1\"}},\n",
- " \"mappings\": {\n",
- " \"properties\": {\n",
- " \"RecordTime\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
- " \"CMSPrimaryDataTier\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " \"GlobalJobId\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " \"WallClockHr\": {\"type\": \"long\"},\n",
- " \"CoreHr\": {\"type\": \"long\"},\n",
- " \"CpuTimeHr\": {\"type\": \"long\"},\n",
- " \"ExitCode\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " \"TM_IGNORE_LOCALITY\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " \"CRAB_Type\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " \"CRAB_DataBlock\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " \"CommittedCoreHr\": {\"type\": \"long\"}, \n",
- " \"CommittedWallClockHr\": {\"type\": \"long\"},\n",
- " }\n",
- " }\n",
- " }"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "id": "d6e4107b",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/eos/user/e/eatthaph/.local/lib/python3.9/site-packages/opensearchpy/connection/http_urllib3.py:199: UserWarning: Connecting to https://es-cms1.cern.ch:443 using SSL with verify_certs=False is insecure.\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n"
- ]
- },
- ],
- "source": [
- "_index_template = 'crab-condor-ekong'\n",
- "client = osearch.get_es_client(\"es-cms1.cern.ch/es\", 'secret_opensearch.txt', get_index_schema())\n",
- "# index_mod=\"\": 'test-foo', index_mod=\"Y\": 'test-foo-YYYY', index_mod=\"M\": 'test-foo-YYYY-MM', index_mod=\"D\": 'test-foo-YYYY-MM-DD',\n",
- "idx = client.get_or_create_index(timestamp=time.time(), index_template=_index_template, index_mod=\"M\")\n",
- "client.send(idx, docs, metadata=None, batch_size=10000, drop_nulls=False)"
+ "print(\"Done!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "d7274886",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "1b4484a3",
+ "id": "1dc69a5c",
"metadata": {},
"outputs": [],
"source": []
@@ -728,14 +478,13 @@
"list_of_options": [
{
"name": "spark.jars.packages",
- "value": "org.apache.spark:spark-avro_2.12:3.3.1"
+ "value": "org.apache.spark:spark-avro_2.12:3.5.0"
+ },
+ {
+ "name": "spark.executor.instances",
+ "value": "20"
}
]
- },
- "vscode": {
- "interpreter": {
- "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
- }
}
},
"nbformat": 4,
diff --git a/src/script/Monitor/crab-spark/notebooks/crab_data.ipynb b/src/script/Monitor/crab-spark/notebooks/crab_data.ipynb
deleted file mode 100644
index be60f3eb84..0000000000
--- a/src/script/Monitor/crab-spark/notebooks/crab_data.ipynb
+++ /dev/null
@@ -1,832 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "66b56403",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "795d491e",
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- "
SparkSession - in-memory
\n",
- " \n",
- "
\n",
- "
SparkContext
\n",
- "\n",
- "
Spark UI
\n",
- "\n",
- "
\n",
- " - Version
\n",
- " v3.3.2
\n",
- " - Master
\n",
- " yarn
\n",
- " - AppName
\n",
- " pyspark_shell_swan
\n",
- "
\n",
- "
\n",
- " \n",
- "
\n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 1,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "31b02b1c",
- "metadata": {},
- "outputs": [],
- "source": [
- "# !hdfs -h"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "7a7ad1c3",
- "metadata": {},
- "outputs": [],
- "source": [
- "# !hdfs dfs -ls /cms/users/eatthaph"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "8a170ced",
- "metadata": {},
- "outputs": [],
- "source": [
- "# !hdfs dfs -ls /cms/users/eatthaph/"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "id": "17520cda",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "23/07/25 16:06:21 WARN ipc.Client: Exception encountered while connecting to the server \n",
- "org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.ipc.StandbyException): Operation category READ is not supported in state standby. Visit https://s.apache.org/sbnn-error\n",
- "\tat org.apache.hadoop.security.SaslRpcClient.saslConnect(SaslRpcClient.java:376)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection.setupSaslConnection(Client.java:622)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection.access$2300(Client.java:413)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection$2.run(Client.java:822)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection$2.run(Client.java:818)\n",
- "\tat java.base/java.security.AccessController.doPrivileged(Native Method)\n",
- "\tat java.base/javax.security.auth.Subject.doAs(Subject.java:423)\n",
- "\tat org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1845)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection.setupIOstreams(Client.java:818)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection.access$3800(Client.java:413)\n",
- "\tat org.apache.hadoop.ipc.Client.getConnection(Client.java:1636)\n",
- "\tat org.apache.hadoop.ipc.Client.call(Client.java:1452)\n",
- "\tat org.apache.hadoop.ipc.Client.call(Client.java:1405)\n",
- "\tat org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:234)\n",
- "\tat org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:119)\n",
- "\tat com.sun.proxy.$Proxy12.getFileInfo(Unknown Source)\n",
- "\tat org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.getFileInfo(ClientNamenodeProtocolTranslatorPB.java:964)\n",
- "\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n",
- "\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n",
- "\tat java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n",
- "\tat java.base/java.lang.reflect.Method.invoke(Method.java:566)\n",
- "\tat org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:422)\n",
- "\tat org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeMethod(RetryInvocationHandler.java:165)\n",
- "\tat org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invoke(RetryInvocationHandler.java:157)\n",
- "\tat org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeOnce(RetryInvocationHandler.java:95)\n",
- "\tat org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:359)\n",
- "\tat com.sun.proxy.$Proxy13.getFileInfo(Unknown Source)\n",
- "\tat org.apache.hadoop.hdfs.DFSClient.getFileInfo(DFSClient.java:1731)\n",
- "\tat org.apache.hadoop.hdfs.DistributedFileSystem$29.doCall(DistributedFileSystem.java:1725)\n",
- "\tat org.apache.hadoop.hdfs.DistributedFileSystem$29.doCall(DistributedFileSystem.java:1722)\n",
- "\tat org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)\n",
- "\tat org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:1737)\n",
- "\tat org.apache.hadoop.fs.Globber.getFileStatus(Globber.java:115)\n",
- "\tat org.apache.hadoop.fs.Globber.doGlob(Globber.java:349)\n",
- "\tat org.apache.hadoop.fs.Globber.glob(Globber.java:202)\n",
- "\tat org.apache.hadoop.fs.FileSystem.globStatus(FileSystem.java:2093)\n",
- "\tat org.apache.hadoop.fs.shell.PathData.expandAsGlob(PathData.java:353)\n",
- "\tat org.apache.hadoop.fs.shell.Command.expandArgument(Command.java:250)\n",
- "\tat org.apache.hadoop.fs.shell.Command.expandArguments(Command.java:233)\n",
- "\tat org.apache.hadoop.fs.shell.FsCommand.processRawArguments(FsCommand.java:104)\n",
- "\tat org.apache.hadoop.fs.shell.Command.run(Command.java:177)\n",
- "\tat org.apache.hadoop.fs.FsShell.run(FsShell.java:327)\n",
- "\tat org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:76)\n",
- "\tat org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:90)\n",
- "\tat org.apache.hadoop.fs.FsShell.main(FsShell.java:390)\n",
- "Found 9 items\n",
- "-rw-r--r-x+ 3 cmssqoop c3 0 2023-07-19 02:14 /project/awg/cms/crab/tasks/2023-07-19/_SUCCESS\n",
- "-rw-r--r-x+ 3 cmssqoop c3 85991835 2023-07-19 02:01 /project/awg/cms/crab/tasks/2023-07-19/part-m-00000.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 837565156 2023-07-19 02:14 /project/awg/cms/crab/tasks/2023-07-19/part-m-00001.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 605874324 2023-07-19 02:10 /project/awg/cms/crab/tasks/2023-07-19/part-m-00002.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 602365393 2023-07-19 02:09 /project/awg/cms/crab/tasks/2023-07-19/part-m-00003.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 761072727 2023-07-19 02:13 /project/awg/cms/crab/tasks/2023-07-19/part-m-00004.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 462585036 2023-07-19 02:07 /project/awg/cms/crab/tasks/2023-07-19/part-m-00005.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 394767237 2023-07-19 02:06 /project/awg/cms/crab/tasks/2023-07-19/part-m-00006.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 358041401 2023-07-19 02:04 /project/awg/cms/crab/tasks/2023-07-19/part-m-00007.avro\n"
- ]
- }
- ],
- "source": [
- "!hdfs dfs -ls /project/awg/cms/crab/tasks/2023-07-19"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "2a7b2463",
- "metadata": {},
- "outputs": [],
- "source": [
- "# import pickle\n",
- "from datetime import datetime, timedelta\n",
- "\n",
- "# import click\n",
- "import os\n",
- "import pandas as pd\n",
- "# import pprint\n",
- "import time\n",
- "# from dateutil.relativedelta import relativedelta\n",
- "from pyspark import SparkContext, StorageLevel\n",
- "from pyspark.sql import SparkSession\n",
- "from pyspark.sql.functions import (\n",
- " col, collect_list, concat_ws, greatest, lit, lower, when, unix_timestamp, to_timestamp,\n",
- " avg as _avg,\n",
- " count as _count,\n",
- " hex as _hex,\n",
- " max as _max,\n",
- " min as _min,\n",
- " round as _round,\n",
- " sum as _sum,\n",
- ")\n",
- "\n",
- "from pyspark.sql.types import (\n",
- " LongType,\n",
- ")\n",
- "\n",
- "# import matplotlib.pyplot as plt\n",
- "import numpy as np\n",
- "import math\n",
- "import json\n",
- "#from CMSSpark.src.python.CMSSpark import schemas as cms_schemas"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f2904198",
- "metadata": {},
- "source": [
- "## load dataset"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 34,
- "id": "aa0d181a",
- "metadata": {},
- "outputs": [],
- "source": [
- "# end_date = str(datetime.now())[:10]\n",
- "# start_date = str(datetime.now()-timedelta(days=1))[:10]\n",
- "\n",
- "start_date = '2023-07-20'\n",
- "end_date = '2023-07-25'\n",
- "\n",
- "wa_date = end_date\n",
- "HDFS_CRAB_part = f'/project/awg/cms/crab/tasks/{wa_date}/'\n",
- "# HDFS_CRAB_part = f'/project/awg/cms/crab/{wa_date}/tasks/'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "id": "532ec9ac",
- "metadata": {},
- "outputs": [],
- "source": [
- "crab_part = spark.read.format('avro').load(HDFS_CRAB_part)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "3ad81af6",
- "metadata": {},
- "source": [
- "## Query"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 37,
- "id": "41cf761f",
- "metadata": {},
- "outputs": [],
- "source": [
- "df = crab_part.select(\"TM_TASKNAME\",\"TM_START_TIME\",\"TM_TASK_STATUS\",\"TM_SPLIT_ALGO\",\"TM_USERNAME\",\"TM_USER_ROLE\",\"TM_JOB_TYPE\",\"TM_IGNORE_LOCALITY\",\"TM_SCRIPTEXE\",\"TM_USER_CONFIG\")\n",
- "df.createOrReplaceTempView(\"crab_algo\")\n",
- "# df.show(10)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "id": "e41c5fc6",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "6147"
- ]
- },
- "execution_count": 38,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "query = f\"\"\"\\\n",
- "SELECT *\n",
- "FROM crab_algo \n",
- "WHERE 1=1\n",
- "AND TM_START_TIME >= unix_timestamp(\"{start_date} 00:00:00\", \"yyyy-MM-dd HH:mm:ss\")*1000 \n",
- "AND TM_START_TIME < unix_timestamp(\"{end_date} 00:00:00\", \"yyyy-MM-dd HH:mm:ss\")*1000 \n",
- "\"\"\"\n",
- "\n",
- "tmpdf = spark.sql(query)\n",
- "tmpdf.count()"
- ]
- },
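A hedged sketch of the same time-window selection written with the DataFrame API instead of Spark SQL; it reuses the start_date and end_date strings defined above and the fact that TM_START_TIME is stored in epoch milliseconds:

    from pyspark.sql.functions import col, lit, unix_timestamp

    start_ms = unix_timestamp(lit(f"{start_date} 00:00:00"), "yyyy-MM-dd HH:mm:ss") * 1000
    end_ms = unix_timestamp(lit(f"{end_date} 00:00:00"), "yyyy-MM-dd HH:mm:ss") * 1000
    tmpdf = df.where((col("TM_START_TIME") >= start_ms) & (col("TM_START_TIME") < end_ms))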
- {
- "cell_type": "code",
- "execution_count": 21,
- "id": "25033524",
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "root\n",
- " |-- TM_TASKNAME: string (nullable = true)\n",
- " |-- TM_START_TIME: long (nullable = true)\n",
- " |-- TM_TASK_STATUS: string (nullable = true)\n",
- " |-- TM_SPLIT_ALGO: string (nullable = true)\n",
- " |-- TM_USERNAME: string (nullable = true)\n",
- " |-- TM_USER_ROLE: string (nullable = true)\n",
- " |-- TM_JOB_TYPE: string (nullable = true)\n",
- " |-- TM_IGNORE_LOCALITY: string (nullable = true)\n",
- " |-- TM_SCRIPTEXE: string (nullable = true)\n",
- " |-- TM_USER_CONFIG: string (nullable = true)\n",
- "\n"
- ]
- }
- ],
- "source": [
- "tmpdf.printSchema()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "id": "ff188450",
- "metadata": {},
- "outputs": [],
- "source": [
- "docs = tmpdf.toPandas().to_dict('records')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "id": "fad5ca52",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "6147"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(docs)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "id": "c454d0c4",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Extract 'REQUIRE_ACCELERATOR' from 'TM_USER_CONFIG'\n",
- "\n",
- "for i in range(len(docs)):\n",
- " if docs[i]['TM_USER_CONFIG'] is not None:\n",
- " data = json.loads(docs[i]['TM_USER_CONFIG'])\n",
- " if \"requireaccelerator\" in data:\n",
- " docs[i]['REQUIRE_ACCELERATOR'] = data[\"requireaccelerator\"]\n",
- " else:\n",
- " docs[i]['REQUIRE_ACCELERATOR'] = None\n",
- " else:\n",
- " docs[i]['REQUIRE_ACCELERATOR'] = None"
- ]
- },
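The loop above parses TM_USER_CONFIG row by row in the driver after toPandas(). A sketch of the same extraction kept inside Spark, using get_json_object, which returns the value as a string or NULL when TM_USER_CONFIG is NULL or has no "requireaccelerator" key:

    from pyspark.sql.functions import col, get_json_object

    tmpdf = tmpdf.withColumn(
        "REQUIRE_ACCELERATOR",
        get_json_object(col("TM_USER_CONFIG"), "$.requireaccelerator"),
    )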
- {
- "cell_type": "code",
- "execution_count": 25,
- "id": "d2e914f6",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[{'TM_TASKNAME': '160406_111833:sciaba_HC-163-AnySite-26725-20160406125703-T1_UK_RAL',\n",
- " 'TM_START_TIME': 1459934313843,\n",
- " 'TM_TASK_STATUS': 'SUBMITTED',\n",
- " 'TM_SPLIT_ALGO': 'FileBased',\n",
- " 'TM_USERNAME': 'sciaba',\n",
- " 'TM_USER_ROLE': 'production',\n",
- " 'TM_JOB_TYPE': 'Analysis',\n",
- " 'TM_IGNORE_LOCALITY': 'T',\n",
- " 'TM_SCRIPTEXE': None,\n",
- " 'TM_USER_CONFIG': None,\n",
- " 'REQUIRE_ACCELERATOR': None},\n",
- " {'TM_TASKNAME': '160406_111914:sciaba_HC-148-AnySite-26727-20160406131903-T2_UK_SGrid_Bristol',\n",
- " 'TM_START_TIME': 1459934354531,\n",
- " 'TM_TASK_STATUS': 'SUBMITTED',\n",
- " 'TM_SPLIT_ALGO': 'FileBased',\n",
- " 'TM_USERNAME': 'sciaba',\n",
- " 'TM_USER_ROLE': 'production',\n",
- " 'TM_JOB_TYPE': 'Analysis',\n",
- " 'TM_IGNORE_LOCALITY': 'T',\n",
- " 'TM_SCRIPTEXE': None,\n",
- " 'TM_USER_CONFIG': None,\n",
- " 'REQUIRE_ACCELERATOR': None},\n",
- " {'TM_TASKNAME': '160319_180958:sciaba_HC-138-AnySite-26052-20160319011302-T2_RU_IHEP',\n",
- " 'TM_START_TIME': 1458407398241,\n",
- " 'TM_TASK_STATUS': 'SUBMITTED',\n",
- " 'TM_SPLIT_ALGO': 'FileBased',\n",
- " 'TM_USERNAME': 'sciaba',\n",
- " 'TM_USER_ROLE': 'production',\n",
- " 'TM_JOB_TYPE': 'Analysis',\n",
- " 'TM_IGNORE_LOCALITY': 'T',\n",
- " 'TM_SCRIPTEXE': None,\n",
- " 'TM_USER_CONFIG': None,\n",
- " 'REQUIRE_ACCELERATOR': None}]"
- ]
- },
- "execution_count": 25,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "docs[:3]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "id": "cf696d7f",
- "metadata": {},
- "outputs": [],
- "source": [
- "import osearch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "id": "e47490bd",
- "metadata": {},
- "outputs": [],
- "source": [
- "def get_index_schema():\n",
- " return {\n",
- " \"settings\": {\"index\": {\"number_of_shards\": \"1\", \"number_of_replicas\": \"1\"}},\n",
- " \"mappings\": {\n",
- " \"properties\": {\n",
- " \"TM_TASKNAME\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " \"TM_START_TIME\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
- " 'TM_TASK_STATUS': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " \"TM_SPLIT_ALGO\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " \"TM_USERNAME\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " \"TM_USER_ROLE\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " \"TM_JOB_TYPE\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " \"TM_IGNORE_LOCALITY\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " \"TM_SCRIPTEXE\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " \"REQUIRE_ACCELERATOR\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " }\n",
- " }\n",
- " }\n",
- "\n",
- "# def get_index_schema():\n",
- "# return {\n",
- "# \"settings\": {\"index\": {\"number_of_shards\": \"1\", \"number_of_replicas\": \"1\"}},\n",
- "# \"mappings\": {\n",
- "# \"properties\": {\n",
- "# \"TM_TASKNAME\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- "# \"TM_SPLIT_ALGO\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- "# \"TM_START_TIME\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
- "# \"TM_END_INJECTION\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
- "# }\n",
- "# }\n",
- "# }"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "id": "6bcfc801",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "0"
- ]
- },
- "execution_count": 28,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "_index_template = 'crab-data-ekong'\n",
- "client = osearch.get_es_client(\"es-cms1.cern.ch/es\", 'secret_opensearch.txt', get_index_schema())\n",
- "# index_mod=\"\": 'test-foo', index_mod=\"Y\": 'test-foo-YYYY', index_mod=\"M\": 'test-foo-YYYY-MM', index_mod=\"D\": 'test-foo-YYYY-MM-DD',\n",
- "idx = client.get_or_create_index(timestamp=time.time(), index_template=_index_template, index_mod=\"M\")\n",
- "client.send(idx, docs, metadata=None, batch_size=10000, drop_nulls=False)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "bcac057e",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a5f62789",
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "@webio": {
- "lastCommId": null,
- "lastKernelId": null
- },
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.9.12"
- },
- "sparkconnect": {
- "bundled_options": [],
- "list_of_options": [
- {
- "name": "spark.jars.packages",
- "value": "org.apache.spark:spark-avro_2.12:3.3.1"
- }
- ]
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/src/script/Monitor/crab-spark/notebooks/crab_rucio_rules_poc.ipynb b/src/script/Monitor/crab-spark/notebooks/crab_rucio_rules_poc.ipynb
new file mode 100644
index 0000000000..8aa5ac31a6
--- /dev/null
+++ b/src/script/Monitor/crab-spark/notebooks/crab_rucio_rules_poc.ipynb
@@ -0,0 +1,340 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5e9af689",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from datetime import datetime, timedelta, timezone\n",
+ "import os\n",
+ "import time\n",
+ "import pandas as pd\n",
+ "\n",
+ "from pyspark import SparkContext, StorageLevel\n",
+ "from pyspark.sql import SparkSession\n",
+ "from pyspark.sql.functions import (\n",
+ " current_user,\n",
+ " col, collect_list, concat_ws, greatest, lit, lower, when,\n",
+ " avg as _avg,\n",
+ " count as _count,\n",
+ " hex as _hex,\n",
+ " max as _max,\n",
+ " min as _min,\n",
+ " round as _round,\n",
+ " sum as _sum,\n",
+ ")\n",
+ "from pyspark.sql.types import (\n",
+ " StructType,\n",
+ " LongType,\n",
+ " StringType,\n",
+ " StructField,\n",
+ " DoubleType,\n",
+ " IntegerType,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "91309756",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# try to import libs from current directory, fallback to $PWD/../workdir if not found\n",
+ "try:\n",
+ " from crabspark_utils import get_candidate_files, send_os, send_os_parallel\n",
+ "except ModuleNotFoundError:\n",
+ " import sys\n",
+ " sys.path.insert(0, f'{os.getcwd()}/../workdir')\n",
+ " from crabspark_utils import get_candidate_files, send_os, send_os_parallel\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "22946659",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "spark = SparkSession\\\n",
+ " .builder\\\n",
+ " .appName('crab-taskdb')\\\n",
+ " .getOrCreate()\n",
+ "spark"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e9013878",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# clear any cache left, for working with notebook\n",
+ "# it safe to run everytime cronjob start\n",
+ "spark.catalog.clearCache()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "31c19eb0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# secret path, also check if file exists\n",
+ "secretpath = os.environ.get('OPENSEARCH_SECRET_PATH', f'{os.getcwd()}/../workdir/secret_opensearch.txt')\n",
+ "if not os.path.isfile(secretpath): \n",
+ " raise Exception(f'OS secrets file {secretpath} does not exists')\n",
+ "# if PROD, index prefix will be `crab-*`, otherwise `crab-test-*`\n",
+ "PROD = os.environ.get('PROD', 'false').lower() in ('true', '1', 't')\n",
+ "# FROM_DATE, in strptime(\"%Y-%m-%d\")\n",
+ "START = os.environ.get('START_DATE', None) \n",
+ "END = os.environ.get('END_DATE', None)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e843eb6d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# For run playbook manually, set start/end date here\n",
+ "START_DATE = \"2020-01-01\"\n",
+ "END_DATE = \"2024-10-01\"\n",
+ "# if cronjob, replace constant with value from env\n",
+ "if START and END:\n",
+ " START_DATE = START\n",
+ " END_DATE = END"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b17ed53f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# index name\n",
+ "index_name = 'taskdb'\n",
+ "# use prod index pattern if this execution is for production\n",
+ "if PROD:\n",
+ " index_name = f'crab-prod-{index_name}'\n",
+ "else:\n",
+ " index_name = f'crab-test-{index_name}'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8417ab47",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# datetime object\n",
+ "start_datetime = datetime.strptime(START_DATE, \"%Y-%m-%d\").replace(tzinfo=timezone.utc)\n",
+ "end_datetime = datetime.strptime(END_DATE, \"%Y-%m-%d\").replace(tzinfo=timezone.utc)\n",
+ "# sanity check\n",
+ "if end_datetime < start_datetime: \n",
+ " raise Exception(f\"end date ({END_DATE}) is less than start date ({START_DATE})\")\n",
+ "start_epochmilis = int(start_datetime.timestamp()) * 1000\n",
+ "end_epochmilis = int(end_datetime.timestamp()) * 1000\n",
+ "yesterday_epoch = int((end_datetime-timedelta(days=1)).timestamp())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9404c437",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# debug\n",
+ "print(START_DATE, \n",
+ " END_DATE, \n",
+ " index_name,\n",
+ " sep='\\n')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3e85c2f0",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# This code block and following block is copied from Panos's script.\n",
+ "# https://gitlab.cern.ch/cmsdmops/cmsdmops/-/blob/8da699db49097d7a58440e6058f022c3f93992e2/monitoring/kubernetes/src/rucio_activity_account_usage.py\n",
+ "# see more in https://github.com/dmwm/CRABServer/issues/7798#issuecomment-2389265249\n",
+ "def get_df_rses(spark):\n",
+ " \"\"\"Get Spark dataframe of RSES\n",
+ " \"\"\"\n",
+ " hdfs_rses_path = '/project/awg/cms/rucio/{}/rses/part*.avro'.format(datetime.today().strftime('%Y-%m-%d'))\n",
+ " df_rses = spark.read.format(\"avro\").load(hdfs_rses_path) \\\n",
+ " .filter(col('DELETED_AT').isNull()) \\\n",
+ " .withColumn('rse_id', lower(_hex(col('ID')))) \\\n",
+ " .withColumn('rse_tier', _split(col('RSE'), '_').getItem(0)) \\\n",
+ " .withColumn('rse_country', _split(col('RSE'), '_').getItem(1)) \\\n",
+ " .withColumn('rse_kind',\n",
+ " when((col(\"rse\").endswith('Temp') | col(\"rse\").endswith('temp') | col(\"rse\").endswith('TEMP')),\n",
+ " 'temp')\n",
+ " .when((col(\"rse\").endswith('Test') | col(\"rse\").endswith('test') | col(\"rse\").endswith('TEST')),\n",
+ " 'test')\n",
+ " .otherwise('prod')\n",
+ " ) \\\n",
+ " .select(['rse_id', 'RSE', 'RSE_TYPE', 'rse_tier', 'rse_country', 'rse_kind'])\n",
+ " return df_rses\n",
+ "def get_df_locks(spark):\n",
+ " \"\"\"Get Spark dataframe of Locks\n",
+ " \"\"\"\n",
+ " today = datetime.today().strftime('%Y-%m-%d')\n",
+ " locks_path = f'/project/awg/cms/rucio/{today}/locks/part*.avro'\n",
+ " locks = spark.read.format('avro').load(locks_path) \\\n",
+ " .filter(col('SCOPE') == 'cms') \\\n",
+ " .filter(col('STATE').isin(['O', 'R'])) \\\n",
+ " .withColumn('rse_id', lower(_hex(col('RSE_ID')))) \\\n",
+ " .withColumnRenamed('NAME', 'f_name') \\\n",
+ " .withColumnRenamed('ACCOUNT', 'account_name') \\\n",
+ " .withColumnRenamed('BYTES', 'f_size') \\\n",
+ " .withColumn('r_id', lower(_hex(col('RULE_ID')))) \\\n",
+ " .select(['rse_id', 'f_name', 'f_size', 'r_id', 'account_name'])\n",
+ " return locks\n",
+ "def get_df_accounts(spark):\n",
+ " \"\"\"Get Spark dataframe of Accounts\n",
+ " \"\"\"\n",
+ " today = datetime.today().strftime('%Y-%m-%d')\n",
+ " hdfs_rucio_accounts = f'/project/awg/cms/rucio/{today}/accounts/part*.avro'\n",
+ " df_accounts = spark.read.format(\"avro\").load(hdfs_rucio_accounts) \\\n",
+ " .filter(col('DELETED_AT').isNull()) \\\n",
+ " .withColumnRenamed('ACCOUNT', 'account_name') \\\n",
+ " .withColumnRenamed('ACCOUNT_TYPE', 'account_type') \\\n",
+ " .select(['account_name', 'account_type'])\n",
+ " return df_accounts\n",
+ "def get_df_rules(spark):\n",
+ " \"\"\"Get Spark dataframe of rules\n",
+ " \"\"\"\n",
+ " hdfs_rules_path = '/project/awg/cms/rucio/{}/rules/part*.avro'.format(datetime.today().strftime('%Y-%m-%d'))\n",
+ " return spark.read.format('avro').load(hdfs_rules_path) \\\n",
+ " .filter(col('SCOPE') == 'cms') \\\n",
+ " .withColumnRenamed('name', 'r_name') \\\n",
+ " .withColumn('r_id', lower(_hex(col('ID')))) \\\n",
+ " .withColumn('s_id', lower(_hex(col('SUBSCRIPTION_ID')))) \\\n",
+ " .withColumnRenamed('ACTIVITY', 'activity') \\\n",
+ " .withColumnRenamed('STATE', 'rule_state') \\\n",
+ " .withColumnRenamed('RSE_EXPRESSION', 'rse_expression') \\\n",
+ " .select(['r_name','r_id', 's_id', 'activity', 'rule_state', 'rse_expression']) \n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e271b1c8",
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "# add data_tier field\n",
+ "df_rses = get_df_rses(spark)\n",
+ "df_locks = get_df_locks(spark)\n",
+ "df_accounts = get_df_accounts(spark)\n",
+ "df_rules = get_df_rules(spark)\n",
+ "tb_denominator = 10 ** 12\n",
+ "locks = df_locks.join(df_rses, ['rse_id'], how='left') \\\n",
+ " .filter(col('rse_kind') == 'prod') \\\n",
+ " .select(['f_name', 'f_size', 'RSE', 'rse_type', 'account_name', 'r_id']) \n",
+ "\n",
+ "locks_with_activity = (\n",
+ " locks.join(df_rules, ['r_id'], how='leftouter')\n",
+ " .select(['f_name', 'account_name', 'RSE', 'rse_type', 'f_size', 'activity', 'r_name'])\n",
+ " .withColumn('data_tier', regexp_extract('r_name', r'^\\/([\\w-]+)\\/([\\w-]+)\\/([\\w-]+)(#[\\w-]+)?', 3))\n",
+ " .select(['f_name', 'account_name', 'RSE', 'rse_type', 'f_size', 'activity', 'data_tier'])\n",
+ ")\n",
+ "\n",
+ "timestamp = int(time.time())\n",
+ "\n",
+ "# A File locked by the user for two activities is accounted to both activities\n",
+ "# A File locked by two users for the same activity is accounted to both Users\n",
+ "user_aggreagated = locks_with_activity \\\n",
+ " .select(['f_name', 'f_size', 'RSE', 'rse_type', 'account_name', 'activity', 'data_tier']) \\\n",
+ " .distinct() \\\n",
+ " .groupby(['RSE', 'rse_type', 'account_name', 'activity', 'data_tier']) \\\n",
+ " .agg(_round(_sum(col('f_size')) / tb_denominator, 5).alias('total_locked')) \\\n",
+ " .join(df_accounts, ['account_name'], how='left') \\\n",
+ " .withColumnRenamed('RSE', 'rse_name') \\\n",
+ " .withColumn('timestamp', lit(timestamp)) \\\n",
+ " .select(['total_locked', 'rse_name', 'rse_type', 'account_name', 'account_type', 'activity', 'data_tier', 'timestamp']) \\\n",
+ " .cache()\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "15c3ff28",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "user_aggreagated.show(10, False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f7e98534",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "user_aggreagated.count()"
+ ]
+ }
+ ],
+ "metadata": {
+ "@webio": {
+ "lastCommId": null,
+ "lastKernelId": null
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.12"
+ },
+ "sparkconnect": {
+ "bundled_options": [],
+ "list_of_options": [
+ {
+ "name": "spark.jars.packages",
+ "value": "org.apache.spark:spark-avro_2.12:3.5.0"
+ },
+ {
+ "name": "spark.executor.instances",
+ "value": "20"
+ }
+ ]
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/src/script/Monitor/crab-spark/notebooks/crab_tape_recall_history.ipynb b/src/script/Monitor/crab-spark/notebooks/crab_tape_recall_history.ipynb
new file mode 100644
index 0000000000..3ed0a6e890
--- /dev/null
+++ b/src/script/Monitor/crab-spark/notebooks/crab_tape_recall_history.ipynb
@@ -0,0 +1,572 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "2ecefbb5",
+ "metadata": {},
+ "source": [
+ "# CRAB Spark tape recall history\n",
+ "\n",
+ "This jobs is querying `rules_history` table of cmsrucio to answer theses questions:\n",
+ "- How long do tasks stay in “taperecall”?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5e9af689",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from datetime import datetime, timedelta, timezone\n",
+ "import os\n",
+ "import time\n",
+ "import pandas as pd\n",
+ "\n",
+ "from pyspark import SparkContext, StorageLevel\n",
+ "from pyspark.sql import SparkSession\n",
+ "from pyspark.sql.functions import (\n",
+ " current_user,\n",
+ " col, collect_list, concat_ws, greatest, lit, lower, when,\n",
+ " avg as _avg,\n",
+ " count as _count,\n",
+ " hex as _hex,\n",
+ " max as _max,\n",
+ " min as _min,\n",
+ " round as _round,\n",
+ " sum as _sum,\n",
+ ")\n",
+ "from pyspark.sql.types import (\n",
+ " StructType,\n",
+ " LongType,\n",
+ " StringType,\n",
+ " StructField,\n",
+ " DoubleType,\n",
+ " IntegerType,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "22946659",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "spark = SparkSession\\\n",
+ " .builder\\\n",
+ " .appName('tape-recall-history')\\\n",
+ " .getOrCreate()\n",
+ "spark"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "014b13c8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "spark.catalog.clearCache()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "31c19eb0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# arguments\n",
+ "# secret path, also check if file exists\n",
+ "secretpath = os.environ.get('OPENSEARCH_SECRET_PATH', f'{os.getcwd()}/../workdir/secret_opensearch.txt')\n",
+ "if not os.path.isfile(secretpath): \n",
+ " raise Exception(f'OS secrets file {secretpath} does not exists')\n",
+ "# if PROD, index prefix will be `crab-*`, otherwise `crab-test-*`\n",
+ "PROD = os.environ.get('PROD', 'false').lower() in ('true', '1', 't')\n",
+ "# FROM_DATE, in strptime(\"%Y-%m-%d\")\n",
+ "START = os.environ.get('START_DATE', None) \n",
+ "END = os.environ.get('END_DATE', None)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e843eb6d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# try to import libs from current directory, fallback to $PWD/../workdir if not found\n",
+ "try:\n",
+ " from crabspark_utils import get_candidate_files, send_os, send_os_parallel\n",
+ "except ModuleNotFoundError:\n",
+ " import sys\n",
+ " sys.path.insert(0, f'{os.getcwd()}/../workdir')\n",
+ " from crabspark_utils import get_candidate_files, send_os, send_os_parallel\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2c644790",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# variables for run inside notebook\n",
+ "START_DATE = \"2020-01-01\"\n",
+ "END_DATE = \"2024-10-01\"\n",
+ "# if cronjob, replace constant with value from env\n",
+ "if START and END:\n",
+ " START_DATE = START\n",
+ " END_DATE = END"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d608eab0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# index name\n",
+ "index_name = 'tape-recall-history' # always put test index prefix\n",
+ "# use prod index pattern if this execution is for production\n",
+ "if PROD:\n",
+ " index_name = f'crab-prod-{index_name}'\n",
+ "else:\n",
+ " index_name = f'crab-test-{index_name}'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b17ed53f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# datetime object\n",
+ "start_datetime = datetime.strptime(START_DATE, \"%Y-%m-%d\").replace(tzinfo=timezone.utc)\n",
+ "end_datetime = datetime.strptime(END_DATE, \"%Y-%m-%d\").replace(tzinfo=timezone.utc)\n",
+ "# sanity check\n",
+ "if end_datetime < start_datetime: \n",
+ " raise Exception(f\"end date ({END_DATE}) is less than start date ({START_DATE})\")\n",
+ "start_epochmilis = int(start_datetime.timestamp()) * 1000\n",
+ "end_epochmilis = int(end_datetime.timestamp()) * 1000\n",
+ "yesterday_epoch = int((end_datetime-timedelta(days=1)).timestamp())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9404c437",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# debug\n",
+ "print(START_DATE, \n",
+ " END_DATE, \n",
+ " index_name,\n",
+ " sep='\\n')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3e85c2f0",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# Import data into spark\n",
+ "\n",
+ "HDFS_RUCIO_RULES_HISTORY = f'/project/awg/cms/rucio/{END_DATE}/rules_history/'\n",
+ "\n",
+ "print(\"===============================================\"\n",
+ " , \"RUCIO : Rules History\"\n",
+ " , \"===============================================\"\n",
+ " , \"File Directory:\", HDFS_RUCIO_RULES_HISTORY\n",
+ " , \"Work Directory:\", os.getcwd()\n",
+ " , \"===============================================\"\n",
+ " , \"===============================================\", sep='\\n')\n",
+ "\n",
+ "# we only interest in the rules where state does not change anymore.\n",
+ "# which means, only the rules that already expired.\n",
+ "rucio_rules_history = (\n",
+ " spark.read.format('avro').load(HDFS_RUCIO_RULES_HISTORY).withColumn('ID', lower(_hex(col('ID'))))\n",
+ " .select(\"ID\", \"ACCOUNT\", \"NAME\", \"STATE\", \"EXPIRES_AT\", \"UPDATED_AT\", \"CREATED_AT\")\n",
+ " .filter(f\"\"\"\\\n",
+ " 1=1\n",
+ " AND ACTIVITY = 'Analysis TapeRecall'\n",
+ " AND EXPIRES_AT >= {start_epochmilis}\n",
+ " AND EXPIRES_AT < {end_epochmilis}\n",
+ " \"\"\")\n",
+ " .cache()\n",
+ ")\n",
+ "rucio_rules_history.createOrReplaceTempView(\"rules_history\")\n",
+ "\n",
+ "HDFS_CRAB_part = f'/project/awg/cms/crab/tasks/{END_DATE}/'\n",
+ "print(\"===============================================\"\n",
+ " , \"CRAB Table\"\n",
+ " , \"===============================================\"\n",
+ " , \"File Directory:\", HDFS_CRAB_part\n",
+ " , \"Work Directory:\", os.getcwd()\n",
+ " , \"===============================================\"\n",
+ " , \"===============================================\", sep='\\n')\n",
+ "\n",
+ "# do not filter taskdb by create time (TM_START_TIME) because it is possible that rules are created 6 months ago\n",
+ "tasks_df = (\n",
+ " spark.read.format('avro').load(HDFS_CRAB_part)\n",
+ " .select(\"TM_TASKNAME\",\"TM_START_TIME\",\"TM_TASK_STATUS\", 'TM_TASKNAME', 'TM_START_TIME', 'TM_TASK_STATUS' , 'TM_DDM_REQID')\n",
+ " .cache()\n",
+ ")\n",
+ "tasks_df.createOrReplaceTempView(\"tasks\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d0ad6c09",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# rucio append new row to rules_history when the content rules table change (not sure the exact condition)\n",
+ "# We need to get \"the latest\" row for each rules by:\n",
+ "# - If rule has state \"O\", select the earliest UPDATED_AT row.\n",
+ "# For the OK rule, we can calculate number of days using UPDATED_AT-CREATED_AT. \n",
+ "# However, there are some posiblility that rucio append new entry with newer UPDATED_AT (For exmple 37fcada73f14439b88558ef792e10276)\n",
+ "# - If not, select the latest UPDATED_AT row.\n",
+ "# This because the rules still in temporary state, and the rules will go to the end state \n",
+ "# (not the real state, but rules_history will not getting new row anymore) after rules is expired \n",
+ "# So, we can calculate number of day by EXPIRES_AT-CREATED_AT\n",
+ "#\n",
+ "# Here is the step to translate above condition to SQL (in the buttom-up manner)\n",
+ "# 1. count number of row where the state is 'O'.\n",
+ "# 2. left join the rule history by ID, so each row will have number of state O \n",
+ "# New table look like this:\n",
+ "# +--------------------------------+-----+-------------+-------+\n",
+ "# |ID |STATE|EXPIRES_AT |state_o|\n",
+ "# +--------------------------------+-----+-------------+-------+\n",
+ "# |6d275222b43d431abc568dd83313118f|R |1727244523000|1 |\n",
+ "# |875a388ca374407ea761689511078956|R |1727339056000|1 |\n",
+ "# |dfe4012bcb9c448f98f940f01302ae6e|R |1727234937000|0 |\n",
+ "# |dfe4012bcb9c448f98f940f01302ae6e|R |1725402537000|0 |\n",
+ "# |c6859b18a771440ab906733e2bebf78a|R |1727235038000|1 |\n",
+ "# \n",
+ "# 3. select the earliest row for \"the rule that have state O\" (where clause). this can be done by windows function, sort by UPDATED_AT ascending for each ID, then filter only row_number \"1\"\n",
+ "# 4. select the latest row for \"the rule that does not have state O at all\". \n",
+ "# This is a bit tricky but can be done by filter out the rule that have number of state O more than zero.\n",
+ "# which this column already availabe from left join in step 2.\n",
+ "# For the \"select latest row\" we do the same way as 4. but sort by UPDATED_AT descending instead.\n",
+ "# 5. merge result from 3. and 4 with UNION ALL.\n",
+ "# 6. Then, we will calculate number of date in the next step\n",
+ "#\n",
+ "# We are selecting the rules for each condition and join later, to avoid large broadcasthashjoin internally\n",
+ "# I (Wa) tried this before and it cause above issue, but I might be wrong here though.\n",
+ "# ```\n",
+ "# SELECT * FROM rhistinfo_t \n",
+ "# WHERE (state_o > 0) \n",
+ "# OR (ID NOT IN (SELECT ID FROM (SELECT * FROM rhistinfo_t WHERE state_o > 0)))\n",
+ "# ```\n",
+ "# \n",
+ "\n",
+ "query = f\"\"\"\\\n",
+ "WITH \n",
+ "count_t AS (\n",
+ "SELECT ID, \n",
+ " SUM(CASE WHEN state = 'O' THEN 1 ELSE 0 END) AS state_o\n",
+ "FROM rules_history\n",
+ "GROUP BY ID\n",
+ "),\n",
+ "rhistinfo_t AS (\n",
+ "SELECT rules_history.ID AS ID, \n",
+ " rules_history.ACCOUNT AS ACCOUNT, \n",
+ " rules_history.NAME AS NAME, \n",
+ " rules_history.STATE AS STATE, \n",
+ " rules_history.EXPIRES_AT AS EXPIRES_AT, \n",
+ " rules_history.UPDATED_AT AS UPDATED_AT, \n",
+ " rules_history.CREATED_AT AS CREATED_AT,\n",
+ " count_t.state_o AS state_o\n",
+ "FROM rules_history\n",
+ "LEFT JOIN count_t ON rules_history.ID = count_t.ID\n",
+ "),\n",
+ "tmpwindow_1 AS (\n",
+ "SELECT *, row_number() over(partition by ID order by UPDATED_AT) as row_num\n",
+ "FROM rhistinfo_t\n",
+ "WHERE STATE = 'O'\n",
+ "), \n",
+ "r1 AS (\n",
+ "SELECT * FROM tmpwindow_1\n",
+ "WHERE row_num = 1\n",
+ "),\n",
+ "tmpwindow_2 AS (\n",
+ "SELECT *, row_number() over(partition by ID order by UPDATED_AT DESC) as row_num\n",
+ "FROM rhistinfo_t\n",
+ "WHERE STATE != 'O' AND state_o = 0\n",
+ "),\n",
+ "r2 AS (\n",
+ "SELECT * FROM tmpwindow_2\n",
+ "WHERE row_num = 1\n",
+ "),\n",
+ "r_all AS (\n",
+ "SELECT * FROM r1\n",
+ "UNION ALL\n",
+ "SELECT * FROM r2\n",
+ ")\n",
+ "SELECT * \n",
+ "FROM r_all\n",
+ "ORDER BY ID\n",
+ "\"\"\"\n",
+ "\n",
+ "tmprules = spark.sql(query)\n",
+ "tmprules.show(10, False)\n",
+ "tmprules.createOrReplaceTempView(\"tmprules\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "32dd41b1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Calculate number of days, for state O, UPDATED_AT-CREATED_AT, otherwise EXPIRES_AT-CREATED_AT\n",
+ "# then enrich the data with the crab taskdb table by join rule ID with TM_DDM_REQID column\n",
+ "# need to apply windows function again to select only the rule id with the latest crab tasks\n",
+ "\n",
+ "query = f\"\"\"\\\n",
+ "WITH \n",
+ "calc_days_t AS (\n",
+ "SELECT ID, ACCOUNT, NAME, STATE, EXPIRES_AT, UPDATED_AT, CREATED_AT,\n",
+ " CASE \n",
+ " WHEN STATE = 'O' THEN ceil((UPDATED_AT-CREATED_AT)/86400000) \n",
+ " ELSE ceil((EXPIRES_AT-CREATED_AT)/86400000)\n",
+ " END AS DAYS\n",
+ "FROM tmprules\n",
+ "),\n",
+ "join_t AS (\n",
+ "SELECT \n",
+ " calc_days_t.ID AS ID, \n",
+ " calc_days_t.ACCOUNT AS ACCOUNT, \n",
+ " calc_days_t.NAME AS NAME, \n",
+ " calc_days_t.STATE AS STATE, \n",
+ " calc_days_t.DAYS AS DAYS, \n",
+ " calc_days_t.EXPIRES_AT AS EXPIRES_AT, \n",
+ " calc_days_t.UPDATED_AT AS UPDATED_AT, \n",
+ " calc_days_t.CREATED_AT AS CREATED_AT, \n",
+ " tasks.TM_TASKNAME AS TM_TASKNAME,\n",
+ " IFNULL(tasks.TM_START_TIME, 0) AS TM_START_TIME, \n",
+ " tasks.TM_TASK_STATUS AS TM_TASK_STATUS\n",
+ "FROM calc_days_t\n",
+ "LEFT JOIN tasks ON calc_days_t.ID = tasks.TM_DDM_REQID\n",
+ "),\n",
+ "window_t AS (\n",
+ "SELECT ID, ACCOUNT, NAME, STATE, DAYS, EXPIRES_AT, UPDATED_AT, CREATED_AT, TM_TASKNAME, TM_START_TIME, TM_TASK_STATUS, \n",
+ " row_number() OVER (PARTITION BY ID ORDER BY TM_START_TIME DESC) AS row_num\n",
+ "FROM join_t \n",
+ "),\n",
+ "uniqueid_t AS (\n",
+ "SELECT *\n",
+ "FROM window_t \n",
+ "WHERE row_num = 1\n",
+ "), \n",
+ "finalize_t AS (\n",
+ "SELECT ID, ACCOUNT, NAME, STATE, DAYS, EXPIRES_AT, UPDATED_AT, CREATED_AT, TM_TASKNAME, IFNULL(TM_START_TIME, 0) as TM_START_TIME, TM_TASK_STATUS, \n",
+ " EXPIRES_AT AS timestamp,\n",
+ " 'tape_recall_history' AS type\n",
+ "FROM uniqueid_t \n",
+ ")\n",
+ "SELECT *\n",
+ "FROM finalize_t\n",
+ "\"\"\"\n",
+ "\n",
+ "tmpdf = spark.sql(query)\n",
+ "tmpdf.show(10, False)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "df979012",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tmpdf.count()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c33dfce3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "docs = tmpdf.toPandas().to_dict('records')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "eee4a1f3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "schema = {\n",
+ " \"settings\": {\"index\": {\"number_of_shards\": \"1\", \"number_of_replicas\": \"1\"}},\n",
+ " \"mappings\": {\n",
+ " \"properties\": {\n",
+ " \"ID\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"ACCOUNT\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"NAME\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"STATE\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"DAYS\": {\"type\": \"long\"},\n",
+ " \"EXPIRES_AT\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
+ " \"UPDATED_AT\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
+ " \"CREATED_AT\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
+ " \"TM_TASKNAME\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"TM_START_TIME\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
+ " \"TM_TASK_STATUS\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"type\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"timestamp\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
+ " }\n",
+ "\n",
+ " }\n",
+ "\n",
+ " }\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9ec824ee",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# this is simple workaround osearch bug when work in notebook because\n",
+ "# - it load the secret once and use forever\n",
+ "# - get_or_create_index() create index+schema only the first time it execute\n",
+ "# it is safe to run again even in cronjobs \n",
+ "import importlib\n",
+ "import osearch\n",
+ "importlib.reload(osearch)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6cdc83dd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "osearch.send_os(docs, index_name, schema, secretpath, yesterday_epoch)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "22747a3f",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# Add a single doc to es everyday to check if pipeline is running successfully.\n",
+ "# This is need because we did not have rule that expires everyday\n",
+ "# Remember to filter it out in grafana (For example `NOT ID:00000000000000000` in lucene query)\n",
+ "day = start_datetime\n",
+ "monitoring_docs = []\n",
+ "while day < end_datetime:\n",
+ " milisec = int(day.timestamp())*1000\n",
+ " doc = {\n",
+ " \"ID\": '00000000000000000',\n",
+ " \"ACCOUNT\": 'cmscrab',\n",
+ " \"NAME\": '/Pipeline/Monitoring/AOD',\n",
+ " \"STATE\": 'P',\n",
+ " \"DAYS\": -1,\n",
+ " \"EXPIRES_AT\": milisec,\n",
+ " \"UPDATED_AT\": milisec,\n",
+ " \"CREATED_AT\": milisec,\n",
+ " \"TM_TASKNAME\": '240000_000000:cmscrab_crab_20240000_000000',\n",
+ " \"TM_START_TIME\": milisec,\n",
+ " \"TM_TASK_STATUS\": 'PLACEHOLDER',\n",
+ " \"type\": 'tape_recall_history',\n",
+ " \"timestamp\": milisec,\n",
+ "\n",
+ " }\n",
+ " monitoring_docs.append(doc)\n",
+ " day += timedelta(days=1)\n",
+ "send_os(monitoring_docs, index_name, schema, secretpath, yesterday_epoch)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4a24e4ff",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Useful query to get only the rules that gave\n",
+ "#query = f\"\"\"\\\n",
+ "#repeated_ids AS (\n",
+ "# SELECT ID\n",
+ "# FROM rules_history\n",
+ "# GROUP BY ID\n",
+ "# HAVING COUNT(*) > 2\n",
+ "#),\n",
+ "#tba_t AS (\n",
+ "#SELECT *\n",
+ "#FROM rules_history\n",
+ "#)\n",
+ "#SELECT * FROM tba_t\n",
+ "#\"\"\"\n",
+ "#\n",
+ "#testdf = spark.sql(query)\n",
+ "#testdf.show(100, False)\n",
+ "#\n",
+ "# rule 37fc where latest UPDATED_AT is 43 days after the first OK state\n",
+ "#spark.sql(\"\"\"\\\n",
+ "#SELECT * FROM rules_history\n",
+ "#WHERE ID = '37fcada73f14439b88558ef792e10276'\n",
+ "#\"\"\").show(10, False)"
+ ]
+ }
+ ],
+ "metadata": {
+ "@webio": {
+ "lastCommId": null,
+ "lastKernelId": null
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.12"
+ },
+ "sparkconnect": {
+ "bundled_options": [],
+ "list_of_options": [
+ {
+ "name": "spark.jars.packages",
+ "value": "org.apache.spark:spark-avro_2.12:3.5.0"
+ },
+ {
+ "name": "spark.executor.instances",
+ "value": "20"
+ }
+ ]
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/src/script/Monitor/crab-spark/notebooks/crab_tape_recall_rules_history.ipynb b/src/script/Monitor/crab-spark/notebooks/crab_tape_recall_rules_history.ipynb
deleted file mode 100644
index 20f441f4ce..0000000000
--- a/src/script/Monitor/crab-spark/notebooks/crab_tape_recall_rules_history.ipynb
+++ /dev/null
@@ -1,726 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "2fe94c82",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "9f91521a",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
-      "SparkSession - in-memory\n",
-      "SparkContext: Spark UI, Version v3.3.2, Master yarn, AppName pyspark_shell_swan\n"
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 1,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "666f70d9",
- "metadata": {},
- "outputs": [],
- "source": [
- "# !hdfs dfs -stat /project/awg/cms/rucio/2023-07-31/"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "bd6751a6",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "23/08/01 17:05:11 WARN ipc.Client: Exception encountered while connecting to the server \n",
- "org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.ipc.StandbyException): Operation category READ is not supported in state standby. Visit https://s.apache.org/sbnn-error\n",
- "\tat org.apache.hadoop.security.SaslRpcClient.saslConnect(SaslRpcClient.java:376)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection.setupSaslConnection(Client.java:622)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection.access$2300(Client.java:413)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection$2.run(Client.java:822)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection$2.run(Client.java:818)\n",
- "\tat java.base/java.security.AccessController.doPrivileged(Native Method)\n",
- "\tat java.base/javax.security.auth.Subject.doAs(Subject.java:423)\n",
- "\tat org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1845)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection.setupIOstreams(Client.java:818)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection.access$3800(Client.java:413)\n",
- "\tat org.apache.hadoop.ipc.Client.getConnection(Client.java:1636)\n",
- "\tat org.apache.hadoop.ipc.Client.call(Client.java:1452)\n",
- "\tat org.apache.hadoop.ipc.Client.call(Client.java:1405)\n",
- "\tat org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:234)\n",
- "\tat org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:119)\n",
- "\tat com.sun.proxy.$Proxy12.getFileInfo(Unknown Source)\n",
- "\tat org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.getFileInfo(ClientNamenodeProtocolTranslatorPB.java:964)\n",
- "\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n",
- "\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n",
- "\tat java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n",
- "\tat java.base/java.lang.reflect.Method.invoke(Method.java:566)\n",
- "\tat org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:422)\n",
- "\tat org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeMethod(RetryInvocationHandler.java:165)\n",
- "\tat org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invoke(RetryInvocationHandler.java:157)\n",
- "\tat org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeOnce(RetryInvocationHandler.java:95)\n",
- "\tat org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:359)\n",
- "\tat com.sun.proxy.$Proxy13.getFileInfo(Unknown Source)\n",
- "\tat org.apache.hadoop.hdfs.DFSClient.getFileInfo(DFSClient.java:1731)\n",
- "\tat org.apache.hadoop.hdfs.DistributedFileSystem$29.doCall(DistributedFileSystem.java:1725)\n",
- "\tat org.apache.hadoop.hdfs.DistributedFileSystem$29.doCall(DistributedFileSystem.java:1722)\n",
- "\tat org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)\n",
- "\tat org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:1737)\n",
- "\tat org.apache.hadoop.fs.Globber.getFileStatus(Globber.java:115)\n",
- "\tat org.apache.hadoop.fs.Globber.doGlob(Globber.java:349)\n",
- "\tat org.apache.hadoop.fs.Globber.glob(Globber.java:202)\n",
- "\tat org.apache.hadoop.fs.FileSystem.globStatus(FileSystem.java:2093)\n",
- "\tat org.apache.hadoop.fs.shell.PathData.expandAsGlob(PathData.java:353)\n",
- "\tat org.apache.hadoop.fs.shell.Command.expandArgument(Command.java:250)\n",
- "\tat org.apache.hadoop.fs.shell.Command.expandArguments(Command.java:233)\n",
- "\tat org.apache.hadoop.fs.shell.FsCommand.processRawArguments(FsCommand.java:104)\n",
- "\tat org.apache.hadoop.fs.shell.Command.run(Command.java:177)\n",
- "\tat org.apache.hadoop.fs.FsShell.run(FsShell.java:327)\n",
- "\tat org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:76)\n",
- "\tat org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:90)\n",
- "\tat org.apache.hadoop.fs.FsShell.main(FsShell.java:390)\n",
- "Found 41 items\n",
- "-rw-r--r-x+ 3 cmssqoop c3 0 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/_SUCCESS\n",
- "-rw-r--r-x+ 3 cmssqoop c3 88187830 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00000.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 78573788 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00001.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 89288020 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00002.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 87120186 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00003.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 84145506 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00004.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 77023084 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00005.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 82231949 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00006.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 90427579 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00007.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 83505019 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00008.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 81737327 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00009.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 89063315 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00010.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 87547076 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00011.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 76025866 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00012.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 86124517 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00013.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 84209698 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00014.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 87883924 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00015.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 84024611 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00016.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 88549765 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00017.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 78591247 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00018.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 88304711 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00019.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 84004574 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00020.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 84661738 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00021.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 78502498 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00022.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 91523366 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00023.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 77450183 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00024.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 92852942 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00025.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 85201132 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00026.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 83220428 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00027.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 72640822 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00028.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 74597749 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00029.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 83142949 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00030.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 86601475 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00031.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 90497549 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00032.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 88555030 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00033.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 78799199 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00034.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 80642314 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00035.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 85967465 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00036.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 92843317 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00037.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 83861741 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00038.avro\n",
- "-rw-r--r-x+ 3 cmssqoop c3 91545885 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00039.avro\n"
- ]
- }
- ],
- "source": [
- "!hdfs dfs -ls /project/awg/cms/rucio/2023-07-24/rules_history #02:54:14"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "800a2f9e",
- "metadata": {},
- "outputs": [],
- "source": [
- "import pickle\n",
- "from datetime import datetime, timedelta\n",
- "\n",
- "import click\n",
- "import os\n",
- "import pandas as pd\n",
- "import pprint\n",
- "import time\n",
- "from dateutil.relativedelta import relativedelta\n",
- "from pyspark import SparkContext, StorageLevel\n",
- "from pyspark.sql import SparkSession\n",
- "from pyspark.sql.functions import (\n",
- " col, collect_list, concat_ws, greatest, lit, lower, when,\n",
- " avg as _avg,\n",
- " count as _count,\n",
- " hex as _hex,\n",
- " max as _max,\n",
- " min as _min,\n",
- " round as _round,\n",
- " sum as _sum,\n",
- ")\n",
- "\n",
- "from pyspark.sql.types import (\n",
- " LongType,\n",
- ")\n",
- "\n",
- "#from CMSSpark.src.python.CMSSpark import schemas as cms_schemas"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "6951caed",
- "metadata": {},
- "outputs": [],
- "source": [
- "#from CMSSpark import schemas as cms_schemas"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "4e78c524",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "markdown",
- "id": "e597820f",
- "metadata": {},
- "source": [
- "## load dataset"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "id": "2c100a92",
- "metadata": {},
- "outputs": [],
- "source": [
- "# end_date = str(datetime.now())[:10]\n",
- "# start_date = str(datetime.now()-timedelta(days=1))[:10]\n",
- "\n",
- "end_date = '2023-07-31'\n",
- "start_date = '2023-07-01'\n",
- "\n",
- "TOYEAR = end_date[:4]\n",
- "\n",
- "wa_date = end_date\n",
- "HDFS_RUCIO_DATASET_LOCKS = f'/project/awg/cms/rucio/{wa_date}/dataset_locks/part*.avro'\n",
- "HDFS_RUCIO_LOCKS = f'/project/awg/cms/rucio/{wa_date}/locks'\n",
- "HDFS_RUCIO_RSES = f'/project/awg/cms/rucio/{wa_date}/rses/part*.avro'\n",
- "HDFS_RUCIO_RULES = f'/project/awg/cms/rucio/{wa_date}/rules'\n",
- "HDFS_RUCIO_RULES_HISTORY = f'/project/awg/cms/rucio/{wa_date}/rules_history'\n",
- "HDFS_RUCIO_REPLICAS = f'/project/awg/cms/rucio/{wa_date}/replicas'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "fe62d431",
- "metadata": {},
- "outputs": [],
- "source": [
- "# rucio_dataset_locks = spark.read.format('avro').load(HDFS_RUCIO_DATASET_LOCKS)\\\n",
- "# .withColumn('BYTES', col('BYTES').cast(LongType()))\\\n",
- "# .withColumn('RULE_ID', lower(_hex(col('RULE_ID'))))\\\n",
- "# .withColumn('RSE_ID', lower(_hex(col('RSE_ID'))))\n",
- "# rucio_dataset_locks.createOrReplaceTempView(\"dataset_locks\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "b2e4fcfa",
- "metadata": {},
- "outputs": [],
- "source": [
- "# rucio_rses = spark.read.format('avro').load(HDFS_RUCIO_RSES)\\\n",
- "# .withColumn('ID', lower(_hex(col('ID'))))\n",
- "# rucio_rses.createOrReplaceTempView(\"rses\")\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "id": "3893197e",
- "metadata": {},
- "outputs": [],
- "source": [
- "# rucio_rules = spark.read.format('avro').load(HDFS_RUCIO_RULES)\\\n",
- "# .withColumn('ID', lower(_hex(col('ID'))))\n",
- "# rucio_rules.createOrReplaceTempView(\"rules\")\n",
- "# #spark.sql(\"SELECT * FROM rules\").count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "id": "f9f2ba4e",
- "metadata": {},
- "outputs": [],
- "source": [
- "# rucio_locks = spark.read.format('avro').load(HDFS_RUCIO_LOCKS)\\\n",
- "# .withColumn('BYTES', col('BYTES').cast(LongType()))\\\n",
- "# .withColumn('RULE_ID', lower(_hex(col('RULE_ID'))))\\\n",
- "# .withColumn('RSE_ID', lower(_hex(col('RSE_ID'))))\n",
- "# rucio_locks.createOrReplaceTempView(\"locks\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "id": "7771b12d",
- "metadata": {},
- "outputs": [],
- "source": [
- "rucio_rules_history = spark.read.format('avro').load(HDFS_RUCIO_RULES_HISTORY)\\\n",
- " .withColumn('ID', lower(_hex(col('ID'))))\n",
- " #.persist(StorageLevel.DISK_ONLY)\n",
- "rucio_rules_history = rucio_rules_history.select(\"ID\", \"NAME\", \"STATE\", \"EXPIRES_AT\", \"UPDATED_AT\", \"CREATED_AT\", \"ACCOUNT\")\n",
- "rucio_rules_history.createOrReplaceTempView(\"rules_history\")\n",
- "#spark.sql(\"SELECT * FROM rules_history\").count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "id": "274421b8",
- "metadata": {},
- "outputs": [],
- "source": [
- "# rucio_replicas = spark.read.format('avro').load(HDFS_RUCIO_REPLICAS)\\\n",
- "# .withColumn('RSE_ID', lower(_hex(col('RSE_ID'))))\n",
- "# rucio_replicas.createOrReplaceTempView(\"replicas\")\n",
- "# #spark.sql(\"SELECT * FROM replicas\").count()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "5c84635f",
- "metadata": {},
- "source": [
- "## Query"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "ee99f580",
- "metadata": {},
- "source": [
- "# how long does it take ?"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "id": "26120cd9",
- "metadata": {
- "scrolled": false
- },
- "outputs": [],
- "source": [
- "# # NOTE: days is ceiling\n",
- "\n",
- "# spark.sql(\"\"\"\n",
- "# WITH filter_t AS (\n",
- "# SELECT ID, NAME, STATE, EXPIRES_AT, UPDATED_AT, CREATED_AT\n",
- "# FROM rules_history \n",
- "# WHERE 1=1\n",
- "# AND ACCOUNT = \"crab_tape_recall\"\n",
- "# --- we look at the rule created this year (2023)\n",
- "# AND CREATED_AT >= unix_timestamp(\"2023-01-01 00:00:00\", \"yyyy-MM-dd HH:mm:ss\")*1000 \n",
- "# ),\n",
- "# rn_t AS (\n",
- "# SELECT ID, NAME, STATE, EXPIRES_AT, UPDATED_AT, CREATED_AT,\n",
- "# row_number() over(partition by ID order by UPDATED_AT desc) as rn --- to get only latest state for each id\n",
- "# FROM filter_t\n",
- "# ),\n",
- "# calc_days_t AS (\n",
- "# SELECT ID, NAME, STATE, \n",
- "# from_unixtime(EXPIRES_AT/1000, 'yyyy-MM-dd HH:mm:ss') AS EXPIRES_AT, \n",
- "# from_unixtime(UPDATED_AT/1000, 'yyyy-MM-dd HH:mm:ss') AS UPDATED_AT, \n",
- "# from_unixtime(CREATED_AT/1000, 'yyyy-MM-dd HH:mm:ss') AS CREATED_AT,\n",
- "# --- if state is O we calculate from update_at when state change (assumed that there is only single row for O state)\n",
- "# --- but if state is not O, we calculate from expired time, it usually 14 days but it is possible that rules somehow got extend\n",
- "# --- other wise days = 0 for filter the rules that not expire \n",
- "# CASE \n",
- "# WHEN STATE = 'O' THEN ceil((UPDATED_AT-CREATED_AT)/86400000) \n",
- "# WHEN STATE != 'O' AND EXPIRES_AT < unix_timestamp(\"2023-05-22 00:00:00\", \"yyyy-MM-dd HH:mm:ss\")*1000 THEN ceil((EXPIRES_AT-CREATED_AT)/86400000)\n",
- "# ELSE 0\n",
- "# END AS DAYS\n",
- "# FROM rn_t\n",
- "# WHERE rn = 1\n",
- "# )\n",
- "# SELECT * \n",
- "# FROM calc_days_t\n",
- "# ---AND STATE == 'O'\n",
- "# \"\"\"\n",
- "# ).show(50,truncate=False)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "id": "fadde59c",
- "metadata": {},
- "outputs": [],
- "source": [
- "## query use to produce data to elasticsearch\n",
- "\n",
- "query = f\"\"\"\\\n",
- "WITH filter_t AS (\n",
- "SELECT ID, NAME, STATE, EXPIRES_AT, UPDATED_AT, CREATED_AT\n",
- "FROM rules_history \n",
- "WHERE 1=1\n",
- "AND ACCOUNT = \"crab_tape_recall\"\n",
- "AND CREATED_AT >= unix_timestamp(\"{TOYEAR}-01-01 00:00:00\", \"yyyy-MM-dd HH:mm:ss\")*1000\n",
- "),\n",
- "rn_t AS (\n",
- "SELECT ID, NAME, STATE, EXPIRES_AT, UPDATED_AT, CREATED_AT,\n",
- "row_number() over(partition by ID order by UPDATED_AT desc) as rn\n",
- "FROM filter_t\n",
- "),\n",
- "calc_days_t AS (\n",
- "SELECT ID, NAME, STATE, EXPIRES_AT, UPDATED_AT, CREATED_AT,\n",
- " CASE \n",
- " WHEN STATE = 'O' THEN ceil((UPDATED_AT-CREATED_AT)/86400000) \n",
- " WHEN STATE != 'O' AND EXPIRES_AT < unix_timestamp(\"{wa_date} 00:00:00\", \"yyyy-MM-dd HH:mm:ss\")*1000 THEN ceil((EXPIRES_AT-CREATED_AT)/86400000)\n",
- " ELSE 0\n",
- " END AS DAYS\n",
- "FROM rn_t\n",
- "WHERE rn = 1\n",
- ")\n",
- "SELECT * \n",
- "FROM calc_days_t\n",
- "WHERE 1=1\n",
- "AND EXPIRES_AT >= unix_timestamp(\"{start_date} 00:00:00\", \"yyyy-MM-dd HH:mm:ss\")*1000 \n",
- "AND EXPIRES_AT < unix_timestamp(\"{end_date} 00:00:00\", \"yyyy-MM-dd HH:mm:ss\")*1000 \n",
- "\"\"\"\n",
- "\n",
- "tmpdf = spark.sql(query)\n",
- "# str(datetime.now()-timedelta(days=1))[:10]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "id": "b44548ef",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "+--------------------+--------------------+-----+-------------+-------------+-------------+----+\n",
- "| ID| NAME|STATE| EXPIRES_AT| UPDATED_AT| CREATED_AT|DAYS|\n",
- "+--------------------+--------------------+-----+-------------+-------------+-------------+----+\n",
- "|16e7eeb0a6c447839...|/DYJetsToLL_LHEFi...| O|1689496342000|1689130290000|1686566168000| 30|\n",
- "|27aea75d1d364b219...|/WJetsToLNu_HT-20...| O|1689568449000|1689086563000|1686931142000| 25|\n",
- "|3f2d7fcff69d49079...|/ParkingBPH1/Run2...| R|1689522386000|1687621610000|1687621586000| 22|\n",
- "|67d9f565492b4dec9...|/DYJetsToLL_M-10t...| R|1689519133000|1687618376000|1687618333000| 22|\n",
- "|c2cbad3267e84ba18...|/TapeRecall/23061...| O|1689554004000|1689117261000|1686940766000| 26|\n",
- "|d23ee08f6aac4d5db...|/QCD_HT300to500_T...| O|1689525153000|1689048723000|1686900417000| 25|\n",
- "|ddfdfed2239940298...|/W2JetsToLNu_Tune...| R|1689517301000|1687616515000|1687616501000| 22|\n",
- "|dee8dbd0a82b48b59...|/TapeRecall/23060...| O|1689525153000|1689127918000|1685747740000| 40|\n",
- "+--------------------+--------------------+-----+-------------+-------------+-------------+----+\n",
- "\n"
- ]
- }
- ],
- "source": [
- "tmpdf.show(50)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "91db6a20",
- "metadata": {},
- "outputs": [],
- "source": [
- "tmpdf.printSchema()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 47,
- "id": "e734d507",
- "metadata": {},
- "outputs": [],
- "source": [
- "docs = tmpdf.toPandas().to_dict('records')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 48,
- "id": "ac8524e0",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "8260"
- ]
- },
- "execution_count": 48,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(docs)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 49,
- "id": "fa51e74c",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[{'ID': '00049b4efb3e4dd091dbfed2012069df',\n",
- " 'NAME': '/TapeRecall/221110_230609.dshmygol_crab_Bfinder_2018_MC_Bc_in_JpsiPI_v0_1/USER',\n",
- " 'STATE': 'O',\n",
- " 'EXPIRES_AT': 1669331191000,\n",
- " 'UPDATED_AT': 1668133187000,\n",
- " 'CREATED_AT': 1668121591000,\n",
- " 'DAYS': 1},\n",
- " {'ID': '0007a18199834a2ca720f088d96a3c9c',\n",
- " 'NAME': '/TapeRecall/220427_065307.youying_crab_DiphoVtxUL2016_DoubleMuon_Run2016B-21Feb2020_ver2_UL2016_HIPM-v1/USER',\n",
- " 'STATE': 'O',\n",
- " 'EXPIRES_AT': 1652252233000,\n",
- " 'UPDATED_AT': 1651048717000,\n",
- " 'CREATED_AT': 1651042633000,\n",
- " 'DAYS': 1},\n",
- " {'ID': '00d4ba364b89477e888e8797a33092d2',\n",
- " 'NAME': '/TapeRecall/210810_035101.jingqing_crab_BPHSkimOfficialChib06900-2016-v5/USER',\n",
- " 'STATE': 'O',\n",
- " 'EXPIRES_AT': 1629777107000,\n",
- " 'UPDATED_AT': 1628847357000,\n",
- " 'CREATED_AT': 1628567507000,\n",
- " 'DAYS': 4},\n",
- " {'ID': '00fb74e1bafc40aba0736216b798a80c',\n",
- " 'NAME': '/TapeRecall/230220_091052.shiyi_crab_RUN3_2022Dv2mass3_SKIM_E_newV2/USER',\n",
- " 'STATE': 'R',\n",
- " 'EXPIRES_AT': 1678093905000,\n",
- " 'UPDATED_AT': 1678093224000,\n",
- " 'CREATED_AT': 1676884305000,\n",
- " 'DAYS': 14},\n",
- " {'ID': '0116d88feb0842f29f78c78f2e7a4ce4',\n",
- " 'NAME': '/TapeRecall/230113_215556.wjang_crab_NanoAODv9_v1_ST_t-channel_antitop_4f_InclusiveDecays_TuneCP5_13TeV-powheg-madspin-pythia8_postVFP/USER',\n",
- " 'STATE': 'O',\n",
- " 'EXPIRES_AT': 1674856682000,\n",
- " 'UPDATED_AT': 1673966996000,\n",
- " 'CREATED_AT': 1673647082000,\n",
- " 'DAYS': 4}]"
- ]
- },
- "execution_count": 49,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "docs[:5]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "c052b072",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": 52,
- "id": "86f3a742",
- "metadata": {},
- "outputs": [],
- "source": [
- "import osearch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 53,
- "id": "6d29e62d",
- "metadata": {},
- "outputs": [],
- "source": [
- "def get_index_schema():\n",
- " return {\n",
- " \"settings\": {\"index\": {\"number_of_shards\": \"1\", \"number_of_replicas\": \"1\"}},\n",
- " \"mappings\": {\n",
- " \"properties\": {\n",
- " \"timestamp\": {\"format\": \"epoch_second\", \"type\": \"date\"},\n",
- " \"ID\": {\"ignore_above\": 1024, \"type\": \"keyword\"},\n",
- " \"NAME\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " \"STATE\": {\"ignore_above\": 1024, \"type\": \"keyword\"},\n",
- " \"EXPIRES_AT\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
- " \"UPDATED_AT\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
- " \"CREATED_AT\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
- " \"DAYS\": {\"type\": \"long\"},\n",
- " }\n",
- " }\n",
- " }"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 54,
- "id": "b479eeb7",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/eos/user/e/eatthaph/.local/lib/python3.9/site-packages/opensearchpy/connection/http_urllib3.py:199: UserWarning: Connecting to https://es-cms1.cern.ch:443 using SSL with verify_certs=False is insecure.\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "0"
- ]
- },
- "execution_count": 54,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "_index_template = 'crab-tape-recall-daily-ekong'\n",
- "client = osearch.get_es_client(\"es-cms1.cern.ch/es\", 'secret_opensearch.txt', get_index_schema())\n",
- "# index_mod=\"\": 'test-foo', index_mod=\"Y\": 'test-foo-YYYY', index_mod=\"M\": 'test-foo-YYYY-MM', index_mod=\"D\": 'test-foo-YYYY-MM-DD',\n",
- "idx = client.get_or_create_index(timestamp=time.time(), index_template=_index_template, index_mod=\"M\")\n",
- "client.send(idx, docs, metadata=None, batch_size=10000, drop_nulls=False)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "0af51d3a",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "12ece939",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "f4567c46",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "546e9d4f",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "496e681c",
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "@webio": {
- "lastCommId": null,
- "lastKernelId": null
- },
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.9.12"
- },
- "sparkconnect": {
- "bundled_options": [],
- "list_of_options": [
- {
- "name": "spark.jars.packages",
- "value": "org.apache.spark:spark-avro_2.12:3.3.1"
- }
- ]
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/src/script/Monitor/crab-spark/notebooks/crab_tape_recall_updated_rules.ipynb b/src/script/Monitor/crab-spark/notebooks/crab_tape_recall_updated_rules.ipynb
deleted file mode 100644
index 5311eb9dd5..0000000000
--- a/src/script/Monitor/crab-spark/notebooks/crab_tape_recall_updated_rules.ipynb
+++ /dev/null
@@ -1,889 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "9f91521a",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "SparkSession - in-memory\n",
- "SparkContext\n",
- "Spark UI\n",
- "Version: v3.3.2\n",
- "Master: yarn\n",
- "AppName: pyspark_shell_swan"
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 1,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "666f70d9",
- "metadata": {},
- "outputs": [],
- "source": [
- "# !hdfs dfs -stat /project/awg/cms/rucio/2023-07-24/"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "bd6751a6",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "23/08/09 12:12:50 WARN ipc.Client: Exception encountered while connecting to the server \n",
- "org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.ipc.StandbyException): Operation category READ is not supported in state standby. Visit https://s.apache.org/sbnn-error\n",
- "\tat org.apache.hadoop.security.SaslRpcClient.saslConnect(SaslRpcClient.java:376)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection.setupSaslConnection(Client.java:622)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection.access$2300(Client.java:413)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection$2.run(Client.java:822)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection$2.run(Client.java:818)\n",
- "\tat java.base/java.security.AccessController.doPrivileged(Native Method)\n",
- "\tat java.base/javax.security.auth.Subject.doAs(Subject.java:423)\n",
- "\tat org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1845)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection.setupIOstreams(Client.java:818)\n",
- "\tat org.apache.hadoop.ipc.Client$Connection.access$3800(Client.java:413)\n",
- "\tat org.apache.hadoop.ipc.Client.getConnection(Client.java:1636)\n",
- "\tat org.apache.hadoop.ipc.Client.call(Client.java:1452)\n",
- "\tat org.apache.hadoop.ipc.Client.call(Client.java:1405)\n",
- "\tat org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:234)\n",
- "\tat org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:119)\n",
- "\tat com.sun.proxy.$Proxy12.getFileInfo(Unknown Source)\n",
- "\tat org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.getFileInfo(ClientNamenodeProtocolTranslatorPB.java:964)\n",
- "\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n",
- "\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n",
- "\tat java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n",
- "\tat java.base/java.lang.reflect.Method.invoke(Method.java:566)\n",
- "\tat org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:422)\n",
- "\tat org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeMethod(RetryInvocationHandler.java:165)\n",
- "\tat org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invoke(RetryInvocationHandler.java:157)\n",
- "\tat org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeOnce(RetryInvocationHandler.java:95)\n",
- "\tat org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:359)\n",
- "\tat com.sun.proxy.$Proxy13.getFileInfo(Unknown Source)\n",
- "\tat org.apache.hadoop.hdfs.DFSClient.getFileInfo(DFSClient.java:1731)\n",
- "\tat org.apache.hadoop.hdfs.DistributedFileSystem$29.doCall(DistributedFileSystem.java:1725)\n",
- "\tat org.apache.hadoop.hdfs.DistributedFileSystem$29.doCall(DistributedFileSystem.java:1722)\n",
- "\tat org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)\n",
- "\tat org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:1737)\n",
- "\tat org.apache.hadoop.fs.Globber.getFileStatus(Globber.java:115)\n",
- "\tat org.apache.hadoop.fs.Globber.doGlob(Globber.java:349)\n",
- "\tat org.apache.hadoop.fs.Globber.glob(Globber.java:202)\n",
- "\tat org.apache.hadoop.fs.FileSystem.globStatus(FileSystem.java:2093)\n",
- "\tat org.apache.hadoop.fs.shell.PathData.expandAsGlob(PathData.java:353)\n",
- "\tat org.apache.hadoop.fs.shell.Command.expandArgument(Command.java:250)\n",
- "\tat org.apache.hadoop.fs.shell.Command.expandArguments(Command.java:233)\n",
- "\tat org.apache.hadoop.fs.shell.FsCommand.processRawArguments(FsCommand.java:104)\n",
- "\tat org.apache.hadoop.fs.shell.Command.run(Command.java:177)\n",
- "\tat org.apache.hadoop.fs.FsShell.run(FsShell.java:327)\n",
- "\tat org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:76)\n",
- "\tat org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:90)\n",
- "\tat org.apache.hadoop.fs.FsShell.main(FsShell.java:390)\n",
- "Found 10 items\n",
- "drwxrwxr-x+ - cmssqoop c3 0 2023-07-25 04:18 /project/awg/cms/rucio/2023-07-25/contents\n",
- "drwxrwxr-x+ - cmssqoop c3 0 2023-07-25 04:22 /project/awg/cms/rucio/2023-07-25/dataset_locks\n",
- "drwxrwxr-x+ - cmssqoop c3 0 2023-07-25 04:13 /project/awg/cms/rucio/2023-07-25/dids\n",
- "drwxrwxr-x+ - cmssqoop c3 0 2023-07-25 04:28 /project/awg/cms/rucio/2023-07-25/locks\n",
- "drwxrwxr-x+ - cmssqoop c3 0 2023-07-25 04:06 /project/awg/cms/rucio/2023-07-25/replicas\n",
- "drwxrwxr-x+ - cmssqoop c3 0 2023-07-25 04:46 /project/awg/cms/rucio/2023-07-25/requests_history\n",
- "drwxrwxr-x+ - cmssqoop c3 0 2023-07-25 04:53 /project/awg/cms/rucio/2023-07-25/rses\n",
- "drwxrwxr-x+ - cmssqoop c3 0 2023-07-25 04:33 /project/awg/cms/rucio/2023-07-25/rules\n",
- "drwxrwxr-x+ - cmssqoop c3 0 2023-07-25 04:38 /project/awg/cms/rucio/2023-07-25/rules_history\n",
- "drwxrwxr-x+ - cmssqoop c3 0 2023-07-25 04:50 /project/awg/cms/rucio/2023-07-25/subscriptions\n"
- ]
- }
- ],
- "source": [
- "# check available files\n",
- "!hdfs dfs -ls /project/awg/cms/rucio/2023-07-25"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "800a2f9e",
- "metadata": {},
- "outputs": [],
- "source": [
- "import pickle\n",
- "from datetime import datetime, timedelta\n",
- "\n",
- "import click\n",
- "import os\n",
- "import pandas as pd\n",
- "import pprint\n",
- "import time\n",
- "from dateutil.relativedelta import relativedelta\n",
- "from pyspark import SparkContext, StorageLevel\n",
- "from pyspark.sql import SparkSession\n",
- "from pyspark.sql.functions import (\n",
- " col, collect_list, concat_ws, greatest, lit, lower, when,\n",
- " avg as _avg,\n",
- " count as _count,\n",
- " hex as _hex,\n",
- " max as _max,\n",
- " min as _min,\n",
- " round as _round,\n",
- " sum as _sum,\n",
- ")\n",
- "\n",
- "from pyspark.sql.types import (\n",
- " LongType,\n",
- ")\n",
- "\n",
- "#from CMSSpark.src.python.CMSSpark import schemas as cms_schemas"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "e597820f",
- "metadata": {},
- "source": [
- "## load dataset"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "2c100a92",
- "metadata": {},
- "outputs": [],
- "source": [
- "wa_date = str(datetime.now())[:10]\n",
- "# wa_date = \"2023-08-08\"\n",
- "\n",
- "HDFS_RUCIO_DATASET_LOCKS = f'/project/awg/cms/rucio/{wa_date}/dataset_locks/part*.avro'\n",
- "# HDFS_RUCIO_LOCKS = f'/project/awg/cms/rucio/{wa_date}/locks'\n",
- "HDFS_RUCIO_RSES = f'/project/awg/cms/rucio/{wa_date}/rses/part*.avro'\n",
- "HDFS_RUCIO_RULES = f'/project/awg/cms/rucio/{wa_date}/rules'\n",
- "# HDFS_RUCIO_RULES_HISTORY = f'/project/awg/cms/rucio/{wa_date}/rules_history'\n",
- "# HDFS_RUCIO_REPLICAS = f'/project/awg/cms/rucio/{wa_date}/replicas'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "fe62d431",
- "metadata": {},
- "outputs": [],
- "source": [
- "rucio_dataset_locks = spark.read.format('avro').load(HDFS_RUCIO_DATASET_LOCKS)\\\n",
- " .withColumn('BYTES', col('BYTES').cast(LongType()))\\\n",
- " .withColumn('RULE_ID', lower(_hex(col('RULE_ID'))))\\\n",
- " .withColumn('RSE_ID', lower(_hex(col('RSE_ID')))).filter(f\"\"\"ACCOUNT IN ('crab_tape_recall')\"\"\").cache()\n",
- "rucio_dataset_locks.createOrReplaceTempView(\"dataset_locks\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "b2e4fcfa",
- "metadata": {},
- "outputs": [],
- "source": [
- "rucio_rses = spark.read.format('avro').load(HDFS_RUCIO_RSES)\\\n",
- " .withColumn('ID', lower(_hex(col('ID'))))\n",
- "rucio_rses.createOrReplaceTempView(\"rses\")\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "id": "3893197e",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "23/08/09 12:37:11 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n"
- ]
- }
- ],
- "source": [
- "rucio_rules = spark.read.format('avro').load(HDFS_RUCIO_RULES)\\\n",
- " .withColumn('ID', lower(_hex(col('ID'))))\n",
- "rucio_rules.createOrReplaceTempView(\"rules\")\n",
- "#spark.sql(\"SELECT * FROM rules\").count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "f9f2ba4e",
- "metadata": {},
- "outputs": [],
- "source": [
- "# rucio_locks = spark.read.format('avro').load(HDFS_RUCIO_LOCKS)\\\n",
- "# .withColumn('BYTES', col('BYTES').cast(LongType()))\\\n",
- "# .withColumn('RULE_ID', lower(_hex(col('RULE_ID'))))\\\n",
- "# .withColumn('RSE_ID', lower(_hex(col('RSE_ID'))))\n",
- "# rucio_locks.createOrReplaceTempView(\"locks\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "7771b12d",
- "metadata": {},
- "outputs": [],
- "source": [
- "# rucio_rules_history = spark.read.format('avro').load(HDFS_RUCIO_RULES_HISTORY)\\\n",
- "# .withColumn('ID', lower(_hex(col('ID'))))\n",
- "# #.persist(StorageLevel.DISK_ONLY)\n",
- "# rucio_rules_history.createOrReplaceTempView(\"rules_history\")\n",
- "# #spark.sql(\"SELECT * FROM rules_history\").count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "id": "274421b8",
- "metadata": {},
- "outputs": [],
- "source": [
- "# rucio_replicas = spark.read.format('avro').load(HDFS_RUCIO_REPLICAS)\\\n",
- "# .withColumn('RSE_ID', lower(_hex(col('RSE_ID'))))\n",
- "# rucio_replicas.createOrReplaceTempView(\"replicas\")\n",
- "# #spark.sql(\"SELECT * FROM replicas\").count()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "5c84635f",
- "metadata": {},
- "source": [
- "## Query"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "id": "9be915ed",
- "metadata": {},
- "outputs": [],
- "source": [
- "# rucio_dataset_locks.count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "id": "8648794b",
- "metadata": {},
- "outputs": [],
- "source": [
- "# rucio_dataset_locks.printSchema()\n",
- "# rucio_rses.printSchema()\n",
- "# rucio_rules.printSchema()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "id": "3aed55c6",
- "metadata": {},
- "outputs": [],
- "source": [
- "# rucio_dataset_locks = rucio_dataset_locks.select('')\n",
- "rucio_rses = rucio_rses.select('ID', 'RSE', 'RSE_TYPE').cache()\n",
- "rucio_rules = rucio_rules.select('ID', 'ACCOUNT', 'DID_TYPE', 'EXPIRES_AT').cache()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "id": "929705b6",
- "metadata": {},
- "outputs": [],
- "source": [
- "result_df = rucio_dataset_locks.join(rucio_rses, rucio_rses[\"ID\"] == rucio_dataset_locks[\"RSE_ID\"])\\\n",
- " .join(rucio_rules, rucio_rules[\"ID\"] == rucio_dataset_locks[\"RULE_ID\"]).drop('ID', 'RULE_ID', 'RSE_ID', 'ACCESSED_AT', 'ACCOUNT')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "id": "49af7fee",
- "metadata": {},
- "outputs": [],
- "source": [
- "# result_df.show(100)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "id": "91db6a20",
- "metadata": {},
- "outputs": [],
- "source": [
- "# result_df.printSchema()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "id": "7cbdf730",
- "metadata": {},
- "outputs": [],
- "source": [
- "# result_df.count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "id": "e734d507",
- "metadata": {},
- "outputs": [],
- "source": [
- "docs = result_df.toPandas().to_dict('records')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "id": "ac8524e0",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "17770"
- ]
- },
- "execution_count": 38,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(docs)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "id": "6d047c66",
- "metadata": {},
- "outputs": [],
- "source": [
- "for i in range(len(docs)):\n",
- " docs[i]['SIZE_TiB'] = docs[i][\"BYTES\"]/1099511627776\n",
- " del docs[i][\"BYTES\"]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "id": "c052b072",
- "metadata": {},
- "outputs": [],
- "source": [
- "TIME = datetime.strptime(f\"\"\"{wa_date} 00:00:00\"\"\", \"%Y-%m-%d %H:%M:%S\").timestamp()*1000\n",
- "for i in range(len(docs)):\n",
- " docs[i]['TIMESTAMP'] = TIME"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 41,
- "id": "836a7743",
- "metadata": {},
- "outputs": [],
- "source": [
- "for i in range(len(docs)):\n",
- " NAME_i = docs[i]['NAME']\n",
- " split_NAME = NAME_i.split('#')[0]\n",
- " docs[i]['NAME_'] = NAME_i.split('#')[0]\n",
- " split_NAME = docs[i]['NAME_'].split('/')\n",
- " if len(split_NAME) != 4:\n",
- " print(\"YO HOO !!, something wrong.\", NAME_i)\n",
- " docs[i]['PriDataset'] = split_NAME[1]\n",
- " docs[i]['DataTier'] = split_NAME[-1] "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 42,
- "id": "51bf031e",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[{'SCOPE': 'cms',\n",
- " 'NAME': '/ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8/RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3/MINIAODSIM#c7b37e2d-77d8-40b9-b8c9-cdf7658406bd',\n",
- " 'STATE': 'O',\n",
- " 'LENGTH': '1',\n",
- " 'UPDATED_AT': 1689164433000,\n",
- " 'CREATED_AT': 1689096938000,\n",
- " 'RSE': 'T2_UK_SGrid_RALPP',\n",
- " 'RSE_TYPE': 'DISK',\n",
- " 'DID_TYPE': 'C',\n",
- " 'EXPIRES_AT': 1691719252000,\n",
- " 'SIZE_TiB': 0.0003293267427579849,\n",
- " 'TIMESTAMP': 1691532000000.0,\n",
- " 'NAME_': '/ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8/RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3/MINIAODSIM',\n",
- " 'PriDataset': 'ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8',\n",
- " 'DataTier': 'MINIAODSIM'},\n",
- " {'SCOPE': 'cms',\n",
- " 'NAME': '/ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8/RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3/MINIAODSIM#4e06c095-6b19-46a1-a6a6-321e6692a086',\n",
- " 'STATE': 'O',\n",
- " 'LENGTH': '1',\n",
- " 'UPDATED_AT': 1689164433000,\n",
- " 'CREATED_AT': 1689096938000,\n",
- " 'RSE': 'T2_UK_SGrid_RALPP',\n",
- " 'RSE_TYPE': 'DISK',\n",
- " 'DID_TYPE': 'C',\n",
- " 'EXPIRES_AT': 1691719252000,\n",
- " 'SIZE_TiB': 0.00011089865711255698,\n",
- " 'TIMESTAMP': 1691532000000.0,\n",
- " 'NAME_': '/ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8/RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3/MINIAODSIM',\n",
- " 'PriDataset': 'ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8',\n",
- " 'DataTier': 'MINIAODSIM'},\n",
- " {'SCOPE': 'cms',\n",
- " 'NAME': '/ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8/RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3/MINIAODSIM#1a79fa1f-9f97-4f0f-9716-523e29e57c32',\n",
- " 'STATE': 'O',\n",
- " 'LENGTH': '1',\n",
- " 'UPDATED_AT': 1689164433000,\n",
- " 'CREATED_AT': 1689096938000,\n",
- " 'RSE': 'T2_UK_SGrid_RALPP',\n",
- " 'RSE_TYPE': 'DISK',\n",
- " 'DID_TYPE': 'C',\n",
- " 'EXPIRES_AT': 1691719252000,\n",
- " 'SIZE_TiB': 0.001415386764165305,\n",
- " 'TIMESTAMP': 1691532000000.0,\n",
- " 'NAME_': '/ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8/RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3/MINIAODSIM',\n",
- " 'PriDataset': 'ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8',\n",
- " 'DataTier': 'MINIAODSIM'},\n",
- " {'SCOPE': 'cms',\n",
- " 'NAME': '/ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8/RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3/MINIAODSIM#18958704-f8f5-4ab4-8d26-0875a74714c4',\n",
- " 'STATE': 'O',\n",
- " 'LENGTH': '1',\n",
- " 'UPDATED_AT': 1689164433000,\n",
- " 'CREATED_AT': 1689096938000,\n",
- " 'RSE': 'T2_UK_SGrid_RALPP',\n",
- " 'RSE_TYPE': 'DISK',\n",
- " 'DID_TYPE': 'C',\n",
- " 'EXPIRES_AT': 1691719252000,\n",
- " 'SIZE_TiB': 0.0008716376141819637,\n",
- " 'TIMESTAMP': 1691532000000.0,\n",
- " 'NAME_': '/ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8/RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3/MINIAODSIM',\n",
- " 'PriDataset': 'ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8',\n",
- " 'DataTier': 'MINIAODSIM'},\n",
- " {'SCOPE': 'cms',\n",
- " 'NAME': '/ParkingDoubleMuonLowMass1/Run2023C-PromptReco-v3/AOD#ef5c7b53-7002-4b16-bd94-c9e6cbd1ddc6',\n",
- " 'STATE': 'O',\n",
- " 'LENGTH': '1',\n",
- " 'UPDATED_AT': 1689903482000,\n",
- " 'CREATED_AT': 1689587082000,\n",
- " 'RSE': 'T2_BE_UCL',\n",
- " 'RSE_TYPE': 'DISK',\n",
- " 'DID_TYPE': 'C',\n",
- " 'EXPIRES_AT': 1692496353000,\n",
- " 'SIZE_TiB': 5.84150075155776e-06,\n",
- " 'TIMESTAMP': 1691532000000.0,\n",
- " 'NAME_': '/ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8/RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3/MINIAODSIM',\n",
- " 'PriDataset': 'ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8',\n",
- " 'DataTier': 'MINIAODSIM'}]"
- ]
- },
- "execution_count": 42,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "docs[:5]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "id": "5c770068",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['',\n",
- " 'ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8',\n",
- " 'RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3',\n",
- " 'MINIAODSIM#c7b37e2d-77d8-40b9-b8c9-cdf7658406bd']"
- ]
- },
- "execution_count": 27,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "split_str = test_str.split('/')\n",
- "split_str"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "id": "2a2868f7",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['MINIAODSIM', 'c7b37e2d-77d8-40b9-b8c9-cdf7658406bd']"
- ]
- },
- "execution_count": 29,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "split_str[3].split('#')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "id": "86f3a742",
- "metadata": {},
- "outputs": [],
- "source": [
- "import osearch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "id": "6d29e62d",
- "metadata": {},
- "outputs": [],
- "source": [
- "def get_index_schema():\n",
- " return {\n",
- " \"settings\": {\"index\": {\"number_of_shards\": \"1\", \"number_of_replicas\": \"1\"}},\n",
- " \"mappings\": {\n",
- " \"properties\": {\n",
- " 'SCOPE': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " 'NAME': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " 'STATE': {\"ignore_above\": 1024, \"type\": \"keyword\"},\n",
- " 'LENGTH': {\"ignore_above\": 1024, \"type\": \"keyword\"},\n",
- " 'SIZE_TiB': {\"type\": \"long\"},\n",
- " 'UPDATED_AT': {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
- " 'CREATED_AT': {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
- " 'RSE': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " 'RSE_TYPE': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " 'DID_TYPE': {\"ignore_above\": 1024, \"type\": \"keyword\"},\n",
- " 'EXPIRES_AT': {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
- " 'TIMESTAMP': {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
- " 'NAME_': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " 'PriDataset': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " 'DataTier': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " }\n",
- " }\n",
- " }"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "b479eeb7",
- "metadata": {},
- "outputs": [],
- "source": [
- "# _index_template = 'crab-tape-recall-rules-ekong'\n",
- "# client = osearch.get_es_client(\"es-cms1.cern.ch/es\", 'secret_opensearch.txt', get_index_schema())\n",
- "# # index_mod=\"\": 'test-foo', index_mod=\"Y\": 'test-foo-YYYY', index_mod=\"M\": 'test-foo-YYYY-MM', index_mod=\"D\": 'test-foo-YYYY-MM-DD',\n",
- "# idx = client.get_or_create_index(timestamp=time.time(), index_template=_index_template, index_mod=\"M\")\n",
- "# client.send(idx, docs, metadata=None, batch_size=10000, drop_nulls=False)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "0af51d3a",
- "metadata": {},
- "outputs": [],
- "source": [
- "from datetime import datetime, timedelta\n",
- "import os\n",
- "import pandas as pd\n",
- "import time\n",
- "from pyspark import SparkContext, StorageLevel\n",
- "from pyspark.sql import SparkSession\n",
- "from pyspark.sql.functions import (\n",
- " col, collect_list, concat_ws, greatest, lit, lower, when,\n",
- " avg as _avg,\n",
- " count as _count,\n",
- " hex as _hex,\n",
- " max as _max,\n",
- " min as _min,\n",
- " round as _round,\n",
- " sum as _sum,\n",
- ")\n",
- "\n",
- "from pyspark.sql.types import (\n",
- " LongType,\n",
- ")\n",
- "\n",
- "import numpy as np\n",
- "import osearch\n",
- "from pyspark.sql import SparkSession"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "035e6ecf",
- "metadata": {},
- "source": [
- "## Multiple Day Upload"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "12ece939",
- "metadata": {},
- "outputs": [],
- "source": [
- "def multi_upload(start_date, end_date):\n",
- " # change to the date of collected data\n",
- " start_date = start_date + timedelta(days=1)\n",
- " end_date = end_date + timedelta(days=1)\n",
- " \n",
- " days = (end_date - start_date).days\n",
- " for i in range(days):\n",
- " TODAY = start_date + timedelta(days=i)\n",
- " TODAY = str(TODAY)[:10]\n",
- " \n",
- " print(TODAY)\n",
- " # Import data into database form\n",
- "\n",
- " wa_date = TODAY\n",
- " HDFS_RUCIO_DATASET_LOCKS = f'/project/awg/cms/rucio/{wa_date}/dataset_locks/part*.avro'\n",
- " HDFS_RUCIO_RSES = f'/project/awg/cms/rucio/{wa_date}/rses/part*.avro'\n",
- " HDFS_RUCIO_RULES = f'/project/awg/cms/rucio/{wa_date}/rules'\n",
- "\n",
- " rucio_dataset_locks = spark.read.format('avro').load(HDFS_RUCIO_DATASET_LOCKS)\\\n",
- " .withColumn('BYTES', col('BYTES').cast(LongType()))\\\n",
- " .withColumn('RULE_ID', lower(_hex(col('RULE_ID'))))\\\n",
- " .withColumn('RSE_ID', lower(_hex(col('RSE_ID')))).filter(f\"\"\"ACCOUNT IN ('crab_tape_recall')\"\"\").cache()\n",
- " rucio_dataset_locks.createOrReplaceTempView(\"dataset_locks\")\n",
- "\n",
- " rucio_rses = spark.read.format('avro').load(HDFS_RUCIO_RSES)\\\n",
- " .withColumn('ID', lower(_hex(col('ID'))))\n",
- " rucio_rses.createOrReplaceTempView(\"rses\")\n",
- "\n",
- " rucio_rules = spark.read.format('avro').load(HDFS_RUCIO_RULES)\\\n",
- " .withColumn('ID', lower(_hex(col('ID'))))\n",
- " rucio_rules.createOrReplaceTempView(\"rules\")\n",
- "\n",
- " # filter and query\n",
- "\n",
- " rucio_rses = rucio_rses.select('ID', 'RSE', 'RSE_TYPE').cache()\n",
- " rucio_rules = rucio_rules.select('ID', 'ACCOUNT', 'DID_TYPE', 'EXPIRES_AT').cache()\n",
- "\n",
- " result_df = rucio_dataset_locks.join(rucio_rses, rucio_rses[\"ID\"] == rucio_dataset_locks[\"RSE_ID\"])\\\n",
- " .join(rucio_rules, rucio_rules[\"ID\"] == rucio_dataset_locks[\"RULE_ID\"]).drop('ID', 'RULE_ID', 'RSE_ID', 'ACCESSED_AT', 'ACCOUNT')\n",
- "\n",
- " # Convert database to dictionary\n",
- "\n",
- " docs = result_df.toPandas().to_dict('records')\n",
- " \n",
- " # Add TIMESTAMP column and convert TiB\n",
- " TIME = datetime.strptime(f\"\"\"{wa_date} 00:00:00\"\"\", \"%Y-%m-%d %H:%M:%S\").timestamp()*1000\n",
- " for i in range(len(docs)):\n",
- " docs[i]['TIMESTAMP'] = TIME\n",
- " docs[i]['SIZE_TiB'] = docs[i][\"BYTES\"]/1099511627776\n",
- " del docs[i][\"BYTES\"]\n",
- " \n",
- " # break down the name\n",
- " NAME_i = docs[i]['NAME']\n",
- " split_NAME = NAME_i.split('#')[0]\n",
- " docs[i]['NAME_'] = NAME_i.split('#')[0]\n",
- " split_NAME = docs[i]['NAME_'].split('/')\n",
- " if len(split_NAME) != 4:\n",
- " print(\"YO HOO !!, something wrong.\", NAME_i)\n",
- " docs[i]['PriDataset'] = split_NAME[1]\n",
- " docs[i]['DataTier'] = split_NAME[-1]\n",
- "\n",
- " # Define type of each schema\n",
- "\n",
- " def get_index_schema():\n",
- " return {\n",
- " \"settings\": {\"index\": {\"number_of_shards\": \"1\", \"number_of_replicas\": \"1\"}},\n",
- " \"mappings\": {\n",
- " \"properties\": {\n",
- " 'SCOPE': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " 'NAME': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " 'STATE': {\"ignore_above\": 1024, \"type\": \"keyword\"},\n",
- " 'LENGTH': {\"ignore_above\": 1024, \"type\": \"keyword\"},\n",
- " 'BYTES': {\"type\": \"long\"},\n",
- " 'UPDATED_AT': {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
- " 'CREATED_AT': {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
- " 'RSE': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " 'RSE_TYPE': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " 'DID_TYPE': {\"ignore_above\": 1024, \"type\": \"keyword\"},\n",
- " 'EXPIRES_AT': {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
- " 'TIMESTAMP': {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
- " 'NAME_': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " 'PriDataset': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " 'DataTier': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
- " }\n",
- " }\n",
- " }\n",
- "\n",
- " # Send data to Opensearch\n",
- "\n",
- " _index_template = 'crab-tape-recall-rules-ekong'\n",
- " client = osearch.get_es_client(\"es-cms1.cern.ch/es\", 'secret_opensearch.txt', get_index_schema())\n",
- " idx = client.get_or_create_index(timestamp=time.time(), index_template=_index_template, index_mod=\"M\")\n",
- " no_of_fail_saved = client.send(idx, docs, metadata=None, batch_size=10000, drop_nulls=False)\n",
- "\n",
- " print(\"========================================================================\", \"FINISHED : \", len(docs), \"ROWS ARE SENT\", no_of_fail_saved, \"ROWS ARE FAILED\", \"========================================================================\", sep='\\n')\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "id": "f4567c46",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "2023-07-23\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n",
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "========================================================================\n",
- "FINISHED : \n",
- "40190\n",
- "ROWS ARE SENT\n",
- "0\n",
- "ROWS ARE FAILED\n",
- "========================================================================\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n"
- ]
- }
- ],
- "source": [
- "# upload the data of start_date day to end_date-1d\n",
- "start_date = datetime(2023, 7, 23)\n",
- "end_date = datetime(2023, 7, 24)\n",
- "\n",
- "multi_upload(start_date, end_date)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "546e9d4f",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "496e681c",
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "@webio": {
- "lastCommId": null,
- "lastKernelId": null
- },
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.9.12"
- },
- "sparkconnect": {
- "bundled_options": [],
- "list_of_options": [
- {
- "name": "spark.jars.packages",
- "value": "org.apache.spark:spark-avro_2.12:3.3.1"
- }
- ]
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/src/script/Monitor/crab-spark/notebooks/crab_taskdb.ipynb b/src/script/Monitor/crab-spark/notebooks/crab_taskdb.ipynb
new file mode 100644
index 0000000000..a491927996
--- /dev/null
+++ b/src/script/Monitor/crab-spark/notebooks/crab_taskdb.ipynb
@@ -0,0 +1,416 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "bcae07ec",
+ "metadata": {},
+ "source": [
+ "# CRAB Spark taskdb\n",
+ "\n",
+    "This job \"copies\" selected columns from the TaskDB table to OpenSearch to answer these questions (a rough aggregation example is sketched in the cell just below):\n",
+    "- How many tasks use each CRAB feature? (split algorithm, Ignorelocality, ScriptExe, GPU)\n",
+    "- How many tasks does each user submit?\n",
+    "- How many tasks use Ignorelocality?\n"
+ ]
+ },
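+  {
+   "cell_type": "markdown",
+   "id": "aa11bb22",
+   "metadata": {},
+   "source": [
+    "The next cell is an illustrative sketch only, not part of the production flow: it shows the kind of Spark aggregation the questions above translate into. The dataframe and the column names (`TM_SPLIT_ALGO`, `TM_USERNAME`) are hypothetical placeholders and may differ from what this job actually reads from TaskDB."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cc33dd44",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Illustrative sketch only: the aggregation style behind the questions above.\n",
+    "# Column names and demo values are hypothetical placeholders.\n",
+    "from pyspark.sql import SparkSession\n",
+    "from pyspark.sql.functions import count as n_count\n",
+    "\n",
+    "_spark = SparkSession.builder.getOrCreate()\n",
+    "_demo = _spark.createDataFrame(\n",
+    "    [('Automatic', 'alice'), ('FileBased', 'bob'), ('Automatic', 'alice')],\n",
+    "    ['TM_SPLIT_ALGO', 'TM_USERNAME'],\n",
+    ")\n",
+    "# tasks per CRAB feature (here: split algorithm)\n",
+    "_demo.groupBy('TM_SPLIT_ALGO').agg(n_count('*').alias('n_tasks')).show()\n",
+    "# tasks per user\n",
+    "_demo.groupBy('TM_USERNAME').agg(n_count('*').alias('n_tasks')).show()"
+   ]
+  },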
+ {
+ "cell_type": "markdown",
+ "id": "6d41c8e6",
+ "metadata": {},
+ "source": [
+ "## Import lib"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5e9af689",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from datetime import datetime, timedelta, timezone\n",
+ "import os\n",
+ "import time\n",
+ "import pandas as pd\n",
+ "\n",
+ "from pyspark import SparkContext, StorageLevel\n",
+ "from pyspark.sql import SparkSession\n",
+ "from pyspark.sql.functions import (\n",
+ " current_user,\n",
+ " col, collect_list, concat_ws, greatest, lit, lower, when,\n",
+ " avg as _avg,\n",
+ " count as _count,\n",
+ " hex as _hex,\n",
+ " max as _max,\n",
+ " min as _min,\n",
+ " round as _round,\n",
+ " sum as _sum,\n",
+ ")\n",
+ "from pyspark.sql.types import (\n",
+ " StructType,\n",
+ " LongType,\n",
+ " StringType,\n",
+ " StructField,\n",
+ " DoubleType,\n",
+ " IntegerType,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "07a5e399",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# try to import libs from current directory, fallback to $PWD/../workdir if not found\n",
+ "try:\n",
+ " from crabspark_utils import get_candidate_files, send_os, send_os_parallel\n",
+ "except ModuleNotFoundError:\n",
+ " import sys\n",
+ " sys.path.insert(0, f'{os.getcwd()}/../workdir')\n",
+ " from crabspark_utils import get_candidate_files, send_os, send_os_parallel\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "22946659",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "spark = SparkSession\\\n",
+ " .builder\\\n",
+ " .appName('crab-taskdb')\\\n",
+ " .getOrCreate()\n",
+ "spark"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e9013878",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# clear any cache left, for working with notebook\n",
+ "# it safe to run everytime cronjob start\n",
+ "spark.catalog.clearCache()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "17a6078f",
+ "metadata": {},
+ "source": [
+ "## Arguments\n",
+ "\n",
+ "We provide arguments to this script via env var. \n",
+ "- `OPENSEARCH_SECRET_PATH`: path to secretfile, contain a line of : of opensearch that we send the data to\n",
+ "- `PROD`: if true index prefix will be `crab-prod-`, otherwise `crab-test-`\n",
+ "- `START`: start date (YYYY-MM-dd)\n",
+ "- `END`: end date (YYYY-MM-dd)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "31c19eb0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# secret path, also check if file exists\n",
+ "secretpath = os.environ.get('OPENSEARCH_SECRET_PATH', f'{os.getcwd()}/../workdir/secret_opensearch.txt')\n",
+ "if not os.path.isfile(secretpath): \n",
+ " raise Exception(f'OS secrets file {secretpath} does not exists')\n",
+ "# if PROD, index prefix will be `crab-*`, otherwise `crab-test-*`\n",
+ "PROD = os.environ.get('PROD', 'false').lower() in ('true', '1', 't')\n",
+ "# FROM_DATE, in strptime(\"%Y-%m-%d\")\n",
+ "START = os.environ.get('START_DATE', None) \n",
+ "END = os.environ.get('END_DATE', None)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f15e62ea",
+ "metadata": {},
+ "source": [
+ "## Variables \n",
+ "Will be used throughout notebook"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e843eb6d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# For run playbook manually, set start/end date here\n",
+ "START_DATE = \"2024-01-03\"\n",
+ "END_DATE = \"2024-10-04\"\n",
+ "# if cronjob, replace constant with value from env\n",
+ "if START and END:\n",
+ " START_DATE = START\n",
+ " END_DATE = END"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b17ed53f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# index name\n",
+ "index_name = 'taskdb'\n",
+ "# use prod index pattern if this execution is for production\n",
+ "if PROD:\n",
+ " index_name = f'crab-prod-{index_name}'\n",
+ "else:\n",
+ " index_name = f'crab-test-{index_name}'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "430146eb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# datetime object\n",
+ "start_datetime = datetime.strptime(START_DATE, \"%Y-%m-%d\").replace(tzinfo=timezone.utc)\n",
+ "end_datetime = datetime.strptime(END_DATE, \"%Y-%m-%d\").replace(tzinfo=timezone.utc)\n",
+ "# sanity check\n",
+ "if end_datetime < start_datetime: \n",
+ " raise Exception(f\"end date ({END_DATE}) is less than start date ({START_DATE})\")\n",
+ "start_epochmilis = int(start_datetime.timestamp()) * 1000\n",
+ "end_epochmilis = int(end_datetime.timestamp()) * 1000\n",
+ "yesterday_epoch = int((end_datetime-timedelta(days=1)).timestamp())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9404c437",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# debug\n",
+ "print(START_DATE, \n",
+ " END_DATE, \n",
+ " index_name,\n",
+ " sep='\\n')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9b33ec96",
+ "metadata": {},
+ "source": [
+ "## Loading data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0cf35868",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Note that \"today\" file, for example, today=2024-10-04, should be in directory /project/awg/cms/crab/tasks/2024-10-04 \n",
+ "# which contain contents from the begining of table until the time of dump job run\n",
+ "# which mean data before 2024-10-04 will be available, but not 2024-10-04 itself!\n",
+ "\n",
+ "HDFS_CRAB_part = f'/project/awg/cms/crab/tasks/{END_DATE}/' # data each day in hdfs contain whole table\n",
+ "print(\"===============================================\"\n",
+ " , \"CRAB Table\"\n",
+ " , \"===============================================\"\n",
+ " , \"File Directory:\", HDFS_CRAB_part\n",
+ " , \"Work Directory:\", os.getcwd()\n",
+ " , \"===============================================\"\n",
+ " , \"===============================================\", sep='\\n')\n",
+ "\n",
+ "tasks_df = spark.read.format('avro').load(HDFS_CRAB_part).cache()\n",
+ "tasks_df = ( \n",
+ " tasks_df.select(\"TM_TASKNAME\",\"TM_START_TIME\",\"TM_TASK_STATUS\",\"TM_SPLIT_ALGO\",\"TM_USERNAME\",\"TM_USER_ROLE\",\"TM_JOB_TYPE\",\"TM_IGNORE_LOCALITY\",\"TM_SCRIPTEXE\",\"TM_USER_CONFIG\")\n",
+ " .filter(f\"\"\"\\\n",
+ " 1=1\n",
+ " AND TM_START_TIME >= {start_epochmilis}\n",
+ " AND TM_START_TIME < {end_epochmilis}\"\"\")\n",
+ " .cache()\n",
+ ")\n",
+ "tasks_df.createOrReplaceTempView(\"tasks\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "86c634fe",
+ "metadata": {},
+ "source": [
+ "## Query"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e271b1c8",
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "query = f\"\"\"\\\n",
+ "WITH reqacc_tb AS ( \n",
+ "SELECT TM_TASKNAME, TM_START_TIME, TM_TASK_STATUS, TM_SPLIT_ALGO, TM_USERNAME, TM_USER_ROLE, TM_JOB_TYPE, TM_IGNORE_LOCALITY, TM_SCRIPTEXE,\n",
+ " CASE \n",
+ " WHEN get_json_object(TM_USER_CONFIG, '$.requireaccelerator') = true THEN 'T'\n",
+ " ELSE 'F'\n",
+ " END AS REQUIRE_ACCELERATOR\n",
+ "FROM tasks\n",
+ "),\n",
+ "finalize_tb AS (\n",
+ "SELECT TM_TASKNAME, TM_START_TIME, TM_TASK_STATUS, TM_SPLIT_ALGO, TM_USERNAME, TM_USER_ROLE, TM_JOB_TYPE, TM_IGNORE_LOCALITY, TM_SCRIPTEXE, REQUIRE_ACCELERATOR,\n",
+ " TM_START_TIME AS timestamp,\n",
+ " 'taskdb' AS type\n",
+ "FROM reqacc_tb\n",
+ ")\n",
+ "SELECT * FROM finalize_tb\n",
+ "\"\"\"\n",
+ "\n",
+ "tmpdf = spark.sql(query)\n",
+ "tmpdf.show(10, False)\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6561ada6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tmpdf.count()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3c7fc2e5",
+ "metadata": {},
+ "source": [
+ "## Sending result to OpenSearch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c33dfce3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# convert spark df to dicts\n",
+ "docs = tmpdf.toPandas().to_dict('records')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "eee4a1f3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "schema = {\n",
+ " \"settings\": {\"index\": {\"number_of_shards\": \"1\", \"number_of_replicas\": \"1\"}},\n",
+ " \"mappings\": {\n",
+ " \"properties\": {\n",
+ " \"TM_TASKNAME\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"TM_START_TIME\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
+ " 'TM_TASK_STATUS': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"TM_SPLIT_ALGO\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"TM_USERNAME\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"TM_USER_ROLE\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"TM_JOB_TYPE\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"TM_IGNORE_LOCALITY\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"TM_SCRIPTEXE\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"REQUIRE_ACCELERATOR\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"type\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n",
+ " \"timestamp\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n",
+ " }\n",
+ " }\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9ec824ee",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# this is simple workaround osearch bug when work in notebook because\n",
+ "# - it load the secret once and use forever\n",
+ "# - get_or_create_index() create index+schema only the first time it execute\n",
+ "# it is safe to run again even in cronjobs \n",
+ "import importlib\n",
+ "import osearch\n",
+ "importlib.reload(osearch)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "64bcf06e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "send_os(docs, index_name, schema, secretpath, yesterday_epoch)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "032d03e0",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "@webio": {
+ "lastCommId": null,
+ "lastKernelId": null
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.12"
+ },
+ "sparkconnect": {
+ "bundled_options": [],
+ "list_of_options": [
+ {
+ "name": "spark.jars.packages",
+ "value": "org.apache.spark:spark-avro_2.12:3.5.0"
+ }
+ ]
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
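For reference, a single document that this taskdb notebook ships to OpenSearch has roughly the shape sketched below; the field set follows the `finalize_tb` select and the index mapping in the notebook, while the values are purely illustrative.

# Illustrative shape of one document sent to the crab-{prod,test}-taskdb index
# (values are made up; the fields match the notebook's schema cell).
doc = {
    "TM_TASKNAME": "240103_120000:username_crab_mytask",  # keyword
    "TM_START_TIME": 1704283200000,                        # date, epoch_millis
    "TM_TASK_STATUS": "SUBMITTED",                         # keyword
    "TM_SPLIT_ALGO": "Automatic",                          # keyword
    "TM_USERNAME": "username",                             # keyword
    "TM_USER_ROLE": "",                                    # keyword
    "TM_JOB_TYPE": "Analysis",                             # keyword
    "TM_IGNORE_LOCALITY": "F",                             # keyword
    "TM_SCRIPTEXE": "F",                                   # keyword
    "REQUIRE_ACCELERATOR": "F",                            # 'T' only when requireaccelerator is set in TM_USER_CONFIG
    "timestamp": 1704283200000,                            # copy of TM_START_TIME, epoch_millis
    "type": "taskdb",                                      # constant added by the query
}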
diff --git a/src/script/Monitor/crab-spark/workdir/bootstrap.sh b/src/script/Monitor/crab-spark/workdir/bootstrap.sh
index 29e8d7f00e..9390cfac87 100644
--- a/src/script/Monitor/crab-spark/workdir/bootstrap.sh
+++ b/src/script/Monitor/crab-spark/workdir/bootstrap.sh
@@ -1,16 +1,10 @@
# source the environment for spark submit
kinit cmscrab@CERN.CH -k -t /data/certs/keytabs.d/cmscrab.keytab
-source hadoop-setconf.sh analytix
+source hadoop-setconf.sh analytix
LCG_VER=/cvmfs/sft.cern.ch/lcg/views/LCG_105a_swan/x86_64-el9-gcc13-opt
source $LCG_VER/setup.sh
export PYSPARK_PYTHON=$LCG_VER/bin/python3
-# i know, ugly, we should install software in the dockerfile
-# however, we really need an environment from cvmfs, and i am not sure we
-# can have access to cvmfs at build time in gitlab
-python3 -m pip install --user opensearch-py
-
# finish the environment
export CRAB_KRB5_USERNAME=$(klist | grep -i Default | cut -d":" -f2 | cut -d"@" -f"1" | awk '{$1=$1};1')
-
diff --git a/src/script/Monitor/crab-spark/workdir/crabspark_utils.py b/src/script/Monitor/crab-spark/workdir/crabspark_utils.py
new file mode 100644
index 0000000000..6d45b2b130
--- /dev/null
+++ b/src/script/Monitor/crab-spark/workdir/crabspark_utils.py
@@ -0,0 +1,94 @@
+"""
+Utility functions for spark scripts
+"""
+# pylint: disable=protected-access
+
+import concurrent.futures
+
+from datetime import timedelta
+from osearch import get_es_client, OpenSearchInterface
+
+def get_candidate_files(start_date, end_date, spark, base, day_delta=1):
+ """
+ Returns a list of hdfs folders that can contain data for the given dates.
+ Copy from CMSMONIT CMSSpark:
+ https://github.com/dmwm/CMSSpark/blob/b8efa0ac5cb57b617ee8d1ea9bb26d53fb0443b0/src/python/CMSSpark/spark_utils.py#L768
+ """
+ st_date = start_date - timedelta(days=day_delta)
+ ed_date = end_date + timedelta(days=day_delta)
+ days = (ed_date - st_date).days
+
+ sc = spark.sparkContext
+ # The candidate files are the folders to the specific dates,
+ # but if we are looking at recent days the compaction procedure may
+ # not have run yet, so we also consider the .tmp folders.
+
+ candidate_files = [
+ f"{base}/{(st_date + timedelta(days=i)).strftime('%Y/%m/%d')}{{,.tmp}}"
+ for i in range(0, days)
+ ]
+ fsystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
+ uri = sc._gateway.jvm.java.net.URI
+ path = sc._gateway.jvm.org.apache.hadoop.fs.Path
+ fs = fsystem.get(uri("hdfs:///"), sc._jsc.hadoopConfiguration())
+ candidate_files = [url for url in candidate_files if fs.globStatus(path(url))]
+ return candidate_files
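A minimal usage sketch of get_candidate_files follows; the HDFS base path below is hypothetical and only illustrates the {,.tmp} glob expansion described in the comment above.

# Usage sketch (hypothetical base path); only globs that actually exist on HDFS are returned.
from datetime import datetime
from pyspark.sql import SparkSession
from crabspark_utils import get_candidate_files

spark = SparkSession.builder.appName("candidate-files-demo").getOrCreate()
start = datetime(2024, 1, 3)
end = datetime(2024, 1, 4)
base = "/project/monitoring/archive/condor/raw/metric"  # hypothetical HDFS base path
# candidate days are 2024/01/02 .. 2024/01/04 (start-1d .. end+1d, end excluded), then filtered by existence
print(get_candidate_files(start, end, spark, base, day_delta=1))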
+
+
+def send_os(docs, index_name, schema, secretpath, timestamp, batch_size=10000, printsummary=True):
+ """
+ Convenience one-liner to send data to opensearch using the osearch lib
+
+ :param docs: documents to send to opensearch
+ :type docs: list of dict
+ :param index_name: opensearch index name
+ :type index_name: str
+ :param schema: opensearch index schema
+ :type schema: str
+ :param secretpath: path to secret file which contains "username:password"
+ :type secretpath: str
+ :param timestamp: timestamp in seconds, used to pick the (monthly) index to write to
+ :type timestamp: int
+ :param batch_size: how many docs we send to os in a single request
+ :type batch_size: int
+ :param printsummary: if yes, print summary text
+ :type printsummary: bool
+
+ :return: number of total docs and number of fail-to-send docs.
+ :rtype: (int, int)
+ """
+ client = get_es_client("os-cms.cern.ch/os", secretpath, schema)
+ idx = client.get_or_create_index(timestamp=timestamp, index_template=index_name, index_mod="M")
+ no_of_fail_saved = client.send(idx, docs, metadata=None, batch_size=batch_size, drop_nulls=False)
+ if printsummary:
+ print("========================================================================"
+ , "FINISHED : "
+ , len(docs), "ROWS ARE SENT"
+ , no_of_fail_saved, "ROWS ARE FAILED"
+ , "========================================================================", sep='\n')
+ return len(docs), no_of_fail_saved
+
+def send_os_parallel(docs, index_name, schema, secretpath, timestamp, batch_size=10000):
+ """
+ Convenience one-liner to send data to opensearch using the osearch lib,
+ in parallel.
+
+ Note that it takes the same params as send_os() except `printsummary`, and
+ returns None.
+ """
+ with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
+ futures = []
+ for chunk in OpenSearchInterface.to_chunks(docs, batch_size):
+ future = executor.submit(send_os, chunk, index_name, schema, secretpath, timestamp, batch_size+1, False)
+ futures.append(future)
+ total_docs = 0
+ total_fails = 0
+ for f in futures:
+ ndocs, nfails = f.result()
+ total_docs += ndocs
+ total_fails += nfails
+ print("========================================================================"
+ , "FINISHED : "
+ , total_docs, "ROWS ARE SENT"
+ , total_fails, "ROWS ARE FAILED"
+ , "========================================================================", sep='\n')
diff --git a/src/script/Monitor/crab-spark/workdir/osearch.py b/src/script/Monitor/crab-spark/workdir/osearch.py
index 68f2f5b1ad..cd5787aec0 100644
--- a/src/script/Monitor/crab-spark/workdir/osearch.py
+++ b/src/script/Monitor/crab-spark/workdir/osearch.py
@@ -55,6 +55,7 @@ def get_index_schema():
import json
import logging
import time
+import concurrent.futures
from collections import Counter as collectionsCounter
from datetime import datetime
@@ -96,6 +97,7 @@ def __init__(self, host, secret_file, index_mapping_and_settings):
url = 'https://' + username + ':' + password + '@' + host
self.handle = OpenSearch(
[url],
+ http_compress=True,
verify_certs=False,
use_ssl=True,
ca_certs='/etc/pki/tls/certs/ca-bundle.trust.crt',
@@ -215,3 +217,35 @@ def send(self, idx, data, metadata=None, batch_size=10000, drop_nulls=False):
logging.error("OpenSearch send failed count: ", result_n_failed)
logging.debug("OpenSearch send", len(data) - result_n_failed, "documents successfully")
return result_n_failed
+
+def send_os(docs, index_name, schema, secretpath, timestamp, batch_size=10000, printsummary=True):
+
+ client = get_es_client("os-cms.cern.ch/os", secretpath, schema)
+ idx = client.get_or_create_index(timestamp=timestamp, index_template=index_name, index_mod="M")
+ no_of_fail_saved = client.send(idx, docs, metadata=None, batch_size=batch_size, drop_nulls=False)
+ if printsummary:
+ print("========================================================================"
+ , "FINISHED : "
+ , len(docs), "ROWS ARE SENT"
+ , no_of_fail_saved, "ROWS ARE FAILED"
+ , "========================================================================", sep='\n')
+ else:
+ return len(docs), no_of_fail_saved
+
+def send_os_parallel(docs, index_name, schema, secretpath, timestamp, batch_size=10000):
+ with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
+ futures = []
+ for chunk in OpenSearchInterface.to_chunks(docs, batch_size):
+ future = executor.submit(send_os, chunk, index_name, schema, secretpath, timestamp, batch_size+1, False)
+ futures.append(future)
+ total_docs = 0
+ total_fails = 0
+ for f in futures:
+ ndocs, nfails = f.result()
+ total_docs += ndocs
+ total_fails += nfails
+ print("========================================================================"
+ , "FINISHED : "
+ , total_docs, "ROWS ARE SENT"
+ , total_fails, "ROWS ARE FAILED"
+ , "========================================================================", sep='\n')
diff --git a/src/script/Monitor/crab-spark/workdir/run.py b/src/script/Monitor/crab-spark/workdir/run.py
new file mode 100755
index 0000000000..eb8224c8b6
--- /dev/null
+++ b/src/script/Monitor/crab-spark/workdir/run.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python3
+
+"""
+This file converts a spark notebook into a python file and runs spark-submit.
+It requires the shell to source "bootstrap.sh" first, to provide the commands
+and python libraries needed by this script.
+
+It parses the command-line arguments with argparse and passes them to the spark
+script via environment variables.
+
+Examples:
+- To extract data from the whole September 2024
+ ./run.py --secretpath secret.txt --start 2024-09-01 --end 2024-10-01 crab_taskdb.ipynb
+
+- To extract data from n days ago (in case you need to wait until the data settles)
+ For example, today is 2024-10-01 but you want to process data on 2024-09-30
+ ./run.py --secretpath secret.txt --ndaysago 2 crab_condor.ipynb
+
+- To push result docs to the production index (otherwise, the index name will be prefixed with `crab-test-`)
+ ./run.py --secretpath secret.txt --today --prod crab_taskdb.ipynb
+
+- To run in crontab daily, use "run_spark.sh" to prepare a new shell and source bootstrap.sh
+ ./run_spark.sh ./run.py --secretpath secret.txt --today --prod crab_taskdb.ipynb
+
+- To check the env vars that will be passed to the spark script
+ ./run.py --secretpath secret.txt --today --dryrun crab_taskdb.ipynb
+"""
+import argparse
+import os
+import subprocess
+import pathlib
+from pprint import pprint
+from datetime import datetime, timedelta, timezone
+
+def valid_date(s):
+ """
+ Check if the date format is correct and return the arg `s`.
+ The function serves as the `type` of an argument in argparse.
+
+ >>> valid_date('2024-01-01')
+ '2024-01-01'
+
+ :param s: date in format YYYY-mm-dd
+ :type s: str
+
+ :return: s argument
+ :rtype: str
+ """
+ try:
+ datetime.strptime(s, '%Y-%m-%d')
+ return s
+ except ValueError as e:
+ raise argparse.ArgumentTypeError(f"not a valid date: {s!r}") from e
+
+parser = argparse.ArgumentParser(description='Converting spark ipynb and run spark-submit')
+parser.add_argument('path', help='path of script (.ipynb)')
+parser.add_argument('--start', type=valid_date, dest='start_date', help='Start date of interest (YYYY-mm-dd)')
+parser.add_argument('--end', type=valid_date, dest='end_date', help='End date of interest (YYYY-mm-dd).')
+parser.add_argument('--ndaysago', type=int, default=-1, help='set start date to n-1 days ago, and end date to n days ago')
+parser.add_argument('--today', action='store_true', help='shortcut --ndaysago 0')
+parser.add_argument('--prod', action='store_true', help='set opensearch index prefix to prod "crab-prod-". Default is "crab-test-"')
+parser.add_argument('--secretpath', help='secret file path')
+parser.add_argument('--dryrun', action='store_true', help='print env that will pass to spark script')
+args = parser.parse_args()
+
+sparkjob_env = {}
+if args.today:
+ args.ndaysago = 0
+if args.ndaysago >= 0:
+ day = datetime.now().replace(tzinfo=timezone.utc)
+ ed = args.ndaysago
+ sd = args.ndaysago + 1 # start date is "yesterday" of n days ago
+ sparkjob_env['START_DATE'] = (day-timedelta(days=sd)).strftime("%Y-%m-%d")
+ sparkjob_env['END_DATE'] = (day-timedelta(days=ed)).strftime("%Y-%m-%d")
+if args.start_date and args.end_date:
+ sparkjob_env['START_DATE'] = args.start_date
+ sparkjob_env['END_DATE'] = args.end_date
+if 'START_DATE' not in sparkjob_env and 'END_DATE' not in sparkjob_env:
+ raise Exception("Need --today or --ndaysago or --start/--end.")
+if args.secretpath:
+ sparkjob_env['OPENSEARCH_SECRET_PATH'] = args.secretpath
+if args.prod:
+ sparkjob_env['PROD'] = 't'
+else:
+ sparkjob_env['PROD'] = 'f'
+
+runenv = os.environ.copy()
+runenv.update(sparkjob_env)
+
+# convert the notebook to a .py file
+path = pathlib.Path(args.path)
+pathpy = path.with_suffix('.py')
+cmd = f"jupyter nbconvert --to python {path}"
+print(f'Running: {cmd}')
+if not args.dryrun:
+ subprocess.run(cmd, shell=True, timeout=3600, check=True)
+
+# spark-submit
+cmd = f'spark-submit --master yarn --packages org.apache.spark:spark-avro_2.12:3.5.0 {pathpy}'
+print(f'Running: {cmd}')
+print('With env: ')
+pprint(sparkjob_env)
+if not args.dryrun:
+ subprocess.run(cmd, shell=True, timeout=3600, check=True, env=runenv)
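The --ndaysago handling above is the only non-obvious date arithmetic in run.py; a small stand-alone sketch of what it exports follows (the run date 2024-10-01 is purely illustrative).

# Mirrors run.py's --ndaysago date arithmetic (illustrative; pretend today is 2024-10-01).
from datetime import datetime, timedelta, timezone

day = datetime(2024, 10, 1, tzinfo=timezone.utc)  # stand-in for datetime.now()
ndaysago = 1
start = (day - timedelta(days=ndaysago + 1)).strftime("%Y-%m-%d")  # 2024-09-29
end = (day - timedelta(days=ndaysago)).strftime("%Y-%m-%d")        # 2024-09-30
# START_DATE is inclusive and END_DATE exclusive in the notebooks,
# so this run processes tasks whose TM_START_TIME falls on 2024-09-29.
print({"START_DATE": start, "END_DATE": end})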
diff --git a/src/script/Monitor/crab-spark/workdir/run_spark.sh b/src/script/Monitor/crab-spark/workdir/run_spark.sh
new file mode 100755
index 0000000000..3bd92a5bfc
--- /dev/null
+++ b/src/script/Monitor/crab-spark/workdir/run_spark.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+set -euo pipefail
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+pushd "$SCRIPT_DIR"
+
+# source the environment for spark submit
+set +euo pipefail
+source ./bootstrap.sh
+set -euo pipefail
+
+"$@"
+
+popd