diff --git a/.gitignore b/.gitignore index 0643fa9cb2..211462e6ac 100644 --- a/.gitignore +++ b/.gitignore @@ -10,8 +10,8 @@ tmp/runtime/* # python virtualenv .venv/ # ci directory -build/ -workdir/ +/build/ +/workdir/ # direnv .envrc # pylint config diff --git a/cicd/monit_spark/Dockerfile b/cicd/monit_spark/Dockerfile index 7372722116..fc4499cee7 100644 --- a/cicd/monit_spark/Dockerfile +++ b/cicd/monit_spark/Dockerfile @@ -1,4 +1,4 @@ -FROM registry.cern.ch/cmsmonitoring/cmsmon-spark:v0.5.0.1 +FROM registry.cern.ch/cmsmonitoring/cmsmon-spark:v0.5.0.1 ## build with from dmwm/CRABServer, root directory # docker buildx build -t registry.cern.ch/cmscrab/crabspark:(date +%s) -f cicd/monit_spark/Dockerfile . @@ -9,17 +9,9 @@ RUN yum install -y \ && rm -rf /var/cache/yum RUN mkdir -p /data/srv/spark/ -COPY ./src/script/Monitor/crab-spark/workdir/osearch.py \ - ./src/script/Monitor/crab-spark/workdir/bootstrap.sh \ - ./src/script/Monitor/crab-spark/cronjobs/crab_condor_daily.py \ - ./src/script/Monitor/crab-spark/cronjobs/crab_tape_recall_rules_history_daily.py \ - ./src/script/Monitor/crab-spark/cronjobs/run_spark.sh \ - ./src/script/Monitor/crab-spark/cronjobs/crab_data_daily.py \ - ./src/script/Monitor/crab-spark/cronjobs/crab_tape_recall_updated_rules_daily.py \ +COPY ./src/script/Monitor/crab-spark \ /data/srv/spark ENTRYPOINT ["tini", "--"] CMD ["echo", "no default script for spark docker image"] - - diff --git a/src/script/Monitor/crab-spark/cronjobs/crab_condor_daily.py b/src/script/Monitor/crab-spark/cronjobs/crab_condor_daily.py deleted file mode 100644 index 66d998334a..0000000000 --- a/src/script/Monitor/crab-spark/cronjobs/crab_condor_daily.py +++ /dev/null @@ -1,250 +0,0 @@ -import os -import sys - -os.environ['PYSPARK_PYTHON'] = sys.executable -os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable - -import time -import numpy as np -import pandas as pd - -from datetime import datetime, date, timedelta - -import osearch - -import argparse -parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument("-s", "--start-date", default=None, - help="process data starting from this day, inclusive (YYYY-MM-DD)",) -parser.add_argument("-e", "--end-date", default=None, - help="process data until this day, not included (YYYY-MM-DD)",) -args = parser.parse_args() -print(f"timerange: [{args.start_date} {args.end_date})" ) - - -from pyspark.sql.functions import ( - col, - lit, - when, - sum as _sum, - count as _count, - first, - date_format, - from_unixtime -) -from pyspark.sql.types import ( - StructType, - LongType, - StringType, - StructField, - DoubleType, - IntegerType, -) - -from pyspark.sql import SparkSession - -spark = SparkSession\ - .builder\ - .appName("crab_tape_recall")\ - .getOrCreate() - -# CRAB table date - -# condor data and query date -# if args.end_date: -# end_date = datetime.strptime(args.end_date, '%Y-%m-%d') -# else: -# end_date = datetime.now() -# end_date = end_date.replace(minute=0, hour=0, second=0, microsecond=0) -# -# if args.start_date: -# start_date = datetime.strptime(args.start_date, '%Y-%m-%d') -# else: -# start_date = end_date - timedelta(days=1) - -if args.end_date: - end_date = datetime.strptime(args.end_date, '%Y-%m-%d') -else: - end_date = datetime.now() - end_date = end_date.replace(minute=0, hour=0, second=0, microsecond=0) - -if args.start_date: - start_date = datetime.strptime(args.start_date, '%Y-%m-%d') -else: - start_date = end_date - timedelta(days=1) - -date_list = pd.date_range( - start=start_date, 
- end=end_date, - ).to_pydatetime().tolist() - -# Import condor data - -def process_single_day(day): - - start_date = day - end_date = day + timedelta(days=1) - print(f"START PROCESSING: from {start_date} to {end_date}") - - _DEFAULT_HDFS_FOLDER = "/project/monitoring/archive/condor/raw/metric" - - def _get_schema(): - return StructType( - [ - StructField( - "data", - StructType( - [ - StructField("RecordTime", LongType(), nullable=False), - StructField("CMSPrimaryDataTier", StringType(), nullable=True), - StructField("Status", StringType(), nullable=True), - StructField("WallClockHr", DoubleType(), nullable=True), - StructField("CoreHr", DoubleType(), nullable=True), - StructField("CpuTimeHr", DoubleType(), nullable=True), - StructField("Type", StringType(), nullable=True), - StructField("CRAB_DataBlock", StringType(), nullable=True), - StructField("GlobalJobId", StringType(), nullable=False), - StructField("ExitCode", LongType(), nullable=True), - StructField("CRAB_Workflow", StringType(), nullable=True), - StructField("CommittedCoreHr", StringType(), nullable=True), - StructField("CommittedWallClockHr", StringType(), nullable=True), - ] - ), - ), - ] - ) - - def get_candidate_files(start_date, end_date, spark, base=_DEFAULT_HDFS_FOLDER): - st_date = start_date - timedelta(days=0) - ed_date = end_date + timedelta(days=0) - days = (ed_date - st_date).days - - sc = spark.sparkContext - FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem - URI = sc._gateway.jvm.java.net.URI - Path = sc._gateway.jvm.org.apache.hadoop.fs.Path - fs = FileSystem.get(URI("hdfs:///"), sc._jsc.hadoopConfiguration()) - candidate_files = [ - f"{base}/{(st_date + timedelta(days=i)).strftime('%Y/%m/%d')}" - for i in range(0, days) - ] - candidate_files = [url for url in candidate_files if fs.globStatus(Path(url))] - print("No. of Compacted files:", len(candidate_files)) - - pre_candidate_files = [ - f"{base}/{(st_date + timedelta(days=i)).strftime('%Y/%m/%d')}.tmp" - for i in range(0, days) - ] - pre_candidate_files = [url for url in pre_candidate_files if fs.globStatus(Path(url))] - print("No. 
of uncompacted files:", len(pre_candidate_files)) - - return candidate_files + pre_candidate_files - - - schema = _get_schema() - - condor_df = ( - spark.read.option("basePath", _DEFAULT_HDFS_FOLDER) - .json( - get_candidate_files(start_date, end_date, spark, base=_DEFAULT_HDFS_FOLDER), - schema=schema, - ).select("data.*") - .filter( - f"""Status IN ('Completed') - AND Type IN ('analysis') - AND RecordTime >= {start_date.timestamp() * 1000} - AND RecordTime < {end_date.timestamp() * 1000} - """ - ) - .drop_duplicates(["GlobalJobId"]) - # .cache() - ) - - # Convert file type by saving and recall it again (.json too complex for spark) - - crab_username = os.getenv("CRAB_KRB5_USERNAME", "cmscrab") - condor_df.write.mode('overwrite').parquet(f"/cms/users/{crab_username}/condor_vir_data" ,compression='zstd') - condor_df = spark.read.format('parquet').load(f"/cms/users/{crab_username}/condor_vir_data") - - # Import CRAB data - wa_date = day.strftime("%Y-%m-%d") - HDFS_CRAB_part = f'/project/awg/cms/crab/tasks/{wa_date}/' - crab_df = spark.read.format('avro').load(HDFS_CRAB_part) - crab_df = crab_df.select('TM_TASKNAME', 'TM_IGNORE_LOCALITY') - - print("===============================================" - , "Condor Matrix and CRAB Table" - , "===============================================" - , "File Directory:", HDFS_CRAB_part, get_candidate_files(start_date, end_date, spark, base=_DEFAULT_HDFS_FOLDER) - , "Work Directory:", os.getcwd() - , "===============================================" - , "===============================================", sep='\n') - - # Join condor job with CRAB data - - result_df = condor_df.join(crab_df, crab_df["TM_TASKNAME"] == condor_df["CRAB_Workflow"])\ - .select('RecordTime', 'CMSPrimaryDataTier', 'WallClockHr', 'CoreHr', 'CpuTimeHr', 'ExitCode' - , "CRAB_DataBlock", "TM_IGNORE_LOCALITY", "GlobalJobId", "CommittedCoreHr", "CommittedWallClockHr") - - # Convert database to dictionary - - docs = result_df.toPandas() - docs["CRAB_Type"] = docs.apply(lambda row: "PrivateMC" if row["CRAB_DataBlock"] == "MCFakeBlock" else "Analysis", axis=1) - print(f"pandas dataframe size: {docs.memory_usage(deep=True).apply(lambda x: x / 1024 / 1024).sum()} MB") - - - def get_index_schema(): - return { - "settings": {"index": {"number_of_shards": "1", "number_of_replicas": "1"}}, - "mappings": { - "properties": { - "RecordTime": {"format": "epoch_millis", "type": "date"}, - "CMSPrimaryDataTier": {"ignore_above": 2048, "type": "keyword"}, - "GlobalJobId": {"ignore_above": 2048, "type": "keyword"}, - "WallClockHr": {"type": "long"}, - "CoreHr": {"type": "long"}, - "CpuTimeHr": {"type": "long"}, - "ExitCode": {"ignore_above": 2048, "type": "keyword"}, - "TM_IGNORE_LOCALITY": {"ignore_above": 2048, "type": "keyword"}, - "CRAB_Type": {"ignore_above": 2048, "type": "keyword"}, - "CRAB_DataBlock": {"ignore_above": 2048, "type": "keyword"}, - "CommittedCoreHr": {"type": "long"}, - "CommittedWallClockHr": {"type": "long"}, - } - } - } - - # Send data to Opensearch - - _index_template = 'crab-condor-taskdb' - client = osearch.get_es_client("os-cms.cern.ch/es", '/data/certs/monit.d/monit_spark_crab.txt', get_index_schema()) - idx = client.get_or_create_index(timestamp=day.strftime("%s"), index_template=_index_template, index_mod="M") - docs_rows = len(docs) - sent = 0 - batch = 50000 - import gc - while sent < docs_rows: - gc.collect() - start = sent - end = start + batch if start + batch < docs_rows else docs_rows - docs_tmp = docs.iloc[start:end] - # the following line requires a lot of RAM, 
better do it 50_000 - # items at a time only. Keep in mind that the pandas datafram usually - # contains about 1_000_000 rows - docs_tmp = docs_tmp.to_dict('records') - no_of_fail_saved = client.send(idx, docs_tmp, metadata=None, batch_size=10000, drop_nulls=False) - sent = end - - print("=================================== Condor Matrix and CRAB Table =====================================", - "FINISHED : ", - f"start {start}, end {end}", - len(docs_tmp), "ROWS ARE SENT", - no_of_fail_saved, "ROWS ARE FAILED", - "=================================== Condor Matrix and CRAB Table =====================================", - sep='\n') - - -for day in date_list: - process_single_day(day) - - diff --git a/src/script/Monitor/crab-spark/cronjobs/crab_data_daily.py b/src/script/Monitor/crab-spark/cronjobs/crab_data_daily.py deleted file mode 100644 index 4fb3f4d45d..0000000000 --- a/src/script/Monitor/crab-spark/cronjobs/crab_data_daily.py +++ /dev/null @@ -1,154 +0,0 @@ -# import pickle -from datetime import datetime, timedelta - -# import click -import os -import pandas as pd -# import pprint -import time -# from dateutil.relativedelta import relativedelta - -import numpy as np -import json -import osearch - -import argparse - -parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument("-s", "--start-date", default=None, - help="process data starting from this day, inclusive (YYYY-MM-DD)",) -parser.add_argument("-e", "--end-date", default=None, - help="process data until this day, not included (YYYY-MM-DD)",) -args = parser.parse_args() -print(f"timerange: [{args.start_date} {args.end_date})" ) - - -from pyspark import SparkContext, StorageLevel -from pyspark.sql import SparkSession -from pyspark.sql.functions import ( - col, collect_list, concat_ws, greatest, lit, lower, when, - avg as _avg, - count as _count, - hex as _hex, - max as _max, - min as _min, - round as _round, - sum as _sum, -) -from pyspark.sql.types import ( - LongType, -) -from pyspark.sql import SparkSession -spark = SparkSession\ - .builder\ - .appName("crab_tape_recall")\ - .getOrCreate() - -# Query date - -#TODAY = str(datetime.now())[:10] -#YESTERDAY = str(datetime.now()-timedelta(days=1))[:10] -#wa_date = TODAY - -if args.end_date: - end_date = datetime.strptime(args.end_date, '%Y-%m-%d') -else: - end_date = datetime.now() - end_date = end_date.replace(minute=0, hour=0, second=0, microsecond=0) - -if args.start_date: - start_date = datetime.strptime(args.start_date, '%Y-%m-%d') -else: - start_date = end_date - timedelta(days=1) - -date_list = pd.date_range( - start=start_date, - end=end_date, - ).to_pydatetime().tolist() - -# Import data into database form - -def process_single_day(day): - - wa_date = day.strftime("%Y-%m-%d") - TODAY = wa_date - YESTERDAY = (day-timedelta(days=1)).strftime("%Y-%m-%d") - - HDFS_CRAB_part = f'/project/awg/cms/crab/tasks/{wa_date}/' - print("===============================================" - , "CRAB Table" - , "===============================================" - , "File Directory:", HDFS_CRAB_part - , "Work Directory:", os.getcwd() - , "===============================================" - , "===============================================", sep='\n') - - crab_part = spark.read.format('avro').load(HDFS_CRAB_part) - df = crab_part.select("TM_TASKNAME","TM_START_TIME","TM_TASK_STATUS","TM_SPLIT_ALGO","TM_USERNAME","TM_USER_ROLE","TM_JOB_TYPE","TM_IGNORE_LOCALITY","TM_SCRIPTEXE","TM_USER_CONFIG") - df.createOrReplaceTempView("crab_algo") - - # Query 
daily data - - query = f"""\ - SELECT * - FROM crab_algo - WHERE 1=1 - AND TM_START_TIME >= unix_timestamp("{YESTERDAY} 00:00:00", "yyyy-MM-dd HH:mm:ss")*1000 - AND TM_START_TIME < unix_timestamp("{TODAY} 00:00:00", "yyyy-MM-dd HH:mm:ss")*1000 - """ - - tmpdf = spark.sql(query) - tmpdf.show(10) - - # Convert database to dictionary - - docs = tmpdf.toPandas().to_dict('records') - - # Extract 'REQUIRE_ACCELERATOR' from 'TM_USER_CONFIG' - - for i in range(len(docs)): - if docs[i]['TM_USER_CONFIG'] is not None: - data = json.loads(docs[i]['TM_USER_CONFIG']) - if "requireaccelerator" in data: - docs[i]['REQUIRE_ACCELERATOR'] = data["requireaccelerator"] - else: - docs[i]['REQUIRE_ACCELERATOR'] = None - else: - docs[i]['REQUIRE_ACCELERATOR'] = None - - # Define type of each schema - - def get_index_schema(): - return { - "settings": {"index": {"number_of_shards": "1", "number_of_replicas": "1"}}, - "mappings": { - "properties": { - "TM_TASKNAME": {"ignore_above": 2048, "type": "keyword"}, - "TM_START_TIME": {"format": "epoch_millis", "type": "date"}, - 'TM_TASK_STATUS': {"ignore_above": 2048, "type": "keyword"}, - "TM_SPLIT_ALGO": {"ignore_above": 2048, "type": "keyword"}, - "TM_USERNAME": {"ignore_above": 2048, "type": "keyword"}, - "TM_USER_ROLE": {"ignore_above": 2048, "type": "keyword"}, - "TM_JOB_TYPE": {"ignore_above": 2048, "type": "keyword"}, - "TM_IGNORE_LOCALITY": {"ignore_above": 2048, "type": "keyword"}, - "TM_SCRIPTEXE": {"ignore_above": 2048, "type": "keyword"}, - "REQUIRE_ACCELERATOR": {"ignore_above": 2048, "type": "keyword"}, - } - } - } - - # Send data to Opensearch - - _index_template = 'crab-taskdb' - client = osearch.get_es_client("os-cms.cern.ch/es", '/data/certs/monit.d/monit_spark_crab.txt', get_index_schema()) - idx = client.get_or_create_index(timestamp=day.strftime("%s"), index_template=_index_template, index_mod="M") - no_of_fail_saved = client.send(idx, docs, metadata=None, batch_size=10000, drop_nulls=False) - - print("================================= CRAB Table =======================================" - , "FINISHED : ", len(docs), "ROWS ARE SENT", no_of_fail_saved, "ROWS ARE FAILED" - , "================================= CRAB Table =======================================", sep='\n') - - -for day in date_list: - process_single_day(day) - diff --git a/src/script/Monitor/crab-spark/cronjobs/crab_tape_recall_rules_history_daily.py b/src/script/Monitor/crab-spark/cronjobs/crab_tape_recall_rules_history_daily.py deleted file mode 100644 index 5fa5ddcac5..0000000000 --- a/src/script/Monitor/crab-spark/cronjobs/crab_tape_recall_rules_history_daily.py +++ /dev/null @@ -1,164 +0,0 @@ -# import pickle -from datetime import datetime, timedelta - -# import click -import os -import pandas as pd -# import pprint -import time -# from dateutil.relativedelta import relativedelta - -import argparse -parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument("-s", "--start-date", default=None, - help="process data starting from this day, inclusive (YYYY-MM-DD)",) -parser.add_argument("-e", "--end-date", default=None, - help="process data until this day, not included (YYYY-MM-DD)",) -args = parser.parse_args() -print(f"timerange: [{args.start_date} {args.end_date})" ) - - -from pyspark import SparkContext, StorageLevel -from pyspark.sql import SparkSession -from pyspark.sql.functions import ( - col, collect_list, concat_ws, greatest, lit, lower, when, - avg as _avg, - count as _count, - hex as _hex, - max as _max, - min as _min, - 
round as _round, - sum as _sum, -) - -from pyspark.sql.types import ( - LongType, -) - -import numpy as np -# import math -import osearch -from pyspark.sql import SparkSession - -spark = SparkSession\ - .builder\ - .appName("crab_tape_recall")\ - .getOrCreate() - -# Data date - -if args.end_date: - end_date = datetime.strptime(args.end_date, '%Y-%m-%d') -else: - end_date = datetime.now() - end_date = end_date.replace(minute=0, hour=0, second=0, microsecond=0) - -if args.start_date: - start_date = datetime.strptime(args.start_date, '%Y-%m-%d') -else: - start_date = end_date - timedelta(days=1) - -date_list = pd.date_range( - start=start_date, - end=end_date, - ).to_pydatetime().tolist() - -def process_single_day(day): - - # Query date - - wa_date = day.strftime("%Y-%m-%d") - TODAY = wa_date - YESTERDAY = (day-timedelta(days=1)).strftime("%Y-%m-$d") - TOYEAR = day.strftime("%Y") - - # Import data into database form - - HDFS_RUCIO_RULES_HISTORY = f'/project/awg/cms/rucio/{wa_date}/rules_history/' - - print("===============================================" - , "RUCIO : Rules History" - , "===============================================" - , "File Directory:", HDFS_RUCIO_RULES_HISTORY - , "Work Directory:", os.getcwd() - , "===============================================" - , "===============================================", sep='\n') - - rucio_rules_history = spark.read.format('avro').load(HDFS_RUCIO_RULES_HISTORY).withColumn('ID', lower(_hex(col('ID')))) - - # Query data in daily - - rucio_rules_history = rucio_rules_history.select("ID", "NAME", "STATE", "EXPIRES_AT", "UPDATED_AT", "CREATED_AT", "ACCOUNT").filter(f"""ACCOUNT IN ('crab_tape_recall')""").cache() - rucio_rules_history.createOrReplaceTempView("rules_history") - - query = query = f"""\ - WITH filter_t AS ( - SELECT ID, NAME, STATE, EXPIRES_AT, UPDATED_AT, CREATED_AT - FROM rules_history - WHERE 1=1 - AND CREATED_AT >= unix_timestamp("{TOYEAR}-01-01 00:00:00", "yyyy-MM-dd HH:mm:ss")*1000 - ), - rn_t AS ( - SELECT ID, NAME, STATE, EXPIRES_AT, UPDATED_AT, CREATED_AT, - row_number() over(partition by ID order by UPDATED_AT desc) as rn - FROM filter_t - ), - calc_days_t AS ( - SELECT ID, NAME, STATE, EXPIRES_AT, UPDATED_AT, CREATED_AT, - CASE - WHEN STATE = 'O' THEN ceil((UPDATED_AT-CREATED_AT)/86400000) - WHEN STATE != 'O' AND EXPIRES_AT < unix_timestamp("{wa_date} 00:00:00", "yyyy-MM-dd HH:mm:ss")*1000 THEN ceil((EXPIRES_AT-CREATED_AT)/86400000) - ELSE 0 - END AS DAYS - FROM rn_t - WHERE rn = 1 - ) - SELECT * - FROM calc_days_t - WHERE 1=1 - AND EXPIRES_AT >= unix_timestamp("{YESTERDAY} 00:00:00", "yyyy-MM-dd HH:mm:ss")*1000 - AND EXPIRES_AT < unix_timestamp("{TODAY} 00:00:00", "yyyy-MM-dd HH:mm:ss")*1000 - """ - - tmpdf = spark.sql(query) - tmpdf.show() - - # Convert database to dictionary - - docs = tmpdf.toPandas().to_dict('records') - - # Define type of each schema - - def get_index_schema(): - return { - "settings": {"index": {"number_of_shards": "1", "number_of_replicas": "1"}}, - "mappings": { - "properties": { - "timestamp": {"format": "epoch_second", "type": "date"}, - "ID": {"ignore_above": 1024, "type": "keyword"}, - "NAME": {"ignore_above": 2048, "type": "keyword"}, - "STATE": {"ignore_above": 1024, "type": "keyword"}, - "EXPIRES_AT": {"format": "epoch_millis", "type": "date"}, - "UPDATED_AT": {"format": "epoch_millis", "type": "date"}, - "CREATED_AT": {"format": "epoch_millis", "type": "date"}, - "DAYS": {"type": "long"}, - } - } - } - - # Send data to Opensearch - - _index_template = 'crab-tape-recall-daily' - client = 
osearch.get_es_client("os-cms.cern.ch/es", '/data/certs/monit.d/monit_spark_crab.txt', get_index_schema()) - idx = client.get_or_create_index(timestamp=day.strftime("%s"), index_template=_index_template, index_mod="M") - no_of_fail_saved = client.send(idx, docs, metadata=None, batch_size=10000, drop_nulls=False) - - print("=================================== RUCIO : Rules History =====================================" - , "FINISHED : " - , len(docs), "ROWS ARE SENT" - , no_of_fail_saved, "ROWS ARE FAILED" - , "=================================== RUCIO : Rules History =====================================", sep='\n') - - -for day in date_list: - process_single_day(day) diff --git a/src/script/Monitor/crab-spark/cronjobs/crab_tape_recall_updated_rules_daily.py b/src/script/Monitor/crab-spark/cronjobs/crab_tape_recall_updated_rules_daily.py deleted file mode 100644 index 79c4bc019e..0000000000 --- a/src/script/Monitor/crab-spark/cronjobs/crab_tape_recall_updated_rules_daily.py +++ /dev/null @@ -1,163 +0,0 @@ - -from datetime import datetime, timedelta -import os -import pandas as pd -import time - -import argparse - -parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument("-s", "--start-date", default=None, - help="process data starting from this day, inclusive (YYYY-MM-DD)",) -parser.add_argument("-e", "--end-date", default=None, - help="process data until this day, not included (YYYY-MM-DD)",) -args = parser.parse_args() -print(f"timerange: [{args.start_date} {args.end_date})" ) - -from pyspark import SparkContext, StorageLevel -from pyspark.sql import SparkSession -from pyspark.sql.functions import ( - col, collect_list, concat_ws, greatest, lit, lower, when, - avg as _avg, - count as _count, - hex as _hex, - max as _max, - min as _min, - round as _round, - sum as _sum, -) -from pyspark.sql.types import ( - LongType, -) -import numpy as np -import osearch -from pyspark.sql import SparkSession - -spark = SparkSession\ - .builder\ - .appName("crab_tape_recall")\ - .getOrCreate() - -# Data date - -if args.end_date: - end_date = datetime.strptime(args.end_date, '%Y-%m-%d') -else: - end_date = datetime.now() - end_date = end_date.replace(minute=0, hour=0, second=0, microsecond=0) - -if args.start_date: - start_date = datetime.strptime(args.start_date, '%Y-%m-%d') -else: - start_date = end_date - timedelta(days=1) - -date_list = pd.date_range( - start=start_date, - end=end_date, - ).to_pydatetime().tolist() - -def process_single_day(day): - - wa_date = day.strftime("%Y-%m-%d") - - # Import data into database form - - HDFS_RUCIO_DATASET_LOCKS = f'/project/awg/cms/rucio/{wa_date}/dataset_locks/part*.avro' - HDFS_RUCIO_RSES = f'/project/awg/cms/rucio/{wa_date}/rses/part*.avro' - HDFS_RUCIO_RULES = f'/project/awg/cms/rucio/{wa_date}/rules' - print("===============================================", "File Directory:", HDFS_RUCIO_DATASET_LOCKS, "Work Directory:", os.getcwd(), "===============================================", sep='\n') - - print("===============================================" - , "RUCIO : Rules, RSEs, Dataset" - , "===============================================" - , "File Directory:", HDFS_RUCIO_DATASET_LOCKS - , "Work Directory:", os.getcwd() - , "===============================================" - , "===============================================", sep='\n') - rucio_dataset_locks = spark.read.format('avro').load(HDFS_RUCIO_DATASET_LOCKS)\ - .withColumn('BYTES', col('BYTES').cast(LongType()))\ - .withColumn('RULE_ID', 
lower(_hex(col('RULE_ID'))))\ - .withColumn('RSE_ID', lower(_hex(col('RSE_ID')))) - rucio_dataset_locks.createOrReplaceTempView("dataset_locks") - - rucio_rses = spark.read.format('avro').load(HDFS_RUCIO_RSES)\ - .withColumn('ID', lower(_hex(col('ID')))) - rucio_rses.createOrReplaceTempView("rses") - - rucio_rules = spark.read.format('avro').load(HDFS_RUCIO_RULES)\ - .withColumn('ID', lower(_hex(col('ID')))) - rucio_rules.createOrReplaceTempView("rules") - - # filter and query - - rucio_dataset_locks = rucio_dataset_locks.filter(f"""ACCOUNT IN ('crab_tape_recall')""").cache() - rucio_rses = rucio_rses.select('ID', 'RSE', 'RSE_TYPE').cache() - rucio_rules = rucio_rules.select('ID', 'ACCOUNT', 'DID_TYPE', 'EXPIRES_AT').cache() - - result_df = rucio_dataset_locks.join(rucio_rses, rucio_rses["ID"] == rucio_dataset_locks["RSE_ID"])\ - .join(rucio_rules, rucio_rules["ID"] == rucio_dataset_locks["RULE_ID"]).drop('ID', 'RULE_ID', 'RSE_ID', 'ACCESSED_AT', 'ACCOUNT') - - # Convert database to dictionary - - docs = result_df.toPandas().to_dict('records') - - # Add TIMESTAMP column and convert TiB - TIME = datetime.strptime(f"""{wa_date} 00:00:00""", "%Y-%m-%d %H:%M:%S").timestamp()*1000 - for i in range(len(docs)): - docs[i]['TIMESTAMP'] = TIME - docs[i]['SIZE_TiB'] = docs[i]["BYTES"]/1099511627776 - del docs[i]["BYTES"] - - # break down the name - NAME_i = docs[i]['NAME'] - split_NAME = NAME_i.split('#')[0] - docs[i]['NAME_'] = NAME_i.split('#')[0] - split_NAME = docs[i]['NAME_'].split('/') - if len(split_NAME) != 4: - print("YO HOO !!, something wrong.", NAME_i) - docs[i]['PriDataset'] = split_NAME[1] - docs[i]['DataTier'] = split_NAME[-1] - - # Define type of each schema - - def get_index_schema(): - return { - "settings": {"index": {"number_of_shards": "1", "number_of_replicas": "1"}}, - "mappings": { - "properties": { - 'SCOPE': {"ignore_above": 2048, "type": "keyword"}, - 'NAME': {"ignore_above": 2048, "type": "keyword"}, - 'STATE': {"ignore_above": 1024, "type": "keyword"}, - 'LENGTH': {"ignore_above": 1024, "type": "keyword"}, - 'SIZE_TiB': {"type": "long"}, - 'UPDATED_AT': {"format": "epoch_millis", "type": "date"}, - 'CREATED_AT': {"format": "epoch_millis", "type": "date"}, - 'RSE': {"ignore_above": 2048, "type": "keyword"}, - 'RSE_TYPE': {"ignore_above": 2048, "type": "keyword"}, - 'DID_TYPE': {"ignore_above": 1024, "type": "keyword"}, - 'EXPIRES_AT': {"format": "epoch_millis", "type": "date"}, - 'TIMESTAMP': {"format": "epoch_millis", "type": "date"}, - 'NAME_': {"ignore_above": 2048, "type": "keyword"}, - 'PriDataset': {"ignore_above": 2048, "type": "keyword"}, - 'DataTier': {"ignore_above": 2048, "type": "keyword"}, - } - } - } - - # Send data to Opensearch - - _index_template = 'crab-tape-recall-rules' - client = osearch.get_es_client("os-cms.cern.ch/es", '/data/certs/monit.d/monit_spark_crab.txt', get_index_schema()) - idx = client.get_or_create_index(timestamp=day.strftime("%s"), index_template=_index_template, index_mod="M") - no_of_fail_saved = client.send(idx, docs, metadata=None, batch_size=10000, drop_nulls=False) - - print("==================================== RUCIO : Rules, RSEs, Dataset ====================================" - , "FINISHED : " - , len(docs), "ROWS ARE SENT" - , no_of_fail_saved, "ROWS ARE FAILED" - , "==================================== RUCIO : Rules, RSEs, Dataset ====================================", sep='\n') - - -for day in date_list: - process_single_day(day) - diff --git a/src/script/Monitor/crab-spark/cronjobs/cron_daily.sh 
b/src/script/Monitor/crab-spark/cronjobs/cron_daily.sh deleted file mode 100644 index 7665fa59b8..0000000000 --- a/src/script/Monitor/crab-spark/cronjobs/cron_daily.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash - -TAG=latest -if [[ -n $1 ]]; then - TAG=$1 -fi - -docker run --rm --net=host -v /cvmfs:/cvmfs:shared \ - -v /data/certs/monit.d/monit_spark_crab.txt:/data/certs/monit.d/monit_spark_crab.txt \ - -v /data/certs/keytabs.d/cmscrab.keytab:/data/certs/keytabs.d/cmscrab.keytab \ - registry.cern.ch/cmscrab/crabspark:${TAG} \ - bash /data/srv/spark/run_spark.sh /data/srv/spark/crab_data_daily.py \ - -docker run --rm --net=host -v /cvmfs:/cvmfs:shared \ - -v /data/certs/monit.d/monit_spark_crab.txt:/data/certs/monit.d/monit_spark_crab.txt \ - -v /data/certs/keytabs.d/cmscrab.keytab:/data/certs/keytabs.d/cmscrab.keytab \ - registry.cern.ch/cmscrab/crabspark:${TAG} \ - bash /data/srv/spark/run_spark.sh /data/srv/spark/crab_condor_daily.py - - docker run --rm --net=host -v /cvmfs:/cvmfs:shared \ - -v /data/certs/monit.d/monit_spark_crab.txt:/data/certs/monit.d/monit_spark_crab.txt \ - -v /data/certs/keytabs.d/cmscrab.keytab:/data/certs/keytabs.d/cmscrab.keytab \ - registry.cern.ch/cmscrab/crabspark:${TAG} \ - bash /data/srv/spark/run_spark.sh /data/srv/spark/crab_tape_recall_rules_history_daily.py - -docker run --rm --net=host -v /cvmfs:/cvmfs:shared \ - -v /data/certs/monit.d/monit_spark_crab.txt:/data/certs/monit.d/monit_spark_crab.txt \ - -v /data/certs/keytabs.d/cmscrab.keytab:/data/certs/keytabs.d/cmscrab.keytab \ - registry.cern.ch/cmscrab/crabspark:${TAG} \ - bash /data/srv/spark/run_spark.sh /data/srv/spark/crab_tape_recall_updated_rules_daily.py - diff --git a/src/script/Monitor/crab-spark/cronjobs/run_spark.sh b/src/script/Monitor/crab-spark/cronjobs/run_spark.sh index fb5b13d3ab..5412f4119b 100644 --- a/src/script/Monitor/crab-spark/cronjobs/run_spark.sh +++ b/src/script/Monitor/crab-spark/cronjobs/run_spark.sh @@ -1,10 +1,15 @@ #!/bin/bash +set -euo pipefail + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # work directory -cd /data/srv/spark +pushd "${SCRIPT_DIR}" # source the environment for spark submit -source ./bootstrap.sh +source ../workdir/bootstrap.sh # submit $1 to spark, where $1 supposes to be a data pulling file (.py) spark-submit --master yarn --packages org.apache.spark:spark-avro_2.12:3.5.0 $@ + +popd diff --git a/src/script/Monitor/crab-spark/notebooks/crab_condor.ipynb b/src/script/Monitor/crab-spark/notebooks/crab_condor.ipynb index 82f3f81eb2..65855e1112 100644 --- a/src/script/Monitor/crab-spark/notebooks/crab_condor.ipynb +++ b/src/script/Monitor/crab-spark/notebooks/crab_condor.ipynb @@ -1,85 +1,43 @@ { "cells": [ { - "cell_type": "code", - "execution_count": 1, - "id": "cf212bba", + "cell_type": "markdown", + "id": "aed9b54a", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - "

SparkSession - in-memory

\n", - " \n", - "
\n", - "

SparkContext

\n", - "\n", - "

Spark UI

\n", - "\n", - "
\n", - "
Version
\n", - "
v3.3.2
\n", - "
Master
\n", - "
yarn
\n", - "
AppName
\n", - "
pyspark_shell_swan
\n", - "
\n", - "
\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "spark" + "# CRAB Spark condor job\n", + "\n", + "This join info between the condor job metrics and crab taskdb, to answer these questions:\n", + "- How many jobs use ignorelocality?\n", + "- What is wall clock time spent by each CMS data tier and each job type?\n", + "- What is the success rate of the Analysis job type?\n" ] }, { "cell_type": "code", - "execution_count": 2, - "id": "77d4d561", + "execution_count": null, + "id": "5e9af689", "metadata": {}, "outputs": [], "source": [ + "from datetime import datetime, timedelta, timezone\n", "import os\n", - "import sys\n", - "\n", - "os.environ['PYSPARK_PYTHON'] = sys.executable\n", - "os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable\n", - "\n", "import time\n", - "# from utils import (\n", - "# _to_dict,\n", - "# _donut,\n", - "# _pie,\n", - "# _line_graph,\n", - "# _other_fields,\n", - "# _exitcode_info,\n", - "# _better_label\n", - "# )\n", - "from datetime import datetime, date, timedelta\n", + "import pandas as pd\n", + "\n", + "from pyspark import SparkContext, StorageLevel\n", + "from pyspark.sql import SparkSession\n", "from pyspark.sql.functions import (\n", - " col,\n", - " lit,\n", - " when,\n", - " sum as _sum,\n", + " current_user,\n", + " col, collect_list, concat_ws, greatest, lit, lower, when,\n", + " avg as _avg,\n", " count as _count,\n", - " first,\n", - " date_format,\n", - " from_unixtime\n", + " hex as _hex,\n", + " max as _max,\n", + " min as _min,\n", + " round as _round,\n", + " sum as _sum,\n", ")\n", - "import numpy as np\n", - "import pandas as pd\n", "from pyspark.sql.types import (\n", " StructType,\n", " LongType,\n", @@ -87,615 +45,407 @@ " StructField,\n", " DoubleType,\n", " IntegerType,\n", - ")\n", - "# spark.conf.set(\"spark.sql.execution.arrow.pyspark.enabled\", \"true\")\n" + ")" ] }, { - "cell_type": "markdown", - "id": "6b14b465", + "cell_type": "code", + "execution_count": null, + "id": "51b2f1c7", "metadata": {}, + "outputs": [], "source": [ - "### Prepare condor file name/configuration" + "# try to import libs from current directory, fallback to $PWD/../workdir if not found\n", + "try:\n", + " from crabspark_utils import get_candidate_files, send_os, send_os_parallel\n", + "except ModuleNotFoundError:\n", + " import sys\n", + " sys.path.insert(0, f'{os.getcwd()}/../workdir')\n", + " from crabspark_utils import get_candidate_files, send_os, send_os_parallel\n" ] }, { "cell_type": "code", - "execution_count": 3, - "id": "65a21e3a", + "execution_count": null, + "id": "22946659", "metadata": {}, "outputs": [], "source": [ - "def _get_schema():\n", - " return StructType(\n", - " [\n", - " StructField(\n", - " \"data\",\n", - " StructType(\n", - " [\n", - " StructField(\"RecordTime\", LongType(), nullable=False),\n", - " StructField(\"CMSPrimaryDataTier\", StringType(), nullable=True),\n", - " StructField(\"Status\", StringType(), nullable=True),\n", - " StructField(\"WallClockHr\", DoubleType(), nullable=True),\n", - " StructField(\"CoreHr\", DoubleType(), nullable=True),\n", - " StructField(\"CpuTimeHr\", DoubleType(), nullable=True),\n", - " StructField(\"Type\", StringType(), nullable=True),\n", - " StructField(\"CRAB_DataBlock\", StringType(), nullable=True),\n", - " StructField(\"GlobalJobId\", StringType(), nullable=False),\n", - " StructField(\"ExitCode\", LongType(), nullable=True),\n", - " StructField(\"CRAB_Workflow\", StringType(), 
nullable=True),\n", - " StructField(\"CommittedCoreHr\", StringType(), nullable=True),\n", - " StructField(\"CommittedWallClockHr\", StringType(), nullable=True),\n", - " ]\n", - " ),\n", - " ),\n", - " ]\n", - " )" + "spark = SparkSession\\\n", + " .builder\\\n", + " .appName('condor-job')\\\n", + " .getOrCreate()\n", + "spark" ] }, { "cell_type": "code", - "execution_count": 4, - "id": "5344e275", + "execution_count": null, + "id": "d37c4539", "metadata": {}, "outputs": [], "source": [ - "_DEFAULT_HDFS_FOLDER = \"/project/monitoring/archive/condor/raw/metric\"" + "# clear any cache left, for working with notebook\n", + "# it safe to run everytime cronjob start\n", + "spark.catalog.clearCache()" ] }, { "cell_type": "code", - "execution_count": 5, - "id": "c20d8d62", + "execution_count": null, + "id": "31c19eb0", "metadata": {}, "outputs": [], "source": [ - "# # Check available files \n", - "# !hdfs dfs -ls /project/monitoring/archive/condor/raw/metric/2023/07/08" + "# secret path, also check if file exists\n", + "secretpath = os.environ.get('OPENSEARCH_SECRET_PATH', f'{os.getcwd()}/../workdir/secret_opensearch.txt')\n", + "if not os.path.isfile(secretpath): \n", + " raise Exception(f'OS secrets file {secretpath} does not exists')\n", + "# if PROD, index prefix will be `crab-*`, otherwise `crab-test-*`\n", + "PROD = os.environ.get('PROD', 'false').lower() in ('true', '1', 't')\n", + "# FROM_DATE, in strptime(\"%Y-%m-%d\")\n", + "START = os.environ.get('START_DATE', None) \n", + "END = os.environ.get('END_DATE', None)" ] }, { "cell_type": "code", - "execution_count": 6, - "id": "8d821f8f", + "execution_count": null, + "id": "e843eb6d", "metadata": {}, "outputs": [], "source": [ - "def get_candidate_files(start_date, end_date, spark, base=_DEFAULT_HDFS_FOLDER):\n", - " st_date = start_date - timedelta(days=0)\n", - " ed_date = end_date + timedelta(days=0)\n", - " days = (ed_date - st_date).days\n", - " pre_candidate_files = [\n", - " \"{base}/{day}{{,.tmp}}\".format(\n", - " base=base, day=(st_date + timedelta(days=i)).strftime(\"%Y/%m/%d\")\n", - " )\n", - " for i in range(0, days)\n", - " ]\n", - " sc = spark.sparkContext\n", - " \n", - " candidate_files = [\n", - " f\"{base}/{(st_date + timedelta(days=i)).strftime('%Y/%m/%d')}\"\n", - " for i in range(0, days)\n", - " ]\n", - " FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem\n", - " URI = sc._gateway.jvm.java.net.URI\n", - " Path = sc._gateway.jvm.org.apache.hadoop.fs.Path\n", - " fs = FileSystem.get(URI(\"hdfs:///\"), sc._jsc.hadoopConfiguration())\n", - " # FIXME\n", - " candidate_files = [url for url in candidate_files if fs.globStatus(Path(url))]\n", - " print(\"No. 
of Consisted files:\", len(candidate_files))\n", - " return candidate_files\n", - "\n", - "# all_candidate_files = []\n", - "# candidate_files = [\n", - "# f\"{base}/{(st_date + timedelta(days=i)).strftime('%Y/%m/%d')}\"\n", - "# for i in range(0, days)\n", - "# ]\n", - " \n", - "# URI = sc._gateway.jvm.java.net.URI\n", - "# Path = sc._gateway.jvm.org.apache.hadoop.fs.Path\n", - "# FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem\n", - "# Configuration = sc._gateway.jvm.org.apache.hadoop.conf.Configuration\n", - "# fs = FileSystem.get(URI(\"hdfs:///\"), Configuration())\n", - "\n", - "# for fileNames in candidate_files:\n", - "# status = fs.listStatus(Path(fileNames))\n", - "# candidate_files_day_i = [\n", - "# str(fileStatus.getPath()).replace('hdfs://analytix', '')\n", - "# for fileStatus in status\n", - "# ]\n", - "# all_candidate_files.extend(candidate_files_day_i)\n", - "# print(\"Files Directory:\", candidate_files, \"\\nNo. of Consisted files:\", len(all_candidate_files))\n", - "# return all_candidate_files\n", - "\n", - "def group_files(files, n=16):\n", - " # Yield successive n-sized\n", - " # chunks from files\n", - " all_group = []\n", - " for i in range(0, len(files), n):\n", - " all_group.append(files[i:i+n])\n", - " print(\"There are\", len(all_group), \"chunks of files\")\n", - " return all_group" + "# For run playbook manually, set start/end date here\n", + "START_DATE = \"2024-10-01\"\n", + "END_DATE = \"2024-10-02\"\n", + "# if cronjob, replace constant with value from env\n", + "if START and END:\n", + " START_DATE = START\n", + " END_DATE = END" ] }, { - "cell_type": "markdown", - "id": "9a57477b", + "cell_type": "code", + "execution_count": null, + "id": "430146eb", "metadata": {}, + "outputs": [], "source": [ - "## load dataset" + "# index name\n", + "index_name = 'condor-taskdb'\n", + "# use prod index pattern if this execution is for production\n", + "if PROD:\n", + " index_name = f'crab-prod-{index_name}'\n", + "else:\n", + " index_name = f'crab-test-{index_name}'" ] }, { "cell_type": "code", - "execution_count": 7, - "id": "28bcc686", + "execution_count": null, + "id": "2a3b6697", "metadata": {}, "outputs": [], "source": [ - "schema = _get_schema()\n", - "start_date = datetime(2023, 8, 10)\n", - "end_date = datetime(2023, 8, 11)" + "# datetime object\n", + "start_datetime = datetime.strptime(START_DATE, \"%Y-%m-%d\").replace(tzinfo=timezone.utc)\n", + "end_datetime = datetime.strptime(END_DATE, \"%Y-%m-%d\").replace(tzinfo=timezone.utc)\n", + "# sanity check\n", + "if end_datetime < start_datetime: \n", + " raise Exception(f\"end date ({END_DATE}) is less than start date ({START_DATE})\")\n", + "start_epochmilis = int(start_datetime.timestamp()) * 1000\n", + "end_epochmilis = int(end_datetime.timestamp()) * 1000\n", + "yesterday_epoch = int((end_datetime-timedelta(days=1)).timestamp())" ] }, { "cell_type": "code", - "execution_count": 8, - "id": "bec66775", + "execution_count": null, + "id": "9404c437", "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "No. 
of Consisted files: 1\n" - ] - }, - { - "data": { - "text/plain": [ - "['/project/monitoring/archive/condor/raw/metric/2023/08/10']" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "candidate_files = get_candidate_files(start_date, end_date, spark, base=_DEFAULT_HDFS_FOLDER)\n", - "candidate_files" - ] - }, - { - "cell_type": "markdown", - "id": "894bdcf0", - "metadata": {}, + "outputs": [], "source": [ - "### Prepare CRAB data file name" + "# debug\n", + "print(START_DATE, \n", + " END_DATE, \n", + " index_name,\n", + " sep='\\n')" ] }, { "cell_type": "code", - "execution_count": 9, - "id": "b4120002", + "execution_count": null, + "id": "9d4bb4d0", "metadata": {}, "outputs": [], "source": [ - "TODAY = str(end_date)[:10]\n", - "wa_date = TODAY\n", - "HDFS_CRAB_part = f'/project/awg/cms/crab/tasks/{wa_date}/'" + "# read crab data\n", + "HDFS_CRAB_part = f'/project/awg/cms/crab/tasks/{END_DATE}/' \n", + "crab_df = spark.read.format('avro').load(HDFS_CRAB_part)\n", + "# we did not filter the task here because most jobs was created from older tasks.\n", + "# if there are too many crab tasks, it might be safe to filter out the tasks older than 30+7 days ago.\n", + "crab_df = crab_df.select('TM_TASKNAME', 'TM_IGNORE_LOCALITY').cache()\n", + "crab_df.createOrReplaceTempView(\"tasks\")" ] }, { - "cell_type": "markdown", - "id": "de4d8e96", - "metadata": {}, + "cell_type": "code", + "execution_count": null, + "id": "f15887f4", + "metadata": { + "scrolled": true + }, + "outputs": [], "source": [ - "### Get raw data from condor raw" + "# read condor data\n", + "# reading file 2 days before start date and 1 days after end date inclusive\n", + "# sometime flume (condor log aggregator) process the metrics is delay for 2 days, sometime it has timestamp from the future.\n", + "# so we do this to make sure we get all metrics from the date we want. 
(all of these suggested by CMSMONIT)\n", + "# Note that we read all files, compact or not, even it has the same content, we will dedup it in the next step.\n", + "_DEFAULT_HDFS_FOLDER = \"/project/monitoring/archive/condor/raw/metric\"\n", + "candidate_files = get_candidate_files(start_datetime, end_datetime, spark=spark, base=_DEFAULT_HDFS_FOLDER, day_delta=2)\n", + "\n", + "# this is map json doc to spark schema\n", + "read_schema = StructType(\n", + " [\n", + " StructField(\n", + " \"data\",\n", + " StructType(\n", + " [\n", + " StructField(\"RecordTime\", LongType(), nullable=False),\n", + " StructField(\"CMSPrimaryDataTier\", StringType(), nullable=True),\n", + " StructField(\"Status\", StringType(), nullable=True),\n", + " StructField(\"WallClockHr\", DoubleType(), nullable=True),\n", + " StructField(\"CoreHr\", DoubleType(), nullable=True),\n", + " StructField(\"CpuTimeHr\", DoubleType(), nullable=True),\n", + " StructField(\"Type\", StringType(), nullable=True),\n", + " StructField(\"CRAB_DataBlock\", StringType(), nullable=True),\n", + " StructField(\"GlobalJobId\", StringType(), nullable=False),\n", + " StructField(\"ExitCode\", LongType(), nullable=True),\n", + " StructField(\"CRAB_Workflow\", StringType(), nullable=True),\n", + " StructField(\"CommittedCoreHr\", StringType(), nullable=True),\n", + " StructField(\"CommittedWallClockHr\", StringType(), nullable=True),\n", + " ]\n", + " ),\n", + " ),\n", + " ]\n", + " )\n", + "print(\"===============================================\"\n", + " , \"Condor Matrix and CRAB Table\"\n", + " , \"===============================================\"\n", + " , \"File Directory:\", _DEFAULT_HDFS_FOLDER, candidate_files\n", + " , \"Work Directory:\", os.getcwd()\n", + " , \"===============================================\"\n", + " , \"===============================================\", sep='\\n')" ] }, { "cell_type": "code", - "execution_count": 10, - "id": "0aa94c64", + "execution_count": null, + "id": "fd3bcb00", "metadata": {}, "outputs": [], "source": [ - "spark.conf.set(\"spark.sql.session.timeZone\", \"UTC\")\n", - "\n", - "crab_df = spark.read.format('avro').load(HDFS_CRAB_part)\n", - "crab_df = crab_df.select('TM_TASKNAME', 'TM_IGNORE_LOCALITY')" + "crab_username = spark.sql(\"\"\"SELECT current_user() AS user\"\"\").toPandas().to_dict('records')[0]['user']" ] }, { "cell_type": "code", - "execution_count": 25, - "id": "b35668ef", + "execution_count": null, + "id": "515aefbc", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "23/08/16 13:48:02 WARN CacheManager: Asked to cache already cached data.\n" - ] - } - ], + "outputs": [], "source": [ - "condor_df = (\n", - " spark.read.option(\"basePath\", _DEFAULT_HDFS_FOLDER)\n", - " .json(\n", + "# extract only \"interested data\" from condor metrics and save into temporary area\n", + "# need to do this because we do not have enough memory to compute all data at once.\n", + "# (1 days is ok, 1 month got spark OOM)\n", + "# \"interested data\" is\n", + "# - selected column (see `read_schema` above)\n", + "# - date range from START_DATE inclusive to END_DATE exclusive\n", + "# - only status Complete and type analysis\n", + "# job will got dedup by `.drop_duplicates([\"GlobalJobId\"])` in later step\n", + "( \n", + " spark.read.option(\"basePath\", _DEFAULT_HDFS_FOLDER)\n", + " .json(\n", " candidate_files,\n", - " schema=schema,\n", - " ).select(\"data.*\")\n", - " .filter(\n", + " schema=read_schema,\n", + " )\n", + " .select(\"data.*\")\n", + " 
.filter(\n", " f\"\"\"Status IN ('Completed')\n", " AND Type IN ('analysis')\n", - " AND RecordTime >= {start_date.timestamp() * 1000}\n", - " AND RecordTime < {end_date.timestamp() * 1000}\n", + " AND RecordTime >= {start_epochmilis}\n", + " AND RecordTime < {end_epochmilis}\n", " \"\"\"\n", - " )\n", - " .drop_duplicates([\"GlobalJobId\"]).cache()\n", - " ) \n", - "condor_df.write.mode('overwrite').parquet(\"/cms/users/eatthaph/condor_vir_data\" ,compression='zstd')\n", - "condor_df = spark.read.format('parquet').load('/cms/users/eatthaph/condor_vir_data')\n", - "# condor_df.count()" + " )\n", + " .drop_duplicates([\"GlobalJobId\"])\n", + " .write.mode('overwrite').parquet(f\"/cms/users/{crab_username}/condor_vir_data\" ,compression='zstd') # overriding the same path to cleanup old data. However, we could not run it parallel\n", + ")\n", + "spark.catalog.clearCache()" ] }, { "cell_type": "code", - "execution_count": 27, - "id": "7656d1f3", + "execution_count": null, + "id": "957ac50a", "metadata": {}, "outputs": [], "source": [ - "result_df = condor_df.join(crab_df, crab_df[\"TM_TASKNAME\"] == condor_df[\"CRAB_Workflow\"])\\\n", - " .select('RecordTime', 'CMSPrimaryDataTier', 'WallClockHr', 'CoreHr', 'CpuTimeHr', 'ExitCode'\n", - " , \"CRAB_DataBlock\", \"TM_IGNORE_LOCALITY\", \"GlobalJobId\", \"CommittedCoreHr\", \"CommittedWallClockHr\")\n", - "docs = result_df.toPandas()" + "condor_df = spark.read.format('parquet').load(f\"/cms/users/{crab_username}/condor_vir_data\").cache()\n", + "condor_df.createOrReplaceTempView(\"condor\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "2b04b914", - "metadata": {}, + "id": "e271b1c8", + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ - "len(docs)" + "# query\n", + "query = f\"\"\"\\\n", + "WITH filter_tb AS (\n", + "SELECT *\n", + "FROM condor\n", + "WHERE 1=1\n", + "AND RecordTime >= {start_epochmilis}\n", + "AND RecordTime < {end_epochmilis}\n", + "),\n", + "join_tb AS (\n", + "SELECT RecordTime, CMSPrimaryDataTier, WallClockHr, CoreHr, CpuTimeHr, ExitCode, CRAB_DataBlock, TM_IGNORE_LOCALITY, GlobalJobId, CommittedCoreHr, CommittedWallClockHr\n", + "FROM filter_tb\n", + "INNER JOIN tasks \n", + "ON filter_tb.CRAB_Workflow = tasks.TM_TASKNAME \n", + "), \n", + "finalize_tb AS (\n", + "SELECT RecordTime, CMSPrimaryDataTier, WallClockHr, CoreHr, CpuTimeHr, ExitCode, CRAB_DataBlock, TM_IGNORE_LOCALITY, GlobalJobId, CommittedCoreHr, CommittedWallClockHr, \n", + " CASE \n", + " WHEN CRAB_DataBlock = 'MCFakeBlock' THEN 'PrivateMC' \n", + " ELSE 'Analysis'\n", + " END AS CRAB_Type, --- to differentiate between analysis and mc\n", + " 'condor' AS type, --- use to match specific data when use wildcard index pattern on grafana side\n", + " RecordTime AS timestamp --- use `RecordTime` as timestamp\n", + "FROM join_tb\n", + ")\n", + "SELECT * \n", + "FROM finalize_tb \n", + "\"\"\"\n", + "tmpdf = spark.sql(query)\n", + "tmpdf.show(10)\n", + "\n" ] }, { "cell_type": "code", - "execution_count": 19, - "id": "fa3f9917", + "execution_count": null, + "id": "75c6a964", "metadata": {}, "outputs": [], "source": [ - "# def spark_exec(candidate_files):\n", - "# condor_df = (\n", - "# spark.read.option(\"basePath\", _DEFAULT_HDFS_FOLDER)\n", - "# .json(\n", - "# candidate_files,\n", - "# schema=schema,\n", - "# ).select(\"data.*\")\n", - "# .filter(\n", - "# f\"\"\"Status IN ('Completed')\n", - "# AND Type IN ('analysis')\n", - "# AND RecordTime >= {start_date.timestamp() * 1000}\n", - "# AND RecordTime < {end_date.timestamp() * 
1000}\n", - "# \"\"\"\n", - "# )\n", - "# .drop_duplicates([\"GlobalJobId\"]).cache()\n", - "# ) \n", - "# condor_df.write.mode('overwrite').parquet(\"/cms/users/eatthaph/condor_vir_data\" ,compression='zstd')\n", - "# condor_df = spark.read.format('parquet').load('/cms/users/eatthaph/condor_vir_data')\n", - "# result_df = condor_df.join(crab_df, crab_df[\"TM_TASKNAME\"] == condor_df[\"CRAB_Workflow\"])\\\n", - "# .select('RecordTime', 'CMSPrimaryDataTier', 'WallClockHr', 'CoreHr', 'CpuTimeHr', 'ExitCode'\n", - "# , \"CRAB_DataBlock\", \"TM_IGNORE_LOCALITY\", \"GlobalJobId\", \"CommittedCoreHr\", \"CommittedWallClockHr\")\n", - "# sub_docs = result_df.toPandas()\n", - "# return sub_docs\n", - "\n", - "# def loop_excute(candidate_files, initial_n=len(candidate_files)):\n", - "# r = 0\n", - "# n = initial_n\n", - "# df_list = []\n", - "# file_chunk = group_files(candidate_files, n)\n", - "# while len(file_chunk)!=0 and r<10:\n", - "# print(\"=================================\\n round :\", r+1, \"\\n=================================\")\n", - "# df_err_list = []\n", - "# for i, chunk in enumerate(file_chunk):\n", - "# print(\"=================================\\n\", i+1, \"out of\", len(file_chunk), \"\\n=================================\")\n", - "# try:\n", - "# df_list.append(spark_exec(chunk))\n", - "# except Exception as ex:\n", - "# print(\"=====\", ex)\n", - "# df_err_list.extend(chunk)\n", - "# # if n != 1:\n", - "# # n = n//2\n", - "# file_chunk = group_files(df_err_list, n)\n", - "# r += 1\n", - "# print(\"\")\n", - "# print(\"Fail excuted files :\", df_err_list)\n", - "# return df_list" + "tmpdf.count()" ] }, { "cell_type": "code", - "execution_count": 1, - "id": "af6d5e17", + "execution_count": null, + "id": "eee4a1f3", "metadata": {}, "outputs": [], "source": [ - "# useful_df = loop_excute(candidate_files)\n", - "# df_list = spark_exec(candidate_files)" + "schema = {\n", + " \"settings\": {\"index\": {\"number_of_shards\": \"1\", \"number_of_replicas\": \"1\"}},\n", + " \"mappings\": {\n", + " \"properties\": {\n", + " \"RecordTime\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n", + " \"CMSPrimaryDataTier\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", + " \"GlobalJobId\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", + " \"WallClockHr\": {\"type\": \"long\"},\n", + " \"CoreHr\": {\"type\": \"long\"},\n", + " \"CpuTimeHr\": {\"type\": \"long\"},\n", + " \"ExitCode\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", + " \"TM_IGNORE_LOCALITY\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", + " \"CRAB_Type\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", + " \"CRAB_DataBlock\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", + " \"CommittedCoreHr\": {\"type\": \"long\"}, \n", + " \"CommittedWallClockHr\": {\"type\": \"long\"},\n", + " \"type\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", + " \"timestamp\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n", + " }\n", + " }\n", + " }" ] }, { "cell_type": "code", - "execution_count": 15, - "id": "18908dab", - "metadata": {}, + "execution_count": null, + "id": "5d0506d4", + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ - "docs = docs.to_dict('records')" + "# this is simple workaround osearch bug when work in notebook because\n", + "# - it load the secret once and use forever\n", + "# - get_or_create_index() create index+schema only the first time it execute\n", + "# it is safe to run again even in cronjobs \n", + "import importlib\n", + "import osearch\n", + 
"importlib.reload(osearch)" ] }, { "cell_type": "code", - "execution_count": 16, - "id": "c912b217", + "execution_count": null, + "id": "47a4f569", "metadata": {}, "outputs": [], "source": [ - "for i in range(len(docs)):\n", - " if docs[i]['CRAB_DataBlock'] == 'MCFakeBlock':\n", - " docs[i]['CRAB_Type'] = 'PrivateMC'\n", - " else:\n", - " docs[i]['CRAB_Type'] = 'Analysis'" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "0e3ae57b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'RecordTime': 1692101192000,\n", - " 'CMSPrimaryDataTier': 'MINIAODSIM',\n", - " 'WallClockHr': 0.12361111111111112,\n", - " 'CoreHr': 0.12361111111111112,\n", - " 'CpuTimeHr': 0.0022222222222222222,\n", - " 'ExitCode': 8020,\n", - " 'CRAB_DataBlock': '/WWTo4Q_4f_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18MiniAODv2-106X_upgrade2018_realistic_v16_L1v1-v3/MINIAODSIM#eb5a0cbd-6c43-492c-ac21-4318775aee3b',\n", - " 'TM_IGNORE_LOCALITY': 'F',\n", - " 'GlobalJobId': 'crab3@vocms0155.cern.ch#98631116.0#1692100543',\n", - " 'CommittedCoreHr': '0.12361111111111112',\n", - " 'CommittedWallClockHr': '0.12361111111111112',\n", - " 'CRAB_Type': 'Analysis'},\n", - " {'RecordTime': 1692099933000,\n", - " 'CMSPrimaryDataTier': 'MINIAODSIM',\n", - " 'WallClockHr': 0.12166666666666667,\n", - " 'CoreHr': 0.12166666666666667,\n", - " 'CpuTimeHr': 0.004722222222222222,\n", - " 'ExitCode': 8020,\n", - " 'CRAB_DataBlock': '/WWTo4Q_4f_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18MiniAODv2-106X_upgrade2018_realistic_v16_L1v1-v3/MINIAODSIM#eb5a0cbd-6c43-492c-ac21-4318775aee3b',\n", - " 'TM_IGNORE_LOCALITY': 'F',\n", - " 'GlobalJobId': 'crab3@vocms0155.cern.ch#98629759.0#1692099393',\n", - " 'CommittedCoreHr': '0.1213888888888889',\n", - " 'CommittedWallClockHr': '0.1213888888888889',\n", - " 'CRAB_Type': 'Analysis'},\n", - " {'RecordTime': 1692121300000,\n", - " 'CMSPrimaryDataTier': 'MINIAODSIM',\n", - " 'WallClockHr': 5.698333333333333,\n", - " 'CoreHr': 5.698333333333333,\n", - " 'CpuTimeHr': 5.501388888888889,\n", - " 'ExitCode': 0,\n", - " 'CRAB_DataBlock': '/QCD_HT50to100_TuneCP5_13TeV-madgraphMLM-pythia8/RunIISummer20UL17MiniAODv2-106X_mc2017_realistic_v9-v2/MINIAODSIM#4c355926-ea17-4285-bd2b-2c7692c48a87',\n", - " 'TM_IGNORE_LOCALITY': 'F',\n", - " 'GlobalJobId': 'crab3@vocms0107.cern.ch#96720630.0#1691795011',\n", - " 'CommittedCoreHr': '5.698333333333333',\n", - " 'CommittedWallClockHr': '5.698333333333333',\n", - " 'CRAB_Type': 'Analysis'},\n", - " {'RecordTime': 1692121556000,\n", - " 'CMSPrimaryDataTier': 'MINIAODSIM',\n", - " 'WallClockHr': 5.769722222222223,\n", - " 'CoreHr': 5.769722222222223,\n", - " 'CpuTimeHr': 5.543055555555555,\n", - " 'ExitCode': 0,\n", - " 'CRAB_DataBlock': '/QCD_HT50to100_TuneCP5_13TeV-madgraphMLM-pythia8/RunIISummer20UL17MiniAODv2-106X_mc2017_realistic_v9-v2/MINIAODSIM#4c355926-ea17-4285-bd2b-2c7692c48a87',\n", - " 'TM_IGNORE_LOCALITY': 'F',\n", - " 'GlobalJobId': 'crab3@vocms0107.cern.ch#96720628.0#1691795011',\n", - " 'CommittedCoreHr': '5.769722222222223',\n", - " 'CommittedWallClockHr': '5.769722222222223',\n", - " 'CRAB_Type': 'Analysis'},\n", - " {'RecordTime': 1692123756000,\n", - " 'CMSPrimaryDataTier': 'MINIAODSIM',\n", - " 'WallClockHr': 3.2091666666666665,\n", - " 'CoreHr': 3.2091666666666665,\n", - " 'CpuTimeHr': 3.1125,\n", - " 'ExitCode': 0,\n", - " 'CRAB_DataBlock': '/QCD_HT50to100_TuneCP5_13TeV-madgraphMLM-pythia8/RunIISummer20UL17MiniAODv2-106X_mc2017_realistic_v9-v2/MINIAODSIM#4c355926-ea17-4285-bd2b-2c7692c48a87',\n", - " 
'TM_IGNORE_LOCALITY': 'F',\n", - " 'GlobalJobId': 'crab3@vocms0107.cern.ch#96720658.0#1691795012',\n", - " 'CommittedCoreHr': '3.2091666666666665',\n", - " 'CommittedWallClockHr': '3.2091666666666665',\n", - " 'CRAB_Type': 'Analysis'}]" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "docs[:5]" + "# repartition rdd to make each partition small enough to load back to python kernel, serialize to dict, and send to os.\n", + "# for 12M rows, number of from 27 days of data is 51, around 250k per partition.\n", + "# try reducing partition to 20 once but make python kernel out-of-memory. \n", + "# so, try to keep it around 200k per partition instead.\n", + "partition_num = tmpdf.count() // 200000\n", + "tmpdf = tmpdf.repartition(partition_num, 'RecordTime')\n", + "total_part = tmpdf.rdd.getNumPartitions()\n", + "\n", + "print(f\"Number of partition: {total_part}\")" ] }, { "cell_type": "code", - "execution_count": 18, - "id": "bcdfb65c", - "metadata": {}, + "execution_count": null, + "id": "3e1f7a3f", + "metadata": { + "scrolled": false + }, "outputs": [], "source": [ - "import osearch" + "# send to os, serialize df one rdd partition at a time\n", + "part = 0\n", + "for docs in tmpdf.rdd.mapPartitions(lambda p: [[x.asDict() for x in p]]).toLocalIterator():\n", + " part += 1\n", + " print(f\"Partition: {part}/{total_part}, Length of partition: {len(docs)}\")\n", + " send_os_parallel(docs, index_name, schema, secretpath, yesterday_epoch, 20000) # batch_size is just arbitrary number" ] }, { "cell_type": "code", - "execution_count": 19, - "id": "4666acef", + "execution_count": null, + "id": "52b2fc9f", "metadata": {}, "outputs": [], "source": [ - "def get_index_schema():\n", - " return {\n", - " \"settings\": {\"index\": {\"number_of_shards\": \"1\", \"number_of_replicas\": \"1\"}},\n", - " \"mappings\": {\n", - " \"properties\": {\n", - " \"RecordTime\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n", - " \"CMSPrimaryDataTier\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", - " \"GlobalJobId\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", - " \"WallClockHr\": {\"type\": \"long\"},\n", - " \"CoreHr\": {\"type\": \"long\"},\n", - " \"CpuTimeHr\": {\"type\": \"long\"},\n", - " \"ExitCode\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", - " \"TM_IGNORE_LOCALITY\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", - " \"CRAB_Type\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", - " \"CRAB_DataBlock\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", - " \"CommittedCoreHr\": {\"type\": \"long\"}, \n", - " \"CommittedWallClockHr\": {\"type\": \"long\"},\n", - " }\n", - " }\n", - " }" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "d6e4107b", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/eos/user/e/eatthaph/.local/lib/python3.9/site-packages/opensearchpy/connection/http_urllib3.py:199: UserWarning: Connecting to https://es-cms1.cern.ch:443 using SSL with verify_certs=False is insecure.\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. 
See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n" - ] - } - ], - "source": [ - "_index_template = 'crab-condor-ekong'\n", - "client = osearch.get_es_client(\"es-cms1.cern.ch/es\", 'secret_opensearch.txt', get_index_schema())\n", - "# index_mod=\"\": 'test-foo', index_mod=\"Y\": 'test-foo-YYYY', index_mod=\"M\": 'test-foo-YYYY-MM', index_mod=\"D\": 'test-foo-YYYY-MM-DD',\n", - "idx = client.get_or_create_index(timestamp=time.time(), index_template=_index_template, index_mod=\"M\")\n", - "client.send(idx, docs, metadata=None, batch_size=10000, drop_nulls=False)" + "print(\"Done!\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "d7274886", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1b4484a3", + "id": "1dc69a5c", "metadata": {}, "outputs": [], "source": [] @@ -728,14 +478,13 @@ "list_of_options": [ { "name": "spark.jars.packages", - "value": "org.apache.spark:spark-avro_2.12:3.3.1" + "value": "org.apache.spark:spark-avro_2.12:3.5.0" + }, + { + "name": "spark.executor.instances", + "value": "20" } ] - }, - "vscode": { - "interpreter": { - "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" - } } }, "nbformat": 4, diff --git a/src/script/Monitor/crab-spark/notebooks/crab_data.ipynb b/src/script/Monitor/crab-spark/notebooks/crab_data.ipynb deleted file mode 100644 index be60f3eb84..0000000000 --- a/src/script/Monitor/crab-spark/notebooks/crab_data.ipynb +++ /dev/null @@ -1,832 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "66b56403", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "795d491e", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - "

SparkSession - in-memory

\n", - " \n", - "
\n", - "

SparkContext

\n", - "\n", - "

Spark UI

\n", - "\n", - "
\n", - "
Version
\n", - "
v3.3.2
\n", - "
Master
\n", - "
yarn
\n", - "
AppName
\n", - "
pyspark_shell_swan
\n", - "
\n", - "
\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "31b02b1c", - "metadata": {}, - "outputs": [], - "source": [ - "# !hdfs -h" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "7a7ad1c3", - "metadata": {}, - "outputs": [], - "source": [ - "# !hdfs dfs -ls /cms/users/eatthaph" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "8a170ced", - "metadata": {}, - "outputs": [], - "source": [ - "# !hdfs dfs -ls /cms/users/eatthaph/" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "17520cda", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "23/07/25 16:06:21 WARN ipc.Client: Exception encountered while connecting to the server \n", - "org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.ipc.StandbyException): Operation category READ is not supported in state standby. Visit https://s.apache.org/sbnn-error\n", - "\tat org.apache.hadoop.security.SaslRpcClient.saslConnect(SaslRpcClient.java:376)\n", - "\tat org.apache.hadoop.ipc.Client$Connection.setupSaslConnection(Client.java:622)\n", - "\tat org.apache.hadoop.ipc.Client$Connection.access$2300(Client.java:413)\n", - "\tat org.apache.hadoop.ipc.Client$Connection$2.run(Client.java:822)\n", - "\tat org.apache.hadoop.ipc.Client$Connection$2.run(Client.java:818)\n", - "\tat java.base/java.security.AccessController.doPrivileged(Native Method)\n", - "\tat java.base/javax.security.auth.Subject.doAs(Subject.java:423)\n", - "\tat org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1845)\n", - "\tat org.apache.hadoop.ipc.Client$Connection.setupIOstreams(Client.java:818)\n", - "\tat org.apache.hadoop.ipc.Client$Connection.access$3800(Client.java:413)\n", - "\tat org.apache.hadoop.ipc.Client.getConnection(Client.java:1636)\n", - "\tat org.apache.hadoop.ipc.Client.call(Client.java:1452)\n", - "\tat org.apache.hadoop.ipc.Client.call(Client.java:1405)\n", - "\tat org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:234)\n", - "\tat org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:119)\n", - "\tat com.sun.proxy.$Proxy12.getFileInfo(Unknown Source)\n", - "\tat org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.getFileInfo(ClientNamenodeProtocolTranslatorPB.java:964)\n", - "\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n", - "\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n", - "\tat java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n", - "\tat java.base/java.lang.reflect.Method.invoke(Method.java:566)\n", - "\tat org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:422)\n", - "\tat org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeMethod(RetryInvocationHandler.java:165)\n", - "\tat org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invoke(RetryInvocationHandler.java:157)\n", - "\tat org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeOnce(RetryInvocationHandler.java:95)\n", - "\tat org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:359)\n", - "\tat com.sun.proxy.$Proxy13.getFileInfo(Unknown Source)\n", - "\tat 
org.apache.hadoop.hdfs.DFSClient.getFileInfo(DFSClient.java:1731)\n", - "\tat org.apache.hadoop.hdfs.DistributedFileSystem$29.doCall(DistributedFileSystem.java:1725)\n", - "\tat org.apache.hadoop.hdfs.DistributedFileSystem$29.doCall(DistributedFileSystem.java:1722)\n", - "\tat org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)\n", - "\tat org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:1737)\n", - "\tat org.apache.hadoop.fs.Globber.getFileStatus(Globber.java:115)\n", - "\tat org.apache.hadoop.fs.Globber.doGlob(Globber.java:349)\n", - "\tat org.apache.hadoop.fs.Globber.glob(Globber.java:202)\n", - "\tat org.apache.hadoop.fs.FileSystem.globStatus(FileSystem.java:2093)\n", - "\tat org.apache.hadoop.fs.shell.PathData.expandAsGlob(PathData.java:353)\n", - "\tat org.apache.hadoop.fs.shell.Command.expandArgument(Command.java:250)\n", - "\tat org.apache.hadoop.fs.shell.Command.expandArguments(Command.java:233)\n", - "\tat org.apache.hadoop.fs.shell.FsCommand.processRawArguments(FsCommand.java:104)\n", - "\tat org.apache.hadoop.fs.shell.Command.run(Command.java:177)\n", - "\tat org.apache.hadoop.fs.FsShell.run(FsShell.java:327)\n", - "\tat org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:76)\n", - "\tat org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:90)\n", - "\tat org.apache.hadoop.fs.FsShell.main(FsShell.java:390)\n", - "Found 9 items\n", - "-rw-r--r-x+ 3 cmssqoop c3 0 2023-07-19 02:14 /project/awg/cms/crab/tasks/2023-07-19/_SUCCESS\n", - "-rw-r--r-x+ 3 cmssqoop c3 85991835 2023-07-19 02:01 /project/awg/cms/crab/tasks/2023-07-19/part-m-00000.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 837565156 2023-07-19 02:14 /project/awg/cms/crab/tasks/2023-07-19/part-m-00001.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 605874324 2023-07-19 02:10 /project/awg/cms/crab/tasks/2023-07-19/part-m-00002.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 602365393 2023-07-19 02:09 /project/awg/cms/crab/tasks/2023-07-19/part-m-00003.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 761072727 2023-07-19 02:13 /project/awg/cms/crab/tasks/2023-07-19/part-m-00004.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 462585036 2023-07-19 02:07 /project/awg/cms/crab/tasks/2023-07-19/part-m-00005.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 394767237 2023-07-19 02:06 /project/awg/cms/crab/tasks/2023-07-19/part-m-00006.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 358041401 2023-07-19 02:04 /project/awg/cms/crab/tasks/2023-07-19/part-m-00007.avro\n" - ] - } - ], - "source": [ - "!hdfs dfs -ls /project/awg/cms/crab/tasks/2023-07-19" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "2a7b2463", - "metadata": {}, - "outputs": [], - "source": [ - "# import pickle\n", - "from datetime import datetime, timedelta\n", - "\n", - "# import click\n", - "import os\n", - "import pandas as pd\n", - "# import pprint\n", - "import time\n", - "# from dateutil.relativedelta import relativedelta\n", - "from pyspark import SparkContext, StorageLevel\n", - "from pyspark.sql import SparkSession\n", - "from pyspark.sql.functions import (\n", - " col, collect_list, concat_ws, greatest, lit, lower, when, unix_timestamp, to_timestamp,\n", - " avg as _avg,\n", - " count as _count,\n", - " hex as _hex,\n", - " max as _max,\n", - " min as _min,\n", - " round as _round,\n", - " sum as _sum,\n", - ")\n", - "\n", - "from pyspark.sql.types import (\n", - " LongType,\n", - ")\n", - "\n", - "# import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import math\n", - "import json\n", - "#from CMSSpark.src.python.CMSSpark 
import schemas as cms_schemas" - ] - }, - { - "cell_type": "markdown", - "id": "f2904198", - "metadata": {}, - "source": [ - "## load dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "aa0d181a", - "metadata": {}, - "outputs": [], - "source": [ - "# end_date = str(datetime.now())[:10]\n", - "# start_date = str(datetime.now()-timedelta(days=1))[:10]\n", - "\n", - "start_date = '2023-07-20'\n", - "end_date = '2023-07-25'\n", - "\n", - "wa_date = end_date\n", - "HDFS_CRAB_part = f'/project/awg/cms/crab/tasks/{wa_date}/'\n", - "# HDFS_CRAB_part = f'/project/awg/cms/crab/{wa_date}/tasks/'" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "532ec9ac", - "metadata": {}, - "outputs": [], - "source": [ - "crab_part = spark.read.format('avro').load(HDFS_CRAB_part)" - ] - }, - { - "cell_type": "markdown", - "id": "3ad81af6", - "metadata": {}, - "source": [ - "## Query" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "41cf761f", - "metadata": {}, - "outputs": [], - "source": [ - "df = crab_part.select(\"TM_TASKNAME\",\"TM_START_TIME\",\"TM_TASK_STATUS\",\"TM_SPLIT_ALGO\",\"TM_USERNAME\",\"TM_USER_ROLE\",\"TM_JOB_TYPE\",\"TM_IGNORE_LOCALITY\",\"TM_SCRIPTEXE\",\"TM_USER_CONFIG\")\n", - "df.createOrReplaceTempView(\"crab_algo\")\n", - "# df.show(10)" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "e41c5fc6", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "6147" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "query = f\"\"\"\\\n", - "SELECT *\n", - "FROM crab_algo \n", - "WHERE 1=1\n", - "AND TM_START_TIME >= unix_timestamp(\"{start_date} 00:00:00\", \"yyyy-MM-dd HH:mm:ss\")*1000 \n", - "AND TM_START_TIME < unix_timestamp(\"{end_date} 00:00:00\", \"yyyy-MM-dd HH:mm:ss\")*1000 \n", - "\"\"\"\n", - "\n", - "tmpdf = spark.sql(query)\n", - "tmpdf.count()" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "25033524", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "root\n", - " |-- TM_TASKNAME: string (nullable = true)\n", - " |-- TM_START_TIME: long (nullable = true)\n", - " |-- TM_TASK_STATUS: string (nullable = true)\n", - " |-- TM_SPLIT_ALGO: string (nullable = true)\n", - " |-- TM_USERNAME: string (nullable = true)\n", - " |-- TM_USER_ROLE: string (nullable = true)\n", - " |-- TM_JOB_TYPE: string (nullable = true)\n", - " |-- TM_IGNORE_LOCALITY: string (nullable = true)\n", - " |-- TM_SCRIPTEXE: string (nullable = true)\n", - " |-- TM_USER_CONFIG: string (nullable = true)\n", - "\n" - ] - } - ], - "source": [ - "tmpdf.printSchema()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "ff188450", - "metadata": {}, - "outputs": [], - "source": [ - "docs = tmpdf.toPandas().to_dict('records')" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "fad5ca52", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "6147" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(docs)" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "c454d0c4", - "metadata": {}, - "outputs": [], - "source": [ - "# Extract 'REQUIRE_ACCELERATOR' from 'TM_USER_CONFIG'\n", - "\n", - "for i in range(len(docs)):\n", - " if docs[i]['TM_USER_CONFIG'] is not None:\n", - " data = json.loads(docs[i]['TM_USER_CONFIG'])\n", - " if \"requireaccelerator\" in 
data:\n", - " docs[i]['REQUIRE_ACCELERATOR'] = data[\"requireaccelerator\"]\n", - " else:\n", - " docs[i]['REQUIRE_ACCELERATOR'] = None\n", - " else:\n", - " docs[i]['REQUIRE_ACCELERATOR'] = None" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "d2e914f6", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'TM_TASKNAME': '160406_111833:sciaba_HC-163-AnySite-26725-20160406125703-T1_UK_RAL',\n", - " 'TM_START_TIME': 1459934313843,\n", - " 'TM_TASK_STATUS': 'SUBMITTED',\n", - " 'TM_SPLIT_ALGO': 'FileBased',\n", - " 'TM_USERNAME': 'sciaba',\n", - " 'TM_USER_ROLE': 'production',\n", - " 'TM_JOB_TYPE': 'Analysis',\n", - " 'TM_IGNORE_LOCALITY': 'T',\n", - " 'TM_SCRIPTEXE': None,\n", - " 'TM_USER_CONFIG': None,\n", - " 'REQUIRE_ACCELERATOR': None},\n", - " {'TM_TASKNAME': '160406_111914:sciaba_HC-148-AnySite-26727-20160406131903-T2_UK_SGrid_Bristol',\n", - " 'TM_START_TIME': 1459934354531,\n", - " 'TM_TASK_STATUS': 'SUBMITTED',\n", - " 'TM_SPLIT_ALGO': 'FileBased',\n", - " 'TM_USERNAME': 'sciaba',\n", - " 'TM_USER_ROLE': 'production',\n", - " 'TM_JOB_TYPE': 'Analysis',\n", - " 'TM_IGNORE_LOCALITY': 'T',\n", - " 'TM_SCRIPTEXE': None,\n", - " 'TM_USER_CONFIG': None,\n", - " 'REQUIRE_ACCELERATOR': None},\n", - " {'TM_TASKNAME': '160319_180958:sciaba_HC-138-AnySite-26052-20160319011302-T2_RU_IHEP',\n", - " 'TM_START_TIME': 1458407398241,\n", - " 'TM_TASK_STATUS': 'SUBMITTED',\n", - " 'TM_SPLIT_ALGO': 'FileBased',\n", - " 'TM_USERNAME': 'sciaba',\n", - " 'TM_USER_ROLE': 'production',\n", - " 'TM_JOB_TYPE': 'Analysis',\n", - " 'TM_IGNORE_LOCALITY': 'T',\n", - " 'TM_SCRIPTEXE': None,\n", - " 'TM_USER_CONFIG': None,\n", - " 'REQUIRE_ACCELERATOR': None}]" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "docs[:3]" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "cf696d7f", - "metadata": {}, - "outputs": [], - "source": [ - "import osearch" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "e47490bd", - "metadata": {}, - "outputs": [], - "source": [ - "def get_index_schema():\n", - " return {\n", - " \"settings\": {\"index\": {\"number_of_shards\": \"1\", \"number_of_replicas\": \"1\"}},\n", - " \"mappings\": {\n", - " \"properties\": {\n", - " \"TM_TASKNAME\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", - " \"TM_START_TIME\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n", - " 'TM_TASK_STATUS': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", - " \"TM_SPLIT_ALGO\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", - " \"TM_USERNAME\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", - " \"TM_USER_ROLE\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", - " \"TM_JOB_TYPE\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", - " \"TM_IGNORE_LOCALITY\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", - " \"TM_SCRIPTEXE\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", - " \"REQUIRE_ACCELERATOR\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", - " }\n", - " }\n", - " }\n", - "\n", - "# def get_index_schema():\n", - "# return {\n", - "# \"settings\": {\"index\": {\"number_of_shards\": \"1\", \"number_of_replicas\": \"1\"}},\n", - "# \"mappings\": {\n", - "# \"properties\": {\n", - "# \"TM_TASKNAME\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", - "# \"TM_SPLIT_ALGO\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", - "# \"TM_START_TIME\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n", - "# 
\"TM_END_INJECTION\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n", - "# }\n", - "# }\n", - "# }" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "6bcfc801", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. 
See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. 
See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. 
See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. 
See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. 
See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. 
See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/plain": [ - "0" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "_index_template = 'crab-data-ekong'\n", - "client = osearch.get_es_client(\"es-cms1.cern.ch/es\", 'secret_opensearch.txt', get_index_schema())\n", - "# index_mod=\"\": 'test-foo', index_mod=\"Y\": 'test-foo-YYYY', index_mod=\"M\": 'test-foo-YYYY-MM', index_mod=\"D\": 'test-foo-YYYY-MM-DD',\n", - "idx = client.get_or_create_index(timestamp=time.time(), index_template=_index_template, index_mod=\"M\")\n", - "client.send(idx, docs, metadata=None, batch_size=10000, drop_nulls=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bcac057e", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a5f62789", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "@webio": { - "lastCommId": null, - "lastKernelId": null - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.12" - }, - "sparkconnect": { - "bundled_options": [], - "list_of_options": [ - { - "name": "spark.jars.packages", - "value": "org.apache.spark:spark-avro_2.12:3.3.1" - } - ] - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/src/script/Monitor/crab-spark/notebooks/crab_rucio_rules_poc.ipynb b/src/script/Monitor/crab-spark/notebooks/crab_rucio_rules_poc.ipynb new file mode 100644 index 0000000000..8aa5ac31a6 --- /dev/null +++ b/src/script/Monitor/crab-spark/notebooks/crab_rucio_rules_poc.ipynb @@ -0,0 +1,340 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "5e9af689", + "metadata": {}, + "outputs": [], + "source": [ + "from datetime import datetime, timedelta, timezone\n", + "import os\n", + "import time\n", + "import pandas as pd\n", + "\n", + "from pyspark import SparkContext, StorageLevel\n", + "from pyspark.sql import SparkSession\n", + "from pyspark.sql.functions import (\n", + " current_user,\n", + " col, collect_list, concat_ws, greatest, lit, lower, when,\n", + " avg as _avg,\n", + " count as _count,\n", + " hex as _hex,\n", + " max as _max,\n", + " min as _min,\n", + " round as _round,\n", + " sum as _sum,\n", + ")\n", + "from pyspark.sql.types import (\n", + " StructType,\n", + " LongType,\n", + " StringType,\n", + " StructField,\n", + " DoubleType,\n", + " IntegerType,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "91309756", + "metadata": {}, + "outputs": [], + "source": [ + "# try to import libs from current directory, fallback to $PWD/../workdir if not found\n", + "try:\n", + " from crabspark_utils import get_candidate_files, send_os, send_os_parallel\n", + "except ModuleNotFoundError:\n", + " import sys\n", + " sys.path.insert(0, f'{os.getcwd()}/../workdir')\n", + " from crabspark_utils import get_candidate_files, send_os, send_os_parallel\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22946659", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "spark = SparkSession\\\n", + " .builder\\\n", + " .appName('crab-taskdb')\\\n", + " 
.getOrCreate()\n", + "spark" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e9013878", + "metadata": {}, + "outputs": [], + "source": [ + "# clear any cache left, for working with notebook\n", + "# it safe to run everytime cronjob start\n", + "spark.catalog.clearCache()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31c19eb0", + "metadata": {}, + "outputs": [], + "source": [ + "# secret path, also check if file exists\n", + "secretpath = os.environ.get('OPENSEARCH_SECRET_PATH', f'{os.getcwd()}/../workdir/secret_opensearch.txt')\n", + "if not os.path.isfile(secretpath): \n", + " raise Exception(f'OS secrets file {secretpath} does not exists')\n", + "# if PROD, index prefix will be `crab-*`, otherwise `crab-test-*`\n", + "PROD = os.environ.get('PROD', 'false').lower() in ('true', '1', 't')\n", + "# FROM_DATE, in strptime(\"%Y-%m-%d\")\n", + "START = os.environ.get('START_DATE', None) \n", + "END = os.environ.get('END_DATE', None)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e843eb6d", + "metadata": {}, + "outputs": [], + "source": [ + "# For run playbook manually, set start/end date here\n", + "START_DATE = \"2020-01-01\"\n", + "END_DATE = \"2024-10-01\"\n", + "# if cronjob, replace constant with value from env\n", + "if START and END:\n", + " START_DATE = START\n", + " END_DATE = END" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b17ed53f", + "metadata": {}, + "outputs": [], + "source": [ + "# index name\n", + "index_name = 'taskdb'\n", + "# use prod index pattern if this execution is for production\n", + "if PROD:\n", + " index_name = f'crab-prod-{index_name}'\n", + "else:\n", + " index_name = f'crab-test-{index_name}'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8417ab47", + "metadata": {}, + "outputs": [], + "source": [ + "# datetime object\n", + "start_datetime = datetime.strptime(START_DATE, \"%Y-%m-%d\").replace(tzinfo=timezone.utc)\n", + "end_datetime = datetime.strptime(END_DATE, \"%Y-%m-%d\").replace(tzinfo=timezone.utc)\n", + "# sanity check\n", + "if end_datetime < start_datetime: \n", + " raise Exception(f\"end date ({END_DATE}) is less than start date ({START_DATE})\")\n", + "start_epochmilis = int(start_datetime.timestamp()) * 1000\n", + "end_epochmilis = int(end_datetime.timestamp()) * 1000\n", + "yesterday_epoch = int((end_datetime-timedelta(days=1)).timestamp())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9404c437", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# debug\n", + "print(START_DATE, \n", + " END_DATE, \n", + " index_name,\n", + " sep='\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e85c2f0", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# This code block and following block is copied from Panos's script.\n", + "# https://gitlab.cern.ch/cmsdmops/cmsdmops/-/blob/8da699db49097d7a58440e6058f022c3f93992e2/monitoring/kubernetes/src/rucio_activity_account_usage.py\n", + "# see more in https://github.com/dmwm/CRABServer/issues/7798#issuecomment-2389265249\n", + "def get_df_rses(spark):\n", + " \"\"\"Get Spark dataframe of RSES\n", + " \"\"\"\n", + " hdfs_rses_path = '/project/awg/cms/rucio/{}/rses/part*.avro'.format(datetime.today().strftime('%Y-%m-%d'))\n", + " df_rses = spark.read.format(\"avro\").load(hdfs_rses_path) \\\n", + " .filter(col('DELETED_AT').isNull()) \\\n", + " .withColumn('rse_id', lower(_hex(col('ID')))) 
\\\n", + " .withColumn('rse_tier', _split(col('RSE'), '_').getItem(0)) \\\n", + " .withColumn('rse_country', _split(col('RSE'), '_').getItem(1)) \\\n", + " .withColumn('rse_kind',\n", + " when((col(\"rse\").endswith('Temp') | col(\"rse\").endswith('temp') | col(\"rse\").endswith('TEMP')),\n", + " 'temp')\n", + " .when((col(\"rse\").endswith('Test') | col(\"rse\").endswith('test') | col(\"rse\").endswith('TEST')),\n", + " 'test')\n", + " .otherwise('prod')\n", + " ) \\\n", + " .select(['rse_id', 'RSE', 'RSE_TYPE', 'rse_tier', 'rse_country', 'rse_kind'])\n", + " return df_rses\n", + "def get_df_locks(spark):\n", + " \"\"\"Get Spark dataframe of Locks\n", + " \"\"\"\n", + " today = datetime.today().strftime('%Y-%m-%d')\n", + " locks_path = f'/project/awg/cms/rucio/{today}/locks/part*.avro'\n", + " locks = spark.read.format('avro').load(locks_path) \\\n", + " .filter(col('SCOPE') == 'cms') \\\n", + " .filter(col('STATE').isin(['O', 'R'])) \\\n", + " .withColumn('rse_id', lower(_hex(col('RSE_ID')))) \\\n", + " .withColumnRenamed('NAME', 'f_name') \\\n", + " .withColumnRenamed('ACCOUNT', 'account_name') \\\n", + " .withColumnRenamed('BYTES', 'f_size') \\\n", + " .withColumn('r_id', lower(_hex(col('RULE_ID')))) \\\n", + " .select(['rse_id', 'f_name', 'f_size', 'r_id', 'account_name'])\n", + " return locks\n", + "def get_df_accounts(spark):\n", + " \"\"\"Get Spark dataframe of Accounts\n", + " \"\"\"\n", + " today = datetime.today().strftime('%Y-%m-%d')\n", + " hdfs_rucio_accounts = f'/project/awg/cms/rucio/{today}/accounts/part*.avro'\n", + " df_accounts = spark.read.format(\"avro\").load(hdfs_rucio_accounts) \\\n", + " .filter(col('DELETED_AT').isNull()) \\\n", + " .withColumnRenamed('ACCOUNT', 'account_name') \\\n", + " .withColumnRenamed('ACCOUNT_TYPE', 'account_type') \\\n", + " .select(['account_name', 'account_type'])\n", + " return df_accounts\n", + "def get_df_rules(spark):\n", + " \"\"\"Get Spark dataframe of rules\n", + " \"\"\"\n", + " hdfs_rules_path = '/project/awg/cms/rucio/{}/rules/part*.avro'.format(datetime.today().strftime('%Y-%m-%d'))\n", + " return spark.read.format('avro').load(hdfs_rules_path) \\\n", + " .filter(col('SCOPE') == 'cms') \\\n", + " .withColumnRenamed('name', 'r_name') \\\n", + " .withColumn('r_id', lower(_hex(col('ID')))) \\\n", + " .withColumn('s_id', lower(_hex(col('SUBSCRIPTION_ID')))) \\\n", + " .withColumnRenamed('ACTIVITY', 'activity') \\\n", + " .withColumnRenamed('STATE', 'rule_state') \\\n", + " .withColumnRenamed('RSE_EXPRESSION', 'rse_expression') \\\n", + " .select(['r_name','r_id', 's_id', 'activity', 'rule_state', 'rse_expression']) \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e271b1c8", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "# add data_tier field\n", + "df_rses = get_df_rses(spark)\n", + "df_locks = get_df_locks(spark)\n", + "df_accounts = get_df_accounts(spark)\n", + "df_rules = get_df_rules(spark)\n", + "tb_denominator = 10 ** 12\n", + "locks = df_locks.join(df_rses, ['rse_id'], how='left') \\\n", + " .filter(col('rse_kind') == 'prod') \\\n", + " .select(['f_name', 'f_size', 'RSE', 'rse_type', 'account_name', 'r_id']) \n", + "\n", + "locks_with_activity = (\n", + " locks.join(df_rules, ['r_id'], how='leftouter')\n", + " .select(['f_name', 'account_name', 'RSE', 'rse_type', 'f_size', 'activity', 'r_name'])\n", + " .withColumn('data_tier', regexp_extract('r_name', r'^\\/([\\w-]+)\\/([\\w-]+)\\/([\\w-]+)(#[\\w-]+)?', 3))\n", + " .select(['f_name', 'account_name', 'RSE', 
'rse_type', 'f_size', 'activity', 'data_tier'])\n",
+    ")\n",
+    "\n",
+    "timestamp = int(time.time())\n",
+    "\n",
+    "# A file locked by the user for two activities is accounted to both activities\n",
+    "# A file locked by two users for the same activity is accounted to both users\n",
+    "user_aggregated = locks_with_activity \\\n",
+    "    .select(['f_name', 'f_size', 'RSE', 'rse_type', 'account_name', 'activity', 'data_tier']) \\\n",
+    "    .distinct() \\\n",
+    "    .groupby(['RSE', 'rse_type', 'account_name', 'activity', 'data_tier']) \\\n",
+    "    .agg(_round(_sum(col('f_size')) / tb_denominator, 5).alias('total_locked')) \\\n",
+    "    .join(df_accounts, ['account_name'], how='left') \\\n",
+    "    .withColumnRenamed('RSE', 'rse_name') \\\n",
+    "    .withColumn('timestamp', lit(timestamp)) \\\n",
+    "    .select(['total_locked', 'rse_name', 'rse_type', 'account_name', 'account_type', 'activity', 'data_tier', 'timestamp']) \\\n",
+    "    .cache()\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "15c3ff28",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "user_aggregated.show(10, False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f7e98534",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "user_aggregated.count()"
+   ]
+  }
+ ],
+ "metadata": {
+  "@webio": {
+   "lastCommId": null,
+   "lastKernelId": null
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.12"
+  },
+  "sparkconnect": {
+   "bundled_options": [],
+   "list_of_options": [
+    {
+     "name": "spark.jars.packages",
+     "value": "org.apache.spark:spark-avro_2.12:3.5.0"
+    },
+    {
+     "name": "spark.executor.instances",
+     "value": "20"
+    }
+   ]
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/src/script/Monitor/crab-spark/notebooks/crab_tape_recall_history.ipynb b/src/script/Monitor/crab-spark/notebooks/crab_tape_recall_history.ipynb
new file mode 100644
index 0000000000..3ed0a6e890
--- /dev/null
+++ b/src/script/Monitor/crab-spark/notebooks/crab_tape_recall_history.ipynb
@@ -0,0 +1,572 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "2ecefbb5",
+   "metadata": {},
+   "source": [
+    "# CRAB Spark tape recall history\n",
+    "\n",
+    "This job queries the `rules_history` table of cmsrucio to answer these questions:\n",
+    "- How long do tasks stay in “taperecall”?\n",
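+    "\n",
+    "Roughly speaking (this mirrors the `calc_days_t` step in the query further down), the time a rule spends in tape recall is estimated as `ceil((UPDATED_AT - CREATED_AT) / 86400000)` days for rules that reached state `O`, and `ceil((EXPIRES_AT - CREATED_AT) / 86400000)` days otherwise, using the epoch-millisecond columns of the rules_history dump."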
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e9af689", + "metadata": {}, + "outputs": [], + "source": [ + "from datetime import datetime, timedelta, timezone\n", + "import os\n", + "import time\n", + "import pandas as pd\n", + "\n", + "from pyspark import SparkContext, StorageLevel\n", + "from pyspark.sql import SparkSession\n", + "from pyspark.sql.functions import (\n", + " current_user,\n", + " col, collect_list, concat_ws, greatest, lit, lower, when,\n", + " avg as _avg,\n", + " count as _count,\n", + " hex as _hex,\n", + " max as _max,\n", + " min as _min,\n", + " round as _round,\n", + " sum as _sum,\n", + ")\n", + "from pyspark.sql.types import (\n", + " StructType,\n", + " LongType,\n", + " StringType,\n", + " StructField,\n", + " DoubleType,\n", + " IntegerType,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22946659", + "metadata": {}, + "outputs": [], + "source": [ + "spark = SparkSession\\\n", + " .builder\\\n", + " .appName('tape-recall-history')\\\n", + " .getOrCreate()\n", + "spark" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "014b13c8", + "metadata": {}, + "outputs": [], + "source": [ + "spark.catalog.clearCache()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31c19eb0", + "metadata": {}, + "outputs": [], + "source": [ + "# arguments\n", + "# secret path, also check if file exists\n", + "secretpath = os.environ.get('OPENSEARCH_SECRET_PATH', f'{os.getcwd()}/../workdir/secret_opensearch.txt')\n", + "if not os.path.isfile(secretpath): \n", + " raise Exception(f'OS secrets file {secretpath} does not exists')\n", + "# if PROD, index prefix will be `crab-*`, otherwise `crab-test-*`\n", + "PROD = os.environ.get('PROD', 'false').lower() in ('true', '1', 't')\n", + "# FROM_DATE, in strptime(\"%Y-%m-%d\")\n", + "START = os.environ.get('START_DATE', None) \n", + "END = os.environ.get('END_DATE', None)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e843eb6d", + "metadata": {}, + "outputs": [], + "source": [ + "# try to import libs from current directory, fallback to $PWD/../workdir if not found\n", + "try:\n", + " from crabspark_utils import get_candidate_files, send_os, send_os_parallel\n", + "except ModuleNotFoundError:\n", + " import sys\n", + " sys.path.insert(0, f'{os.getcwd()}/../workdir')\n", + " from crabspark_utils import get_candidate_files, send_os, send_os_parallel\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c644790", + "metadata": {}, + "outputs": [], + "source": [ + "# variables for run inside notebook\n", + "START_DATE = \"2020-01-01\"\n", + "END_DATE = \"2024-10-01\"\n", + "# if cronjob, replace constant with value from env\n", + "if START and END:\n", + " START_DATE = START\n", + " END_DATE = END" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d608eab0", + "metadata": {}, + "outputs": [], + "source": [ + "# index name\n", + "index_name = 'tape-recall-history' # always put test index prefix\n", + "# use prod index pattern if this execution is for production\n", + "if PROD:\n", + " index_name = f'crab-prod-{index_name}'\n", + "else:\n", + " index_name = f'crab-test-{index_name}'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b17ed53f", + "metadata": {}, + "outputs": [], + "source": [ + "# datetime object\n", + "start_datetime = datetime.strptime(START_DATE, \"%Y-%m-%d\").replace(tzinfo=timezone.utc)\n", + "end_datetime = datetime.strptime(END_DATE, 
\"%Y-%m-%d\").replace(tzinfo=timezone.utc)\n", + "# sanity check\n", + "if end_datetime < start_datetime: \n", + " raise Exception(f\"end date ({END_DATE}) is less than start date ({START_DATE})\")\n", + "start_epochmilis = int(start_datetime.timestamp()) * 1000\n", + "end_epochmilis = int(end_datetime.timestamp()) * 1000\n", + "yesterday_epoch = int((end_datetime-timedelta(days=1)).timestamp())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9404c437", + "metadata": {}, + "outputs": [], + "source": [ + "# debug\n", + "print(START_DATE, \n", + " END_DATE, \n", + " index_name,\n", + " sep='\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e85c2f0", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Import data into spark\n", + "\n", + "HDFS_RUCIO_RULES_HISTORY = f'/project/awg/cms/rucio/{END_DATE}/rules_history/'\n", + "\n", + "print(\"===============================================\"\n", + " , \"RUCIO : Rules History\"\n", + " , \"===============================================\"\n", + " , \"File Directory:\", HDFS_RUCIO_RULES_HISTORY\n", + " , \"Work Directory:\", os.getcwd()\n", + " , \"===============================================\"\n", + " , \"===============================================\", sep='\\n')\n", + "\n", + "# we only interest in the rules where state does not change anymore.\n", + "# which means, only the rules that already expired.\n", + "rucio_rules_history = (\n", + " spark.read.format('avro').load(HDFS_RUCIO_RULES_HISTORY).withColumn('ID', lower(_hex(col('ID'))))\n", + " .select(\"ID\", \"ACCOUNT\", \"NAME\", \"STATE\", \"EXPIRES_AT\", \"UPDATED_AT\", \"CREATED_AT\")\n", + " .filter(f\"\"\"\\\n", + " 1=1\n", + " AND ACTIVITY = 'Analysis TapeRecall'\n", + " AND EXPIRES_AT >= {start_epochmilis}\n", + " AND EXPIRES_AT < {end_epochmilis}\n", + " \"\"\")\n", + " .cache()\n", + ")\n", + "rucio_rules_history.createOrReplaceTempView(\"rules_history\")\n", + "\n", + "HDFS_CRAB_part = f'/project/awg/cms/crab/tasks/{END_DATE}/'\n", + "print(\"===============================================\"\n", + " , \"CRAB Table\"\n", + " , \"===============================================\"\n", + " , \"File Directory:\", HDFS_CRAB_part\n", + " , \"Work Directory:\", os.getcwd()\n", + " , \"===============================================\"\n", + " , \"===============================================\", sep='\\n')\n", + "\n", + "# do not filter taskdb by create time (TM_START_TIME) because it is possible that rules are created 6 months ago\n", + "tasks_df = (\n", + " spark.read.format('avro').load(HDFS_CRAB_part)\n", + " .select(\"TM_TASKNAME\",\"TM_START_TIME\",\"TM_TASK_STATUS\", 'TM_TASKNAME', 'TM_START_TIME', 'TM_TASK_STATUS' , 'TM_DDM_REQID')\n", + " .cache()\n", + ")\n", + "tasks_df.createOrReplaceTempView(\"tasks\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0ad6c09", + "metadata": {}, + "outputs": [], + "source": [ + "# rucio append new row to rules_history when the content rules table change (not sure the exact condition)\n", + "# We need to get \"the latest\" row for each rules by:\n", + "# - If rule has state \"O\", select the earliest UPDATED_AT row.\n", + "# For the OK rule, we can calculate number of days using UPDATED_AT-CREATED_AT. 
\n", + "# However, there are some posiblility that rucio append new entry with newer UPDATED_AT (For exmple 37fcada73f14439b88558ef792e10276)\n", + "# - If not, select the latest UPDATED_AT row.\n", + "# This because the rules still in temporary state, and the rules will go to the end state \n", + "# (not the real state, but rules_history will not getting new row anymore) after rules is expired \n", + "# So, we can calculate number of day by EXPIRES_AT-CREATED_AT\n", + "#\n", + "# Here is the step to translate above condition to SQL (in the buttom-up manner)\n", + "# 1. count number of row where the state is 'O'.\n", + "# 2. left join the rule history by ID, so each row will have number of state O \n", + "# New table look like this:\n", + "# +--------------------------------+-----+-------------+-------+\n", + "# |ID |STATE|EXPIRES_AT |state_o|\n", + "# +--------------------------------+-----+-------------+-------+\n", + "# |6d275222b43d431abc568dd83313118f|R |1727244523000|1 |\n", + "# |875a388ca374407ea761689511078956|R |1727339056000|1 |\n", + "# |dfe4012bcb9c448f98f940f01302ae6e|R |1727234937000|0 |\n", + "# |dfe4012bcb9c448f98f940f01302ae6e|R |1725402537000|0 |\n", + "# |c6859b18a771440ab906733e2bebf78a|R |1727235038000|1 |\n", + "# \n", + "# 3. select the earliest row for \"the rule that have state O\" (where clause). this can be done by windows function, sort by UPDATED_AT ascending for each ID, then filter only row_number \"1\"\n", + "# 4. select the latest row for \"the rule that does not have state O at all\". \n", + "# This is a bit tricky but can be done by filter out the rule that have number of state O more than zero.\n", + "# which this column already availabe from left join in step 2.\n", + "# For the \"select latest row\" we do the same way as 4. but sort by UPDATED_AT descending instead.\n", + "# 5. merge result from 3. and 4 with UNION ALL.\n", + "# 6. 
Then, we will calculate number of date in the next step\n", + "#\n", + "# We are selecting the rules for each condition and join later, to avoid large broadcasthashjoin internally\n", + "# I (Wa) tried this before and it cause above issue, but I might be wrong here though.\n", + "# ```\n", + "# SELECT * FROM rhistinfo_t \n", + "# WHERE (state_o > 0) \n", + "# OR (ID NOT IN (SELECT ID FROM (SELECT * FROM rhistinfo_t WHERE state_o > 0)))\n", + "# ```\n", + "# \n", + "\n", + "query = f\"\"\"\\\n", + "WITH \n", + "count_t AS (\n", + "SELECT ID, \n", + " SUM(CASE WHEN state = 'O' THEN 1 ELSE 0 END) AS state_o\n", + "FROM rules_history\n", + "GROUP BY ID\n", + "),\n", + "rhistinfo_t AS (\n", + "SELECT rules_history.ID AS ID, \n", + " rules_history.ACCOUNT AS ACCOUNT, \n", + " rules_history.NAME AS NAME, \n", + " rules_history.STATE AS STATE, \n", + " rules_history.EXPIRES_AT AS EXPIRES_AT, \n", + " rules_history.UPDATED_AT AS UPDATED_AT, \n", + " rules_history.CREATED_AT AS CREATED_AT,\n", + " count_t.state_o AS state_o\n", + "FROM rules_history\n", + "LEFT JOIN count_t ON rules_history.ID = count_t.ID\n", + "),\n", + "tmpwindow_1 AS (\n", + "SELECT *, row_number() over(partition by ID order by UPDATED_AT) as row_num\n", + "FROM rhistinfo_t\n", + "WHERE STATE = 'O'\n", + "), \n", + "r1 AS (\n", + "SELECT * FROM tmpwindow_1\n", + "WHERE row_num = 1\n", + "),\n", + "tmpwindow_2 AS (\n", + "SELECT *, row_number() over(partition by ID order by UPDATED_AT DESC) as row_num\n", + "FROM rhistinfo_t\n", + "WHERE STATE != 'O' AND state_o = 0\n", + "),\n", + "r2 AS (\n", + "SELECT * FROM tmpwindow_2\n", + "WHERE row_num = 1\n", + "),\n", + "r_all AS (\n", + "SELECT * FROM r1\n", + "UNION ALL\n", + "SELECT * FROM r2\n", + ")\n", + "SELECT * \n", + "FROM r_all\n", + "ORDER BY ID\n", + "\"\"\"\n", + "\n", + "tmprules = spark.sql(query)\n", + "tmprules.show(10, False)\n", + "tmprules.createOrReplaceTempView(\"tmprules\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32dd41b1", + "metadata": {}, + "outputs": [], + "source": [ + "# Calculate number of days, for state O, UPDATED_AT-CREATED_AT, otherwise EXPIRES_AT-CREATED_AT\n", + "# then enrich the data with the crab taskdb table by join rule ID with TM_DDM_REQID column\n", + "# need to apply windows function again to select only the rule id with the latest crab tasks\n", + "\n", + "query = f\"\"\"\\\n", + "WITH \n", + "calc_days_t AS (\n", + "SELECT ID, ACCOUNT, NAME, STATE, EXPIRES_AT, UPDATED_AT, CREATED_AT,\n", + " CASE \n", + " WHEN STATE = 'O' THEN ceil((UPDATED_AT-CREATED_AT)/86400000) \n", + " ELSE ceil((EXPIRES_AT-CREATED_AT)/86400000)\n", + " END AS DAYS\n", + "FROM tmprules\n", + "),\n", + "join_t AS (\n", + "SELECT \n", + " calc_days_t.ID AS ID, \n", + " calc_days_t.ACCOUNT AS ACCOUNT, \n", + " calc_days_t.NAME AS NAME, \n", + " calc_days_t.STATE AS STATE, \n", + " calc_days_t.DAYS AS DAYS, \n", + " calc_days_t.EXPIRES_AT AS EXPIRES_AT, \n", + " calc_days_t.UPDATED_AT AS UPDATED_AT, \n", + " calc_days_t.CREATED_AT AS CREATED_AT, \n", + " tasks.TM_TASKNAME AS TM_TASKNAME,\n", + " IFNULL(tasks.TM_START_TIME, 0) AS TM_START_TIME, \n", + " tasks.TM_TASK_STATUS AS TM_TASK_STATUS\n", + "FROM calc_days_t\n", + "LEFT JOIN tasks ON calc_days_t.ID = tasks.TM_DDM_REQID\n", + "),\n", + "window_t AS (\n", + "SELECT ID, ACCOUNT, NAME, STATE, DAYS, EXPIRES_AT, UPDATED_AT, CREATED_AT, TM_TASKNAME, TM_START_TIME, TM_TASK_STATUS, \n", + " row_number() OVER (PARTITION BY ID ORDER BY TM_START_TIME DESC) AS row_num\n", + "FROM join_t \n", + 
"),\n", + "uniqueid_t AS (\n", + "SELECT *\n", + "FROM window_t \n", + "WHERE row_num = 1\n", + "), \n", + "finalize_t AS (\n", + "SELECT ID, ACCOUNT, NAME, STATE, DAYS, EXPIRES_AT, UPDATED_AT, CREATED_AT, TM_TASKNAME, IFNULL(TM_START_TIME, 0) as TM_START_TIME, TM_TASK_STATUS, \n", + " EXPIRES_AT AS timestamp,\n", + " 'tape_recall_history' AS type\n", + "FROM uniqueid_t \n", + ")\n", + "SELECT *\n", + "FROM finalize_t\n", + "\"\"\"\n", + "\n", + "tmpdf = spark.sql(query)\n", + "tmpdf.show(10, False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df979012", + "metadata": {}, + "outputs": [], + "source": [ + "tmpdf.count()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c33dfce3", + "metadata": {}, + "outputs": [], + "source": [ + "docs = tmpdf.toPandas().to_dict('records')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eee4a1f3", + "metadata": {}, + "outputs": [], + "source": [ + "schema = {\n", + " \"settings\": {\"index\": {\"number_of_shards\": \"1\", \"number_of_replicas\": \"1\"}},\n", + " \"mappings\": {\n", + " \"properties\": {\n", + " \"ID\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", + " \"ACCOUNT\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", + " \"NAME\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", + " \"STATE\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", + " \"DAYS\": {\"type\": \"long\"},\n", + " \"EXPIRES_AT\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n", + " \"UPDATED_AT\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n", + " \"CREATED_AT\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n", + " \"TM_TASKNAME\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", + " \"TM_START_TIME\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n", + " \"TM_TASK_STATUS\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", + " \"type\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", + " \"timestamp\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n", + " }\n", + "\n", + " }\n", + "\n", + " }\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ec824ee", + "metadata": {}, + "outputs": [], + "source": [ + "# this is simple workaround osearch bug when work in notebook because\n", + "# - it load the secret once and use forever\n", + "# - get_or_create_index() create index+schema only the first time it execute\n", + "# it is safe to run again even in cronjobs \n", + "import importlib\n", + "import osearch\n", + "importlib.reload(osearch)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6cdc83dd", + "metadata": {}, + "outputs": [], + "source": [ + "osearch.send_os(docs, index_name, schema, secretpath, yesterday_epoch)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22747a3f", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Add a single doc to es everyday to check if pipeline is running successfully.\n", + "# This is need because we did not have rule that expires everyday\n", + "# Remember to filter it out in grafana (For example `NOT ID:00000000000000000` in lucene query)\n", + "day = start_datetime\n", + "monitoring_docs = []\n", + "while day < end_datetime:\n", + " milisec = int(day.timestamp())*1000\n", + " doc = {\n", + " \"ID\": '00000000000000000',\n", + " \"ACCOUNT\": 'cmscrab',\n", + " \"NAME\": '/Pipeline/Monitoring/AOD',\n", + " \"STATE\": 'P',\n", + " \"DAYS\": -1,\n", + " \"EXPIRES_AT\": milisec,\n", + " \"UPDATED_AT\": 
milisec,\n", + " \"CREATED_AT\": milisec,\n", + " \"TM_TASKNAME\": '240000_000000:cmscrab_crab_20240000_000000',\n", + " \"TM_START_TIME\": milisec,\n", + " \"TM_TASK_STATUS\": 'PLACEHOLDER',\n", + " \"type\": 'tape_recall_history',\n", + " \"timestamp\": milisec,\n", + "\n", + " }\n", + " monitoring_docs.append(doc)\n", + " day += timedelta(days=1)\n", + "send_os(monitoring_docs, index_name, schema, secretpath, yesterday_epoch)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a24e4ff", + "metadata": {}, + "outputs": [], + "source": [ + "# Useful query to get only the rules that gave\n", + "#query = f\"\"\"\\\n", + "#repeated_ids AS (\n", + "# SELECT ID\n", + "# FROM rules_history\n", + "# GROUP BY ID\n", + "# HAVING COUNT(*) > 2\n", + "#),\n", + "#tba_t AS (\n", + "#SELECT *\n", + "#FROM rules_history\n", + "#)\n", + "#SELECT * FROM tba_t\n", + "#\"\"\"\n", + "#\n", + "#testdf = spark.sql(query)\n", + "#testdf.show(100, False)\n", + "#\n", + "# rule 37fc where latest UPDATED_AT is 43 days after the first OK state\n", + "#spark.sql(\"\"\"\\\n", + "#SELECT * FROM rules_history\n", + "#WHERE ID = '37fcada73f14439b88558ef792e10276'\n", + "#\"\"\").show(10, False)" + ] + } + ], + "metadata": { + "@webio": { + "lastCommId": null, + "lastKernelId": null + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + }, + "sparkconnect": { + "bundled_options": [], + "list_of_options": [ + { + "name": "spark.jars.packages", + "value": "org.apache.spark:spark-avro_2.12:3.5.0" + }, + { + "name": "spark.executor.instances", + "value": "20" + } + ] + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/script/Monitor/crab-spark/notebooks/crab_tape_recall_rules_history.ipynb b/src/script/Monitor/crab-spark/notebooks/crab_tape_recall_rules_history.ipynb deleted file mode 100644 index 20f441f4ce..0000000000 --- a/src/script/Monitor/crab-spark/notebooks/crab_tape_recall_rules_history.ipynb +++ /dev/null @@ -1,726 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "2fe94c82", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "9f91521a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - "

SparkSession - in-memory

\n", - " \n", - "
\n", - "

SparkContext

\n", - "\n", - "

Spark UI

\n", - "\n", - "
\n", - "
Version
\n", - "
v3.3.2
\n", - "
Master
\n", - "
yarn
\n", - "
AppName
\n", - "
pyspark_shell_swan
\n", - "
\n", - "
\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "666f70d9", - "metadata": {}, - "outputs": [], - "source": [ - "# !hdfs dfs -stat /project/awg/cms/rucio/2023-07-31/" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "bd6751a6", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "23/08/01 17:05:11 WARN ipc.Client: Exception encountered while connecting to the server \n", - "org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.ipc.StandbyException): Operation category READ is not supported in state standby. Visit https://s.apache.org/sbnn-error\n", - "\tat org.apache.hadoop.security.SaslRpcClient.saslConnect(SaslRpcClient.java:376)\n", - "\tat org.apache.hadoop.ipc.Client$Connection.setupSaslConnection(Client.java:622)\n", - "\tat org.apache.hadoop.ipc.Client$Connection.access$2300(Client.java:413)\n", - "\tat org.apache.hadoop.ipc.Client$Connection$2.run(Client.java:822)\n", - "\tat org.apache.hadoop.ipc.Client$Connection$2.run(Client.java:818)\n", - "\tat java.base/java.security.AccessController.doPrivileged(Native Method)\n", - "\tat java.base/javax.security.auth.Subject.doAs(Subject.java:423)\n", - "\tat org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1845)\n", - "\tat org.apache.hadoop.ipc.Client$Connection.setupIOstreams(Client.java:818)\n", - "\tat org.apache.hadoop.ipc.Client$Connection.access$3800(Client.java:413)\n", - "\tat org.apache.hadoop.ipc.Client.getConnection(Client.java:1636)\n", - "\tat org.apache.hadoop.ipc.Client.call(Client.java:1452)\n", - "\tat org.apache.hadoop.ipc.Client.call(Client.java:1405)\n", - "\tat org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:234)\n", - "\tat org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:119)\n", - "\tat com.sun.proxy.$Proxy12.getFileInfo(Unknown Source)\n", - "\tat org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.getFileInfo(ClientNamenodeProtocolTranslatorPB.java:964)\n", - "\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n", - "\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n", - "\tat java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n", - "\tat java.base/java.lang.reflect.Method.invoke(Method.java:566)\n", - "\tat org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:422)\n", - "\tat org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeMethod(RetryInvocationHandler.java:165)\n", - "\tat org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invoke(RetryInvocationHandler.java:157)\n", - "\tat org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeOnce(RetryInvocationHandler.java:95)\n", - "\tat org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:359)\n", - "\tat com.sun.proxy.$Proxy13.getFileInfo(Unknown Source)\n", - "\tat org.apache.hadoop.hdfs.DFSClient.getFileInfo(DFSClient.java:1731)\n", - "\tat org.apache.hadoop.hdfs.DistributedFileSystem$29.doCall(DistributedFileSystem.java:1725)\n", - "\tat org.apache.hadoop.hdfs.DistributedFileSystem$29.doCall(DistributedFileSystem.java:1722)\n", - "\tat 
org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)\n", - "\tat org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:1737)\n", - "\tat org.apache.hadoop.fs.Globber.getFileStatus(Globber.java:115)\n", - "\tat org.apache.hadoop.fs.Globber.doGlob(Globber.java:349)\n", - "\tat org.apache.hadoop.fs.Globber.glob(Globber.java:202)\n", - "\tat org.apache.hadoop.fs.FileSystem.globStatus(FileSystem.java:2093)\n", - "\tat org.apache.hadoop.fs.shell.PathData.expandAsGlob(PathData.java:353)\n", - "\tat org.apache.hadoop.fs.shell.Command.expandArgument(Command.java:250)\n", - "\tat org.apache.hadoop.fs.shell.Command.expandArguments(Command.java:233)\n", - "\tat org.apache.hadoop.fs.shell.FsCommand.processRawArguments(FsCommand.java:104)\n", - "\tat org.apache.hadoop.fs.shell.Command.run(Command.java:177)\n", - "\tat org.apache.hadoop.fs.FsShell.run(FsShell.java:327)\n", - "\tat org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:76)\n", - "\tat org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:90)\n", - "\tat org.apache.hadoop.fs.FsShell.main(FsShell.java:390)\n", - "Found 41 items\n", - "-rw-r--r-x+ 3 cmssqoop c3 0 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/_SUCCESS\n", - "-rw-r--r-x+ 3 cmssqoop c3 88187830 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00000.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 78573788 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00001.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 89288020 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00002.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 87120186 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00003.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 84145506 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00004.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 77023084 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00005.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 82231949 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00006.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 90427579 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00007.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 83505019 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00008.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 81737327 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00009.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 89063315 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00010.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 87547076 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00011.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 76025866 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00012.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 86124517 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00013.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 84209698 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00014.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 87883924 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00015.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 84024611 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00016.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 88549765 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00017.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 
78591247 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00018.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 88304711 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00019.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 84004574 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00020.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 84661738 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00021.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 78502498 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00022.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 91523366 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00023.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 77450183 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00024.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 92852942 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00025.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 85201132 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00026.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 83220428 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00027.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 72640822 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00028.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 74597749 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00029.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 83142949 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00030.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 86601475 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00031.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 90497549 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00032.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 88555030 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00033.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 78799199 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00034.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 80642314 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00035.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 85967465 2023-07-24 04:35 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00036.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 92843317 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00037.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 83861741 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00038.avro\n", - "-rw-r--r-x+ 3 cmssqoop c3 91545885 2023-07-24 04:36 /project/awg/cms/rucio/2023-07-24/rules_history/part-m-00039.avro\n" - ] - } - ], - "source": [ - "!hdfs dfs -ls /project/awg/cms/rucio/2023-07-24/rules_history #02:54:14" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "800a2f9e", - "metadata": {}, - "outputs": [], - "source": [ - "import pickle\n", - "from datetime import datetime, timedelta\n", - "\n", - "import click\n", - "import os\n", - "import pandas as pd\n", - "import pprint\n", - "import time\n", - "from dateutil.relativedelta import relativedelta\n", - "from pyspark import SparkContext, StorageLevel\n", - "from pyspark.sql import SparkSession\n", - "from pyspark.sql.functions import (\n", - " col, collect_list, concat_ws, greatest, lit, lower, when,\n", - " avg as _avg,\n", - " count as _count,\n", - " hex as _hex,\n", - " max as _max,\n", - " min as _min,\n", - " round as _round,\n", - " sum as _sum,\n", - 
")\n", - "\n", - "from pyspark.sql.types import (\n", - " LongType,\n", - ")\n", - "\n", - "#from CMSSpark.src.python.CMSSpark import schemas as cms_schemas" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "6951caed", - "metadata": {}, - "outputs": [], - "source": [ - "#from CMSSpark import schemas as cms_schemas" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4e78c524", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "e597820f", - "metadata": {}, - "source": [ - "## load dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "2c100a92", - "metadata": {}, - "outputs": [], - "source": [ - "# end_date = str(datetime.now())[:10]\n", - "# start_date = str(datetime.now()-timedelta(days=1))[:10]\n", - "\n", - "end_date = '2023-07-31'\n", - "start_date = '2023-07-01'\n", - "\n", - "TOYEAR = end_date[:4]\n", - "\n", - "wa_date = end_date\n", - "HDFS_RUCIO_DATASET_LOCKS = f'/project/awg/cms/rucio/{wa_date}/dataset_locks/part*.avro'\n", - "HDFS_RUCIO_LOCKS = f'/project/awg/cms/rucio/{wa_date}/locks'\n", - "HDFS_RUCIO_RSES = f'/project/awg/cms/rucio/{wa_date}/rses/part*.avro'\n", - "HDFS_RUCIO_RULES = f'/project/awg/cms/rucio/{wa_date}/rules'\n", - "HDFS_RUCIO_RULES_HISTORY = f'/project/awg/cms/rucio/{wa_date}/rules_history'\n", - "HDFS_RUCIO_REPLICAS = f'/project/awg/cms/rucio/{wa_date}/replicas'" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "fe62d431", - "metadata": {}, - "outputs": [], - "source": [ - "# rucio_dataset_locks = spark.read.format('avro').load(HDFS_RUCIO_DATASET_LOCKS)\\\n", - "# .withColumn('BYTES', col('BYTES').cast(LongType()))\\\n", - "# .withColumn('RULE_ID', lower(_hex(col('RULE_ID'))))\\\n", - "# .withColumn('RSE_ID', lower(_hex(col('RSE_ID'))))\n", - "# rucio_dataset_locks.createOrReplaceTempView(\"dataset_locks\")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "b2e4fcfa", - "metadata": {}, - "outputs": [], - "source": [ - "# rucio_rses = spark.read.format('avro').load(HDFS_RUCIO_RSES)\\\n", - "# .withColumn('ID', lower(_hex(col('ID'))))\n", - "# rucio_rses.createOrReplaceTempView(\"rses\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "3893197e", - "metadata": {}, - "outputs": [], - "source": [ - "# rucio_rules = spark.read.format('avro').load(HDFS_RUCIO_RULES)\\\n", - "# .withColumn('ID', lower(_hex(col('ID'))))\n", - "# rucio_rules.createOrReplaceTempView(\"rules\")\n", - "# #spark.sql(\"SELECT * FROM rules\").count()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "f9f2ba4e", - "metadata": {}, - "outputs": [], - "source": [ - "# rucio_locks = spark.read.format('avro').load(HDFS_RUCIO_LOCKS)\\\n", - "# .withColumn('BYTES', col('BYTES').cast(LongType()))\\\n", - "# .withColumn('RULE_ID', lower(_hex(col('RULE_ID'))))\\\n", - "# .withColumn('RSE_ID', lower(_hex(col('RSE_ID'))))\n", - "# rucio_locks.createOrReplaceTempView(\"locks\")" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "7771b12d", - "metadata": {}, - "outputs": [], - "source": [ - "rucio_rules_history = spark.read.format('avro').load(HDFS_RUCIO_RULES_HISTORY)\\\n", - " .withColumn('ID', lower(_hex(col('ID'))))\n", - " #.persist(StorageLevel.DISK_ONLY)\n", - "rucio_rules_history = rucio_rules_history.select(\"ID\", \"NAME\", \"STATE\", \"EXPIRES_AT\", \"UPDATED_AT\", \"CREATED_AT\", \"ACCOUNT\")\n", - "rucio_rules_history.createOrReplaceTempView(\"rules_history\")\n", - "#spark.sql(\"SELECT * 
FROM rules_history\").count()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "274421b8", - "metadata": {}, - "outputs": [], - "source": [ - "# rucio_replicas = spark.read.format('avro').load(HDFS_RUCIO_REPLICAS)\\\n", - "# .withColumn('RSE_ID', lower(_hex(col('RSE_ID'))))\n", - "# rucio_replicas.createOrReplaceTempView(\"replicas\")\n", - "# #spark.sql(\"SELECT * FROM replicas\").count()" - ] - }, - { - "cell_type": "markdown", - "id": "5c84635f", - "metadata": {}, - "source": [ - "## Query" - ] - }, - { - "cell_type": "markdown", - "id": "ee99f580", - "metadata": {}, - "source": [ - "# how long does it take ?" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "26120cd9", - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "# # NOTE: days is ceiling\n", - "\n", - "# spark.sql(\"\"\"\n", - "# WITH filter_t AS (\n", - "# SELECT ID, NAME, STATE, EXPIRES_AT, UPDATED_AT, CREATED_AT\n", - "# FROM rules_history \n", - "# WHERE 1=1\n", - "# AND ACCOUNT = \"crab_tape_recall\"\n", - "# --- we look at the rule created this year (2023)\n", - "# AND CREATED_AT >= unix_timestamp(\"2023-01-01 00:00:00\", \"yyyy-MM-dd HH:mm:ss\")*1000 \n", - "# ),\n", - "# rn_t AS (\n", - "# SELECT ID, NAME, STATE, EXPIRES_AT, UPDATED_AT, CREATED_AT,\n", - "# row_number() over(partition by ID order by UPDATED_AT desc) as rn --- to get only latest state for each id\n", - "# FROM filter_t\n", - "# ),\n", - "# calc_days_t AS (\n", - "# SELECT ID, NAME, STATE, \n", - "# from_unixtime(EXPIRES_AT/1000, 'yyyy-MM-dd HH:mm:ss') AS EXPIRES_AT, \n", - "# from_unixtime(UPDATED_AT/1000, 'yyyy-MM-dd HH:mm:ss') AS UPDATED_AT, \n", - "# from_unixtime(CREATED_AT/1000, 'yyyy-MM-dd HH:mm:ss') AS CREATED_AT,\n", - "# --- if state is O we calculate from update_at when state change (assumed that there is only single row for O state)\n", - "# --- but if state is not O, we calculate from expired time, it usually 14 days but it is possible that rules somehow got extend\n", - "# --- other wise days = 0 for filter the rules that not expire \n", - "# CASE \n", - "# WHEN STATE = 'O' THEN ceil((UPDATED_AT-CREATED_AT)/86400000) \n", - "# WHEN STATE != 'O' AND EXPIRES_AT < unix_timestamp(\"2023-05-22 00:00:00\", \"yyyy-MM-dd HH:mm:ss\")*1000 THEN ceil((EXPIRES_AT-CREATED_AT)/86400000)\n", - "# ELSE 0\n", - "# END AS DAYS\n", - "# FROM rn_t\n", - "# WHERE rn = 1\n", - "# )\n", - "# SELECT * \n", - "# FROM calc_days_t\n", - "# ---AND STATE == 'O'\n", - "# \"\"\"\n", - "# ).show(50,truncate=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "fadde59c", - "metadata": {}, - "outputs": [], - "source": [ - "## query use to produce data to elasticsearch\n", - "\n", - "query = f\"\"\"\\\n", - "WITH filter_t AS (\n", - "SELECT ID, NAME, STATE, EXPIRES_AT, UPDATED_AT, CREATED_AT\n", - "FROM rules_history \n", - "WHERE 1=1\n", - "AND ACCOUNT = \"crab_tape_recall\"\n", - "AND CREATED_AT >= unix_timestamp(\"{TOYEAR}-01-01 00:00:00\", \"yyyy-MM-dd HH:mm:ss\")*1000\n", - "),\n", - "rn_t AS (\n", - "SELECT ID, NAME, STATE, EXPIRES_AT, UPDATED_AT, CREATED_AT,\n", - "row_number() over(partition by ID order by UPDATED_AT desc) as rn\n", - "FROM filter_t\n", - "),\n", - "calc_days_t AS (\n", - "SELECT ID, NAME, STATE, EXPIRES_AT, UPDATED_AT, CREATED_AT,\n", - " CASE \n", - " WHEN STATE = 'O' THEN ceil((UPDATED_AT-CREATED_AT)/86400000) \n", - " WHEN STATE != 'O' AND EXPIRES_AT < unix_timestamp(\"{wa_date} 00:00:00\", \"yyyy-MM-dd HH:mm:ss\")*1000 THEN 
ceil((EXPIRES_AT-CREATED_AT)/86400000)\n", - " ELSE 0\n", - " END AS DAYS\n", - "FROM rn_t\n", - "WHERE rn = 1\n", - ")\n", - "SELECT * \n", - "FROM calc_days_t\n", - "WHERE 1=1\n", - "AND EXPIRES_AT >= unix_timestamp(\"{start_date} 00:00:00\", \"yyyy-MM-dd HH:mm:ss\")*1000 \n", - "AND EXPIRES_AT < unix_timestamp(\"{end_date} 00:00:00\", \"yyyy-MM-dd HH:mm:ss\")*1000 \n", - "\"\"\"\n", - "\n", - "tmpdf = spark.sql(query)\n", - "# str(datetime.now()-timedelta(days=1))[:10]" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "b44548ef", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+--------------------+--------------------+-----+-------------+-------------+-------------+----+\n", - "| ID| NAME|STATE| EXPIRES_AT| UPDATED_AT| CREATED_AT|DAYS|\n", - "+--------------------+--------------------+-----+-------------+-------------+-------------+----+\n", - "|16e7eeb0a6c447839...|/DYJetsToLL_LHEFi...| O|1689496342000|1689130290000|1686566168000| 30|\n", - "|27aea75d1d364b219...|/WJetsToLNu_HT-20...| O|1689568449000|1689086563000|1686931142000| 25|\n", - "|3f2d7fcff69d49079...|/ParkingBPH1/Run2...| R|1689522386000|1687621610000|1687621586000| 22|\n", - "|67d9f565492b4dec9...|/DYJetsToLL_M-10t...| R|1689519133000|1687618376000|1687618333000| 22|\n", - "|c2cbad3267e84ba18...|/TapeRecall/23061...| O|1689554004000|1689117261000|1686940766000| 26|\n", - "|d23ee08f6aac4d5db...|/QCD_HT300to500_T...| O|1689525153000|1689048723000|1686900417000| 25|\n", - "|ddfdfed2239940298...|/W2JetsToLNu_Tune...| R|1689517301000|1687616515000|1687616501000| 22|\n", - "|dee8dbd0a82b48b59...|/TapeRecall/23060...| O|1689525153000|1689127918000|1685747740000| 40|\n", - "+--------------------+--------------------+-----+-------------+-------------+-------------+----+\n", - "\n" - ] - } - ], - "source": [ - "tmpdf.show(50)" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "91db6a20", - "metadata": {}, - "outputs": [], - "source": [ - "tmpdf.printSchema()" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "e734d507", - "metadata": {}, - "outputs": [], - "source": [ - "docs = tmpdf.toPandas().to_dict('records')" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "id": "ac8524e0", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "8260" - ] - }, - "execution_count": 48, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(docs)" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "fa51e74c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'ID': '00049b4efb3e4dd091dbfed2012069df',\n", - " 'NAME': '/TapeRecall/221110_230609.dshmygol_crab_Bfinder_2018_MC_Bc_in_JpsiPI_v0_1/USER',\n", - " 'STATE': 'O',\n", - " 'EXPIRES_AT': 1669331191000,\n", - " 'UPDATED_AT': 1668133187000,\n", - " 'CREATED_AT': 1668121591000,\n", - " 'DAYS': 1},\n", - " {'ID': '0007a18199834a2ca720f088d96a3c9c',\n", - " 'NAME': '/TapeRecall/220427_065307.youying_crab_DiphoVtxUL2016_DoubleMuon_Run2016B-21Feb2020_ver2_UL2016_HIPM-v1/USER',\n", - " 'STATE': 'O',\n", - " 'EXPIRES_AT': 1652252233000,\n", - " 'UPDATED_AT': 1651048717000,\n", - " 'CREATED_AT': 1651042633000,\n", - " 'DAYS': 1},\n", - " {'ID': '00d4ba364b89477e888e8797a33092d2',\n", - " 'NAME': '/TapeRecall/210810_035101.jingqing_crab_BPHSkimOfficialChib06900-2016-v5/USER',\n", - " 'STATE': 'O',\n", - " 'EXPIRES_AT': 1629777107000,\n", - " 'UPDATED_AT': 1628847357000,\n", - " 'CREATED_AT': 
1628567507000,\n", - " 'DAYS': 4},\n", - " {'ID': '00fb74e1bafc40aba0736216b798a80c',\n", - " 'NAME': '/TapeRecall/230220_091052.shiyi_crab_RUN3_2022Dv2mass3_SKIM_E_newV2/USER',\n", - " 'STATE': 'R',\n", - " 'EXPIRES_AT': 1678093905000,\n", - " 'UPDATED_AT': 1678093224000,\n", - " 'CREATED_AT': 1676884305000,\n", - " 'DAYS': 14},\n", - " {'ID': '0116d88feb0842f29f78c78f2e7a4ce4',\n", - " 'NAME': '/TapeRecall/230113_215556.wjang_crab_NanoAODv9_v1_ST_t-channel_antitop_4f_InclusiveDecays_TuneCP5_13TeV-powheg-madspin-pythia8_postVFP/USER',\n", - " 'STATE': 'O',\n", - " 'EXPIRES_AT': 1674856682000,\n", - " 'UPDATED_AT': 1673966996000,\n", - " 'CREATED_AT': 1673647082000,\n", - " 'DAYS': 4}]" - ] - }, - "execution_count": 49, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "docs[:5]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c052b072", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "86f3a742", - "metadata": {}, - "outputs": [], - "source": [ - "import osearch" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "id": "6d29e62d", - "metadata": {}, - "outputs": [], - "source": [ - "def get_index_schema():\n", - " return {\n", - " \"settings\": {\"index\": {\"number_of_shards\": \"1\", \"number_of_replicas\": \"1\"}},\n", - " \"mappings\": {\n", - " \"properties\": {\n", - " \"timestamp\": {\"format\": \"epoch_second\", \"type\": \"date\"},\n", - " \"ID\": {\"ignore_above\": 1024, \"type\": \"keyword\"},\n", - " \"NAME\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", - " \"STATE\": {\"ignore_above\": 1024, \"type\": \"keyword\"},\n", - " \"EXPIRES_AT\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n", - " \"UPDATED_AT\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n", - " \"CREATED_AT\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n", - " \"DAYS\": {\"type\": \"long\"},\n", - " }\n", - " }\n", - " }" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "id": "b479eeb7", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/eos/user/e/eatthaph/.local/lib/python3.9/site-packages/opensearchpy/connection/http_urllib3.py:199: UserWarning: Connecting to https://es-cms1.cern.ch:443 using SSL with verify_certs=False is insecure.\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. 
See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/plain": [ - "0" - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "_index_template = 'crab-tape-recall-daily-ekong'\n", - "client = osearch.get_es_client(\"es-cms1.cern.ch/es\", 'secret_opensearch.txt', get_index_schema())\n", - "# index_mod=\"\": 'test-foo', index_mod=\"Y\": 'test-foo-YYYY', index_mod=\"M\": 'test-foo-YYYY-MM', index_mod=\"D\": 'test-foo-YYYY-MM-DD',\n", - "idx = client.get_or_create_index(timestamp=time.time(), index_template=_index_template, index_mod=\"M\")\n", - "client.send(idx, docs, metadata=None, batch_size=10000, drop_nulls=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0af51d3a", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "12ece939", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f4567c46", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "546e9d4f", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "496e681c", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "@webio": { - "lastCommId": null, - "lastKernelId": null - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.12" - }, - "sparkconnect": { - "bundled_options": [], - "list_of_options": [ - { - "name": "spark.jars.packages", - "value": "org.apache.spark:spark-avro_2.12:3.3.1" - } - ] - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/src/script/Monitor/crab-spark/notebooks/crab_tape_recall_updated_rules.ipynb b/src/script/Monitor/crab-spark/notebooks/crab_tape_recall_updated_rules.ipynb deleted file mode 100644 index 5311eb9dd5..0000000000 --- a/src/script/Monitor/crab-spark/notebooks/crab_tape_recall_updated_rules.ipynb +++ /dev/null @@ -1,889 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "9f91521a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - "

[HTML Spark session widget output, tags stripped: SparkSession - in-memory; SparkContext; Spark UI; Version v3.3.2; Master yarn; AppName pyspark_shell_swan]
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "666f70d9", - "metadata": {}, - "outputs": [], - "source": [ - "# !hdfs dfs -stat /project/awg/cms/rucio/2023-07-24/" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "bd6751a6", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "23/08/09 12:12:50 WARN ipc.Client: Exception encountered while connecting to the server \n", - "org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.ipc.StandbyException): Operation category READ is not supported in state standby. Visit https://s.apache.org/sbnn-error\n", - "\tat org.apache.hadoop.security.SaslRpcClient.saslConnect(SaslRpcClient.java:376)\n", - "\tat org.apache.hadoop.ipc.Client$Connection.setupSaslConnection(Client.java:622)\n", - "\tat org.apache.hadoop.ipc.Client$Connection.access$2300(Client.java:413)\n", - "\tat org.apache.hadoop.ipc.Client$Connection$2.run(Client.java:822)\n", - "\tat org.apache.hadoop.ipc.Client$Connection$2.run(Client.java:818)\n", - "\tat java.base/java.security.AccessController.doPrivileged(Native Method)\n", - "\tat java.base/javax.security.auth.Subject.doAs(Subject.java:423)\n", - "\tat org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1845)\n", - "\tat org.apache.hadoop.ipc.Client$Connection.setupIOstreams(Client.java:818)\n", - "\tat org.apache.hadoop.ipc.Client$Connection.access$3800(Client.java:413)\n", - "\tat org.apache.hadoop.ipc.Client.getConnection(Client.java:1636)\n", - "\tat org.apache.hadoop.ipc.Client.call(Client.java:1452)\n", - "\tat org.apache.hadoop.ipc.Client.call(Client.java:1405)\n", - "\tat org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:234)\n", - "\tat org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:119)\n", - "\tat com.sun.proxy.$Proxy12.getFileInfo(Unknown Source)\n", - "\tat org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.getFileInfo(ClientNamenodeProtocolTranslatorPB.java:964)\n", - "\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n", - "\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n", - "\tat java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n", - "\tat java.base/java.lang.reflect.Method.invoke(Method.java:566)\n", - "\tat org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:422)\n", - "\tat org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeMethod(RetryInvocationHandler.java:165)\n", - "\tat org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invoke(RetryInvocationHandler.java:157)\n", - "\tat org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeOnce(RetryInvocationHandler.java:95)\n", - "\tat org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:359)\n", - "\tat com.sun.proxy.$Proxy13.getFileInfo(Unknown Source)\n", - "\tat org.apache.hadoop.hdfs.DFSClient.getFileInfo(DFSClient.java:1731)\n", - "\tat org.apache.hadoop.hdfs.DistributedFileSystem$29.doCall(DistributedFileSystem.java:1725)\n", - "\tat org.apache.hadoop.hdfs.DistributedFileSystem$29.doCall(DistributedFileSystem.java:1722)\n", - "\tat 
org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)\n", - "\tat org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:1737)\n", - "\tat org.apache.hadoop.fs.Globber.getFileStatus(Globber.java:115)\n", - "\tat org.apache.hadoop.fs.Globber.doGlob(Globber.java:349)\n", - "\tat org.apache.hadoop.fs.Globber.glob(Globber.java:202)\n", - "\tat org.apache.hadoop.fs.FileSystem.globStatus(FileSystem.java:2093)\n", - "\tat org.apache.hadoop.fs.shell.PathData.expandAsGlob(PathData.java:353)\n", - "\tat org.apache.hadoop.fs.shell.Command.expandArgument(Command.java:250)\n", - "\tat org.apache.hadoop.fs.shell.Command.expandArguments(Command.java:233)\n", - "\tat org.apache.hadoop.fs.shell.FsCommand.processRawArguments(FsCommand.java:104)\n", - "\tat org.apache.hadoop.fs.shell.Command.run(Command.java:177)\n", - "\tat org.apache.hadoop.fs.FsShell.run(FsShell.java:327)\n", - "\tat org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:76)\n", - "\tat org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:90)\n", - "\tat org.apache.hadoop.fs.FsShell.main(FsShell.java:390)\n", - "Found 10 items\n", - "drwxrwxr-x+ - cmssqoop c3 0 2023-07-25 04:18 /project/awg/cms/rucio/2023-07-25/contents\n", - "drwxrwxr-x+ - cmssqoop c3 0 2023-07-25 04:22 /project/awg/cms/rucio/2023-07-25/dataset_locks\n", - "drwxrwxr-x+ - cmssqoop c3 0 2023-07-25 04:13 /project/awg/cms/rucio/2023-07-25/dids\n", - "drwxrwxr-x+ - cmssqoop c3 0 2023-07-25 04:28 /project/awg/cms/rucio/2023-07-25/locks\n", - "drwxrwxr-x+ - cmssqoop c3 0 2023-07-25 04:06 /project/awg/cms/rucio/2023-07-25/replicas\n", - "drwxrwxr-x+ - cmssqoop c3 0 2023-07-25 04:46 /project/awg/cms/rucio/2023-07-25/requests_history\n", - "drwxrwxr-x+ - cmssqoop c3 0 2023-07-25 04:53 /project/awg/cms/rucio/2023-07-25/rses\n", - "drwxrwxr-x+ - cmssqoop c3 0 2023-07-25 04:33 /project/awg/cms/rucio/2023-07-25/rules\n", - "drwxrwxr-x+ - cmssqoop c3 0 2023-07-25 04:38 /project/awg/cms/rucio/2023-07-25/rules_history\n", - "drwxrwxr-x+ - cmssqoop c3 0 2023-07-25 04:50 /project/awg/cms/rucio/2023-07-25/subscriptions\n" - ] - } - ], - "source": [ - "# check available files\n", - "!hdfs dfs -ls /project/awg/cms/rucio/2023-07-25" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "800a2f9e", - "metadata": {}, - "outputs": [], - "source": [ - "import pickle\n", - "from datetime import datetime, timedelta\n", - "\n", - "import click\n", - "import os\n", - "import pandas as pd\n", - "import pprint\n", - "import time\n", - "from dateutil.relativedelta import relativedelta\n", - "from pyspark import SparkContext, StorageLevel\n", - "from pyspark.sql import SparkSession\n", - "from pyspark.sql.functions import (\n", - " col, collect_list, concat_ws, greatest, lit, lower, when,\n", - " avg as _avg,\n", - " count as _count,\n", - " hex as _hex,\n", - " max as _max,\n", - " min as _min,\n", - " round as _round,\n", - " sum as _sum,\n", - ")\n", - "\n", - "from pyspark.sql.types import (\n", - " LongType,\n", - ")\n", - "\n", - "#from CMSSpark.src.python.CMSSpark import schemas as cms_schemas" - ] - }, - { - "cell_type": "markdown", - "id": "e597820f", - "metadata": {}, - "source": [ - "## load dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "2c100a92", - "metadata": {}, - "outputs": [], - "source": [ - "wa_date = str(datetime.now())[:10]\n", - "# wa_date = \"2023-08-08\"\n", - "\n", - "HDFS_RUCIO_DATASET_LOCKS = f'/project/awg/cms/rucio/{wa_date}/dataset_locks/part*.avro'\n", - "# 
HDFS_RUCIO_LOCKS = f'/project/awg/cms/rucio/{wa_date}/locks'\n", - "HDFS_RUCIO_RSES = f'/project/awg/cms/rucio/{wa_date}/rses/part*.avro'\n", - "HDFS_RUCIO_RULES = f'/project/awg/cms/rucio/{wa_date}/rules'\n", - "# HDFS_RUCIO_RULES_HISTORY = f'/project/awg/cms/rucio/{wa_date}/rules_history'\n", - "# HDFS_RUCIO_REPLICAS = f'/project/awg/cms/rucio/{wa_date}/replicas'" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "fe62d431", - "metadata": {}, - "outputs": [], - "source": [ - "rucio_dataset_locks = spark.read.format('avro').load(HDFS_RUCIO_DATASET_LOCKS)\\\n", - " .withColumn('BYTES', col('BYTES').cast(LongType()))\\\n", - " .withColumn('RULE_ID', lower(_hex(col('RULE_ID'))))\\\n", - " .withColumn('RSE_ID', lower(_hex(col('RSE_ID')))).filter(f\"\"\"ACCOUNT IN ('crab_tape_recall')\"\"\").cache()\n", - "rucio_dataset_locks.createOrReplaceTempView(\"dataset_locks\")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "b2e4fcfa", - "metadata": {}, - "outputs": [], - "source": [ - "rucio_rses = spark.read.format('avro').load(HDFS_RUCIO_RSES)\\\n", - " .withColumn('ID', lower(_hex(col('ID'))))\n", - "rucio_rses.createOrReplaceTempView(\"rses\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "3893197e", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "23/08/09 12:37:11 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n" - ] - } - ], - "source": [ - "rucio_rules = spark.read.format('avro').load(HDFS_RUCIO_RULES)\\\n", - " .withColumn('ID', lower(_hex(col('ID'))))\n", - "rucio_rules.createOrReplaceTempView(\"rules\")\n", - "#spark.sql(\"SELECT * FROM rules\").count()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "f9f2ba4e", - "metadata": {}, - "outputs": [], - "source": [ - "# rucio_locks = spark.read.format('avro').load(HDFS_RUCIO_LOCKS)\\\n", - "# .withColumn('BYTES', col('BYTES').cast(LongType()))\\\n", - "# .withColumn('RULE_ID', lower(_hex(col('RULE_ID'))))\\\n", - "# .withColumn('RSE_ID', lower(_hex(col('RSE_ID'))))\n", - "# rucio_locks.createOrReplaceTempView(\"locks\")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "7771b12d", - "metadata": {}, - "outputs": [], - "source": [ - "# rucio_rules_history = spark.read.format('avro').load(HDFS_RUCIO_RULES_HISTORY)\\\n", - "# .withColumn('ID', lower(_hex(col('ID'))))\n", - "# #.persist(StorageLevel.DISK_ONLY)\n", - "# rucio_rules_history.createOrReplaceTempView(\"rules_history\")\n", - "# #spark.sql(\"SELECT * FROM rules_history\").count()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "274421b8", - "metadata": {}, - "outputs": [], - "source": [ - "# rucio_replicas = spark.read.format('avro').load(HDFS_RUCIO_REPLICAS)\\\n", - "# .withColumn('RSE_ID', lower(_hex(col('RSE_ID'))))\n", - "# rucio_replicas.createOrReplaceTempView(\"replicas\")\n", - "# #spark.sql(\"SELECT * FROM replicas\").count()" - ] - }, - { - "cell_type": "markdown", - "id": "5c84635f", - "metadata": {}, - "source": [ - "## Query" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "9be915ed", - "metadata": {}, - "outputs": [], - "source": [ - "# rucio_dataset_locks.count()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "8648794b", - "metadata": {}, - "outputs": [], - "source": [ - "# rucio_dataset_locks.printSchema()\n", - "# rucio_rses.printSchema()\n", - 
"# rucio_rules.printSchema()" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "3aed55c6", - "metadata": {}, - "outputs": [], - "source": [ - "# rucio_dataset_locks = rucio_dataset_locks.select('')\n", - "rucio_rses = rucio_rses.select('ID', 'RSE', 'RSE_TYPE').cache()\n", - "rucio_rules = rucio_rules.select('ID', 'ACCOUNT', 'DID_TYPE', 'EXPIRES_AT').cache()" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "929705b6", - "metadata": {}, - "outputs": [], - "source": [ - "result_df = rucio_dataset_locks.join(rucio_rses, rucio_rses[\"ID\"] == rucio_dataset_locks[\"RSE_ID\"])\\\n", - " .join(rucio_rules, rucio_rules[\"ID\"] == rucio_dataset_locks[\"RULE_ID\"]).drop('ID', 'RULE_ID', 'RSE_ID', 'ACCESSED_AT', 'ACCOUNT')" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "49af7fee", - "metadata": {}, - "outputs": [], - "source": [ - "# result_df.show(100)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "91db6a20", - "metadata": {}, - "outputs": [], - "source": [ - "# result_df.printSchema()" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "7cbdf730", - "metadata": {}, - "outputs": [], - "source": [ - "# result_df.count()" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "e734d507", - "metadata": {}, - "outputs": [], - "source": [ - "docs = result_df.toPandas().to_dict('records')" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "ac8524e0", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "17770" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(docs)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "6d047c66", - "metadata": {}, - "outputs": [], - "source": [ - "for i in range(len(docs)):\n", - " docs[i]['SIZE_TiB'] = docs[i][\"BYTES\"]/1099511627776\n", - " del docs[i][\"BYTES\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "c052b072", - "metadata": {}, - "outputs": [], - "source": [ - "TIME = datetime.strptime(f\"\"\"{wa_date} 00:00:00\"\"\", \"%Y-%m-%d %H:%M:%S\").timestamp()*1000\n", - "for i in range(len(docs)):\n", - " docs[i]['TIMESTAMP'] = TIME" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "id": "836a7743", - "metadata": {}, - "outputs": [], - "source": [ - "for i in range(len(docs)):\n", - " NAME_i = docs[i]['NAME']\n", - " split_NAME = NAME_i.split('#')[0]\n", - " docs[i]['NAME_'] = NAME_i.split('#')[0]\n", - " split_NAME = docs[i]['NAME_'].split('/')\n", - " if len(split_NAME) != 4:\n", - " print(\"YO HOO !!, something wrong.\", NAME_i)\n", - " docs[i]['PriDataset'] = split_NAME[1]\n", - " docs[i]['DataTier'] = split_NAME[-1] " - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "id": "51bf031e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'SCOPE': 'cms',\n", - " 'NAME': '/ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8/RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3/MINIAODSIM#c7b37e2d-77d8-40b9-b8c9-cdf7658406bd',\n", - " 'STATE': 'O',\n", - " 'LENGTH': '1',\n", - " 'UPDATED_AT': 1689164433000,\n", - " 'CREATED_AT': 1689096938000,\n", - " 'RSE': 'T2_UK_SGrid_RALPP',\n", - " 'RSE_TYPE': 'DISK',\n", - " 'DID_TYPE': 'C',\n", - " 'EXPIRES_AT': 1691719252000,\n", - " 'SIZE_TiB': 0.0003293267427579849,\n", - " 'TIMESTAMP': 1691532000000.0,\n", - " 'NAME_': 
'/ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8/RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3/MINIAODSIM',\n", - " 'PriDataset': 'ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8',\n", - " 'DataTier': 'MINIAODSIM'},\n", - " {'SCOPE': 'cms',\n", - " 'NAME': '/ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8/RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3/MINIAODSIM#4e06c095-6b19-46a1-a6a6-321e6692a086',\n", - " 'STATE': 'O',\n", - " 'LENGTH': '1',\n", - " 'UPDATED_AT': 1689164433000,\n", - " 'CREATED_AT': 1689096938000,\n", - " 'RSE': 'T2_UK_SGrid_RALPP',\n", - " 'RSE_TYPE': 'DISK',\n", - " 'DID_TYPE': 'C',\n", - " 'EXPIRES_AT': 1691719252000,\n", - " 'SIZE_TiB': 0.00011089865711255698,\n", - " 'TIMESTAMP': 1691532000000.0,\n", - " 'NAME_': '/ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8/RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3/MINIAODSIM',\n", - " 'PriDataset': 'ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8',\n", - " 'DataTier': 'MINIAODSIM'},\n", - " {'SCOPE': 'cms',\n", - " 'NAME': '/ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8/RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3/MINIAODSIM#1a79fa1f-9f97-4f0f-9716-523e29e57c32',\n", - " 'STATE': 'O',\n", - " 'LENGTH': '1',\n", - " 'UPDATED_AT': 1689164433000,\n", - " 'CREATED_AT': 1689096938000,\n", - " 'RSE': 'T2_UK_SGrid_RALPP',\n", - " 'RSE_TYPE': 'DISK',\n", - " 'DID_TYPE': 'C',\n", - " 'EXPIRES_AT': 1691719252000,\n", - " 'SIZE_TiB': 0.001415386764165305,\n", - " 'TIMESTAMP': 1691532000000.0,\n", - " 'NAME_': '/ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8/RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3/MINIAODSIM',\n", - " 'PriDataset': 'ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8',\n", - " 'DataTier': 'MINIAODSIM'},\n", - " {'SCOPE': 'cms',\n", - " 'NAME': '/ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8/RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3/MINIAODSIM#18958704-f8f5-4ab4-8d26-0875a74714c4',\n", - " 'STATE': 'O',\n", - " 'LENGTH': '1',\n", - " 'UPDATED_AT': 1689164433000,\n", - " 'CREATED_AT': 1689096938000,\n", - " 'RSE': 'T2_UK_SGrid_RALPP',\n", - " 'RSE_TYPE': 'DISK',\n", - " 'DID_TYPE': 'C',\n", - " 'EXPIRES_AT': 1691719252000,\n", - " 'SIZE_TiB': 0.0008716376141819637,\n", - " 'TIMESTAMP': 1691532000000.0,\n", - " 'NAME_': '/ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8/RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3/MINIAODSIM',\n", - " 'PriDataset': 'ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8',\n", - " 'DataTier': 'MINIAODSIM'},\n", - " {'SCOPE': 'cms',\n", - " 'NAME': '/ParkingDoubleMuonLowMass1/Run2023C-PromptReco-v3/AOD#ef5c7b53-7002-4b16-bd94-c9e6cbd1ddc6',\n", - " 'STATE': 'O',\n", - " 'LENGTH': '1',\n", - " 'UPDATED_AT': 1689903482000,\n", - " 'CREATED_AT': 1689587082000,\n", - " 'RSE': 'T2_BE_UCL',\n", - " 'RSE_TYPE': 'DISK',\n", - " 'DID_TYPE': 'C',\n", - " 'EXPIRES_AT': 1692496353000,\n", - " 'SIZE_TiB': 5.84150075155776e-06,\n", - " 'TIMESTAMP': 1691532000000.0,\n", - " 'NAME_': '/ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8/RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3/MINIAODSIM',\n", - " 'PriDataset': 
'ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8',\n", - " 'DataTier': 'MINIAODSIM'}]" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "docs[:5]" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "5c770068", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['',\n", - " 'ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8',\n", - " 'RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3',\n", - " 'MINIAODSIM#c7b37e2d-77d8-40b9-b8c9-cdf7658406bd']" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "split_str = test_str.split('/')\n", - "split_str" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "2a2868f7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['MINIAODSIM', 'c7b37e2d-77d8-40b9-b8c9-cdf7658406bd']" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "split_str[3].split('#')" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "86f3a742", - "metadata": {}, - "outputs": [], - "source": [ - "import osearch" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "6d29e62d", - "metadata": {}, - "outputs": [], - "source": [ - "def get_index_schema():\n", - " return {\n", - " \"settings\": {\"index\": {\"number_of_shards\": \"1\", \"number_of_replicas\": \"1\"}},\n", - " \"mappings\": {\n", - " \"properties\": {\n", - " 'SCOPE': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", - " 'NAME': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", - " 'STATE': {\"ignore_above\": 1024, \"type\": \"keyword\"},\n", - " 'LENGTH': {\"ignore_above\": 1024, \"type\": \"keyword\"},\n", - " 'SIZE_TiB': {\"type\": \"long\"},\n", - " 'UPDATED_AT': {\"format\": \"epoch_millis\", \"type\": \"date\"},\n", - " 'CREATED_AT': {\"format\": \"epoch_millis\", \"type\": \"date\"},\n", - " 'RSE': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", - " 'RSE_TYPE': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", - " 'DID_TYPE': {\"ignore_above\": 1024, \"type\": \"keyword\"},\n", - " 'EXPIRES_AT': {\"format\": \"epoch_millis\", \"type\": \"date\"},\n", - " 'TIMESTAMP': {\"format\": \"epoch_millis\", \"type\": \"date\"},\n", - " 'NAME_': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", - " 'PriDataset': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", - " 'DataTier': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", - " }\n", - " }\n", - " }" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "b479eeb7", - "metadata": {}, - "outputs": [], - "source": [ - "# _index_template = 'crab-tape-recall-rules-ekong'\n", - "# client = osearch.get_es_client(\"es-cms1.cern.ch/es\", 'secret_opensearch.txt', get_index_schema())\n", - "# # index_mod=\"\": 'test-foo', index_mod=\"Y\": 'test-foo-YYYY', index_mod=\"M\": 'test-foo-YYYY-MM', index_mod=\"D\": 'test-foo-YYYY-MM-DD',\n", - "# idx = client.get_or_create_index(timestamp=time.time(), index_template=_index_template, index_mod=\"M\")\n", - "# client.send(idx, docs, metadata=None, batch_size=10000, drop_nulls=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "0af51d3a", - "metadata": {}, - "outputs": [], - "source": [ - "from datetime import datetime, timedelta\n", - "import os\n", - "import pandas as pd\n", - "import time\n", - "from pyspark import SparkContext, 
StorageLevel\n", - "from pyspark.sql import SparkSession\n", - "from pyspark.sql.functions import (\n", - " col, collect_list, concat_ws, greatest, lit, lower, when,\n", - " avg as _avg,\n", - " count as _count,\n", - " hex as _hex,\n", - " max as _max,\n", - " min as _min,\n", - " round as _round,\n", - " sum as _sum,\n", - ")\n", - "\n", - "from pyspark.sql.types import (\n", - " LongType,\n", - ")\n", - "\n", - "import numpy as np\n", - "import osearch\n", - "from pyspark.sql import SparkSession" - ] - }, - { - "cell_type": "markdown", - "id": "035e6ecf", - "metadata": {}, - "source": [ - "## Multiple Day Upload" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "12ece939", - "metadata": {}, - "outputs": [], - "source": [ - "def multi_upload(start_date, end_date):\n", - " # change to the date of collected data\n", - " start_date = start_date + timedelta(days=1)\n", - " end_date = end_date + timedelta(days=1)\n", - " \n", - " days = (end_date - start_date).days\n", - " for i in range(days):\n", - " TODAY = start_date + timedelta(days=i)\n", - " TODAY = str(TODAY)[:10]\n", - " \n", - " print(TODAY)\n", - " # Import data into database form\n", - "\n", - " wa_date = TODAY\n", - " HDFS_RUCIO_DATASET_LOCKS = f'/project/awg/cms/rucio/{wa_date}/dataset_locks/part*.avro'\n", - " HDFS_RUCIO_RSES = f'/project/awg/cms/rucio/{wa_date}/rses/part*.avro'\n", - " HDFS_RUCIO_RULES = f'/project/awg/cms/rucio/{wa_date}/rules'\n", - "\n", - " rucio_dataset_locks = spark.read.format('avro').load(HDFS_RUCIO_DATASET_LOCKS)\\\n", - " .withColumn('BYTES', col('BYTES').cast(LongType()))\\\n", - " .withColumn('RULE_ID', lower(_hex(col('RULE_ID'))))\\\n", - " .withColumn('RSE_ID', lower(_hex(col('RSE_ID')))).filter(f\"\"\"ACCOUNT IN ('crab_tape_recall')\"\"\").cache()\n", - " rucio_dataset_locks.createOrReplaceTempView(\"dataset_locks\")\n", - "\n", - " rucio_rses = spark.read.format('avro').load(HDFS_RUCIO_RSES)\\\n", - " .withColumn('ID', lower(_hex(col('ID'))))\n", - " rucio_rses.createOrReplaceTempView(\"rses\")\n", - "\n", - " rucio_rules = spark.read.format('avro').load(HDFS_RUCIO_RULES)\\\n", - " .withColumn('ID', lower(_hex(col('ID'))))\n", - " rucio_rules.createOrReplaceTempView(\"rules\")\n", - "\n", - " # filter and query\n", - "\n", - " rucio_rses = rucio_rses.select('ID', 'RSE', 'RSE_TYPE').cache()\n", - " rucio_rules = rucio_rules.select('ID', 'ACCOUNT', 'DID_TYPE', 'EXPIRES_AT').cache()\n", - "\n", - " result_df = rucio_dataset_locks.join(rucio_rses, rucio_rses[\"ID\"] == rucio_dataset_locks[\"RSE_ID\"])\\\n", - " .join(rucio_rules, rucio_rules[\"ID\"] == rucio_dataset_locks[\"RULE_ID\"]).drop('ID', 'RULE_ID', 'RSE_ID', 'ACCESSED_AT', 'ACCOUNT')\n", - "\n", - " # Convert database to dictionary\n", - "\n", - " docs = result_df.toPandas().to_dict('records')\n", - " \n", - " # Add TIMESTAMP column and convert TiB\n", - " TIME = datetime.strptime(f\"\"\"{wa_date} 00:00:00\"\"\", \"%Y-%m-%d %H:%M:%S\").timestamp()*1000\n", - " for i in range(len(docs)):\n", - " docs[i]['TIMESTAMP'] = TIME\n", - " docs[i]['SIZE_TiB'] = docs[i][\"BYTES\"]/1099511627776\n", - " del docs[i][\"BYTES\"]\n", - " \n", - " # break down the name\n", - " NAME_i = docs[i]['NAME']\n", - " split_NAME = NAME_i.split('#')[0]\n", - " docs[i]['NAME_'] = NAME_i.split('#')[0]\n", - " split_NAME = docs[i]['NAME_'].split('/')\n", - " if len(split_NAME) != 4:\n", - " print(\"YO HOO !!, something wrong.\", NAME_i)\n", - " docs[i]['PriDataset'] = split_NAME[1]\n", - " docs[i]['DataTier'] = split_NAME[-1]\n", - "\n", - " # Define 
type of each schema\n", - "\n", - " def get_index_schema():\n", - " return {\n", - " \"settings\": {\"index\": {\"number_of_shards\": \"1\", \"number_of_replicas\": \"1\"}},\n", - " \"mappings\": {\n", - " \"properties\": {\n", - " 'SCOPE': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", - " 'NAME': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", - " 'STATE': {\"ignore_above\": 1024, \"type\": \"keyword\"},\n", - " 'LENGTH': {\"ignore_above\": 1024, \"type\": \"keyword\"},\n", - " 'BYTES': {\"type\": \"long\"},\n", - " 'UPDATED_AT': {\"format\": \"epoch_millis\", \"type\": \"date\"},\n", - " 'CREATED_AT': {\"format\": \"epoch_millis\", \"type\": \"date\"},\n", - " 'RSE': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", - " 'RSE_TYPE': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", - " 'DID_TYPE': {\"ignore_above\": 1024, \"type\": \"keyword\"},\n", - " 'EXPIRES_AT': {\"format\": \"epoch_millis\", \"type\": \"date\"},\n", - " 'TIMESTAMP': {\"format\": \"epoch_millis\", \"type\": \"date\"},\n", - " 'NAME_': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", - " 'PriDataset': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", - " 'DataTier': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", - " }\n", - " }\n", - " }\n", - "\n", - " # Send data to Opensearch\n", - "\n", - " _index_template = 'crab-tape-recall-rules-ekong'\n", - " client = osearch.get_es_client(\"es-cms1.cern.ch/es\", 'secret_opensearch.txt', get_index_schema())\n", - " idx = client.get_or_create_index(timestamp=time.time(), index_template=_index_template, index_mod=\"M\")\n", - " no_of_fail_saved = client.send(idx, docs, metadata=None, batch_size=10000, drop_nulls=False)\n", - "\n", - " print(\"========================================================================\", \"FINISHED : \", len(docs), \"ROWS ARE SENT\", no_of_fail_saved, \"ROWS ARE FAILED\", \"========================================================================\", sep='\\n')\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "f4567c46", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2023-07-23\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. 
See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - "  warnings.warn(\n", - "/cvmfs/sft.cern.ch/lcg/views/LCG_103swan/x86_64-centos7-gcc11-opt/lib/python3.9/site-packages/urllib3/connectionpool.py:1013: InsecureRequestWarning: Unverified HTTPS request is being made to host 'es-cms1.cern.ch'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - "  warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "========================================================================\n", - "FINISHED : \n", - "40190\n", - "ROWS ARE SENT\n", - "0\n", - "ROWS ARE FAILED\n", - "========================================================================\n" - ] - } - ], - "source": [ - "# upload the data of start_date day to end_date-1d\n", - "start_date = datetime(2023, 7, 23)\n", - "end_date = datetime(2023, 7, 24)\n", - "\n", - "multi_upload(start_date, end_date)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "546e9d4f", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "496e681c", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "@webio": { - "lastCommId": null, - "lastKernelId": null - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.12" - }, - "sparkconnect": { - "bundled_options": [], - "list_of_options": [ - { - "name": "spark.jars.packages", - "value": "org.apache.spark:spark-avro_2.12:3.3.1" - } - ] - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/src/script/Monitor/crab-spark/notebooks/crab_taskdb.ipynb b/src/script/Monitor/crab-spark/notebooks/crab_taskdb.ipynb new file mode 100644 index 0000000000..a491927996 --- /dev/null +++ b/src/script/Monitor/crab-spark/notebooks/crab_taskdb.ipynb @@ -0,0 +1,416 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "bcae07ec", + "metadata": {}, + "source": [ + "# CRAB Spark taskdb\n", + "\n", + "This job will \"copy\" some columns from the TaskDB table to OpenSearch to answer these questions:\n", + "- How many tasks use each CRAB feature? 
(Split algorithm, Ignorelocality, ScriptExe, GPU)\n", + "- How many tasks does each user submit?\n", + "- How many tasks use ignorelocality?\n" + ] + }, + { + "cell_type": "markdown", + "id": "6d41c8e6", + "metadata": {}, + "source": [ + "## Import lib" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e9af689", + "metadata": {}, + "outputs": [], + "source": [ + "from datetime import datetime, timedelta, timezone\n", + "import os\n", + "import time\n", + "import pandas as pd\n", + "\n", + "from pyspark import SparkContext, StorageLevel\n", + "from pyspark.sql import SparkSession\n", + "from pyspark.sql.functions import (\n", + "    current_user,\n", + "    col, collect_list, concat_ws, greatest, lit, lower, when,\n", + "    avg as _avg,\n", + "    count as _count,\n", + "    hex as _hex,\n", + "    max as _max,\n", + "    min as _min,\n", + "    round as _round,\n", + "    sum as _sum,\n", + ")\n", + "from pyspark.sql.types import (\n", + "    StructType,\n", + "    LongType,\n", + "    StringType,\n", + "    StructField,\n", + "    DoubleType,\n", + "    IntegerType,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07a5e399", + "metadata": {}, + "outputs": [], + "source": [ + "# try to import libs from current directory, fall back to $PWD/../workdir if not found\n", + "try:\n", + "    from crabspark_utils import get_candidate_files, send_os, send_os_parallel\n", + "except ModuleNotFoundError:\n", + "    import sys\n", + "    sys.path.insert(0, f'{os.getcwd()}/../workdir')\n", + "    from crabspark_utils import get_candidate_files, send_os, send_os_parallel\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22946659", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "spark = SparkSession\\\n", + "        .builder\\\n", + "        .appName('crab-taskdb')\\\n", + "        .getOrCreate()\n", + "spark" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e9013878", + "metadata": {}, + "outputs": [], + "source": [ + "# clear any cache left, for working with notebook\n", + "# it is safe to run every time the cronjob starts\n", + "spark.catalog.clearCache()" + ] + }, + { + "cell_type": "markdown", + "id": "17a6078f", + "metadata": {}, + "source": [ + "## Arguments\n", + "\n", + "We provide arguments to this script via environment variables:\n", + "- `OPENSEARCH_SECRET_PATH`: path to the secret file, which contains a single `username:password` line for the OpenSearch instance we send the data to\n", + "- `PROD`: if true, the index prefix will be `crab-prod-`, otherwise `crab-test-`\n", + "- `START_DATE`: start date (YYYY-MM-dd)\n", + "- `END_DATE`: end date (YYYY-MM-dd).\n",
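 + "\n", + "For example, when running this notebook manually as a script, the environment could be set as below. These are illustrative values only: the secret path mirrors the default used in the next cell and the dates match the manual constants further down; adjust them to your setup.\n", + "\n", + "```bash\n", + "# example values only, adjust to your environment\n", + "export OPENSEARCH_SECRET_PATH=$PWD/../workdir/secret_opensearch.txt\n", + "export PROD=false\n", + "export START_DATE=2024-01-03\n", + "export END_DATE=2024-10-04\n", + "```"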
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31c19eb0", + "metadata": {}, + "outputs": [], + "source": [ + "# secret path, also check if file exists\n", + "secretpath = os.environ.get('OPENSEARCH_SECRET_PATH', f'{os.getcwd()}/../workdir/secret_opensearch.txt')\n", + "if not os.path.isfile(secretpath): \n", + "    raise Exception(f'OS secrets file {secretpath} does not exist')\n", + "# if PROD, index prefix will be `crab-prod-*`, otherwise `crab-test-*`\n", + "PROD = os.environ.get('PROD', 'false').lower() in ('true', '1', 't')\n", + "# START_DATE/END_DATE, in strptime(\"%Y-%m-%d\") format\n", + "START = os.environ.get('START_DATE', None) \n", + "END = os.environ.get('END_DATE', None)" + ] + }, + { + "cell_type": "markdown", + "id": "f15e62ea", + "metadata": {}, + "source": [ + "## Variables\n", + "Will be used throughout the notebook" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e843eb6d", + "metadata": {}, + "outputs": [], + "source": [ + "# For running the notebook manually, set start/end dates here\n", + "START_DATE = \"2024-01-03\"\n", + "END_DATE = \"2024-10-04\"\n", + "# if run as a cronjob, replace the constants with the values from env\n", + "if START and END:\n", + "    START_DATE = START\n", + "    END_DATE = END" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b17ed53f", + "metadata": {}, + "outputs": [], + "source": [ + "# index name\n", + "index_name = 'taskdb'\n", + "# use prod index pattern if this execution is for production\n", + "if PROD:\n", + "    index_name = f'crab-prod-{index_name}'\n", + "else:\n", + "    index_name = f'crab-test-{index_name}'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "430146eb", + "metadata": {}, + "outputs": [], + "source": [ + "# datetime object\n", + "start_datetime = datetime.strptime(START_DATE, \"%Y-%m-%d\").replace(tzinfo=timezone.utc)\n", + "end_datetime = datetime.strptime(END_DATE, \"%Y-%m-%d\").replace(tzinfo=timezone.utc)\n", + "# sanity check\n", + "if end_datetime < start_datetime: \n", + "    raise Exception(f\"end date ({END_DATE}) is less than start date ({START_DATE})\")\n", + "start_epochmilis = int(start_datetime.timestamp()) * 1000\n", + "end_epochmilis = int(end_datetime.timestamp()) * 1000\n", + "yesterday_epoch = int((end_datetime-timedelta(days=1)).timestamp())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9404c437", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# debug\n", + "print(START_DATE, \n", + "      END_DATE, \n", + "      index_name,\n", + "      sep='\\n')" + ] + }, + { + "cell_type": "markdown", + "id": "9b33ec96", + "metadata": {}, + "source": [ + "## Loading data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0cf35868", + "metadata": {}, + "outputs": [], + "source": [ + "# Note that the \"today\" file, for example today=2024-10-04, should be in the directory /project/awg/cms/crab/tasks/2024-10-04,\n", + "# which contains the table contents from the beginning of the table up to the time the dump job ran;\n", + "# this means data before 2024-10-04 will be available, but not 2024-10-04 itself!\n", + "\n", + "HDFS_CRAB_part = f'/project/awg/cms/crab/tasks/{END_DATE}/' # each daily dump in hdfs contains the whole table\n", + "print(\"===============================================\"\n", + "      , \"CRAB Table\"\n", + "      , \"===============================================\"\n", + "      , \"File Directory:\", HDFS_CRAB_part\n", + "      , \"Work Directory:\", os.getcwd()\n", + "      , \"===============================================\"\n", + "      , 
\"===============================================\", sep='\\n')\n", + "\n", + "tasks_df = spark.read.format('avro').load(HDFS_CRAB_part).cache()\n", + "tasks_df = ( \n", + " tasks_df.select(\"TM_TASKNAME\",\"TM_START_TIME\",\"TM_TASK_STATUS\",\"TM_SPLIT_ALGO\",\"TM_USERNAME\",\"TM_USER_ROLE\",\"TM_JOB_TYPE\",\"TM_IGNORE_LOCALITY\",\"TM_SCRIPTEXE\",\"TM_USER_CONFIG\")\n", + " .filter(f\"\"\"\\\n", + " 1=1\n", + " AND TM_START_TIME >= {start_epochmilis}\n", + " AND TM_START_TIME < {end_epochmilis}\"\"\")\n", + " .cache()\n", + ")\n", + "tasks_df.createOrReplaceTempView(\"tasks\")" + ] + }, + { + "cell_type": "markdown", + "id": "86c634fe", + "metadata": {}, + "source": [ + "## Query" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e271b1c8", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "query = f\"\"\"\\\n", + "WITH reqacc_tb AS ( \n", + "SELECT TM_TASKNAME, TM_START_TIME, TM_TASK_STATUS, TM_SPLIT_ALGO, TM_USERNAME, TM_USER_ROLE, TM_JOB_TYPE, TM_IGNORE_LOCALITY, TM_SCRIPTEXE,\n", + " CASE \n", + " WHEN get_json_object(TM_USER_CONFIG, '$.requireaccelerator') = true THEN 'T'\n", + " ELSE 'F'\n", + " END AS REQUIRE_ACCELERATOR\n", + "FROM tasks\n", + "),\n", + "finalize_tb AS (\n", + "SELECT TM_TASKNAME, TM_START_TIME, TM_TASK_STATUS, TM_SPLIT_ALGO, TM_USERNAME, TM_USER_ROLE, TM_JOB_TYPE, TM_IGNORE_LOCALITY, TM_SCRIPTEXE, REQUIRE_ACCELERATOR,\n", + " TM_START_TIME AS timestamp,\n", + " 'taskdb' AS type\n", + "FROM reqacc_tb\n", + ")\n", + "SELECT * FROM finalize_tb\n", + "\"\"\"\n", + "\n", + "tmpdf = spark.sql(query)\n", + "tmpdf.show(10, False)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6561ada6", + "metadata": {}, + "outputs": [], + "source": [ + "tmpdf.count()" + ] + }, + { + "cell_type": "markdown", + "id": "3c7fc2e5", + "metadata": {}, + "source": [ + "## Sending result to OpenSearch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c33dfce3", + "metadata": {}, + "outputs": [], + "source": [ + "# convert spark df to dicts\n", + "docs = tmpdf.toPandas().to_dict('records')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eee4a1f3", + "metadata": {}, + "outputs": [], + "source": [ + "schema = {\n", + " \"settings\": {\"index\": {\"number_of_shards\": \"1\", \"number_of_replicas\": \"1\"}},\n", + " \"mappings\": {\n", + " \"properties\": {\n", + " \"TM_TASKNAME\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", + " \"TM_START_TIME\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n", + " 'TM_TASK_STATUS': {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", + " \"TM_SPLIT_ALGO\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", + " \"TM_USERNAME\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", + " \"TM_USER_ROLE\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", + " \"TM_JOB_TYPE\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", + " \"TM_IGNORE_LOCALITY\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", + " \"TM_SCRIPTEXE\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", + " \"REQUIRE_ACCELERATOR\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", + " \"type\": {\"ignore_above\": 2048, \"type\": \"keyword\"},\n", + " \"timestamp\": {\"format\": \"epoch_millis\", \"type\": \"date\"},\n", + " }\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ec824ee", + "metadata": {}, + "outputs": [], + "source": [ + "# this is simple workaround osearch bug when work in notebook 
because\n", + "# - it load the secret once and use forever\n", + "# - get_or_create_index() create index+schema only the first time it execute\n", + "# it is safe to run again even in cronjobs \n", + "import importlib\n", + "import osearch\n", + "importlib.reload(osearch)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64bcf06e", + "metadata": {}, + "outputs": [], + "source": [ + "send_os(docs, index_name, schema, secretpath, yesterday_epoch)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "032d03e0", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "@webio": { + "lastCommId": null, + "lastKernelId": null + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + }, + "sparkconnect": { + "bundled_options": [], + "list_of_options": [ + { + "name": "spark.jars.packages", + "value": "org.apache.spark:spark-avro_2.12:3.5.0" + } + ] + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/script/Monitor/crab-spark/workdir/bootstrap.sh b/src/script/Monitor/crab-spark/workdir/bootstrap.sh index 29e8d7f00e..9390cfac87 100644 --- a/src/script/Monitor/crab-spark/workdir/bootstrap.sh +++ b/src/script/Monitor/crab-spark/workdir/bootstrap.sh @@ -1,16 +1,10 @@ # source the environment for spark submit kinit cmscrab@CERN.CH -k -t /data/certs/keytabs.d/cmscrab.keytab -source hadoop-setconf.sh analytix +source hadoop-setconf.sh analytix LCG_VER=/cvmfs/sft.cern.ch/lcg/views/LCG_105a_swan/x86_64-el9-gcc13-opt source $LCG_VER/setup.sh export PYSPARK_PYTHON=$LCG_VER/bin/python3 -# i know, ugly, we should install software in the dockerfile -# however, we really need an environment from cvmfs, and i am not sure we -# can have access to cvmfs at build time in gitlab -python3 -m pip install --user opensearch-py - # finish the environment export CRAB_KRB5_USERNAME=$(klist | grep -i Default | cut -d":" -f2 | cut -d"@" -f"1" | awk '{$1=$1};1') - diff --git a/src/script/Monitor/crab-spark/workdir/crabspark_utils.py b/src/script/Monitor/crab-spark/workdir/crabspark_utils.py new file mode 100644 index 0000000000..6d45b2b130 --- /dev/null +++ b/src/script/Monitor/crab-spark/workdir/crabspark_utils.py @@ -0,0 +1,94 @@ +""" +Utility functions for spark scripts +""" +# pylint: disable=protected-access + +import concurrent.futures + +from datetime import timedelta +from osearch import get_es_client, OpenSearchInterface + +def get_candidate_files(start_date, end_date, spark, base, day_delta=1): + """ + Returns a list of hdfs folders that can contain data for the given dates. + Copy from CMSMONIT CMSSpark: + https://github.com/dmwm/CMSSpark/blob/b8efa0ac5cb57b617ee8d1ea9bb26d53fb0443b0/src/python/CMSSpark/spark_utils.py#L768 + """ + st_date = start_date - timedelta(days=day_delta) + ed_date = end_date + timedelta(days=day_delta) + days = (ed_date - st_date).days + + sc = spark.sparkContext + # The candidate files are the folders to the specific dates, + # but if we are looking at recent days the compaction procedure could + # have not run yet, so we will consider also the .tmp folders. 
+ + candidate_files = [ + f"{base}/{(st_date + timedelta(days=i)).strftime('%Y/%m/%d')}{{,.tmp}}" + for i in range(0, days) + ] + fsystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem + uri = sc._gateway.jvm.java.net.URI + path = sc._gateway.jvm.org.apache.hadoop.fs.Path + fs = fsystem.get(uri("hdfs:///"), sc._jsc.hadoopConfiguration()) + candidate_files = [url for url in candidate_files if fs.globStatus(path(url))] + return candidate_files + + +def send_os(docs, index_name, schema, secretpath, timestamp, batch_size=10000, printsummary=True): + """ + Convenience one-liner to send data to opensearch using the osearch lib + + :param docs: documents to send to opensearch + :type docs: list of dict + :param index_name: opensearch index name + :type index_name: str + :param schema: opensearch index schema + :type schema: dict + :param secretpath: path to secret file which contains a single username:password line + :type secretpath: str + :param timestamp: timestamp in seconds, used to select the (monthly) index to write to + :type timestamp: int + :param batch_size: how many docs we send to os in a single request + :type batch_size: int + :param printsummary: if true, print summary text + :type printsummary: bool + + :return: number of total docs and number of fail-to-send docs. + :rtype: (int, int) + """ + client = get_es_client("os-cms.cern.ch/os", secretpath, schema) + idx = client.get_or_create_index(timestamp=timestamp, index_template=index_name, index_mod="M") + no_of_fail_saved = client.send(idx, docs, metadata=None, batch_size=batch_size, drop_nulls=False) + if printsummary: + print("========================================================================" + , "FINISHED : " + , len(docs), "ROWS ARE SENT" + , no_of_fail_saved, "ROWS ARE FAILED" + , "========================================================================", sep='\n') + return len(docs), no_of_fail_saved + +def send_os_parallel(docs, index_name, schema, secretpath, timestamp, batch_size=10000): + """ + Convenience one-liner to send data to opensearch using the osearch lib, + in parallel. 
+ + Note that it takes the same params as send_os() except `printsummary`, and + returns None + """ + with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor: + futures = [] + for chunk in OpenSearchInterface.to_chunks(docs, batch_size): + future = executor.submit(send_os, chunk, index_name, schema, secretpath, timestamp, batch_size+1, False) + futures.append(future) + total_docs = 0 + total_fails = 0 + for f in futures: + ndocs, nfails = f.result() + total_docs += ndocs + total_fails += nfails + print("========================================================================" + , "FINISHED : " + , total_docs, "ROWS ARE SENT" + , total_fails, "ROWS ARE FAILED" + , "========================================================================", sep='\n') diff --git a/src/script/Monitor/crab-spark/workdir/osearch.py b/src/script/Monitor/crab-spark/workdir/osearch.py index 68f2f5b1ad..cd5787aec0 100644 --- a/src/script/Monitor/crab-spark/workdir/osearch.py +++ b/src/script/Monitor/crab-spark/workdir/osearch.py @@ -55,6 +55,7 @@ def get_index_schema(): import json import logging import time +import concurrent.futures from collections import Counter as collectionsCounter from datetime import datetime @@ -96,6 +97,7 @@ def __init__(self, host, secret_file, index_mapping_and_settings): url = 'https://' + username + ':' + password + '@' + host self.handle = OpenSearch( [url], + http_compress=True, verify_certs=False, use_ssl=True, ca_certs='/etc/pki/tls/certs/ca-bundle.trust.crt', @@ -215,3 +217,35 @@ def send(self, idx, data, metadata=None, batch_size=10000, drop_nulls=False): logging.error("OpenSearch send failed count: ", result_n_failed) logging.debug("OpenSearch send", len(data) - result_n_failed, "documents successfully") return result_n_failed + +def send_os(docs, index_name, schema, secretpath, timestamp, batch_size=10000, printsummary=True): + + client = get_es_client("os-cms.cern.ch/os", secretpath, schema) + idx = client.get_or_create_index(timestamp=timestamp, index_template=index_name, index_mod="M") + no_of_fail_saved = client.send(idx, docs, metadata=None, batch_size=batch_size, drop_nulls=False) + if printsummary: + print("========================================================================" + , "FINISHED : " + , len(docs), "ROWS ARE SENT" + , no_of_fail_saved, "ROWS ARE FAILED" + , "========================================================================", sep='\n') + else: + return len(docs), no_of_fail_saved + +def send_os_parallel(docs, index_name, schema, secretpath, timestamp, batch_size=10000): + with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor: + futures = [] + for chunk in OpenSearchInterface.to_chunks(docs, batch_size): + future = executor.submit(send_os, chunk, index_name, schema, secretpath, timestamp, batch_size+1, False) + futures.append(future) + total_docs = 0 + total_fails = 0 + for f in futures: + ndocs, nfails = f.result() + total_docs += ndocs + total_fails += nfails + print("========================================================================" + , "FINISHED : " + , total_docs, "ROWS ARE SENT" + , total_fails, "ROWS ARE FAILED" + , "========================================================================", sep='\n') diff --git a/src/script/Monitor/crab-spark/workdir/run.py b/src/script/Monitor/crab-spark/workdir/run.py new file mode 100755 index 0000000000..eb8224c8b6 --- /dev/null +++ b/src/script/Monitor/crab-spark/workdir/run.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 + +""" +This file converts the spark notebook into a 
python file and runs spark-submit. +Requires the shell to source "bootstrap.sh" to provide the commands and python libs needed by +this script. + +It parses the command-line arguments with argparse and passes them to the spark script via +environment variables. + +Examples: +- To extract data for the whole of September 2024 + ./run.py --secretpath secret.txt --start 2024-09-01 --end 2024-10-01 crab_taskdb.ipynb + +- To extract data from n days ago (in case you need to wait until the data settles) + For example, today is 2024-10-01 but you want to process data on 2024-09-30 + ./run.py --secretpath secret.txt --ndaysago 2 crab_condor.ipynb + +- To push result docs to the production index (otherwise, the index will be prefixed with `crab-test-`) + ./run.py --secretpath secret.txt --today --prod crab_taskdb.ipynb + +- To run in crontab daily, use "run_spark.sh" to prepare a new shell and source bootstrap.sh + ./run_spark.sh ./run.py --secretpath secret.txt --today --prod crab_taskdb.ipynb + +- To check the env that will be passed to the spark script + ./run.py --secretpath secret.txt --today --dryrun crab_taskdb.ipynb +""" +import argparse +import os +import subprocess +import pathlib +from pprint import pprint +from datetime import datetime, timedelta, timezone + +def valid_date(s): + """ + Check if the date format is correct and return the arg `s`. + The function serves as the `type` of an argparse argument. + + >>> valid_date('2024-01-01') + '2024-01-01' + + :param s: date in format YYYY-mm-dd + :type s: str + + :return: s argument + :rtype: str + """ + try: + datetime.strptime(s, '%Y-%m-%d') + return s + except ValueError as e: + raise argparse.ArgumentTypeError(f"not a valid date: {s!r}") from e + +parser = argparse.ArgumentParser(description='Convert spark ipynb and run spark-submit') +parser.add_argument('path', help='path of script (.ipynb)') +parser.add_argument('--start', type=valid_date, dest='start_date', help='Start date of interest (YYYY-mm-dd)') +parser.add_argument('--end', type=valid_date, dest='end_date', help='End date of interest (YYYY-mm-dd).') +parser.add_argument('--ndaysago', type=int, default=-1, help='set start date to n+1 days ago, and end date to n days ago') +parser.add_argument('--today', action='store_true', help='shortcut for --ndaysago 0') +parser.add_argument('--prod', action='store_true', help='set opensearch index prefix to prod ("crab-prod-"). 
Default is "crab-test-"') +parser.add_argument('--secretpath', help='secret file path') +parser.add_argument('--dryrun', action='store_true', help='print env that will pass to spark script') +args = parser.parse_args() + +sparkjob_env = {} +if args.today: + args.ndaysago = 0 +if args.ndaysago >= 0: + day = datetime.now().replace(tzinfo=timezone.utc) + ed = args.ndaysago + sd = args.ndaysago + 1 # start date is "yesterday" of n days ago + sparkjob_env['START_DATE'] = (day-timedelta(days=sd)).strftime("%Y-%m-%d") + sparkjob_env['END_DATE'] = (day-timedelta(days=ed)).strftime("%Y-%m-%d") +if args.start_date and args.end_date: + sparkjob_env['START_DATE'] = args.start_date + sparkjob_env['END_DATE'] = args.end_date +if 'START_DATE' not in sparkjob_env and 'END_DATE' not in sparkjob_env: + raise Exception("Need --today or --ndaysago or --start/--end.") +if args.secretpath: + sparkjob_env['OPENSEARCH_SECRET_PATH'] = args.secretpath +if args.prod: + sparkjob_env['PROD'] = 't' +else: + sparkjob_env['PROD'] = 'f' + +runenv = os.environ.copy() +runenv.update(sparkjob_env) + +# convert from nootebook to py +path = pathlib.Path(args.path) +pathpy = path.with_suffix('.py') +cmd = f"jupyter nbconvert --to python {path}" +print(f'Running: {cmd}') +if not args.dryrun: + subprocess.run(cmd, shell=True, timeout=3600, check=True) + +# spark-submit +cmd = f'spark-submit --master yarn --packages org.apache.spark:spark-avro_2.12:3.5.0 {pathpy}' +print(f'Running: {cmd}') +print('With env: ') +pprint(sparkjob_env) +if not args.dryrun: + subprocess.run(cmd, shell=True, timeout=3600, check=True, env=runenv) diff --git a/src/script/Monitor/crab-spark/workdir/run_spark.sh b/src/script/Monitor/crab-spark/workdir/run_spark.sh new file mode 100755 index 0000000000..3bd92a5bfc --- /dev/null +++ b/src/script/Monitor/crab-spark/workdir/run_spark.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +set -euo pipefail +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + +pushd $SCRIPT_DIR + +# source the environment for spark submit +set +euo pipefail +source ./bootstrap.sh +set -euo pipefail + +$@ + +popd