diff --git a/technologies/app/saagie-usage-monitoring/README.md b/technologies/app/saagie-usage-monitoring/README.md
index 3dc386f78..30b6f8f0d 100644
--- a/technologies/app/saagie-usage-monitoring/README.md
+++ b/technologies/app/saagie-usage-monitoring/README.md
@@ -18,6 +18,7 @@ To deploy Saagie Usage Monitoring on your platform, you need to create a user wi
   - `SAAGIE_AND_S3` if you want to monitor Saagie and S3 buckets
 - IP_HDFS (Required if MONITORING_OPT=`SAAGIE_AND_DATALAKE`) : Namenode IP
 - AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_S3_ENDPOINT and AWS_REGION_NAME (Required if MONITORING_OPT=`SAAGIE_AND_S3`)
+- SAAGIE_SUM_CRON : Cron expression used to collect Saagie information from the API (Optional, Default value : `0 * * * *`)
 
 For an external Postgres database :
 - SAAGIE_PG_HOST : Postgresql host (Default value : `localhost`)
diff --git a/technologies/app/saagie-usage-monitoring/metadata.yaml b/technologies/app/saagie-usage-monitoring/metadata.yaml
index 17fe81e7d..35d1360a5 100644
--- a/technologies/app/saagie-usage-monitoring/metadata.yaml
+++ b/technologies/app/saagie-usage-monitoring/metadata.yaml
@@ -16,6 +16,42 @@ customFlags: []
 readme: /technologies/app/saagie-usage-monitoring
 contexts:
+  - id: saagie-usage-monitoring-2023-02
+    label: For Saagie 2023.02
+    releaseNotes: ""
+    available: true
+    trustLevel: stable
+    ports:
+      - port: 80
+        name: saagie-usage-monitoring
+        rewriteUrl: false
+        basePath: SAAGIE_BASE_PATH
+      - port: 92
+        name: ttyd
+        rewriteUrl: true
+    volumes: ["/opt/grafana", "/var/lib/postgresql/data"]
+    dockerInfo:
+      image: "saagie/saagie-usage-monitoring"
+      baseTag: "2023.02-0.1"
+      version: "2023.02-0.1-1.168.0_SDKTECHNO-244"
+  - id: saagie-usage-monitoring-2023-03
+    label: For Saagie 2023.03
+    releaseNotes: ""
+    available: true
+    trustLevel: stable
+    ports:
+      - port: 80
+        name: saagie-usage-monitoring
+        rewriteUrl: false
+        basePath: SAAGIE_BASE_PATH
+      - port: 92
+        name: ttyd
+        rewriteUrl: true
+    volumes: ["/opt/grafana", "/var/lib/postgresql/data"]
+    dockerInfo:
+      image: "saagie/saagie-usage-monitoring"
+      baseTag: "2023.03-0.1"
+      version: "2023.03-0.1-1.168.0_SDKTECHNO-244"
   - id: saagie-usage-monitoring-3.0
     label: For Saagie 3.x
     releaseNotes: ""
diff --git a/technologies/app/saagie-usage-monitoring/saagie-usage-monitoring-2023.02/Dockerfile b/technologies/app/saagie-usage-monitoring/saagie-usage-monitoring-2023.02/Dockerfile
new file mode 100644
index 000000000..3e73e5ad8
--- /dev/null
+++ b/technologies/app/saagie-usage-monitoring/saagie-usage-monitoring-2023.02/Dockerfile
@@ -0,0 +1,78 @@
+#FROM grafana/grafana:9.2.3-ubuntu
+FROM grafana/grafana:10.1.2-ubuntu
+
+USER root
+# Install Dependencies
+RUN apt-get update \
+    && apt-get install -y software-properties-common nginx cron \
+    wget libpq-dev openjdk-11-jdk ca-certificates-java \
+    postgresql postgresql-contrib postgresql-client \
+    build-essential cmake git libjson-c-dev libwebsockets-dev sqlite \
+    && add-apt-repository ppa:deadsnakes/ppa \
+    && apt-get update \
+    && apt-get install -y python3.9 pip \
+    && rm /etc/nginx/sites-enabled/default \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+# Fix certificate issues
+RUN update-ca-certificates -f;
+
+# Hadoop command-line
+RUN cd / \
+    && mkdir hadoop \
+    && cd hadoop \
+    && wget -q https://archive.apache.org/dist/hadoop/common/hadoop-2.6.5/hadoop-2.6.5.tar.gz \
+    && tar xvf hadoop-2.6.5.tar.gz \
+    && rm hadoop-2.6.5.tar.gz \
+    && rm -rf hadoop-2.6.5/etc/hadoop \
+    && ln -s /etc/hadoop/conf hadoop-2.6.5/etc/hadoop;
+
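+# The Hadoop 2.6.5 client above is only exercised when MONITORING_OPT=SAAGIE_AND_DATALAKE:
+# pyarrow's HDFS bridge locates libhdfs through the HADOOP_HOME/CLASSPATH set further down.
+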
+# Python dependencies
+ADD code/requirements.txt /tmp/requirements.txt
+RUN pip install -r /tmp/requirements.txt \
+    && rm -rf /root/.cache \
+    && rm -rf ~/.cache/pip
+
+# Environment variables
+ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64/
+ENV HADOOP_HOME=/hadoop/hadoop-2.6.5
+ENV HADOOP_CONF_DIR=/hadoop/hadoop-2.6.5/etc/hadoop
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/hadoop/hadoop-2.6.5/lib/native:/usr/lib/jvm/java-11-openjdk-amd64/lib"
+ENV CLASSPATH="/etc/hadoop/conf:/hadoop/hadoop-2.6.5/share/hadoop/common/lib/*:/hadoop/hadoop-2.6.5/share/hadoop/common/*:/hadoop/hadoop-2.6.5/share/hadoop/hdfs:/hadoop/hadoop-2.6.5/share/hadoop/hdfs/lib/*:/hadoop/hadoop-2.6.5/share/hadoop/hdfs/*:/hadoop/hadoop-2.6.5/share/hadoop/yarn/lib/*:/hadoop/hadoop-2.6.5/share/hadoop/yarn/*:/hadoop/hadoop-2.6.5/share/hadoop/mapreduce/lib/*:/hadoop/hadoop-2.6.5/share/hadoop/mapreduce/*"
+ENV PATH "/hadoop/hadoop-2.6.5/bin:${PATH}"
+
+# Configure PostgreSQL
+RUN chown postgres:postgres /run/postgresql/ \
+    && chmod 777 /run/postgresql
+
+# Install ttyd
+RUN git clone https://github.com/tsl0922/ttyd.git \
+    && cd ttyd && mkdir build && cd build \
+    && cmake .. \
+    && make && make install
+
+# Configure Grafana
+RUN mkdir /opt/grafana && mkdir /opt/plugins && mkdir /app && mkdir /var/lib/grafana/dashboards
+
+ADD server.conf /etc/nginx/sites-enabled/grafana.conf
+ADD grafana.ini /etc/grafana/grafana.ini
+
+ADD grafana/provisioning /etc/grafana/provisioning
+ADD grafana/dashboards /var/lib/grafana/tmp-dashboards
+
+ENV GF_PATHS_DATA /opt/grafana
+ENV GF_PATHS_PLUGINS /opt/plugins
+
+ADD update_sqlite.sh /
+RUN chmod +x /update_sqlite.sh
+
+ADD code /app
+ADD infra.sql infra.sql
+
+RUN grafana-cli --pluginsDir "/opt/plugins" plugins install marcusolsson-treemap-panel
+
+EXPOSE 80 92
+ADD entrypoint.sh /entrypoint.sh
+ENTRYPOINT ["bash", "/entrypoint.sh"]
\ No newline at end of file
diff --git a/technologies/app/saagie-usage-monitoring/saagie-usage-monitoring-2023.02/build.gradle.kts b/technologies/app/saagie-usage-monitoring/saagie-usage-monitoring-2023.02/build.gradle.kts
new file mode 100644
index 000000000..b814e94a3
--- /dev/null
+++ b/technologies/app/saagie-usage-monitoring/saagie-usage-monitoring-2023.02/build.gradle.kts
@@ -0,0 +1,22 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Copyright 2019-2021.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import com.bmuschko.gradle.docker.DockerRemoteApiPlugin
+import com.saagie.technologies.SaagieTechnologiesGradlePlugin
+
+apply<DockerRemoteApiPlugin>()
+apply<SaagieTechnologiesGradlePlugin>()
diff --git a/technologies/app/saagie-usage-monitoring/saagie-usage-monitoring-2023.02/code/__main__.py b/technologies/app/saagie-usage-monitoring/saagie-usage-monitoring-2023.02/code/__main__.py
new file mode 100644
index 000000000..4894f25d9
--- /dev/null
+++ b/technologies/app/saagie-usage-monitoring/saagie-usage-monitoring-2023.02/code/__main__.py
@@ -0,0 +1,216 @@
+import logging
+import sys
+import os
+import pyarrow as pa
+from datetime import datetime
+from hdfs import InsecureClient
+import utils
+
+monitoring_type = os.environ["MONITORING_OPT"]
+
+
+def get_datalake_metrics():
+    """
+    Fetch Metrics from Hadoop API about Datalake usage and save it to PostgreSQL in the supervision Database
+    :return:
+    """
+    with utils.DatabaseUtils() as database_utils:
+        hdfs = pa.hdfs.connect(os.environ["IP_HDFS"], port=8020, user="hdfs")
+        total_capacity = utils.get_hadoop_capacity(hdfs)
+        total_space_used = utils.get_hadoop_space_used(hdfs)
+        logging.debug(f"total_capacity : {total_capacity}")
+        logging.debug(f"total_space_used : {total_space_used}")
+        database_utils.supervision_datalake_to_pg("total_capacity", total_capacity)
+        database_utils.supervision_datalake_to_pg("total_used", total_space_used)
+
+        # Get count files
+        client_hdfs = InsecureClient("http://" + os.environ["IP_HDFS"] + ":50070", user="hdfs")
+        content_root = client_hdfs.list("/", status=True)
+
+        get_metrics_for_folder(client_hdfs=client_hdfs,
+                               database_utils=database_utils,
+                               folder="/")
+        for f in content_root:
+            if f[1]['type'] == 'DIRECTORY':
+                base_folder = "/" + f[0]
+                get_metrics_for_folder(client_hdfs=client_hdfs,
+                                       database_utils=database_utils,
+                                       folder=base_folder)
+                content_data = client_hdfs.list(base_folder, status=True)
+                for sub_f in content_data:
+                    if sub_f[1]['type'] == 'DIRECTORY':
+                        get_metrics_for_folder(client_hdfs=client_hdfs,
+                                               database_utils=database_utils,
+                                               folder=base_folder + "/" + sub_f[0])
+
+
+def get_s3_metrics():
+    """
+    Fetch metrics about S3 bucket usage and save them to PostgreSQL in the supervision Database
+    :return:
+    """
+    with utils.S3Utils() as s3_utils, utils.DatabaseUtils() as database_utils:
+        buckets = s3_utils.get_all_buckets()
+        total_size = 0
+        total_objects = 0
+        for bucket in buckets:
+            bucket_size, number_of_objects = s3_utils.get_bucket_size(bucket.name, database_utils)
+            total_size += bucket_size
+            total_objects += number_of_objects
+
+        database_utils.supervision_s3_to_pg("bucket_size", 'all_buckets', utils.bytes_to_gb(total_size))
+        database_utils.supervision_s3_to_pg("bucket_objects", 'all_buckets', total_objects)
+
+
+def get_metrics_for_folder(client_hdfs, database_utils, folder):
+    sub = client_hdfs.content(folder)
+    database_utils.supervision_datalake_to_pg(f"Data size {folder}", sub["length"])
+    database_utils.supervision_datalake_to_pg(f"File Count {folder}", sub["fileCount"])
+    database_utils.supervision_datalake_to_pg(f"Average size file {folder}", utils.get_average_file_size(sub))
+
+
+def get_saagie_metrics():
+    """
+    Truncate existing metrics, then fetch metrics from the Saagie API about jobs and instances and save them to
+    PostgreSQL in the supervision Database
+    :return:
+    """
+    with utils.DatabaseUtils() as database_utils:
+        logging.debug("truncate_supervision_saagie_pg finished")
+        get_saagie_jobs_metrics(database_utils)
+
+
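+# Collection flow for one run (triggered by SAAGIE_SUM_CRON via the entrypoint):
+#   1. truncate supervision_saagie / supervision_saagie_jobs / supervision_saagie_apps
+#   2. per project: insert job, app, and job/pipeline instance rows
+#   3. refresh today's per-project job counts in supervision_saagie_jobs_snapshot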
+def get_saagie_jobs_metrics(database_utils):
+    """
+    Fetch Metrics from Saagie API about Jobs and Pipelines duration and status and save them to PostgreSQL in the
+    supervision Database
+    :return:
+    """
+    logging.debug("truncate_supervision_saagie_pg starting")
+    database_utils.truncate_supervision_saagie_pg()
+    today = datetime.now().strftime('%Y-%m-%d')
+
+    with utils.SaagieUtils() as saagie_utils:
+        project_list = saagie_utils.get_projects()
+        all_projects = []
+        for project in project_list:
+            logging.debug(f"Getting metrics for project {project['name']}")
+
+            job_list = saagie_utils.get_job_instances(project["id"])
+            app_list = saagie_utils.get_apps(project["id"])
+            pipeline_list = saagie_utils.get_pipelines(project["id"])
+
+            all_jobs = [{
+                'project_id': project["id"],
+                'project_name': project["name"],
+                'orchestration_type': "job",
+                'orchestration_id': job["id"],
+                'orchestration_name': job["name"],
+                'orchestration_category': job["category"],
+                'creation_date': job["creationDate"],
+                'instance_count': job["countJobInstance"],
+                'technology': job["technology"]["label"] if job["technology"] is not None else None
+            } for job in job_list]
+            database_utils.supervision_saagie_jobs_to_pg(all_jobs)
+
+            all_apps = [{
+                'project_id': project["id"],
+                'project_name': project["name"],
+                'orchestration_type': "app",
+                'orchestration_id': app["id"],
+                'orchestration_name': app["name"],
+                'creation_date': app["creationDate"],
+                'current_status': app["history"]["currentStatus"] if app["history"] is not None else None,
+                'start_time': app["history"]["startTime"] if app["history"] is not None else None,
+                'stop_time': app["history"]["stopTime"] if app["history"] is not None else None,
+                'technology': app["technology"]["label"] if app["technology"] is not None else None
+            } for app in app_list]
+            database_utils.supervision_saagie_apps_to_pg(all_apps)
+
+            for job in job_list:
+                log_instance_metrics(database_utils, job["instances"], job, "job", project["id"], project['name'])
+
+            for pipeline in pipeline_list:
+                log_instance_metrics(database_utils, pipeline["instances"], pipeline, "pipeline", project["id"],
+                                     project['name'])
+
+            all_projects.append({
+                'project_id': project["id"],
+                'project_name': project["name"],
+                'snapshot_date': today,
+                'job_count': len(job_list) + len(app_list)})
+        database_utils.supervision_saagie_jobs_snapshot_to_pg(all_projects)
+
+
+def get_instance_duration(start_time, end_time):
+    """
+    Compute instance duration based on start and end time
+    :param start_time:
+    :param end_time:
+    :return:
+    """
+    instance_start_time = utils.parse_instance_timestamp(start_time)
+    instance_end_time = utils.parse_instance_timestamp(end_time)
+    if instance_start_time and instance_end_time:
+        return (instance_end_time - instance_start_time).total_seconds() * 1000
+    else:
+        return 0
+
+
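+# Worked example for get_instance_duration() (hypothetical values): an instance with
+# startTime "2023-05-01T10:00:00.000+0000" and endTime "2023-05-01T10:01:30.000+0000"
+# lasts 90 seconds, so the function returns 90 * 1000 = 90000.0 milliseconds; a
+# still-running instance (endTime None) is reported as 0.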
+def log_instance_metrics(database_utils, instances, job_or_pipeline, orchestration_type, project_id, project_name):
+    """
+    For each instance of a job or a pipeline, compute its duration and its Saagie URL and save it to PostgreSQL
+    in the supervision Database
+    :param database_utils: Instance of database utils to connect to PG
+    :param instances: instances of the current job
+    :param job_or_pipeline: job_or_pipeline object returned from Saagie API
+    :param orchestration_type: indicating whether it's a job or a pipeline
+    :param project_id: Saagie Project ID
+    :param project_name: Saagie Project Name
+    :return:
+    """
+    now = datetime.now()
+    if instances:
+        all_instances = [{
+            'supervision_timestamp': now,
+            'project_id': project_id,
+            'project_name': project_name,
+            'orchestration_type': orchestration_type,
+            'orchestration_id': job_or_pipeline["id"],
+            'orchestration_name': job_or_pipeline["name"],
+            'instance_id': instance["id"],
+            'instance_start_time': instance["startTime"],
+            'instance_end_time': instance["endTime"],
+            'instance_status': instance["status"],
+            'instance_duration': get_instance_duration(instance["startTime"], instance["endTime"]),
+            'instance_saagie_url': utils.build_saagie_url(project_id, orchestration_type, job_or_pipeline["id"],
+                                                          instance["id"])
+        } for instance in instances]
+
+        database_utils.supervision_saagie_to_pg(all_instances)
+
+
+def main():
+    if monitoring_type == "SAAGIE":
+        logging.info("Getting saagie metrics")
+        get_saagie_metrics()
+    elif monitoring_type == "SAAGIE_AND_DATALAKE":
+        logging.info("Getting saagie metrics")
+        get_saagie_metrics()
+        logging.info("Getting datalake metrics")
+        get_datalake_metrics()
+    elif monitoring_type == "SAAGIE_AND_S3":
+        logging.info("Getting saagie metrics")
+        get_saagie_metrics()
+        logging.info("Getting S3 metrics")
+        get_s3_metrics()
+    else:
+        logging.error("MONITORING_OPT wrong or missing, correct options are: 'SAAGIE', 'SAAGIE_AND_DATALAKE' "
+                      "or 'SAAGIE_AND_S3'")
+        sys.exit(1)
+    logging.info("Metrics successfully gathered")
+
+
+if __name__ == "__main__":
+    logging.getLogger("pyarrow").setLevel(logging.ERROR)
+    logging.basicConfig(level=logging.INFO,
+                        format="%(asctime)s [%(levelname)s] %(message)s",
+                        datefmt="%d/%m/%Y %H:%M:%S")
+    main()
diff --git a/technologies/app/saagie-usage-monitoring/saagie-usage-monitoring-2023.02/code/requirements.txt b/technologies/app/saagie-usage-monitoring/saagie-usage-monitoring-2023.02/code/requirements.txt
new file mode 100644
index 000000000..498baaad6
--- /dev/null
+++ b/technologies/app/saagie-usage-monitoring/saagie-usage-monitoring-2023.02/code/requirements.txt
@@ -0,0 +1,6 @@
+boto3==1.28.54
+urllib3==1.26.5
+psycopg2==2.9.2
+pyarrow==6.0.1
+hdfs
+saagieapi==2.6.3
\ No newline at end of file
diff --git a/technologies/app/saagie-usage-monitoring/saagie-usage-monitoring-2023.02/code/utils.py b/technologies/app/saagie-usage-monitoring/saagie-usage-monitoring-2023.02/code/utils.py
new file mode 100644
index 000000000..06a72d45d
--- /dev/null
+++ b/technologies/app/saagie-usage-monitoring/saagie-usage-monitoring-2023.02/code/utils.py
@@ -0,0 +1,444 @@
+import logging
+import traceback
+from datetime import datetime
+import urllib3
+import psycopg2
+import psycopg2.extras
+import boto3
+import botocore.exceptions
+
+import os
+
+from saagieapi import SaagieApi
+
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+logging.getLogger("boto3").setLevel(logging.WARNING)
+logging.getLogger("botocore").setLevel(logging.WARNING)
+
+postgresql_host = os.environ["SAAGIE_PG_HOST"]
+postgresql_port = os.environ["SAAGIE_PG_PORT"]
+postgresql_user = os.environ["SAAGIE_PG_USER"]
+postgresql_password = os.environ["SAAGIE_PG_PASSWORD"]
+postgresql_db = os.environ["SAAGIE_PG_DATABASE"]
+
+saagie_login = os.environ["SAAGIE_SUPERVISION_LOGIN"]
+saagie_password = os.environ["SAAGIE_SUPERVISION_PASSWORD"]
+saagie_url = os.environ["SAAGIE_URL"] + "/" if not os.environ["SAAGIE_URL"].endswith("/") else os.environ["SAAGIE_URL"]
+saagie_realm = os.environ["SAAGIE_REALM"]
+saagie_platform = os.environ["SAAGIE_PLATFORM_ID"]
+
+
+# Workaround for platforms with too many instances
+MAX_INSTANCES_FETCHED = int(os.environ.get("SMT_MAX_INSTANCES_FETCHED", 1000))
+
+
+class SaagieUtils(object):
+
+    def __init__(self):
+        self.saagie_auth = SaagieApi(url_saagie=saagie_url,
+                                     id_platform=saagie_platform,
+                                     user=saagie_login,
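+                                     # (SAAGIE_REALM is derived from SAAGIE_URL by entrypoint.sh)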
+                                     password=saagie_password,
+                                     realm=saagie_realm)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, tb):
+        if exc_type is not None:
+            traceback.print_exception(exc_type, exc_value, tb)
+
+    def get_projects(self):
+        """
+        Call Saagie graphql API to get the list of projects
+        :return: a JSON containing the project names and ids
+        """
+        projects = self.saagie_auth.projects.list()
+        return projects['projects'] if projects else []
+
+    def get_job_instances(self, project_id):
+        """
+        Call Saagie graphql API to get the jobs of a Saagie project for a given project id
+        :param project_id: Saagie Project ID
+        :return: a JSON containing a list of jobs
+        """
+        dict_technology = {}
+        result = []
+        jobs = self.saagie_auth.jobs.list_for_project(project_id=project_id,
+                                                      instances_limit=MAX_INSTANCES_FETCHED,
+                                                      versions_limit=0)
+        if jobs:
+            for job in jobs["jobs"]:
+                technology_id = job["technology"]["id"]
+                if not dict_technology.get(technology_id):
+                    technology_label = self.get_technology_label(technology_id)
+                    dict_technology[technology_id] = technology_label
+                job["technology"]["label"] = dict_technology.get(technology_id)
+                result.append(job)
+
+        return result if result else []
+
+    def get_pipelines(self, project_id):
+        """
+        Call Saagie graphql API to get the pipelines of a Saagie project for a given project id
+        :param project_id: Saagie Project ID
+        :return: a JSON containing a list of pipelines
+        """
+        pipelines = self.saagie_auth.pipelines.list_for_project(project_id=project_id,
+                                                                instances_limit=MAX_INSTANCES_FETCHED,
+                                                                versions_limit=0)
+        return pipelines["project"]["pipelines"] if pipelines["project"] is not None else []
+
+    def get_apps(self, project_id):
+        dict_technology = {}
+        result = []
+
+        apps = self.saagie_auth.apps.list_for_project(project_id=project_id)["project"]
+        if apps:
+            for app in apps["apps"]:
+                technology_id = app["technology"]["id"]
+                if not dict_technology.get(technology_id):
+                    technology_label = self.get_technology_label(technology_id)
+                    dict_technology[technology_id] = technology_label
+                app["technology"]["label"] = dict_technology.get(technology_id)
+                result.append(app)
+        return result if result else []
+
+    def get_technology_label(self, technology_id):
+        technology_label = self.saagie_auth.get_technology_name_by_id(technology_id)
+        return technology_label[1]
+
+
+class S3Utils(object):
+
+    def __init__(self):
+        s3_endpoint = os.environ['AWS_S3_ENDPOINT']
+        s3_region = os.environ['AWS_REGION_NAME']
+        self._s3_resource = boto3.resource("s3",
+                                           endpoint_url=s3_endpoint,
+                                           region_name=s3_region)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, tb):
+        if exc_type is not None:
+            traceback.print_exception(exc_type, exc_value, tb)
+
+    def get_bucket_size(self, bucket_name, database_utils):
+        """
+        Save size and # objects for each bucket and each object prefix
+        :param bucket_name: name of the bucket
+        :param database_utils: utils to save metrics in pg
+        :return: a tuple (total bucket size, total number of files) for the bucket
+        """
+        total_bucket_size = 0
+        total_bucket_objects = 0
+        prefix_size = {}
+        prefix_objects = {}
+        try:
+            bucket = self._s3_resource.Bucket(bucket_name)
+            for bucket_object in bucket.objects.all():
+                prefix = self.get_object_prefix(bucket_name, bucket_object.key)
+                if prefix:
+                    total_bucket_size += bucket_object.size
+                    total_bucket_objects += 1
+                    prefix_size[prefix] = prefix_size.get(prefix, 0) + bucket_object.size
+                    if bucket_object.size > 0:
+                        prefix_objects[prefix] = prefix_objects.get(prefix, 0) + 1
+            for prefix, size in prefix_size.items():
+                database_utils.supervision_s3_to_pg("prefix_size", prefix, bytes_to_gb(size))
+            for prefix, number_objects in prefix_objects.items():
+                database_utils.supervision_s3_to_pg("prefix_objects", prefix, number_objects)
+            database_utils.supervision_s3_to_pg("bucket_size", bucket_name, bytes_to_gb(total_bucket_size))
+            database_utils.supervision_s3_to_pg("bucket_objects", bucket_name, total_bucket_objects)
+            return total_bucket_size, total_bucket_objects
+        except botocore.exceptions.ClientError:
+            logging.warning(f"Cannot fetch metrics from bucket {bucket_name}")
+            return 0, 0
+
+    def get_all_buckets(self):
+        """
+        Returns all the buckets
+        :return: a list of buckets
+        """
+        return self._s3_resource.buckets.all()
+
+    @staticmethod
+    def get_object_prefix(bucket_name: str, object_key: str):
+        """
+        Returns the prefix of the object key if it's an object, None if it's a directory
+        :param bucket_name: name of the bucket
+        :param object_key: the object's key
+        :return: a string containing the object prefix
+        """
+        if object_key.endswith("/") or not object_key:
+            return None
+        return bucket_name + ("/" + object_key.split("/")[0] if "/" in object_key else "/")
+
+
+class DatabaseUtils(object):
+
+    def __init__(self):
+        self._db_connection = psycopg2.connect(
+            f"""host='{postgresql_host}'
+            port='{postgresql_port}'
+            user='{postgresql_user}'
+            password='{postgresql_password}'
+            dbname='{postgresql_db}'""")
+        self._db_connection.autocommit = True
+        self._db_cur = self._db_connection.cursor()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, tb):
+        if exc_type is not None:
+            traceback.print_exception(exc_type, exc_value, tb)
+        self._db_connection.close()
+
+    def truncate_supervision_saagie_pg(self):
+        """
+        Truncate the supervision_saagie, supervision_saagie_jobs and supervision_saagie_apps tables
+        """
+        try:
+            self._db_cur.execute('TRUNCATE TABLE supervision_saagie')
+            self._db_cur.execute('TRUNCATE TABLE supervision_saagie_jobs')
+            self._db_cur.execute('TRUNCATE TABLE supervision_saagie_apps')
+        except Exception as e:
+            logging.error(e)
+
+    def supervision_saagie_to_pg(self, instances):
+        """
+        Log saagie metrics to PostgreSQL.
+        :param instances: List of instances
+        :return:
+        """
+        try:
+            psycopg2.extras.execute_batch(self._db_cur, """
+                INSERT INTO supervision_saagie (
+                    supervision_timestamp,
+                    project_id,
+                    project_name,
+                    orchestration_type,
+                    orchestration_id,
+                    orchestration_name,
+                    instance_id,
+                    instance_start_time,
+                    instance_end_time,
+                    instance_status,
+                    instance_duration,
+                    instance_saagie_url)
+                VALUES (
+                    %(supervision_timestamp)s,
+                    %(project_id)s,
+                    %(project_name)s,
+                    %(orchestration_type)s,
+                    %(orchestration_id)s,
+                    %(orchestration_name)s,
+                    %(instance_id)s,
+                    %(instance_start_time)s,
+                    %(instance_end_time)s,
+                    %(instance_status)s,
+                    %(instance_duration)s,
+                    %(instance_saagie_url)s
+                );
+                """, instances)
+
+        except Exception as e:
+            logging.error(e)
+
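+    # NOTE: each element of `instances` must be a dict whose keys match the
+    # %(...)s placeholders above; log_instance_metrics() in __main__.py builds
+    # exactly that shape.
+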
+    def supervision_saagie_jobs_to_pg(self, jobs):
+        """
+        Log saagie jobs metrics to PostgreSQL.
+        :param jobs: List of jobs
+        :return:
+        """
+
+        try:
+            psycopg2.extras.execute_batch(self._db_cur, """
+                INSERT INTO supervision_saagie_jobs (
+                    project_id,
+                    project_name,
+                    orchestration_type,
+                    orchestration_id,
+                    orchestration_name,
+                    orchestration_category,
+                    creation_date,
+                    instance_count,
+                    technology)
+                VALUES (
+                    %(project_id)s,
+                    %(project_name)s,
+                    %(orchestration_type)s,
+                    %(orchestration_id)s,
+                    %(orchestration_name)s,
+                    %(orchestration_category)s,
+                    %(creation_date)s,
+                    %(instance_count)s,
+                    %(technology)s
+                );
+                """, jobs)
+        except Exception as e:
+            logging.error(e)
+
+    def supervision_saagie_apps_to_pg(self, apps):
+        """
+        Log saagie apps metrics to PostgreSQL.
+        :param apps: List of Apps
+        :return:
+        """
+
+        try:
+            psycopg2.extras.execute_batch(self._db_cur, """
+                INSERT INTO supervision_saagie_apps (
+                    project_id,
+                    project_name,
+                    orchestration_type,
+                    orchestration_id,
+                    orchestration_name,
+                    creation_date,
+                    current_status,
+                    start_time,
+                    stop_time,
+                    technology)
+                VALUES (
+                    %(project_id)s,
+                    %(project_name)s,
+                    %(orchestration_type)s,
+                    %(orchestration_id)s,
+                    %(orchestration_name)s,
+                    %(creation_date)s,
+                    %(current_status)s,
+                    %(start_time)s,
+                    %(stop_time)s,
+                    %(technology)s
+                );
+                """, apps)
+        except Exception as e:
+            logging.error(e)
+
+    def supervision_saagie_jobs_snapshot_to_pg(self, project_job_counts):
+        """
+        Log saagie job daily snapshot count to PostgreSQL.
+        :param project_job_counts: List of projects with job and apps count
+        :return:
+        """
+        today = datetime.today().strftime('%Y-%m-%d')
+        try:
+            self._db_cur.execute(
+                '''DELETE FROM supervision_saagie_jobs_snapshot
+                WHERE snapshot_date = %s''', (today,))
+            psycopg2.extras.execute_batch(self._db_cur, """
+                INSERT INTO supervision_saagie_jobs_snapshot (project_id, project_name, snapshot_date, job_count)
+                VALUES( %(project_id)s,
+                        %(project_name)s,
+                        %(snapshot_date)s,
+                        %(job_count)s)""", project_job_counts)
+        except Exception as e:
+            logging.error(e)
+
+    def supervision_datalake_to_pg(self, supervision_label, supervision_value):
+        """
+        Log datalake metrics to PostgreSQL.
+        :param supervision_label: Label of the metric (e.g. space_used, total_capacity..)
+        :param supervision_value: Value in Gigabytes
+        :return:
+        """
+
+        today = datetime.today().strftime('%Y-%m-%d')
+        try:
+            self._db_cur.execute(
+                '''INSERT INTO supervision_datalake (supervision_date, supervision_label, supervision_value)
+                VALUES(%s,%s,%s)
+                ON CONFLICT ON CONSTRAINT supervision_datalake_pkey
+                DO
+                UPDATE
+                SET (supervision_label, supervision_value) = (EXCLUDED.supervision_label, EXCLUDED.supervision_value)''',
+                (today, supervision_label, supervision_value))
+        except Exception as e:
+            logging.error(e)
+
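+    # The ON CONFLICT upsert above is what makes a re-run on the same day update
+    # that day's row instead of inserting duplicates.
+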
+    def supervision_s3_to_pg(self, supervision_label, supervision_namespace, supervision_value):
+        """
+        Log S3 metrics to PostgreSQL.
+        :param supervision_label: Label of the metric (e.g. space_used, total_capacity..)
+        :param supervision_namespace: Namespace of the metric (e.g. bucket name...)
+        :param supervision_value: Value in Gigabytes
+        :return:
+        """
+
+        today = datetime.today().strftime('%Y-%m-%d')
+        try:
+            self._db_cur.execute(
+                '''INSERT INTO supervision_s3 (supervision_date, supervision_label, supervision_namespace, supervision_value)
+                VALUES(%s,%s,%s,%s)
+                ON CONFLICT ON CONSTRAINT supervision_s3_pkey
+                DO
+                UPDATE
+                SET (supervision_label, supervision_value) = (EXCLUDED.supervision_label, EXCLUDED.supervision_value)''',
+                (today, supervision_label, supervision_namespace, supervision_value))
+        except Exception as e:
+            logging.error(e)
+
+
+def parse_instance_timestamp(instance_timestamp):
+    """
+    Parse a timestamp trying 2 different formats
+    :param instance_timestamp: Timestamp to parse (string)
+    :return: a datetime object
+    """
+    datetime_format = '%Y-%m-%dT%H:%M:%S.%f%z'
+    alternative_datetime_format = '%Y-%m-%dT%H:%M:%S%z'
+
+    if instance_timestamp:
+        try:
+            return datetime.strptime(instance_timestamp, datetime_format)
+        except ValueError:
+            return datetime.strptime(instance_timestamp, alternative_datetime_format)
+    else:
+        return None
+
+
+def build_saagie_url(project_id, orchestration_type, job_or_pipeline_id, instance_id):
+    """
+    Build the Saagie URL of a job or pipeline instance
+    :param instance_id: id of the Saagie instance
+    :param job_or_pipeline_id: id of the job or the pipeline
+    :param orchestration_type: job or pipeline
+    :param project_id: Saagie Project ID
+    :return: the complete URL of this instance
+    """
+    return f"{saagie_url}projects/platform/{saagie_platform}/project/{project_id}/{orchestration_type}/" \
+           f"{job_or_pipeline_id}/instances/{instance_id}"
+
+
+def bytes_to_gb(size_in_bytes):
+    """
+    Convert a size in bytes to gigabytes (rounded to 2 decimals)
+    :param size_in_bytes: size to convert
+    :return: size in gigabytes
+    """
+    return round(size_in_bytes / 1024 / 1024 / 1024, 2)
+
+
+def get_hadoop_capacity(hdfs):
+    """
+    Get Datalake total capacity
+    :return: total capacity in GB rounded to 2 decimals
+    """
+    return bytes_to_gb(hdfs.get_capacity())
+
+
+def get_hadoop_space_used(hdfs):
+    """
+    Get Datalake total space used
+    :return: total space used in GB rounded to 2 decimals
+    """
+    return bytes_to_gb(hdfs.get_space_used())
+
+
+def get_average_file_size(sub):
+    """
+    Get the average file size of a subdirectory
+    """
+    return sub["length"] / sub["fileCount"] if sub["fileCount"] != 0 else 0
diff --git a/technologies/app/saagie-usage-monitoring/saagie-usage-monitoring-2023.02/context.yaml b/technologies/app/saagie-usage-monitoring/saagie-usage-monitoring-2023.02/context.yaml
new file mode 100644
index 000000000..1bc5c602c
--- /dev/null
+++ b/technologies/app/saagie-usage-monitoring/saagie-usage-monitoring-2023.02/context.yaml
@@ -0,0 +1,14 @@
+id: saagie-usage-monitoring-2023-02
+label: For Saagie 2023.02
+releaseNotes: ""
+available: true
+trustLevel: stable
+ports:
+  - port: 80
+    name: saagie-usage-monitoring
+    rewriteUrl: false
+    basePath: SAAGIE_BASE_PATH
+  - port: 92
+    name: ttyd
+    rewriteUrl: true
+volumes: ["/opt/grafana", "/var/lib/postgresql/data"]
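The new context is configured entirely through environment variables (see the README hunk above and `entrypoint.sh` below). As an illustration, a deployment monitoring Saagie plus S3 might set something like the following; every value here is a placeholder, not a real endpoint or credential:

```bash
# Illustrative values only; adapt to your platform.
export SAAGIE_URL="https://mysaagie-workspace.saagie.io"   # SAAGIE_REALM is derived from this
export SAAGIE_PLATFORM_ID="1"
export SAAGIE_SUPERVISION_LOGIN="supervision_user"
export SAAGIE_SUPERVISION_PASSWORD="********"
export MONITORING_OPT="SAAGIE_AND_S3"
export AWS_ACCESS_KEY_ID="AKIA..."
export AWS_SECRET_ACCESS_KEY="..."
export AWS_S3_ENDPOINT="https://s3.eu-west-1.amazonaws.com"
export AWS_REGION_NAME="eu-west-1"
export SAAGIE_SUM_CRON="0 */6 * * *"   # optional: collect every 6 hours instead of hourly
```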
diff --git a/technologies/app/saagie-usage-monitoring/saagie-usage-monitoring-2023.02/dockerInfo.yaml b/technologies/app/saagie-usage-monitoring/saagie-usage-monitoring-2023.02/dockerInfo.yaml
new file mode 100644
index 000000000..1ed331b5d
--- /dev/null
+++ b/technologies/app/saagie-usage-monitoring/saagie-usage-monitoring-2023.02/dockerInfo.yaml
@@ -0,0 +1,4 @@
+image: saagie/saagie-usage-monitoring
+baseTag: 2023.02-0.1
+dynamicVersion: 1.168.0_SDKTECHNO-244
+version: 2023.02-0.1-1.168.0_SDKTECHNO-244
diff --git a/technologies/app/saagie-usage-monitoring/saagie-usage-monitoring-2023.02/entrypoint.sh b/technologies/app/saagie-usage-monitoring/saagie-usage-monitoring-2023.02/entrypoint.sh
new file mode 100644
index 000000000..89fc8cec1
--- /dev/null
+++ b/technologies/app/saagie-usage-monitoring/saagie-usage-monitoring-2023.02/entrypoint.sh
@@ -0,0 +1,130 @@
+#!/bin/bash
+
+if [[ -z ${SAAGIE_SUPERVISION_LOGIN} || -z ${SAAGIE_SUPERVISION_PASSWORD} || -z ${SAAGIE_URL} ]]; then
+    echo "$(date '+%Y-%m-%d %H:%M:%S') [ERROR] Missing environment variables. In order to work, this app needs the following environment variables set : "
+    echo "- SAAGIE_SUPERVISION_LOGIN"
+    echo "- SAAGIE_SUPERVISION_PASSWORD"
+    echo "- SAAGIE_URL"
+    exit 1
+fi
+
+if [[ -z ${MONITORING_OPT} ]]; then
+    echo "$(date '+%Y-%m-%d %H:%M:%S') [WARN] MONITORING_OPT not set, Saagie Usage Monitoring will only monitor Saagie"
+    export MONITORING_OPT="SAAGIE"
+fi
+
+if [[ -z ${SAAGIE_PLATFORM_ID} ]]; then
+    echo "$(date '+%Y-%m-%d %H:%M:%S') [WARN] SAAGIE_PLATFORM_ID not set, using platform 1 by default"
+    export SAAGIE_PLATFORM_ID="1"
+fi
+
+if [[ -z ${SAAGIE_PG_HOST} ]]; then
+    echo "$(date '+%Y-%m-%d %H:%M:%S') [WARN] SAAGIE_PG_HOST not set, using local postgres"
+    export SAAGIE_PG_HOST="localhost"
+    export SAAGIE_PG_PORT="5432"
+    export SAAGIE_PG_USER="supervision_pg_user"
+    export SAAGIE_PG_PASSWORD=""
+    export SAAGIE_PG_DATABASE="supervision_pg_db"
+else
+    if [[ -z ${SAAGIE_PG_HOST} || -z ${SAAGIE_PG_PORT} || -z ${SAAGIE_PG_USER} || -z ${SAAGIE_PG_PASSWORD} || -z ${SAAGIE_PG_DATABASE} ]]; then
+        echo "$(date '+%Y-%m-%d %H:%M:%S') [ERROR] Missing environment variables. If SAAGIE_PG_HOST is set, this app needs the following environment variables set : "
+        echo "- SAAGIE_PG_HOST"
+        echo "- SAAGIE_PG_PORT"
+        echo "- SAAGIE_PG_USER"
+        echo "- SAAGIE_PG_PASSWORD"
+        echo "- SAAGIE_PG_DATABASE"
+        exit 2
+    fi
+fi
+
+arrIN=(${SAAGIE_URL//\/\// })
+arrOUT=(${arrIN[1]//-/ })
+export SAAGIE_REALM="${arrOUT[0]}"
+
+{
+    echo "#!/bin/bash"
+    echo export SAAGIE_SUPERVISION_LOGIN="$SAAGIE_SUPERVISION_LOGIN"
+    echo export SAAGIE_SUPERVISION_PASSWORD=\'"$SAAGIE_SUPERVISION_PASSWORD"\'
+    echo export SAAGIE_URL="$SAAGIE_URL"
+    echo export SAAGIE_REALM="$SAAGIE_REALM"
+    echo export SAAGIE_PLATFORM_ID="$SAAGIE_PLATFORM_ID"
+    echo export MONITORING_OPT=$MONITORING_OPT
+    echo export IP_HDFS="$IP_HDFS"
+    echo export HADOOP_HOME=/hadoop/hadoop-2.6.5
+    echo export SAAGIE_PG_HOST="$SAAGIE_PG_HOST"
+    echo export SAAGIE_PG_PORT="$SAAGIE_PG_PORT"
+    echo export SAAGIE_PG_USER="$SAAGIE_PG_USER"
+    echo export SAAGIE_PG_PASSWORD=\'"$SAAGIE_PG_PASSWORD"\'
+    echo export SAAGIE_PG_DATABASE="$SAAGIE_PG_DATABASE"
+    echo python3 /app/__main__.py
+} > /app/script.sh
+
+chmod +x /app/script.sh
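+# The block above produces /app/script.sh roughly as follows (illustrative values):
+#   #!/bin/bash
+#   export SAAGIE_URL=https://mysaagie-workspace.saagie.io
+#   export MONITORING_OPT=SAAGIE
+#   ...
+#   python3 /app/__main__.py
+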
+PG_DATA_DIR="/var/lib/postgresql/data"
+mkdir -p $PG_DATA_DIR
+
+if [ "$SAAGIE_PG_HOST" == "localhost" ]; then
+
+    #Local database
+    if [ "$(ls -A $PG_DATA_DIR)" ]; then
+        echo "$(date '+%Y-%m-%d %H:%M:%S') [INFO] PG Database already exists, skipping init"
+        su postgres -c "export PATH=$PATH:/usr/lib/postgresql/14/bin && pg_ctl -D ${PG_DATA_DIR} start" > /dev/null
+    else
+        echo "$(date '+%Y-%m-%d %H:%M:%S') [INFO] Initializing PG database"
+        chown postgres:postgres $PG_DATA_DIR
+        chmod 777 $PG_DATA_DIR
+
+        {
+            su postgres -c "/usr/lib/postgresql/14/bin/initdb -D $PG_DATA_DIR"
+            su postgres -c "export PATH=$PATH:/usr/lib/postgresql/14/bin && pg_ctl start -D $PG_DATA_DIR"
+            su postgres -c 'psql --command "CREATE USER supervision_pg_user"'
+            su postgres -c 'psql --command "CREATE DATABASE supervision_pg_db ENCODING \"UTF8\" TEMPLATE template0"'
+            su postgres -c 'psql --command "GRANT ALL PRIVILEGES ON DATABASE supervision_pg_db to supervision_pg_user"'
+            su postgres -c 'psql -U supervision_pg_user -d supervision_pg_db -f infra.sql'
+        } > /dev/null
+    fi
+
+else
+    #External database
+    echo "$(date '+%Y-%m-%d %H:%M:%S') [INFO] Creating PG tables if they do not exist"
+    export PGPASSWORD=$SAAGIE_PG_PASSWORD
+    psql -h $SAAGIE_PG_HOST \
+        -p $SAAGIE_PG_PORT \
+        -U $SAAGIE_PG_USER \
+        -d $SAAGIE_PG_DATABASE \
+        -f infra.sql > /dev/null
+fi
+
+sed -i 's:SAAGIE_BASE_PATH:'"$SAAGIE_BASE_PATH"':g' /etc/grafana/grafana.ini
+sed -i 's:SAAGIE_BASE_PATH:'"$SAAGIE_BASE_PATH"':g' /etc/nginx/sites-enabled/grafana.conf
+sed -i "s:url\::url\: $SAAGIE_PG_HOST\:$SAAGIE_PG_PORT:g" /etc/grafana/provisioning/datasources/grafana_source_monitoring.yaml
+sed -i "s:password\::password\: $SAAGIE_PG_PASSWORD:g" /etc/grafana/provisioning/datasources/grafana_source_monitoring.yaml
+sed -i "s:user\::user\: $SAAGIE_PG_USER:g" /etc/grafana/provisioning/datasources/grafana_source_monitoring.yaml
+sed -i "s:database\::database\: $SAAGIE_PG_DATABASE:g" /etc/grafana/provisioning/datasources/grafana_source_monitoring.yaml
+
+cp /var/lib/grafana/tmp-dashboards/saagie*.json /var/lib/grafana/dashboards/
+
+if [ "$MONITORING_OPT" == "SAAGIE_AND_DATALAKE" ]; then
+    cp /var/lib/grafana/tmp-dashboards/datalake*.json /var/lib/grafana/dashboards/
+elif [ "$MONITORING_OPT" == "SAAGIE_AND_S3" ]; then
+    cp /var/lib/grafana/tmp-dashboards/s3*.json /var/lib/grafana/dashboards/
+fi
+
+if [[ -z "${SAAGIE_SUM_CRON}" ]]; then
+    export SAAGIE_SUM_CRON="0 * * * *"
+fi
+
+
+echo "$SAAGIE_SUM_CRON /app/script.sh >> /tmp/log_cron.log 2>&1" > mycron \
+&& crontab mycron \
+&& rm mycron \
+&& service cron start
+
+echo "Job's starting" >> /tmp/log_cron.log
+
+tail -f /tmp/log_cron.log &
+
+/update_sqlite.sh &
+/app/script.sh &
+ttyd -p 92 bash &
+nginx && /run.sh
diff --git a/technologies/app/saagie-usage-monitoring/saagie-usage-monitoring-2023.02/grafana.ini b/technologies/app/saagie-usage-monitoring/saagie-usage-monitoring-2023.02/grafana.ini
new file mode 100644
index 000000000..2f4f38126
--- /dev/null
+++ b/technologies/app/saagie-usage-monitoring/saagie-usage-monitoring-2023.02/grafana.ini
@@ -0,0 +1,696 @@
+##################### Grafana Configuration Example #####################
+#
+# Everything has defaults so you only need to uncomment things you want to
+# change
+
+# possible values : production, development
+;app_mode = production
+
+# instance name, defaults to HOSTNAME environment variable value or hostname if HOSTNAME var is empty
+;instance_name = ${HOSTNAME}
+
+#################################### Paths ####################################
+[paths]
+# Path to where grafana can store temp files, sessions, and the sqlite3 db (if that is used)
+data = /opt/grafana
+
+# Temporary files in `data` directory older than given duration will be removed
+;temp_data_lifetime = 24h
+
+# Directory where grafana can store logs
+;logs = /var/log/grafana
+
+# Directory where grafana will automatically scan and look for plugins
+plugins = /opt/plugins
+
+# folder that contains provisioning config files that grafana will apply on startup and while running.
+;provisioning = conf/provisioning + +#################################### Server #################################### +[server] +# Protocol (http, https, h2, socket) +;protocol = http + +# The ip address to bind to, empty will bind to all interfaces +;http_addr = + +# The http port to use +;http_port = 3000 + +# The public facing domain name used to access grafana from a browser +domain = SAAGIE_BASE_PATH + +# Redirect to correct domain if host header does not match domain +# Prevents DNS rebinding attacks +;enforce_domain = false + +# The full public facing url you use in browser, used for redirects and emails +# If you use reverse proxy and sub path specify full url (with sub path) +root_url = %(protocol)s://%(domain)s/ + +# Serve Grafana from subpath specified in `root_url` setting. By default it is set to `false` for compatibility reasons. +;serve_from_sub_path = false + +# Log web requests +;router_logging = false + +# the path relative working path +;static_root_path = public + +# enable gzip +;enable_gzip = false + +# https certs & key file +;cert_file = +;cert_key = + +# Unix socket path +;socket = + +#################################### Database #################################### +[database] +# You can configure the database connection by specifying type, host, name, user and password +# as separate properties or as on string using the url properties. + +# Either "mysql", "postgres" or "sqlite3", it's your choice +;type = sqlite3 +;host = 127.0.0.1:3306 +;name = grafana +;user = root +# If the password contains # or ; you have to wrap it with triple quotes. Ex """#password;""" +;password = + +# Use either URL or the previous fields to configure the database +# Example: mysql://user:secret@host:port/database +;url = + +# For "postgres" only, either "disable", "require" or "verify-full" +;ssl_mode = disable + +;ca_cert_path = +;client_key_path = +;client_cert_path = +;server_cert_name = + +# For "sqlite3" only, path relative to data_path setting +;path = grafana.db + +# Max idle conn setting default is 2 +;max_idle_conn = 2 + +# Max conn setting default is 0 (mean not set) +;max_open_conn = + +# Connection Max Lifetime default is 14400 (means 14400 seconds or 4 hours) +;conn_max_lifetime = 14400 + +# Set to true to log the sql calls and execution times. +;log_queries = + +# For "sqlite3" only. cache mode setting used for connecting to the database. (private, shared) +;cache_mode = private + +#################################### Cache server ############################# +[remote_cache] +# Either "redis", "memcached" or "database" default is "database" +;type = database + +# cache connectionstring options +# database: will use Grafana primary database. +# redis: config like redis server e.g. `addr=127.0.0.1:6379,pool_size=100,db=0,ssl=false`. Only addr is required. ssl may be 'true', 'false', or 'insecure'. +# memcache: 127.0.0.1:11211 +;connstr = + +#################################### Data proxy ########################### +[dataproxy] + +# This enables data proxy logging, default is false +;logging = false + +# How long the data proxy should wait before timing out default is 30 (seconds) +;timeout = 30 + +# If enabled and user is not anonymous, data proxy will add X-Grafana-User header with username into the request, default is false. +;send_user_header = false + +#################################### Analytics #################################### +[analytics] +# Server reporting, sends usage counters to stats.grafana.org every 24 hours. 
+# No ip addresses are being tracked, only simple counters to track +# running instances, dashboard and error counts. It is very helpful to us. +# Change this option to false to disable reporting. +;reporting_enabled = true + +# Set to false to disable all checks to https://grafana.net +# for new vesions (grafana itself and plugins), check is used +# in some UI views to notify that grafana or plugin update exists +# This option does not cause any auto updates, nor send any information +# only a GET request to http://grafana.com to get latest versions +;check_for_updates = true + +# Google Analytics universal tracking code, only enabled if you specify an id here +;google_analytics_ua_id = + +# Google Tag Manager ID, only enabled if you specify an id here +;google_tag_manager_id = + +#################################### Security #################################### +[security] +# disable creation of admin user on first start of grafana +;disable_initial_admin_creation = false + +# default admin user, created on startup +;admin_user = admin + +# used for signing +;secret_key = SW2YcwTIb9zpOOhoPsMm + +# disable gravatar profile images +;disable_gravatar = false + +# data source proxy whitelist (ip_or_domain:port separated by spaces) +;data_source_proxy_whitelist = + +# disable protection against brute force login attempts +;disable_brute_force_login_protection = false + +# set to true if you host Grafana behind HTTPS. default is false. +;cookie_secure = false + +# set cookie SameSite attribute. defaults to `lax`. can be set to "lax", "strict", "none" and "disabled" +;cookie_samesite = lax + +# set to true if you want to allow browsers to render Grafana in a ,