Sdktechno 244 (#607)
SUM 2023.02 & 2023.03
NicolasCD authored Oct 23, 2023
1 parent 800240b commit 6caa77b
Showing 48 changed files with 12,457 additions and 2 deletions.
1 change: 1 addition & 0 deletions technologies/app/saagie-usage-monitoring/README.md
@@ -18,6 +18,7 @@ To deploy Saagie Usage Monitoring on your platform, you need to create a user wi
- `SAAGIE_AND_S3` if you want to monitor Saagie and S3 buckets
- IP_HDFS (Required if MONITORING_OPT=`SAAGIE_AND_DATALAKE`) : Namenode IP
- AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_S3_ENDPOINT and AWS_REGION_NAME (Required if MONITORING_OPT=`SAAGIE_AND_S3`)
- SAAGIE_SUM_CRON : Cron schedule used to collect Saagie information from the API (Optional, Default value : `0 * * * *`; see the sketch below)

For an external Postgres database :
- SAAGIE_PG_HOST : PostgreSQL host (Default value : `localhost`)
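
As a quick illustration, here is a minimal sketch of how an app might read the variables documented above at startup. The `load_config` helper and its structure are hypothetical, not part of Saagie Usage Monitoring; only the variable names and defaults come from this README.

```python
import os

def load_config():
    """Hypothetical helper: gather the SUM environment variables described above."""
    opt = os.environ.get("MONITORING_OPT", "SAAGIE")
    config = {
        "monitoring_opt": opt,
        # Hourly by default, as documented above.
        "saagie_sum_cron": os.environ.get("SAAGIE_SUM_CRON", "0 * * * *"),
        "pg_host": os.environ.get("SAAGIE_PG_HOST", "localhost"),
    }
    if opt == "SAAGIE_AND_DATALAKE":
        # Namenode IP is required when monitoring the datalake.
        config["ip_hdfs"] = os.environ["IP_HDFS"]
    elif opt == "SAAGIE_AND_S3":
        # S3 credentials and endpoint are required when monitoring buckets.
        for key in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY",
                    "AWS_S3_ENDPOINT", "AWS_REGION_NAME"):
            config[key.lower()] = os.environ[key]
    return config
```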
36 changes: 36 additions & 0 deletions technologies/app/saagie-usage-monitoring/metadata.yaml
@@ -16,6 +16,42 @@ customFlags: []
readme: /technologies/app/saagie-usage-monitoring

contexts:
  - id: saagie-usage-monitoring-2023-02
    label: For Saagie 2023.02
    releaseNotes: ""
    available: true
    trustLevel: stable
    ports:
      - port: 80
        name: saagie-usage-monitoring
        rewriteUrl: false
        basePath: SAAGIE_BASE_PATH
      - port: 92
        name: ttyd
        rewriteUrl: true
    volumes: ["/opt/grafana", "/var/lib/postgresql/data"]
    dockerInfo:
      image: "saagie/saagie-usage-monitoring"
      baseTag: "2023.02-0.1"
      version: "2023.02-0.1-1.168.0_SDKTECHNO-244"
  - id: saagie-usage-monitoring-2023-03
    label: For Saagie 2023.03
    releaseNotes: ""
    available: true
    trustLevel: stable
    ports:
      - port: 80
        name: saagie-usage-monitoring
        rewriteUrl: false
        basePath: SAAGIE_BASE_PATH
      - port: 92
        name: ttyd
        rewriteUrl: true
    volumes: ["/opt/grafana", "/var/lib/postgresql/data"]
    dockerInfo:
      image: "saagie/saagie-usage-monitoring"
      baseTag: "2023.03-0.1"
      version: "2023.03-0.1-1.168.0_SDKTECHNO-244"
  - id: saagie-usage-monitoring-3.0
    label: For Saagie 3.x
    releaseNotes: ""
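
For orientation, each context above describes one deployable flavour of the app: its exposed ports, persistent volumes, and Docker image coordinates. Below is a minimal, hypothetical sketch (not part of this commit) that loads a metadata.yaml shaped like the excerpt above and lists each context with its Docker tag; it assumes PyYAML is installed and the file lives at the path shown in this diff.

```python
import yaml  # PyYAML

with open("technologies/app/saagie-usage-monitoring/metadata.yaml") as f:
    metadata = yaml.safe_load(f)

# Print one line per context: id, exposed ports, and image coordinates.
for context in metadata.get("contexts", []):
    docker_info = context.get("dockerInfo", {})
    ports = [p["port"] for p in context.get("ports", [])]
    print(context["id"], ports, docker_info.get("baseTag"), docker_info.get("version"))
```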
@@ -0,0 +1,78 @@
#FROM grafana/grafana:9.2.3-ubuntu
FROM grafana/grafana:10.1.2-ubuntu

USER root
# Install Dependencies
RUN apt-get update \
    && apt-get install -y software-properties-common nginx cron \
        wget libpq-dev openjdk-11-jdk ca-certificates-java \
        postgresql postgresql-contrib postgresql-client \
        build-essential cmake git libjson-c-dev libwebsockets-dev sqlite \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update \
    && apt-get install -y python3.9 pip \
    && rm /etc/nginx/sites-enabled/default \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Fix certificate issues
RUN update-ca-certificates -f;

# Hadoop command-line
RUN cd / \
    && mkdir hadoop \
    && cd hadoop \
    && wget -q https://archive.apache.org/dist/hadoop/common/hadoop-2.6.5/hadoop-2.6.5.tar.gz \
    && tar xvf hadoop-2.6.5.tar.gz \
    && rm hadoop-2.6.5.tar.gz \
    && rm -rf hadoop-2.6.5/etc/hadoop \
    && ln -s /etc/hadoop/conf hadoop-2.6.5/etc/hadoop;

# Python dependencies
ADD code/requirements.txt /tmp/requirements.txt
RUN pip install -r /tmp/requirements.txt \
    && rm -rf /root/.cache \
    && rm -rf /root/.cache/pip \
    && rm -rf ~/.cache/pip

# Environment variables
ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64/
ENV HADOOP_HOME=/hadoop/hadoop-2.6.5
ENV HADOOP_CONF_DIR=/hadoop/hadoop-2.6.5/etc/hadoop
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/hadoop/hadoop-2.6.5/lib/native:/usr/lib/jvm/java-11-openjdk-amd64/lib"
ENV CLASSPATH="/etc/hadoop/conf:/hadoop/hadoop-2.6.5/share/hadoop/common/lib/*:/hadoop/hadoop-2.6.5/share/hadoop/common/*:/hadoop/hadoop-2.6.5/share/hadoop/hdfs:/hadoop/hadoop-2.6.5/share/hadoop/hdfs/lib/*:/hadoop/hadoop-2.6.5/share/hadoop/hdfs/*:/hadoop/hadoop-2.6.5/share/hadoop/yarn/lib/*:/hadoop/hadoop-2.6.5/share/hadoop/yarn/*:/hadoop/hadoop-2.6.5/share/hadoop/mapreduce/lib/*:/hadoop/hadoop-2.6.5/share/hadoop/mapreduce/*"
ENV PATH "/hadoop/hadoop-2.6.5/bin:${PATH}"

# Configure PostgreSQL
RUN chown postgres:postgres /run/postgresql/ \
    && chmod 777 /run/postgresql

# Install ttyd
RUN git clone https://github.com/tsl0922/ttyd.git \
    && cd ttyd && mkdir build && cd build \
    && cmake .. \
    && make && make install

# Configure Grafana
RUN mkdir /opt/grafana && mkdir /opt/plugins && mkdir /app && mkdir /var/lib/grafana/dashboards

ADD server.conf /etc/nginx/sites-enabled/grafana.conf
ADD grafana.ini /etc/grafana/grafana.ini

ADD grafana/provisioning /etc/grafana/provisioning
ADD grafana/dashboards /var/lib/grafana/tmp-dashboards

ENV GF_PATHS_DATA /opt/grafana
ENV GF_PATHS_PLUGINS /opt/plugins

ADD update_sqlite.sh /
RUN chmod +x /update_sqlite.sh

ADD code /app
ADD infra.sql infra.sql

RUN grafana-cli --pluginsDir "/opt/plugins" plugins install marcusolsson-treemap-panel

EXPOSE 80 92
ADD entrypoint.sh /entrypoint.sh
ENTRYPOINT ["bash", "/entrypoint.sh"]
@@ -0,0 +1,22 @@
/*
 * SPDX-License-Identifier: Apache-2.0
 *
 * Copyright 2019-2021.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import com.bmuschko.gradle.docker.DockerRemoteApiPlugin
import com.saagie.technologies.SaagieTechnologiesGradlePlugin

apply<DockerRemoteApiPlugin>()
apply<SaagieTechnologiesGradlePlugin>()
@@ -0,0 +1,216 @@
import logging
import sys
import os
import pyarrow as pa
from datetime import datetime
from hdfs import InsecureClient
import utils

monitoring_type = os.environ["MONITORING_OPT"]


def get_datalake_metrics():
    """
    Fetch Metrics from Hadoop API about Datalake usage and save it to PostgreSQL in the supervision Database
    :return:
    """
    with utils.DatabaseUtils() as database_utils:
        hdfs = pa.hdfs.connect(os.environ["IP_HDFS"], port=8020, user="hdfs")
        total_capacity = utils.get_hadoop_capacity(hdfs)
        total_space_used = utils.get_hadoop_space_used(hdfs)
        logging.debug(f"total_capacity : {total_capacity}")
        logging.debug(f"total_space_used : {total_space_used}")
        database_utils.supervision_datalake_to_pg("total_capacity", total_capacity)
        database_utils.supervision_datalake_to_pg("total_used", total_space_used)

        # Get count files
        client_hdfs = InsecureClient("http://" + os.environ["IP_HDFS"] + ":50070", user="hdfs")
        content_root = client_hdfs.list("/", status=True)

        get_metrics_for_folder(client_hdfs=client_hdfs,
                               database_utils=database_utils,
                               folder="/")
        for f in content_root:
            if f[1]['type'] == 'DIRECTORY':
                base_folder = "/" + f[0]
                get_metrics_for_folder(client_hdfs=client_hdfs,
                                       database_utils=database_utils,
                                       folder=base_folder)
                content_data = client_hdfs.list(base_folder, status=True)
                for f in content_data:
                    if f[1]['type'] == 'DIRECTORY':
                        get_metrics_for_folder(client_hdfs=client_hdfs,
                                               database_utils=database_utils,
                                               folder=base_folder + "/" + f[0])


def get_s3_metrics():
    """
    Fetch Metrics from S3 buckets usage and save it to PostgreSQL in the supervision Database
    :return:
    """
    with utils.S3Utils() as s3_utils, utils.DatabaseUtils() as database_utils:
        buckets = s3_utils.get_all_buckets()
        total_size = 0
        total_objects = 0
        for bucket in buckets:
            bucket_size, number_of_objects = s3_utils.get_bucket_size(bucket.name, database_utils)
            total_size += bucket_size
            total_objects += number_of_objects

        database_utils.supervision_s3_to_pg("bucket_size", 'all_buckets', utils.bytes_to_gb(total_size))
        database_utils.supervision_s3_to_pg("bucket_objects", 'all_buckets', total_objects)


def get_metrics_for_folder(client_hdfs, database_utils, folder):
    sub = client_hdfs.content(folder)
    database_utils.supervision_datalake_to_pg(f"Data size {folder}", sub["length"])
    database_utils.supervision_datalake_to_pg(f"File Count {folder}", sub["fileCount"])
    database_utils.supervision_datalake_to_pg(f"Average size file {folder}", utils.get_average_file_size(sub))


def get_saagie_metrics():
    """
    Truncate existing metrics, then fetch metrics from the Saagie API about jobs and instances and save them to
    PostgreSQL in the supervision database
    :return:
    """
    with utils.DatabaseUtils() as database_utils:
        logging.debug("truncate_supervision_saagie_pg finished")
        get_saagie_jobs_metrics(database_utils)


def get_saagie_jobs_metrics(database_utils):
    """
    Fetch metrics from the Saagie API about job and pipeline durations and statuses and save them to PostgreSQL in
    the supervision database
    :return:
    """
    logging.debug("truncate_supervision_saagie_pg starting")
    database_utils.truncate_supervision_saagie_pg()
    today = datetime.now().strftime('%Y-%m-%d')

    with utils.SaagieUtils() as saagie_utils:
        project_list = saagie_utils.get_projects()
        all_projects = []
        for project in project_list:
            logging.debug(f"Getting metrics for project {project['name']}")

            job_list = saagie_utils.get_job_instances(project["id"])
            app_list = saagie_utils.get_apps(project["id"])
            pipeline_list = saagie_utils.get_pipelines(project["id"])

            all_jobs = [{
                'project_id': project["id"],
                'project_name': project["name"],
                'orchestration_type': "job",
                'orchestration_id': job["id"],
                'orchestration_name': job["name"],
                'orchestration_category': job["category"],
                'creation_date': job["creationDate"],
                'instance_count': job["countJobInstance"],
                'technology': job["technology"]["label"] if job["technology"] is not None else None
            } for job in job_list]
            database_utils.supervision_saagie_jobs_to_pg(all_jobs)

            all_apps = [{
                'project_id': project["id"],
                'project_name': project["name"],
                'orchestration_type': "app",
                'orchestration_id': app["id"],
                'orchestration_name': app["name"],
                'creation_date': app["creationDate"],
                'current_status': app["history"]["currentStatus"] if app["history"] is not None else None,
                'start_time': app["history"]["startTime"] if app["history"] is not None else None,
                'stop_time': app["history"]["stopTime"] if app["history"] is not None else None,
                'technology': app["technology"]["label"] if app["technology"] is not None else None
            } for app in app_list]
            database_utils.supervision_saagie_apps_to_pg(all_apps)

            for job in job_list:
                log_instance_metrics(database_utils, job["instances"], job, "job", project["id"], project['name'])

            for pipeline in pipeline_list:
                log_instance_metrics(database_utils, pipeline["instances"], pipeline, "pipeline", project["id"],
                                     project['name'])

            all_projects.append({
                'project_id': project["id"],
                'project_name': project["name"],
                'snapshot_date': today,
                'job_count': len(job_list) + len(app_list)})
        database_utils.supervision_saagie_jobs_snapshot_to_pg(all_projects)


def get_instance_duration(start_time, end_time):
    """
    Compute instance duration based on start and end time
    :param start_time:
    :param end_time:
    :return:
    """
    instance_start_time = utils.parse_instance_timestamp(start_time)
    instance_end_time = utils.parse_instance_timestamp(end_time)
    # Both timestamps must parse successfully; otherwise report a zero duration.
    if instance_start_time and instance_end_time:
        return (instance_end_time - instance_start_time).total_seconds() * 1000
    else:
        return 0
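
# Worked example (added commentary, not in the original source): if
# parse_instance_timestamp yields datetimes one minute apart, say
# 10:00:00 and 10:01:00 on the same day, the function returns
# 60 * 1000 = 60000, i.e. the duration in milliseconds.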


def log_instance_metrics(database_utils, instances, job_or_pipeline, orchestration_type, project_id, project_name):
    """
    For each instance of a job or a pipeline, compute its duration and its Saagie URL and save it to PostgreSQL
    in the supervision Database
    :param database_utils: Instance of database utils to connect to PG
    :param instances: instances of the current job
    :param job_or_pipeline: job_or_pipeline object returned from Saagie API
    :param orchestration_type: indicating whether it's a job or a pipeline
    :param project_id: Saagie Project ID
    :param project_name: Saagie Project Name
    :return:
    """
    now = datetime.now()
    if instances:
        all_instances = [{
            'supervision_timestamp': now,
            'project_id': project_id,
            'project_name': project_name,
            'orchestration_type': orchestration_type,
            'orchestration_id': job_or_pipeline["id"],
            'orchestration_name': job_or_pipeline["name"],
            'instance_id': instance["id"],
            'instance_start_time': instance["startTime"],
            'instance_end_time': instance["endTime"],
            'instance_status': instance["status"],
            'instance_duration': get_instance_duration(instance["startTime"], instance["endTime"]),
            'instance_saagie_url': utils.build_saagie_url(project_id, orchestration_type, job_or_pipeline["id"],
                                                          instance["id"])
        } for instance in instances]

        database_utils.supervision_saagie_to_pg(all_instances)


def main():
    if monitoring_type == "SAAGIE":
        logging.info("Getting saagie metrics")
        get_saagie_metrics()
    elif monitoring_type == "SAAGIE_AND_DATALAKE":
        logging.info("Getting saagie metrics")
        get_saagie_metrics()
        logging.info("Getting datalake metrics")
        get_datalake_metrics()
    elif monitoring_type == "SAAGIE_AND_S3":
        logging.info("Getting saagie metrics")
        get_saagie_metrics()
        logging.info("Getting S3 metrics")
        get_s3_metrics()
    else:
        logging.error("MONITORING_OPT wrong or missing, correct options are : 'SAAGIE', 'SAAGIE_AND_DATALAKE' "
                      "or 'SAAGIE_AND_S3'")
        sys.exit(1)
    logging.info("Metrics successfully gathered")


if __name__ == "__main__":
    logging.getLogger("pyarrow").setLevel(logging.ERROR)
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s [%(levelname)s] %(message)s",
                        datefmt="%d/%m/%Y %H:%M:%S")
    main()
@@ -0,0 +1,6 @@
boto3==1.28.54
urllib3==1.26.5
psycopg2==2.9.2
pyarrow==6.0.1
hdfs
saagieapi==2.6.3