Sdktechno 244 (#607)
SUM 2023.02 & 2023.03
NicolasCD authored Oct 23, 2023
1 parent 800240b commit 6caa77b
Showing 48 changed files with 12,457 additions and 2 deletions.
1 change: 1 addition & 0 deletions technologies/app/saagie-usage-monitoring/README.md
@@ -18,6 +18,7 @@ To deploy Saagie Usage Monitoring on your platform, you need to create a user wi
- `SAAGIE_AND_S3` if you want to monitor Saagie and S3 buckets
- IP_HDFS (Required if MONITORING_OPT=`SAAGIE_AND_DATALAKE`) : Namenode IP
- AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_S3_ENDPOINT and AWS_REGION_NAME (Required if MONITORING_OPT=`SAAGIE_AND_S3`)
- SAAGIE_SUM_CRON : Cron schedule used to collect Saagie information from the API (Optional, Default value : `0 * * * *`; see the sketch below)

For an external Postgres database :
- SAAGIE_PG_HOST : PostgreSQL host (Default value : `localhost`)
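
As a quick illustration, here is a minimal sketch of how an app might read the variables documented above at startup. The `load_config` helper and its structure are hypothetical, not part of Saagie Usage Monitoring; only the variable names and defaults come from this README.

```python
import os

def load_config():
    """Hypothetical helper: gather the SUM environment variables described above."""
    opt = os.environ.get("MONITORING_OPT", "SAAGIE")
    config = {
        "monitoring_opt": opt,
        # Hourly by default, as documented above.
        "saagie_sum_cron": os.environ.get("SAAGIE_SUM_CRON", "0 * * * *"),
        "pg_host": os.environ.get("SAAGIE_PG_HOST", "localhost"),
    }
    if opt == "SAAGIE_AND_DATALAKE":
        # Namenode IP is required when monitoring the datalake.
        config["ip_hdfs"] = os.environ["IP_HDFS"]
    elif opt == "SAAGIE_AND_S3":
        # S3 credentials and endpoint are required when monitoring buckets.
        for key in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY",
                    "AWS_S3_ENDPOINT", "AWS_REGION_NAME"):
            config[key.lower()] = os.environ[key]
    return config
```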
36 changes: 36 additions & 0 deletions technologies/app/saagie-usage-monitoring/metadata.yaml
@@ -16,6 +16,42 @@ customFlags: []
readme: /technologies/app/saagie-usage-monitoring

contexts:
  - id: saagie-usage-monitoring-2023-02
    label: For Saagie 2023.02
    releaseNotes: ""
    available: true
    trustLevel: stable
    ports:
      - port: 80
        name: saagie-usage-monitoring
        rewriteUrl: false
        basePath: SAAGIE_BASE_PATH
      - port: 92
        name: ttyd
        rewriteUrl: true
    volumes: ["/opt/grafana", "/var/lib/postgresql/data"]
    dockerInfo:
      image: "saagie/saagie-usage-monitoring"
      baseTag: "2023.02-0.1"
      version: "2023.02-0.1-1.168.0_SDKTECHNO-244"
  - id: saagie-usage-monitoring-2023-03
    label: For Saagie 2023.03
    releaseNotes: ""
    available: true
    trustLevel: stable
    ports:
      - port: 80
        name: saagie-usage-monitoring
        rewriteUrl: false
        basePath: SAAGIE_BASE_PATH
      - port: 92
        name: ttyd
        rewriteUrl: true
    volumes: ["/opt/grafana", "/var/lib/postgresql/data"]
    dockerInfo:
      image: "saagie/saagie-usage-monitoring"
      baseTag: "2023.03-0.1"
      version: "2023.03-0.1-1.168.0_SDKTECHNO-244"
  - id: saagie-usage-monitoring-3.0
    label: For Saagie 3.x
    releaseNotes: ""
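
For orientation, each context above describes one deployable flavour of the app: its exposed ports, persistent volumes, and Docker image coordinates. Below is a minimal, hypothetical sketch (not part of this commit) that loads a metadata.yaml shaped like the excerpt above and lists each context with its Docker tag; it assumes PyYAML is installed and the file lives at the path shown in this diff.

```python
import yaml  # PyYAML

with open("technologies/app/saagie-usage-monitoring/metadata.yaml") as f:
    metadata = yaml.safe_load(f)

# Print one line per context: id, exposed ports, and image coordinates.
for context in metadata.get("contexts", []):
    docker_info = context.get("dockerInfo", {})
    ports = [p["port"] for p in context.get("ports", [])]
    print(context["id"], ports, docker_info.get("baseTag"), docker_info.get("version"))
```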
@@ -0,0 +1,78 @@
#FROM grafana/grafana:9.2.3-ubuntu
FROM grafana/grafana:10.1.2-ubuntu

USER root
# Install Dependencies
RUN apt-get update \
    && apt-get install -y software-properties-common nginx cron \
        wget libpq-dev openjdk-11-jdk ca-certificates-java \
        postgresql postgresql-contrib postgresql-client \
        build-essential cmake git libjson-c-dev libwebsockets-dev sqlite \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update \
    && apt-get install -y python3.9 pip \
    && rm /etc/nginx/sites-enabled/default \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Fix certificate issues
RUN update-ca-certificates -f;

# Hadoop command-line
RUN cd / \
    && mkdir hadoop \
    && cd hadoop \
    && wget -q https://archive.apache.org/dist/hadoop/common/hadoop-2.6.5/hadoop-2.6.5.tar.gz \
    && tar xvf hadoop-2.6.5.tar.gz \
    && rm hadoop-2.6.5.tar.gz \
    && rm -rf hadoop-2.6.5/etc/hadoop \
    && ln -s /etc/hadoop/conf hadoop-2.6.5/etc/hadoop;

# Python dependencies
ADD code/requirements.txt /tmp/requirements.txt
RUN pip install -r /tmp/requirements.txt \
    && rm -rf /root/.cache \
    && rm -rf /root/.cache/pip \
    && rm -rf ~/.cache/pip

# Environment variables
ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64/
ENV HADOOP_HOME=/hadoop/hadoop-2.6.5
ENV HADOOP_CONF_DIR=/hadoop/hadoop-2.6.5/etc/hadoop
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/hadoop/hadoop-2.6.5/lib/native:/usr/lib/jvm/java-11-openjdk-amd64/lib"
ENV CLASSPATH="/etc/hadoop/conf:/hadoop/hadoop-2.6.5/share/hadoop/common/lib/*:/hadoop/hadoop-2.6.5/share/hadoop/common/*:/hadoop/hadoop-2.6.5/share/hadoop/hdfs:/hadoop/hadoop-2.6.5/share/hadoop/hdfs/lib/*:/hadoop/hadoop-2.6.5/share/hadoop/hdfs/*:/hadoop/hadoop-2.6.5/share/hadoop/yarn/lib/*:/hadoop/hadoop-2.6.5/share/hadoop/yarn/*:/hadoop/hadoop-2.6.5/share/hadoop/mapreduce/lib/*:/hadoop/hadoop-2.6.5/share/hadoop/mapreduce/*"
ENV PATH "/hadoop/hadoop-2.6.5/bin:${PATH}"

# Configure PostgreSQL
RUN chown postgres:postgres /run/postgresql/ \
    && chmod 777 /run/postgresql

# Install ttyd
RUN git clone https://github.com/tsl0922/ttyd.git \
    && cd ttyd && mkdir build && cd build \
    && cmake .. \
    && make && make install

# Configure Grafana
RUN mkdir /opt/grafana && mkdir /opt/plugins && mkdir /app && mkdir /var/lib/grafana/dashboards

ADD server.conf /etc/nginx/sites-enabled/grafana.conf
ADD grafana.ini /etc/grafana/grafana.ini

ADD grafana/provisioning /etc/grafana/provisioning
ADD grafana/dashboards /var/lib/grafana/tmp-dashboards

ENV GF_PATHS_DATA /opt/grafana
ENV GF_PATHS_PLUGINS /opt/plugins

ADD update_sqlite.sh /
RUN chmod +x /update_sqlite.sh

ADD code /app
ADD infra.sql infra.sql

RUN grafana-cli --pluginsDir "/opt/plugins" plugins install marcusolsson-treemap-panel

EXPOSE 80 92
ADD entrypoint.sh /entrypoint.sh
ENTRYPOINT ["bash", "/entrypoint.sh"]
@@ -0,0 +1,22 @@
/*
 * SPDX-License-Identifier: Apache-2.0
 *
 * Copyright 2019-2021.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import com.bmuschko.gradle.docker.DockerRemoteApiPlugin
import com.saagie.technologies.SaagieTechnologiesGradlePlugin

apply<DockerRemoteApiPlugin>()
apply<SaagieTechnologiesGradlePlugin>()
@@ -0,0 +1,216 @@
import logging
import sys
import os
import pyarrow as pa
from datetime import datetime
from hdfs import InsecureClient
import utils

monitoring_type = os.environ["MONITORING_OPT"]


def get_datalake_metrics():
    """
    Fetch Metrics from Hadoop API about Datalake usage and save it to PostgreSQL in the supervision Database
    :return:
    """
    with utils.DatabaseUtils() as database_utils:
        hdfs = pa.hdfs.connect(os.environ["IP_HDFS"], port=8020, user="hdfs")
        total_capacity = utils.get_hadoop_capacity(hdfs)
        total_space_used = utils.get_hadoop_space_used(hdfs)
        logging.debug(f"total_capacity : {total_capacity}")
        logging.debug(f"total_space_used : {total_space_used}")
        database_utils.supervision_datalake_to_pg("total_capacity", total_capacity)
        database_utils.supervision_datalake_to_pg("total_used", total_space_used)

        # Get count files
        client_hdfs = InsecureClient("http://" + os.environ["IP_HDFS"] + ":50070", user="hdfs")
        content_root = client_hdfs.list("/", status=True)

        get_metrics_for_folder(client_hdfs=client_hdfs,
                               database_utils=database_utils,
                               folder="/")
        for f in content_root:
            if f[1]['type'] == 'DIRECTORY':
                base_folder = "/" + f[0]
                get_metrics_for_folder(client_hdfs=client_hdfs,
                                       database_utils=database_utils,
                                       folder=base_folder)
                content_data = client_hdfs.list(base_folder, status=True)
                for f in content_data:
                    if f[1]['type'] == 'DIRECTORY':
                        get_metrics_for_folder(client_hdfs=client_hdfs,
                                               database_utils=database_utils,
                                               folder=base_folder + "/" + f[0])


def get_s3_metrics():
    """
    Fetch Metrics from S3 buckets usage and save it to PostgreSQL in the supervision Database
    :return:
    """
    with utils.S3Utils() as s3_utils, utils.DatabaseUtils() as database_utils:
        buckets = s3_utils.get_all_buckets()
        total_size = 0
        total_objects = 0
        for bucket in buckets:
            bucket_size, number_of_objects = s3_utils.get_bucket_size(bucket.name, database_utils)
            total_size += bucket_size
            total_objects += number_of_objects

        database_utils.supervision_s3_to_pg("bucket_size", 'all_buckets', utils.bytes_to_gb(total_size))
        database_utils.supervision_s3_to_pg("bucket_objects", 'all_buckets', total_objects)


def get_metrics_for_folder(client_hdfs, database_utils, folder):
    sub = client_hdfs.content(folder)
    database_utils.supervision_datalake_to_pg(f"Data size {folder}", sub["length"])
    database_utils.supervision_datalake_to_pg(f"File Count {folder}", sub["fileCount"])
    database_utils.supervision_datalake_to_pg(f"Average size file {folder}", utils.get_average_file_size(sub))


def get_saagie_metrics():
    """
    Truncate existing metrics, then fetch metrics from the Saagie API about jobs and instances and save them to
    PostgreSQL in the supervision database
    :return:
    """
    with utils.DatabaseUtils() as database_utils:
        logging.debug("truncate_supervision_saagie_pg finished")
        get_saagie_jobs_metrics(database_utils)


def get_saagie_jobs_metrics(database_utils):
    """
    Fetch metrics from the Saagie API about job and pipeline durations and statuses and save them to PostgreSQL in
    the supervision database
    :return:
    """
    logging.debug("truncate_supervision_saagie_pg starting")
    database_utils.truncate_supervision_saagie_pg()
    today = datetime.now().strftime('%Y-%m-%d')

    with utils.SaagieUtils() as saagie_utils:
        project_list = saagie_utils.get_projects()
        all_projects = []
        for project in project_list:
            logging.debug(f"Getting metrics for project {project['name']}")

            job_list = saagie_utils.get_job_instances(project["id"])
            app_list = saagie_utils.get_apps(project["id"])
            pipeline_list = saagie_utils.get_pipelines(project["id"])

            all_jobs = [{
                'project_id': project["id"],
                'project_name': project["name"],
                'orchestration_type': "job",
                'orchestration_id': job["id"],
                'orchestration_name': job["name"],
                'orchestration_category': job["category"],
                'creation_date': job["creationDate"],
                'instance_count': job["countJobInstance"],
                'technology': job["technology"]["label"] if job["technology"] is not None else None
            } for job in job_list]
            database_utils.supervision_saagie_jobs_to_pg(all_jobs)

            all_apps = [{
                'project_id': project["id"],
                'project_name': project["name"],
                'orchestration_type': "app",
                'orchestration_id': app["id"],
                'orchestration_name': app["name"],
                'creation_date': app["creationDate"],
                'current_status': app["history"]["currentStatus"] if app["history"] is not None else None,
                'start_time': app["history"]["startTime"] if app["history"] is not None else None,
                'stop_time': app["history"]["stopTime"] if app["history"] is not None else None,
                'technology': app["technology"]["label"] if app["technology"] is not None else None
            } for app in app_list]
            database_utils.supervision_saagie_apps_to_pg(all_apps)

            for job in job_list:
                log_instance_metrics(database_utils, job["instances"], job, "job", project["id"], project['name'])

            for pipeline in pipeline_list:
                log_instance_metrics(database_utils, pipeline["instances"], pipeline, "pipeline", project["id"],
                                     project['name'])

            all_projects.append({
                'project_id': project["id"],
                'project_name': project["name"],
                'snapshot_date': today,
                'job_count': len(job_list) + len(app_list)})
        database_utils.supervision_saagie_jobs_snapshot_to_pg(all_projects)


def get_instance_duration(start_time, end_time):
    """
    Compute instance duration based on start and end time
    :param start_time:
    :param end_time:
    :return:
    """
    instance_start_time = utils.parse_instance_timestamp(start_time)
    instance_end_time = utils.parse_instance_timestamp(end_time)
    # Both timestamps must parse successfully; otherwise report a zero duration.
    if instance_start_time and instance_end_time:
        return (instance_end_time - instance_start_time).total_seconds() * 1000
    else:
        return 0
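
# Worked example (added commentary, not in the original source): if
# parse_instance_timestamp yields datetimes one minute apart, say
# 10:00:00 and 10:01:00 on the same day, the function returns
# 60 * 1000 = 60000, i.e. the duration in milliseconds.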


def log_instance_metrics(database_utils, instances, job_or_pipeline, orchestration_type, project_id, project_name):
    """
    For each instance of a job or a pipeline, compute its duration and its Saagie URL and save it to PostgreSQL
    in the supervision Database
    :param database_utils: Instance of database utils to connect to PG
    :param instances: instances of the current job
    :param job_or_pipeline: job_or_pipeline object returned from Saagie API
    :param orchestration_type: indicating whether it's a job or a pipeline
    :param project_id: Saagie Project ID
    :param project_name: Saagie Project Name
    :return:
    """
    now = datetime.now()
    if instances:
        all_instances = [{
            'supervision_timestamp': now,
            'project_id': project_id,
            'project_name': project_name,
            'orchestration_type': orchestration_type,
            'orchestration_id': job_or_pipeline["id"],
            'orchestration_name': job_or_pipeline["name"],
            'instance_id': instance["id"],
            'instance_start_time': instance["startTime"],
            'instance_end_time': instance["endTime"],
            'instance_status': instance["status"],
            'instance_duration': get_instance_duration(instance["startTime"], instance["endTime"]),
            'instance_saagie_url': utils.build_saagie_url(project_id, orchestration_type, job_or_pipeline["id"],
                                                          instance["id"])
        } for instance in instances]

        database_utils.supervision_saagie_to_pg(all_instances)


def main():
    if monitoring_type == "SAAGIE":
        logging.info("Getting saagie metrics")
        get_saagie_metrics()
    elif monitoring_type == "SAAGIE_AND_DATALAKE":
        logging.info("Getting saagie metrics")
        get_saagie_metrics()
        logging.info("Getting datalake metrics")
        get_datalake_metrics()
    elif monitoring_type == "SAAGIE_AND_S3":
        logging.info("Getting saagie metrics")
        get_saagie_metrics()
        logging.info("Getting S3 metrics")
        get_s3_metrics()
    else:
        logging.error("MONITORING_OPT wrong or missing, correct options are : 'SAAGIE', 'SAAGIE_AND_DATALAKE' "
                      "or 'SAAGIE_AND_S3'")
        sys.exit(1)
    logging.info("Metrics successfully gathered")


if __name__ == "__main__":
    logging.getLogger("pyarrow").setLevel(logging.ERROR)
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s [%(levelname)s] %(message)s",
                        datefmt="%d/%m/%Y %H:%M:%S")
    main()
@@ -0,0 +1,6 @@
boto3==1.28.54
urllib3==1.26.5
psycopg2==2.9.2
pyarrow==6.0.1
hdfs
saagieapi==2.6.3