From 868ff1e296a6ea59fa9965011fb1f17d952263e6 Mon Sep 17 00:00:00 2001
From: naveenpaul1
Date: Mon, 6 Nov 2023 19:15:06 +0530
Subject: [PATCH] NSFS | NC | endpoint metrics should be aggregated across forks

Signed-off-by: naveenpaul1
---
 docs/non_containerized_NSFS.md                |   7 +
 src/endpoint/endpoint.js                      |  12 +-
 src/sdk/endpoint_stats_collector.js           |  15 ++-
 .../analytic_services/prometheus_reporting.js | 123 ++++++++++++++++--
 .../base_prometheus_report.js                 |   8 +-
 .../prometheus_reports/nodejs_report.js       |   2 +-
 .../prometheus_reports/noobaa_core_report.js  |   2 +-
 .../noobaa_endpoint_report.js                 |  11 +-
 .../system_services/stats_aggregator.js       |  33 ++++-
 src/test/utils/metrics.js                     |   4 +-
 src/util/fork_utils.js                        |  81 +++++++++++-
 11 files changed, 258 insertions(+), 40 deletions(-)

diff --git a/docs/non_containerized_NSFS.md b/docs/non_containerized_NSFS.md
index 267bfcb837..e26509febe 100644
--- a/docs/non_containerized_NSFS.md
+++ b/docs/non_containerized_NSFS.md
@@ -436,6 +436,13 @@ NSFS management CLI command will create both account and bucket dir if it's miss
 Non containerized NSFS certificates/ directory location will be under the config_root path. The certificates/ directory should contain SSL files tls.key and tls.crt. System will use a cert from this dir to create a valid HTTPS connection. If cert is missing in this dir a self-signed SSL certificate will be generated. Make sure the path to certificates/ directory is valid before running nsfs command, If the path is invalid then cert flow will fail.
 
+Non containerized NSFS restricts insecure HTTP connections when `allow_http` is set to false in config.json. This is not the default behaviour.
+
+## Monitoring
+
+The Prometheus metrics port can be passed through the argument `--metrics_port` when executing the nsfs command.
+NSFS state and output metrics can be fetched from the URL `http://{host}:{metrics_port}/metrics/nsfs_stats`.
+
 ## Log and Logrotate
 Noobaa logs are configured using rsyslog and logrotate. RPM will configure rsyslog and logrotate if both are already running.
 
diff --git a/src/endpoint/endpoint.js b/src/endpoint/endpoint.js
index 2d07fc2d50..6c09535375 100755
--- a/src/endpoint/endpoint.js
+++ b/src/endpoint/endpoint.js
@@ -36,11 +36,11 @@ const server_rpc = require('../server/server_rpc');
 const debug_config = require('../util/debug_config');
 const auth_server = require('../server/common_services/auth_server');
 const system_store = require('../server/system_services/system_store');
-const prom_reporting = require('../server/analytic_services/prometheus_reporting');
 const background_scheduler = require('../util/background_scheduler').get_instance();
 const endpoint_stats_collector = require('../sdk/endpoint_stats_collector');
 const { NamespaceMonitor } = require('../server/bg_services/namespace_monitor');
 const { SemaphoreMonitor } = require('../server/bg_services/semaphore_monitor');
+const prom_reporting = require('../server/analytic_services/prometheus_reporting');
 const cluster = /** @type {import('node:cluster').Cluster} */ (
     /** @type {unknown} */ (require('node:cluster'))
 );
@@ -91,12 +91,12 @@ async function main(options = {}) {
     try {
         // the primary just forks and returns, workers will continue to serve
         fork_count = options.forks ?? 
config.ENDPOINT_FORKS; - if (fork_utils.start_workers(fork_count)) return; + const metrics_port = options.metrics_port || config.EP_METRICS_SERVER_PORT; + if (fork_utils.start_workers(metrics_port, fork_count)) return; const http_port = options.http_port || Number(process.env.ENDPOINT_PORT) || 6001; const https_port = options.https_port || Number(process.env.ENDPOINT_SSL_PORT) || 6443; - const https_port_sts = options.https_port_sts || Number(process.env.ENDPOINT_SSL_STS_PORT) || 7443; - const metrics_port = options.metrics_port || config.EP_METRICS_SERVER_PORT; + const https_port_sts = options.https_port_sts || Number(process.env.ENDPOINT_SSL_PORT_STS) || 7443; const endpoint_group_id = process.env.ENDPOINT_GROUP_ID || 'default-endpoint-group'; const virtual_hosts = Object.freeze( @@ -182,9 +182,9 @@ async function main(options = {}) { await listen_http(https_port_sts, https_server_sts); dbg.log0('Started STS HTTPS successfully'); } - if (metrics_port > 0) { + if (metrics_port > 0 && cluster.isPrimary) { dbg.log0('Starting metrics server', metrics_port); - await prom_reporting.start_server(metrics_port); + await prom_reporting.start_server(metrics_port, false); dbg.log0('Started metrics server successfully'); } diff --git a/src/sdk/endpoint_stats_collector.js b/src/sdk/endpoint_stats_collector.js index 01a6f9b515..050b08d02a 100644 --- a/src/sdk/endpoint_stats_collector.js +++ b/src/sdk/endpoint_stats_collector.js @@ -6,8 +6,12 @@ const mime = require('mime'); const dbg = require('../util/debug_module')(__filename); const prom_report = require('../server/analytic_services/prometheus_reporting'); +const stats_aggregator = require('../server/system_services/stats_aggregator'); const DelayedCollector = require('../util/delayed_collector'); const config = require('../../config'); +const cluster = /** @type {import('node:cluster').Cluster} */ ( + /** @type {unknown} */ (require('node:cluster')) +); /** * @typedef {{ @@ -154,13 +158,14 @@ class EndpointStatsCollector { for (const [k, v] of Object.entries(data.fs_workers_stats ?? 
{})) { dbg.log0(`nsfs stats - FS op=${k} :`, v); } - if (this.rpc_client) { await this.rpc_client.stats.update_nsfs_stats({ nsfs_stats: data }, { timeout: SEND_STATS_TIMEOUT }); + } else { + await stats_aggregator.standalon_update_nsfs_stats(data); } } @@ -350,6 +355,14 @@ class EndpointStatsCollector { this.prom_metrics_report.observe('hub_write_latency', { bucket_name }, hub_write_latency); } } + update_fork_counter() { + // add fork related metrics to prometheus + const code = `worker_${cluster.worker.id}`; + this.prom_metrics_report.inc('fork_counter', {code}); + } +} +if (cluster.isWorker) { + EndpointStatsCollector.instance().update_fork_counter(); } EndpointStatsCollector._instance = null; diff --git a/src/server/analytic_services/prometheus_reporting.js b/src/server/analytic_services/prometheus_reporting.js index f14377d062..c460ec87aa 100644 --- a/src/server/analytic_services/prometheus_reporting.js +++ b/src/server/analytic_services/prometheus_reporting.js @@ -10,6 +10,9 @@ const config = require('../../../config'); const { NodeJsReport } = require('./prometheus_reports/nodejs_report'); const { NooBaaCoreReport } = require('./prometheus_reports/noobaa_core_report'); const { NooBaaEndpointReport } = require('./prometheus_reports/noobaa_endpoint_report'); +const stats_aggregator = require('../system_services/stats_aggregator'); +const AggregatorRegistry = require('prom-client').AggregatorRegistry; +const aggregatorRegistry = new AggregatorRegistry(); // Currenty supported reprots const reports = Object.seal({ @@ -18,6 +21,9 @@ const reports = Object.seal({ endpoint: null // optional }); +let io_stats_complete = {}; +let ops_stats_complete = {}; + function get_nodejs_report() { return reports.nodejs; } @@ -43,6 +49,7 @@ async function export_all_metrics() { async function start_server( port, + fork_enabled, retry_count = config.PROMETHEUS_SERVER_RETRY_COUNT, delay = config.PROMETHEUS_SERVER_RETRY_DELAY ) { @@ -51,20 +58,50 @@ async function start_server( } const server = http.createServer(async (req, res) => { - // Serve all metrics on the root path. - if (req.url === '' || req.url === '/') { - res.writeHead(200, { 'Content-Type': 'text/plain' }); - res.end(await export_all_metrics()); - return; - } - - // Serve report's metrics on the report name path - const report_name = req.url.substr(1); - const report = reports[report_name]; - if (report) { - res.writeHead(200, { 'Content-Type': 'text/plain' }); - res.end(await report.export_metrics(report_name)); - return; + // Serve all metrics on the root path for system that do have one or more fork running. + if (fork_enabled) { + const metrics = await aggregatorRegistry.clusterMetrics(); + if (req.url === '' || req.url === '/') { + res.writeHead(200, { 'Content-Type': aggregatorRegistry.contentType }); + res.end(metrics); + return; + } + if (req.url === '/metrics/nsfs_stats') { + res.writeHead(200, { 'Content-Type': 'text/plain' }); + const nsfs_report = { + nsfs_counters: io_stats_complete, + op_stats_counters: ops_stats_complete, + }; + res.end(JSON.stringify(nsfs_report)); + return; + } + // Serve report's metrics on the report name path + const report_name = req.url.substr(1); + const single_metrics = export_single_metrics(metrics, report_name); + if (single_metrics !== "") { + res.writeHead(200, { 'Content-Type': 'text/plain' }); + res.end(single_metrics); + return; + } + } else { + // Serve all metrics on the root path for system that do not have any fork running. 
+ if (req.url === '' || req.url === '/') { + res.writeHead(200, { 'Content-Type': 'text/plain' }); + res.end(await export_all_metrics()); + return; + } + if (req.url === '/metrics/nsfs_stats') { + res.writeHead(200, { 'Content-Type': 'text/plain' }); + res.end(await metrics_nsfs_stats_handler()); + return; + } + const report_name = req.url.substr(1); + const report = reports[report_name]; + if (report) { + res.writeHead(200, { 'Content-Type': 'text/plain' }); + res.end(await report.export_metrics(report_name)); + return; + } } res.writeHead(404, { 'Content-Type': 'text/plain' }); @@ -94,6 +131,62 @@ async function start_server( } } +async function metrics_nsfs_stats_handler() { + const nsfs_io_stats = {}; + const nsfs_counters = stats_aggregator.get_nsfs_io_stats(false); + // Building the report per io and value + for (const [key, value] of Object.entries(nsfs_counters)) { + nsfs_io_stats[`noobaa_nsfs_io_${key}`.toLowerCase()] = value; + } + + const op_stats_counters = {}; + const op_stats = stats_aggregator.get_op_stats(false); + // Building the report per op name key and value + for (const [op_name, obj] of Object.entries(op_stats)) { + for (const [key, value] of Object.entries(obj)) { + op_stats_counters[`noobaa_nsfs_op_${op_name}_${key}`.toLowerCase()] = value; + } + } + + const nsfs_report = { + nsfs_counters: nsfs_io_stats, + op_stats_counters: op_stats_counters, + }; + dbg.log1(`_create_nsfs_report: nsfs_report ${nsfs_report}`); + return JSON.stringify(nsfs_report); +} + +function export_single_metrics(all_metrics, report_name) { + let single_metrics = ""; + const metrics_arr = all_metrics.split('\n'); + for (const metrics_line of metrics_arr) { + if (metrics_line.includes(report_name)) { + single_metrics = single_metrics + metrics_line + "\n"; + } + } + return single_metrics; + +} + +function set_io_stats(io_stats) { + const nsfs_io_stats = {}; + for (const [key, value] of Object.entries(io_stats)) { + nsfs_io_stats[`noobaa_nsfs_io_${key}`.toLowerCase()] = value; + } + io_stats_complete = nsfs_io_stats; +} + +function set_ops_stats(ops_stats) { + const op_stats_counters = {}; + // Building the report per op name key and value + for (const [op_name, obj] of Object.entries(ops_stats)) { + for (const [key, value] of Object.entries(obj)) { + op_stats_counters[`noobaa_nsfs_op_${op_name}_${key}`.toLowerCase()] = value; + } + } + ops_stats_complete = op_stats_counters; +} + // ----------------------------------------- // exports // ----------------------------------------- @@ -102,3 +195,5 @@ exports.get_core_report = get_core_report; exports.get_endpoint_report = get_endpoint_report; exports.export_all_metrics = export_all_metrics; exports.start_server = start_server; +exports.set_io_stats = set_io_stats; +exports.set_ops_stats = set_ops_stats; diff --git a/src/server/analytic_services/prometheus_reports/base_prometheus_report.js b/src/server/analytic_services/prometheus_reports/base_prometheus_report.js index 70c72afa2d..d0bac2c575 100644 --- a/src/server/analytic_services/prometheus_reports/base_prometheus_report.js +++ b/src/server/analytic_services/prometheus_reports/base_prometheus_report.js @@ -6,15 +6,15 @@ const config = require('../../../../config.js'); class BasePrometheusReport { constructor() { - this._registry = new this.prom_client.Registry(); + this._register = this.prom_client.register; } get prom_client() { return prom_client; } - get registry() { - return this._registry; + get register() { + return this._register; } get metric_prefix() { @@ -30,7 +30,7 @@ class 
BasePrometheusReport { } export_metrics() { - return this.registry.metrics(); + return this.register.metrics(); } } diff --git a/src/server/analytic_services/prometheus_reports/nodejs_report.js b/src/server/analytic_services/prometheus_reports/nodejs_report.js index e93d0810b4..6b1e7ec2bc 100644 --- a/src/server/analytic_services/prometheus_reports/nodejs_report.js +++ b/src/server/analytic_services/prometheus_reports/nodejs_report.js @@ -14,7 +14,7 @@ class NodeJsReport extends BasePrometheusReport { if (this.enabled) { this.prom_client.collectDefaultMetrics({ - register: this.registry, + register: this.register, prefix: this.metric_prefix }); } diff --git a/src/server/analytic_services/prometheus_reports/noobaa_core_report.js b/src/server/analytic_services/prometheus_reports/noobaa_core_report.js index cf2d312f4a..b27313b4c9 100644 --- a/src/server/analytic_services/prometheus_reports/noobaa_core_report.js +++ b/src/server/analytic_services/prometheus_reports/noobaa_core_report.js @@ -407,7 +407,7 @@ class NooBaaCoreReport extends BasePrometheusReport { } this._metrics[m.name] = new this.prom_client[m.type]({ name: this.get_prefixed_name(m.name), - registers: [this.registry], + registers: [this.register], ...m.configuration, }); } diff --git a/src/server/analytic_services/prometheus_reports/noobaa_endpoint_report.js b/src/server/analytic_services/prometheus_reports/noobaa_endpoint_report.js index 6aa2a4b1b6..d1414876b2 100644 --- a/src/server/analytic_services/prometheus_reports/noobaa_endpoint_report.js +++ b/src/server/analytic_services/prometheus_reports/noobaa_endpoint_report.js @@ -227,6 +227,15 @@ const NOOBAA_ENDPOINT_METRICS = js_utils.deep_freeze([{ total_values = 0; }, }, + { + type: 'Counter', + name: 'fork_counter', + configuration: { + help: 'Counter on number of fork hit', + labelNames: ['code'] + }, + aggregator: 'average', + } ]); class NooBaaEndpointReport extends BasePrometheusReport { @@ -241,7 +250,7 @@ class NooBaaEndpointReport extends BasePrometheusReport { collect: m.collect, prom_instance: new this.prom_client[m.type]({ name: this.get_prefixed_name(m.name), - registers: [this.registry], + registers: [this.register], ...m.configuration, collect() { if (m.collect && this.average_intervals) { diff --git a/src/server/system_services/stats_aggregator.js b/src/server/system_services/stats_aggregator.js index d00f57ee3b..48f6751791 100644 --- a/src/server/system_services/stats_aggregator.js +++ b/src/server/system_services/stats_aggregator.js @@ -28,6 +28,10 @@ const prom_reporting = require('../analytic_services/prometheus_reporting'); const { HistoryDataStore } = require('../analytic_services/history_data_store'); const addr_utils = require('../../util/addr_utils'); const Quota = require('../system_services/objects/quota'); +// these type hacks are needed because the type info from require('node:cluster') is incorrect +const cluster_module = /** @type {import('node:cluster').Cluster} */ ( + /** @type {unknown} */ (require('node:cluster')) +); const ops_aggregation = {}; @@ -1246,6 +1250,16 @@ async function update_nsfs_stats(req) { if (_nsfs_counters.fs_workers_stats) _update_fs_stats(_nsfs_counters.fs_workers_stats); } +async function standalon_update_nsfs_stats(_nsfs_counters = {}) { + dbg.log1(`standalon_update_nsfs_stats. 
nsfs_stats =`, _nsfs_counters); + if (_nsfs_counters.io_stats) _update_io_stats(_nsfs_counters.io_stats); + if (_nsfs_counters.op_stats) _update_ops_stats(_nsfs_counters.op_stats); + if (cluster_module.isWorker) { + process.send({ io_stats: _nsfs_counters.io_stats }); + process.send({ op_stats: _nsfs_counters.op_stats }); + } +} + function _update_io_stats(io_stats) { //Go over the io_stats and count for (const [key, value] of Object.entries(io_stats)) { @@ -1355,23 +1369,29 @@ function _new_namespace_nsfs_stats() { } // Will return the current nsfs_io_counters and reset it. -function get_nsfs_io_stats() { +function get_nsfs_io_stats(reset_nsfs_counters = true) { const nsfs_io_stats = nsfs_io_counters; - nsfs_io_counters = _new_namespace_nsfs_stats(); + if (reset_nsfs_counters) { + nsfs_io_counters = _new_namespace_nsfs_stats(); + } return nsfs_io_stats; } // Will return the current op_stats and reset it. -function get_op_stats() { +function get_op_stats(reset_nsfs_counters = true) { const nsfs_op_stats = op_stats; - op_stats = {}; + if (reset_nsfs_counters) { + op_stats = {}; + } return nsfs_op_stats; } // Will return the current fs_workers_stats and reset it. -function get_fs_workers_stats() { +function get_fs_workers_stats(reset_nsfs_counters = true) { const nsfs_fs_workers_stats = fs_workers_stats; - fs_workers_stats = {}; + if (reset_nsfs_counters) { + fs_workers_stats = {}; + } return nsfs_fs_workers_stats; } @@ -1401,3 +1421,4 @@ exports.object_usage_scrubber = object_usage_scrubber; exports.send_stats = background_worker; exports.background_worker = background_worker; exports.update_nsfs_stats = update_nsfs_stats; +exports.standalon_update_nsfs_stats = standalon_update_nsfs_stats; diff --git a/src/test/utils/metrics.js b/src/test/utils/metrics.js index 701de185b5..d4f2661fba 100644 --- a/src/test/utils/metrics.js +++ b/src/test/utils/metrics.js @@ -4,12 +4,12 @@ // Get metric from prometheus collector function get_metric(stat_collector, name) { const metric_name = stat_collector.get_prefixed_name(name); - return stat_collector.registry.getSingleMetric(metric_name); + return stat_collector.register.getSingleMetric(metric_name); } // Reset all metrics in prometheus collector function reset_metrics(stat_collector) { - return stat_collector.registry.resetMetrics(); + return stat_collector.register.resetMetrics(); } exports.get_metric = get_metric; diff --git a/src/util/fork_utils.js b/src/util/fork_utils.js index 97c3a9eeaf..09f6b338ed 100644 --- a/src/util/fork_utils.js +++ b/src/util/fork_utils.js @@ -5,7 +5,18 @@ const cluster = /** @type {import('node:cluster').Cluster} */ ( /** @type {unknown} */ (require('node:cluster')) ); +const dbg = require('../util/debug_module')(__filename); +const prom_reporting = require('../server/analytic_services/prometheus_reporting'); + +const io_stats = { + read_count: 0, + write_count: 0, + read_bytes: 0, + write_bytes: 0, +}; + +const op_stats = {}; /** * The cluster module allows easy creation of child processes that all share server ports. * When count > 0 the primary process will fork worker processes to process incoming http requests. @@ -13,10 +24,10 @@ const cluster = /** @type {import('node:cluster').Cluster} */ ( * @see https://nodejs.org/api/cluster.html * * @param {number?} count number of workers to start. + * @param {number?} metrics_port prometheus metris port. * @returns {boolean} true if workers were started. 
 */
-function start_workers(count = 0) {
-
+function start_workers(metrics_port, count = 0) {
     if (cluster.isPrimary && count > 0) {
         for (let i = 0; i < count; ++i) {
             const worker = cluster.fork();
@@ -31,12 +42,74 @@ function start_workers(count = 0) {
             console.error('EXIT ON WORKER ERROR');
             process.exit(1);
         });
-
+        for (const id in cluster.workers) {
+            if (id) {
+                cluster.workers[id].on('message', nsfs_io_state_handler);
+            }
+        }
+        if (metrics_port > 0) {
+            dbg.log0('Starting metrics server', metrics_port);
+            prom_reporting.start_server(metrics_port, true);
+            dbg.log0('Started metrics server successfully');
+        }
         return true;
     }
     return false;
 }
 
-exports.cluster = cluster;
+function nsfs_io_state_handler(msg) {
+    if (msg.io_stats) {
+        for (const [key, value] of Object.entries(msg.io_stats)) {
+            io_stats[key] += value;
+        }
+        prom_reporting.set_io_stats(io_stats);
+    }
+    if (msg.op_stats) {
+        _update_ops_stats(msg.op_stats);
+        prom_reporting.set_ops_stats(op_stats);
+    }
+}
+
+function _update_ops_stats(ops_stats) {
+    // Predefined op_names
+    const op_names = [
+        `upload_object`,
+        `delete_object`,
+        `create_bucket`,
+        `list_buckets`,
+        `delete_bucket`,
+        `list_objects`,
+        `head_object`,
+        `read_object`,
+        `initiate_multipart`,
+        `upload_part`,
+        `complete_object_upload`,
+    ];
+    // Go over the op_stats
+    for (const op_name of op_names) {
+        if (op_name in ops_stats) {
+            _set_op_stats(op_name, ops_stats[op_name]);
+        }
+    }
+}
+
+function _set_op_stats(op_name, stats) {
+    // In the event that all of the same ops are failing (count = error_count) we will not measure the op times,
+    // as this is intended as a timing measure and not a counter.
+    if (op_stats[op_name]) {
+        const count = op_stats[op_name].count + stats.count;
+        const error_count = op_stats[op_name].error_count + stats.error_count;
+        op_stats[op_name] = {
+            count,
+            error_count,
+        };
+    } else if (stats.count > stats.error_count) {
+        op_stats[op_name] = {
+            count: stats.count,
+            error_count: stats.error_count,
+        };
+    }
+}
+
 exports.start_workers = start_workers;
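
For reference, the fork-aggregation pattern this patch implements can be reduced to a short standalone sketch: each worker keeps its own prom-client registry and pushes plain JSON counters to the primary over process.send(), while the primary merges the worker registries with prom-client's AggregatorRegistry and serves both views from a single metrics server. This is only an illustration, not the NooBaa code: the port numbers, the fork count, the demo fork_counter metric and the fake io_stats delta below are all made up for the example, and the only dependency assumed is the prom-client package the patch already uses.

'use strict';

const http = require('http');
const cluster = require('cluster');
const { AggregatorRegistry, Counter, register } = require('prom-client');

const METRICS_PORT = 9300; // stand-in for --metrics_port / config.EP_METRICS_SERVER_PORT
const FORK_COUNT = 2;      // stand-in for config.ENDPOINT_FORKS

if (cluster.isPrimary) {
    // Plain JSON counters summed across forks (the /metrics/nsfs_stats side).
    const io_stats = { read_count: 0, write_count: 0, read_bytes: 0, write_bytes: 0 };

    for (let i = 0; i < FORK_COUNT; ++i) cluster.fork();

    // Every worker reports deltas over IPC; the primary only accumulates them.
    for (const id in cluster.workers) {
        cluster.workers[id].on('message', msg => {
            if (!msg.io_stats) return; // ignore prom-client's own aggregation messages
            for (const [key, value] of Object.entries(msg.io_stats)) {
                io_stats[key] += value;
            }
        });
    }

    // prom-client aggregates the default registries of all workers over IPC.
    const aggregator_registry = new AggregatorRegistry();

    http.createServer(async (req, res) => {
        if (req.url === '/metrics/nsfs_stats') {
            res.writeHead(200, { 'Content-Type': 'text/plain' });
            res.end(JSON.stringify({ nsfs_counters: io_stats }));
            return;
        }
        // Root path: merged Prometheus metrics collected from every fork.
        const metrics = await aggregator_registry.clusterMetrics();
        res.writeHead(200, { 'Content-Type': aggregator_registry.contentType });
        res.end(metrics);
    }).listen(METRICS_PORT);

} else {
    // Worker side: a per-fork counter, labelled so each fork stays visible after aggregation.
    const fork_counter = new Counter({
        name: 'fork_counter',
        help: 'Requests handled per fork',
        labelNames: ['code'],
        registers: [register],
    });

    // All workers listen on the same port; the cluster module load-balances between them.
    http.createServer((req, res) => {
        fork_counter.inc({ code: `worker_${cluster.worker.id}` });
        // Report a fake IO delta to the primary, the way the nsfs stats flow over IPC in the patch.
        process.send({ io_stats: { read_count: 1, write_count: 0, read_bytes: 0, write_bytes: 0 } });
        res.end('ok\n');
    }).listen(6001);
}

With this running, `curl http://localhost:9300/` returns the Prometheus exposition aggregated over both forks (fork_counter reported per worker label), and `curl http://localhost:9300/metrics/nsfs_stats` returns the JSON counters the primary accumulated from worker messages, which mirrors the two paths start_server() exposes in prometheus_reporting.js when fork_enabled is true.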