From eb216a1f4bbb26e1f18537b30d22e8ad8711f42c Mon Sep 17 00:00:00 2001 From: Costa Tsaousis Date: Mon, 9 May 2022 16:34:31 +0300 Subject: [PATCH] Workers utilization charts (#12807) * initial version of worker utilization * working example * without mutexes * monitoring DBENGINE, ACLKSYNC, WEB workers * added charts to monitor worker usage * fixed charts units * updated contexts * updated priorities * added documentation * converted threads to stacked chart * One query per query thread * Revert "One query per query thread" This reverts commit 6aeb391f5987c3c6ba2864b559fd7f0cd64b14d3. * fixed priority for web charts * read worker cpu utilization from proc * read workers cpu utilization via /proc/self/task/PID/stat, so that we have cpu utilization even when the jobs are too long to finish within our update_every frequency * disabled web server cpu utilization monitoring - it is now monitored by worker utilization * tight integration of worker utilization to web server * monitoring statsd worker threads * code cleanup and renaming of variables * constrained worker and statistics conflict to just one variable * support for rendering jobs per type * better priorities and removed the total jobs chart * added busy time in ms per job type * added proc.plugin monitoring, switch clock to MONOTONIC_RAW if available, global statistics now cleans up old worker threads * isolated worker thread families * added cgroups.plugin workers * remove unneeded dimensions when the expected worker is just one * plugins.d and streaming monitoring * rebased; support worker_is_busy() to be called one after another * added diskspace plugin monitoring * added tc.plugin monitoring * added ML threads monitoring * don't create dimensions and charts that are not needed * fix crash when job types are added on the fly * added timex and idlejitter plugins; collected heartbeat statistics; reworked heartbeat according to POSIX * the right name is heartbeat for this chart * monitor streaming senders * 
added streaming senders to global stats * prevent division by zero * added clock_init() to external C plugins * added freebsd and macos plugins * added freebsd and macos to global statistics * dont use new as a variable; address compiler warnings on FreeBSD and MacOS * refactored contexts to be unique; added health threads monitoring Co-authored-by: Stelios Fragkakis <52996999+stelfrag@users.noreply.github.com> --- CMakeLists.txt | 2 + Makefile.am | 2 + aclk/aclk_query.c | 16 + collectors/all.h | 4 +- collectors/apps.plugin/apps_plugin.c | 2 + collectors/cgroups.plugin/sys_fs_cgroup.c | 102 +- collectors/cups.plugin/cups_plugin.c | 1 + .../diskspace.plugin/plugin_diskspace.c | 94 +- collectors/ebpf.plugin/ebpf.c | 2 + collectors/freebsd.plugin/plugin_freebsd.c | 101 +- collectors/freeipmi.plugin/freeipmi_plugin.c | 1 + .../idlejitter.plugin/plugin_idlejitter.c | 7 + collectors/macos.plugin/plugin_macos.c | 100 +- collectors/nfacct.plugin/plugin_nfacct.c | 1 + collectors/perf.plugin/perf_plugin.c | 1 + collectors/plugins.d/plugins_d.c | 5 + collectors/proc.plugin/plugin_proc.c | 123 +- collectors/slabinfo.plugin/slabinfo.c | 1 + collectors/statsd.plugin/statsd.c | 169 ++- collectors/tc.plugin/plugin_tc.c | 89 +- collectors/timex.plugin/plugin_timex.c | 72 +- collectors/xenstat.plugin/xenstat_plugin.c | 1 + configure.ac | 1 + daemon/global_statistics.c | 1305 ++++++++++++----- database/engine/rrdengine.c | 33 +- database/sqlite/sqlite_aclk.c | 37 + database/sqlite/sqlite_aclk.h | 6 +- health/health.c | 41 + libnetdata/Makefile.am | 1 + libnetdata/clocks/clocks.c | 245 +++- libnetdata/clocks/clocks.h | 25 +- libnetdata/libnetdata.h | 1 + libnetdata/worker_utilization/Makefile.am | 8 + libnetdata/worker_utilization/README.md | 58 + .../worker_utilization/worker_utilization.c | 201 +++ .../worker_utilization/worker_utilization.h | 22 + ml/Host.cc | 32 + parser/parser.c | 7 + parser/parser.h | 2 + streaming/receiver.c | 4 + streaming/sender.c | 96 +- 
web/server/static/static-threaded.c | 145 +- 42 files changed, 2070 insertions(+), 1096 deletions(-) create mode 100644 libnetdata/worker_utilization/Makefile.am create mode 100644 libnetdata/worker_utilization/README.md create mode 100644 libnetdata/worker_utilization/worker_utilization.c create mode 100644 libnetdata/worker_utilization/worker_utilization.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 553b3656aa1292..386e016acfe92c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -410,6 +410,8 @@ set(LIBNETDATA_FILES libnetdata/string/utf8.h libnetdata/socket/security.c libnetdata/socket/security.h + libnetdata/worker_utilization/worker_utilization.c + libnetdata/worker_utilization/worker_utilization.h libnetdata/circular_buffer/circular_buffer.c libnetdata/circular_buffer/circular_buffer.h) diff --git a/Makefile.am b/Makefile.am index 6dc5f204c35683..4a627c25d20f4b 100644 --- a/Makefile.am +++ b/Makefile.am @@ -187,6 +187,8 @@ LIBNETDATA_FILES = \ libnetdata/health/health.c \ libnetdata/health/health.h \ libnetdata/string/utf8.h \ + libnetdata/worker_utilization/worker_utilization.c \ + libnetdata/worker_utilization/worker_utilization.h \ $(NULL) if ENABLE_PLUGIN_EBPF diff --git a/aclk/aclk_query.c b/aclk/aclk_query.c index 058f6596a6535f..bd3b7a572c470b 100644 --- a/aclk/aclk_query.c +++ b/aclk/aclk_query.c @@ -351,6 +351,8 @@ static void aclk_query_process_msg(struct aclk_query_thread *query_thr, aclk_que { for (int i = 0; aclk_query_handlers[i].type != UNKNOWN; i++) { if (aclk_query_handlers[i].type == query->type) { + worker_is_busy(i); + debug(D_ACLK, "Processing Queued Message of type: \"%s\"", aclk_query_handlers[i].name); aclk_query_handlers[i].fnc(query_thr, query); if (aclk_stats_enabled) { @@ -361,6 +363,8 @@ static void aclk_query_process_msg(struct aclk_query_thread *query_thr, aclk_que ACLK_STATS_UNLOCK; } aclk_query_free(query); + + worker_is_idle(); return; } } @@ -378,21 +382,33 @@ int aclk_query_process_msgs(struct aclk_query_thread 
*query_thr) return 0; } +static void worker_aclk_register(void) { + worker_register("ACLKQUERY"); + for (int i = 0; aclk_query_handlers[i].type != UNKNOWN; i++) { + worker_register_job_name(i, aclk_query_handlers[i].name); + } +} + /** * Main query processing thread */ void *aclk_query_main_thread(void *ptr) { + worker_aclk_register(); + struct aclk_query_thread *query_thr = ptr; while (!netdata_exit) { aclk_query_process_msgs(query_thr); + worker_is_idle(); QUERY_THREAD_LOCK; if (unlikely(pthread_cond_wait(&query_cond_wait, &query_lock_wait))) sleep_usec(USEC_PER_SEC * 1); QUERY_THREAD_UNLOCK; } + + worker_unregister(); return NULL; } diff --git a/collectors/all.h b/collectors/all.h index 61f3c01bff2825..3d7304dd55e2da 100644 --- a/collectors/all.h +++ b/collectors/all.h @@ -360,10 +360,8 @@ #define NETDATA_CHART_PRIO_CHECKS 99999 -#define NETDATA_CHART_PRIO_NETDATA_DISKSPACE 132020 #define NETDATA_CHART_PRIO_NETDATA_TIMEX 132030 -#define NETDATA_CHART_PRIO_NETDATA_TC_CPU 135000 -#define NETDATA_CHART_PRIO_NETDATA_TC_TIME 135001 +#define NETDATA_CHART_PRIO_NETDATA_TC_TIME 1000100 #endif //NETDATA_ALL_H diff --git a/collectors/apps.plugin/apps_plugin.c b/collectors/apps.plugin/apps_plugin.c index 6924b2bf4e57a6..b4853d3545791e 100644 --- a/collectors/apps.plugin/apps_plugin.c +++ b/collectors/apps.plugin/apps_plugin.c @@ -4124,6 +4124,8 @@ static int check_capabilities() { int main(int argc, char **argv) { // debug_flags = D_PROCFILE; + clocks_init(); + pagesize = (size_t)sysconf(_SC_PAGESIZE); // set the name for logging diff --git a/collectors/cgroups.plugin/sys_fs_cgroup.c b/collectors/cgroups.plugin/sys_fs_cgroup.c index 9453d1b712561f..bf78624f9995ba 100644 --- a/collectors/cgroups.plugin/sys_fs_cgroup.c +++ b/collectors/cgroups.plugin/sys_fs_cgroup.c @@ -2646,11 +2646,26 @@ static inline void discovery_process_cgroup(struct cgroup *cg) { read_cgroup_network_interfaces(cg); } +#define WORKER_DISCOVERY_INIT 0 +#define WORKER_DISCOVERY_FIND 1 +#define 
WORKER_DISCOVERY_PROCESS 2 +#define WORKER_DISCOVERY_UPDATE 3 +#define WORKER_DISCOVERY_CLEANUP 4 +#define WORKER_DISCOVERY_COPY 5 +#define WORKER_DISCOVERY_SHARE 6 +#define WORKER_DISCOVERY_LOCK 7 + +#if WORKER_UTILIZATION_MAX_JOB_TYPES < 8 +#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 8 +#endif + static inline void discovery_find_all_cgroups() { debug(D_CGROUP, "searching for cgroups"); + worker_is_busy(WORKER_DISCOVERY_INIT); discovery_mark_all_cgroups_as_unavailable(); + worker_is_busy(WORKER_DISCOVERY_FIND); if (!cgroup_use_unified_cgroups) { discovery_find_all_cgroups_v1(); } else { @@ -2659,16 +2674,25 @@ static inline void discovery_find_all_cgroups() { struct cgroup *cg; for (cg = discovered_cgroup_root; cg; cg = cg->discovered_next) { + worker_is_busy(WORKER_DISCOVERY_PROCESS); discovery_process_cgroup(cg); } + worker_is_busy(WORKER_DISCOVERY_UPDATE); discovery_update_filenames(); + worker_is_busy(WORKER_DISCOVERY_LOCK); uv_mutex_lock(&cgroup_root_mutex); + + worker_is_busy(WORKER_DISCOVERY_CLEANUP); discovery_cleanup_all_cgroups(); + + worker_is_busy(WORKER_DISCOVERY_COPY); discovery_copy_discovered_cgroups_to_reader(); + uv_mutex_unlock(&cgroup_root_mutex); + worker_is_busy(WORKER_DISCOVERY_SHARE); discovery_share_cgroups_with_ebpf(); debug(D_CGROUP, "done searching for cgroups"); @@ -2678,7 +2702,19 @@ void cgroup_discovery_worker(void *ptr) { UNUSED(ptr); + worker_register("CGROUPSDISC"); + worker_register_job_name(WORKER_DISCOVERY_INIT, "init"); + worker_register_job_name(WORKER_DISCOVERY_FIND, "find"); + worker_register_job_name(WORKER_DISCOVERY_PROCESS, "process"); + worker_register_job_name(WORKER_DISCOVERY_UPDATE, "update"); + worker_register_job_name(WORKER_DISCOVERY_CLEANUP, "cleanup"); + worker_register_job_name(WORKER_DISCOVERY_COPY, "copy"); + worker_register_job_name(WORKER_DISCOVERY_SHARE, "share"); + worker_register_job_name(WORKER_DISCOVERY_LOCK, "lock"); + while (!netdata_exit) { + worker_is_idle(); + 
uv_mutex_lock(&discovery_thread.mutex); while (!discovery_thread.start_discovery) uv_cond_wait(&discovery_thread.cond_var, &discovery_thread.mutex); @@ -2692,6 +2728,7 @@ void cgroup_discovery_worker(void *ptr) } discovery_thread.exited = 1; + worker_unregister(); } // ---------------------------------------------------------------------------- @@ -4650,6 +4687,8 @@ void update_cgroup_charts(int update_every) { // cgroups main static void cgroup_main_cleanup(void *ptr) { + worker_unregister(); + struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr; static_thread->enabled = NETDATA_MAIN_THREAD_EXITING; @@ -4687,24 +4726,30 @@ static void cgroup_main_cleanup(void *ptr) { static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; } +#define WORKER_CGROUPS_LOCK 0 +#define WORKER_CGROUPS_READ 1 +#define WORKER_CGROUPS_CHART 2 + +#if WORKER_UTILIZATION_MAX_JOB_TYPES < 3 +#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 3 +#endif + void *cgroups_main(void *ptr) { - netdata_thread_cleanup_push(cgroup_main_cleanup, ptr); + worker_register("CGROUPS"); + worker_register_job_name(WORKER_CGROUPS_LOCK, "lock"); + worker_register_job_name(WORKER_CGROUPS_READ, "read"); + worker_register_job_name(WORKER_CGROUPS_READ, "chart"); - struct rusage thread; + netdata_thread_cleanup_push(cgroup_main_cleanup, ptr); if (getenv("KUBERNETES_SERVICE_HOST") != NULL && getenv("KUBERNETES_SERVICE_PORT") != NULL) { is_inside_k8s = 1; cgroup_enable_cpuacct_cpu_shares = CONFIG_BOOLEAN_YES; } - // when ZERO, attempt to do it - int vdo_cpu_netdata = config_get_boolean("plugin:cgroups", "cgroups plugin resource charts", 1); - read_cgroup_plugin_configuration(); netdata_cgroup_ebpf_initialize_shm(); - RRDSET *stcpu_thread = NULL; - if (uv_mutex_init(&cgroup_root_mutex)) { error("CGROUP: cannot initialize mutex for the main cgroup list"); goto exit; @@ -4736,6 +4781,8 @@ void *cgroups_main(void *ptr) { usec_t find_every = cgroup_check_for_new_every * USEC_PER_SEC, find_dt 
= 0; while(!netdata_exit) { + worker_is_idle(); + usec_t hb_dt = heartbeat_next(&hb, step); if(unlikely(netdata_exit)) break; @@ -4747,46 +4794,21 @@ void *cgroups_main(void *ptr) { cgroups_check = 0; } + worker_is_busy(WORKER_CGROUPS_LOCK); uv_mutex_lock(&cgroup_root_mutex); - read_all_discovered_cgroups(cgroup_root); - update_cgroup_charts(cgroup_update_every); - uv_mutex_unlock(&cgroup_root_mutex); - - // -------------------------------------------------------------------- - - if(vdo_cpu_netdata) { - getrusage(RUSAGE_THREAD, &thread); - if(unlikely(!stcpu_thread)) { - - stcpu_thread = rrdset_create_localhost( - "netdata" - , "plugin_cgroups_cpu" - , NULL - , "cgroups" - , NULL - , "Netdata CGroups Plugin CPU usage" - , "milliseconds/s" - , PLUGIN_CGROUPS_NAME - , "stats" - , 132000 - , cgroup_update_every - , RRDSET_TYPE_STACKED - ); + worker_is_busy(WORKER_CGROUPS_READ); + read_all_discovered_cgroups(cgroup_root); - rrddim_add(stcpu_thread, "user", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL); - rrddim_add(stcpu_thread, "system", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL); - } - else - rrdset_next(stcpu_thread); + worker_is_busy(WORKER_CGROUPS_CHART); + update_cgroup_charts(cgroup_update_every); - rrddim_set(stcpu_thread, "user" , thread.ru_utime.tv_sec * 1000000ULL + thread.ru_utime.tv_usec); - rrddim_set(stcpu_thread, "system", thread.ru_stime.tv_sec * 1000000ULL + thread.ru_stime.tv_usec); - rrdset_done(stcpu_thread); - } + worker_is_idle(); + uv_mutex_unlock(&cgroup_root_mutex); } exit: + worker_unregister(); netdata_thread_cleanup_pop(1); return NULL; } diff --git a/collectors/cups.plugin/cups_plugin.c b/collectors/cups.plugin/cups_plugin.c index cc57dbf1f5344f..46bbc19bb83946 100644 --- a/collectors/cups.plugin/cups_plugin.c +++ b/collectors/cups.plugin/cups_plugin.c @@ -224,6 +224,7 @@ void reset_metrics() { } int main(int argc, char **argv) { + clocks_init(); // ------------------------------------------------------------------------ // initialization of 
netdata plugin diff --git a/collectors/diskspace.plugin/plugin_diskspace.c b/collectors/diskspace.plugin/plugin_diskspace.c index b6a52c0611b179..13806277c35efd 100644 --- a/collectors/diskspace.plugin/plugin_diskspace.c +++ b/collectors/diskspace.plugin/plugin_diskspace.c @@ -365,6 +365,8 @@ static inline void do_disk_space_stats(struct mountinfo *mi, int update_every) { } static void diskspace_main_cleanup(void *ptr) { + worker_unregister(); + struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr; static_thread->enabled = NETDATA_MAIN_THREAD_EXITING; @@ -373,10 +375,21 @@ static void diskspace_main_cleanup(void *ptr) { static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; } +#define WORKER_JOB_MOUNTINFO 0 +#define WORKER_JOB_MOUNTPOINT 1 +#define WORKER_JOB_CLEANUP 2 + +#if WORKER_UTILIZATION_MAX_JOB_TYPES < 3 +#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 3 +#endif + void *diskspace_main(void *ptr) { - netdata_thread_cleanup_push(diskspace_main_cleanup, ptr); + worker_register("DISKSPACE"); + worker_register_job_name(WORKER_JOB_MOUNTINFO, "mountinfo"); + worker_register_job_name(WORKER_JOB_MOUNTPOINT, "mountpoint"); + worker_register_job_name(WORKER_JOB_CLEANUP, "cleanup"); - int vdo_cpu_netdata = config_get_boolean("plugin:proc", "netdata server resources", 1); + netdata_thread_cleanup_push(diskspace_main_cleanup, ptr); cleanup_mount_points = config_get_boolean(CONFIG_SECTION_DISKSPACE, "remove charts of unmounted disks" , cleanup_mount_points); @@ -388,14 +401,11 @@ void *diskspace_main(void *ptr) { if(check_for_new_mountpoints_every < update_every) check_for_new_mountpoints_every = update_every; - struct rusage thread; - - usec_t duration = 0; usec_t step = update_every * USEC_PER_SEC; heartbeat_t hb; heartbeat_init(&hb); while(!netdata_exit) { - duration = heartbeat_monotonic_dt_to_now_usec(&hb); + worker_is_idle(); /* usec_t hb_dt = */ heartbeat_next(&hb, step); if(unlikely(netdata_exit)) break; @@ -404,9 +414,9 @@ 
void *diskspace_main(void *ptr) { // -------------------------------------------------------------------------- // this is smart enough not to reload it every time + worker_is_busy(WORKER_JOB_MOUNTINFO); mountinfo_reload(0); - // -------------------------------------------------------------------------- // disk space metrics @@ -420,80 +430,20 @@ void *diskspace_main(void *ptr) { if(mi->flags & MOUNTINFO_READONLY && !strcmp(mi->root, mi->mount_point)) continue; + worker_is_busy(WORKER_JOB_MOUNTPOINT); do_disk_space_stats(mi, update_every); if(unlikely(netdata_exit)) break; } if(unlikely(netdata_exit)) break; - if(dict_mountpoints) + if(dict_mountpoints) { + worker_is_busy(WORKER_JOB_CLEANUP); dictionary_get_all(dict_mountpoints, mount_point_cleanup, NULL); - - if(vdo_cpu_netdata) { - static RRDSET *stcpu_thread = NULL, *st_duration = NULL; - static RRDDIM *rd_user = NULL, *rd_system = NULL, *rd_duration = NULL; - - // ---------------------------------------------------------------- - - getrusage(RUSAGE_THREAD, &thread); - - if(unlikely(!stcpu_thread)) { - stcpu_thread = rrdset_create_localhost( - "netdata" - , "plugin_diskspace" - , NULL - , "diskspace" - , NULL - , "Netdata Disk Space Plugin CPU usage" - , "milliseconds/s" - , PLUGIN_DISKSPACE_NAME - , NULL - , NETDATA_CHART_PRIO_NETDATA_DISKSPACE - , update_every - , RRDSET_TYPE_STACKED - ); - - rd_user = rrddim_add(stcpu_thread, "user", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL); - rd_system = rrddim_add(stcpu_thread, "system", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL); - } - else - rrdset_next(stcpu_thread); - - rrddim_set_by_pointer(stcpu_thread, rd_user, thread.ru_utime.tv_sec * 1000000ULL + thread.ru_utime.tv_usec); - rrddim_set_by_pointer(stcpu_thread, rd_system, thread.ru_stime.tv_sec * 1000000ULL + thread.ru_stime.tv_usec); - rrdset_done(stcpu_thread); - - // ---------------------------------------------------------------- - - if(unlikely(!st_duration)) { - st_duration = rrdset_create_localhost( - 
"netdata" - , "plugin_diskspace_dt" - , NULL - , "diskspace" - , NULL - , "Netdata Disk Space Plugin Duration" - , "milliseconds/run" - , PLUGIN_DISKSPACE_NAME - , NULL - , 132021 - , update_every - , RRDSET_TYPE_AREA - ); - - rd_duration = rrddim_add(st_duration, "duration", NULL, 1, 1000, RRD_ALGORITHM_ABSOLUTE); - } - else - rrdset_next(st_duration); - - rrddim_set_by_pointer(st_duration, rd_duration, duration); - rrdset_done(st_duration); - - // ---------------------------------------------------------------- - - if(unlikely(netdata_exit)) break; } + } + worker_unregister(); netdata_thread_cleanup_pop(1); return NULL; diff --git a/collectors/ebpf.plugin/ebpf.c b/collectors/ebpf.plugin/ebpf.c index eb9332273e3ced..9f411bb8bf2e27 100644 --- a/collectors/ebpf.plugin/ebpf.c +++ b/collectors/ebpf.plugin/ebpf.c @@ -1864,6 +1864,8 @@ static void ebpf_manage_pid(pid_t pid) */ int main(int argc, char **argv) { + clocks_init(); + set_global_variables(); ebpf_parse_args(argc, argv); ebpf_manage_pid(getpid()); diff --git a/collectors/freebsd.plugin/plugin_freebsd.c b/collectors/freebsd.plugin/plugin_freebsd.c index 97ca1d9ae00642..a52ece3f94e84a 100644 --- a/collectors/freebsd.plugin/plugin_freebsd.c +++ b/collectors/freebsd.plugin/plugin_freebsd.c @@ -9,7 +9,6 @@ static struct freebsd_module { int enabled; int (*func)(int update_every, usec_t dt); - usec_t duration; RRDDIM *rd; @@ -68,8 +67,14 @@ static struct freebsd_module { {.name = NULL, .dim = NULL, .enabled = 0, .func = NULL} }; +#if WORKER_UTILIZATION_MAX_JOB_TYPES < 33 +#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 33 +#endif + static void freebsd_main_cleanup(void *ptr) { + worker_unregister(); + struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr; static_thread->enabled = NETDATA_MAIN_THREAD_EXITING; @@ -80,9 +85,9 @@ static void freebsd_main_cleanup(void *ptr) void *freebsd_main(void *ptr) { - netdata_thread_cleanup_push(freebsd_main_cleanup, ptr); + 
worker_register("FREEBSD"); - int vdo_cpu_netdata = config_get_boolean("plugin:freebsd", "netdata server resources", 1); + netdata_thread_cleanup_push(freebsd_main_cleanup, ptr); // initialize FreeBSD plugin if (freebsd_plugin_init()) @@ -94,8 +99,9 @@ void *freebsd_main(void *ptr) struct freebsd_module *pm = &freebsd_modules[i]; pm->enabled = config_get_boolean("plugin:freebsd", pm->name, pm->enabled); - pm->duration = 0ULL; pm->rd = NULL; + + worker_register_job_name(i, freebsd_modules[i].dim); } usec_t step = localhost->rrd_update_every * USEC_PER_SEC; @@ -103,14 +109,13 @@ void *freebsd_main(void *ptr) heartbeat_init(&hb); while (!netdata_exit) { + worker_is_idle(); + usec_t hb_dt = heartbeat_next(&hb, step); - usec_t duration = 0ULL; if (unlikely(netdata_exit)) break; - // BEGIN -- the job to be done - for (i = 0; freebsd_modules[i].name; i++) { struct freebsd_module *pm = &freebsd_modules[i]; if (unlikely(!pm->enabled)) @@ -118,92 +123,12 @@ void *freebsd_main(void *ptr) debug(D_PROCNETDEV_LOOP, "FREEBSD calling %s.", pm->name); + worker_is_busy(i); pm->enabled = !pm->func(localhost->rrd_update_every, hb_dt); - pm->duration = heartbeat_monotonic_dt_to_now_usec(&hb) - duration; - duration += pm->duration; if (unlikely(netdata_exit)) break; } - - // END -- the job is done - - if (vdo_cpu_netdata) { - static RRDSET *st_cpu_thread = NULL, *st_duration = NULL; - static RRDDIM *rd_user = NULL, *rd_system = NULL; - - // ---------------------------------------------------------------- - - struct rusage thread; - getrusage(RUSAGE_THREAD, &thread); - - if (unlikely(!st_cpu_thread)) { - st_cpu_thread = rrdset_create_localhost( - "netdata", - "plugin_freebsd_cpu", - NULL, - "freebsd", - NULL, - "Netdata FreeBSD plugin CPU usage", - "milliseconds/s", - "freebsd.plugin", - "stats", - 132000, - localhost->rrd_update_every, - RRDSET_TYPE_STACKED); - - rd_user = rrddim_add(st_cpu_thread, "user", NULL, 1, USEC_PER_MS, RRD_ALGORITHM_INCREMENTAL); - rd_system = 
rrddim_add(st_cpu_thread, "system", NULL, 1, USEC_PER_MS, RRD_ALGORITHM_INCREMENTAL); - } else { - rrdset_next(st_cpu_thread); - } - - rrddim_set_by_pointer( - st_cpu_thread, rd_user, thread.ru_utime.tv_sec * USEC_PER_SEC + thread.ru_utime.tv_usec); - rrddim_set_by_pointer( - st_cpu_thread, rd_system, thread.ru_stime.tv_sec * USEC_PER_SEC + thread.ru_stime.tv_usec); - rrdset_done(st_cpu_thread); - - // ---------------------------------------------------------------- - - if (unlikely(!st_duration)) { - st_duration = rrdset_find_active_bytype_localhost("netdata", "plugin_freebsd_modules"); - - if (!st_duration) { - st_duration = rrdset_create_localhost( - "netdata", - "plugin_freebsd_modules", - NULL, - "freebsd", - NULL, - "Netdata FreeBSD plugin modules durations", - "milliseconds/run", - "freebsd.plugin", - "stats", - 132001, - localhost->rrd_update_every, - RRDSET_TYPE_STACKED); - - for (i = 0; freebsd_modules[i].name; i++) { - struct freebsd_module *pm = &freebsd_modules[i]; - if (unlikely(!pm->enabled)) - continue; - - pm->rd = rrddim_add(st_duration, pm->dim, NULL, 1, 1000, RRD_ALGORITHM_ABSOLUTE); - } - } - } else - rrdset_next(st_duration); - - for (i = 0; freebsd_modules[i].name; i++) { - struct freebsd_module *pm = &freebsd_modules[i]; - if (unlikely(!pm->enabled)) - continue; - - rrddim_set_by_pointer(st_duration, pm->rd, pm->duration); - } - rrdset_done(st_duration); - } } netdata_thread_cleanup_pop(1); diff --git a/collectors/freeipmi.plugin/freeipmi_plugin.c b/collectors/freeipmi.plugin/freeipmi_plugin.c index 6c6f3d747f13c7..351b6e32be85ef 100644 --- a/collectors/freeipmi.plugin/freeipmi_plugin.c +++ b/collectors/freeipmi.plugin/freeipmi_plugin.c @@ -1596,6 +1596,7 @@ int host_is_local(const char *host) } int main (int argc, char **argv) { + clocks_init(); // ------------------------------------------------------------------------ // initialization of netdata plugin diff --git a/collectors/idlejitter.plugin/plugin_idlejitter.c 
b/collectors/idlejitter.plugin/plugin_idlejitter.c index 12ab8601a7fc5a..535819c69f4de2 100644 --- a/collectors/idlejitter.plugin/plugin_idlejitter.c +++ b/collectors/idlejitter.plugin/plugin_idlejitter.c @@ -5,6 +5,8 @@ #define CPU_IDLEJITTER_SLEEP_TIME_MS 20 static void cpuidlejitter_main_cleanup(void *ptr) { + worker_unregister(); + struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr; static_thread->enabled = NETDATA_MAIN_THREAD_EXITING; @@ -14,6 +16,9 @@ static void cpuidlejitter_main_cleanup(void *ptr) { } void *cpuidlejitter_main(void *ptr) { + worker_register("IDLEJITTER"); + worker_register_job_name(0, "measurements"); + netdata_thread_cleanup_push(cpuidlejitter_main_cleanup, ptr); usec_t sleep_ut = config_get_number("plugin:idlejitter", "loop time in ms", CPU_IDLEJITTER_SLEEP_TIME_MS) * USEC_PER_MS; @@ -55,7 +60,9 @@ void *cpuidlejitter_main(void *ptr) { while(elapsed < update_every_ut) { now_monotonic_high_precision_timeval(&before); + worker_is_idle(); sleep_usec(sleep_ut); + worker_is_busy(0); now_monotonic_high_precision_timeval(&after); usec_t dt = dt_usec(&after, &before); diff --git a/collectors/macos.plugin/plugin_macos.c b/collectors/macos.plugin/plugin_macos.c index 4566c09ee793f7..10472bdb8abac2 100644 --- a/collectors/macos.plugin/plugin_macos.c +++ b/collectors/macos.plugin/plugin_macos.c @@ -9,7 +9,6 @@ static struct macos_module { int enabled; int (*func)(int update_every, usec_t dt); - usec_t duration; RRDDIM *rd; @@ -22,8 +21,14 @@ static struct macos_module { {.name = NULL, .dim = NULL, .enabled = 0, .func = NULL} }; +#if WORKER_UTILIZATION_MAX_JOB_TYPES < 3 +#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 3 +#endif + static void macos_main_cleanup(void *ptr) { + worker_unregister(); + struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr; static_thread->enabled = NETDATA_MAIN_THREAD_EXITING; @@ -34,17 +39,18 @@ static void macos_main_cleanup(void *ptr) void 
*macos_main(void *ptr) { - netdata_thread_cleanup_push(macos_main_cleanup, ptr); + worker_register("MACOS"); - int vdo_cpu_netdata = config_get_boolean("plugin:macos", "netdata server resources", CONFIG_BOOLEAN_YES); + netdata_thread_cleanup_push(macos_main_cleanup, ptr); // check the enabled status for each module for (int i = 0; macos_modules[i].name; i++) { struct macos_module *pm = &macos_modules[i]; pm->enabled = config_get_boolean("plugin:macos", pm->name, pm->enabled); - pm->duration = 0ULL; pm->rd = NULL; + + worker_register_job_name(i, macos_modules[i].dim); } usec_t step = localhost->rrd_update_every * USEC_PER_SEC; @@ -52,10 +58,8 @@ void *macos_main(void *ptr) heartbeat_init(&hb); while (!netdata_exit) { + worker_is_idle(); usec_t hb_dt = heartbeat_next(&hb, step); - usec_t duration = 0ULL; - - // BEGIN -- the job to be done for (int i = 0; macos_modules[i].name; i++) { struct macos_module *pm = &macos_modules[i]; @@ -64,92 +68,12 @@ void *macos_main(void *ptr) debug(D_PROCNETDEV_LOOP, "macos calling %s.", pm->name); + worker_is_busy(i); pm->enabled = !pm->func(localhost->rrd_update_every, hb_dt); - pm->duration = heartbeat_monotonic_dt_to_now_usec(&hb) - duration; - duration += pm->duration; if (unlikely(netdata_exit)) break; } - - // END -- the job is done - - if (vdo_cpu_netdata) { - static RRDSET *st_cpu_thread = NULL, *st_duration = NULL; - static RRDDIM *rd_user = NULL, *rd_system = NULL; - - // ---------------------------------------------------------------- - - struct rusage thread; - getrusage(RUSAGE_THREAD, &thread); - - if (unlikely(!st_cpu_thread)) { - st_cpu_thread = rrdset_create_localhost( - "netdata", - "plugin_macos_cpu", - NULL, - "macos", - NULL, - "Netdata macOS plugin CPU usage", - "milliseconds/s", - "macos.plugin", - "stats", - 132000, - localhost->rrd_update_every, - RRDSET_TYPE_STACKED); - - rd_user = rrddim_add(st_cpu_thread, "user", NULL, 1, USEC_PER_MS, RRD_ALGORITHM_INCREMENTAL); - rd_system = rrddim_add(st_cpu_thread, 
"system", NULL, 1, USEC_PER_MS, RRD_ALGORITHM_INCREMENTAL); - } else { - rrdset_next(st_cpu_thread); - } - - rrddim_set_by_pointer( - st_cpu_thread, rd_user, thread.ru_utime.tv_sec * USEC_PER_SEC + thread.ru_utime.tv_usec); - rrddim_set_by_pointer( - st_cpu_thread, rd_system, thread.ru_stime.tv_sec * USEC_PER_SEC + thread.ru_stime.tv_usec); - rrdset_done(st_cpu_thread); - - // ---------------------------------------------------------------- - - if (unlikely(!st_duration)) { - st_duration = rrdset_find_active_bytype_localhost("netdata", "plugin_macos_modules"); - - if (!st_duration) { - st_duration = rrdset_create_localhost( - "netdata", - "plugin_macos_modules", - NULL, - "macos", - NULL, - "Netdata macOS plugin modules durations", - "milliseconds/run", - "macos.plugin", - "stats", - 132001, - localhost->rrd_update_every, - RRDSET_TYPE_STACKED); - - for (int i = 0; macos_modules[i].name; i++) { - struct macos_module *pm = &macos_modules[i]; - if (unlikely(!pm->enabled)) - continue; - - pm->rd = rrddim_add(st_duration, pm->dim, NULL, 1, 1000, RRD_ALGORITHM_ABSOLUTE); - } - } - } else - rrdset_next(st_duration); - - for (int i = 0; macos_modules[i].name; i++) { - struct macos_module *pm = &macos_modules[i]; - if (unlikely(!pm->enabled)) - continue; - - rrddim_set_by_pointer(st_duration, pm->rd, pm->duration); - } - rrdset_done(st_duration); - } } netdata_thread_cleanup_pop(1); diff --git a/collectors/nfacct.plugin/plugin_nfacct.c b/collectors/nfacct.plugin/plugin_nfacct.c index 35209a281a5816..eeadb3ccce2ae9 100644 --- a/collectors/nfacct.plugin/plugin_nfacct.c +++ b/collectors/nfacct.plugin/plugin_nfacct.c @@ -745,6 +745,7 @@ void nfacct_signals() } int main(int argc, char **argv) { + clocks_init(); // ------------------------------------------------------------------------ // initialization of netdata plugin diff --git a/collectors/perf.plugin/perf_plugin.c b/collectors/perf.plugin/perf_plugin.c index 4020cf0661a011..80e042edc37665 100644 --- 
a/collectors/perf.plugin/perf_plugin.c +++ b/collectors/perf.plugin/perf_plugin.c @@ -1283,6 +1283,7 @@ void parse_command_line(int argc, char **argv) { } int main(int argc, char **argv) { + clocks_init(); // ------------------------------------------------------------------------ // initialization of netdata plugin diff --git a/collectors/plugins.d/plugins_d.c b/collectors/plugins.d/plugins_d.c index 614e43d5849e92..daef1f766c5dd8 100644 --- a/collectors/plugins.d/plugins_d.c +++ b/collectors/plugins.d/plugins_d.c @@ -230,6 +230,8 @@ static void pluginsd_worker_thread_handle_error(struct plugind *cd, int worker_r void *pluginsd_worker_thread(void *arg) { + worker_register("PLUGINSD"); + netdata_thread_cleanup_push(pluginsd_worker_thread_cleanup, arg); struct plugind *cd = (struct plugind *)arg; @@ -260,6 +262,7 @@ void *pluginsd_worker_thread(void *arg) if (unlikely(!cd->enabled)) break; } + worker_unregister(); netdata_thread_cleanup_pop(1); return NULL; @@ -281,6 +284,8 @@ static void pluginsd_main_cleanup(void *data) info("cleanup completed."); static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; + + worker_unregister(); } void *pluginsd_main(void *ptr) diff --git a/collectors/proc.plugin/plugin_proc.c b/collectors/proc.plugin/plugin_proc.c index 190811e24b7d95..e10e91f82d4f80 100644 --- a/collectors/proc.plugin/plugin_proc.c +++ b/collectors/proc.plugin/plugin_proc.c @@ -9,7 +9,6 @@ static struct proc_module { int enabled; int (*func)(int update_every, usec_t dt); - usec_t duration; RRDDIM *rd; @@ -66,9 +65,7 @@ static struct proc_module { // ZFS metrics {.name = "/proc/spl/kstat/zfs/arcstats", .dim = "zfs_arcstats", .func = do_proc_spl_kstat_zfs_arcstats}, - {.name = "/proc/spl/kstat/zfs/pool/state", - .dim = "zfs_pool_state", - .func = do_proc_spl_kstat_zfs_pool_state}, + {.name = "/proc/spl/kstat/zfs/pool/state",.dim = "zfs_pool_state",.func = do_proc_spl_kstat_zfs_pool_state}, // BTRFS metrics {.name = "/sys/fs/btrfs", .dim = "btrfs", .func = 
do_sys_fs_btrfs}, @@ -83,6 +80,10 @@ static struct proc_module { {.name = NULL, .dim = NULL, .func = NULL} }; +#if WORKER_UTILIZATION_MAX_JOB_TYPES < 36 +#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 36 +#endif + static void proc_main_cleanup(void *ptr) { struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr; @@ -91,13 +92,15 @@ static void proc_main_cleanup(void *ptr) info("cleaning up..."); static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; + + worker_unregister(); } void *proc_main(void *ptr) { - netdata_thread_cleanup_push(proc_main_cleanup, ptr); + worker_register("PROC"); - int vdo_cpu_netdata = config_get_boolean("plugin:proc", "netdata server resources", CONFIG_BOOLEAN_YES); + netdata_thread_cleanup_push(proc_main_cleanup, ptr); config_get_boolean("plugin:proc", "/proc/pagetypeinfo", CONFIG_BOOLEAN_NO); @@ -107,128 +110,34 @@ void *proc_main(void *ptr) struct proc_module *pm = &proc_modules[i]; pm->enabled = config_get_boolean("plugin:proc", pm->name, CONFIG_BOOLEAN_YES); - pm->duration = 0ULL; pm->rd = NULL; + + worker_register_job_name(i, proc_modules[i].dim); } usec_t step = localhost->rrd_update_every * USEC_PER_SEC; heartbeat_t hb; heartbeat_init(&hb); - size_t iterations = 0; while (!netdata_exit) { - iterations++; - (void)iterations; - + worker_is_idle(); usec_t hb_dt = heartbeat_next(&hb, step); - usec_t duration = 0ULL; if (unlikely(netdata_exit)) break; - // BEGIN -- the job to be done - for (i = 0; proc_modules[i].name; i++) { + if (unlikely(netdata_exit)) + break; + struct proc_module *pm = &proc_modules[i]; if (unlikely(!pm->enabled)) continue; debug(D_PROCNETDEV_LOOP, "PROC calling %s.", pm->name); -//#ifdef NETDATA_LOG_ALLOCATIONS -// if(pm->func == do_proc_interrupts) -// log_thread_memory_allocations = iterations; -//#endif + worker_is_busy(i); pm->enabled = !pm->func(localhost->rrd_update_every, hb_dt); - pm->duration = heartbeat_monotonic_dt_to_now_usec(&hb) - duration; - duration += 
pm->duration; - -//#ifdef NETDATA_LOG_ALLOCATIONS -// if(pm->func == do_proc_interrupts) -// log_thread_memory_allocations = 0; -//#endif - - if (unlikely(netdata_exit)) - break; - } - - // END -- the job is done - - if (vdo_cpu_netdata) { - static RRDSET *st_cpu_thread = NULL, *st_duration = NULL; - static RRDDIM *rd_user = NULL, *rd_system = NULL; - - // ---------------------------------------------------------------- - - struct rusage thread; - getrusage(RUSAGE_THREAD, &thread); - - if (unlikely(!st_cpu_thread)) { - st_cpu_thread = rrdset_create_localhost( - "netdata", - "plugin_proc_cpu", - NULL, - "proc", - NULL, - "Netdata proc plugin CPU usage", - "milliseconds/s", - "proc", - "stats", - 132000, - localhost->rrd_update_every, - RRDSET_TYPE_STACKED); - - rd_user = rrddim_add(st_cpu_thread, "user", NULL, 1, USEC_PER_MS, RRD_ALGORITHM_INCREMENTAL); - rd_system = rrddim_add(st_cpu_thread, "system", NULL, 1, USEC_PER_MS, RRD_ALGORITHM_INCREMENTAL); - } else { - rrdset_next(st_cpu_thread); - } - - rrddim_set_by_pointer( - st_cpu_thread, rd_user, thread.ru_utime.tv_sec * USEC_PER_SEC + thread.ru_utime.tv_usec); - rrddim_set_by_pointer( - st_cpu_thread, rd_system, thread.ru_stime.tv_sec * USEC_PER_SEC + thread.ru_stime.tv_usec); - rrdset_done(st_cpu_thread); - - // ---------------------------------------------------------------- - - if (unlikely(!st_duration)) { - st_duration = rrdset_find_active_bytype_localhost("netdata", "plugin_proc_modules"); - - if (!st_duration) { - st_duration = rrdset_create_localhost( - "netdata", - "plugin_proc_modules", - NULL, - "proc", - NULL, - "Netdata proc plugin modules durations", - "milliseconds/run", - "proc", - "stats", - 132001, - localhost->rrd_update_every, - RRDSET_TYPE_STACKED); - - for (i = 0; proc_modules[i].name; i++) { - struct proc_module *pm = &proc_modules[i]; - if (unlikely(!pm->enabled)) - continue; - - pm->rd = rrddim_add(st_duration, pm->dim, NULL, 1, USEC_PER_MS, RRD_ALGORITHM_ABSOLUTE); - } - } - } else - 
rrdset_next(st_duration); - - for (i = 0; proc_modules[i].name; i++) { - struct proc_module *pm = &proc_modules[i]; - if (unlikely(!pm->enabled)) - continue; - - rrddim_set_by_pointer(st_duration, pm->rd, pm->duration); - } - rrdset_done(st_duration); } } diff --git a/collectors/slabinfo.plugin/slabinfo.c b/collectors/slabinfo.plugin/slabinfo.c index 0913b895eefecb..2e47ee229ef4e1 100644 --- a/collectors/slabinfo.plugin/slabinfo.c +++ b/collectors/slabinfo.plugin/slabinfo.c @@ -336,6 +336,7 @@ void usage(void) { } int main(int argc, char **argv) { + clocks_init(); program_name = argv[0]; program_version = "0.1"; diff --git a/collectors/statsd.plugin/statsd.c b/collectors/statsd.plugin/statsd.c index a630d00d0cd195..f4286ae37840af 100644 --- a/collectors/statsd.plugin/statsd.c +++ b/collectors/statsd.plugin/statsd.c @@ -9,6 +9,15 @@ #define STATSD_LISTEN_PORT 8125 #define STATSD_LISTEN_BACKLOG 4096 +#define WORKER_JOB_TYPE_TCP_CONNECTED 0 +#define WORKER_JOB_TYPE_TCP_DISCONNECTED 1 +#define WORKER_JOB_TYPE_RCV_DATA 2 +#define WORKER_JOB_TYPE_SND_DATA 3 + +#if WORKER_UTILIZATION_MAX_JOB_TYPES < 4 +#error Please increase WORKER_UTILIZATION_MAX_JOB_TYPES to at least 4 +#endif + // -------------------------------------------------------------------------------------- // #define STATSD_MULTITHREADED 1 @@ -237,10 +246,6 @@ struct collection_thread_status { size_t max_sockets; netdata_thread_t thread; - struct rusage rusage; - RRDSET *st_cpu; - RRDDIM *rd_user; - RRDDIM *rd_system; }; static struct statsd { @@ -788,6 +793,7 @@ static void *statsd_add_callback(POLLINFO *pi, short int *events, void *data) { (void)pi; (void)data; + worker_is_busy(WORKER_JOB_TYPE_TCP_CONNECTED); *events = POLLIN; struct statsd_tcp *t = (struct statsd_tcp *)callocz(sizeof(struct statsd_tcp) + STATSD_TCP_BUFFER_SIZE, 1); @@ -796,11 +802,14 @@ static void *statsd_add_callback(POLLINFO *pi, short int *events, void *data) { statsd.tcp_socket_connects++; statsd.tcp_socket_connected++; + 
worker_is_idle(); return t; } // TCP client disconnected static void statsd_del_callback(POLLINFO *pi) { + worker_is_busy(WORKER_JOB_TYPE_TCP_DISCONNECTED); + struct statsd_tcp *t = pi->data; if(likely(t)) { @@ -818,10 +827,15 @@ static void statsd_del_callback(POLLINFO *pi) { freez(t); } + + worker_is_idle(); } // Receive data static int statsd_rcv_callback(POLLINFO *pi, short int *events) { + int retval = -1; + worker_is_busy(WORKER_JOB_TYPE_RCV_DATA); + *events = POLLIN; int fd = pi->fd; @@ -832,14 +846,16 @@ static int statsd_rcv_callback(POLLINFO *pi, short int *events) { if(unlikely(!d)) { error("STATSD: internal error: expected TCP data pointer is NULL"); statsd.socket_errors++; - return -1; + retval = -1; + goto cleanup; } #ifdef NETDATA_INTERNAL_CHECKS if(unlikely(d->type != STATSD_SOCKET_DATA_TYPE_TCP)) { error("STATSD: internal error: socket data type should be %d, but it is %d", (int)STATSD_SOCKET_DATA_TYPE_TCP, (int)d->type); statsd.socket_errors++; - return -1; + retval = -1; + goto cleanup; } #endif @@ -872,8 +888,10 @@ static int statsd_rcv_callback(POLLINFO *pi, short int *events) { d->len = statsd_process(d->buffer, d->len, 1); } - if(unlikely(ret == -1)) - return -1; + if(unlikely(ret == -1)) { + retval = -1; + goto cleanup; + } } while (rc != -1); break; @@ -884,14 +902,16 @@ static int statsd_rcv_callback(POLLINFO *pi, short int *events) { if(unlikely(!d)) { error("STATSD: internal error: expected UDP data pointer is NULL"); statsd.socket_errors++; - return -1; + retval = -1; + goto cleanup; } #ifdef NETDATA_INTERNAL_CHECKS if(unlikely(d->type != STATSD_SOCKET_DATA_TYPE_UDP)) { error("STATSD: internal error: socket data should be %d, but it is %d", (int)d->type, (int)STATSD_SOCKET_DATA_TYPE_UDP); statsd.socket_errors++; - return -1; + retval = -1; + goto cleanup; } #endif @@ -904,7 +924,8 @@ static int statsd_rcv_callback(POLLINFO *pi, short int *events) { if (errno != EWOULDBLOCK && errno != EAGAIN && errno != EINTR) { error("STATSD: 
recvmmsg() on UDP socket %d failed.", fd); statsd.socket_errors++; - return -1; + retval = -1; + goto cleanup; } } else if (rc) { // data received @@ -929,7 +950,8 @@ static int statsd_rcv_callback(POLLINFO *pi, short int *events) { if (errno != EWOULDBLOCK && errno != EAGAIN && errno != EINTR) { error("STATSD: recv() on UDP socket %d failed.", fd); statsd.socket_errors++; - return -1; + retval = -1; + goto cleanup; } } else if (rc) { // data received @@ -947,24 +969,26 @@ static int statsd_rcv_callback(POLLINFO *pi, short int *events) { default: { error("STATSD: internal error: unknown socktype %d on socket %d", pi->socktype, fd); statsd.socket_errors++; - return -1; + retval = -1; + goto cleanup; } } - return 0; + retval = 0; +cleanup: + worker_is_idle(); + return retval; } static int statsd_snd_callback(POLLINFO *pi, short int *events) { (void)pi; (void)events; + worker_is_busy(WORKER_JOB_TYPE_SND_DATA); error("STATSD: snd_callback() called, but we never requested to send data to statsd clients."); - return -1; -} + worker_is_idle(); -static void statsd_timer_callback(void *timer_data) { - struct collection_thread_status *status = timer_data; - getrusage(RUSAGE_THREAD, &status->rusage); + return -1; } // -------------------------------------------------------------------------------------------------------------------- @@ -986,12 +1010,19 @@ void statsd_collector_thread_cleanup(void *data) { #endif freez(d); + worker_unregister(); } void *statsd_collector_thread(void *ptr) { struct collection_thread_status *status = ptr; status->status = 1; + worker_register("STATSD"); + worker_register_job_name(WORKER_JOB_TYPE_TCP_CONNECTED, "tcp connect"); + worker_register_job_name(WORKER_JOB_TYPE_TCP_DISCONNECTED, "tcp disconnect"); + worker_register_job_name(WORKER_JOB_TYPE_RCV_DATA, "receive"); + worker_register_job_name(WORKER_JOB_TYPE_SND_DATA, "send"); + info("STATSD collector thread started with taskid %d", gettid()); struct statsd_udp *d = callocz(sizeof(struct 
statsd_udp), 1); @@ -1019,7 +1050,7 @@ void *statsd_collector_thread(void *ptr) { , statsd_del_callback , statsd_rcv_callback , statsd_snd_callback - , statsd_timer_callback + , NULL , NULL // No access control pattern , 0 // No dns lookups for access control pattern , (void *)d @@ -2147,9 +2178,32 @@ static void statsd_main_cleanup(void *data) { info("STATSD: cleanup completed."); static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; + + worker_unregister(); } +#define WORKER_STATSD_FLUSH_GAUGES 0 +#define WORKER_STATSD_FLUSH_COUNTERS 1 +#define WORKER_STATSD_FLUSH_METERS 2 +#define WORKER_STATSD_FLUSH_TIMERS 3 +#define WORKER_STATSD_FLUSH_HISTOGRAMS 4 +#define WORKER_STATSD_FLUSH_SETS 5 +#define WORKER_STATSD_FLUSH_STATS 6 + +#if WORKER_UTILIZATION_MAX_JOB_TYPES < 7 +#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 7 +#endif + void *statsd_main(void *ptr) { + worker_register("STATSDFLUSH"); + worker_register_job_name(WORKER_STATSD_FLUSH_GAUGES, "gauges"); + worker_register_job_name(WORKER_STATSD_FLUSH_COUNTERS, "counters"); + worker_register_job_name(WORKER_STATSD_FLUSH_METERS, "meters"); + worker_register_job_name(WORKER_STATSD_FLUSH_TIMERS, "timers"); + worker_register_job_name(WORKER_STATSD_FLUSH_HISTOGRAMS, "histograms"); + worker_register_job_name(WORKER_STATSD_FLUSH_SETS, "sets"); + worker_register_job_name(WORKER_STATSD_FLUSH_STATS, "statistics"); + netdata_thread_cleanup_push(statsd_main_cleanup, ptr); // ---------------------------------------------------------------------------------------------------------------- @@ -2420,71 +2474,37 @@ void *statsd_main(void *ptr) { ); RRDDIM *rd_pcharts = rrddim_add(st_pcharts, "charts", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); - RRDSET *stcpu_thread = rrdset_create_localhost( - "netdata" - , "plugin_statsd_charting_cpu" - , NULL - , "statsd" - , "netdata.statsd_cpu" - , "Netdata statsd charting thread CPU usage" - , "milliseconds/s" - , PLUGIN_STATSD_NAME - , "stats" - , 132001 - , statsd.update_every - , 
RRDSET_TYPE_STACKED - ); - - RRDDIM *rd_user = rrddim_add(stcpu_thread, "user", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL); - RRDDIM *rd_system = rrddim_add(stcpu_thread, "system", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL); - struct rusage thread; - - for(i = 0; i < statsd.threads ;i++) { - char id[100 + 1]; - char title[100 + 1]; - - snprintfz(id, 100, "plugin_statsd_collector%d_cpu", i + 1); - snprintfz(title, 100, "Netdata statsd collector thread No %d CPU usage", i + 1); - - statsd.collection_threads_status[i].st_cpu = rrdset_create_localhost( - "netdata" - , id - , NULL - , "statsd" - , "netdata.statsd_cpu" - , title - , "milliseconds/s" - , PLUGIN_STATSD_NAME - , "stats" - , 132002 + i - , statsd.update_every - , RRDSET_TYPE_STACKED - ); - - statsd.collection_threads_status[i].rd_user = rrddim_add(statsd.collection_threads_status[i].st_cpu, "user", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL); - statsd.collection_threads_status[i].rd_system = rrddim_add(statsd.collection_threads_status[i].st_cpu, "system", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL); - } - - // ---------------------------------------------------------------------------------------------------------------- + // ---------------------------------------------------------------------------------------------------------------- // statsd thread to turn metrics into charts usec_t step = statsd.update_every * USEC_PER_SEC; heartbeat_t hb; heartbeat_init(&hb); while(!netdata_exit) { + worker_is_idle(); usec_t hb_dt = heartbeat_next(&hb, step); + worker_is_busy(WORKER_STATSD_FLUSH_GAUGES); statsd_flush_index_metrics(&statsd.gauges, statsd_flush_gauge); + + worker_is_busy(WORKER_STATSD_FLUSH_COUNTERS); statsd_flush_index_metrics(&statsd.counters, statsd_flush_counter); + + worker_is_busy(WORKER_STATSD_FLUSH_METERS); statsd_flush_index_metrics(&statsd.meters, statsd_flush_meter); + + worker_is_busy(WORKER_STATSD_FLUSH_TIMERS); statsd_flush_index_metrics(&statsd.timers, statsd_flush_timer); + + 
worker_is_busy(WORKER_STATSD_FLUSH_HISTOGRAMS); statsd_flush_index_metrics(&statsd.histograms, statsd_flush_histogram); + + worker_is_busy(WORKER_STATSD_FLUSH_SETS); statsd_flush_index_metrics(&statsd.sets, statsd_flush_set); + worker_is_busy(WORKER_STATSD_FLUSH_STATS); statsd_update_all_app_charts(); - getrusage(RUSAGE_THREAD, &thread); - if(unlikely(netdata_exit)) break; @@ -2498,9 +2518,6 @@ void *statsd_main(void *ptr) { rrdset_next(st_tcp_connects); rrdset_next(st_tcp_connected); rrdset_next(st_pcharts); - rrdset_next(stcpu_thread); - for(i = 0; i < statsd.threads ;i++) - rrdset_next(statsd.collection_threads_status[i].st_cpu); } rrddim_set_by_pointer(st_metrics, rd_metrics_gauge, (collected_number)statsd.gauges.metrics); @@ -2550,16 +2567,6 @@ void *statsd_main(void *ptr) { rrddim_set_by_pointer(st_pcharts, rd_pcharts, (collected_number)statsd.private_charts); rrdset_done(st_pcharts); - - rrddim_set_by_pointer(stcpu_thread, rd_user, thread.ru_utime.tv_sec * 1000000ULL + thread.ru_utime.tv_usec); - rrddim_set_by_pointer(stcpu_thread, rd_system, thread.ru_stime.tv_sec * 1000000ULL + thread.ru_stime.tv_usec); - rrdset_done(stcpu_thread); - - for(i = 0; i < statsd.threads ;i++) { - rrddim_set_by_pointer(statsd.collection_threads_status[i].st_cpu, statsd.collection_threads_status[i].rd_user, statsd.collection_threads_status[i].rusage.ru_utime.tv_sec * 1000000ULL + statsd.collection_threads_status[i].rusage.ru_utime.tv_usec); - rrddim_set_by_pointer(statsd.collection_threads_status[i].st_cpu, statsd.collection_threads_status[i].rd_system, statsd.collection_threads_status[i].rusage.ru_stime.tv_sec * 1000000ULL + statsd.collection_threads_status[i].rusage.ru_stime.tv_usec); - rrdset_done(statsd.collection_threads_status[i].st_cpu); - } } cleanup: ; // added semi-colon to prevent older gcc error: label at end of compound statement diff --git a/collectors/tc.plugin/plugin_tc.c b/collectors/tc.plugin/plugin_tc.c index ce3fe668b4d0d4..f012c078d8c129 100644 --- 
a/collectors/tc.plugin/plugin_tc.c +++ b/collectors/tc.plugin/plugin_tc.c @@ -844,6 +844,8 @@ static inline void tc_split_words(char *str, char **words, int max_words) { static pid_t tc_child_pid = 0; static void tc_main_cleanup(void *ptr) { + worker_unregister(); + struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr; static_thread->enabled = NETDATA_MAIN_THREAD_EXITING; @@ -864,10 +866,35 @@ static void tc_main_cleanup(void *ptr) { static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; } +#define WORKER_TC_CLASS 0 +#define WORKER_TC_BEGIN 1 +#define WORKER_TC_END 2 +#define WORKER_TC_SENT 3 +#define WORKER_TC_LENDED 4 +#define WORKER_TC_TOKENS 5 +#define WORKER_TC_SETDEVICENAME 6 +#define WORKER_TC_SETDEVICEGROUP 7 +#define WORKER_TC_SETCLASSNAME 8 +#define WORKER_TC_WORKTIME 9 + +#if WORKER_UTILIZATION_MAX_JOB_TYPES < 10 +#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 10 +#endif + void *tc_main(void *ptr) { - netdata_thread_cleanup_push(tc_main_cleanup, ptr); + worker_register("TC"); + worker_register_job_name(WORKER_TC_CLASS, "class"); + worker_register_job_name(WORKER_TC_BEGIN, "begin"); + worker_register_job_name(WORKER_TC_END, "end"); + worker_register_job_name(WORKER_TC_SENT, "sent"); + worker_register_job_name(WORKER_TC_LENDED, "lended"); + worker_register_job_name(WORKER_TC_TOKENS, "tokens"); + worker_register_job_name(WORKER_TC_SETDEVICENAME, "devicename"); + worker_register_job_name(WORKER_TC_SETDEVICEGROUP, "devicegroup"); + worker_register_job_name(WORKER_TC_SETCLASSNAME, "classname"); + worker_register_job_name(WORKER_TC_WORKTIME, "worktime"); - struct rusage thread; + netdata_thread_cleanup_push(tc_main_cleanup, ptr); char command[FILENAME_MAX + 1]; char *words[PLUGINSD_MAX_WORDS] = { NULL }; @@ -913,6 +940,7 @@ void *tc_main(void *ptr) { if(unlikely(!words[0] || !*words[0])) { // debug(D_TC_LOOP, "empty line"); + worker_is_idle(); continue; } // else debug(D_TC_LOOP, "First word is '%s'", words[0]); @@ -920,6 
+948,8 @@ void *tc_main(void *ptr) { first_hash = simple_hash(words[0]); if(unlikely(device && ((first_hash == CLASS_HASH && strcmp(words[0], "class") == 0) || (first_hash == QDISC_HASH && strcmp(words[0], "qdisc") == 0)))) { + worker_is_busy(WORKER_TC_CLASS); + // debug(D_TC_LOOP, "CLASS line on class id='%s', parent='%s', parentid='%s', leaf='%s', leafid='%s'", words[2], words[3], words[4], words[5], words[6]); char *type = words[1]; // the class/qdisc type: htb, fq_codel, etc @@ -949,6 +979,7 @@ void *tc_main(void *ptr) { // there should be an IFB interface for this class = NULL; + worker_is_idle(); continue; } @@ -985,6 +1016,8 @@ void *tc_main(void *ptr) { } } else if(unlikely(first_hash == END_HASH && strcmp(words[0], "END") == 0)) { + worker_is_busy(WORKER_TC_END); + // debug(D_TC_LOOP, "END line"); if(likely(device)) { @@ -998,6 +1031,8 @@ void *tc_main(void *ptr) { class = NULL; } else if(unlikely(first_hash == BEGIN_HASH && strcmp(words[0], "BEGIN") == 0)) { + worker_is_busy(WORKER_TC_BEGIN); + // debug(D_TC_LOOP, "BEGIN line on device '%s'", words[1]); if(likely(words[1] && *words[1])) { @@ -1011,6 +1046,8 @@ void *tc_main(void *ptr) { class = NULL; } else if(unlikely(device && class && first_hash == SENT_HASH && strcmp(words[0], "Sent") == 0)) { + worker_is_busy(WORKER_TC_SENT); + // debug(D_TC_LOOP, "SENT line '%s'", words[1]); if(likely(words[1] && *words[1])) { class->bytes = str2ull(words[1]); @@ -1033,6 +1070,8 @@ void *tc_main(void *ptr) { class->requeues = str2ull(words[8]); } else if(unlikely(device && class && class->updated && first_hash == LENDED_HASH && strcmp(words[0], "lended:") == 0)) { + worker_is_busy(WORKER_TC_LENDED); + // debug(D_TC_LOOP, "LENDED line '%s'", words[1]); if(likely(words[1] && *words[1])) class->lended = str2ull(words[1]); @@ -1044,6 +1083,8 @@ void *tc_main(void *ptr) { class->giants = str2ull(words[5]); } else if(unlikely(device && class && class->updated && first_hash == TOKENS_HASH && strcmp(words[0], "tokens:") == 
0)) { + worker_is_busy(WORKER_TC_TOKENS); + // debug(D_TC_LOOP, "TOKENS line '%s'", words[1]); if(likely(words[1] && *words[1])) class->tokens = str2ull(words[1]); @@ -1052,16 +1093,22 @@ void *tc_main(void *ptr) { class->ctokens = str2ull(words[3]); } else if(unlikely(device && first_hash == SETDEVICENAME_HASH && strcmp(words[0], "SETDEVICENAME") == 0)) { + worker_is_busy(WORKER_TC_SETDEVICENAME); + // debug(D_TC_LOOP, "SETDEVICENAME line '%s'", words[1]); if(likely(words[1] && *words[1])) tc_device_set_device_name(device, words[1]); } else if(unlikely(device && first_hash == SETDEVICEGROUP_HASH && strcmp(words[0], "SETDEVICEGROUP") == 0)) { + worker_is_busy(WORKER_TC_SETDEVICEGROUP); + // debug(D_TC_LOOP, "SETDEVICEGROUP line '%s'", words[1]); if(likely(words[1] && *words[1])) tc_device_set_device_family(device, words[1]); } else if(unlikely(device && first_hash == SETCLASSNAME_HASH && strcmp(words[0], "SETCLASSNAME") == 0)) { + worker_is_busy(WORKER_TC_SETCLASSNAME); + // debug(D_TC_LOOP, "SETCLASSNAME line '%s' '%s'", words[1], words[2]); char *id = words[1]; char *path = words[2]; @@ -1069,36 +1116,9 @@ void *tc_main(void *ptr) { tc_device_set_class_name(device, id, path); } else if(unlikely(first_hash == WORKTIME_HASH && strcmp(words[0], "WORKTIME") == 0)) { - // debug(D_TC_LOOP, "WORKTIME line '%s' '%s'", words[1], words[2]); - getrusage(RUSAGE_THREAD, &thread); - - static RRDSET *stcpu = NULL; - static RRDDIM *rd_user = NULL, *rd_system = NULL; - - if(unlikely(!stcpu)) { - stcpu = rrdset_create_localhost( - "netdata" - , "plugin_tc_cpu" - , NULL - , "tc.helper" - , NULL - , "Netdata TC CPU usage" - , "milliseconds/s" - , PLUGIN_TC_NAME - , NULL - , NETDATA_CHART_PRIO_NETDATA_TC_CPU - , localhost->rrd_update_every - , RRDSET_TYPE_STACKED - ); - rd_user = rrddim_add(stcpu, "user", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL); - rd_system = rrddim_add(stcpu, "system", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL); - } - else rrdset_next(stcpu); - - 
rrddim_set_by_pointer(stcpu, rd_user , thread.ru_utime.tv_sec * 1000000ULL + thread.ru_utime.tv_usec); - rrddim_set_by_pointer(stcpu, rd_system, thread.ru_stime.tv_sec * 1000000ULL + thread.ru_stime.tv_usec); - rrdset_done(stcpu); + worker_is_busy(WORKER_TC_WORKTIME); + // debug(D_TC_LOOP, "WORKTIME line '%s' '%s'", words[1], words[2]); static RRDSET *sttime = NULL; static RRDDIM *rd_run_time = NULL; @@ -1107,8 +1127,8 @@ void *tc_main(void *ptr) { "netdata" , "plugin_tc_time" , NULL - , "tc.helper" - , NULL + , "workers plugin tc" + , "netdata.workers.tc.script_time" , "Netdata TC script execution" , "milliseconds/run" , PLUGIN_TC_NAME @@ -1128,6 +1148,8 @@ void *tc_main(void *ptr) { //else { // debug(D_TC_LOOP, "IGNORED line"); //} + + worker_is_idle(); } // fgets() failed or loop broke @@ -1158,6 +1180,7 @@ void *tc_main(void *ptr) { } cleanup: ; // added semi-colon to prevent older gcc error: label at end of compound statement + worker_unregister(); netdata_thread_cleanup_pop(1); return NULL; } diff --git a/collectors/timex.plugin/plugin_timex.c b/collectors/timex.plugin/plugin_timex.c index 34a3415a0b1458..0390b9920b5b81 100644 --- a/collectors/timex.plugin/plugin_timex.c +++ b/collectors/timex.plugin/plugin_timex.c @@ -32,6 +32,8 @@ struct status_codes { static void timex_main_cleanup(void *ptr) { + worker_unregister(); + struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr; static_thread->enabled = NETDATA_MAIN_THREAD_EXITING; @@ -42,9 +44,10 @@ static void timex_main_cleanup(void *ptr) void *timex_main(void *ptr) { - netdata_thread_cleanup_push(timex_main_cleanup, ptr); + worker_register("TIMEX"); + worker_register_job_name(0, "clock check"); - int vdo_cpu_netdata = config_get_boolean(CONFIG_SECTION_TIMEX, "timex plugin resource charts", CONFIG_BOOLEAN_YES); + netdata_thread_cleanup_push(timex_main_cleanup, ptr); int update_every = (int)config_get_number(CONFIG_SECTION_TIMEX, "update every", 10); if (update_every < 
localhost->rrd_update_every) @@ -62,8 +65,9 @@ void *timex_main(void *ptr) heartbeat_t hb; heartbeat_init(&hb); while (!netdata_exit) { - usec_t duration = heartbeat_monotonic_dt_to_now_usec(&hb); + worker_is_idle(); heartbeat_next(&hb, step); + worker_is_busy(0); struct timex timex_buf = {}; int sync_state = 0; @@ -170,68 +174,6 @@ void *timex_main(void *ptr) rrddim_set_by_pointer(st_offset, rd_offset, timex_buf.offset); rrdset_done(st_offset); } - - if (vdo_cpu_netdata) { - static RRDSET *stcpu_thread = NULL, *st_duration = NULL; - static RRDDIM *rd_user = NULL, *rd_system = NULL, *rd_duration = NULL; - - // ---------------------------------------------------------------- - - struct rusage thread; - getrusage(RUSAGE_THREAD, &thread); - - if (unlikely(!stcpu_thread)) { - stcpu_thread = rrdset_create_localhost( - "netdata", - "plugin_timex", - NULL, - "timex", - NULL, - "Netdata Timex Plugin CPU usage", - "milliseconds/s", - PLUGIN_TIMEX_NAME, - NULL, - NETDATA_CHART_PRIO_NETDATA_TIMEX, - update_every, - RRDSET_TYPE_STACKED); - - rd_user = rrddim_add(stcpu_thread, "user", NULL, 1, USEC_PER_MS, RRD_ALGORITHM_INCREMENTAL); - rd_system = rrddim_add(stcpu_thread, "system", NULL, 1, USEC_PER_MS, RRD_ALGORITHM_INCREMENTAL); - } else { - rrdset_next(stcpu_thread); - } - - rrddim_set_by_pointer( - stcpu_thread, rd_user, thread.ru_utime.tv_sec * USEC_PER_SEC + thread.ru_utime.tv_usec); - rrddim_set_by_pointer( - stcpu_thread, rd_system, thread.ru_stime.tv_sec * USEC_PER_SEC + thread.ru_stime.tv_usec); - rrdset_done(stcpu_thread); - - // ---------------------------------------------------------------- - - if (unlikely(!st_duration)) { - st_duration = rrdset_create_localhost( - "netdata", - "plugin_timex_dt", - NULL, - "timex", - NULL, - "Netdata Timex Plugin Duration", - "milliseconds/run", - PLUGIN_TIMEX_NAME, - NULL, - NETDATA_CHART_PRIO_NETDATA_TIMEX + 1, - update_every, - RRDSET_TYPE_AREA); - - rd_duration = rrddim_add(st_duration, "duration", NULL, 1, USEC_PER_MS, 
RRD_ALGORITHM_ABSOLUTE); - } else { - rrdset_next(st_duration); - } - - rrddim_set_by_pointer(st_duration, rd_duration, duration); - rrdset_done(st_duration); - } } exit: diff --git a/collectors/xenstat.plugin/xenstat_plugin.c b/collectors/xenstat.plugin/xenstat_plugin.c index 781b22afe9b357..882f72ce97f727 100644 --- a/collectors/xenstat.plugin/xenstat_plugin.c +++ b/collectors/xenstat.plugin/xenstat_plugin.c @@ -920,6 +920,7 @@ static void xenstat_send_domain_metrics() { } int main(int argc, char **argv) { + clocks_init(); // ------------------------------------------------------------------------ // initialization of netdata plugin diff --git a/configure.ac b/configure.ac index a5ef2f85980b25..7af950aa0f38ba 100644 --- a/configure.ac +++ b/configure.ac @@ -1767,6 +1767,7 @@ AC_CONFIG_FILES([ libnetdata/url/Makefile libnetdata/json/Makefile libnetdata/health/Makefile + libnetdata/worker_utilization/Makefile registry/Makefile streaming/Makefile system/Makefile diff --git a/daemon/global_statistics.c b/daemon/global_statistics.c index 5c48ae5253dd40..c4849ed6c37d78 100644 --- a/daemon/global_statistics.c +++ b/daemon/global_statistics.c @@ -6,6 +6,16 @@ #define CONFIG_SECTION_GLOBAL_STATISTICS "global statistics" +#define WORKER_JOB_GLOBAL 0 +#define WORKER_JOB_REGISTRY 1 +#define WORKER_JOB_WORKERS 2 +#define WORKER_JOB_DBENGINE 3 +#define WORKER_JOB_HEARTBEAT 4 + +#if WORKER_UTILIZATION_MAX_JOB_TYPES < 5 +#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 5 +#endif + static struct global_statistics { volatile uint16_t connected_clients; @@ -436,435 +446,993 @@ static void global_statistics_charts(void) { } // ---------------------------------------------------------------- +} +static void dbengine_statistics_charts(void) { #ifdef ENABLE_DBENGINE - RRDHOST *host; - unsigned long long stats_array[RRDENG_NR_STATS] = {0}; - unsigned long long local_stats_array[RRDENG_NR_STATS]; - unsigned dbengine_contexts = 0, counted_multihost_db = 0, i; - - rrd_rdlock(); - 
rrdhost_foreach_read(host) { - if (host->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE && !rrdhost_flag_check(host, RRDHOST_FLAG_ARCHIVED)) { - if (&multidb_ctx == host->rrdeng_ctx) { - if (counted_multihost_db) - continue; /* Only count multi-host DB once */ - counted_multihost_db = 1; - } - ++dbengine_contexts; - /* get localhost's DB engine's statistics */ - rrdeng_get_37_statistics(host->rrdeng_ctx, local_stats_array); - for (i = 0 ; i < RRDENG_NR_STATS ; ++i) { - /* aggregate statistics across hosts */ - stats_array[i] += local_stats_array[i]; + if(netdata_rwlock_tryrdlock(&rrd_rwlock) == 0) { + RRDHOST *host; + unsigned long long stats_array[RRDENG_NR_STATS] = {0}; + unsigned long long local_stats_array[RRDENG_NR_STATS]; + unsigned dbengine_contexts = 0, counted_multihost_db = 0, i; + + rrdhost_foreach_read(host) { + if (host->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE && !rrdhost_flag_check(host, RRDHOST_FLAG_ARCHIVED)) { + if (&multidb_ctx == host->rrdeng_ctx) { + if (counted_multihost_db) + continue; /* Only count multi-host DB once */ + counted_multihost_db = 1; + } + ++dbengine_contexts; + /* get localhost's DB engine's statistics */ + rrdeng_get_37_statistics(host->rrdeng_ctx, local_stats_array); + for (i = 0; i < RRDENG_NR_STATS; ++i) { + /* aggregate statistics across hosts */ + stats_array[i] += local_stats_array[i]; + } } } - } - rrd_unlock(); - - if (dbengine_contexts) { - /* deduplicate global statistics by getting the ones from the last context */ - stats_array[30] = local_stats_array[30]; - stats_array[31] = local_stats_array[31]; - stats_array[32] = local_stats_array[32]; - stats_array[34] = local_stats_array[34]; - stats_array[36] = local_stats_array[36]; - - // ---------------------------------------------------------------- - - { - static RRDSET *st_compression = NULL; - static RRDDIM *rd_savings = NULL; - - if (unlikely(!st_compression)) { - st_compression = rrdset_create_localhost( - "netdata" - , "dbengine_compression_ratio" - , NULL - , 
"dbengine" - , NULL - , "Netdata DB engine data extents' compression savings ratio" - , "percentage" - , "netdata" - , "stats" - , 130502 - , localhost->rrd_update_every - , RRDSET_TYPE_LINE - ); - - rd_savings = rrddim_add(st_compression, "savings", NULL, 1, 1000, RRD_ALGORITHM_ABSOLUTE); + rrd_unlock(); + + if (dbengine_contexts) { + /* deduplicate global statistics by getting the ones from the last context */ + stats_array[30] = local_stats_array[30]; + stats_array[31] = local_stats_array[31]; + stats_array[32] = local_stats_array[32]; + stats_array[34] = local_stats_array[34]; + stats_array[36] = local_stats_array[36]; + + // ---------------------------------------------------------------- + + { + static RRDSET *st_compression = NULL; + static RRDDIM *rd_savings = NULL; + + if (unlikely(!st_compression)) { + st_compression = rrdset_create_localhost( + "netdata", + "dbengine_compression_ratio", + NULL, + "dbengine", + NULL, + "Netdata DB engine data extents' compression savings ratio", + "percentage", + "netdata", + "stats", + 130502, + localhost->rrd_update_every, + RRDSET_TYPE_LINE); + + rd_savings = rrddim_add(st_compression, "savings", NULL, 1, 1000, RRD_ALGORITHM_ABSOLUTE); + } else + rrdset_next(st_compression); + + unsigned long long ratio; + unsigned long long compressed_content_size = stats_array[12]; + unsigned long long content_size = stats_array[11]; + + if (content_size) { + // allow negative savings + ratio = ((content_size - compressed_content_size) * 100 * 1000) / content_size; + } else { + ratio = 0; + } + rrddim_set_by_pointer(st_compression, rd_savings, ratio); + + rrdset_done(st_compression); } - else - rrdset_next(st_compression); - - unsigned long long ratio; - unsigned long long compressed_content_size = stats_array[12]; - unsigned long long content_size = stats_array[11]; - - if (content_size) { - // allow negative savings - ratio = ((content_size - compressed_content_size) * 100 * 1000) / content_size; - } else { - ratio = 0; - } - 
rrddim_set_by_pointer(st_compression, rd_savings, ratio); - rrdset_done(st_compression); - } + // ---------------------------------------------------------------- + + { + static RRDSET *st_pg_cache_hit_ratio = NULL; + static RRDDIM *rd_hit_ratio = NULL; + + if (unlikely(!st_pg_cache_hit_ratio)) { + st_pg_cache_hit_ratio = rrdset_create_localhost( + "netdata", + "page_cache_hit_ratio", + NULL, + "dbengine", + NULL, + "Netdata DB engine page cache hit ratio", + "percentage", + "netdata", + "stats", + 130503, + localhost->rrd_update_every, + RRDSET_TYPE_LINE); + + rd_hit_ratio = rrddim_add(st_pg_cache_hit_ratio, "ratio", NULL, 1, 1000, RRD_ALGORITHM_ABSOLUTE); + } else + rrdset_next(st_pg_cache_hit_ratio); + + static unsigned long long old_hits = 0; + static unsigned long long old_misses = 0; + unsigned long long hits = stats_array[7]; + unsigned long long misses = stats_array[8]; + unsigned long long hits_delta; + unsigned long long misses_delta; + unsigned long long ratio; + + hits_delta = hits - old_hits; + misses_delta = misses - old_misses; + old_hits = hits; + old_misses = misses; + + if (hits_delta + misses_delta) { + ratio = (hits_delta * 100 * 1000) / (hits_delta + misses_delta); + } else { + ratio = 0; + } + rrddim_set_by_pointer(st_pg_cache_hit_ratio, rd_hit_ratio, ratio); + + rrdset_done(st_pg_cache_hit_ratio); + } - // ---------------------------------------------------------------- - - { - static RRDSET *st_pg_cache_hit_ratio = NULL; - static RRDDIM *rd_hit_ratio = NULL; - - if (unlikely(!st_pg_cache_hit_ratio)) { - st_pg_cache_hit_ratio = rrdset_create_localhost( - "netdata" - , "page_cache_hit_ratio" - , NULL - , "dbengine" - , NULL - , "Netdata DB engine page cache hit ratio" - , "percentage" - , "netdata" - , "stats" - , 130503 - , localhost->rrd_update_every - , RRDSET_TYPE_LINE - ); - - rd_hit_ratio = rrddim_add(st_pg_cache_hit_ratio, "ratio", NULL, 1, 1000, RRD_ALGORITHM_ABSOLUTE); + // 
---------------------------------------------------------------- + + { + static RRDSET *st_pg_cache_pages = NULL; + static RRDDIM *rd_descriptors = NULL; + static RRDDIM *rd_populated = NULL; + static RRDDIM *rd_dirty = NULL; + static RRDDIM *rd_backfills = NULL; + static RRDDIM *rd_evictions = NULL; + static RRDDIM *rd_used_by_collectors = NULL; + + if (unlikely(!st_pg_cache_pages)) { + st_pg_cache_pages = rrdset_create_localhost( + "netdata", + "page_cache_stats", + NULL, + "dbengine", + NULL, + "Netdata dbengine page cache statistics", + "pages", + "netdata", + "stats", + 130504, + localhost->rrd_update_every, + RRDSET_TYPE_LINE); + + rd_descriptors = rrddim_add(st_pg_cache_pages, "descriptors", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + rd_populated = rrddim_add(st_pg_cache_pages, "populated", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + rd_dirty = rrddim_add(st_pg_cache_pages, "dirty", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + rd_backfills = rrddim_add(st_pg_cache_pages, "backfills", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); + rd_evictions = rrddim_add(st_pg_cache_pages, "evictions", NULL, -1, 1, RRD_ALGORITHM_INCREMENTAL); + rd_used_by_collectors = + rrddim_add(st_pg_cache_pages, "used_by_collectors", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + } else + rrdset_next(st_pg_cache_pages); + + rrddim_set_by_pointer(st_pg_cache_pages, rd_descriptors, (collected_number)stats_array[27]); + rrddim_set_by_pointer(st_pg_cache_pages, rd_populated, (collected_number)stats_array[3]); + rrddim_set_by_pointer(st_pg_cache_pages, rd_dirty, (collected_number)stats_array[0] + stats_array[4]); + rrddim_set_by_pointer(st_pg_cache_pages, rd_backfills, (collected_number)stats_array[9]); + rrddim_set_by_pointer(st_pg_cache_pages, rd_evictions, (collected_number)stats_array[10]); + rrddim_set_by_pointer(st_pg_cache_pages, rd_used_by_collectors, (collected_number)stats_array[0]); + rrdset_done(st_pg_cache_pages); } - else - rrdset_next(st_pg_cache_hit_ratio); - - static unsigned long long old_hits = 0; - 
static unsigned long long old_misses = 0; - unsigned long long hits = stats_array[7]; - unsigned long long misses = stats_array[8]; - unsigned long long hits_delta; - unsigned long long misses_delta; - unsigned long long ratio; - - hits_delta = hits - old_hits; - misses_delta = misses - old_misses; - old_hits = hits; - old_misses = misses; - - if (hits_delta + misses_delta) { - ratio = (hits_delta * 100 * 1000) / (hits_delta + misses_delta); - } else { - ratio = 0; + + // ---------------------------------------------------------------- + + { + static RRDSET *st_long_term_pages = NULL; + static RRDDIM *rd_total = NULL; + static RRDDIM *rd_insertions = NULL; + static RRDDIM *rd_deletions = NULL; + static RRDDIM *rd_flushing_pressure_deletions = NULL; + + if (unlikely(!st_long_term_pages)) { + st_long_term_pages = rrdset_create_localhost( + "netdata", + "dbengine_long_term_page_stats", + NULL, + "dbengine", + NULL, + "Netdata dbengine long-term page statistics", + "pages", + "netdata", + "stats", + 130505, + localhost->rrd_update_every, + RRDSET_TYPE_LINE); + + rd_total = rrddim_add(st_long_term_pages, "total", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + rd_insertions = rrddim_add(st_long_term_pages, "insertions", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); + rd_deletions = rrddim_add(st_long_term_pages, "deletions", NULL, -1, 1, RRD_ALGORITHM_INCREMENTAL); + rd_flushing_pressure_deletions = rrddim_add( + st_long_term_pages, "flushing_pressure_deletions", NULL, -1, 1, RRD_ALGORITHM_INCREMENTAL); + } else + rrdset_next(st_long_term_pages); + + rrddim_set_by_pointer(st_long_term_pages, rd_total, (collected_number)stats_array[2]); + rrddim_set_by_pointer(st_long_term_pages, rd_insertions, (collected_number)stats_array[5]); + rrddim_set_by_pointer(st_long_term_pages, rd_deletions, (collected_number)stats_array[6]); + rrddim_set_by_pointer( + st_long_term_pages, rd_flushing_pressure_deletions, (collected_number)stats_array[36]); + rrdset_done(st_long_term_pages); } - 
rrddim_set_by_pointer(st_pg_cache_hit_ratio, rd_hit_ratio, ratio); - rrdset_done(st_pg_cache_hit_ratio); - } + // ---------------------------------------------------------------- + + { + static RRDSET *st_io_stats = NULL; + static RRDDIM *rd_reads = NULL; + static RRDDIM *rd_writes = NULL; + + if (unlikely(!st_io_stats)) { + st_io_stats = rrdset_create_localhost( + "netdata", + "dbengine_io_throughput", + NULL, + "dbengine", + NULL, + "Netdata DB engine I/O throughput", + "MiB/s", + "netdata", + "stats", + 130506, + localhost->rrd_update_every, + RRDSET_TYPE_LINE); + + rd_reads = rrddim_add(st_io_stats, "reads", NULL, 1, 1024 * 1024, RRD_ALGORITHM_INCREMENTAL); + rd_writes = rrddim_add(st_io_stats, "writes", NULL, -1, 1024 * 1024, RRD_ALGORITHM_INCREMENTAL); + } else + rrdset_next(st_io_stats); + + rrddim_set_by_pointer(st_io_stats, rd_reads, (collected_number)stats_array[17]); + rrddim_set_by_pointer(st_io_stats, rd_writes, (collected_number)stats_array[15]); + rrdset_done(st_io_stats); + } - // ---------------------------------------------------------------- - - { - static RRDSET *st_pg_cache_pages = NULL; - static RRDDIM *rd_descriptors = NULL; - static RRDDIM *rd_populated = NULL; - static RRDDIM *rd_dirty = NULL; - static RRDDIM *rd_backfills = NULL; - static RRDDIM *rd_evictions = NULL; - static RRDDIM *rd_used_by_collectors = NULL; - - if (unlikely(!st_pg_cache_pages)) { - st_pg_cache_pages = rrdset_create_localhost( - "netdata" - , "page_cache_stats" - , NULL - , "dbengine" - , NULL - , "Netdata dbengine page cache statistics" - , "pages" - , "netdata" - , "stats" - , 130504 - , localhost->rrd_update_every - , RRDSET_TYPE_LINE - ); - - rd_descriptors = rrddim_add(st_pg_cache_pages, "descriptors", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); - rd_populated = rrddim_add(st_pg_cache_pages, "populated", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); - rd_dirty = rrddim_add(st_pg_cache_pages, "dirty", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); - rd_backfills = 
rrddim_add(st_pg_cache_pages, "backfills", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); - rd_evictions = rrddim_add(st_pg_cache_pages, "evictions", NULL, -1, 1, RRD_ALGORITHM_INCREMENTAL); - rd_used_by_collectors = rrddim_add(st_pg_cache_pages, "used_by_collectors", NULL, 1, 1, - RRD_ALGORITHM_ABSOLUTE); + // ---------------------------------------------------------------- + + { + static RRDSET *st_io_stats = NULL; + static RRDDIM *rd_reads = NULL; + static RRDDIM *rd_writes = NULL; + + if (unlikely(!st_io_stats)) { + st_io_stats = rrdset_create_localhost( + "netdata", + "dbengine_io_operations", + NULL, + "dbengine", + NULL, + "Netdata DB engine I/O operations", + "operations/s", + "netdata", + "stats", + 130507, + localhost->rrd_update_every, + RRDSET_TYPE_LINE); + + rd_reads = rrddim_add(st_io_stats, "reads", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); + rd_writes = rrddim_add(st_io_stats, "writes", NULL, -1, 1, RRD_ALGORITHM_INCREMENTAL); + } else + rrdset_next(st_io_stats); + + rrddim_set_by_pointer(st_io_stats, rd_reads, (collected_number)stats_array[18]); + rrddim_set_by_pointer(st_io_stats, rd_writes, (collected_number)stats_array[16]); + rrdset_done(st_io_stats); } - else - rrdset_next(st_pg_cache_pages); - - rrddim_set_by_pointer(st_pg_cache_pages, rd_descriptors, (collected_number)stats_array[27]); - rrddim_set_by_pointer(st_pg_cache_pages, rd_populated, (collected_number)stats_array[3]); - rrddim_set_by_pointer(st_pg_cache_pages, rd_dirty, (collected_number)stats_array[0] + stats_array[4]); - rrddim_set_by_pointer(st_pg_cache_pages, rd_backfills, (collected_number)stats_array[9]); - rrddim_set_by_pointer(st_pg_cache_pages, rd_evictions, (collected_number)stats_array[10]); - rrddim_set_by_pointer(st_pg_cache_pages, rd_used_by_collectors, (collected_number)stats_array[0]); - rrdset_done(st_pg_cache_pages); - } - // ---------------------------------------------------------------- + // ---------------------------------------------------------------- + + { + static 
RRDSET *st_errors = NULL; + static RRDDIM *rd_fs_errors = NULL; + static RRDDIM *rd_io_errors = NULL; + static RRDDIM *pg_cache_over_half_dirty_events = NULL; + + if (unlikely(!st_errors)) { + st_errors = rrdset_create_localhost( + "netdata", + "dbengine_global_errors", + NULL, + "dbengine", + NULL, + "Netdata DB engine errors", + "errors/s", + "netdata", + "stats", + 130508, + localhost->rrd_update_every, + RRDSET_TYPE_LINE); + + rd_io_errors = rrddim_add(st_errors, "io_errors", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); + rd_fs_errors = rrddim_add(st_errors, "fs_errors", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); + pg_cache_over_half_dirty_events = + rrddim_add(st_errors, "pg_cache_over_half_dirty_events", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); + } else + rrdset_next(st_errors); + + rrddim_set_by_pointer(st_errors, rd_io_errors, (collected_number)stats_array[30]); + rrddim_set_by_pointer(st_errors, rd_fs_errors, (collected_number)stats_array[31]); + rrddim_set_by_pointer(st_errors, pg_cache_over_half_dirty_events, (collected_number)stats_array[34]); + rrdset_done(st_errors); + } - { - static RRDSET *st_long_term_pages = NULL; - static RRDDIM *rd_total = NULL; - static RRDDIM *rd_insertions = NULL; - static RRDDIM *rd_deletions = NULL; - static RRDDIM *rd_flushing_pressure_deletions = NULL; + // ---------------------------------------------------------------- + + { + static RRDSET *st_fd = NULL; + static RRDDIM *rd_fd_current = NULL; + static RRDDIM *rd_fd_max = NULL; + + if (unlikely(!st_fd)) { + st_fd = rrdset_create_localhost( + "netdata", + "dbengine_global_file_descriptors", + NULL, + "dbengine", + NULL, + "Netdata DB engine File Descriptors", + "descriptors", + "netdata", + "stats", + 130509, + localhost->rrd_update_every, + RRDSET_TYPE_LINE); + + rd_fd_current = rrddim_add(st_fd, "current", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + rd_fd_max = rrddim_add(st_fd, "max", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + } else + rrdset_next(st_fd); + + 
rrddim_set_by_pointer(st_fd, rd_fd_current, (collected_number)stats_array[32]); + /* Careful here, modify this accordingly if the File-Descriptor budget ever changes */ + rrddim_set_by_pointer(st_fd, rd_fd_max, (collected_number)rlimit_nofile.rlim_cur / 4); + rrdset_done(st_fd); + } - if (unlikely(!st_long_term_pages)) { - st_long_term_pages = rrdset_create_localhost( - "netdata" - , "dbengine_long_term_page_stats" - , NULL - , "dbengine" - , NULL - , "Netdata dbengine long-term page statistics" - , "pages" - , "netdata" - , "stats" - , 130505 - , localhost->rrd_update_every - , RRDSET_TYPE_LINE - ); - - rd_total = rrddim_add(st_long_term_pages, "total", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); - rd_insertions = rrddim_add(st_long_term_pages, "insertions", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); - rd_deletions = rrddim_add(st_long_term_pages, "deletions", NULL, -1, 1, RRD_ALGORITHM_INCREMENTAL); - rd_flushing_pressure_deletions = rrddim_add(st_long_term_pages, "flushing_pressure_deletions", NULL, -1, - 1, RRD_ALGORITHM_INCREMENTAL); + // ---------------------------------------------------------------- + + { + static RRDSET *st_ram_usage = NULL; + static RRDDIM *rd_cached = NULL; + static RRDDIM *rd_pinned = NULL; + static RRDDIM *rd_metadata = NULL; + + collected_number cached_pages, pinned_pages, API_producers, populated_pages, metadata, pages_on_disk, + page_cache_descriptors; + + if (unlikely(!st_ram_usage)) { + st_ram_usage = rrdset_create_localhost( + "netdata", + "dbengine_ram", + NULL, + "dbengine", + NULL, + "Netdata DB engine RAM usage", + "MiB", + "netdata", + "stats", + 130510, + localhost->rrd_update_every, + RRDSET_TYPE_STACKED); + + rd_cached = rrddim_add(st_ram_usage, "cache", NULL, 1, 256, RRD_ALGORITHM_ABSOLUTE); + rd_pinned = rrddim_add(st_ram_usage, "collectors", NULL, 1, 256, RRD_ALGORITHM_ABSOLUTE); + rd_metadata = rrddim_add(st_ram_usage, "metadata", NULL, 1, 1048576, RRD_ALGORITHM_ABSOLUTE); + } else + rrdset_next(st_ram_usage); + + API_producers 
= (collected_number)stats_array[0];
+                pages_on_disk = (collected_number)stats_array[2];
+                populated_pages = (collected_number)stats_array[3];
+                page_cache_descriptors = (collected_number)stats_array[27];
+
+                if (API_producers * 2 > populated_pages) {
+                    pinned_pages = API_producers;
+                } else {
+                    pinned_pages = API_producers * 2;
+                }
+                cached_pages = populated_pages - pinned_pages;
+
+                metadata = page_cache_descriptors * sizeof(struct page_cache_descr);
+                metadata += pages_on_disk * sizeof(struct rrdeng_page_descr);
+                /* This is an empirical estimation for Judy array indexing and extent structures */
+                metadata += pages_on_disk * 58;
+
+                rrddim_set_by_pointer(st_ram_usage, rd_cached, cached_pages);
+                rrddim_set_by_pointer(st_ram_usage, rd_pinned, pinned_pages);
+                rrddim_set_by_pointer(st_ram_usage, rd_metadata, metadata);
+                rrdset_done(st_ram_usage);
             }
-            else
-                rrdset_next(st_long_term_pages);
-
-            rrddim_set_by_pointer(st_long_term_pages, rd_total, (collected_number)stats_array[2]);
-            rrddim_set_by_pointer(st_long_term_pages, rd_insertions, (collected_number)stats_array[5]);
-            rrddim_set_by_pointer(st_long_term_pages, rd_deletions, (collected_number)stats_array[6]);
-            rrddim_set_by_pointer(st_long_term_pages, rd_flushing_pressure_deletions,
-                                  (collected_number)stats_array[36]);
-            rrdset_done(st_long_term_pages);
         }
+    }
+#endif
+}
+
+// Publish the min / max / average values reported by heartbeat_statistics()
+// on the "netdata.heartbeat" chart ("System clock jitter", in microseconds).
+static void update_heartbeat_charts(void) {
+    // The chart and its dimensions are created once and cached, the same way
+    // the other charts of this file do it; rrdset_next() is called only on the
+    // iterations that follow the creation.
+    static RRDSET *st = NULL;
+    static RRDDIM *rd_min = NULL;
+    static RRDDIM *rd_max = NULL;
+    static RRDDIM *rd_avg = NULL;
+
+    if(unlikely(!st)) {
+        st = rrdset_create_localhost(
+            "netdata"
+            , "heartbeat"
+            , NULL
+            , "heartbeat"
+            , NULL
+            , "System clock jitter"
+            , "microseconds"
+            , "netdata"
+            , "stats"
+            , 900000
+            , localhost->rrd_update_every
+            , RRDSET_TYPE_AREA
+        );
+
+        rd_min = rrddim_add(st, "min", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+        rd_max = rrddim_add(st, "max", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+        rd_avg = rrddim_add(st, "average", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+    }
+    else
+        rrdset_next(st);
+
+    usec_t min, max, average;
+    size_t count; // filled by heartbeat_statistics(), not charted
+
+    heartbeat_statistics(&min, &max, &average, &count);
+
+    rrddim_set_by_pointer(st, rd_min, (collected_number)min);
+    rrddim_set_by_pointer(st, rd_max, (collected_number)max);
+    rrddim_set_by_pointer(st, rd_avg, (collected_number)average);
+
+    rrdset_done(st);
+}
+
+// ---------------------------------------------------------------------------------------------------------------------
+// worker utilization
+
+// Counters of one job type of a worker group, with the chart dimensions they feed.
+struct worker_job_type {
+    char name[WORKER_UTILIZATION_MAX_JOB_NAME_LENGTH + 1]; // empty string = slot not in use
+    size_t jobs_started;
+    usec_t busy_time;
+
+    RRDDIM *rd_jobs_started;
+    RRDDIM *rd_busy_time;
+};
+
+// One worker thread of a group, kept in a singly linked list hanging off
+// struct worker_utilization (member 'threads').
+struct worker_thread {
+    pid_t pid;
+    int enabled;                // cleared on every reset, set again when the thread is reported
+
+    int cpu_enabled;            // set when utime/stime were read successfully in this iteration
+
+    kernel_uint_t utime;        // latest values read for this thread
+    kernel_uint_t stime;
+
+    kernel_uint_t utime_old;    // values of the previous collection
+    kernel_uint_t stime_old;
+
+    usec_t collected_time;      // when utime/stime were read
+    usec_t collected_time_old;
+
+    size_t jobs_started;
+    usec_t busy_time;
+
+    struct worker_thread *next;
+};
+
+struct worker_utilization {
+    const char *name;
+    const char *family;
+    size_t priority;
+    uint32_t flags;
+
+    char *name_lowercase;
+
+    struct worker_job_type per_job_type[WORKER_UTILIZATION_MAX_JOB_TYPES];
+
+    size_t workers_registered;
+    size_t workers_busy;
+    usec_t workers_total_busy_time;
+    usec_t workers_total_duration;
+    size_t workers_total_jobs_started;
+    double workers_min_busy_time;
+    double workers_max_busy_time;
 
-        // ----------------------------------------------------------------
+    struct worker_thread *threads;
 
-        {
-            static RRDSET *st_io_stats = NULL;
-            static RRDDIM *rd_reads = NULL;
-            static RRDDIM *rd_writes = NULL;
+    RRDSET *st_workers_time;
+    RRDDIM *rd_workers_time_avg;
+    RRDDIM *rd_workers_time_min;
+    RRDDIM *rd_workers_time_max;
 
-            if (unlikely(!st_io_stats)) {
-                st_io_stats = rrdset_create_localhost(
+    size_t workers_cpu_enabled;
+    RRDSET *st_workers_cpu;
+    RRDDIM *rd_workers_cpu_avg;
+    RRDDIM *rd_workers_cpu_min;
+    RRDDIM *rd_workers_cpu_max;
+
+    RRDSET *st_workers_threads;
+    RRDDIM *rd_workers_threads_free;
+    RRDDIM *rd_workers_threads_busy;
+
+    RRDSET *st_workers_jobs_per_job_type;
+    RRDSET
*st_workers_busy_per_job_type; +}; + +static void workers_utilization_update_chart(struct worker_utilization *wu) { + if(!wu->workers_registered) return; + + //fprintf(stderr, "%-12s WORKER UTILIZATION: %-3.2f%%, %zu jobs done, %zu running, on %zu workers, min %-3.02f%%, max %-3.02f%%.\n", + // wu->name, + // (double)wu->workers_total_busy_time * 100.0 / (double)wu->workers_total_duration, + // wu->workers_total_jobs_started, wu->workers_busy, wu->workers_registered, + // wu->workers_min_busy_time, wu->workers_max_busy_time); + + // ---------------------------------------------------------------------- + + if(unlikely(!wu->st_workers_time)) { + char name[RRD_ID_LENGTH_MAX + 1]; + snprintfz(name, RRD_ID_LENGTH_MAX, "workers_time_%s", wu->name_lowercase); + + char context[RRD_ID_LENGTH_MAX + 1]; + snprintf(context, RRD_ID_LENGTH_MAX, "netdata.workers.%s.time", wu->name_lowercase); + + wu->st_workers_time = rrdset_create_localhost( + "netdata" + , name + , NULL + , wu->family + , context + , "Netdata Workers Busy Time (100% = all workers busy)" + , "%" + , "netdata" + , "stats" + , wu->priority + , localhost->rrd_update_every + , RRDSET_TYPE_AREA + ); + } + + // we add the min and max dimensions only when we have multiple workers + + if(unlikely(!wu->rd_workers_time_min && wu->workers_registered > 1)) + wu->rd_workers_time_min = rrddim_add(wu->st_workers_time, "min", NULL, 1, 10000, RRD_ALGORITHM_ABSOLUTE); + + if(unlikely(!wu->rd_workers_time_max && wu->workers_registered > 1)) + wu->rd_workers_time_max = rrddim_add(wu->st_workers_time, "max", NULL, 1, 10000, RRD_ALGORITHM_ABSOLUTE); + + if(unlikely(!wu->rd_workers_time_avg)) + wu->rd_workers_time_avg = rrddim_add(wu->st_workers_time, "average", NULL, 1, 10000, RRD_ALGORITHM_ABSOLUTE); + + rrdset_next(wu->st_workers_time); + + if(wu->rd_workers_time_min) + rrddim_set_by_pointer(wu->st_workers_time, wu->rd_workers_time_min, (collected_number)((double)wu->workers_min_busy_time * 10000.0)); + + 
if(wu->rd_workers_time_max) + rrddim_set_by_pointer(wu->st_workers_time, wu->rd_workers_time_max, (collected_number)((double)wu->workers_max_busy_time * 10000.0)); + + rrddim_set_by_pointer(wu->st_workers_time, wu->rd_workers_time_avg, (collected_number)((double)wu->workers_total_busy_time * 100.0 * 10000.0 / (double)wu->workers_total_duration)); + rrdset_done(wu->st_workers_time); + + // ---------------------------------------------------------------------- + +#ifdef __linux__ + if(wu->workers_cpu_enabled || wu->st_workers_cpu) { + if(unlikely(!wu->st_workers_cpu)) { + char name[RRD_ID_LENGTH_MAX + 1]; + snprintfz(name, RRD_ID_LENGTH_MAX, "workers_cpu_%s", wu->name_lowercase); + + char context[RRD_ID_LENGTH_MAX + 1]; + snprintf(context, RRD_ID_LENGTH_MAX, "netdata.workers.%s.cpu", wu->name_lowercase); + + wu->st_workers_cpu = rrdset_create_localhost( "netdata" - , "dbengine_io_throughput" - , NULL - , "dbengine" + , name , NULL - , "Netdata DB engine I/O throughput" - , "MiB/s" + , wu->family + , context + , "Netdata Workers CPU Utilization (100% = all workers busy)" + , "%" , "netdata" , "stats" - , 130506 + , wu->priority + 1 , localhost->rrd_update_every - , RRDSET_TYPE_LINE - ); + , RRDSET_TYPE_AREA + ); + } - rd_reads = rrddim_add(st_io_stats, "reads", NULL, 1, 1024 * 1024, RRD_ALGORITHM_INCREMENTAL); - rd_writes = rrddim_add(st_io_stats, "writes", NULL, -1, 1024 * 1024, RRD_ALGORITHM_INCREMENTAL); - } - else - rrdset_next(st_io_stats); + if (unlikely(!wu->rd_workers_cpu_min && wu->workers_registered > 1)) + wu->rd_workers_cpu_min = rrddim_add(wu->st_workers_cpu, "min", NULL, 1, 10000ULL, RRD_ALGORITHM_ABSOLUTE); - rrddim_set_by_pointer(st_io_stats, rd_reads, (collected_number)stats_array[17]); - rrddim_set_by_pointer(st_io_stats, rd_writes, (collected_number)stats_array[15]); - rrdset_done(st_io_stats); - } + if (unlikely(!wu->rd_workers_cpu_max && wu->workers_registered > 1)) + wu->rd_workers_cpu_max = rrddim_add(wu->st_workers_cpu, "max", NULL, 1, 
10000ULL, RRD_ALGORITHM_ABSOLUTE); - // ---------------------------------------------------------------- - - { - static RRDSET *st_io_stats = NULL; - static RRDDIM *rd_reads = NULL; - static RRDDIM *rd_writes = NULL; - - if (unlikely(!st_io_stats)) { - st_io_stats = rrdset_create_localhost( - "netdata" - , "dbengine_io_operations" - , NULL - , "dbengine" - , NULL - , "Netdata DB engine I/O operations" - , "operations/s" - , "netdata" - , "stats" - , 130507 - , localhost->rrd_update_every - , RRDSET_TYPE_LINE - ); - - rd_reads = rrddim_add(st_io_stats, "reads", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); - rd_writes = rrddim_add(st_io_stats, "writes", NULL, -1, 1, RRD_ALGORITHM_INCREMENTAL); - } - else - rrdset_next(st_io_stats); + if(unlikely(!wu->rd_workers_cpu_avg)) + wu->rd_workers_cpu_avg = rrddim_add(wu->st_workers_cpu, "average", NULL, 1, 10000ULL, RRD_ALGORITHM_ABSOLUTE); - rrddim_set_by_pointer(st_io_stats, rd_reads, (collected_number)stats_array[18]); - rrddim_set_by_pointer(st_io_stats, rd_writes, (collected_number)stats_array[16]); - rrdset_done(st_io_stats); - } + rrdset_next(wu->st_workers_cpu); - // ---------------------------------------------------------------- - - { - static RRDSET *st_errors = NULL; - static RRDDIM *rd_fs_errors = NULL; - static RRDDIM *rd_io_errors = NULL; - static RRDDIM *pg_cache_over_half_dirty_events = NULL; - - if (unlikely(!st_errors)) { - st_errors = rrdset_create_localhost( - "netdata" - , "dbengine_global_errors" - , NULL - , "dbengine" - , NULL - , "Netdata DB engine errors" - , "errors/s" - , "netdata" - , "stats" - , 130508 - , localhost->rrd_update_every - , RRDSET_TYPE_LINE - ); - - rd_io_errors = rrddim_add(st_errors, "io_errors", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); - rd_fs_errors = rrddim_add(st_errors, "fs_errors", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); - pg_cache_over_half_dirty_events = rrddim_add(st_errors, "pg_cache_over_half_dirty_events", NULL, 1, 1, - RRD_ALGORITHM_INCREMENTAL); - } - else - 
rrdset_next(st_errors); + size_t count = 0; + calculated_number min = 1000.0, max = 0.0, total = 0.0; + struct worker_thread *wt; + for(wt = wu->threads; wt ; wt = wt->next) { + if(!wt->cpu_enabled) continue; + count++; - rrddim_set_by_pointer(st_errors, rd_io_errors, (collected_number)stats_array[30]); - rrddim_set_by_pointer(st_errors, rd_fs_errors, (collected_number)stats_array[31]); - rrddim_set_by_pointer(st_errors, pg_cache_over_half_dirty_events, (collected_number)stats_array[34]); - rrdset_done(st_errors); + usec_t delta = wt->collected_time - wt->collected_time_old; + calculated_number utime = (calculated_number)(wt->utime - wt->utime_old) / (calculated_number)system_hz * 100.0 * (calculated_number)USEC_PER_SEC / (calculated_number)delta; + calculated_number stime = (calculated_number)(wt->stime - wt->stime_old) / (calculated_number)system_hz * 100.0 * (calculated_number)USEC_PER_SEC / (calculated_number)delta; + calculated_number cpu_util = utime + stime; + + total += cpu_util; + if(cpu_util < min) min = cpu_util; + if(cpu_util > max) max = cpu_util; } + if(unlikely(min == 1000.0)) min = 0.0; + + if(wu->rd_workers_cpu_min) + rrddim_set_by_pointer(wu->st_workers_cpu, wu->rd_workers_cpu_min, (collected_number)(min * 10000ULL)); + + if(wu->rd_workers_cpu_max) + rrddim_set_by_pointer(wu->st_workers_cpu, wu->rd_workers_cpu_max, (collected_number)(max * 10000ULL)); - // ---------------------------------------------------------------- - - { - static RRDSET *st_fd = NULL; - static RRDDIM *rd_fd_current = NULL; - static RRDDIM *rd_fd_max = NULL; - - if (unlikely(!st_fd)) { - st_fd = rrdset_create_localhost( - "netdata" - , "dbengine_global_file_descriptors" - , NULL - , "dbengine" - , NULL - , "Netdata DB engine File Descriptors" - , "descriptors" - , "netdata" - , "stats" - , 130509 - , localhost->rrd_update_every - , RRDSET_TYPE_LINE - ); - - rd_fd_current = rrddim_add(st_fd, "current", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); - rd_fd_max = rrddim_add(st_fd, "max", 
NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + rrddim_set_by_pointer(wu->st_workers_cpu, wu->rd_workers_cpu_avg, (collected_number)( total * 10000ULL / (calculated_number)count )); + rrdset_done(wu->st_workers_cpu); + } +#endif + + // ---------------------------------------------------------------------- + + if(unlikely(!wu->st_workers_jobs_per_job_type)) { + char name[RRD_ID_LENGTH_MAX + 1]; + snprintfz(name, RRD_ID_LENGTH_MAX, "workers_jobs_by_type_%s", wu->name_lowercase); + + char context[RRD_ID_LENGTH_MAX + 1]; + snprintf(context, RRD_ID_LENGTH_MAX, "netdata.workers.%s.jobs_started_by_type", wu->name_lowercase); + + wu->st_workers_jobs_per_job_type = rrdset_create_localhost( + "netdata" + , name + , NULL + , wu->family + , context + , "Netdata Workers Jobs Started by Type" + , "jobs" + , "netdata" + , "stats" + , wu->priority + 2 + , localhost->rrd_update_every + , RRDSET_TYPE_STACKED + ); + } + + rrdset_next(wu->st_workers_jobs_per_job_type); + + { + size_t i; + for(i = 0; i < WORKER_UTILIZATION_MAX_JOB_TYPES ;i++) { + if (wu->per_job_type[i].name[0]) { + + if(unlikely(!wu->per_job_type[i].rd_jobs_started)) + wu->per_job_type[i].rd_jobs_started = rrddim_add(wu->st_workers_jobs_per_job_type, wu->per_job_type[i].name, NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + + rrddim_set_by_pointer(wu->st_workers_jobs_per_job_type, wu->per_job_type[i].rd_jobs_started, (collected_number)(wu->per_job_type[i].jobs_started)); } - else - rrdset_next(st_fd); + } + } - rrddim_set_by_pointer(st_fd, rd_fd_current, (collected_number)stats_array[32]); - /* Careful here, modify this accordingly if the File-Descriptor budget ever changes */ - rrddim_set_by_pointer(st_fd, rd_fd_max, (collected_number)rlimit_nofile.rlim_cur / 4); - rrdset_done(st_fd); + rrdset_done(wu->st_workers_jobs_per_job_type); + + // ---------------------------------------------------------------------- + + if(unlikely(!wu->st_workers_busy_per_job_type)) { + char name[RRD_ID_LENGTH_MAX + 1]; + snprintfz(name, RRD_ID_LENGTH_MAX, 
"workers_busy_time_by_type_%s", wu->name_lowercase); + + char context[RRD_ID_LENGTH_MAX + 1]; + snprintf(context, RRD_ID_LENGTH_MAX, "netdata.workers.%s.time_by_type", wu->name_lowercase); + + wu->st_workers_busy_per_job_type = rrdset_create_localhost( + "netdata" + , name + , NULL + , wu->family + , context + , "Netdata Workers Busy Time by Type" + , "ms" + , "netdata" + , "stats" + , wu->priority + 3 + , localhost->rrd_update_every + , RRDSET_TYPE_STACKED + ); + } + + rrdset_next(wu->st_workers_busy_per_job_type); + + { + size_t i; + for(i = 0; i < WORKER_UTILIZATION_MAX_JOB_TYPES ;i++) { + if (wu->per_job_type[i].name[0]) { + + if(unlikely(!wu->per_job_type[i].rd_busy_time)) + wu->per_job_type[i].rd_busy_time = rrddim_add(wu->st_workers_busy_per_job_type, wu->per_job_type[i].name, NULL, 1, USEC_PER_MS, RRD_ALGORITHM_ABSOLUTE); + + rrddim_set_by_pointer(wu->st_workers_busy_per_job_type, wu->per_job_type[i].rd_busy_time, (collected_number)(wu->per_job_type[i].busy_time)); + } } + } - // ---------------------------------------------------------------- + rrdset_done(wu->st_workers_busy_per_job_type); - { - static RRDSET *st_ram_usage = NULL; - static RRDDIM *rd_cached = NULL; - static RRDDIM *rd_pinned = NULL; - static RRDDIM *rd_metadata = NULL; + // ---------------------------------------------------------------------- - collected_number cached_pages, pinned_pages, API_producers, populated_pages, metadata, pages_on_disk, - page_cache_descriptors; + if(wu->st_workers_threads || wu->workers_registered > 1) { + if(unlikely(!wu->st_workers_threads)) { + char name[RRD_ID_LENGTH_MAX + 1]; + snprintfz(name, RRD_ID_LENGTH_MAX, "workers_threads_%s", wu->name_lowercase); - if (unlikely(!st_ram_usage)) { - st_ram_usage = rrdset_create_localhost( + char context[RRD_ID_LENGTH_MAX + 1]; + snprintf(context, RRD_ID_LENGTH_MAX, "netdata.workers.%s.threads", wu->name_lowercase); + + wu->st_workers_threads = rrdset_create_localhost( "netdata" - , "dbengine_ram" - , NULL - , 
"dbengine" + , name , NULL - , "Netdata DB engine RAM usage" - , "MiB" + , wu->family + , context + , "Netdata Workers Threads" + , "threads" , "netdata" , "stats" - , 130510 + , wu->priority + 4 , localhost->rrd_update_every , RRDSET_TYPE_STACKED - ); + ); - rd_cached = rrddim_add(st_ram_usage, "cache", NULL, 1, 256, RRD_ALGORITHM_ABSOLUTE); - rd_pinned = rrddim_add(st_ram_usage, "collectors", NULL, 1, 256, RRD_ALGORITHM_ABSOLUTE); - rd_metadata = rrddim_add(st_ram_usage, "metadata", NULL, 1, 1048576, RRD_ALGORITHM_ABSOLUTE); - } - else - rrdset_next(st_ram_usage); - - API_producers = (collected_number)stats_array[0]; - pages_on_disk = (collected_number)stats_array[2]; - populated_pages = (collected_number)stats_array[3]; - page_cache_descriptors = (collected_number)stats_array[27]; - - if (API_producers * 2 > populated_pages) { - pinned_pages = API_producers; - } else{ - pinned_pages = API_producers * 2; - } - cached_pages = populated_pages - pinned_pages; + wu->rd_workers_threads_free = rrddim_add(wu->st_workers_threads, "free", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + wu->rd_workers_threads_busy = rrddim_add(wu->st_workers_threads, "busy", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + } + else + rrdset_next(wu->st_workers_threads); - metadata = page_cache_descriptors * sizeof(struct page_cache_descr); - metadata += pages_on_disk * sizeof(struct rrdeng_page_descr); - /* This is an empirical estimation for Judy array indexing and extent structures */ - metadata += pages_on_disk * 58; + rrddim_set_by_pointer(wu->st_workers_threads, wu->rd_workers_threads_free, (collected_number)(wu->workers_registered - wu->workers_busy)); + rrddim_set_by_pointer(wu->st_workers_threads, wu->rd_workers_threads_busy, (collected_number)(wu->workers_busy)); + rrdset_done(wu->st_workers_threads); + } +} - rrddim_set_by_pointer(st_ram_usage, rd_cached, cached_pages); - rrddim_set_by_pointer(st_ram_usage, rd_pinned, pinned_pages); - rrddim_set_by_pointer(st_ram_usage, rd_metadata, metadata); - 
rrdset_done(st_ram_usage);
+// Clear the per-iteration aggregates of a worker group, so that the next
+// workers_foreach() pass can accumulate them again. On the first call it also
+// builds wu->name_lowercase, the lowercase copy of wu->name used in chart ids.
+static void workers_utilization_reset_statistics(struct worker_utilization *wu) {
+    wu->workers_registered = 0;
+    wu->workers_busy = 0;
+    wu->workers_total_busy_time = 0;
+    wu->workers_total_duration = 0;
+    wu->workers_total_jobs_started = 0;
+    wu->workers_min_busy_time = 100.0; // starts at the maximum, lowered by the callback
+    wu->workers_max_busy_time = 0;
+    wu->workers_cpu_enabled = 0;
+
+    // one-time initialization; it does not depend on the job types,
+    // so it is done outside the loop below
+    if(unlikely(!wu->name_lowercase)) {
+        wu->name_lowercase = strdupz(wu->name);
+        char *s;
+        for(s = wu->name_lowercase; *s ; s++)
+            *s = (char)tolower((unsigned char)*s); // tolower() needs an unsigned char value
     }
+
+    size_t i;
+    for(i = 0; i < WORKER_UTILIZATION_MAX_JOB_TYPES ;i++) {
+        wu->per_job_type[i].jobs_started = 0;
+        wu->per_job_type[i].busy_time = 0;
+    }
+
+    // threads that are not reported again by the next pass stay disabled
+    // and are freed by workers_threads_cleanup()
+    struct worker_thread *wt;
+    for(wt = wu->threads; wt ; wt = wt->next) {
+        wt->enabled = 0;
+        wt->cpu_enabled = 0;
     }
+}
+
+// Read the user and system CPU time of a thread of this process from
+// /proc/self/task/<pid>/stat (words 13 and 14 of its first line).
+// Returns 0 on success, -1 when the file cannot be opened or read, and 1
+// (with *utime and *stime zeroed) when not compiled for Linux.
+// NOTE(review): the file is split on spaces, so the word indexes presumably
+// assume a thread name without spaces - confirm.
+static int read_thread_cpu_time_from_proc_stat(pid_t pid __maybe_unused, kernel_uint_t *utime __maybe_unused, kernel_uint_t *stime __maybe_unused) {
+#ifdef __linux__
+    char filename[200 + 1];
+    snprintfz(filename, 200, "/proc/self/task/%d/stat", pid);
+
+    procfile *ff = procfile_open(filename, " ", PROCFILE_FLAG_NO_ERROR_ON_FILE_IO);
+    if(!ff) return -1;
+
+    ff = procfile_readall(ff);
+    if(!ff) return -1;
+
+    *utime = str2kernel_uint_t(procfile_lineword(ff, 0, 13));
+    *stime = str2kernel_uint_t(procfile_lineword(ff, 0, 14));
+
+    procfile_close(ff);
+    return 0;
+#else
+    // TODO: add here cpu time detection per thread, for FreeBSD and MacOS
+    *utime = 0;
+    *stime = 0;
+    return 1;
 #endif
+}
+
+// Unlink and free every thread of the group that is not marked enabled.
+static void workers_threads_cleanup(struct worker_utilization *wu) {
+    // Walk the list through a pointer to the link that leads to the current
+    // node. After a removal the same link is examined again, so the head of
+    // the list and runs of consecutive disabled threads need no special case.
+    struct worker_thread **link = &wu->threads;
+
+    while(*link) {
+        struct worker_thread *t = *link;
+
+        if(t->enabled) {
+            link = &t->next;
+            continue;
+        }
+
+        *link = t->next;
+        t->next = NULL;
+        freez(t);
+    }
+}
+
+// Return the thread with the given pid, or NULL when the group does not have it.
+static struct worker_thread *worker_thread_find(struct worker_utilization *wu, pid_t pid) {
+    struct worker_thread *wt;
+    for(wt = wu->threads; wt && wt->pid != pid ; wt = wt->next) ;
+    return wt;
+}
+
+// Allocate a zeroed thread entry for pid and link it at the head of the list.
+static struct worker_thread *worker_thread_create(struct worker_utilization *wu, pid_t pid) {
+    struct worker_thread *wt;
+
+    wt = callocz(1, sizeof(*wt));
+    wt->pid = pid;
+
+    // link it
+    wt->next = wu->threads;
+    wu->threads = wt;
+    return wt;
 }
 
+static struct worker_thread *worker_thread_find_or_create(struct worker_utilization *wu, pid_t pid) {
+    struct worker_thread *wt;
+    wt = worker_thread_find(wu, pid);
+    if(!wt) wt = worker_thread_create(wu, pid);
+
+    return wt;
+}
+
+// Callback given to workers_foreach(): folds the numbers of one worker thread
+// into the aggregates of its group (ptr is the struct worker_utilization).
+static void worker_utilization_charts_callback(void *ptr, pid_t pid __maybe_unused, const char *thread_tag __maybe_unused, size_t utilization_usec __maybe_unused, size_t duration_usec __maybe_unused, size_t jobs_started __maybe_unused, size_t is_running __maybe_unused, const char **job_types_names __maybe_unused, size_t *job_types_jobs_started __maybe_unused, usec_t *job_types_busy_time __maybe_unused) {
+    struct worker_utilization *wu = (struct worker_utilization *)ptr;
+
+    // find the worker_thread in the list
+    struct worker_thread *wt = worker_thread_find_or_create(wu, pid);
+
+    wt->enabled = 1;
+    wt->busy_time = utilization_usec;
+    wt->jobs_started = jobs_started;
+
+    // keep the previous CPU sample, the chart needs the difference
+    wt->utime_old = wt->utime;
+    wt->stime_old = wt->stime;
+    wt->collected_time_old = wt->collected_time;
+
+    wu->workers_total_busy_time += utilization_usec;
+    wu->workers_total_duration += duration_usec;
+    wu->workers_total_jobs_started += jobs_started;
+    wu->workers_busy += is_running;
+    wu->workers_registered++;
+
+    // a zero duration would produce inf/NaN, which later gets cast to an integer
+    double util = 0.0;
+    if(likely(duration_usec))
+        util = (double)utilization_usec * 100.0 / (double)duration_usec;
+
+    if(util > wu->workers_max_busy_time)
+        wu->workers_max_busy_time = util;
+
+    if(util < wu->workers_min_busy_time)
+        wu->workers_min_busy_time = util;
+
+    // accumulate per job type statistics
+    size_t i;
+    for(i = 0; i < WORKER_UTILIZATION_MAX_JOB_TYPES ;i++) {
+        wu->per_job_type[i].jobs_started += job_types_jobs_started[i];
+        wu->per_job_type[i].busy_time += job_types_busy_time[i];
+
+        // new job type found
+        if(unlikely(!wu->per_job_type[i].name[0] && job_types_names[i]))
+            strncpyz(wu->per_job_type[i].name, job_types_names[i], WORKER_UTILIZATION_MAX_JOB_NAME_LENGTH);
+    }
+
+    // find its CPU utilization
+    if((!read_thread_cpu_time_from_proc_stat(pid, &wt->utime, &wt->stime))) {
+        wt->cpu_enabled = 1;
+        wt->collected_time = now_realtime_usec();
+    }
+    wu->workers_cpu_enabled += wt->cpu_enabled;
+}
+
+static struct worker_utilization all_workers_utilization[] = {
+    { .name = "STATS", .family = "workers global statistics", .priority = 1000000 },
+    { .name = "HEALTH", .family = "workers health alarms", .priority = 1000000 },
+    { .name = "MLTRAIN", .family = "workers ML training", .priority = 1000000 },
+    { .name = "MLDETECT", .family = "workers ML detection", .priority = 1000000 },
+    { .name = "STREAMRCV", .family = "workers streaming receive", .priority = 1000000 },
+    { .name = "STREAMSND", .family = "workers streaming send", .priority = 1000000 },
+    { .name = "DBENGINE", .family = "workers dbengine instances", .priority = 1000000 },
+    { .name = "WEB", .family = "workers web server", .priority = 1000000 },
+    { .name = "ACLKQUERY", .family = "workers aclk query", .priority = 1000000 },
+    { .name = "ACLKSYNC", .family = "workers aclk host sync", .priority = 1000000 },
+    { .name = "PLUGINSD", .family = "workers plugins.d", .priority = 1000000 },
+    { .name = "STATSD", .family = "workers plugin statsd", .priority = 1000000 },
+    { .name = "STATSDFLUSH", .family = "workers plugin statsd flush", .priority = 1000000 },
+    { .name = "PROC", .family = "workers plugin proc", .priority = 1000000 },
+    { .name = "FREEBSD", .family = "workers plugin freebsd", .priority = 1000000 },
+    { .name = "MACOS", .family = "workers plugin macos", .priority = 1000000 },
+    { .name = "CGROUPS", .family = "workers plugin cgroups", .priority = 1000000 },
+    { .name = "CGROUPSDISC", .family = "workers plugin cgroups find", .priority = 1000000 },
+    { .name = "DISKSPACE", .family = "workers plugin diskspace", .priority = 1000000 },
+    { .name = "TC", .family = "workers plugin tc", .priority = 1000000 },
+    { .name = "TIMEX", .family = "workers plugin timex", .priority = 1000000 },
+    { .name = "IDLEJITTER", .family = "workers plugin idlejitter", .priority = 1000000 },
+
+    // has to be terminated with a NULL
+    { .name = NULL, .family = NULL }
+};
+
+// One collection pass over all the worker groups of all_workers_utilization[]:
+// reset, collect through workers_foreach(), chart, then drop the threads that
+// were not reported in this pass.
+static void worker_utilization_charts(void) {
+    static size_t iterations = 0;
+    iterations++;
+
+    int i;
+    for(i = 0; all_workers_utilization[i].name ;i++) {
+        struct worker_utilization *wu = &all_workers_utilization[i];
+
+        workers_utilization_reset_statistics(wu);
+        workers_foreach(wu->name, worker_utilization_charts_callback, wu);
+
+        // skip the first iteration, so that we don't accumulate startup utilization to our charts
+        if(likely(iterations > 1))
+            workers_utilization_update_chart(wu);
+
+        workers_threads_cleanup(wu);
+    }
+}
+
+// Release what the worker groups allocated: the lowercase names and the thread lists.
+static void worker_utilization_finish(void) {
+    int i;
+    for(i = 0; all_workers_utilization[i].name ;i++) {
+        struct worker_utilization *wu = &all_workers_utilization[i];
+
+        if(wu->name_lowercase) {
+            freez(wu->name_lowercase);
+            wu->name_lowercase = NULL;
+        }
+
+        // mark all threads as not enabled
+        struct worker_thread *t;
+        for(t = wu->threads; t ; t = t->next) t->enabled = 0;
+
+        // let the cleanup job free them
+        workers_threads_cleanup(wu);
+    }
+}
+
+// ---------------------------------------------------------------------------------------------------------------------
+
 static void global_statistics_cleanup(void *ptr)
 {
+    worker_unregister();
+
     struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
     static_thread->enabled =
NETDATA_MAIN_THREAD_EXITING; info("cleaning up..."); + worker_utilization_finish(); + static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; } void *global_statistics_main(void *ptr) { + worker_register("STATS"); + worker_register_job_name(WORKER_JOB_GLOBAL, "global"); + worker_register_job_name(WORKER_JOB_REGISTRY, "registry"); + worker_register_job_name(WORKER_JOB_WORKERS, "workers"); + worker_register_job_name(WORKER_JOB_DBENGINE, "dbengine"); + netdata_thread_cleanup_push(global_statistics_cleanup, ptr); int update_every = @@ -876,10 +1444,23 @@ void *global_statistics_main(void *ptr) heartbeat_t hb; heartbeat_init(&hb); while (!netdata_exit) { + worker_is_idle(); heartbeat_next(&hb, step); + worker_is_busy(WORKER_JOB_WORKERS); + worker_utilization_charts(); + + worker_is_busy(WORKER_JOB_GLOBAL); global_statistics_charts(); + + worker_is_busy(WORKER_JOB_REGISTRY); registry_statistics(); + + worker_is_busy(WORKER_JOB_DBENGINE); + dbengine_statistics_charts(); + + worker_is_busy(WORKER_JOB_HEARTBEAT); + update_heartbeat_charts(); } netdata_thread_cleanup_pop(1); diff --git a/database/engine/rrdengine.c b/database/engine/rrdengine.c index 9f5f564cc0ab30..9f43f445689140 100644 --- a/database/engine/rrdengine.c +++ b/database/engine/rrdengine.c @@ -11,6 +11,10 @@ rrdeng_stats_t global_flushing_pressure_page_deletions = 0; static unsigned pages_per_extent = MAX_PAGES_PER_EXTENT; +#if WORKER_UTILIZATION_MAX_JOB_TYPES < (RRDENG_MAX_OPCODE + 2) +#error Please increase WORKER_UTILIZATION_MAX_JOB_TYPES to at least (RRDENG_MAX_OPCODE + 2) +#endif + void *dbengine_page_alloc() { void *page = netdata_mmap(NULL, RRDENG_BLOCK_SIZE, MAP_PRIVATE, enable_ksm); if(!page) fatal("Cannot allocate dbengine page cache page, with mmap()"); @@ -23,6 +27,8 @@ void dbengine_page_free(void *page) { static void sanity_check(void) { + BUILD_BUG_ON(WORKER_UTILIZATION_MAX_JOB_TYPES < (RRDENG_MAX_OPCODE + 2)); + /* Magic numbers must fit in the super-blocks */ BUILD_BUG_ON(strlen(RRDENG_DF_MAGIC) 
> RRDENG_MAGIC_SZ); BUILD_BUG_ON(strlen(RRDENG_JF_MAGIC) > RRDENG_MAGIC_SZ); @@ -1085,13 +1091,17 @@ void async_cb(uv_async_t *handle) void timer_cb(uv_timer_t* handle) { + worker_is_busy(RRDENG_MAX_OPCODE + 1); + struct rrdengine_worker_config* wc = handle->data; struct rrdengine_instance *ctx = wc->ctx; uv_stop(handle->loop); uv_update_time(handle->loop); - if (unlikely(!ctx->metalog_ctx->initialized)) + if (unlikely(!ctx->metalog_ctx->initialized)) { + worker_is_idle(); return; /* Wait for the metadata log to initialize */ + } rrdeng_test_quota(wc); debug(D_RRDENGINE, "%s: timeout reached.", __func__); if (likely(!wc->now_deleting_files && !wc->now_invalidating_dirty_pages)) { @@ -1133,12 +1143,26 @@ void timer_cb(uv_timer_t* handle) debug(D_RRDENGINE, "%s", get_rrdeng_statistics(wc->ctx, buf, sizeof(buf))); } #endif + + worker_is_idle(); } #define MAX_CMD_BATCH_SIZE (256) void rrdeng_worker(void* arg) { + worker_register("DBENGINE"); + worker_register_job_name(RRDENG_NOOP, "noop"); + worker_register_job_name(RRDENG_READ_PAGE, "page read"); + worker_register_job_name(RRDENG_READ_EXTENT, "extent read"); + worker_register_job_name(RRDENG_COMMIT_PAGE, "commit"); + worker_register_job_name(RRDENG_FLUSH_PAGES, "flush"); + worker_register_job_name(RRDENG_SHUTDOWN, "shutdown"); + worker_register_job_name(RRDENG_INVALIDATE_OLDEST_MEMORY_PAGE, "page lru"); + worker_register_job_name(RRDENG_QUIESCE, "quiesce"); + worker_register_job_name(RRDENG_MAX_OPCODE, "cleanup"); + worker_register_job_name(RRDENG_MAX_OPCODE + 1, "timer"); + struct rrdengine_worker_config* wc = arg; struct rrdengine_instance *ctx = wc->ctx; uv_loop_t* loop; @@ -1188,7 +1212,9 @@ void rrdeng_worker(void* arg) shutdown = 0; int set_name = 0; while (likely(shutdown == 0 || rrdeng_threads_alive(wc))) { + worker_is_idle(); uv_run(loop, UV_RUN_DEFAULT); + worker_is_busy(RRDENG_MAX_OPCODE); rrdeng_cleanup_finished_threads(wc); /* wait for commands */ @@ -1205,6 +1231,9 @@ void rrdeng_worker(void* arg) opcode 
= cmd.opcode; ++cmd_batch_size; + if(likely(opcode != RRDENG_NOOP)) + worker_is_busy(opcode); + switch (opcode) { case RRDENG_NOOP: /* the command queue was empty, do nothing */ @@ -1281,6 +1310,7 @@ void rrdeng_worker(void* arg) fatal_assert(0 == uv_loop_close(loop)); freez(loop); + worker_unregister(); return; error_after_timer_init: @@ -1293,6 +1323,7 @@ void rrdeng_worker(void* arg) wc->error = UV_EAGAIN; /* wake up initialization thread */ completion_mark_complete(&ctx->rrdengine_completion); + worker_unregister(); } /* C entry point for development purposes diff --git a/database/sqlite/sqlite_aclk.c b/database/sqlite/sqlite_aclk.c index 9945e18ad2e493..daa5fbaac377ac 100644 --- a/database/sqlite/sqlite_aclk.c +++ b/database/sqlite/sqlite_aclk.c @@ -10,6 +10,11 @@ #include "../../aclk/aclk.h" #endif +void sanity_check(void) { + // make sure the compiler will stop on misconfigurations + BUILD_BUG_ON(WORKER_UTILIZATION_MAX_JOB_TYPES < ACLK_MAX_ENUMERATIONS_DEFINED); +} + const char *aclk_sync_config[] = { "CREATE TABLE IF NOT EXISTS dimension_delete (dimension_id blob, dimension_name text, chart_type_id text, " "dim_id blob, chart_id blob, host_id blob, date_created);", @@ -352,6 +357,29 @@ static void timer_cb(uv_timer_t* handle) void aclk_database_worker(void *arg) { + worker_register("ACLKSYNC"); + worker_register_job_name(ACLK_DATABASE_NOOP, "noop"); +#ifdef ENABLE_NEW_CLOUD_PROTOCOL + worker_register_job_name(ACLK_DATABASE_ADD_CHART, "chart add"); + worker_register_job_name(ACLK_DATABASE_ADD_DIMENSION, "dimension add"); + worker_register_job_name(ACLK_DATABASE_PUSH_CHART, "chart push"); + worker_register_job_name(ACLK_DATABASE_PUSH_CHART_CONFIG, "chart conf push"); + worker_register_job_name(ACLK_DATABASE_RESET_CHART, "chart reset"); + worker_register_job_name(ACLK_DATABASE_CHART_ACK, "chart ack"); + worker_register_job_name(ACLK_DATABASE_UPD_RETENTION, "retention check"); + worker_register_job_name(ACLK_DATABASE_DIM_DELETION, "dimension delete"); + 
worker_register_job_name(ACLK_DATABASE_ORPHAN_HOST, "node orphan"); +#endif + worker_register_job_name(ACLK_DATABASE_ALARM_HEALTH_LOG, "alert log"); + worker_register_job_name(ACLK_DATABASE_CLEANUP, "cleanup"); + worker_register_job_name(ACLK_DATABASE_DELETE_HOST, "node delete"); + worker_register_job_name(ACLK_DATABASE_NODE_INFO, "node info"); + worker_register_job_name(ACLK_DATABASE_PUSH_ALERT, "alert push"); + worker_register_job_name(ACLK_DATABASE_PUSH_ALERT_CONFIG, "alert conf push"); + worker_register_job_name(ACLK_DATABASE_PUSH_ALERT_SNAPSHOT, "alert snapshot"); + worker_register_job_name(ACLK_DATABASE_QUEUE_REMOVED_ALERTS, "alerts check"); + worker_register_job_name(ACLK_DATABASE_TIMER, "timer"); + struct aclk_database_worker_config *wc = arg; uv_loop_t *loop; int ret; @@ -413,6 +441,7 @@ void aclk_database_worker(void *arg) debug(D_ACLK_SYNC,"Node %s reports pending message count = %u", wc->node_id, wc->chart_payload_count); while (likely(!netdata_exit)) { + worker_is_idle(); uv_run(loop, UV_RUN_DEFAULT); /* wait for commands */ @@ -427,6 +456,10 @@ void aclk_database_worker(void *arg) opcode = cmd.opcode; ++cmd_batch_size; + + if(likely(opcode != ACLK_DATABASE_NOOP)) + worker_is_busy(opcode); + switch (opcode) { case ACLK_DATABASE_NOOP: /* the command queue was empty, do nothing */ @@ -439,6 +472,7 @@ void aclk_database_worker(void *arg) if (wc->host == localhost) sql_check_aclk_table_list(wc); break; + case ACLK_DATABASE_DELETE_HOST: debug(D_ACLK_SYNC,"Cleaning ACLK tables for %s", (char *) cmd.data); sql_delete_aclk_table_list(wc, cmd); @@ -577,6 +611,8 @@ void aclk_database_worker(void *arg) wc->host->dbsync_worker = NULL; freez(wc); rrd_unlock(); + + worker_unregister(); return; error_after_timer_init: @@ -585,6 +621,7 @@ void aclk_database_worker(void *arg) fatal_assert(0 == uv_loop_close(loop)); error_after_loop_init: freez(loop); + worker_unregister(); } // ------------------------------------------------------------- diff --git 
a/database/sqlite/sqlite_aclk.h b/database/sqlite/sqlite_aclk.h index 77cda309082fb8..ede9a62942ca10 100644 --- a/database/sqlite/sqlite_aclk.h +++ b/database/sqlite/sqlite_aclk.h @@ -133,7 +133,11 @@ enum aclk_database_opcode { ACLK_DATABASE_PUSH_ALERT_CONFIG, ACLK_DATABASE_PUSH_ALERT_SNAPSHOT, ACLK_DATABASE_QUEUE_REMOVED_ALERTS, - ACLK_DATABASE_TIMER + ACLK_DATABASE_TIMER, + + // leave this last + // we need it to check for worker utilization + ACLK_MAX_ENUMERATIONS_DEFINED }; struct aclk_chart_payload_t { diff --git a/health/health.c b/health/health.c index 4359c90f51649a..df3802497eab90 100644 --- a/health/health.c +++ b/health/health.c @@ -573,6 +573,8 @@ static inline int check_if_resumed_from_suspension(void) { } static void health_main_cleanup(void *ptr) { + worker_unregister(); + struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr; static_thread->enabled = NETDATA_MAIN_THREAD_EXITING; @@ -695,7 +697,31 @@ static void init_pending_foreach_alarms(RRDHOST *host) { * * @return It always returns NULL */ + +#define WORKER_HEALTH_JOB_RRD_LOCK 0 +#define WORKER_HEALTH_JOB_HOST_LOCK 1 +#define WORKER_HEALTH_JOB_DB_QUERY 2 +#define WORKER_HEALTH_JOB_CALC_EVAL 3 +#define WORKER_HEALTH_JOB_WARNING_EVAL 4 +#define WORKER_HEALTH_JOB_CRITICAL_EVAL 5 +#define WORKER_HEALTH_JOB_ALARM_LOG_ENTRY 6 +#define WORKER_HEALTH_JOB_ALARM_LOG_PROCESS 7 + +#if WORKER_UTILIZATION_MAX_JOB_TYPES < 8 +#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 8 +#endif + void *health_main(void *ptr) { + worker_register("HEALTH"); + worker_register_job_name(WORKER_HEALTH_JOB_RRD_LOCK, "rrd lock"); + worker_register_job_name(WORKER_HEALTH_JOB_HOST_LOCK, "host lock"); + worker_register_job_name(WORKER_HEALTH_JOB_DB_QUERY, "db lookup"); + worker_register_job_name(WORKER_HEALTH_JOB_CALC_EVAL, "calc eval"); + worker_register_job_name(WORKER_HEALTH_JOB_WARNING_EVAL, "warning eval"); + worker_register_job_name(WORKER_HEALTH_JOB_CRITICAL_EVAL, "critical eval"); + 
worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY, "alarm log entry"); + worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS, "alarm log process"); + netdata_thread_cleanup_push(health_main_cleanup, ptr); int min_run_every = (int)config_get_number(CONFIG_SECTION_HEALTH, "run at least every seconds", 10); @@ -743,6 +769,7 @@ void *health_main(void *ptr) { marked_aclk_reload_loop = loop; #endif + worker_is_busy(WORKER_HEALTH_JOB_RRD_LOCK); rrd_rdlock(); RRDHOST *host; @@ -772,6 +799,7 @@ void *health_main(void *ptr) { init_pending_foreach_alarms(host); + worker_is_busy(WORKER_HEALTH_JOB_HOST_LOCK); rrdhost_rdlock(host); // the first loop is to lookup values from the db @@ -786,6 +814,7 @@ void *health_main(void *ptr) { rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE) && now > (rc->rrdset->last_collected_time.tv_sec + 60))) { if (!rrdcalc_isrepeating(rc)) { + worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY); time_t now = now_realtime_sec(); ALARM_ENTRY *ae = health_create_alarm_entry( host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, @@ -820,6 +849,8 @@ void *health_main(void *ptr) { // if there is database lookup, do it if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) { + worker_is_busy(WORKER_HEALTH_JOB_DB_QUERY); + /* time_t old_db_timestamp = rc->db_before; */ int value_is_null = 0; @@ -876,6 +907,8 @@ void *health_main(void *ptr) { // if there is calculation expression, run it if (unlikely(rc->calculation)) { + worker_is_busy(WORKER_HEALTH_JOB_CALC_EVAL); + if (unlikely(!expression_evaluate(rc->calculation))) { // calculation failed rc->value = NAN; @@ -924,6 +957,8 @@ void *health_main(void *ptr) { // check the warning expression if (likely(rc->warning)) { + worker_is_busy(WORKER_HEALTH_JOB_WARNING_EVAL); + if (unlikely(!expression_evaluate(rc->warning))) { // calculation failed rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR; @@ -948,6 +983,8 @@ void *health_main(void *ptr) { // check the critical expression if 
(likely(rc->critical)) { + worker_is_busy(WORKER_HEALTH_JOB_CRITICAL_EVAL); + if (unlikely(!expression_evaluate(rc->critical))) { // calculation failed rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR; @@ -1005,6 +1042,7 @@ void *health_main(void *ptr) { // check if the new status and the old differ if (status != rc->status) { + worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY); int delay = 0; // apply trigger hysteresis @@ -1086,6 +1124,7 @@ void *health_main(void *ptr) { } if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) { + worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY); rc->last_repeat = now; if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++; ALARM_ENTRY *ae = health_create_alarm_entry( @@ -1118,6 +1157,7 @@ void *health_main(void *ptr) { // execute notifications // and cleanup + worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS); health_alarm_log_process(host); if (unlikely(netdata_exit)) { @@ -1156,6 +1196,7 @@ void *health_main(void *ptr) { now = now_realtime_sec(); if(now < next_run) { + worker_is_idle(); debug(D_HEALTH, "Health monitoring iteration no %u done. 
Next iteration in %d secs", loop, (int) (next_run - now)); sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now)); now = now_realtime_sec(); diff --git a/libnetdata/Makefile.am b/libnetdata/Makefile.am index ea2460149821f2..167d05caa15eee 100644 --- a/libnetdata/Makefile.am +++ b/libnetdata/Makefile.am @@ -26,6 +26,7 @@ SUBDIRS = \ storage_number \ threads \ url \ + worker_utilization \ tests \ $(NULL) diff --git a/libnetdata/clocks/clocks.c b/libnetdata/clocks/clocks.c index 5dfd93753a8297..85f4eff4187a2b 100644 --- a/libnetdata/clocks/clocks.c +++ b/libnetdata/clocks/clocks.c @@ -7,6 +7,9 @@ static clockid_t clock_boottime_to_use = CLOCK_MONOTONIC; static clockid_t clock_monotonic_to_use = CLOCK_MONOTONIC; +usec_t clock_monotonic_resolution = 1000; +usec_t clock_realtime_resolution = 1000; + #ifndef HAVE_CLOCK_GETTIME inline int clock_gettime(clockid_t clk_id, struct timespec *ts) { struct timeval tv; @@ -20,15 +23,19 @@ inline int clock_gettime(clockid_t clk_id, struct timespec *ts) { } #endif -// When running a binary with CLOCK_MONOTONIC_COARSE defined on a system with a linux kernel older than Linux 2.6.32 the -// clock_gettime(2) system call fails with EINVAL. In that case it must fall-back to CLOCK_MONOTONIC. +// Similar to CLOCK_MONOTONIC, but provides access to a raw hardware-based time that is not subject to NTP adjustments +// or the incremental adjustments performed by adjtime(3). 
This clock does not count time that the system is suspended -static void test_clock_monotonic_coarse(void) { +static void test_clock_monotonic_raw(void) { +#ifdef CLOCK_MONOTONIC_RAW struct timespec ts; - if(clock_gettime(CLOCK_MONOTONIC_COARSE, &ts) == -1 && errno == EINVAL) + if(clock_gettime(CLOCK_MONOTONIC_RAW, &ts) == -1 && errno == EINVAL) clock_monotonic_to_use = CLOCK_MONOTONIC; else - clock_monotonic_to_use = CLOCK_MONOTONIC_COARSE; + clock_monotonic_to_use = CLOCK_MONOTONIC_RAW; +#else + clock_monotonic_to_use = CLOCK_MONOTONIC; +#endif } // When running a binary with CLOCK_BOOTTIME defined on a system with a linux kernel older than Linux 2.6.39 the @@ -42,14 +49,31 @@ static void test_clock_boottime(void) { clock_boottime_to_use = CLOCK_BOOTTIME; } +static usec_t get_clock_resolution(clockid_t clock) { /* resolution of the given clock, in microseconds */ + struct timespec ts = { 0, 0 }; /* stays zero if clock_getres() fails, so we return 0 and clocks_init() applies its default */ + clock_getres(clock, &ts); + return ts.tv_sec * USEC_PER_SEC + ts.tv_nsec / NSEC_PER_USEC; /* tv_nsec is in nanoseconds: divide (not multiply) to get microseconds */ +} + // perform any initializations required for clocks void clocks_init(void) { - // monotonic coarse has to be tested before boottime - test_clock_monotonic_coarse(); + // monotonic raw has to be tested before boottime + test_clock_monotonic_raw(); // boottime has to be tested after monotonic coarse test_clock_boottime(); + + clock_monotonic_resolution = get_clock_resolution(clock_monotonic_to_use); + clock_realtime_resolution = get_clock_resolution(CLOCK_REALTIME); + + // if for any reason these are zero, netdata will crash + // since we use them as modulo to calculations + if(!clock_realtime_resolution) + clock_realtime_resolution = 1000; + + if(!clock_monotonic_resolution) + clock_monotonic_resolution = 1000; } inline time_t now_sec(clockid_t clk_id) { @@ -155,8 +179,110 @@ inline usec_t dt_usec(struct timeval *now, struct timeval *old) { return (ts1 > ts2) ?
(ts1 - ts2) : (ts2 - ts1); } +void sleep_to_absolute_time(usec_t usec) { + static int einval_printed = 0, enotsup_printed = 0, eunknown_printed = 0; + clockid_t clock = CLOCK_REALTIME; + + struct timespec req = { + .tv_sec = (time_t)(usec / USEC_PER_SEC), + .tv_nsec = (suseconds_t)((usec % USEC_PER_SEC) * NSEC_PER_USEC) + }; + + int ret = 0; + while( (ret = clock_nanosleep(clock, TIMER_ABSTIME, &req, NULL)) != 0 ) { + if(ret == EINTR) continue; + else { + if (ret == EINVAL) { + if (!einval_printed) { + einval_printed++; + error( + "Invalid time given to clock_nanosleep(): clockid = %d, tv_sec = %ld, tv_nsec = %ld", + clock, + req.tv_sec, + req.tv_nsec); + } + } else if (ret == ENOTSUP) { + if (!enotsup_printed) { + enotsup_printed++; + error( + "Invalid clock id given to clock_nanosleep(): clockid = %d, tv_sec = %ld, tv_nsec = %ld", + clock, + req.tv_sec, + req.tv_nsec); + } + } else { + if (!eunknown_printed) { + eunknown_printed++; + error( + "Unknown return value %d from clock_nanosleep(): clockid = %d, tv_sec = %ld, tv_nsec = %ld", + ret, + clock, + req.tv_sec, + req.tv_nsec); + } + } + sleep_usec(usec); + } + } +}; + +#define HEARTBEAT_ALIGNMENT_STATISTICS_SIZE 10 +netdata_mutex_t heartbeat_alignment_mutex = NETDATA_MUTEX_INITIALIZER; +static size_t heartbeat_alignment_id = 0; + +struct heartbeat_thread_statistics { + size_t sequence; + usec_t dt; +}; +static struct heartbeat_thread_statistics heartbeat_alignment_values[HEARTBEAT_ALIGNMENT_STATISTICS_SIZE] = { 0 }; + +void heartbeat_statistics(usec_t *min_ptr, usec_t *max_ptr, usec_t *average_ptr, size_t *count_ptr) { + struct heartbeat_thread_statistics current[HEARTBEAT_ALIGNMENT_STATISTICS_SIZE]; + static struct heartbeat_thread_statistics old[HEARTBEAT_ALIGNMENT_STATISTICS_SIZE] = { 0 }; + + memcpy(current, heartbeat_alignment_values, sizeof(struct heartbeat_thread_statistics) * HEARTBEAT_ALIGNMENT_STATISTICS_SIZE); + + usec_t min = 0, max = 0, total = 0, average = 0; + size_t i, count = 0; + for(i = 0; i 
< HEARTBEAT_ALIGNMENT_STATISTICS_SIZE ;i++) { + if(current[i].sequence == old[i].sequence) continue; + usec_t value = current[i].dt - old[i].dt; + + if(!count) { + min = max = total = value; + count = 1; + } + else { + total += value; + if(value < min) min = value; + if(value > max) max = value; + count++; + } + } + average = count ? total / count : 0; /* count is 0 when no heartbeat advanced since the previous call - avoid dividing by zero */ + + if(min_ptr) *min_ptr = min; + if(max_ptr) *max_ptr = max; + if(average_ptr) *average_ptr = average; + if(count_ptr) *count_ptr = count; + + memcpy(old, current, sizeof(struct heartbeat_thread_statistics) * HEARTBEAT_ALIGNMENT_STATISTICS_SIZE); +} + inline void heartbeat_init(heartbeat_t *hb) { - hb->monotonic = hb->realtime = 0ULL; + hb->realtime = 0ULL; + hb->randomness = 250 * USEC_PER_MS + ((now_realtime_usec() * clock_realtime_resolution) % (250 * USEC_PER_MS)); + hb->randomness -= (hb->randomness % clock_realtime_resolution); + + netdata_mutex_lock(&heartbeat_alignment_mutex); + hb->statistics_id = heartbeat_alignment_id; + heartbeat_alignment_id++; + netdata_mutex_unlock(&heartbeat_alignment_mutex); + + if(hb->statistics_id < HEARTBEAT_ALIGNMENT_STATISTICS_SIZE) { + heartbeat_alignment_values[hb->statistics_id].dt = 0; + heartbeat_alignment_values[hb->statistics_id].sequence = 0; + } } // waits for the next heartbeat @@ -164,96 +290,73 @@ inline void heartbeat_init(heartbeat_t *hb) { // it returns the dt using the realtime clock usec_t heartbeat_next(heartbeat_t *hb, usec_t tick) { - heartbeat_t now; - now.monotonic = now_monotonic_usec(); - now.realtime = now_realtime_usec(); - - usec_t next_monotonic = now.monotonic - (now.monotonic % tick) + tick; - - while(now.monotonic < next_monotonic) { - sleep_usec(next_monotonic - now.monotonic); - now.monotonic = now_monotonic_usec(); - now.realtime = now_realtime_usec(); + if(unlikely(hb->randomness > tick / 2)) { + // TODO: The heartbeat tick should be specified at the heartbeat_init() function + usec_t tmp = (now_realtime_usec() * clock_realtime_resolution) % (tick / 2); +
info("heartbeat randomness of %llu is too big for a tick of %llu - setting it to %llu", hb->randomness, tick, tmp); + hb->randomness = tmp; } - if(likely(hb->realtime != 0ULL)) { - usec_t dt_monotonic = now.monotonic - hb->monotonic; - usec_t dt_realtime = now.realtime - hb->realtime; + usec_t dt; + usec_t now = now_realtime_usec(); + usec_t next = now - (now % tick) + tick + hb->randomness; - hb->monotonic = now.monotonic; - hb->realtime = now.realtime; + // align the next time we want to the clock resolution + if(next % clock_realtime_resolution) + next = next - (next % clock_realtime_resolution) + clock_realtime_resolution; - if(unlikely(dt_monotonic >= tick + tick / 2)) { - errno = 0; - error("heartbeat missed %llu monotonic microseconds", dt_monotonic - tick); - } + // sleep_usec() has a loop to guarantee we will sleep for at least the requested time. + // According the specs, when we sleep for a relative time, clock adjustments should not affect the duration + // we sleep. + sleep_usec(next - now); + now = now_realtime_usec(); + dt = now - hb->realtime; - return dt_realtime; + if(hb->statistics_id < HEARTBEAT_ALIGNMENT_STATISTICS_SIZE) { + heartbeat_alignment_values[hb->statistics_id].dt += now - next; + heartbeat_alignment_values[hb->statistics_id].sequence++; } - else { - hb->monotonic = now.monotonic; - hb->realtime = now.realtime; - return 0ULL; + + if(unlikely(now < next)) { + errno = 0; + error("heartbeat clock: woke up %llu microseconds earlier than expected (can be due to the CLOCK_REALTIME set to the past).", next - now); + } + else if(unlikely(now - next > tick / 2)) { + errno = 0; + error("heartbeat clock: woke up %llu microseconds later than expected (can be due to system load or the CLOCK_REALTIME set to the future).", now - next); } -} -// returned the elapsed time, since the last heartbeat -// using the monotonic clock + if(unlikely(!hb->realtime)) { + // the first time return zero + dt = 0; + } -inline usec_t 
heartbeat_monotonic_dt_to_now_usec(heartbeat_t *hb) { - if(!hb || !hb->monotonic) return 0ULL; - return now_monotonic_usec() - hb->monotonic; + hb->realtime = now; + return dt; } -int sleep_usec(usec_t usec) { - -#ifndef NETDATA_WITH_USLEEP +void sleep_usec(usec_t usec) { // we expect microseconds (1.000.000 per second) // but timespec is nanoseconds (1.000.000.000 per second) struct timespec rem, req = { - .tv_sec = (time_t) (usec / 1000000), - .tv_nsec = (suseconds_t) ((usec % 1000000) * 1000) + .tv_sec = (time_t) (usec / USEC_PER_SEC), + .tv_nsec = (suseconds_t) ((usec % USEC_PER_SEC) * NSEC_PER_USEC) }; - while (nanosleep(&req, &rem) == -1) { + while ((errno = clock_nanosleep(CLOCK_REALTIME, 0, &req, &rem)) != 0) { if (likely(errno == EINTR)) { - debug(D_SYSTEM, "nanosleep() interrupted (while sleeping for %llu microseconds).", usec); req.tv_sec = rem.tv_sec; req.tv_nsec = rem.tv_nsec; } else { - error("Cannot nanosleep() for %llu microseconds.", usec); + error("Cannot clock_nanosleep(CLOCK_REALTIME) for %llu microseconds.", usec); break; } } - - return 0; -#else - int ret = usleep(usec); - if(unlikely(ret == -1 && errno == EINVAL)) { - // on certain systems, usec has to be up to 999999 - if(usec > 999999) { - int counter = usec / 999999; - while(counter--) - usleep(999999); - - usleep(usec % 999999); - } - else { - error("Cannot usleep() for %llu microseconds.", usec); - return ret; - } - } - - if(ret != 0) - error("usleep() failed for %llu microseconds.", usec); - - return ret; -#endif } static inline collected_number uptime_from_boottime(void) { #ifdef CLOCK_BOOTTIME_IS_AVAILABLE - return now_boottime_usec() / 1000; + return (collected_number)(now_boottime_usec() / USEC_PER_MS); #else error("uptime cannot be read from CLOCK_BOOTTIME on this system."); return 0; diff --git a/libnetdata/clocks/clocks.h b/libnetdata/clocks/clocks.h index 3c9ee28bad92ce..53c036ece9e259 100644 --- a/libnetdata/clocks/clocks.h +++ b/libnetdata/clocks/clocks.h @@ -22,8 +22,9 @@ 
typedef unsigned long long usec_t; typedef long long susec_t; typedef struct heartbeat { - usec_t monotonic; usec_t realtime; + usec_t randomness; + size_t statistics_id; } heartbeat_t; /* Linux value is as good as any other */ @@ -36,20 +37,14 @@ typedef struct heartbeat { #define CLOCK_MONOTONIC CLOCK_REALTIME #endif -/* Prefer CLOCK_MONOTONIC_COARSE where available to reduce overhead. It has the same semantics as CLOCK_MONOTONIC */ -#ifndef CLOCK_MONOTONIC_COARSE -/* fallback to CLOCK_MONOTONIC if not available */ -#define CLOCK_MONOTONIC_COARSE CLOCK_MONOTONIC -#endif - #ifndef CLOCK_BOOTTIME #ifdef CLOCK_UPTIME /* CLOCK_BOOTTIME falls back to CLOCK_UPTIME on FreeBSD */ #define CLOCK_BOOTTIME CLOCK_UPTIME #else // CLOCK_UPTIME -/* CLOCK_BOOTTIME falls back to CLOCK_MONOTONIC */ -#define CLOCK_BOOTTIME CLOCK_MONOTONIC_COARSE +/* CLOCK_BOOTTIME falls back to CLOCK_REALTIME */ +#define CLOCK_BOOTTIME CLOCK_REALTIME #endif // CLOCK_UPTIME #else // CLOCK_BOOTTIME @@ -115,8 +110,6 @@ extern int clock_gettime(clockid_t clk_id, struct timespec *ts); * All now_*_sec() functions return the time in seconds from the appropriate clock, or 0 on error. * All now_*_usec() functions return the time in microseconds from the appropriate clock, or 0 on error. * - * Most functions will attempt to use CLOCK_MONOTONIC_COARSE if available to reduce contention overhead and improve - * performance scaling. If high precision is required please use one of the available now_*_high_precision_* functions. 
*/ extern int now_realtime_timeval(struct timeval *tv); extern time_t now_realtime_sec(void); @@ -146,10 +139,9 @@ extern void heartbeat_init(heartbeat_t *hb); */ extern usec_t heartbeat_next(heartbeat_t *hb, usec_t tick); -/* Returns elapsed time in microseconds since last heartbeat */ -extern usec_t heartbeat_monotonic_dt_to_now_usec(heartbeat_t *hb); +extern void heartbeat_statistics(usec_t *min_ptr, usec_t *max_ptr, usec_t *average_ptr, size_t *count_ptr); -extern int sleep_usec(usec_t usec); +extern void sleep_usec(usec_t usec); extern void clocks_init(void); @@ -160,4 +152,9 @@ extern int now_timeval(clockid_t clk_id, struct timeval *tv); extern collected_number uptime_msec(char *filename); +extern usec_t clock_monotonic_resolution; +extern usec_t clock_realtime_resolution; + +extern void sleep_to_absolute_time(usec_t usec); + #endif /* NETDATA_CLOCKS_H */ diff --git a/libnetdata/libnetdata.h b/libnetdata/libnetdata.h index d197f3f7c6719b..34062f2a658674 100644 --- a/libnetdata/libnetdata.h +++ b/libnetdata/libnetdata.h @@ -346,6 +346,7 @@ extern char *netdata_configured_host_prefix; #include "health/health.h" #include "string/utf8.h" #include "onewayalloc/onewayalloc.h" +#include "worker_utilization/worker_utilization.h" // BEWARE: Outside of the C code this also exists in alarm-notify.sh #define DEFAULT_CLOUD_BASE_URL "https://app.netdata.cloud" diff --git a/libnetdata/worker_utilization/Makefile.am b/libnetdata/worker_utilization/Makefile.am new file mode 100644 index 00000000000000..161784b8f64b5b --- /dev/null +++ b/libnetdata/worker_utilization/Makefile.am @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-3.0-or-later + +AUTOMAKE_OPTIONS = subdir-objects +MAINTAINERCLEANFILES = $(srcdir)/Makefile.in + +dist_noinst_DATA = \ + README.md \ + $(NULL) diff --git a/libnetdata/worker_utilization/README.md b/libnetdata/worker_utilization/README.md new file mode 100644 index 00000000000000..85e532ed10d056 --- /dev/null +++ b/libnetdata/worker_utilization/README.md 
@@ -0,0 +1,58 @@ + + +# Worker Utilization + +This library is to be used when there are 1 or more worker threads accepting requests of some kind and servicing them. +The goal is to provide a very simple way to monitor worker thread utilization, as a percentage of the time they are busy and the number of requests served. + +## How to use + +When a working thread starts, call: + +```c +void worker_register(const char *name); +``` + +This will create the necessary structures for the library to work. +No need to keep a pointer to them. They are allocated as `__thread` variables. + +When the thread stops, call: + +```c +void worker_unregister(void); +``` + +Again, no parameters or return values. + +When you are about to do some work in the working thread, call the following, passing the id of the job type that is about to run: + +```c +void worker_is_busy(size_t job_id); +``` + +When you finish doing the job, call: + +```c +void worker_is_idle(void); +``` + +Calls to `worker_is_busy()` can be made one after another (without calling +`worker_is_idle()` between them) to switch jobs without losing any time between +them and eliminating one of the 2 clock calls involved. + +## Implementation details + +Marking a worker busy or idle is lockless and extremely fast, so it should not introduce any kind of problems to the workers. +Every time `worker_is_busy()` or `worker_is_idle()` is called, a call to `now_realtime_usec()` +is done and a couple of variables are updated. That's it! + +The worker does not need to update the variables regularly. Based on the last status of the worker, +the statistics collector of netdata will calculate if the thread is busy or idle all the time or +part of the time. Works well for both thousands of jobs per second and unlimited working time +(being totally busy with a single request for ages). + +The statistics collector is called by the global statistics thread of netdata. So, even if the workers +are extremely busy with their jobs, netdata will be able to know how busy they are. 
diff --git a/libnetdata/worker_utilization/worker_utilization.c b/libnetdata/worker_utilization/worker_utilization.c new file mode 100644 index 00000000000000..459df2f26541a1 --- /dev/null +++ b/libnetdata/worker_utilization/worker_utilization.c @@ -0,0 +1,201 @@ +#include "worker_utilization.h" + +#define WORKER_IDLE 'I' +#define WORKER_BUSY 'B' + +struct worker_job_type { + char name[WORKER_UTILIZATION_MAX_JOB_NAME_LENGTH + 1]; + size_t worker_jobs_started; + usec_t worker_busy_time; + + size_t statistics_jobs_started; + usec_t statistics_busy_time; +}; + +struct worker { + pid_t pid; + const char *tag; + const char *workname; + uint32_t workname_hash; + + // only one variable is set by our statistics callers + usec_t statistics_last_checkpoint; + size_t statistics_last_jobs_started; + usec_t statistics_last_busy_time; + + // the worker controlled variables + size_t job_id; + volatile size_t jobs_started; + volatile usec_t busy_time; + volatile usec_t last_action_timestamp; + volatile char last_action; + + struct worker_job_type per_job_type[WORKER_UTILIZATION_MAX_JOB_TYPES]; + + struct worker *next; +}; + +static netdata_mutex_t base_lock = NETDATA_MUTEX_INITIALIZER; +static struct worker *base = NULL; +static __thread struct worker *worker = NULL; + +void worker_register(const char *workname) { + if(unlikely(worker)) return; + + worker = callocz(1, sizeof(struct worker)); + worker->pid = gettid(); + worker->tag = strdupz(netdata_thread_tag()); + worker->workname = strdupz(workname); + worker->workname_hash = simple_hash(worker->workname); + + usec_t now = now_realtime_usec(); + worker->statistics_last_checkpoint = now; + worker->last_action_timestamp = now; + worker->last_action = WORKER_IDLE; + + netdata_mutex_lock(&base_lock); + worker->next = base; + base = worker; + netdata_mutex_unlock(&base_lock); +} + +void worker_register_job_name(size_t job_id, const char *name) { + if(unlikely(!worker)) return; + + if(unlikely(job_id >= 
WORKER_UTILIZATION_MAX_JOB_TYPES)) { + error("WORKER_UTILIZATION: job_id %zu is too big. Max is %zu", job_id, (size_t)(WORKER_UTILIZATION_MAX_JOB_TYPES - 1)); + return; + } + + strncpy(worker->per_job_type[job_id].name, name, WORKER_UTILIZATION_MAX_JOB_NAME_LENGTH); +} + +void worker_unregister(void) { + if(unlikely(!worker)) return; + + netdata_mutex_lock(&base_lock); + if(base == worker) + base = worker->next; + else { + struct worker *p; + for(p = base; p && p->next && p->next != worker ;p = p->next); + if(p && p->next == worker) + p->next = worker->next; + } + netdata_mutex_unlock(&base_lock); + + freez((void *)worker->tag); + freez((void *)worker->workname); + freez(worker); + + worker = NULL; +} + +static inline void worker_is_idle_with_time(usec_t now) { + usec_t delta = now - worker->last_action_timestamp; + worker->busy_time += delta; + worker->per_job_type[worker->job_id].worker_busy_time += delta; + + // the worker was busy + // set it to idle before we set the timestamp + + worker->last_action = WORKER_IDLE; + if(likely(worker->last_action_timestamp < now)) + worker->last_action_timestamp = now; +} + +void worker_is_idle(void) { + if(unlikely(!worker)) return; + if(unlikely(worker->last_action != WORKER_BUSY)) return; + + worker_is_idle_with_time(now_realtime_usec()); +} + +void worker_is_busy(size_t job_id) { + if(unlikely(!worker)) return; + if(unlikely(job_id >= WORKER_UTILIZATION_MAX_JOB_TYPES)) + job_id = 0; + + usec_t now = now_realtime_usec(); + + if(worker->last_action == WORKER_BUSY) + worker_is_idle_with_time(now); + + // the worker was idle + // set the timestamp and then set it to busy + + worker->job_id = job_id; + worker->per_job_type[job_id].worker_jobs_started++; + worker->jobs_started++; + worker->last_action_timestamp = now; + worker->last_action = WORKER_BUSY; +} + + +// statistics interface + +void workers_foreach(const char *workname, void (*callback)(void *data, pid_t pid, const char *thread_tag, size_t utilization_usec, size_t 
duration_usec, size_t jobs_started, size_t is_running, const char **job_types_names, size_t *job_types_jobs_started, usec_t *job_types_busy_time), void *data) { + netdata_mutex_lock(&base_lock); + uint32_t hash = simple_hash(workname); + usec_t busy_time, delta; + size_t i, jobs_started, jobs_running; + + struct worker *p; + for(p = base; p ; p = p->next) { + if(hash != p->workname_hash || strcmp(workname, p->workname)) continue; + + usec_t now = now_realtime_usec(); + + // find per job type statistics + const char *per_job_type_name[WORKER_UTILIZATION_MAX_JOB_TYPES]; + size_t per_job_type_jobs_started[WORKER_UTILIZATION_MAX_JOB_TYPES]; + usec_t per_job_type_busy_time[WORKER_UTILIZATION_MAX_JOB_TYPES]; + for(i = 0; i < WORKER_UTILIZATION_MAX_JOB_TYPES ;i++) { + per_job_type_name[i] = p->per_job_type[i].name; + + size_t tmp_jobs_started = p->per_job_type[i].worker_jobs_started; + per_job_type_jobs_started[i] = tmp_jobs_started - p->per_job_type[i].statistics_jobs_started; + p->per_job_type[i].statistics_jobs_started = tmp_jobs_started; + + usec_t tmp_busy_time = p->per_job_type[i].worker_busy_time; + per_job_type_busy_time[i] = tmp_busy_time - p->per_job_type[i].statistics_busy_time; + p->per_job_type[i].statistics_busy_time = tmp_busy_time; + } + + // get a copy of the worker variables + usec_t worker_busy_time = p->busy_time; + size_t worker_jobs_started = p->jobs_started; + char worker_last_action = p->last_action; + usec_t worker_last_action_timestamp = p->last_action_timestamp; + + // this is the only variable both the worker thread and the statistics thread are writing + // we set this only when the worker is busy, so that worker will not + // accumulate all the busy time, but only the time after the point we collected statistics + if(worker_last_action == WORKER_BUSY && p->last_action_timestamp == worker_last_action_timestamp && p->last_action == WORKER_BUSY) + p->last_action_timestamp = now; + + // calculate delta busy time + busy_time = worker_busy_time - 
p->statistics_last_busy_time; + p->statistics_last_busy_time = worker_busy_time; + + // calculate delta jobs done + jobs_started = worker_jobs_started - p->statistics_last_jobs_started; + p->statistics_last_jobs_started = worker_jobs_started; + + jobs_running = 0; + if(worker_last_action == WORKER_BUSY) { + // the worker is still busy with something + // let's add that busy time to the reported one + busy_time += now - worker_last_action_timestamp; + jobs_running = 1; + } + + delta = now - p->statistics_last_checkpoint; + + p->statistics_last_checkpoint = now; + + callback(data, p->pid, p->tag, busy_time, delta, jobs_started, jobs_running, per_job_type_name, per_job_type_jobs_started, per_job_type_busy_time); + } + + netdata_mutex_unlock(&base_lock); +} diff --git a/libnetdata/worker_utilization/worker_utilization.h b/libnetdata/worker_utilization/worker_utilization.h new file mode 100644 index 00000000000000..8f16fe0549db87 --- /dev/null +++ b/libnetdata/worker_utilization/worker_utilization.h @@ -0,0 +1,22 @@ +#ifndef WORKER_UTILIZATION_H +#define WORKER_UTILIZATION_H 1 + +#include "../libnetdata.h" + +// workers interfaces + +#define WORKER_UTILIZATION_MAX_JOB_TYPES 50 +#define WORKER_UTILIZATION_MAX_JOB_NAME_LENGTH 25 + +extern void worker_register(const char *workname); +extern void worker_register_job_name(size_t job_id, const char *name); +extern void worker_unregister(void); + +extern void worker_is_idle(void); +extern void worker_is_busy(size_t job_id); + +// statistics interface + +extern void workers_foreach(const char *workname, void (*callback)(void *data, pid_t pid, const char *thread_tag, size_t utilization_usec, size_t duration_usec, size_t jobs_started, size_t is_running, const char **job_types_names, size_t *job_types_jobs_started, usec_t *job_types_busy_time), void *data); + +#endif // WORKER_UTILIZATION_H diff --git a/ml/Host.cc b/ml/Host.cc index 3166720cc81f94..4f64bf694ebe95 100644 --- a/ml/Host.cc +++ b/ml/Host.cc @@ -358,6 +358,10 @@ void 
TrainableHost::trainDimension(Dimension *D, const TimePoint &NowTP) { void TrainableHost::train() { Duration MaxSleepFor = Seconds{10 * updateEvery()}; + worker_register("MLTRAIN"); + worker_register_job_name(0, "dimensions"); + + worker_is_busy(0); while (!netdata_exit) { netdata_thread_testcancel(); netdata_thread_disable_cancelability(); @@ -378,11 +382,23 @@ void TrainableHost::train() { if (RealDuration >= AllottedDuration) continue; + worker_is_idle(); SleepFor = std::min(AllottedDuration - RealDuration, MaxSleepFor); std::this_thread::sleep_for(SleepFor); + worker_is_busy(0); } } +#define WORKER_JOB_DETECT_DIMENSION 0 +#define WORKER_JOB_UPDATE_DETECTION_CHART 1 +#define WORKER_JOB_UPDATE_ANOMALY_RATES 2 +#define WORKER_JOB_UPDATE_CHARTS 3 +#define WORKER_JOB_SAVE_ANOMALY_EVENT 4 + +#if WORKER_UTILIZATION_MAX_JOB_TYPES < 5 +#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 5 +#endif + void DetectableHost::detectOnce() { auto P = BRW.insert(WindowAnomalyRate >= Cfg.HostAnomalyRateThreshold); BitRateWindow::Edge Edge = P.first; @@ -408,6 +424,8 @@ void DetectableHost::detectOnce() { DimsOverThreshold.reserve(DimensionsMap.size()); for (auto &DP : DimensionsMap) { + worker_is_busy(WORKER_JOB_DETECT_DIMENSION); + Dimension *D = DP.second; auto P = D->detect(WindowLength, ResetBitCounter); @@ -434,6 +452,7 @@ void DetectableHost::detectOnce() { } if (CollectAnomalyRates) { + worker_is_busy(WORKER_JOB_UPDATE_ANOMALY_RATES); AnomalyRateTimer = 0; rrdset_done(AnomalyRateRS); } @@ -442,6 +461,7 @@ void DetectableHost::detectOnce() { this->NumNormalDimensions = NumNormalDimensions; this->NumTrainedDimensions = NumTrainedDimensions; + worker_is_busy(WORKER_JOB_UPDATE_CHARTS); updateDimensionsChart(getRH(), NumTrainedDimensions, NumNormalDimensions, NumAnomalousDimensions); updateRateChart(getRH(), WindowAnomalyRate * 10000.0); updateWindowLengthChart(getRH(), WindowLength); @@ -454,6 +474,8 @@ void DetectableHost::detectOnce() { if (!NewAnomalyEvent || 
(DimsOverThreshold.size() == 0)) return; + worker_is_busy(WORKER_JOB_SAVE_ANOMALY_EVENT); + std::sort(DimsOverThreshold.begin(), DimsOverThreshold.end()); std::reverse(DimsOverThreshold.begin(), DimsOverThreshold.end()); @@ -476,6 +498,13 @@ void DetectableHost::detectOnce() { } void DetectableHost::detect() { + worker_register("MLDETECT"); + worker_register_job_name(WORKER_JOB_DETECT_DIMENSION, "dimensions"); + worker_register_job_name(WORKER_JOB_UPDATE_DETECTION_CHART, "detection chart"); + worker_register_job_name(WORKER_JOB_UPDATE_ANOMALY_RATES, "anomaly rates"); + worker_register_job_name(WORKER_JOB_UPDATE_CHARTS, "charts"); + worker_register_job_name(WORKER_JOB_SAVE_ANOMALY_EVENT, "anomaly event"); + std::this_thread::sleep_for(Seconds{10}); heartbeat_t HB; @@ -483,10 +512,13 @@ void DetectableHost::detect() { while (!netdata_exit) { netdata_thread_testcancel(); + worker_is_idle(); heartbeat_next(&HB, updateEvery() * USEC_PER_SEC); netdata_thread_disable_cancelability(); detectOnce(); + + worker_is_busy(WORKER_JOB_UPDATE_DETECTION_CHART); updateDetectionChart(getRH()); netdata_thread_enable_cancelability(); } diff --git a/parser/parser.c b/parser/parser.c index 5137cb61cffd98..0707f71727513f 100644 --- a/parser/parser.c +++ b/parser/parser.c @@ -133,10 +133,13 @@ int parser_add_keyword(PARSER *parser, char *keyword, keyword_function func) tmp_keyword = callocz(1, sizeof(*tmp_keyword)); + tmp_keyword->worker_job_id = parser->worker_job_ids++; tmp_keyword->keyword = strdupz(keyword); tmp_keyword->keyword_hash = keyword_hash; tmp_keyword->func[tmp_keyword->func_no++] = (void *) func; + worker_register_job_name(tmp_keyword->worker_job_id, tmp_keyword->keyword); + tmp_keyword->next = parser->keyword; parser->keyword = tmp_keyword; return tmp_keyword->func_no; @@ -265,10 +268,12 @@ inline int parser_action(PARSER *parser, char *input) uint32_t command_hash = simple_hash(command); + size_t worker_job_id; while(tmp_keyword) { if (command_hash == 
tmp_keyword->keyword_hash && (!strcmp(command, tmp_keyword->keyword))) { action_function_list = &tmp_keyword->func[0]; + worker_job_id = tmp_keyword->worker_job_id; break; } tmp_keyword = tmp_keyword->next; @@ -284,12 +289,14 @@ inline int parser_action(PARSER *parser, char *input) #endif } else { + worker_is_busy(worker_job_id); while ((action_function = *action_function_list) != NULL) { rc = action_function(words, parser->user, parser->plugins_action); if (unlikely(rc == PARSER_RC_ERROR || rc == PARSER_RC_STOP)) break; action_function_list++; } + worker_is_idle(); } if (likely(input == parser->buffer)) diff --git a/parser/parser.h b/parser/parser.h index 8d11a90074c027..65a4e1ab30a4c8 100644 --- a/parser/parser.h +++ b/parser/parser.h @@ -54,6 +54,7 @@ typedef enum parser_input_type { typedef PARSER_RC (*keyword_function)(char **, void *, PLUGINSD_ACTION *plugins_action); typedef struct parser_keyword { + size_t worker_job_id; char *keyword; uint32_t keyword_hash; int func_no; @@ -67,6 +68,7 @@ typedef struct parser_data { } PARSER_DATA; typedef struct parser { + size_t worker_job_ids; uint8_t version; // Parser version RRDHOST *host; void *input; // Input source e.g. 
stream diff --git a/streaming/receiver.c b/streaming/receiver.c index c777ea54e96908..f77bba6ad8bff3 100644 --- a/streaming/receiver.c +++ b/streaming/receiver.c @@ -30,6 +30,8 @@ void destroy_receiver_state(struct receiver_state *rpt) { } static void rrdpush_receiver_thread_cleanup(void *ptr) { + worker_unregister(); + static __thread int executed = 0; if(!executed) { executed = 1; @@ -716,7 +718,9 @@ void *rrdpush_receiver_thread(void *ptr) { struct receiver_state *rpt = (struct receiver_state *)ptr; info("STREAM %s [%s]:%s: receive thread created (task id %d)", rpt->hostname, rpt->client_ip, rpt->client_port, gettid()); + worker_register("STREAMRCV"); rrdpush_receive(rpt); + worker_unregister(); netdata_thread_cleanup_pop(1); return NULL; diff --git a/streaming/sender.c b/streaming/sender.c index 72259c3ab34112..02d832eb87ef1f 100644 --- a/streaming/sender.c +++ b/streaming/sender.c @@ -2,6 +2,26 @@ #include "rrdpush.h" +#define WORKER_SENDER_JOB_CONNECT 0 +#define WORKER_SENDER_JOB_PIPE_READ 1 +#define WORKER_SENDER_JOB_SOCKET_RECEIVE 2 +#define WORKER_SENDER_JOB_EXECUTE 3 +#define WORKER_SENDER_JOB_SOCKET_SEND 4 +#define WORKER_SENDER_JOB_DISCONNECT_BAD_HANDSHAKE 5 +#define WORKER_SENDER_JOB_DISCONNECT_OVERFLOW 6 +#define WORKER_SENDER_JOB_DISCONNECT_TIMEOUT 7 +#define WORKER_SENDER_JOB_DISCONNECT_POLL_ERROR 8 +#define WORKER_SENDER_JOB_DISCONNECT_SOCKER_ERROR 9 +#define WORKER_SENDER_JOB_DISCONNECT_SSL_ERROR 10 +#define WORKER_SENDER_JOB_DISCONNECT_PARENT_CLOSED 11 +#define WORKER_SENDER_JOB_DISCONNECT_RECEIVE_ERROR 12 +#define WORKER_SENDER_JOB_DISCONNECT_SEND_ERROR 13 +#define WORKER_SENDER_JOB_DISCONNECT_NO_COMPRESSION 14 + +#if WORKER_UTILIZATION_MAX_JOB_TYPES < 15 +#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 15 +#endif + extern struct config stream_config; extern int netdata_use_ssl_on_stream; extern char *netdata_ssl_ca_path; @@ -21,8 +41,8 @@ static inline void rrdpush_sender_thread_close_socket(RRDHOST *host); * Inform the user through 
the error log file and * deactivate compression by downgrading the stream protocol. */ -static inline void deactivate_compression(struct sender_state *s) -{ +static inline void deactivate_compression(struct sender_state *s) { + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_NO_COMPRESSION); error("STREAM_COMPRESSION: Deactivating compression to avoid stream corruption"); default_compression_enabled = 0; s->rrdpush_compression = 0; @@ -389,6 +409,7 @@ if(!s->rrdpush_compression) err = SSL_get_error(host->ssl.conn, err); error("SSL cannot connect with the server: %s ",ERR_error_string((long)SSL_get_error(host->ssl.conn,err),NULL)); if (netdata_use_ssl_on_stream == NETDATA_SSL_FORCE) { + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SSL_ERROR); rrdpush_sender_thread_close_socket(host); return 0; }else { @@ -399,6 +420,7 @@ if(!s->rrdpush_compression) if (netdata_use_ssl_on_stream == NETDATA_SSL_FORCE) { if (netdata_validate_server == NETDATA_SSL_VALID_CERTIFICATE) { if ( security_test_certificate(host->ssl.conn)) { + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SSL_ERROR); error("Closing the stream connection, because the server SSL certificate is not valid."); rrdpush_sender_thread_close_socket(host); return 0; @@ -411,6 +433,7 @@ if(!s->rrdpush_compression) #else if(send_timeout(host->rrdpush_sender_socket, http, strlen(http), 0, timeout) == -1) { #endif + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_TIMEOUT); error("STREAM %s [send to %s]: failed to send HTTP header to remote netdata.", host->hostname, s->connected_to); rrdpush_sender_thread_close_socket(host); return 0; @@ -426,6 +449,7 @@ if(!s->rrdpush_compression) received = recv_timeout(host->rrdpush_sender_socket, http, HTTP_HEADER_SIZE, 0, timeout); if(received == -1) { #endif + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_TIMEOUT); error("STREAM %s [send to %s]: remote netdata does not respond.", host->hostname, s->connected_to); rrdpush_sender_thread_close_socket(host); return 0; @@ -435,6 +459,7 @@ 
if(!s->rrdpush_compression) debug(D_STREAM, "Response to sender from far end: %s", http); int32_t version = (int32_t)parse_stream_version(host, http); if(version == -1) { + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_BAD_HANDSHAKE); error("STREAM %s [send to %s]: server is not replying properly (is it a netdata?).", host->hostname, s->connected_to); rrdpush_sender_thread_close_socket(host); return 0; @@ -541,9 +566,9 @@ void attempt_to_send(struct sender_state *s) { s->last_sent_t = now_monotonic_sec(); } else if (ret == -1 && (errno == EAGAIN || errno == EINTR || errno == EWOULDBLOCK)) - debug(D_STREAM, "STREAM %s [send to %s]: unavailable after polling POLLOUT", s->host->hostname, - s->connected_to); + debug(D_STREAM, "STREAM %s [send to %s]: unavailable after polling POLLOUT", s->host->hostname, s->connected_to); else if (ret == -1) { + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SEND_ERROR); debug(D_STREAM, "STREAM: Send failed - closing socket..."); error("STREAM %s [send to %s]: failed to send metrics - closing connection - we have sent %zu bytes on this connection.", s->host->hostname, s->connected_to, s->sent_bytes_on_this_connection); rrdpush_sender_thread_close_socket(s->host); @@ -570,6 +595,8 @@ int ret; int sslerrno = SSL_get_error(s->host->ssl.conn, desired); if (sslerrno == SSL_ERROR_WANT_READ || sslerrno == SSL_ERROR_WANT_WRITE) return; + + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SSL_ERROR); u_long err; char buf[256]; while ((err = ERR_get_error()) != 0) { @@ -581,20 +608,25 @@ int ret; return; } #endif - ret = recv(s->host->rrdpush_sender_socket, s->read_buffer + s->read_len, sizeof(s->read_buffer) - s->read_len - 1, - MSG_DONTWAIT); + ret = recv(s->host->rrdpush_sender_socket, s->read_buffer + s->read_len, sizeof(s->read_buffer) - s->read_len - 1,MSG_DONTWAIT); if (ret>0) { s->read_len += ret; return; } + debug(D_STREAM, "Socket was POLLIN, but req %zu bytes gave %d", sizeof(s->read_buffer) - s->read_len - 1, ret); + if (ret<0 && (errno == 
EAGAIN || errno == EWOULDBLOCK || errno == EINTR)) return; - if (ret==0) + + if (ret==0) { + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_PARENT_CLOSED); error("STREAM %s [send to %s]: connection closed by far end. Restarting connection", s->host->hostname, s->connected_to); - else - error("STREAM %s [send to %s]: error during read (%d). Restarting connection", s->host->hostname, s->connected_to, - ret); + } + else { + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_RECEIVE_ERROR); + error("STREAM %s [send to %s]: error during receive (%d). Restarting connection", s->host->hostname, s->connected_to, ret); + } rrdpush_sender_thread_close_socket(s->host); } @@ -615,6 +647,8 @@ void execute_commands(struct sender_state *s) { static void rrdpush_sender_thread_cleanup_callback(void *ptr) { + worker_unregister(); + RRDHOST *host = (RRDHOST *)ptr; netdata_mutex_lock(&host->sender->mutex); @@ -707,6 +741,25 @@ void *rrdpush_sender_thread(void *ptr) { fds[Collector].fd = s->host->rrdpush_sender_pipe[PIPE_READ]; fds[Collector].events = POLLIN; + worker_register("STREAMSND"); + worker_register_job_name(WORKER_SENDER_JOB_CONNECT, "connect"); + worker_register_job_name(WORKER_SENDER_JOB_PIPE_READ, "pipe read"); + worker_register_job_name(WORKER_SENDER_JOB_SOCKET_RECEIVE, "receive"); + worker_register_job_name(WORKER_SENDER_JOB_EXECUTE, "execute"); + worker_register_job_name(WORKER_SENDER_JOB_SOCKET_SEND, "send"); + + // disconnection reasons + worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_TIMEOUT, "disconnect timeout"); + worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_POLL_ERROR, "disconnect poll error"); + worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_SOCKER_ERROR, "disconnect socket error"); + worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_OVERFLOW, "disconnect overflow"); + worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_SSL_ERROR, "disconnect ssl error"); + worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_PARENT_CLOSED, "disconnect parent 
closed"); + worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_RECEIVE_ERROR, "disconnect receive error"); + worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_SEND_ERROR, "disconnect send error"); + worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_NO_COMPRESSION, "disconnect no compression"); + worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_BAD_HANDSHAKE, "disconnect bad handshake"); + netdata_thread_cleanup_push(rrdpush_sender_thread_cleanup_callback, s->host); for(; s->host->rrdpush_send_enabled && !netdata_exit ;) { // check for outstanding cancellation requests @@ -714,6 +767,7 @@ void *rrdpush_sender_thread(void *ptr) { // The connection attempt blocks (after which we use the socket in nonblocking) if(unlikely(s->host->rrdpush_sender_socket == -1)) { + worker_is_busy(WORKER_SENDER_JOB_CONNECT); s->overflow = 0; s->read_len = 0; s->buffer->read = 0; @@ -731,11 +785,14 @@ void *rrdpush_sender_thread(void *ptr) { // If the TCP window never opened then something is wrong, restart connection if(unlikely(now_monotonic_sec() - s->last_sent_t > s->timeout)) { + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_TIMEOUT); error("STREAM %s [send to %s]: could not send metrics for %d seconds - closing connection - we have sent %zu bytes on this connection via %zu send attempts.", s->host->hostname, s->connected_to, s->timeout, s->sent_bytes_on_this_connection, s->send_attempts); rrdpush_sender_thread_close_socket(s->host); continue; } + worker_is_idle(); + // Wait until buffer opens in the socket or a rrdset_done_push wakes us fds[Collector].revents = 0; fds[Socket].revents = 0; @@ -757,16 +814,18 @@ void *rrdpush_sender_thread(void *ptr) { int retval = poll(fds, 2, 1000); debug(D_STREAM, "STREAM: poll() finished collector=%d socket=%d (current chunk %zu bytes)...", fds[Collector].revents, fds[Socket].revents, outstanding); + if(unlikely(netdata_exit)) break; // Spurious wake-ups without error - loop again - if (retval == 0 || ((retval == -1) && (errno == 
EAGAIN || errno == EINTR))) - { + if (retval == 0 || ((retval == -1) && (errno == EAGAIN || errno == EINTR))) { debug(D_STREAM, "Spurious wakeup"); continue; } + // Only errors from poll() are internal, but try restarting the connection if(unlikely(retval == -1)) { + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_POLL_ERROR); error("STREAM %s [send to %s]: failed to poll(). Closing socket.", s->host->hostname, s->connected_to); rrdpush_sender_thread_close_socket(s->host); continue; @@ -774,6 +833,7 @@ void *rrdpush_sender_thread(void *ptr) { // If the collector woke us up then empty the pipe to remove the signal if (fds[Collector].revents & POLLIN || fds[Collector].revents & POLLPRI) { + worker_is_busy(WORKER_SENDER_JOB_PIPE_READ); debug(D_STREAM, "STREAM: Data added to send buffer (current buffer chunk %zu bytes)...", outstanding); char buffer[1000 + 1]; @@ -782,13 +842,19 @@ void *rrdpush_sender_thread(void *ptr) { } // Read as much as possible to fill the buffer, split into full lines for execution. - if (fds[Socket].revents & POLLIN) + if (fds[Socket].revents & POLLIN) { + worker_is_busy(WORKER_SENDER_JOB_SOCKET_RECEIVE); attempt_read(s); + } + + worker_is_busy(WORKER_SENDER_JOB_EXECUTE); execute_commands(s); // If we have data and have seen the TCP window open then try to close it by a transmission. - if (outstanding && fds[Socket].revents & POLLOUT) + if (outstanding && fds[Socket].revents & POLLOUT) { + worker_is_busy(WORKER_SENDER_JOB_SOCKET_SEND); attempt_to_send(s); + } // TODO-GAPS - why do we only check this on the socket, not the pipe? 
if (outstanding) { @@ -800,6 +866,7 @@ void *rrdpush_sender_thread(void *ptr) { else if (unlikely(fds[Socket].revents & POLLNVAL)) error = "connection is invalid (POLLNVAL)"; if(unlikely(error)) { + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SOCKER_ERROR); error("STREAM %s [send to %s]: restart stream because %s - %zu bytes transmitted.", s->host->hostname, s->connected_to, error, s->sent_bytes_on_this_connection); rrdpush_sender_thread_close_socket(s->host); @@ -808,6 +875,7 @@ void *rrdpush_sender_thread(void *ptr) { // protection from overflow if (s->overflow) { + worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_OVERFLOW); errno = 0; error("STREAM %s [send to %s]: buffer full (%zu-bytes) after %zu bytes. Restarting connection", s->host->hostname, s->connected_to, s->buffer->size, s->sent_bytes_on_this_connection); diff --git a/web/server/static/static-threaded.c b/web/server/static/static-threaded.c index ff10cb8819ff54..a30ce2ec5ace54 100644 --- a/web/server/static/static-threaded.c +++ b/web/server/static/static-threaded.c @@ -7,6 +7,20 @@ int web_client_timeout = DEFAULT_DISCONNECT_IDLE_WEB_CLIENTS_AFTER_SECONDS; int web_client_first_request_timeout = DEFAULT_TIMEOUT_TO_RECEIVE_FIRST_WEB_REQUEST; long web_client_streaming_rate_t = 0L; +#define WORKER_JOB_ADD_CONNECTION 0 +#define WORKER_JOB_DEL_COLLECTION 1 +#define WORKER_JOB_ADD_FILE 2 +#define WORKER_JOB_DEL_FILE 3 +#define WORKER_JOB_READ_FILE 4 +#define WORKER_JOB_WRITE_FILE 5 +#define WORKER_JOB_RCV_DATA 6 +#define WORKER_JOB_SND_DATA 7 +#define WORKER_JOB_PROCESS 8 + +#if (WORKER_UTILIZATION_MAX_JOB_TYPES < 9) +#error Please increase WORKER_UTILIZATION_MAX_JOB_TYPES to at least 9 +#endif + /* * -------------------------------------------------------------------------------------------------------------------- * Build web_client state from the pollinfo that describes an accepted connection. 
@@ -71,11 +85,15 @@ static inline int web_server_check_client_status(struct web_client *w) { static void *web_server_file_add_callback(POLLINFO *pi, short int *events, void *data) { struct web_client *w = (struct web_client *)data; + worker_is_busy(WORKER_JOB_ADD_FILE); + worker_private->files_read++; debug(D_WEB_CLIENT, "%llu: ADDED FILE READ ON FD %d", w->id, pi->fd); *events = POLLIN; pi->data = w; + + worker_is_idle(); return w; } @@ -83,27 +101,36 @@ static void web_server_file_del_callback(POLLINFO *pi) { struct web_client *w = (struct web_client *)pi->data; debug(D_WEB_CLIENT, "%llu: RELEASE FILE READ ON FD %d", w->id, pi->fd); + worker_is_busy(WORKER_JOB_DEL_FILE); + w->pollinfo_filecopy_slot = 0; if(unlikely(!w->pollinfo_slot)) { debug(D_WEB_CLIENT, "%llu: CROSS WEB CLIENT CLEANUP (iFD %d, oFD %d)", w->id, pi->fd, w->ofd); web_client_release(w); } + + worker_is_idle(); } static int web_server_file_read_callback(POLLINFO *pi, short int *events) { + int retval = -1; struct web_client *w = (struct web_client *)pi->data; + worker_is_busy(WORKER_JOB_READ_FILE); + // if there is no POLLINFO linked to this, it means the client disconnected // stop the file reading too if(unlikely(!w->pollinfo_slot)) { debug(D_WEB_CLIENT, "%llu: PREVENTED ATTEMPT TO READ FILE ON FD %d, ON CLOSED WEB CLIENT", w->id, pi->fd); - return -1; + retval = -1; + goto cleanup; } if(unlikely(w->mode != WEB_CLIENT_MODE_FILECOPY || w->ifd == w->ofd)) { debug(D_WEB_CLIENT, "%llu: PREVENTED ATTEMPT TO READ FILE ON FD %d, ON NON-FILECOPY WEB CLIENT", w->id, pi->fd); - return -1; + retval = -1; + goto cleanup; } debug(D_WEB_CLIENT, "%llu: READING FILE ON FD %d", w->id, pi->fd); @@ -121,18 +148,25 @@ static int web_server_file_read_callback(POLLINFO *pi, short int *events) { if(unlikely(ret <= 0 || w->ifd == w->ofd)) { debug(D_WEB_CLIENT, "%llu: DONE READING FILE ON FD %d", w->id, pi->fd); - return -1; + retval = -1; + goto cleanup; } *events = POLLIN; - return 0; + retval = 0; + +cleanup: + 
worker_is_idle(); + return retval; } static int web_server_file_write_callback(POLLINFO *pi, short int *events) { (void)pi; (void)events; + worker_is_busy(WORKER_JOB_WRITE_FILE); error("Writing to web files is not supported!"); + worker_is_idle(); return -1; } @@ -143,6 +177,7 @@ static int web_server_file_write_callback(POLLINFO *pi, short int *events) { static void *web_server_add_callback(POLLINFO *pi, short int *events, void *data) { (void)data; // Suppress warning on unused argument + worker_is_busy(WORKER_JOB_ADD_CONNECTION); worker_private->connected++; size_t concurrent = worker_private->connected - worker_private->disconnected; @@ -177,7 +212,7 @@ static void *web_server_add_callback(POLLINFO *pi, short int *events, void *data //this means that the mensage was not completely read, so //I cannot identify it yet. sock_setnonblock(w->ifd); - return w; + goto cleanup; } //The next two ifs are not together because I am reusing SSL structure @@ -191,7 +226,7 @@ static void *web_server_add_callback(POLLINFO *pi, short int *events, void *data if (test[0] < 0x18){ WEB_CLIENT_IS_DEAD(w); sock_setnonblock(w->ifd); - return w; + goto cleanup; } } } @@ -217,11 +252,16 @@ static void *web_server_add_callback(POLLINFO *pi, short int *events, void *data #endif debug(D_WEB_CLIENT, "%llu: ADDED CLIENT FD %d", w->id, pi->fd); + +cleanup: + worker_is_idle(); return w; } // TCP client disconnected static void web_server_del_callback(POLLINFO *pi) { + worker_is_busy(WORKER_JOB_DEL_COLLECTION); + worker_private->disconnected++; struct web_client *w = (struct web_client *)pi->data; @@ -240,18 +280,27 @@ static void web_server_del_callback(POLLINFO *pi) { debug(D_WEB_CLIENT, "%llu: CLOSING CLIENT FD %d", w->id, pi->fd); web_client_release(w); } + + worker_is_idle(); } static int web_server_rcv_callback(POLLINFO *pi, short int *events) { + int ret = -1; + worker_is_busy(WORKER_JOB_RCV_DATA); + worker_private->receptions++; struct web_client *w = (struct web_client *)pi->data; int 
fd = pi->fd; - if(unlikely(web_client_receive(w) < 0)) - return -1; + if(unlikely(web_client_receive(w) < 0)) { + ret = -1; + goto cleanup; + } debug(D_WEB_CLIENT, "%llu: processing received data on fd %d.", w->id, fd); + worker_is_idle(); + worker_is_busy(WORKER_JOB_PROCESS); web_client_process_request(w); if(unlikely(w->mode == WEB_CLIENT_MODE_FILECOPY)) { @@ -282,7 +331,8 @@ static int web_server_rcv_callback(POLLINFO *pi, short int *events) { w->pollinfo_filecopy_slot = fpi->slot; else { error("Failed to add filecopy fd. Closing client."); - return -1; + ret = -1; + goto cleanup; } } } @@ -295,10 +345,17 @@ static int web_server_rcv_callback(POLLINFO *pi, short int *events) { if(unlikely(w->ofd == fd && web_client_has_wait_send(w))) *events |= POLLOUT; - return web_server_check_client_status(w); + ret = web_server_check_client_status(w); + +cleanup: + worker_is_idle(); + return ret; } static int web_server_snd_callback(POLLINFO *pi, short int *events) { + int retval = -1; + worker_is_busy(WORKER_JOB_SND_DATA); + worker_private->sends++; struct web_client *w = (struct web_client *)pi->data; @@ -306,8 +363,12 @@ static int web_server_snd_callback(POLLINFO *pi, short int *events) { debug(D_WEB_CLIENT, "%llu: sending data on fd %d.", w->id, fd); - if(unlikely(web_client_send(w) < 0)) - return -1; + int ret = web_client_send(w); + + if(unlikely(ret < 0)) { + retval = -1; + goto cleanup; + } if(unlikely(w->ifd == fd && web_client_has_wait_receive(w))) *events |= POLLIN; @@ -315,50 +376,11 @@ static int web_server_snd_callback(POLLINFO *pi, short int *events) { if(unlikely(w->ofd == fd && web_client_has_wait_send(w))) *events |= POLLOUT; - return web_server_check_client_status(w); -} + retval = web_server_check_client_status(w); -static void web_server_tmr_callback(void *timer_data) { - worker_private = (struct web_server_static_threaded_worker *)timer_data; - - static __thread RRDSET *st = NULL; - static __thread RRDDIM *rd_user = NULL, *rd_system = NULL; - - 
if(unlikely(netdata_exit)) return; - - if(unlikely(!st)) { - char id[100 + 1]; - char title[100 + 1]; - - snprintfz(id, 100, "web_thread%d_cpu", worker_private->id + 1); - snprintfz(title, 100, "Netdata web server thread CPU usage"); - - st = rrdset_create_localhost( - "netdata" - , id - , NULL - , "web" - , "netdata.web_cpu" - , title - , "milliseconds/s" - , "web" - , "stats" - , 132000 + worker_private->id - , default_rrd_update_every - , RRDSET_TYPE_STACKED - ); - - rd_user = rrddim_add(st, "user", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL); - rd_system = rrddim_add(st, "system", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL); - } - else - rrdset_next(st); - - struct rusage rusage; - getrusage(RUSAGE_THREAD, &rusage); - rrddim_set_by_pointer(st, rd_user, rusage.ru_utime.tv_sec * 1000000ULL + rusage.ru_utime.tv_usec); - rrddim_set_by_pointer(st, rd_system, rusage.ru_stime.tv_sec * 1000000ULL + rusage.ru_stime.tv_usec); - rrdset_done(st); +cleanup: + worker_is_idle(); + return retval; } // ---------------------------------------------------------------------------- @@ -379,11 +401,22 @@ static void socket_listen_main_static_threaded_worker_cleanup(void *ptr) { ); worker_private->running = 0; + worker_unregister(); } void *socket_listen_main_static_threaded_worker(void *ptr) { worker_private = (struct web_server_static_threaded_worker *)ptr; worker_private->running = 1; + worker_register("WEB"); + worker_register_job_name(WORKER_JOB_ADD_CONNECTION, "connect"); + worker_register_job_name(WORKER_JOB_DEL_COLLECTION, "disconnect"); + worker_register_job_name(WORKER_JOB_ADD_FILE, "file start"); + worker_register_job_name(WORKER_JOB_DEL_FILE, "file end"); + worker_register_job_name(WORKER_JOB_READ_FILE, "file read"); + worker_register_job_name(WORKER_JOB_WRITE_FILE, "file write"); + worker_register_job_name(WORKER_JOB_RCV_DATA, "receive"); + worker_register_job_name(WORKER_JOB_SND_DATA, "send"); + worker_register_job_name(WORKER_JOB_PROCESS, "process"); 
netdata_thread_cleanup_push(socket_listen_main_static_threaded_worker_cleanup, ptr); @@ -392,7 +425,7 @@ void *socket_listen_main_static_threaded_worker(void *ptr) { , web_server_del_callback , web_server_rcv_callback , web_server_snd_callback - , web_server_tmr_callback + , NULL , web_allow_connections_from , web_allow_connections_dns , NULL