From 0c0c4683b7137c23212b54c6c7b490dea168199d Mon Sep 17 00:00:00 2001 From: Zain Budhwani Date: Tue, 20 Feb 2024 22:54:00 +0000 Subject: [PATCH 1/6] Add incremental delay when polling to see if thread is init --- src/sonic-eventd/src/eventd.cpp | 6 ++++-- src/sonic-eventd/src/eventd.h | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/sonic-eventd/src/eventd.cpp b/src/sonic-eventd/src/eventd.cpp index 36c162453427..7b861335fa8d 100644 --- a/src/sonic-eventd/src/eventd.cpp +++ b/src/sonic-eventd/src/eventd.cpp @@ -546,9 +546,11 @@ capture_service::set_control(capture_control_t ctrl, event_serialized_lst_t *lst switch(ctrl) { case INIT_CAPTURE: m_thr = thread(&capture_service::do_capture, this); + int duration = CAPTURE_SERVICE_POLLING_DURATION; for(int i=0; !m_cap_run && (i < CAPTURE_SERVICE_POLLING_RETRIES); ++i) { - /* Wait max a second for thread to init */ - this_thread::sleep_for(chrono::milliseconds(CAPTURE_SERVICE_POLLING_DURATION)); + /* Poll to see if thread has been init, if so exit early. Add delay on every attempt */ + this_thread::sleep_for(chrono::milliseconds(duration)); + duration = min(duration + CAPTURE_SERVICE_POLLING_INCREMENT, CAPTURE_SERVICE_POLLING_DELAY); } RET_ON_ERR(m_cap_run, "Failed to init capture"); m_ctrl = ctrl; diff --git a/src/sonic-eventd/src/eventd.h b/src/sonic-eventd/src/eventd.h index a7a87f9436a0..bf0d87d6382d 100644 --- a/src/sonic-eventd/src/eventd.h +++ b/src/sonic-eventd/src/eventd.h @@ -22,6 +22,8 @@ typedef enum { #define EVENTS_STATS_FIELD_NAME "value" #define STATS_HEARTBEAT_MIN 300 #define CAPTURE_SERVICE_POLLING_DURATION 10 +#define CAPTURE_SERVICE_POLLING_INCREMENT 10 +#define CAPTUER_SERVICE_POLLING_MAX_DURATION 100 #define CAPTURE_SERVICE_POLLING_RETRIES 100 /* From a50802815adee2a15ce4243ecad97dde37229380 Mon Sep 17 00:00:00 2001 From: Zain Budhwani Date: Wed, 21 Feb 2024 01:23:41 +0000 Subject: [PATCH 2/6] Push changes to eventd.cpp --- src/sonic-eventd/src/eventd.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sonic-eventd/src/eventd.cpp b/src/sonic-eventd/src/eventd.cpp index 7b861335fa8d..b831041b0af6 100644 --- a/src/sonic-eventd/src/eventd.cpp +++ b/src/sonic-eventd/src/eventd.cpp @@ -539,6 +539,7 @@ int capture_service::set_control(capture_control_t ctrl, event_serialized_lst_t *lst) { int ret = -1; + int duration = CAPTURE_SERVICE_POLLING_DURATION; /* Can go in single step only. */ RET_ON_ERR((ctrl - m_ctrl) == 1, "m_ctrl(%d)+1 < ctrl(%d)", m_ctrl, ctrl); @@ -546,11 +547,10 @@ capture_service::set_control(capture_control_t ctrl, event_serialized_lst_t *lst switch(ctrl) { case INIT_CAPTURE: m_thr = thread(&capture_service::do_capture, this); - int duration = CAPTURE_SERVICE_POLLING_DURATION; for(int i=0; !m_cap_run && (i < CAPTURE_SERVICE_POLLING_RETRIES); ++i) { /* Poll to see if thread has been init, if so exit early. Add delay on every attempt */ this_thread::sleep_for(chrono::milliseconds(duration)); - duration = min(duration + CAPTURE_SERVICE_POLLING_INCREMENT, CAPTURE_SERVICE_POLLING_DELAY); + duration = min(duration + CAPTURE_SERVICE_POLLING_INCREMENT, CAPTURE_SERVICE_POLLING_MAX_DURATION); } RET_ON_ERR(m_cap_run, "Failed to init capture"); m_ctrl = ctrl; From c9127189a7e6b857c8b3488adc7b94c5315fd0c3 Mon Sep 17 00:00:00 2001 From: Zain Budhwani Date: Wed, 21 Feb 2024 18:29:28 +0000 Subject: [PATCH 3/6] Push changes to eventd.h --- src/sonic-eventd/src/eventd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sonic-eventd/src/eventd.h b/src/sonic-eventd/src/eventd.h index bf0d87d6382d..960dfb8b8dc1 100644 --- a/src/sonic-eventd/src/eventd.h +++ b/src/sonic-eventd/src/eventd.h @@ -23,7 +23,7 @@ typedef enum { #define STATS_HEARTBEAT_MIN 300 #define CAPTURE_SERVICE_POLLING_DURATION 10 #define CAPTURE_SERVICE_POLLING_INCREMENT 10 -#define CAPTUER_SERVICE_POLLING_MAX_DURATION 100 +#define CAPTURE_SERVICE_POLLING_MAX_DURATION 100 #define CAPTURE_SERVICE_POLLING_RETRIES 100 /* From ab1a29cde73ffd138c29e2dddcd153d54840ff5e Mon Sep 17 00:00:00 2001 From: Zain Budhwani Date: Mon, 4 Mar 2024 19:10:30 +0000 Subject: [PATCH 4/6] Fix eventd exiting for any reason logged as error --- src/sonic-eventd/src/eventd.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sonic-eventd/src/eventd.cpp b/src/sonic-eventd/src/eventd.cpp index b831041b0af6..8c10fedd2527 100644 --- a/src/sonic-eventd/src/eventd.cpp +++ b/src/sonic-eventd/src/eventd.cpp @@ -810,7 +810,7 @@ run_eventd_service() if (zctx != NULL) { zmq_ctx_term(zctx); } - SWSS_LOG_ERROR("Eventd service exiting\n"); + SWSS_LOG_INFO("Eventd service exiting\n"); } void set_unit_testing(bool b) From 5588c8532e215da86a6aeaf302ae5374754e2387 Mon Sep 17 00:00:00 2001 From: Zain Budhwani Date: Mon, 4 Mar 2024 22:11:51 +0000 Subject: [PATCH 5/6] Add logic such that eventd does not exit if capture service is unavailable --- src/sonic-eventd/src/eventd.cpp | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/sonic-eventd/src/eventd.cpp b/src/sonic-eventd/src/eventd.cpp index 8c10fedd2527..a818731151fc 100644 --- a/src/sonic-eventd/src/eventd.cpp +++ b/src/sonic-eventd/src/eventd.cpp @@ -649,6 +649,7 @@ run_eventd_service() stats_collector stats_instance; eventd_proxy *proxy = NULL; capture_service *capture = NULL; + bool skip_caching = false; event_serialized_lst_t capture_fifo_events; last_events_t capture_last_events; @@ -679,8 +680,14 @@ run_eventd_service() * Telemetry will send a stop & collect cache upon startup */ capture = new capture_service(zctx, cache_max, &stats_instance); - RET_ON_ERR(capture->set_control(INIT_CAPTURE) == 0, "Failed to init capture"); - RET_ON_ERR(capture->set_control(START_CAPTURE) == 0, "Failed to start capture"); + if (capture->set_control(INIT_CAPTURE) != 0) { + SWSS_LOG_WARN("Failed to initialize capture service, so we skip caching"); + skip_caching = true; + delete capture; + capture = NULL; // Capture service will not be available + } else { + RET_ON_ERR(capture->set_control(START_CAPTURE) == 0, "Failed to start capture"); + } this_thread::sleep_for(chrono::milliseconds(200)); RET_ON_ERR(stats_instance.is_running(), "Failed to start stats instance"); @@ -710,7 +717,7 @@ run_eventd_service() case EVENT_CACHE_START: if (capture == NULL) { - SWSS_LOG_ERROR("Cache is not initialized to start"); + SWSS_LOG_WARN("Cache is not initialized to start"); resp = -1; break; } @@ -723,7 +730,7 @@ run_eventd_service() case EVENT_CACHE_STOP: if (capture == NULL) { - SWSS_LOG_ERROR("Cache is not initialized to stop"); + SWSS_LOG_WARN("Cache is not initialized to stop"); resp = -1; break; } @@ -742,6 +749,11 @@ run_eventd_service() case EVENT_CACHE_READ: + if (skip_caching) { + SWSS_LOG_WARN("Capture service is unavailable, skipping cache read"); + resp = -1; + break; + } if (capture != NULL) { SWSS_LOG_ERROR("Cache is not stopped yet."); resp = -1; From 90f43c5449aa0bee7a12c7ada464a2f6a4647d9e Mon Sep 17 00:00:00 2001 From: Zain Budhwani Date: Mon, 11 Mar 2024 21:42:38 +0000 Subject: [PATCH 6/6] Use smart pointer for capture service --- src/sonic-eventd/src/eventd.cpp | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/sonic-eventd/src/eventd.cpp b/src/sonic-eventd/src/eventd.cpp index a818731151fc..eb692d5b3a9b 100644 --- a/src/sonic-eventd/src/eventd.cpp +++ b/src/sonic-eventd/src/eventd.cpp @@ -1,4 +1,5 @@ #include +#include #include "eventd.h" #include "dbconnector.h" #include "zmq.h" @@ -648,7 +649,7 @@ run_eventd_service() event_service service; stats_collector stats_instance; eventd_proxy *proxy = NULL; - capture_service *capture = NULL; + unique_ptr capture; bool skip_caching = false; event_serialized_lst_t capture_fifo_events; @@ -679,12 +680,11 @@ run_eventd_service() * events until telemetry starts. * Telemetry will send a stop & collect cache upon startup */ - capture = new capture_service(zctx, cache_max, &stats_instance); + capture = make_unique(zctx, cache_max, &stats_instance); if (capture->set_control(INIT_CAPTURE) != 0) { SWSS_LOG_WARN("Failed to initialize capture service, so we skip caching"); skip_caching = true; - delete capture; - capture = NULL; // Capture service will not be available + capture.reset(); // Capture service will not be available } else { RET_ON_ERR(capture->set_control(START_CAPTURE) == 0, "Failed to start capture"); } @@ -703,12 +703,12 @@ run_eventd_service() case EVENT_CACHE_INIT: /* connect only*/ if (capture != NULL) { - delete capture; + capture.reset(); } event_serialized_lst_t().swap(capture_fifo_events); last_events_t().swap(capture_last_events); - capture = new capture_service(zctx, cache_max, &stats_instance); + capture = make_unique(zctx, cache_max, &stats_instance); if (capture != NULL) { resp = capture->set_control(INIT_CAPTURE); } @@ -740,8 +740,7 @@ run_eventd_service() resp = capture->read_cache(capture_fifo_events, capture_last_events, overflow); } - delete capture; - capture = NULL; + capture.reset(); /* Unpause heartbeat upon stop caching */ stats_instance.heartbeat_ctrl(); @@ -816,9 +815,6 @@ run_eventd_service() if (proxy != NULL) { delete proxy; } - if (capture != NULL) { - delete capture; - } if (zctx != NULL) { zmq_ctx_term(zctx); }