From 23fc75ea3a366e72927ddef27dd5d24a019827d2 Mon Sep 17 00:00:00 2001 From: Nichamon Naksinehaboon Date: Mon, 27 Jan 2025 19:17:11 -0600 Subject: [PATCH] Prevent race between look-up and set deletion Fix a race condition where a set delete request from a peer could invalidate maps while ldmsd is handling a rendezvous lookup and is about to submit a remote read request, or while an update is being scheduled. Check if the remote and local map handles of the set are valid and hold the lock during the entire read submission to ensure the maps remain valid. This is a corner case. The time window between the times the server deletes the set and responds to a lookup request is very small. In practice, this could happen when a set is very short-lived. --- ldms/src/core/ldms_xprt.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ldms/src/core/ldms_xprt.c b/ldms/src/core/ldms_xprt.c index 929a94409..27e695dea 100644 --- a/ldms/src/core/ldms_xprt.c +++ b/ldms/src/core/ldms_xprt.c @@ -2608,11 +2608,17 @@ static void handle_rendezvous_lookup(zap_ep_t zep, zap_event_t ev, rd_ctxt->rc = ctxt->rc; pthread_mutex_unlock(&x->lock); assert((zep == x->zap_ep) && (x == rd_ctxt->x)); + pthread_mutex_lock(&lset->lock); + if (!lset->lmap || !lset->rmap) { + pthread_mutex_unlock(&lset->lock); + goto callback; + } rc = zap_read(zep, lset->rmap, zap_map_addr(lset->rmap), lset->lmap, zap_map_addr(lset->lmap), __le32_to_cpu(lset->meta->meta_sz), rd_ctxt); + pthread_mutex_unlock(&lset->lock); if (rc) { x->zerrno = rc; rc = zap_zerr2errno(rc);