From 26568194066567d7ed20f62834e5c6f45ba62e8c Mon Sep 17 00:00:00 2001 From: Gabriel mermelstein Date: Tue, 1 Oct 2024 14:30:09 +0300 Subject: [PATCH 01/10] adding debug logs --- waku/node/peer_manager/peer_manager.nim | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/waku/node/peer_manager/peer_manager.nim b/waku/node/peer_manager/peer_manager.nim index 07a091e864..f0ca376892 100644 --- a/waku/node/peer_manager/peer_manager.nim +++ b/waku/node/peer_manager/peer_manager.nim @@ -732,6 +732,12 @@ proc connectToRelayPeers*(pm: PeerManager) {.async.} = var (inRelayPeers, outRelayPeers) = pm.connectedPeers(WakuRelayCodec) let totalRelayPeers = inRelayPeers.len + outRelayPeers.len + info "connectToRelayPeers", + inRelayPeers = inRelayPeers.len, + inRelayPeersTarget = pm.inRelayPeersTarget, + outRelayPeers = outRelayPeers.len, + outRelayPeersTarget = pm.outRelayPeersTarget + if inRelayPeers.len > pm.inRelayPeersTarget: await pm.pruneInRelayConns(inRelayPeers.len - pm.inRelayPeersTarget) @@ -749,6 +755,8 @@ proc connectToRelayPeers*(pm: PeerManager) {.async.} = min(outsideBackoffPeers.len, pm.outRelayPeersTarget - outRelayPeers.len) ## number of outstanding connection requests + info "connectToRelayPeers connecting to peers", + numPendingConnReqs = numPendingConnReqs while numPendingConnReqs > 0 and outRelayPeers.len < pm.outRelayPeersTarget: let numPeersToConnect = min(numPendingConnReqs, MaxParallelDials) await pm.connectToNodes(outsideBackoffPeers[index ..< (index + numPeersToConnect)]) @@ -757,6 +765,12 @@ proc connectToRelayPeers*(pm: PeerManager) {.async.} = index += numPeersToConnect numPendingConnReqs -= numPeersToConnect + (inRelayPeers, outRelayPeers) = pm.connectedPeers(WakuRelayCodec) + info "finished connectToRelayPeers", + inRelayPeers = inRelayPeers.len, + inRelayPeersTarget = pm.inRelayPeersTarget, + outRelayPeers = outRelayPeers.len, + outRelayPeersTarget = pm.outRelayPeersTarget proc manageRelayPeers*(pm: PeerManager) {.async.} = if pm.wakuMetadata.shards.len == 0: From a0b23fbdfdf030e9d3b7c29356c21308c8fd306b Mon Sep 17 00:00:00 2001 From: Gabriel mermelstein Date: Tue, 1 Oct 2024 15:07:18 +0300 Subject: [PATCH 02/10] more logs --- waku/node/peer_manager/peer_manager.nim | 2 ++ waku/node/waku_node.nim | 1 + 2 files changed, 3 insertions(+) diff --git a/waku/node/peer_manager/peer_manager.nim b/waku/node/peer_manager/peer_manager.nim index f0ca376892..15812b0b12 100644 --- a/waku/node/peer_manager/peer_manager.nim +++ b/waku/node/peer_manager/peer_manager.nim @@ -190,6 +190,7 @@ proc connectRelay*( wireAddr = peer.addrs, peerId = peerId, failedAttempts = failedAttempts var deadline = sleepAsync(dialTimeout) + info "calling connectRelay" let workfut = pm.switch.connect(peerId, peer.addrs) # Can't use catch: with .withTimeout() in this case @@ -685,6 +686,7 @@ proc reconnectPeers*( ## Reconnect to peers registered for this protocol. This will update connectedness. ## Especially useful to resume connections from persistent storage after a restart. + info "calling reconnectPeers" debug "Reconnecting peers", proto = proto # Proto is not persisted, we need to iterate over all peers. diff --git a/waku/node/waku_node.nim b/waku/node/waku_node.nim index 9996e6c138..ce54633c5f 100644 --- a/waku/node/waku_node.nim +++ b/waku/node/waku_node.nim @@ -196,6 +196,7 @@ proc connectToNodes*( ) {.async.} = ## `source` indicates source of node addrs (static config, api call, discovery, etc) # NOTE Connects to the node without a give protocol, which automatically creates streams for relay + info "calling connectToNodes" await peer_manager.connectToNodes(node.peerManager, nodes, source = source) proc disconnectNode*(node: WakuNode, remotePeer: RemotePeerInfo) {.async.} = From 1ca9970fddd10ad8b3c152f1e49c6ed7ca92b48c Mon Sep 17 00:00:00 2001 From: Gabriel mermelstein Date: Tue, 1 Oct 2024 17:40:09 +0300 Subject: [PATCH 03/10] more logs --- waku/node/peer_manager/peer_manager.nim | 3 +++ waku/node/waku_node.nim | 1 + 2 files changed, 4 insertions(+) diff --git a/waku/node/peer_manager/peer_manager.nim b/waku/node/peer_manager/peer_manager.nim index 15812b0b12..afd217f4e9 100644 --- a/waku/node/peer_manager/peer_manager.nim +++ b/waku/node/peer_manager/peer_manager.nim @@ -253,6 +253,8 @@ proc dialPeer( trace "Dialing peer", wireAddr = addrs, peerId = peerId, proto = proto + info "calling dialPeer" + # Dial Peer let dialFut = pm.switch.dial(peerId, addrs, proto) @@ -611,6 +613,7 @@ proc dialPeer*( # First add dialed peer info to peer store, if it does not exist yet.. # TODO: nim libp2p peerstore already adds them + info "calling peerManager's dialPeer" if not pm.wakuPeerStore.hasPeer(remotePeerInfo.peerId, proto): trace "Adding newly dialed peer to manager", peerId = $remotePeerInfo.peerId, address = $remotePeerInfo.addrs[0], proto = proto diff --git a/waku/node/waku_node.nim b/waku/node/waku_node.nim index ce54633c5f..77f605c065 100644 --- a/waku/node/waku_node.nim +++ b/waku/node/waku_node.nim @@ -1252,6 +1252,7 @@ proc keepaliveLoop(node: WakuNode, keepalive: chronos.Duration) {.async.} = for peer in peers: try: + info "calling keepAlive dial" let conn = await node.switch.dial(peer.peerId, peer.addrs, PingCodec) let pingDelay = await node.libp2pPing.ping(conn) await conn.close() From 3a1f0a8e0c0435ea323adcf22c1d79a859a24ff8 Mon Sep 17 00:00:00 2001 From: Gabriel mermelstein Date: Tue, 1 Oct 2024 18:22:57 +0300 Subject: [PATCH 04/10] logs --- waku/node/peer_manager/peer_manager.nim | 1 + waku/node/waku_node.nim | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/waku/node/peer_manager/peer_manager.nim b/waku/node/peer_manager/peer_manager.nim index afd217f4e9..a3043f0463 100644 --- a/waku/node/peer_manager/peer_manager.nim +++ b/waku/node/peer_manager/peer_manager.nim @@ -367,6 +367,7 @@ proc onConnEvent(pm: PeerManager, peerId: PeerID, event: ConnEvent) {.async.} = discard proc onPeerMetadata(pm: PeerManager, peerId: PeerId) {.async.} = + info "calling onPeerMetadata" let res = catch: await pm.switch.dial(peerId, WakuMetadataCodec) diff --git a/waku/node/waku_node.nim b/waku/node/waku_node.nim index 77f605c065..df350ee8fd 100644 --- a/waku/node/waku_node.nim +++ b/waku/node/waku_node.nim @@ -434,6 +434,7 @@ proc startRelay*(node: WakuNode) {.async.} = let backoffPeriod = node.wakuRelay.parameters.pruneBackoff + chronos.seconds(BackoffSlackTime) + info "calling reconnectPeers", backoffPeriod = backoffPeriod await node.peerManager.reconnectPeers(WakuRelayCodec, backoffPeriod) # Start the WakuRelay protocol @@ -1252,7 +1253,6 @@ proc keepaliveLoop(node: WakuNode, keepalive: chronos.Duration) {.async.} = for peer in peers: try: - info "calling keepAlive dial" let conn = await node.switch.dial(peer.peerId, peer.addrs, PingCodec) let pingDelay = await node.libp2pPing.ping(conn) await conn.close() From 75ab0e141e089d4b88ece1d8b65ecc4c811b6ce2 Mon Sep 17 00:00:00 2001 From: Gabriel mermelstein Date: Tue, 1 Oct 2024 18:42:35 +0300 Subject: [PATCH 05/10] updating nim-libp2p to debug branch --- .gitmodules | 2 +- vendor/nim-libp2p | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index 5650701b4c..cb0fdb6657 100644 --- a/.gitmodules +++ b/.gitmodules @@ -12,7 +12,7 @@ path = vendor/nim-libp2p url = https://github.com/vacp2p/nim-libp2p.git ignore = dirty - branch = master + branch = debug-excess-peer-connections [submodule "vendor/nim-stew"] path = vendor/nim-stew url = https://github.com/status-im/nim-stew.git diff --git a/vendor/nim-libp2p b/vendor/nim-libp2p index 18a2e79ce2..207b4a0f5c 160000 --- a/vendor/nim-libp2p +++ b/vendor/nim-libp2p @@ -1 +1 @@ -Subproject commit 18a2e79ce209d2f21d8c5db7a41659cc39dbe0b4 +Subproject commit 207b4a0f5c890b85280107c0f913f479b32a8daa From 98ce2306d17421e0d30f67f8f757599843fea6a5 Mon Sep 17 00:00:00 2001 From: Gabriel mermelstein Date: Tue, 1 Oct 2024 19:08:46 +0300 Subject: [PATCH 06/10] logs --- waku/node/peer_manager/peer_manager.nim | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/waku/node/peer_manager/peer_manager.nim b/waku/node/peer_manager/peer_manager.nim index a3043f0463..d191c681a3 100644 --- a/waku/node/peer_manager/peer_manager.nim +++ b/waku/node/peer_manager/peer_manager.nim @@ -367,7 +367,6 @@ proc onConnEvent(pm: PeerManager, peerId: PeerID, event: ConnEvent) {.async.} = discard proc onPeerMetadata(pm: PeerManager, peerId: PeerId) {.async.} = - info "calling onPeerMetadata" let res = catch: await pm.switch.dial(peerId, WakuMetadataCodec) @@ -744,6 +743,8 @@ proc connectToRelayPeers*(pm: PeerManager) {.async.} = outRelayPeers = outRelayPeers.len, outRelayPeersTarget = pm.outRelayPeersTarget + info "connectToRelayPeers outRelayPeers", outRelayPeers = outRelayPeers + if inRelayPeers.len > pm.inRelayPeersTarget: await pm.pruneInRelayConns(inRelayPeers.len - pm.inRelayPeersTarget) From 5c9bc68aaf8f26100931284bf907f3e1f826a81c Mon Sep 17 00:00:00 2001 From: Gabriel mermelstein Date: Wed, 2 Oct 2024 11:01:21 +0300 Subject: [PATCH 07/10] ping log --- waku/node/waku_node.nim | 1 + 1 file changed, 1 insertion(+) diff --git a/waku/node/waku_node.nim b/waku/node/waku_node.nim index df350ee8fd..64092b0f83 100644 --- a/waku/node/waku_node.nim +++ b/waku/node/waku_node.nim @@ -1253,6 +1253,7 @@ proc keepaliveLoop(node: WakuNode, keepalive: chronos.Duration) {.async.} = for peer in peers: try: + info "calling keepAlive dial", peerId = peer.peerId let conn = await node.switch.dial(peer.peerId, peer.addrs, PingCodec) let pingDelay = await node.libp2pPing.ping(conn) await conn.close() From f3b533f5ae99ccca849f51bce4f4cf69b4c07812 Mon Sep 17 00:00:00 2001 From: Gabriel mermelstein Date: Wed, 2 Oct 2024 12:42:13 +0300 Subject: [PATCH 08/10] adding potential fix --- waku/node/peer_manager/peer_manager.nim | 10 ++++++---- waku/node/waku_node.nim | 14 ++++++++------ 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/waku/node/peer_manager/peer_manager.nim b/waku/node/peer_manager/peer_manager.nim index d191c681a3..242a76ad79 100644 --- a/waku/node/peer_manager/peer_manager.nim +++ b/waku/node/peer_manager/peer_manager.nim @@ -407,9 +407,11 @@ proc onPeerMetadata(pm: PeerManager, peerId: PeerId) {.async.} = asyncSpawn(pm.switch.disconnect(peerId)) pm.wakuPeerStore.delete(peerId) -proc connectedPeers*(pm: PeerManager, protocol: string): (seq[PeerId], seq[PeerId]) = - ## Returns the peerIds of physical connections (in and out) - ## containing at least one stream with the given protocol. +proc connectedPeers*( + pm: PeerManager, protocol: string = "" +): (seq[PeerId], seq[PeerId]) = + ## Returns the peerIds of physical connections (in and out) + ## If a protocol is specified, only returns peers with at least one stream of that protocol var inPeers: seq[PeerId] var outPeers: seq[PeerId] @@ -417,7 +419,7 @@ proc connectedPeers*(pm: PeerManager, protocol: string): (seq[PeerId], seq[PeerI for peerId, muxers in pm.switch.connManager.getConnections(): for peerConn in muxers: let streams = peerConn.getStreams() - if streams.anyIt(it.protocol == protocol): + if protocol.len == 0 or streams.anyIt(it.protocol == protocol): if peerConn.connection.transportDir == Direction.In: inPeers.add(peerId) elif peerConn.connection.transportDir == Direction.Out: diff --git a/waku/node/waku_node.nim b/waku/node/waku_node.nim index 64092b0f83..4eb671f493 100644 --- a/waku/node/waku_node.nim +++ b/waku/node/waku_node.nim @@ -1244,17 +1244,19 @@ proc mountLibp2pPing*(node: WakuNode) {.async: (raises: []).} = # TODO: Move this logic to PeerManager proc keepaliveLoop(node: WakuNode, keepalive: chronos.Duration) {.async.} = while node.started: - # Keep all connected peers alive while running + # Keep connected peers alive while running + # Each node is responsible of keeping its outgoing connections alive trace "Running keepalive" # First get a list of connected peer infos - let peers = - node.peerManager.wakuPeerStore.peers().filterIt(it.connectedness == Connected) + let outPeers = node.peerManager.connectedPeers()[1] - for peer in peers: + for peerId in outPeers: try: - info "calling keepAlive dial", peerId = peer.peerId - let conn = await node.switch.dial(peer.peerId, peer.addrs, PingCodec) + info "calling keepAlive dial", peerId = peerId + let conn = (await node.peerManager.dialPeer(peerId, PingCodec)).valueOr: + warn "Failed dialing peer for keep alive", peerId = peerId + continue let pingDelay = await node.libp2pPing.ping(conn) await conn.close() except CatchableError as exc: From 41ffa4b7e798193baab94842b4c3e88a0f7e0153 Mon Sep 17 00:00:00 2001 From: Gabriel mermelstein Date: Wed, 2 Oct 2024 13:06:33 +0300 Subject: [PATCH 09/10] removing debug logs --- waku/node/peer_manager/peer_manager.nim | 21 --------------------- waku/node/waku_node.nim | 2 -- 2 files changed, 23 deletions(-) diff --git a/waku/node/peer_manager/peer_manager.nim b/waku/node/peer_manager/peer_manager.nim index 242a76ad79..f51cfa31ac 100644 --- a/waku/node/peer_manager/peer_manager.nim +++ b/waku/node/peer_manager/peer_manager.nim @@ -190,7 +190,6 @@ proc connectRelay*( wireAddr = peer.addrs, peerId = peerId, failedAttempts = failedAttempts var deadline = sleepAsync(dialTimeout) - info "calling connectRelay" let workfut = pm.switch.connect(peerId, peer.addrs) # Can't use catch: with .withTimeout() in this case @@ -253,8 +252,6 @@ proc dialPeer( trace "Dialing peer", wireAddr = addrs, peerId = peerId, proto = proto - info "calling dialPeer" - # Dial Peer let dialFut = pm.switch.dial(peerId, addrs, proto) @@ -615,7 +612,6 @@ proc dialPeer*( # First add dialed peer info to peer store, if it does not exist yet.. # TODO: nim libp2p peerstore already adds them - info "calling peerManager's dialPeer" if not pm.wakuPeerStore.hasPeer(remotePeerInfo.peerId, proto): trace "Adding newly dialed peer to manager", peerId = $remotePeerInfo.peerId, address = $remotePeerInfo.addrs[0], proto = proto @@ -691,7 +687,6 @@ proc reconnectPeers*( ## Reconnect to peers registered for this protocol. This will update connectedness. ## Especially useful to resume connections from persistent storage after a restart. - info "calling reconnectPeers" debug "Reconnecting peers", proto = proto # Proto is not persisted, we need to iterate over all peers. @@ -739,14 +734,6 @@ proc connectToRelayPeers*(pm: PeerManager) {.async.} = var (inRelayPeers, outRelayPeers) = pm.connectedPeers(WakuRelayCodec) let totalRelayPeers = inRelayPeers.len + outRelayPeers.len - info "connectToRelayPeers", - inRelayPeers = inRelayPeers.len, - inRelayPeersTarget = pm.inRelayPeersTarget, - outRelayPeers = outRelayPeers.len, - outRelayPeersTarget = pm.outRelayPeersTarget - - info "connectToRelayPeers outRelayPeers", outRelayPeers = outRelayPeers - if inRelayPeers.len > pm.inRelayPeersTarget: await pm.pruneInRelayConns(inRelayPeers.len - pm.inRelayPeersTarget) @@ -764,8 +751,6 @@ proc connectToRelayPeers*(pm: PeerManager) {.async.} = min(outsideBackoffPeers.len, pm.outRelayPeersTarget - outRelayPeers.len) ## number of outstanding connection requests - info "connectToRelayPeers connecting to peers", - numPendingConnReqs = numPendingConnReqs while numPendingConnReqs > 0 and outRelayPeers.len < pm.outRelayPeersTarget: let numPeersToConnect = min(numPendingConnReqs, MaxParallelDials) await pm.connectToNodes(outsideBackoffPeers[index ..< (index + numPeersToConnect)]) @@ -774,12 +759,6 @@ proc connectToRelayPeers*(pm: PeerManager) {.async.} = index += numPeersToConnect numPendingConnReqs -= numPeersToConnect - (inRelayPeers, outRelayPeers) = pm.connectedPeers(WakuRelayCodec) - info "finished connectToRelayPeers", - inRelayPeers = inRelayPeers.len, - inRelayPeersTarget = pm.inRelayPeersTarget, - outRelayPeers = outRelayPeers.len, - outRelayPeersTarget = pm.outRelayPeersTarget proc manageRelayPeers*(pm: PeerManager) {.async.} = if pm.wakuMetadata.shards.len == 0: diff --git a/waku/node/waku_node.nim b/waku/node/waku_node.nim index 4eb671f493..5aa78203df 100644 --- a/waku/node/waku_node.nim +++ b/waku/node/waku_node.nim @@ -196,7 +196,6 @@ proc connectToNodes*( ) {.async.} = ## `source` indicates source of node addrs (static config, api call, discovery, etc) # NOTE Connects to the node without a give protocol, which automatically creates streams for relay - info "calling connectToNodes" await peer_manager.connectToNodes(node.peerManager, nodes, source = source) proc disconnectNode*(node: WakuNode, remotePeer: RemotePeerInfo) {.async.} = @@ -434,7 +433,6 @@ proc startRelay*(node: WakuNode) {.async.} = let backoffPeriod = node.wakuRelay.parameters.pruneBackoff + chronos.seconds(BackoffSlackTime) - info "calling reconnectPeers", backoffPeriod = backoffPeriod await node.peerManager.reconnectPeers(WakuRelayCodec, backoffPeriod) # Start the WakuRelay protocol From 5d40f73ad78f1a07574248f7193fa3a5b676bb9c Mon Sep 17 00:00:00 2001 From: Gabriel mermelstein Date: Wed, 2 Oct 2024 14:01:13 +0300 Subject: [PATCH 10/10] Revert "updating nim-libp2p to debug branch" This reverts commit 75ab0e141e089d4b88ece1d8b65ecc4c811b6ce2. --- .gitmodules | 2 +- vendor/nim-libp2p | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index cb0fdb6657..5650701b4c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -12,7 +12,7 @@ path = vendor/nim-libp2p url = https://github.com/vacp2p/nim-libp2p.git ignore = dirty - branch = debug-excess-peer-connections + branch = master [submodule "vendor/nim-stew"] path = vendor/nim-stew url = https://github.com/status-im/nim-stew.git diff --git a/vendor/nim-libp2p b/vendor/nim-libp2p index 207b4a0f5c..18a2e79ce2 160000 --- a/vendor/nim-libp2p +++ b/vendor/nim-libp2p @@ -1 +1 @@ -Subproject commit 207b4a0f5c890b85280107c0f913f479b32a8daa +Subproject commit 18a2e79ce209d2f21d8c5db7a41659cc39dbe0b4