From 53a18874c88a33f600521522838332f3e5d2b507 Mon Sep 17 00:00:00 2001 From: Maksim Zheravin Date: Fri, 7 Jan 2022 22:53:39 +0200 Subject: [PATCH 01/15] Scalable processor: initial classes --- .../exchange/core2/revelator/Revelator.java | 3 +- .../core2/revelator/fences/IFenceArray.java | 11 ++ .../revelator/fences/ScalableFenceArray.java | 32 +++ .../scalable/ScalableMessageHandler.java | 19 ++ .../ScalablePrimaryFlowProcessor.java | 184 ++++++++++++++++++ .../ScalableSecondaryFlowProcessor.java | 183 +++++++++++++++++ .../scalable/ScalableShardClassifier.java | 18 ++ 7 files changed, 449 insertions(+), 1 deletion(-) create mode 100644 src/main/java/exchange/core2/revelator/fences/IFenceArray.java create mode 100644 src/main/java/exchange/core2/revelator/fences/ScalableFenceArray.java create mode 100644 src/main/java/exchange/core2/revelator/processors/scalable/ScalableMessageHandler.java create mode 100644 src/main/java/exchange/core2/revelator/processors/scalable/ScalablePrimaryFlowProcessor.java create mode 100644 src/main/java/exchange/core2/revelator/processors/scalable/ScalableSecondaryFlowProcessor.java create mode 100644 src/main/java/exchange/core2/revelator/processors/scalable/ScalableShardClassifier.java diff --git a/src/main/java/exchange/core2/revelator/Revelator.java b/src/main/java/exchange/core2/revelator/Revelator.java index 1edae8c..77c5fe3 100644 --- a/src/main/java/exchange/core2/revelator/Revelator.java +++ b/src/main/java/exchange/core2/revelator/Revelator.java @@ -18,7 +18,7 @@ public final class Revelator implements AutoCloseable { private static final Logger log = LoggerFactory.getLogger(Revelator.class); - public static final int MSG_HEADER_SIZE = 3; + public static final int MSG_HEADER_SIZE = 4; public static final byte MSG_TYPE_POISON_PILL = 31; public static final byte MSG_TYPE_TEST_CONTROL = 30; @@ -208,6 +208,7 @@ public long claimSingleMessage(final int claimingPayloadSize, buffer[index] = msgTypeEncoded | correlationId; 
buffer[index + 1] = timestamp; buffer[index + 2] = claimingPayloadSize; + buffer[index + 3] = 0L; final long payloadStartSeq = msgStartSequence + MSG_HEADER_SIZE; diff --git a/src/main/java/exchange/core2/revelator/fences/IFenceArray.java b/src/main/java/exchange/core2/revelator/fences/IFenceArray.java new file mode 100644 index 0000000..0af7dc3 --- /dev/null +++ b/src/main/java/exchange/core2/revelator/fences/IFenceArray.java @@ -0,0 +1,11 @@ +package exchange.core2.revelator.fences; + +public interface IFenceArray { + + long getAcquire(long entityId); + + long getVolatile(long entityId); + + long getOpaque(long entityId); + +} diff --git a/src/main/java/exchange/core2/revelator/fences/ScalableFenceArray.java b/src/main/java/exchange/core2/revelator/fences/ScalableFenceArray.java new file mode 100644 index 0000000..7510de9 --- /dev/null +++ b/src/main/java/exchange/core2/revelator/fences/ScalableFenceArray.java @@ -0,0 +1,32 @@ +package exchange.core2.revelator.fences; + +public class ScalableFenceArray implements IFenceArray { + + + private final long mask; + private final IFence[] fences; + + // TODO add cache + + public ScalableFenceArray(final IFence[] fences, int numHandlers) { + this.fences = fences; + this.mask = numHandlers - 1; + } + + + @Override + public long getAcquire(long entityId) { + + return fences[(int)(entityId & mask)].getAcquire(0L); + } + + @Override + public long getVolatile(long entityId) { + return 0; + } + + @Override + public long getOpaque(long entityId) { + return 0; + } +} diff --git a/src/main/java/exchange/core2/revelator/processors/scalable/ScalableMessageHandler.java b/src/main/java/exchange/core2/revelator/processors/scalable/ScalableMessageHandler.java new file mode 100644 index 0000000..55103b0 --- /dev/null +++ b/src/main/java/exchange/core2/revelator/processors/scalable/ScalableMessageHandler.java @@ -0,0 +1,19 @@ +package exchange.core2.revelator.processors.scalable; + +public interface ScalableMessageHandler { + + void 
handleMessage(long[] buffer, + int payloadIndex, + int payloadSize, + long timestamp, + long globalOffset, + long correlationId, + byte msgType); + + + + + default void onShutdown() { + } + +} diff --git a/src/main/java/exchange/core2/revelator/processors/scalable/ScalablePrimaryFlowProcessor.java b/src/main/java/exchange/core2/revelator/processors/scalable/ScalablePrimaryFlowProcessor.java new file mode 100644 index 0000000..4120c0b --- /dev/null +++ b/src/main/java/exchange/core2/revelator/processors/scalable/ScalablePrimaryFlowProcessor.java @@ -0,0 +1,184 @@ +package exchange.core2.revelator.processors.scalable; + +import exchange.core2.revelator.Revelator; +import exchange.core2.revelator.RevelatorConfig; +import exchange.core2.revelator.fences.IFence; +import exchange.core2.revelator.fences.SingleWriterFence; +import exchange.core2.revelator.processors.IFlowProcessor; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Arrays; + +import static exchange.core2.revelator.processors.scalable.ScalableShardClassifier.SHARD_ALL; +import static exchange.core2.revelator.processors.scalable.ScalableShardClassifier.SHARD_NONE; + +public final class ScalablePrimaryFlowProcessor implements IFlowProcessor { + + private static final Logger log = LoggerFactory.getLogger(ScalablePrimaryFlowProcessor.class); + + // 0..n-2 processors (n-1 is master) + private final ScalableSecondaryFlowProcessor[] secondaryProcessors; + private final ScalableMessageHandler[] handlers; + + private final ScalableShardClassifier shardClassifier; + + private final IFence inboundFence; + private final SingleWriterFence releasingFence = new SingleWriterFence(); + + private final int indexMask; + private final long[] buffer; + private final int bufferSize; + + private volatile PrimaryState actualState = PrimaryState.CENTRALIZED; + + + public ScalablePrimaryFlowProcessor(final ScalableMessageHandler[] handlers, + final ScalableShardClassifier shardClassifier, + final IFence 
inboundFence, + final RevelatorConfig config) { + + this.handlers = handlers; + this.shardClassifier = shardClassifier; + this.inboundFence = inboundFence; + this.indexMask = config.getIndexMask(); + this.buffer = config.getBuffer(); + this.bufferSize = config.getBufferSize(); + } + + @Override + public void run() { + + + long positionSeq = 0L; + + while (true) { + + int queueRefreshCounter = 0; + + long availableSeq; + while ((availableSeq = inboundFence.getAcquire(positionSeq)) <= positionSeq) { + + if (actualState == PrimaryState.SHARDED) { + actualState = PrimaryState.CONSOLIDATING; + + // no new messages + still in sharded mode => signal secondary processors to initiate stop + for (final ScalableSecondaryFlowProcessor p : secondaryProcessors) { + p.deactivate(positionSeq); + } + } + Thread.onSpinWait(); + } + + if (actualState == PrimaryState.CENTRALIZED && availableSeq - positionSeq > 8192) { + // switch to sharded mode + actualState = PrimaryState.SHARDED; + + for (final ScalableSecondaryFlowProcessor p : secondaryProcessors) { + p.activate(positionSeq, availableSeq); + } + + //final int index = (int) (positionSeq & indexMask); + } + + // TODO separate implementations depending on actualState? 
+ + while (positionSeq < availableSeq) { + + final int index = (int) (positionSeq & indexMask); + + final long header1 = buffer[index]; + + if (header1 == 0L) { + // skip until end of the buffer + positionSeq = (positionSeq | indexMask) + 1; + continue; + } + + final long correlationId = header1 & 0x00FF_FFFF_FFFF_FFFFL; + final int header2 = (int) (header1 >>> 56); + final byte msgType = (byte) (header2 & 0x1F); + + if (msgType == Revelator.MSG_TYPE_POISON_PILL) { + log.debug("processor shutdown (received msgType={}, publishing positionSeq={}+{})", msgType, positionSeq, Revelator.MSG_HEADER_SIZE); + releasingFence.setRelease(positionSeq + Revelator.MSG_HEADER_SIZE); + for (final ScalableMessageHandler handler : handlers) { + handler.onShutdown(); + } + return; + } + + final int indexMsg = index + Revelator.MSG_HEADER_SIZE; + + // payload size in longs + final int payloadSize = (int) buffer[index + 2]; + if (indexMsg + payloadSize > bufferSize) { + throw new IllegalStateException("Failed to decode message: headerSize=" + Revelator.MSG_HEADER_SIZE + + " payloadSize=" + payloadSize + + " correlationId=" + correlationId + + " unexpected " + (indexMsg + payloadSize - bufferSize) + " bytes"); + } + + final int shard = shardClassifier.getShardMessage(buffer, indexMsg, payloadSize, msgType); + + if (shard != SHARD_NONE) { + + final long timestamp = buffer[index + 1]; + + if (shard == SHARD_ALL) { + + // broadcast to each handler + for (final ScalableMessageHandler handler : handlers) { + try { + handler.handleMessage(buffer, indexMsg, payloadSize, timestamp, positionSeq, correlationId, msgType); + } catch (final Exception ex) { + log.debug("Exception when processing batch", ex); + } + } + + } else { + + // broadcast to one handler + + final ScalableMessageHandler handler = handlers[shard]; + + try { + handler.handleMessage(buffer, indexMsg, payloadSize, timestamp, positionSeq, correlationId, msgType); + } catch (final Exception ex) { + log.debug("Exception when processing 
batch", ex); + } + } + + } + + positionSeq += Revelator.MSG_HEADER_SIZE + payloadSize; + + // limit batches size + if (queueRefreshCounter++ > 256) { + break; + } + + } + + releasingFence.setRelease(availableSeq); + } + + } + + public SingleWriterFence getReleasingFence() { + return releasingFence; + } + + @Override + public String toString() { + return "ScalableFlowProcessor{" + Arrays.toString(handlers) + '}'; + } + + + public enum PrimaryState { + CENTRALIZED, +// SHARDED_PREPARE, + SHARDED, + CONSOLIDATING, + } +} diff --git a/src/main/java/exchange/core2/revelator/processors/scalable/ScalableSecondaryFlowProcessor.java b/src/main/java/exchange/core2/revelator/processors/scalable/ScalableSecondaryFlowProcessor.java new file mode 100644 index 0000000..38c6add --- /dev/null +++ b/src/main/java/exchange/core2/revelator/processors/scalable/ScalableSecondaryFlowProcessor.java @@ -0,0 +1,183 @@ +package exchange.core2.revelator.processors.scalable; + +import exchange.core2.revelator.Revelator; +import exchange.core2.revelator.RevelatorConfig; +import exchange.core2.revelator.fences.IFence; +import exchange.core2.revelator.fences.SingleWriterFence; +import exchange.core2.revelator.processors.IFlowProcessor; +import exchange.core2.revelator.processors.simple.SimpleMessageHandler; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.concurrent.locks.LockSupport; + +public final class ScalableSecondaryFlowProcessor implements IFlowProcessor { + + private static final Logger log = LoggerFactory.getLogger(ScalableSecondaryFlowProcessor.class); + + private final SimpleMessageHandler handler; + + private final IFence inboundFence; + private final SingleWriterFence releasingFence = new SingleWriterFence(); + + private final int indexMask; + private final long[] buffer; + private final int bufferSize; + + private long activationOffsetFrom = -1L; + private long activationOffsetAvailable = -1L; + + private long deactivationRequestedOffset = -1L; + 
private long deactivationActualOffset = -1L; + + private volatile State requestedState = State.SLEEP; + + private State actualState = State.SLEEP; + + public ScalableSecondaryFlowProcessor(final SimpleMessageHandler handler, + final IFence inboundFence, + final RevelatorConfig config) { + + this.handler = handler; + this.inboundFence = inboundFence; + this.indexMask = config.getIndexMask(); + this.buffer = config.getBuffer(); + this.bufferSize = config.getBufferSize(); + } + + @Override + public void run() { + + + long positionSeq = 0L; + + while (true) { + + final long availableSeq; + if (actualState == State.SLEEP) { + // note: volatile read of the current state (should be ok) + while (requestedState != State.RUNNING) { + // do nothing + LockSupport.parkNanos(7_000); + } + actualState = State.RUNNING; + positionSeq = activationOffsetFrom; // start from proposed sequence + availableSeq = activationOffsetAvailable; // use last known from primary processor (optimization) + + } else { + + // wait for inboundFence + long availableSeq1; + while ((availableSeq1 = inboundFence.getAcquire(positionSeq)) <= positionSeq) { + + Thread.yield(); + + //check requestedState sometimes + if (actualState == State.RUNNING) { + + // expensive volatile read but it is acceptable if end of spike was reached + if (requestedState == State.FINALIZING) { + actualState = State.FINALIZING; + availableSeq1 = deactivationRequestedOffset; + break; + } + } + + } + availableSeq = availableSeq1; + } + + + // currentState is running + int queueRefreshCounter = 0; + + + while (positionSeq < availableSeq) { + + final int index = (int) (positionSeq & indexMask); + + final long header1 = buffer[index]; + + if (header1 == 0L) { + // skip until end of the buffer + positionSeq = (positionSeq | indexMask) + 1; + continue; + } + + final long correlationId = header1 & 0x00FF_FFFF_FFFF_FFFFL; + final int header2 = (int) (header1 >>> 56); + final byte msgType = (byte) (header2 & 0x1F); + + if (msgType == 
Revelator.MSG_TYPE_POISON_PILL) { + log.debug("processor shutdown (received msgType={}, publishing positionSeq={}+{})", msgType, positionSeq, Revelator.MSG_HEADER_SIZE); + releasingFence.setRelease(positionSeq + Revelator.MSG_HEADER_SIZE); + handler.onShutdown(); + return; + } + + final long timestamp = buffer[index + 1]; + + // payload size in longs + final int payloadSize = (int) buffer[index + 2]; + final int indexMsg = index + Revelator.MSG_HEADER_SIZE; + if (indexMsg + payloadSize > bufferSize) { + throw new IllegalStateException("Failed to decode message: headerSize=" + Revelator.MSG_HEADER_SIZE + + " payloadSize=" + payloadSize + + " correlationId=" + correlationId + + " unexpected " + (indexMsg + payloadSize - bufferSize) + " bytes"); + } + + try { + handler.handleMessage(buffer, indexMsg, payloadSize, timestamp, positionSeq, correlationId, msgType); + } catch (final Exception ex) { + log.debug("Exception when processing batch", ex); + // TODO call custom handler + } + + positionSeq += Revelator.MSG_HEADER_SIZE + payloadSize; + + // limit batches size + if (queueRefreshCounter++ > 256) { + break; + } + + } + + releasingFence.setRelease(availableSeq); + } + + } + + public void activate(long offsetFrom, long offsetAvailable) { + activationOffsetFrom = offsetFrom; + activationOffsetAvailable = offsetAvailable; // just share most recent from main primary processor + requestedState = State.RUNNING; + } + + public void deactivate(long offset) { + deactivationRequestedOffset = offset; + requestedState = State.FINALIZING; + } + + public long getActualDeactivationOffset() { + // TODO volatile write + return deactivationActualOffset; + } + + + public enum State { + SLEEP, + RUNNING, + FINALIZING + } + + + public SingleWriterFence getReleasingFence() { + return releasingFence; + } + + @Override + public String toString() { + return "ScalableFlowProcessor{" + handler + '}'; + } +} diff --git 
a/src/main/java/exchange/core2/revelator/processors/scalable/ScalableShardClassifier.java b/src/main/java/exchange/core2/revelator/processors/scalable/ScalableShardClassifier.java new file mode 100644 index 0000000..6d36f43 --- /dev/null +++ b/src/main/java/exchange/core2/revelator/processors/scalable/ScalableShardClassifier.java @@ -0,0 +1,18 @@ +package exchange.core2.revelator.processors.scalable; + +public interface ScalableShardClassifier { + + /** + * @return handlerId (extract shard) + */ + int getShardMessage(long[] buffer, + int payloadIndex, + int payloadSize, + byte msgType); + + /** + * Constants + */ + int SHARD_ALL = -1; + int SHARD_NONE = Integer.MIN_VALUE; +} From c10141d4b5c942e76ca93285d4f8434dc92e265e Mon Sep 17 00:00:00 2001 From: Maksim Zheravin Date: Sun, 9 Jan 2022 15:14:56 +0200 Subject: [PATCH 02/15] Scalable processor: implementing per-batch processing --- .../ScalablePrimaryFlowProcessor.java | 19 +- .../ScalableSecondaryFlowProcessor.java | 167 ++++++------------ 2 files changed, 63 insertions(+), 123 deletions(-) diff --git a/src/main/java/exchange/core2/revelator/processors/scalable/ScalablePrimaryFlowProcessor.java b/src/main/java/exchange/core2/revelator/processors/scalable/ScalablePrimaryFlowProcessor.java index 4120c0b..a461ba7 100644 --- a/src/main/java/exchange/core2/revelator/processors/scalable/ScalablePrimaryFlowProcessor.java +++ b/src/main/java/exchange/core2/revelator/processors/scalable/ScalablePrimaryFlowProcessor.java @@ -32,13 +32,14 @@ public final class ScalablePrimaryFlowProcessor implements IFlowProcessor { private volatile PrimaryState actualState = PrimaryState.CENTRALIZED; - public ScalablePrimaryFlowProcessor(final ScalableMessageHandler[] handlers, + final ScalableSecondaryFlowProcessor[] processors, final ScalableShardClassifier shardClassifier, final IFence inboundFence, final RevelatorConfig config) { this.handlers = handlers; + this.secondaryProcessors = processors; this.shardClassifier = shardClassifier; 
this.inboundFence = inboundFence; this.indexMask = config.getIndexMask(); @@ -64,18 +65,20 @@ public void run() { // no new messages + still in sharded mode => signal secondary processors to initiate stop for (final ScalableSecondaryFlowProcessor p : secondaryProcessors) { - p.deactivate(positionSeq); + //p.deactivate(positionSeq); } } Thread.onSpinWait(); } - if (actualState == PrimaryState.CENTRALIZED && availableSeq - positionSeq > 8192) { + + // check if batch is big enough to activate secondary processors + if (actualState == PrimaryState.CENTRALIZED && availableSeq - positionSeq > 8192) { // TODO could be just single large message // switch to sharded mode actualState = PrimaryState.SHARDED; for (final ScalableSecondaryFlowProcessor p : secondaryProcessors) { - p.activate(positionSeq, availableSeq); + p.processRange(positionSeq, availableSeq); } //final int index = (int) (positionSeq & indexMask); @@ -119,13 +122,14 @@ public void run() { + " unexpected " + (indexMsg + payloadSize - bufferSize) + " bytes"); } + // check which for which handlers to execute final int shard = shardClassifier.getShardMessage(buffer, indexMsg, payloadSize, msgType); - if (shard != SHARD_NONE) { + if (shard != ScalableShardClassifier.SHARD_NONE) { final long timestamp = buffer[index + 1]; - if (shard == SHARD_ALL) { + if (shard == ScalableShardClassifier.SHARD_ALL) { // broadcast to each handler for (final ScalableMessageHandler handler : handlers) { @@ -139,7 +143,6 @@ public void run() { } else { // broadcast to one handler - final ScalableMessageHandler handler = handlers[shard]; try { @@ -177,7 +180,7 @@ public String toString() { public enum PrimaryState { CENTRALIZED, -// SHARDED_PREPARE, + // SHARDED_PREPARE, SHARDED, CONSOLIDATING, } diff --git a/src/main/java/exchange/core2/revelator/processors/scalable/ScalableSecondaryFlowProcessor.java b/src/main/java/exchange/core2/revelator/processors/scalable/ScalableSecondaryFlowProcessor.java index 38c6add..5ca3d9b 100644 --- 
a/src/main/java/exchange/core2/revelator/processors/scalable/ScalableSecondaryFlowProcessor.java +++ b/src/main/java/exchange/core2/revelator/processors/scalable/ScalableSecondaryFlowProcessor.java @@ -6,9 +6,13 @@ import exchange.core2.revelator.fences.SingleWriterFence; import exchange.core2.revelator.processors.IFlowProcessor; import exchange.core2.revelator.processors.simple.SimpleMessageHandler; +import org.eclipse.collections.api.tuple.primitive.LongLongPair; +import org.eclipse.collections.impl.tuple.primitive.PrimitiveTuples; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.concurrent.ArrayBlockingQueue; +import java.util.concurrent.BlockingQueue; import java.util.concurrent.locks.LockSupport; public final class ScalableSecondaryFlowProcessor implements IFlowProcessor { @@ -17,29 +21,19 @@ public final class ScalableSecondaryFlowProcessor implements IFlowProcessor { private final SimpleMessageHandler handler; - private final IFence inboundFence; private final SingleWriterFence releasingFence = new SingleWriterFence(); private final int indexMask; private final long[] buffer; private final int bufferSize; - private long activationOffsetFrom = -1L; - private long activationOffsetAvailable = -1L; + private final BlockingQueue tasks = new ArrayBlockingQueue<>(32); - private long deactivationRequestedOffset = -1L; - private long deactivationActualOffset = -1L; - - private volatile State requestedState = State.SLEEP; - - private State actualState = State.SLEEP; public ScalableSecondaryFlowProcessor(final SimpleMessageHandler handler, - final IFence inboundFence, final RevelatorConfig config) { this.handler = handler; - this.inboundFence = inboundFence; this.indexMask = config.getIndexMask(); this.buffer = config.getBuffer(); this.bufferSize = config.getBufferSize(); @@ -47,131 +41,74 @@ public ScalableSecondaryFlowProcessor(final SimpleMessageHandler handler, @Override public void run() { + try { + while (true) { + final LongLongPair 
task = tasks.take(); - long positionSeq = 0L; - - while (true) { - - final long availableSeq; - if (actualState == State.SLEEP) { - // note: volatile read of the current state (should be ok) - while (requestedState != State.RUNNING) { - // do nothing - LockSupport.parkNanos(7_000); - } - actualState = State.RUNNING; - positionSeq = activationOffsetFrom; // start from proposed sequence - availableSeq = activationOffsetAvailable; // use last known from primary processor (optimization) + long positionSeq = task.getOne(); + final long availableSeq = task.getTwo(); - } else { + while (positionSeq < availableSeq) { - // wait for inboundFence - long availableSeq1; - while ((availableSeq1 = inboundFence.getAcquire(positionSeq)) <= positionSeq) { + final int index = (int) (positionSeq & indexMask); - Thread.yield(); + final long header1 = buffer[index]; - //check requestedState sometimes - if (actualState == State.RUNNING) { - - // expensive volatile read but it is acceptable if end of spike was reached - if (requestedState == State.FINALIZING) { - actualState = State.FINALIZING; - availableSeq1 = deactivationRequestedOffset; - break; - } + if (header1 == 0L) { + // skip until end of the buffer + positionSeq = (positionSeq | indexMask) + 1; + continue; } - } - availableSeq = availableSeq1; - } - + final long correlationId = header1 & 0x00FF_FFFF_FFFF_FFFFL; + final int header2 = (int) (header1 >>> 56); + final byte msgType = (byte) (header2 & 0x1F); - // currentState is running - int queueRefreshCounter = 0; - - - while (positionSeq < availableSeq) { - - final int index = (int) (positionSeq & indexMask); - - final long header1 = buffer[index]; - - if (header1 == 0L) { - // skip until end of the buffer - positionSeq = (positionSeq | indexMask) + 1; - continue; - } - - final long correlationId = header1 & 0x00FF_FFFF_FFFF_FFFFL; - final int header2 = (int) (header1 >>> 56); - final byte msgType = (byte) (header2 & 0x1F); - - if (msgType == Revelator.MSG_TYPE_POISON_PILL) { - 
log.debug("processor shutdown (received msgType={}, publishing positionSeq={}+{})", msgType, positionSeq, Revelator.MSG_HEADER_SIZE); - releasingFence.setRelease(positionSeq + Revelator.MSG_HEADER_SIZE); - handler.onShutdown(); - return; - } - - final long timestamp = buffer[index + 1]; + if (msgType == Revelator.MSG_TYPE_POISON_PILL) { + log.debug("processor shutdown (received msgType={}, publishing positionSeq={}+{})", msgType, positionSeq, Revelator.MSG_HEADER_SIZE); + releasingFence.setRelease(positionSeq + Revelator.MSG_HEADER_SIZE); + handler.onShutdown(); + return; + } - // payload size in longs - final int payloadSize = (int) buffer[index + 2]; - final int indexMsg = index + Revelator.MSG_HEADER_SIZE; - if (indexMsg + payloadSize > bufferSize) { - throw new IllegalStateException("Failed to decode message: headerSize=" + Revelator.MSG_HEADER_SIZE - + " payloadSize=" + payloadSize - + " correlationId=" + correlationId - + " unexpected " + (indexMsg + payloadSize - bufferSize) + " bytes"); - } + final long timestamp = buffer[index + 1]; - try { - handler.handleMessage(buffer, indexMsg, payloadSize, timestamp, positionSeq, correlationId, msgType); - } catch (final Exception ex) { - log.debug("Exception when processing batch", ex); - // TODO call custom handler - } + // payload size in longs + final int payloadSize = (int) buffer[index + 2]; + final int indexMsg = index + Revelator.MSG_HEADER_SIZE; + if (indexMsg + payloadSize > bufferSize) { + throw new IllegalStateException("Failed to decode message: headerSize=" + Revelator.MSG_HEADER_SIZE + + " payloadSize=" + payloadSize + + " correlationId=" + correlationId + + " unexpected " + (indexMsg + payloadSize - bufferSize) + " bytes"); + } - positionSeq += Revelator.MSG_HEADER_SIZE + payloadSize; + try { + handler.handleMessage(buffer, indexMsg, payloadSize, timestamp, positionSeq, correlationId, msgType); + } catch (final Exception ex) { + log.debug("Exception when processing batch", ex); + // TODO call custom 
handler + } - // limit batches size - if (queueRefreshCounter++ > 256) { - break; + positionSeq += Revelator.MSG_HEADER_SIZE + payloadSize; } + releasingFence.setRelease(availableSeq); } - - releasingFence.setRelease(availableSeq); + } catch (InterruptedException ex) { + throw new RuntimeException(ex); } - } - public void activate(long offsetFrom, long offsetAvailable) { - activationOffsetFrom = offsetFrom; - activationOffsetAvailable = offsetAvailable; // just share most recent from main primary processor - requestedState = State.RUNNING; - } - - public void deactivate(long offset) { - deactivationRequestedOffset = offset; - requestedState = State.FINALIZING; - } - - public long getActualDeactivationOffset() { - // TODO volatile write - return deactivationActualOffset; - } - - - public enum State { - SLEEP, - RUNNING, - FINALIZING + public void processRange(long offsetFrom, long offsetTo) { + try { + tasks.put(PrimitiveTuples.pair(offsetFrom, offsetTo)); + } catch (InterruptedException ex) { + throw new RuntimeException(ex); + } } - public SingleWriterFence getReleasingFence() { return releasingFence; } From 441e054700b5c4ec698f680224503c0221eace39 Mon Sep 17 00:00:00 2001 From: Maksim Zheravin Date: Sun, 16 Jan 2022 23:30:02 +0200 Subject: [PATCH 03/15] RAFT: initial implementation --- .../revelator/raft/CmdRaftAppendEntries.java | 60 +++++ .../raft/CmdRaftAppendEntriesResponse.java | 35 +++ .../revelator/raft/CmdRaftVoteRequest.java | 50 ++++ .../revelator/raft/CmdRaftVoteResponse.java | 38 +++ .../core2/revelator/raft/RaftClient.java | 49 ++++ .../core2/revelator/raft/RaftLogEntry.java | 16 ++ .../core2/revelator/raft/RaftMessage.java | 4 + .../core2/revelator/raft/RaftNode.java | 229 ++++++++++++++++++ .../core2/revelator/raft/RpcRequest.java | 13 + .../core2/revelator/raft/RpcResponse.java | 15 ++ .../core2/revelator/raft/RpcService.java | 222 +++++++++++++++++ 11 files changed, 731 insertions(+) create mode 100644 
src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntries.java create mode 100644 src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntriesResponse.java create mode 100644 src/main/java/exchange/core2/revelator/raft/CmdRaftVoteRequest.java create mode 100644 src/main/java/exchange/core2/revelator/raft/CmdRaftVoteResponse.java create mode 100644 src/main/java/exchange/core2/revelator/raft/RaftClient.java create mode 100644 src/main/java/exchange/core2/revelator/raft/RaftLogEntry.java create mode 100644 src/main/java/exchange/core2/revelator/raft/RaftMessage.java create mode 100644 src/main/java/exchange/core2/revelator/raft/RaftNode.java create mode 100644 src/main/java/exchange/core2/revelator/raft/RpcRequest.java create mode 100644 src/main/java/exchange/core2/revelator/raft/RpcResponse.java create mode 100644 src/main/java/exchange/core2/revelator/raft/RpcService.java diff --git a/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntries.java b/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntries.java new file mode 100644 index 0000000..7e7d2da --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntries.java @@ -0,0 +1,60 @@ +package exchange.core2.revelator.raft; + +import java.nio.ByteBuffer; +import java.util.List; + +/** + * Invoked by leader to replicate log entries (5.3); also used as heartbeat (5.2). 
+ */ +public final class CmdRaftAppendEntries implements RpcRequest { + + public final int term; // leader’s term + public final int leaderId; // so follower can redirect clients + + public final long prevLogIndex; // index of log entry immediately preceding new ones + public final int prevLogTerm;// term of prevLogIndex entry + public final List entries; // log entries to store (empty for heartbeat; may send more than one for efficiency) + public final long leaderCommit;// leader’s commitIndex + + public CmdRaftAppendEntries(int term, + int leaderId, + long prevLogIndex, + int prevLogTerm, + List entries, + long leaderCommit) { + + this.term = term; + this.leaderId = leaderId; + this.prevLogIndex = prevLogIndex; + this.prevLogTerm = prevLogTerm; + this.entries = entries; + this.leaderCommit = leaderCommit; + } + + @Override + public int getMessageType() { + return 1; + } + + @Override + public void serialize(ByteBuffer buffer) { + buffer.putInt(term); + buffer.putInt(leaderId); + buffer.putLong(prevLogIndex); + buffer.putInt(prevLogTerm); +// buffer.put + buffer.putLong(leaderCommit); + } + + public static CmdRaftAppendEntries create(ByteBuffer buffer){ + + final int term = buffer.getInt(); + final int leaderId = buffer.getInt(); + final long prevLogIndex = buffer.getLong(); + final int prevLogTerm = buffer.getInt(); + // todo entries + final long leaderCommit = buffer.getLong(); + + return new CmdRaftAppendEntries(term, leaderId, prevLogIndex, prevLogTerm, List.of(), leaderCommit); + } +} diff --git a/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntriesResponse.java b/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntriesResponse.java new file mode 100644 index 0000000..a935432 --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntriesResponse.java @@ -0,0 +1,35 @@ +package exchange.core2.revelator.raft; + +import java.nio.ByteBuffer; + +/** + * Invoked by leader to replicate log entries (5.3); also used as 
heartbeat (5.2). + */ +public final class CmdRaftAppendEntriesResponse implements RpcResponse { + + public final int term; // currentTerm, for leader to update itself + public final boolean success; // true if follower contained entry matching prevLogIndex and prevLogTerm + + public CmdRaftAppendEntriesResponse(int term, + boolean success) { + this.term = term; + this.success = success; + } + + @Override + public int getMessageType() { + return RpcResponse.RESPONSE_APPEND_ENTRIES; + } + + @Override + public void serialize(ByteBuffer buffer) { + buffer.putInt(term); + buffer.put(success ? (byte) 1 : (byte) 0); + } + + public static CmdRaftAppendEntriesResponse create(ByteBuffer bb){ + final int term = bb.getInt(); + final boolean success = bb.get() == 1; + return new CmdRaftAppendEntriesResponse(term, success); + } +} diff --git a/src/main/java/exchange/core2/revelator/raft/CmdRaftVoteRequest.java b/src/main/java/exchange/core2/revelator/raft/CmdRaftVoteRequest.java new file mode 100644 index 0000000..3944ca1 --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/CmdRaftVoteRequest.java @@ -0,0 +1,50 @@ +package exchange.core2.revelator.raft; + +import java.nio.ByteBuffer; + +/** + * Invoked by leader to replicate log entries (5.3); also used as heartbeat (5.2). 
+ */ +public final class CmdRaftVoteRequest implements RpcRequest { + + public final int term; // candidate's term + public final int candidateId; // candidate requesting vote + + public final long lastLogIndex; // index of candidate’s last log entry (5.4) + public final int lastLogTerm; // term of candidate’s last log entry (5.4) + + public CmdRaftVoteRequest(int term, + int candidateId, + long lastLogIndex, + int lastLogTerm) { + + this.term = term; + this.candidateId = candidateId; + this.lastLogIndex = lastLogIndex; + this.lastLogTerm = lastLogTerm; + } + + @Override + public int getMessageType() { + return 2; + } + + @Override + public void serialize(ByteBuffer buffer) { + buffer.putInt(term); + buffer.putInt(candidateId); + buffer.putLong(lastLogIndex); + buffer.putInt(lastLogTerm); + } + + + public static CmdRaftVoteRequest create(ByteBuffer buffer) { + + final int term = buffer.getInt(); + final int leaderId = buffer.getInt(); + final long prevLogIndex = buffer.getLong(); + final int prevLogTerm = buffer.getInt(); + + return new CmdRaftVoteRequest(term, leaderId, prevLogIndex, prevLogTerm); + } +} diff --git a/src/main/java/exchange/core2/revelator/raft/CmdRaftVoteResponse.java b/src/main/java/exchange/core2/revelator/raft/CmdRaftVoteResponse.java new file mode 100644 index 0000000..d742ad1 --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/CmdRaftVoteResponse.java @@ -0,0 +1,38 @@ +package exchange.core2.revelator.raft; + +import java.nio.ByteBuffer; + +/** + * Invoked by candidates to gather votes (5.2). 
+ */ +public final class CmdRaftVoteResponse implements RpcResponse { + + public final int term; // currentTerm, for candidate to update itself + public final boolean voteGranted; // true means that candidate received vote + + public CmdRaftVoteResponse(int term, + boolean voteGranted) { + this.term = term; + this.voteGranted = voteGranted; + } + + @Override + public int getMessageType() { + return RpcResponse.RESPONSE_VOTE; + } + + @Override + public void serialize(ByteBuffer buffer) { + buffer.putInt(term); + buffer.put(voteGranted ? (byte) 1 : (byte) 0); + } + + public static CmdRaftVoteResponse create(ByteBuffer buffer) { + + final int term = buffer.getInt(); + final boolean voteGranted = buffer.get() == 1; + + return new CmdRaftVoteResponse(term, voteGranted); + } + +} diff --git a/src/main/java/exchange/core2/revelator/raft/RaftClient.java b/src/main/java/exchange/core2/revelator/raft/RaftClient.java new file mode 100644 index 0000000..d986fad --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/RaftClient.java @@ -0,0 +1,49 @@ +package exchange.core2.revelator.raft; + + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.net.*; + +public class RaftClient { + + private static final Logger log = LoggerFactory.getLogger(RaftClient.class); + + + public static void main(String[] args) throws IOException, InterruptedException { + final RaftClient raftClient = new RaftClient(); + + while (true) { + raftClient.sendEcho("TEST123"); + Thread.sleep(1000); + } + } + + private DatagramSocket socket; + private InetAddress address; + + private byte[] buf; + + public RaftClient() throws SocketException, UnknownHostException { + socket = new DatagramSocket(); + address = InetAddress.getByName("localhost"); + } + + public String sendEcho(String msg) throws IOException { + buf = msg.getBytes(); + DatagramPacket packet = new DatagramPacket(buf, buf.length, address, 3778); + log.debug(">> {}", msg); + 
socket.send(packet); + packet = new DatagramPacket(buf, buf.length); + socket.receive(packet); + String received = new String(packet.getData(), 0, packet.getLength()); + log.debug("<< {}", received); + return received; + } + + public void close() { + socket.close(); + } +} \ No newline at end of file diff --git a/src/main/java/exchange/core2/revelator/raft/RaftLogEntry.java b/src/main/java/exchange/core2/revelator/raft/RaftLogEntry.java new file mode 100644 index 0000000..ef97bce --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/RaftLogEntry.java @@ -0,0 +1,16 @@ +package exchange.core2.revelator.raft; + +/** + * each entry contains command for state machine, and term when entry was received by leader + */ +public class RaftLogEntry { + + // term when entry was received by leader + public final long term; + public final String cmd; + + public RaftLogEntry(long term, String cmd) { + this.term = term; + this.cmd = cmd; + } +} diff --git a/src/main/java/exchange/core2/revelator/raft/RaftMessage.java b/src/main/java/exchange/core2/revelator/raft/RaftMessage.java new file mode 100644 index 0000000..f92a5b6 --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/RaftMessage.java @@ -0,0 +1,4 @@ +package exchange.core2.revelator.raft; + +public interface RaftMessage { +} diff --git a/src/main/java/exchange/core2/revelator/raft/RaftNode.java b/src/main/java/exchange/core2/revelator/raft/RaftNode.java new file mode 100644 index 0000000..6e2d149 --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/RaftNode.java @@ -0,0 +1,229 @@ +package exchange.core2.revelator.raft; + + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.net.DatagramPacket; +import java.net.DatagramSocket; +import java.net.InetAddress; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import java.util.Timer; +import java.util.TimerTask; +import 
java.util.concurrent.CompletableFuture; + +public class RaftNode { + + private static final Logger logger = LoggerFactory.getLogger(RaftNode.class); + + public static final int HEARTBEAT_TIMEOUT_MS = 2000; + + public static final int CLUSTER_SIZE = 3; + public static final int VOTES_REQUIRED = 2; + + + /* **** Persistent state on all servers: (Updated on stable storage before responding to RPCs) */ + + // latest term server has seen (initialized to 0 on first boot, increases monotonically) + private int currentTerm = 0; + + // candidateId that received vote in current term (or -1 if none) + private int votedFor = -1; + + // log entries; each entry contains command for state machine, and term when entry was received by leader (first index is 1) + private final List log = new ArrayList<>(); // TODO change to persistent storage with long-index + + + /* **** Volatile state on all servers: */ + + // index of the highest log entry known to be committed (initialized to 0, increases monotonically) + private long commitIndex = 0; + + // index of the highest log entry applied to state machine (initialized to 0, increases monotonically) + private long lastApplied = 0; + + private RaftNodeState currentState = RaftNodeState.FOLLOWER; + + /* **** Volatile state on leaders: (Reinitialized after election) */ + + // for each server, index of the next log entry to send to that server (initialized to leader last log index + 1) + private final long[] nextIndex = new long[3]; + + // for each server, index of the highest log entry known to be replicated on server (initialized to 0, increases monotonically) + private final long[] matchIndex = new long[3]; + + /* ********************************************* */ + + private final Timer timer = new Timer("AppendTimer"); + + + public static void main(String[] args) { + new RaftNode().run(3778); + } + + public RaftNode(int thisNodeId) { + logger.info("Starting node {} as follower...", thisNodeId); + resetFollowerAppendTimer(); + } + + public 
void run(int port) { + try (final DatagramSocket serverSocket = new DatagramSocket(port)) { + final byte[] receiveData = new byte[8]; + String sendString = "polo"; + final byte[] sendData = sendString.getBytes(StandardCharsets.UTF_8); + + logger.info("Listening on udp:{}:{}", InetAddress.getLocalHost().getHostAddress(), port); + final DatagramPacket receivePacket = new DatagramPacket(receiveData, receiveData.length); + + while (true) { + serverSocket.receive(receivePacket); + String sentence = new String(receivePacket.getData(), 0, + receivePacket.getLength()); + logger.debug("RECEIVED: " + sentence); + + + DatagramPacket sendPacket = new DatagramPacket( + sendData, + sendData.length, + receivePacket.getAddress(), + receivePacket.getPort()); + + serverSocket.send(sendPacket); + } + } catch (IOException ex) { + System.out.println(ex); + } + } + + + + + + /** + * Receiver implementation:

+ * 1. Reply false if term < currentTerm (5.1)

+ * 2. Reply false if log doesn't contain an entry at prevLogIndex whose term matches prevLogTerm (5.3)

+ * 3. If an existing entry conflicts with a new one (same index but different terms), delete the existing entry and all that follow it (5.3)

+ * 4. Append any new entries not already in the log

+ * 5. If leaderCommit > commitIndex, set commitIndex = min(leaderCommit, index of last new entry)

+ */ + public synchronized CmdRaftAppendEntriesResponse appendEntries(CmdRaftAppendEntries cmd) { + + // 1. Reply false if term < currentTerm + // If the term in the RPC is smaller than the candidate’s current term, then the candidate rejects the RPC and continues in candidate state. + if (cmd.term < currentTerm) { + logger.debug("term < currentTerm"); + return new CmdRaftAppendEntriesResponse(currentTerm, false); + } + + // If the leader’s term (included in its RPC) is at least as large as the candidate’s current term, then the candidate + // recognizes the leader as legitimate and returns to follower state. + checkTerm(cmd.term); + + // 2. Reply false if log doesn't contain an entry at prevLogIndex whose term matches prevLogTerm + if (cmd.prevLogIndex >= log.size() || log.get((int) cmd.prevLogIndex).term != cmd.prevLogTerm) { + logger.debug("log doesn't contain an entry at prevLogIndex whose term matches prevLogTerm"); + return new CmdRaftAppendEntriesResponse(currentTerm, false); + } + + // TODO 3. If an existing entry conflicts with a new one (same index but different terms), delete the existing entry and all that follow it (5.3) + + // Append any new entries not already in the log + + resetFollowerAppendTimer(); + + // If leaderCommit > commitIndex, set commitIndex = min(leaderCommit, index of last new entry) + if (cmd.leaderCommit > commitIndex) { + commitIndex = Math.min(cmd.leaderCommit, log.size() - 1); + } + + return new CmdRaftAppendEntriesResponse(currentTerm, true); + } + + /** + * Receiver implementation: + * 1. Reply false if term < currentTerm (5.1) + * 2. 
If votedFor is null or candidateId, and candidate’s log is at least as up-to-date as receiver’s log, grant vote (5.2, 5.4) + */ + public synchronized CmdRaftVoteResponse handleVoteRequest(CmdRaftVoteRequest request) { + + if (request.term < currentTerm) { + return new CmdRaftVoteResponse(currentTerm, false); + } + + checkTerm(request.term); + + + final boolean notVotedYet = votedFor == -1 || votedFor == request.candidateId; + final boolean logIsUpToDate = request.lastLogIndex >= commitIndex; // TODO commitIndex or lastApplied? + if (notVotedYet && logIsUpToDate) { + // vote for candidate + votedFor = request.candidateId; + return new CmdRaftVoteResponse(currentTerm, true); + } else { + // reject voting + return new CmdRaftVoteResponse(currentTerm, false); + } + + } + + + + private void checkTerm(int term) { + // All servers: If RPC request or response contains term T > currentTerm: set currentTerm = T, convert to follower (5.1) + if (term > currentTerm) { + logger.info("Newer term={} received from new leader, switching to FOLLOWER", term); + currentTerm = term; + currentState = RaftNodeState.FOLLOWER; + } + } + + + private void broadcast(RaftMessage message) { + + logger.debug("Sending: {}", message); + + } + + private void resetFollowerAppendTimer(){ + + logger.debug("reset append timer"); + + timer.cancel(); + + final TimerTask task = new TimerTask() { + @Override + public void run() { + appendTimeout(); + } + }; + + timer.schedule(task, HEARTBEAT_TIMEOUT_MS); + } + + private synchronized void appendTimeout(){ + + logger.info("heartbeat timeout - switching to CANDIDATE"); + currentState = RaftNodeState.CANDIDATE; + + // On conversion to candidate, start election: + // - Increment currentTerm + // - Vote for self + // - Reset election timer + // - Send RequestVote RPCs to all other servers + currentTerm ++; + + + + } + + + public enum RaftNodeState { + FOLLOWER, + CANDIDATE, + LEADER + } +} \ No newline at end of file diff --git 
a/src/main/java/exchange/core2/revelator/raft/RpcRequest.java b/src/main/java/exchange/core2/revelator/raft/RpcRequest.java new file mode 100644 index 0000000..3b003da --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/RpcRequest.java @@ -0,0 +1,13 @@ +package exchange.core2.revelator.raft; + +import java.nio.ByteBuffer; + +public interface RpcRequest { + + int getMessageType(); + + void serialize(ByteBuffer buffer); + + int REQUEST_APPEND_ENTRIES = 1; + int REQUEST_VOTE = 2; +} diff --git a/src/main/java/exchange/core2/revelator/raft/RpcResponse.java b/src/main/java/exchange/core2/revelator/raft/RpcResponse.java new file mode 100644 index 0000000..71f944a --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/RpcResponse.java @@ -0,0 +1,15 @@ +package exchange.core2.revelator.raft; + +import java.nio.ByteBuffer; + +public interface RpcResponse { + + + int getMessageType(); + + void serialize(ByteBuffer buffer); + + int RESPONSE_APPEND_ENTRIES = -1; + int RESPONSE_VOTE = -2; + +} diff --git a/src/main/java/exchange/core2/revelator/raft/RpcService.java b/src/main/java/exchange/core2/revelator/raft/RpcService.java new file mode 100644 index 0000000..65f6f62 --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/RpcService.java @@ -0,0 +1,222 @@ +package exchange.core2.revelator.raft; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.net.DatagramPacket; +import java.net.DatagramSocket; +import java.net.InetAddress; +import java.nio.ByteBuffer; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.AtomicLong; +import java.util.function.BiFunction; + +public class RpcService implements AutoCloseable { + + private static final Logger logger = LoggerFactory.getLogger(RaftNode.class); + + private final AtomicLong correlationIdCounter = new AtomicLong(0L); + 
private final Map> futureMap = new ConcurrentHashMap<>(); + private final Map socketMap; + private final int serverPort; + private final int serverNodeId; + private final BiFunction handler; + + private volatile boolean active = true; + + public RpcService(Map remoteNodes, + BiFunction handler, + int serverNodeId) { + + final Map socketMap = new HashMap<>(); + remoteNodes.forEach((id, address) -> { + + try { + final String[] split = address.split(":"); + + final DatagramSocket socket = new DatagramSocket(); + final InetAddress host = InetAddress.getByName(split[0]); + final int port = Integer.parseInt(split[1]); + + RemoteUdpSocket remoteUdpSocket = new RemoteUdpSocket(socket, host, port); + + socketMap.put(id, remoteUdpSocket); + + } catch (Exception ex) { + throw new RuntimeException(ex); + } + }); + + + this.socketMap = socketMap; + this.handler = handler; + this.serverPort = socketMap.get(serverNodeId).port; + this.serverNodeId = serverNodeId; + + Thread t = new Thread(this::run); + t.setDaemon(true); + t.setName("ListenerUDP"); + t.start(); + + } + + + public void run() { + + try (final DatagramSocket serverSocket = new DatagramSocket(serverPort)) { + + logger.info("Listening on UDP {}:{}", InetAddress.getLocalHost().getHostAddress(), serverPort); + + final byte[] receiveData = new byte[256]; + + final DatagramPacket receivePacket = new DatagramPacket(receiveData, receiveData.length); + + while (active) { + + serverSocket.receive(receivePacket); + // String sentence = new String(receivePacket.getData(), 0, receivePacket.getLength()); +// logger.debug("RECEIVED: " + sentence); + + + final ByteBuffer bb = ByteBuffer.wrap(receivePacket.getData(), 0, receivePacket.getLength()); + + final int nodeId = bb.getInt(); + final int messageType = bb.getInt(); + final long correlationId = bb.getLong(); + + if (messageType < 0) { + processResponse(receivePacket, bb, messageType, correlationId); + + } else { + processRequest(receivePacket, bb, nodeId, messageType, 
correlationId); + } + } + + logger.info("UDP server shutdown"); + + } catch (final IOException ex) { + logger.error("Error in service thread", ex); + throw new RuntimeException(ex); + } + + } + + private void processRequest(DatagramPacket receivePacket, ByteBuffer bb, int nodeId, int messageType, long correlationId) { + + if (messageType == RpcRequest.REQUEST_APPEND_ENTRIES) { + + final CmdRaftAppendEntries request = CmdRaftAppendEntries.create(bb); + final RpcResponse response = handler.apply(nodeId, request); + sendResponse(nodeId, correlationId, response); + + } else if (messageType == RpcRequest.REQUEST_VOTE) { + + final CmdRaftVoteRequest request = CmdRaftVoteRequest.create(bb); + final RpcResponse response = handler.apply(nodeId, request); + sendResponse(nodeId, correlationId, response); + + } else { + logger.warn("Unsupported response type={} from {} correlationId={}", + messageType, receivePacket.getAddress().getHostAddress(), correlationId); + } + } + + private void processResponse(DatagramPacket receivePacket, ByteBuffer bb, int messageType, long correlationId) { + final CompletableFuture future = futureMap.remove(correlationId); + + if (future != null) { + + if (messageType == RpcResponse.RESPONSE_APPEND_ENTRIES) { + + future.complete(CmdRaftAppendEntriesResponse.create(bb)); + + } else if (messageType == RpcResponse.RESPONSE_VOTE) { + + future.complete(CmdRaftVoteResponse.create(bb)); + + } else { + logger.warn("Unsupported response type={} from {} correlationId={}", + messageType, receivePacket.getAddress().getHostAddress(), correlationId); + } + + + } else { + logger.warn("Unexpected (or duplicate) response from {} type={} correlationId={}", + receivePacket.getAddress().getHostAddress(), messageType, correlationId); + } + } + + private void sendResponse(int callerNodeId, long correlationId, RpcResponse response) { + final byte[] array = new byte[64]; + ByteBuffer bb = ByteBuffer.wrap(array); + + bb.putInt(serverNodeId); + 
bb.putInt(response.getMessageType()); + bb.putLong(correlationId); + response.serialize(bb); + + send(callerNodeId, array, bb.position()); + } + + + public CompletableFuture callRpcSync(RpcRequest request, int nodeId) { + + final long correlationId = correlationIdCounter.incrementAndGet(); + + CompletableFuture future = new CompletableFuture<>(); + futureMap.put(correlationId, future); + + final byte[] array = new byte[64]; + ByteBuffer bb = ByteBuffer.wrap(array); + + bb.putInt(nodeId); + bb.putInt(request.getMessageType()); + bb.putLong(correlationId); + + request.serialize(bb); + + send(nodeId, array, bb.position()); + + return future; + } + + + private void send(int nodeId, byte[] data, int length) { + + final RemoteUdpSocket remoteUdpSocket = socketMap.get(nodeId); + final DatagramPacket packet = new DatagramPacket(data, length, remoteUdpSocket.address, remoteUdpSocket.port); + + try { + remoteUdpSocket.socket.send(packet); + } catch (IOException ex) { + throw new RuntimeException(ex); + } + } + + + @Override + public void close() throws Exception { + + active = false; + } + + + public static final class RemoteUdpSocket { + + private final DatagramSocket socket; + private final InetAddress address; + private final int port; + + public RemoteUdpSocket(DatagramSocket socket, InetAddress address, int port) { + this.socket = socket; + this.address = address; + this.port = port; + } + } + + +} From c6132bf9f883ccab2f03dc3422420af2adcf969e Mon Sep 17 00:00:00 2001 From: Maksim Zheravin Date: Thu, 20 Jan 2022 00:07:54 +0200 Subject: [PATCH 04/15] RAFT: bugfixes for voting procedure --- .../revelator/raft/CmdRaftAppendEntries.java | 12 + .../raft/CmdRaftAppendEntriesResponse.java | 8 + .../revelator/raft/CmdRaftVoteRequest.java | 10 + .../revelator/raft/CmdRaftVoteResponse.java | 7 + .../core2/revelator/raft/RaftNode.java | 277 +++++++++++++++--- .../core2/revelator/raft/RpcService.java | 58 ++-- 6 files changed, 301 insertions(+), 71 deletions(-) diff --git 
a/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntries.java b/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntries.java index 7e7d2da..98db4c7 100644 --- a/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntries.java +++ b/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntries.java @@ -57,4 +57,16 @@ public static CmdRaftAppendEntries create(ByteBuffer buffer){ return new CmdRaftAppendEntries(term, leaderId, prevLogIndex, prevLogTerm, List.of(), leaderCommit); } + + @Override + public String toString() { + return "CmdRaftAppendEntries{" + + "term=" + term + + ", leaderId=" + leaderId + + ", prevLogIndex=" + prevLogIndex + + ", prevLogTerm=" + prevLogTerm + + ", entries=" + entries + + ", leaderCommit=" + leaderCommit + + '}'; + } } diff --git a/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntriesResponse.java b/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntriesResponse.java index a935432..aa3a73b 100644 --- a/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntriesResponse.java +++ b/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntriesResponse.java @@ -32,4 +32,12 @@ public static CmdRaftAppendEntriesResponse create(ByteBuffer bb){ final boolean success = bb.get() == 1; return new CmdRaftAppendEntriesResponse(term, success); } + + @Override + public String toString() { + return "CmdRaftAppendEntriesResponse{" + + "term=" + term + + ", success=" + success + + '}'; + } } diff --git a/src/main/java/exchange/core2/revelator/raft/CmdRaftVoteRequest.java b/src/main/java/exchange/core2/revelator/raft/CmdRaftVoteRequest.java index 3944ca1..a0aca60 100644 --- a/src/main/java/exchange/core2/revelator/raft/CmdRaftVoteRequest.java +++ b/src/main/java/exchange/core2/revelator/raft/CmdRaftVoteRequest.java @@ -47,4 +47,14 @@ public static CmdRaftVoteRequest create(ByteBuffer buffer) { return new CmdRaftVoteRequest(term, leaderId, prevLogIndex, prevLogTerm); } + + @Override + public String 
toString() { + return "CmdRaftVoteRequest{" + + "term=" + term + + ", candidateId=" + candidateId + + ", lastLogIndex=" + lastLogIndex + + ", lastLogTerm=" + lastLogTerm + + '}'; + } } diff --git a/src/main/java/exchange/core2/revelator/raft/CmdRaftVoteResponse.java b/src/main/java/exchange/core2/revelator/raft/CmdRaftVoteResponse.java index d742ad1..0df063a 100644 --- a/src/main/java/exchange/core2/revelator/raft/CmdRaftVoteResponse.java +++ b/src/main/java/exchange/core2/revelator/raft/CmdRaftVoteResponse.java @@ -35,4 +35,11 @@ public static CmdRaftVoteResponse create(ByteBuffer buffer) { return new CmdRaftVoteResponse(term, voteGranted); } + @Override + public String toString() { + return "CmdRaftVoteResponse{" + + "term=" + term + + ", voteGranted=" + voteGranted + + '}'; + } } diff --git a/src/main/java/exchange/core2/revelator/raft/RaftNode.java b/src/main/java/exchange/core2/revelator/raft/RaftNode.java index 6e2d149..c5ea42b 100644 --- a/src/main/java/exchange/core2/revelator/raft/RaftNode.java +++ b/src/main/java/exchange/core2/revelator/raft/RaftNode.java @@ -4,22 +4,18 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; -import java.net.DatagramPacket; -import java.net.DatagramSocket; -import java.net.InetAddress; -import java.nio.charset.StandardCharsets; -import java.util.ArrayList; -import java.util.List; -import java.util.Timer; -import java.util.TimerTask; -import java.util.concurrent.CompletableFuture; +import java.util.*; +import java.util.concurrent.*; +import java.util.function.BiConsumer; +import java.util.function.BiFunction; public class RaftNode { private static final Logger logger = LoggerFactory.getLogger(RaftNode.class); - public static final int HEARTBEAT_TIMEOUT_MS = 2000; + public static final int HEARTBEAT_TIMEOUT_MS = 2000 + (int) (Math.random() * 500); + public static final int HEARTBEAT_LEADER_RATE_MS = 1000; + public static final int ELECTION_TIMEOUT_MS = 3000; public static final int 
CLUSTER_SIZE = 3; public static final int VOTES_REQUIRED = 2; @@ -57,49 +53,132 @@ public class RaftNode { /* ********************************************* */ - private final Timer timer = new Timer("AppendTimer"); + private final int currentNodeId; + private final int[] otherNodes; + private final RpcService rpcService; + + private Timer appendTimer; +// private Timer electionTimer; + + private ScheduledExecutorService heartbeatLeaderExecutor; public static void main(String[] args) { - new RaftNode().run(3778); + + final int thisNodeId = Integer.parseInt(args[0]); + + new RaftNode(thisNodeId); } public RaftNode(int thisNodeId) { - logger.info("Starting node {} as follower...", thisNodeId); - resetFollowerAppendTimer(); - } - public void run(int port) { - try (final DatagramSocket serverSocket = new DatagramSocket(port)) { - final byte[] receiveData = new byte[8]; - String sendString = "polo"; - final byte[] sendData = sendString.getBytes(StandardCharsets.UTF_8); + // localhost:3778, localhost:3779, localhost:3780 + final Map remoteNodes = Map.of( + 0, "localhost:3778", + 1, "localhost:3779", + 2, "localhost:3780"); + + this.currentNodeId = thisNodeId; + + this.otherNodes = remoteNodes.keySet().stream().mapToInt(x -> x).filter(nodeId -> nodeId != thisNodeId).toArray(); + + + final BiFunction handler = (fromNodeId, req) -> { + logger.debug("INCOMING REQ {} >>> {}", fromNodeId, req); + if (req instanceof CmdRaftVoteRequest) { + synchronized (this) { + logger.debug("votedFor={}", votedFor); + if (votedFor == -1) { + logger.debug("VOTE GRANTED for {}", fromNodeId); + votedFor = fromNodeId; + return new CmdRaftVoteResponse(currentTerm, true); + } else { + return new CmdRaftVoteResponse(currentTerm, false); + } + } + } + if (req instanceof CmdRaftAppendEntries) { + CmdRaftAppendEntries appendEntriesCmd = (CmdRaftAppendEntries) req; + + synchronized (this) { + + if (currentState == RaftNodeState.CANDIDATE) { + /* While waiting for votes, a candidate may receive an 
AppendEntries RPC from another server claiming to be leader. + If the leader’s term (included in its RPC) is at least as large as the candidate’s current term, + then the candidate recognizes the leader as legitimate and returns to follower state. + If the term in the RPC is smaller than the candidate’s current term, + then the candidate rejects the RPC and continues in candidate state. */ + if (appendEntriesCmd.term < currentTerm) { + logger.debug("Ignoring leader with older term {} (current={}", appendEntriesCmd.term, currentTerm); + } else { + logger.debug("Stop being candidate - switching to follower"); + currentState = RaftNodeState.FOLLOWER; + votedFor = -1; + resetFollowerAppendTimer(); + } + + } else { + + if (appendEntriesCmd.term < currentTerm) { + // TODO reply that leader is sending bs + } + if (appendEntriesCmd.term >= currentTerm) { + if (appendEntriesCmd.term > currentTerm) { + logger.info("Update term {}->{}", currentTerm, appendEntriesCmd.term); + currentTerm = appendEntriesCmd.term; + } + + resetFollowerAppendTimer(); + } + } + } + } - logger.info("Listening on udp:{}:{}", InetAddress.getLocalHost().getHostAddress(), port); - final DatagramPacket receivePacket = new DatagramPacket(receiveData, receiveData.length); + return null; + }; - while (true) { - serverSocket.receive(receivePacket); - String sentence = new String(receivePacket.getData(), 0, - receivePacket.getLength()); - logger.debug("RECEIVED: " + sentence); + final BiConsumer handlerResponses = (fromNodeId, resp) -> { + logger.debug("INCOMING RESP {} >>> {}", fromNodeId, resp); + }; + // todo remove from constructor + rpcService = new RpcService(remoteNodes, handler, handlerResponses, thisNodeId); - DatagramPacket sendPacket = new DatagramPacket( - sendData, - sendData.length, - receivePacket.getAddress(), - receivePacket.getPort()); + logger.info("HEARTBEAT_TIMEOUT_MS={}", HEARTBEAT_TIMEOUT_MS); + logger.info("ELECTION_TIMEOUT_MS={}", ELECTION_TIMEOUT_MS); - 
serverSocket.send(sendPacket); - } - } catch (IOException ex) { - System.out.println(ex); - } + logger.info("Starting node {} as follower...", thisNodeId); + resetFollowerAppendTimer(); } - - +// public void run(int port) { +// try (final DatagramSocket serverSocket = new DatagramSocket(port)) { +// final byte[] receiveData = new byte[8]; +// String sendString = "polo"; +// final byte[] sendData = sendString.getBytes(StandardCharsets.UTF_8); +// +// logger.info("Listening on udp:{}:{}", InetAddress.getLocalHost().getHostAddress(), port); +// final DatagramPacket receivePacket = new DatagramPacket(receiveData, receiveData.length); +// +// while (true) { +// serverSocket.receive(receivePacket); +// String sentence = new String(receivePacket.getData(), 0, +// receivePacket.getLength()); +// logger.debug("RECEIVED: " + sentence); +// +// +// DatagramPacket sendPacket = new DatagramPacket( +// sendData, +// sendData.length, +// receivePacket.getAddress(), +// receivePacket.getPort()); +// +// serverSocket.send(sendPacket); +// } +// } catch (IOException ex) { +// System.out.println(ex); +// } +// } /** @@ -171,12 +250,16 @@ public synchronized CmdRaftVoteResponse handleVoteRequest(CmdRaftVoteRequest req } - private void checkTerm(int term) { // All servers: If RPC request or response contains term T > currentTerm: set currentTerm = T, convert to follower (5.1) if (term > currentTerm) { logger.info("Newer term={} received from new leader, switching to FOLLOWER", term); currentTerm = term; + + if (currentState == RaftNodeState.LEADER) { + heartbeatLeaderExecutor.shutdown(); + } + currentState = RaftNodeState.FOLLOWER; } } @@ -188,12 +271,13 @@ private void broadcast(RaftMessage message) { } - private void resetFollowerAppendTimer(){ + private synchronized void resetFollowerAppendTimer() { logger.debug("reset append timer"); - timer.cancel(); - + if (appendTimer != null) { + appendTimer.cancel(); + } final TimerTask task = new TimerTask() { @Override public void run() { @@ 
-201,12 +285,26 @@ public void run() { } }; - timer.schedule(task, HEARTBEAT_TIMEOUT_MS); + appendTimer = new Timer(); + appendTimer.schedule(task, HEARTBEAT_TIMEOUT_MS); } - private synchronized void appendTimeout(){ + /** + * To begin an election, a follower increments its current + * term and transitions to candidate state. It then votes for + * itself and issues RequestVote RPCs in parallel to each of + * the other servers in the cluster. A candidate continues in + * this state until one of three things happens: + * (a) it wins the election, + * (b) another server establishes itself as leader, or + * (c) a period of time goes by with no winner. + */ + private synchronized void appendTimeout() { + + if (currentState == RaftNodeState.LEADER) { + heartbeatLeaderExecutor.shutdown(); + } - logger.info("heartbeat timeout - switching to CANDIDATE"); currentState = RaftNodeState.CANDIDATE; // On conversion to candidate, start election: @@ -214,12 +312,95 @@ private synchronized void appendTimeout(){ // - Vote for self // - Reset election timer // - Send RequestVote RPCs to all other servers - currentTerm ++; + currentTerm++; + + logger.info("heartbeat timeout - switching to CANDIDATE, term={}", currentTerm); + + votedFor = currentNodeId; + + final TimerTask task = new TimerTask() { + @Override + public void run() { + electionTimeout(); + } + }; + + final CmdRaftVoteRequest voteReq = new CmdRaftVoteRequest( + currentTerm, + currentNodeId, + lastApplied, + 429384628); // TODO extract from log! 
+ + try { + + final CompletableFuture future0 = rpcService.callRpcSync(voteReq, otherNodes[0]); + final CompletableFuture future1 = rpcService.callRpcSync(voteReq, otherNodes[1]); + + final CompletableFuture objectCompletableFuture = CompletableFuture.anyOf(future0, future1); + final CmdRaftVoteResponse response = (CmdRaftVoteResponse) objectCompletableFuture.get(ELECTION_TIMEOUT_MS, TimeUnit.MILLISECONDS); + + /* + A candidate wins an election if it receives votes from + a majority of the servers in the full cluster for the same + term. Each server will vote for at most one candidate in a + given term, on a first-come-first-served basis + (note: Section 5.4 adds an additional restriction on votes) + */ + + if (response.voteGranted) { + logger.info("One vote, becoming a LEADER!"); + currentState = RaftNodeState.LEADER; + + heartbeatLeaderExecutor = Executors.newSingleThreadScheduledExecutor(); + heartbeatLeaderExecutor.scheduleAtFixedRate( + () -> { + logger.info("Sending heartbeats"); + final CmdRaftAppendEntries heartBeatReq = new CmdRaftAppendEntries( + currentTerm, + currentNodeId, + lastApplied, + 429384628, + List.of(), + commitIndex); + rpcService.callRpcSync(heartBeatReq, otherNodes[0]); + rpcService.callRpcSync(heartBeatReq, otherNodes[1]); + }, + 0, + HEARTBEAT_LEADER_RATE_MS, + TimeUnit.MILLISECONDS); + + // TODO init + // for each server, index of the next log entry to send to that server (initialized to leader last log index + 1) +// private final long[] nextIndex = new long[3]; + + // for each server, index of the highest log entry known to be replicated on server (initialized to 0, increases monotonically) +// private final long[] matchIndex = new long[3]; + + } else { + logger.info("Vote not granted"); + // TODO maybe second one granted his vote ? 
+ } + } catch (TimeoutException ex) { + logger.warn("Vote timeout"); + } catch (Exception ex) { + logger.warn("Exception while collecting votes", ex); + } + // did not win election + if (currentState == RaftNodeState.CANDIDATE) { + currentState = RaftNodeState.FOLLOWER; + votedFor = -1; + resetFollowerAppendTimer(); + } +// electionTimer = new Timer(); +// electionTimer.schedule(task, ELECTION_TIMEOUT_MS); } + private synchronized void electionTimeout() { + logger.info("election timeout - switching to CANDIDATE"); + } public enum RaftNodeState { FOLLOWER, diff --git a/src/main/java/exchange/core2/revelator/raft/RpcService.java b/src/main/java/exchange/core2/revelator/raft/RpcService.java index 65f6f62..f30ec69 100644 --- a/src/main/java/exchange/core2/revelator/raft/RpcService.java +++ b/src/main/java/exchange/core2/revelator/raft/RpcService.java @@ -1,5 +1,6 @@ package exchange.core2.revelator.raft; +import org.agrona.PrintBufferUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -13,6 +14,7 @@ import java.util.concurrent.CompletableFuture; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicLong; +import java.util.function.BiConsumer; import java.util.function.BiFunction; public class RpcService implements AutoCloseable { @@ -25,11 +27,13 @@ public class RpcService implements AutoCloseable { private final int serverPort; private final int serverNodeId; private final BiFunction handler; + private final BiConsumer handlerResponses; private volatile boolean active = true; public RpcService(Map remoteNodes, BiFunction handler, + BiConsumer handlerResponses, int serverNodeId) { final Map socketMap = new HashMap<>(); @@ -54,6 +58,7 @@ public RpcService(Map remoteNodes, this.socketMap = socketMap; this.handler = handler; + this.handlerResponses = handlerResponses; this.serverPort = socketMap.get(serverNodeId).port; this.serverNodeId = serverNodeId; @@ -69,9 +74,9 @@ public void run() { try (final DatagramSocket 
serverSocket = new DatagramSocket(serverPort)) { - logger.info("Listening on UDP {}:{}", InetAddress.getLocalHost().getHostAddress(), serverPort); + logger.info("Listening at UDP {}:{}", InetAddress.getLocalHost().getHostAddress(), serverPort); - final byte[] receiveData = new byte[256]; + final byte[] receiveData = new byte[256]; // TODO set proper value final DatagramPacket receivePacket = new DatagramPacket(receiveData, receiveData.length); @@ -79,7 +84,6 @@ public void run() { serverSocket.receive(receivePacket); // String sentence = new String(receivePacket.getData(), 0, receivePacket.getLength()); -// logger.debug("RECEIVED: " + sentence); final ByteBuffer bb = ByteBuffer.wrap(receivePacket.getData(), 0, receivePacket.getLength()); @@ -88,8 +92,11 @@ public void run() { final int messageType = bb.getInt(); final long correlationId = bb.getLong(); + logger.debug("RECEIVED from {} mt={}: {}", nodeId, messageType, PrintBufferUtil.hexDump(receivePacket.getData(), 0, receivePacket.getLength())); + + if (messageType < 0) { - processResponse(receivePacket, bb, messageType, correlationId); + processResponse(receivePacket, bb, nodeId, messageType, correlationId); } else { processRequest(receivePacket, bb, nodeId, messageType, correlationId); @@ -111,13 +118,17 @@ private void processRequest(DatagramPacket receivePacket, ByteBuffer bb, int nod final CmdRaftAppendEntries request = CmdRaftAppendEntries.create(bb); final RpcResponse response = handler.apply(nodeId, request); - sendResponse(nodeId, correlationId, response); + if (response != null) { + sendResponse(nodeId, correlationId, response); + } } else if (messageType == RpcRequest.REQUEST_VOTE) { final CmdRaftVoteRequest request = CmdRaftVoteRequest.create(bb); final RpcResponse response = handler.apply(nodeId, request); - sendResponse(nodeId, correlationId, response); + if (response != null) { + sendResponse(nodeId, correlationId, response); + } } else { logger.warn("Unsupported response type={} from {} 
correlationId={}", @@ -125,28 +136,29 @@ private void processRequest(DatagramPacket receivePacket, ByteBuffer bb, int nod } } - private void processResponse(DatagramPacket receivePacket, ByteBuffer bb, int messageType, long correlationId) { - final CompletableFuture future = futureMap.remove(correlationId); - - if (future != null) { + private void processResponse(DatagramPacket receivePacket, ByteBuffer bb, int nodeId, int messageType, long correlationId) { - if (messageType == RpcResponse.RESPONSE_APPEND_ENTRIES) { + final CompletableFuture future = futureMap.remove(correlationId); - future.complete(CmdRaftAppendEntriesResponse.create(bb)); + if (messageType == RpcResponse.RESPONSE_APPEND_ENTRIES) { - } else if (messageType == RpcResponse.RESPONSE_VOTE) { + final CmdRaftAppendEntriesResponse r = CmdRaftAppendEntriesResponse.create(bb); + if (future != null) { + future.complete(r); + } + handlerResponses.accept(nodeId, r); - future.complete(CmdRaftVoteResponse.create(bb)); + } else if (messageType == RpcResponse.RESPONSE_VOTE) { - } else { - logger.warn("Unsupported response type={} from {} correlationId={}", - messageType, receivePacket.getAddress().getHostAddress(), correlationId); + final CmdRaftVoteResponse r = CmdRaftVoteResponse.create(bb); + if (future != null) { + future.complete(r); } - + handlerResponses.accept(nodeId, r); } else { - logger.warn("Unexpected (or duplicate) response from {} type={} correlationId={}", - receivePacket.getAddress().getHostAddress(), messageType, correlationId); + logger.warn("Unsupported response type={} from {} correlationId={}", + messageType, receivePacket.getAddress().getHostAddress(), correlationId); } } @@ -163,7 +175,7 @@ private void sendResponse(int callerNodeId, long correlationId, RpcResponse resp } - public CompletableFuture callRpcSync(RpcRequest request, int nodeId) { + public CompletableFuture callRpcSync(RpcRequest request, int toNodeId) { final long correlationId = correlationIdCounter.incrementAndGet(); @@ 
-173,13 +185,13 @@ public CompletableFuture callRpcSync(RpcRequest request, int nodeId final byte[] array = new byte[64]; ByteBuffer bb = ByteBuffer.wrap(array); - bb.putInt(nodeId); + bb.putInt(serverNodeId); bb.putInt(request.getMessageType()); bb.putLong(correlationId); request.serialize(bb); - send(nodeId, array, bb.position()); + send(toNodeId, array, bb.position()); return future; } From 5c625e172d792646b767a5e9235486120c4e221f Mon Sep 17 00:00:00 2001 From: Maksim Zheravin Date: Fri, 21 Jan 2022 00:02:19 +0200 Subject: [PATCH 05/15] RAFT: async handlers implementation --- .../core2/revelator/raft/RaftNode.java | 392 ++++++++++-------- 1 file changed, 217 insertions(+), 175 deletions(-) diff --git a/src/main/java/exchange/core2/revelator/raft/RaftNode.java b/src/main/java/exchange/core2/revelator/raft/RaftNode.java index c5ea42b..24ce63d 100644 --- a/src/main/java/exchange/core2/revelator/raft/RaftNode.java +++ b/src/main/java/exchange/core2/revelator/raft/RaftNode.java @@ -5,7 +5,10 @@ import org.slf4j.LoggerFactory; import java.util.*; -import java.util.concurrent.*; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; import java.util.function.BiConsumer; import java.util.function.BiFunction; @@ -59,7 +62,7 @@ public class RaftNode { private final RpcService rpcService; private Timer appendTimer; -// private Timer electionTimer; + private Timer electionTimer; private ScheduledExecutorService heartbeatLeaderExecutor; @@ -85,51 +88,71 @@ public RaftNode(int thisNodeId) { final BiFunction handler = (fromNodeId, req) -> { logger.debug("INCOMING REQ {} >>> {}", fromNodeId, req); - if (req instanceof CmdRaftVoteRequest) { + + if (req instanceof CmdRaftVoteRequest voteRequest) { + synchronized (this) { - logger.debug("votedFor={}", votedFor); - if (votedFor == -1) { - logger.debug("VOTE GRANTED for {}", fromNodeId); - votedFor = 
fromNodeId; - return new CmdRaftVoteResponse(currentTerm, true); - } else { + /* Receiver implementation: + 1. Reply false if term < currentTerm (§5.1) + 2. If votedFor is null or candidateId, and candidate’s log is at + least as up-to-date as receiver’s log, grant vote (5.2, 5.4) */ + + if (voteRequest.term < currentTerm) { + logger.debug("Reject vote for {} - term is old", fromNodeId); + return new CmdRaftVoteResponse(currentTerm, false); + } + + if (voteRequest.term > currentTerm) { + logger.debug("received newer term {} with vote request", voteRequest.term); + currentTerm = voteRequest.term; + switchToFollower(); + } + + if (votedFor != -1 && votedFor != currentNodeId) { + logger.debug("Reject vote for {} - already voted for {}", fromNodeId, votedFor); return new CmdRaftVoteResponse(currentTerm, false); } + + logger.debug("VOTE GRANTED for {}", fromNodeId); + votedFor = fromNodeId; + + return new CmdRaftVoteResponse(currentTerm, true); + } + } - if (req instanceof CmdRaftAppendEntries) { - CmdRaftAppendEntries appendEntriesCmd = (CmdRaftAppendEntries) req; + if (req instanceof CmdRaftAppendEntries appendEntriesCmd) { synchronized (this) { + if (appendEntriesCmd.term < currentTerm) { + logger.debug("Ignoring leader with older term {} (current={}", appendEntriesCmd.term, currentTerm); + return new CmdRaftAppendEntriesResponse(currentTerm, false); + } + if (currentState == RaftNodeState.CANDIDATE) { /* While waiting for votes, a candidate may receive an AppendEntries RPC from another server claiming to be leader. If the leader’s term (included in its RPC) is at least as large as the candidate’s current term, then the candidate recognizes the leader as legitimate and returns to follower state. If the term in the RPC is smaller than the candidate’s current term, then the candidate rejects the RPC and continues in candidate state. 
*/ - if (appendEntriesCmd.term < currentTerm) { - logger.debug("Ignoring leader with older term {} (current={}", appendEntriesCmd.term, currentTerm); - } else { - logger.debug("Stop being candidate - switching to follower"); - currentState = RaftNodeState.FOLLOWER; - votedFor = -1; - resetFollowerAppendTimer(); - } + + switchToFollower(); + + electionTimer.cancel(); + } else { - if (appendEntriesCmd.term < currentTerm) { - // TODO reply that leader is sending bs - } - if (appendEntriesCmd.term >= currentTerm) { - if (appendEntriesCmd.term > currentTerm) { - logger.info("Update term {}->{}", currentTerm, appendEntriesCmd.term); - currentTerm = appendEntriesCmd.term; - } + // TODO add records - resetFollowerAppendTimer(); + if (appendEntriesCmd.term > currentTerm) { + logger.info("Update term {}->{}", currentTerm, appendEntriesCmd.term); + currentTerm = appendEntriesCmd.term; } + + resetFollowerAppendTimer(); + } } } @@ -138,7 +161,22 @@ public RaftNode(int thisNodeId) { }; final BiConsumer handlerResponses = (fromNodeId, resp) -> { + logger.debug("INCOMING RESP {} >>> {}", fromNodeId, resp); + + /* A candidate wins an election if it receives votes from + a majority of the servers in the full cluster for the same + term. 
Each server will vote for at most one candidate in a + given term, on a first-come-first-served basis + (note: Section 5.4 adds an additional restriction on votes) */ + + if (resp instanceof final CmdRaftVoteResponse voteResponse) { + synchronized (this) { + if (currentState == RaftNodeState.CANDIDATE && voteResponse.voteGranted && voteResponse.term == currentTerm) { + switchToLeader(); + } + } + } }; // todo remove from constructor @@ -151,6 +189,26 @@ public RaftNode(int thisNodeId) { resetFollowerAppendTimer(); } + private void switchToFollower() { + + if (currentState == RaftNodeState.LEADER) { + logger.debug("shutdown heartbeats"); + heartbeatLeaderExecutor.shutdown(); + } + + if (currentState == RaftNodeState.CANDIDATE && electionTimer != null) { + logger.debug("cancelled elevtion timer"); + electionTimer.cancel(); + } + + if (currentState != RaftNodeState.FOLLOWER) { + logger.debug("Switching to follower (reset votedFor, start append timer)"); + currentState = RaftNodeState.FOLLOWER; + votedFor = -1; + resetFollowerAppendTimer(); + } + } + // public void run(int port) { // try (final DatagramSocket serverSocket = new DatagramSocket(port)) { // final byte[] receiveData = new byte[8]; @@ -181,88 +239,88 @@ public RaftNode(int thisNodeId) { // } - /** - * Receiver implementation:

- * 1. Reply false if term < currentTerm (5.1)

- * 2. Reply false if log doesn't contain an entry at prevLogIndex whose term matches prevLogTerm (5.3)

- * 3. If an existing entry conflicts with a new one (same index but different terms), delete the existing entry and all that follow it (5.3)

- * 4. Append any new entries not already in the log

- * 5. If leaderCommit > commitIndex, set commitIndex = min(leaderCommit, index of last new entry)

- */ - public synchronized CmdRaftAppendEntriesResponse appendEntries(CmdRaftAppendEntries cmd) { - - // 1. Reply false if term < currentTerm - // If the term in the RPC is smaller than the candidate’s current term, then the candidate rejects the RPC and continues in candidate state. - if (cmd.term < currentTerm) { - logger.debug("term < currentTerm"); - return new CmdRaftAppendEntriesResponse(currentTerm, false); - } - - // If the leader’s term (included in its RPC) is at least as large as the candidate’s current term, then the candidate - // recognizes the leader as legitimate and returns to follower state. I - checkTerm(cmd.term); - - // 2. Reply false if log doesn't contain an entry at prevLogIndex whose term matches prevLogTerm - if (cmd.prevLogIndex >= log.size() || log.get((int) cmd.prevLogIndex).term != cmd.prevLogTerm) { - logger.debug("log doesn't contain an entry at prevLogIndex whose term matches prevLogTerm"); - return new CmdRaftAppendEntriesResponse(currentTerm, false); - } - - // TODO 3. If an existing entry conflicts with a new one (same index but different terms), delete the existing entry and all that follow it (5.3) - - // Append any new entries not already in the log - - resetFollowerAppendTimer(); - - // If leaderCommit > commitIndex, set commitIndex = min(leaderCommit, index of last new entry) - if (cmd.leaderCommit > commitIndex) { - commitIndex = Math.min(cmd.leaderCommit, log.size() - 1); - } - - return new CmdRaftAppendEntriesResponse(currentTerm, true); - } - - /** - * Receiver implementation: - * 1. Reply false if term < currentTerm (5.1) - * 2. 
If votedFor is null or candidateId, and candidate’s log is at least as up-to-date as receiver’s log, grant vote (5.2, 5.4) - */ - public synchronized CmdRaftVoteResponse handleVoteRequest(CmdRaftVoteRequest request) { - - if (request.term < currentTerm) { - return new CmdRaftVoteResponse(currentTerm, false); - } - - checkTerm(request.term); - - - final boolean notVotedYet = votedFor == -1 || votedFor == request.candidateId; - final boolean logIsUpToDate = request.lastLogIndex >= commitIndex; // TODO commitIndex or lastApplied? - if (notVotedYet && logIsUpToDate) { - // vote for candidate - votedFor = request.candidateId; - return new CmdRaftVoteResponse(currentTerm, true); - } else { - // reject voting - return new CmdRaftVoteResponse(currentTerm, false); - } - - } - - - private void checkTerm(int term) { - // All servers: If RPC request or response contains term T > currentTerm: set currentTerm = T, convert to follower (5.1) - if (term > currentTerm) { - logger.info("Newer term={} received from new leader, switching to FOLLOWER", term); - currentTerm = term; - - if (currentState == RaftNodeState.LEADER) { - heartbeatLeaderExecutor.shutdown(); - } - - currentState = RaftNodeState.FOLLOWER; - } - } +// /** +// * Receiver implementation:

+// * 1. Reply false if term < currentTerm (5.1)

+// * 2. Reply false if log doesn't contain an entry at prevLogIndex whose term matches prevLogTerm (5.3)

+// * 3. If an existing entry conflicts with a new one (same index but different terms), delete the existing entry and all that follow it (5.3)

+// * 4. Append any new entries not already in the log

+// * 5. If leaderCommit > commitIndex, set commitIndex = min(leaderCommit, index of last new entry)

+// */ +// public synchronized CmdRaftAppendEntriesResponse appendEntries(CmdRaftAppendEntries cmd) { +// +// // 1. Reply false if term < currentTerm +// // If the term in the RPC is smaller than the candidate’s current term, then the candidate rejects the RPC and continues in candidate state. +// if (cmd.term < currentTerm) { +// logger.debug("term < currentTerm"); +// return new CmdRaftAppendEntriesResponse(currentTerm, false); +// } +// +// // If the leader’s term (included in its RPC) is at least as large as the candidate’s current term, then the candidate +// // recognizes the leader as legitimate and returns to follower state. I +// checkTerm(cmd.term); +// +// // 2. Reply false if log doesn't contain an entry at prevLogIndex whose term matches prevLogTerm +// if (cmd.prevLogIndex >= log.size() || log.get((int) cmd.prevLogIndex).term != cmd.prevLogTerm) { +// logger.debug("log doesn't contain an entry at prevLogIndex whose term matches prevLogTerm"); +// return new CmdRaftAppendEntriesResponse(currentTerm, false); +// } +// +// // TODO 3. If an existing entry conflicts with a new one (same index but different terms), delete the existing entry and all that follow it (5.3) +// +// // Append any new entries not already in the log +// +// resetFollowerAppendTimer(); +// +// // If leaderCommit > commitIndex, set commitIndex = min(leaderCommit, index of last new entry) +// if (cmd.leaderCommit > commitIndex) { +// commitIndex = Math.min(cmd.leaderCommit, log.size() - 1); +// } +// +// return new CmdRaftAppendEntriesResponse(currentTerm, true); +// } +// +// /** +// * Receiver implementation: +// * 1. Reply false if term < currentTerm (5.1) +// * 2. 
If votedFor is null or candidateId, and candidate’s log is at least as up-to-date as receiver’s log, grant vote (5.2, 5.4) +// */ +// public synchronized CmdRaftVoteResponse handleVoteRequest(CmdRaftVoteRequest request) { +// +// if (request.term < currentTerm) { +// return new CmdRaftVoteResponse(currentTerm, false); +// } +// +// checkTerm(request.term); +// +// +// final boolean notVotedYet = votedFor == -1 || votedFor == request.candidateId; +// final boolean logIsUpToDate = request.lastLogIndex >= commitIndex; // TODO commitIndex or lastApplied? +// if (notVotedYet && logIsUpToDate) { +// // vote for candidate +// votedFor = request.candidateId; +// return new CmdRaftVoteResponse(currentTerm, true); +// } else { +// // reject voting +// return new CmdRaftVoteResponse(currentTerm, false); +// } +// +// } +// +// +// private void checkTerm(int term) { +// // All servers: If RPC request or response contains term T > currentTerm: set currentTerm = T, convert to follower (5.1) +// if (term > currentTerm) { +// logger.info("Newer term={} received from new leader, switching to FOLLOWER", term); +// currentTerm = term; +// +// if (currentState == RaftNodeState.LEADER) { +// heartbeatLeaderExecutor.shutdown(); +// } +// +// currentState = RaftNodeState.FOLLOWER; +// } +// } private void broadcast(RaftMessage message) { @@ -301,6 +359,8 @@ public void run() { */ private synchronized void appendTimeout() { + // TODO double-check last receiving time (and get rid of timers) + if (currentState == RaftNodeState.LEADER) { heartbeatLeaderExecutor.shutdown(); } @@ -318,12 +378,6 @@ private synchronized void appendTimeout() { votedFor = currentNodeId; - final TimerTask task = new TimerTask() { - @Override - public void run() { - electionTimeout(); - } - }; final CmdRaftVoteRequest voteReq = new CmdRaftVoteRequest( currentTerm, @@ -331,75 +385,63 @@ public void run() { lastApplied, 429384628); // TODO extract from log! 
- try { - - final CompletableFuture future0 = rpcService.callRpcSync(voteReq, otherNodes[0]); - final CompletableFuture future1 = rpcService.callRpcSync(voteReq, otherNodes[1]); - - final CompletableFuture objectCompletableFuture = CompletableFuture.anyOf(future0, future1); - final CmdRaftVoteResponse response = (CmdRaftVoteResponse) objectCompletableFuture.get(ELECTION_TIMEOUT_MS, TimeUnit.MILLISECONDS); - - /* - A candidate wins an election if it receives votes from - a majority of the servers in the full cluster for the same - term. Each server will vote for at most one candidate in a - given term, on a first-come-first-served basis - (note: Section 5.4 adds an additional restriction on votes) - */ - - if (response.voteGranted) { - logger.info("One vote, becoming a LEADER!"); - currentState = RaftNodeState.LEADER; - - heartbeatLeaderExecutor = Executors.newSingleThreadScheduledExecutor(); - heartbeatLeaderExecutor.scheduleAtFixedRate( - () -> { - logger.info("Sending heartbeats"); - final CmdRaftAppendEntries heartBeatReq = new CmdRaftAppendEntries( - currentTerm, - currentNodeId, - lastApplied, - 429384628, - List.of(), - commitIndex); - rpcService.callRpcSync(heartBeatReq, otherNodes[0]); - rpcService.callRpcSync(heartBeatReq, otherNodes[1]); - }, - 0, - HEARTBEAT_LEADER_RATE_MS, - TimeUnit.MILLISECONDS); - - // TODO init - // for each server, index of the next log entry to send to that server (initialized to leader last log index + 1) -// private final long[] nextIndex = new long[3]; +// try { - // for each server, index of the highest log entry known to be replicated on server (initialized to 0, increases monotonically) -// private final long[] matchIndex = new long[3]; + final CompletableFuture future0 = rpcService.callRpcSync(voteReq, otherNodes[0]); + final CompletableFuture future1 = rpcService.callRpcSync(voteReq, otherNodes[1]); - } else { - logger.info("Vote not granted"); - // TODO maybe second one granted his vote ? 
+ final TimerTask task = new TimerTask() { + @Override + public void run() { + appendTimeout(); } + }; - } catch (TimeoutException ex) { - logger.warn("Vote timeout"); - } catch (Exception ex) { - logger.warn("Exception while collecting votes", ex); - } + electionTimer = new Timer(); + electionTimer.schedule(task, ELECTION_TIMEOUT_MS); + } - // did not win election - if (currentState == RaftNodeState.CANDIDATE) { - currentState = RaftNodeState.FOLLOWER; - votedFor = -1; - resetFollowerAppendTimer(); + private void switchToLeader() { + + logger.info("Becoming a LEADER!"); + currentState = RaftNodeState.LEADER; + if (electionTimer != null) { + electionTimer.cancel(); + } + if (appendTimer != null) { + appendTimer.cancel(); } -// electionTimer = new Timer(); -// electionTimer.schedule(task, ELECTION_TIMEOUT_MS); + heartbeatLeaderExecutor = Executors.newSingleThreadScheduledExecutor(); + heartbeatLeaderExecutor.scheduleAtFixedRate( + () -> { + logger.info("Sending heartbeats, term={}", currentTerm); + final CmdRaftAppendEntries heartBeatReq = new CmdRaftAppendEntries( + currentTerm, + currentNodeId, + lastApplied, + 429384628, + List.of(), + commitIndex); + rpcService.callRpcSync(heartBeatReq, otherNodes[0]); + rpcService.callRpcSync(heartBeatReq, otherNodes[1]); + }, + 0, + HEARTBEAT_LEADER_RATE_MS, + TimeUnit.MILLISECONDS); + + logger.info("Leader initiated"); + + // TODO init + // for each server, index of the next log entry to send to that server (initialized to leader last log index + 1) +// private final long[] nextIndex = new long[3]; + + // for each server, index of the highest log entry known to be replicated on server (initialized to 0, increases monotonically) +// private final long[] matchIndex = new long[3]; } private synchronized void electionTimeout() { - logger.info("election timeout - switching to CANDIDATE"); + logger.info("election timeout - switching to FOLLOWER"); } public enum RaftNodeState { From a2dc6c0bb0d4c9428fd5a16d33103eff1ad2557f Mon Sep 
17 00:00:00 2001 From: Maksim Zheravin Date: Sat, 22 Jan 2022 00:51:46 +0200 Subject: [PATCH 06/15] RAFT: remove timers and executors --- .../core2/revelator/raft/RaftNode.java | 193 ++++++++---------- .../core2/revelator/raft/RpcService.java | 23 ++- 2 files changed, 98 insertions(+), 118 deletions(-) diff --git a/src/main/java/exchange/core2/revelator/raft/RaftNode.java b/src/main/java/exchange/core2/revelator/raft/RaftNode.java index 24ce63d..630e240 100644 --- a/src/main/java/exchange/core2/revelator/raft/RaftNode.java +++ b/src/main/java/exchange/core2/revelator/raft/RaftNode.java @@ -4,11 +4,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.*; -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.Executors; -import java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.TimeUnit; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; import java.util.function.BiConsumer; import java.util.function.BiFunction; @@ -61,10 +59,10 @@ public class RaftNode { private final RpcService rpcService; - private Timer appendTimer; - private Timer electionTimer; - - private ScheduledExecutorService heartbeatLeaderExecutor; + // timers + private long lastHeartBeatReceivedNs = System.nanoTime(); + private long lastHeartBeatSentNs = System.nanoTime(); + private long electionStartedNs = System.nanoTime(); public static void main(String[] args) { @@ -137,9 +135,11 @@ public RaftNode(int thisNodeId) { If the term in the RPC is smaller than the candidate’s current term, then the candidate rejects the RPC and continues in candidate state. 
*/ + logger.debug("Switch from Candidate to follower"); + switchToFollower(); - electionTimer.cancel(); +// electionTimer.cancel(); } else { @@ -187,20 +187,78 @@ public RaftNode(int thisNodeId) { logger.info("Starting node {} as follower...", thisNodeId); resetFollowerAppendTimer(); + + new Thread(this::workerThread).start(); + } - private void switchToFollower() { + private void workerThread() { - if (currentState == RaftNodeState.LEADER) { - logger.debug("shutdown heartbeats"); - heartbeatLeaderExecutor.shutdown(); - } + try { + + while (true) { + + synchronized (this) { - if (currentState == RaftNodeState.CANDIDATE && electionTimer != null) { - logger.debug("cancelled elevtion timer"); - electionTimer.cancel(); + if (currentState == RaftNodeState.FOLLOWER) { + + if (System.nanoTime() > lastHeartBeatReceivedNs + HEARTBEAT_TIMEOUT_MS * 1_000_000L) { + appendTimeout(); + } else { + + } + + } + + if (currentState == RaftNodeState.CANDIDATE) { + final long t = System.nanoTime(); + if (t > electionStartedNs + ELECTION_TIMEOUT_MS * 1_000_000L) { + appendTimeout(); + } + } + + if (currentState == RaftNodeState.LEADER) { + + final long t = System.nanoTime(); + if (t > lastHeartBeatSentNs + HEARTBEAT_LEADER_RATE_MS * 1_000_000L) { + + lastHeartBeatSentNs = t; + + logger.info("Sending heartbeats to {}, term={}", otherNodes, currentTerm); + final CmdRaftAppendEntries heartBeatReq = new CmdRaftAppendEntries( + currentTerm, + currentNodeId, + lastApplied, + 429384628, + List.of(), + commitIndex); + rpcService.callRpcAsync(heartBeatReq, otherNodes[0]); + rpcService.callRpcAsync(heartBeatReq, otherNodes[1]); + } + } + + + } + + Thread.sleep(10); + } + + + } catch (InterruptedException e) { + e.printStackTrace(); } + + } + + + private void switchToFollower() { + +// if (currentState == RaftNodeState.CANDIDATE && electionTimer != null) { +// logger.debug("cancelled elevtion timer"); +// electionTimer.cancel(); +// } + if (currentState != RaftNodeState.FOLLOWER) { 
logger.debug("Switching to follower (reset votedFor, start append timer)"); currentState = RaftNodeState.FOLLOWER; @@ -209,35 +267,6 @@ private void switchToFollower() { } } -// public void run(int port) { -// try (final DatagramSocket serverSocket = new DatagramSocket(port)) { -// final byte[] receiveData = new byte[8]; -// String sendString = "polo"; -// final byte[] sendData = sendString.getBytes(StandardCharsets.UTF_8); -// -// logger.info("Listening on udp:{}:{}", InetAddress.getLocalHost().getHostAddress(), port); -// final DatagramPacket receivePacket = new DatagramPacket(receiveData, receiveData.length); -// -// while (true) { -// serverSocket.receive(receivePacket); -// String sentence = new String(receivePacket.getData(), 0, -// receivePacket.getLength()); -// logger.debug("RECEIVED: " + sentence); -// -// -// DatagramPacket sendPacket = new DatagramPacket( -// sendData, -// sendData.length, -// receivePacket.getAddress(), -// receivePacket.getPort()); -// -// serverSocket.send(sendPacket); -// } -// } catch (IOException ex) { -// System.out.println(ex); -// } -// } - // /** // * Receiver implementation:

@@ -330,21 +359,8 @@ private void broadcast(RaftMessage message) { } private synchronized void resetFollowerAppendTimer() { - - logger.debug("reset append timer"); - - if (appendTimer != null) { - appendTimer.cancel(); - } - final TimerTask task = new TimerTask() { - @Override - public void run() { - appendTimeout(); - } - }; - - appendTimer = new Timer(); - appendTimer.schedule(task, HEARTBEAT_TIMEOUT_MS); +// logger.debug("reset append timer"); + lastHeartBeatReceivedNs = System.nanoTime(); } /** @@ -359,11 +375,7 @@ public void run() { */ private synchronized void appendTimeout() { - // TODO double-check last receiving time (and get rid of timers) - - if (currentState == RaftNodeState.LEADER) { - heartbeatLeaderExecutor.shutdown(); - } + // TODO double-check last receiving time currentState = RaftNodeState.CANDIDATE; @@ -385,52 +397,15 @@ private synchronized void appendTimeout() { lastApplied, 429384628); // TODO extract from log! -// try { - - final CompletableFuture future0 = rpcService.callRpcSync(voteReq, otherNodes[0]); - final CompletableFuture future1 = rpcService.callRpcSync(voteReq, otherNodes[1]); - - final TimerTask task = new TimerTask() { - @Override - public void run() { - appendTimeout(); - } - }; + rpcService.callRpcAsync(voteReq, otherNodes[0]); + rpcService.callRpcAsync(voteReq, otherNodes[1]); - electionTimer = new Timer(); - electionTimer.schedule(task, ELECTION_TIMEOUT_MS); + electionStartedNs = System.nanoTime(); } private void switchToLeader() { - logger.info("Becoming a LEADER!"); currentState = RaftNodeState.LEADER; - if (electionTimer != null) { - electionTimer.cancel(); - } - if (appendTimer != null) { - appendTimer.cancel(); - } - - heartbeatLeaderExecutor = Executors.newSingleThreadScheduledExecutor(); - heartbeatLeaderExecutor.scheduleAtFixedRate( - () -> { - logger.info("Sending heartbeats, term={}", currentTerm); - final CmdRaftAppendEntries heartBeatReq = new CmdRaftAppendEntries( - currentTerm, - currentNodeId, - lastApplied, - 
429384628, - List.of(), - commitIndex); - rpcService.callRpcSync(heartBeatReq, otherNodes[0]); - rpcService.callRpcSync(heartBeatReq, otherNodes[1]); - }, - 0, - HEARTBEAT_LEADER_RATE_MS, - TimeUnit.MILLISECONDS); - - logger.info("Leader initiated"); // TODO init // for each server, index of the next log entry to send to that server (initialized to leader last log index + 1) @@ -440,10 +415,6 @@ private void switchToLeader() { // private final long[] matchIndex = new long[3]; } - private synchronized void electionTimeout() { - logger.info("election timeout - switching to FOLLOWER"); - } - public enum RaftNodeState { FOLLOWER, CANDIDATE, diff --git a/src/main/java/exchange/core2/revelator/raft/RpcService.java b/src/main/java/exchange/core2/revelator/raft/RpcService.java index f30ec69..f717c57 100644 --- a/src/main/java/exchange/core2/revelator/raft/RpcService.java +++ b/src/main/java/exchange/core2/revelator/raft/RpcService.java @@ -1,6 +1,5 @@ package exchange.core2.revelator.raft; -import org.agrona.PrintBufferUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -19,7 +18,7 @@ public class RpcService implements AutoCloseable { - private static final Logger logger = LoggerFactory.getLogger(RaftNode.class); + private static final Logger logger = LoggerFactory.getLogger(RpcService.class); private final AtomicLong correlationIdCounter = new AtomicLong(0L); private final Map> futureMap = new ConcurrentHashMap<>(); @@ -92,8 +91,7 @@ public void run() { final int messageType = bb.getInt(); final long correlationId = bb.getLong(); - logger.debug("RECEIVED from {} mt={}: {}", nodeId, messageType, PrintBufferUtil.hexDump(receivePacket.getData(), 0, receivePacket.getLength())); - +// logger.debug("RECEIVED from {} mt={}: {}", nodeId, messageType, PrintBufferUtil.hexDump(receivePacket.getData(), 0, receivePacket.getLength())); if (messageType < 0) { processResponse(receivePacket, bb, nodeId, messageType, correlationId); @@ -175,13 +173,26 @@ private void 
sendResponse(int callerNodeId, long correlationId, RpcResponse resp } + public void callRpcAsync(RpcRequest request, int toNodeId) { + + final long correlationId = correlationIdCounter.incrementAndGet(); + callRpc(request, toNodeId, correlationId); + } + public CompletableFuture callRpcSync(RpcRequest request, int toNodeId) { final long correlationId = correlationIdCounter.incrementAndGet(); - CompletableFuture future = new CompletableFuture<>(); + final CompletableFuture future = new CompletableFuture<>(); futureMap.put(correlationId, future); + callRpc(request, toNodeId, correlationId); + + return future; + } + + private void callRpc(RpcRequest request, int toNodeId, long correlationId) { + final byte[] array = new byte[64]; ByteBuffer bb = ByteBuffer.wrap(array); @@ -192,8 +203,6 @@ public CompletableFuture callRpcSync(RpcRequest request, int toNode request.serialize(bb); send(toNodeId, array, bb.position()); - - return future; } From 2db33956e4802b425e9130f1a4c13f2fde4bec10 Mon Sep 17 00:00:00 2001 From: Maksim Zheravin Date: Sun, 23 Jan 2022 20:24:04 +0200 Subject: [PATCH 07/15] RAFT: add custom commands, consensus bugfixes --- .../revelator/raft/CmdRaftAppendEntries.java | 36 ++---- .../raft/CmdRaftAppendEntriesResponse.java | 13 +- .../revelator/raft/CmdRaftVoteRequest.java | 19 +-- .../revelator/raft/CmdRaftVoteResponse.java | 11 +- .../revelator/raft/CustomCommandRequest.java | 21 ++++ .../revelator/raft/CustomCommandResponse.java | 28 +++++ .../core2/revelator/raft/RaftNode.java | 118 +++++++++--------- .../core2/revelator/raft/RpcHandler.java | 9 ++ .../core2/revelator/raft/RpcMessage.java | 19 +++ .../core2/revelator/raft/RpcRequest.java | 10 +- .../core2/revelator/raft/RpcResponse.java | 12 +- .../core2/revelator/raft/RpcService.java | 106 ++++++---------- 12 files changed, 192 insertions(+), 210 deletions(-) create mode 100644 src/main/java/exchange/core2/revelator/raft/CustomCommandRequest.java create mode 100644 
src/main/java/exchange/core2/revelator/raft/CustomCommandResponse.java create mode 100644 src/main/java/exchange/core2/revelator/raft/RpcHandler.java create mode 100644 src/main/java/exchange/core2/revelator/raft/RpcMessage.java diff --git a/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntries.java b/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntries.java index 98db4c7..60a6b0b 100644 --- a/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntries.java +++ b/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntries.java @@ -6,30 +6,12 @@ /** * Invoked by leader to replicate log entries (5.3); also used as heartbeat (5.2). */ -public final class CmdRaftAppendEntries implements RpcRequest { - - public final int term; // leader’s term - public final int leaderId; // so follower can redirect clients - - public final long prevLogIndex; // index of log entry immediately preceding new ones - public final int prevLogTerm;// term of prevLogIndex entry - public final List entries; // log entries to store (empty for heartbeat; may send more than one for efficiency) - public final long leaderCommit;// leader’s commitIndex - - public CmdRaftAppendEntries(int term, - int leaderId, - long prevLogIndex, - int prevLogTerm, - List entries, - long leaderCommit) { - - this.term = term; - this.leaderId = leaderId; - this.prevLogIndex = prevLogIndex; - this.prevLogTerm = prevLogTerm; - this.entries = entries; - this.leaderCommit = leaderCommit; - } +public record CmdRaftAppendEntries(int term, + int leaderId, + long prevLogIndex, + int prevLogTerm, + List entries, + long leaderCommit) implements RpcRequest { @Override public int getMessageType() { @@ -46,16 +28,16 @@ public void serialize(ByteBuffer buffer) { buffer.putLong(leaderCommit); } - public static CmdRaftAppendEntries create(ByteBuffer buffer){ + public static CmdRaftAppendEntries create(ByteBuffer buffer) { final int term = buffer.getInt(); final int leaderId = buffer.getInt(); final 
long prevLogIndex = buffer.getLong(); final int prevLogTerm = buffer.getInt(); // todo entries - final long leaderCommit = buffer.getLong(); + final long leaderCommit = buffer.getLong(); - return new CmdRaftAppendEntries(term, leaderId, prevLogIndex, prevLogTerm, List.of(), leaderCommit); + return new CmdRaftAppendEntries(term, leaderId, prevLogIndex, prevLogTerm, List.of(), leaderCommit); } @Override diff --git a/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntriesResponse.java b/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntriesResponse.java index aa3a73b..761b471 100644 --- a/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntriesResponse.java +++ b/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntriesResponse.java @@ -5,16 +5,7 @@ /** * Invoked by leader to replicate log entries (5.3); also used as heartbeat (5.2). */ -public final class CmdRaftAppendEntriesResponse implements RpcResponse { - - public final int term; // currentTerm, for leader to update itself - public final boolean success; // true if follower contained entry matching prevLogIndex and prevLogTerm - - public CmdRaftAppendEntriesResponse(int term, - boolean success) { - this.term = term; - this.success = success; - } +public record CmdRaftAppendEntriesResponse(int term, boolean success) implements RpcResponse { @Override public int getMessageType() { @@ -27,7 +18,7 @@ public void serialize(ByteBuffer buffer) { buffer.put(success ? 
(byte) 1 : (byte) 0); } - public static CmdRaftAppendEntriesResponse create(ByteBuffer bb){ + public static CmdRaftAppendEntriesResponse create(ByteBuffer bb) { final int term = bb.getInt(); final boolean success = bb.get() == 1; return new CmdRaftAppendEntriesResponse(term, success); diff --git a/src/main/java/exchange/core2/revelator/raft/CmdRaftVoteRequest.java b/src/main/java/exchange/core2/revelator/raft/CmdRaftVoteRequest.java index a0aca60..c0af7f7 100644 --- a/src/main/java/exchange/core2/revelator/raft/CmdRaftVoteRequest.java +++ b/src/main/java/exchange/core2/revelator/raft/CmdRaftVoteRequest.java @@ -5,24 +5,7 @@ /** * Invoked by leader to replicate log entries (5.3); also used as heartbeat (5.2). */ -public final class CmdRaftVoteRequest implements RpcRequest { - - public final int term; // candidate's term - public final int candidateId; // candidate requesting vote - - public final long lastLogIndex; // index of candidate’s last log entry (5.4) - public final int lastLogTerm; // term of candidate’s last log entry (5.4) - - public CmdRaftVoteRequest(int term, - int candidateId, - long lastLogIndex, - int lastLogTerm) { - - this.term = term; - this.candidateId = candidateId; - this.lastLogIndex = lastLogIndex; - this.lastLogTerm = lastLogTerm; - } +public record CmdRaftVoteRequest(int term, int candidateId, long lastLogIndex, int lastLogTerm) implements RpcRequest { @Override public int getMessageType() { diff --git a/src/main/java/exchange/core2/revelator/raft/CmdRaftVoteResponse.java b/src/main/java/exchange/core2/revelator/raft/CmdRaftVoteResponse.java index 0df063a..c2e17c7 100644 --- a/src/main/java/exchange/core2/revelator/raft/CmdRaftVoteResponse.java +++ b/src/main/java/exchange/core2/revelator/raft/CmdRaftVoteResponse.java @@ -5,16 +5,7 @@ /** * Invoked by candidates to gather votes (5.2). 
*/ -public final class CmdRaftVoteResponse implements RpcResponse { - - public final int term; // currentTerm, for candidate to update itself - public final boolean voteGranted; // true means that candidate received vote - - public CmdRaftVoteResponse(int term, - boolean voteGranted) { - this.term = term; - this.voteGranted = voteGranted; - } +public record CmdRaftVoteResponse(int term, boolean voteGranted) implements RpcResponse { @Override public int getMessageType() { diff --git a/src/main/java/exchange/core2/revelator/raft/CustomCommandRequest.java b/src/main/java/exchange/core2/revelator/raft/CustomCommandRequest.java new file mode 100644 index 0000000..5ee7a74 --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/CustomCommandRequest.java @@ -0,0 +1,21 @@ +package exchange.core2.revelator.raft; + +import java.nio.ByteBuffer; + +public record CustomCommandRequest(long data) implements RpcRequest { + + @Override + public int getMessageType() { + return REQUEST_CUSTOM; + } + + @Override + public void serialize(ByteBuffer buffer) { + buffer.putLong(data); + } + + public static CustomCommandRequest create(ByteBuffer buffer) { + final long data = buffer.getLong(); + return new CustomCommandRequest(data); + } +} diff --git a/src/main/java/exchange/core2/revelator/raft/CustomCommandResponse.java b/src/main/java/exchange/core2/revelator/raft/CustomCommandResponse.java new file mode 100644 index 0000000..42cb8b7 --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/CustomCommandResponse.java @@ -0,0 +1,28 @@ +package exchange.core2.revelator.raft; + +import java.nio.ByteBuffer; + +public record CustomCommandResponse(long hash, int leaderNodeId, boolean success) implements RpcResponse { + + @Override + public int getMessageType() { + return RESPONSE_CUSTOM; + } + + @Override + public void serialize(ByteBuffer buffer) { + buffer.putLong(hash); + buffer.putInt(leaderNodeId); + buffer.put(success ? 
(byte) 1 : (byte) 0); + } + + public static CustomCommandResponse create(ByteBuffer buffer) { + + final long hash = buffer.getLong(); + final int leaderNodeId = buffer.getInt(); + final boolean success = buffer.get() == 1; + + return new CustomCommandResponse(hash, leaderNodeId, success); + } + +} diff --git a/src/main/java/exchange/core2/revelator/raft/RaftNode.java b/src/main/java/exchange/core2/revelator/raft/RaftNode.java index 630e240..0984e87 100644 --- a/src/main/java/exchange/core2/revelator/raft/RaftNode.java +++ b/src/main/java/exchange/core2/revelator/raft/RaftNode.java @@ -7,8 +7,6 @@ import java.util.ArrayList; import java.util.List; import java.util.Map; -import java.util.function.BiConsumer; -import java.util.function.BiFunction; public class RaftNode { @@ -16,7 +14,8 @@ public class RaftNode { public static final int HEARTBEAT_TIMEOUT_MS = 2000 + (int) (Math.random() * 500); public static final int HEARTBEAT_LEADER_RATE_MS = 1000; - public static final int ELECTION_TIMEOUT_MS = 3000; + public static final int ELECTION_TIMEOUT_MIN_MS = 2500; + public static final int ELECTION_TIMEOUT_MAX_MS = 2800; public static final int CLUSTER_SIZE = 3; public static final int VOTES_REQUIRED = 2; @@ -62,7 +61,7 @@ public class RaftNode { // timers private long lastHeartBeatReceivedNs = System.nanoTime(); private long lastHeartBeatSentNs = System.nanoTime(); - private long electionStartedNs = System.nanoTime(); + private long electionEndNs = System.nanoTime(); public static void main(String[] args) { @@ -83,86 +82,88 @@ public RaftNode(int thisNodeId) { this.otherNodes = remoteNodes.keySet().stream().mapToInt(x -> x).filter(nodeId -> nodeId != thisNodeId).toArray(); + RpcHandler handler = new RpcHandler() { + @Override + public RpcResponse handleRequest(int fromNodeId, RpcRequest req) { + logger.debug("INCOMING REQ {} >>> {}", fromNodeId, req); - final BiFunction handler = (fromNodeId, req) -> { - logger.debug("INCOMING REQ {} >>> {}", fromNodeId, req); + if (req 
instanceof CmdRaftVoteRequest voteRequest) { - if (req instanceof CmdRaftVoteRequest voteRequest) { - - synchronized (this) { + synchronized (this) { /* Receiver implementation: 1. Reply false if term < currentTerm (§5.1) 2. If votedFor is null or candidateId, and candidate’s log is at least as up-to-date as receiver’s log, grant vote (5.2, 5.4) */ - if (voteRequest.term < currentTerm) { - logger.debug("Reject vote for {} - term is old", fromNodeId); - return new CmdRaftVoteResponse(currentTerm, false); - } + if (voteRequest.term() < currentTerm) { + logger.debug("Reject vote for {} - term is old", fromNodeId); + return new CmdRaftVoteResponse(currentTerm, false); + } - if (voteRequest.term > currentTerm) { - logger.debug("received newer term {} with vote request", voteRequest.term); - currentTerm = voteRequest.term; - switchToFollower(); - } + if (voteRequest.term() > currentTerm) { + logger.debug("received newer term {} with vote request", voteRequest.term()); + currentTerm = voteRequest.term(); + votedFor = -1; // never voted in newer term + switchToFollower(); + resetFollowerAppendTimer(); + } - if (votedFor != -1 && votedFor != currentNodeId) { - logger.debug("Reject vote for {} - already voted for {}", fromNodeId, votedFor); - return new CmdRaftVoteResponse(currentTerm, false); - } + if (votedFor != -1 && votedFor != currentNodeId) { + logger.debug("Reject vote for {} - already voted for {}", fromNodeId, votedFor); + return new CmdRaftVoteResponse(currentTerm, false); + } - logger.debug("VOTE GRANTED for {}", fromNodeId); - votedFor = fromNodeId; + logger.debug("VOTE GRANTED for {}", fromNodeId); + votedFor = fromNodeId; - return new CmdRaftVoteResponse(currentTerm, true); + return new CmdRaftVoteResponse(currentTerm, true); + } } + if (req instanceof CmdRaftAppendEntries appendEntriesCmd) { - } - if (req instanceof CmdRaftAppendEntries appendEntriesCmd) { - - synchronized (this) { + synchronized (this) { - if (appendEntriesCmd.term < currentTerm) { - 
logger.debug("Ignoring leader with older term {} (current={}", appendEntriesCmd.term, currentTerm); - return new CmdRaftAppendEntriesResponse(currentTerm, false); - } + if (appendEntriesCmd.term() < currentTerm) { + logger.debug("Ignoring leader with older term {} (current={}", appendEntriesCmd.term(), currentTerm); + return new CmdRaftAppendEntriesResponse(currentTerm, false); + } - if (currentState == RaftNodeState.CANDIDATE) { + if (currentState == RaftNodeState.CANDIDATE) { /* While waiting for votes, a candidate may receive an AppendEntries RPC from another server claiming to be leader. If the leader’s term (included in its RPC) is at least as large as the candidate’s current term, then the candidate recognizes the leader as legitimate and returns to follower state. If the term in the RPC is smaller than the candidate’s current term, then the candidate rejects the RPC and continues in candidate state. */ - logger.debug("Switch from Candidate to follower"); + logger.debug("Switch from Candidate to follower"); - switchToFollower(); + switchToFollower(); // electionTimer.cancel(); - } else { + } else { - // TODO add records + // TODO add records - if (appendEntriesCmd.term > currentTerm) { - logger.info("Update term {}->{}", currentTerm, appendEntriesCmd.term); - currentTerm = appendEntriesCmd.term; - } + if (appendEntriesCmd.term() > currentTerm) { + logger.info("Update term {}->{}", currentTerm, appendEntriesCmd.term()); + currentTerm = appendEntriesCmd.term(); + } - resetFollowerAppendTimer(); + resetFollowerAppendTimer(); + } } } - } - - return null; - }; - final BiConsumer handlerResponses = (fromNodeId, resp) -> { + return null; + } - logger.debug("INCOMING RESP {} >>> {}", fromNodeId, resp); + @Override + public void handleResponse(int fromNodeId, RpcResponse resp) { + logger.debug("INCOMING RESP {} >>> {}", fromNodeId, resp); /* A candidate wins an election if it receives votes from a majority of the servers in the full cluster for the same @@ -170,20 
+171,21 @@ public RaftNode(int thisNodeId) { given term, on a first-come-first-served basis (note: Section 5.4 adds an additional restriction on votes) */ - if (resp instanceof final CmdRaftVoteResponse voteResponse) { - synchronized (this) { - if (currentState == RaftNodeState.CANDIDATE && voteResponse.voteGranted && voteResponse.term == currentTerm) { - switchToLeader(); + if (resp instanceof final CmdRaftVoteResponse voteResponse) { + synchronized (this) { + if (currentState == RaftNodeState.CANDIDATE && voteResponse.voteGranted() && voteResponse.term() == currentTerm) { + switchToLeader(); + } } } } }; // todo remove from constructor - rpcService = new RpcService(remoteNodes, handler, handlerResponses, thisNodeId); + rpcService = new RpcService(remoteNodes, handler, thisNodeId); logger.info("HEARTBEAT_TIMEOUT_MS={}", HEARTBEAT_TIMEOUT_MS); - logger.info("ELECTION_TIMEOUT_MS={}", ELECTION_TIMEOUT_MS); + logger.info("ELECTION_TIMEOUT_MS={}..{}", ELECTION_TIMEOUT_MIN_MS, ELECTION_TIMEOUT_MAX_MS); logger.info("Starting node {} as follower...", thisNodeId); resetFollowerAppendTimer(); @@ -212,7 +214,7 @@ private void workerThread() { if (currentState == RaftNodeState.CANDIDATE) { final long t = System.nanoTime(); - if (t > electionStartedNs + ELECTION_TIMEOUT_MS * 1_000_000L) { + if (t > electionEndNs) { appendTimeout(); } } @@ -400,7 +402,9 @@ private synchronized void appendTimeout() { rpcService.callRpcAsync(voteReq, otherNodes[0]); rpcService.callRpcAsync(voteReq, otherNodes[1]); - electionStartedNs = System.nanoTime(); + final int timeoutMs = ELECTION_TIMEOUT_MIN_MS + (int) (Math.random() * (ELECTION_TIMEOUT_MAX_MS - ELECTION_TIMEOUT_MIN_MS)); + logger.debug("ElectionTimeout: {}ms", timeoutMs); + electionEndNs = System.nanoTime() + timeoutMs * 1_000_000L; } private void switchToLeader() { diff --git a/src/main/java/exchange/core2/revelator/raft/RpcHandler.java b/src/main/java/exchange/core2/revelator/raft/RpcHandler.java new file mode 100644 index 
0000000..9ca57de --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/RpcHandler.java @@ -0,0 +1,9 @@ +package exchange.core2.revelator.raft; + +public interface RpcHandler { + + RpcResponse handleRequest(int nodeId, RpcRequest request); + + void handleResponse(int nodeId, RpcResponse response); + +} diff --git a/src/main/java/exchange/core2/revelator/raft/RpcMessage.java b/src/main/java/exchange/core2/revelator/raft/RpcMessage.java new file mode 100644 index 0000000..9830cff --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/RpcMessage.java @@ -0,0 +1,19 @@ +package exchange.core2.revelator.raft; + +import java.nio.ByteBuffer; + +public interface RpcMessage { + + int getMessageType(); + + void serialize(ByteBuffer buffer); + + int REQUEST_APPEND_ENTRIES = 1; + int RESPONSE_APPEND_ENTRIES = -1; + int REQUEST_VOTE = 2; + int RESPONSE_VOTE = -2; + + int REQUEST_CUSTOM = 10; + int RESPONSE_CUSTOM = -10; + +} diff --git a/src/main/java/exchange/core2/revelator/raft/RpcRequest.java b/src/main/java/exchange/core2/revelator/raft/RpcRequest.java index 3b003da..829a924 100644 --- a/src/main/java/exchange/core2/revelator/raft/RpcRequest.java +++ b/src/main/java/exchange/core2/revelator/raft/RpcRequest.java @@ -1,13 +1,5 @@ package exchange.core2.revelator.raft; -import java.nio.ByteBuffer; +public interface RpcRequest extends RpcMessage { -public interface RpcRequest { - - int getMessageType(); - - void serialize(ByteBuffer buffer); - - int REQUEST_APPEND_ENTRIES = 1; - int REQUEST_VOTE = 2; } diff --git a/src/main/java/exchange/core2/revelator/raft/RpcResponse.java b/src/main/java/exchange/core2/revelator/raft/RpcResponse.java index 71f944a..ec66efd 100644 --- a/src/main/java/exchange/core2/revelator/raft/RpcResponse.java +++ b/src/main/java/exchange/core2/revelator/raft/RpcResponse.java @@ -1,15 +1,5 @@ package exchange.core2.revelator.raft; -import java.nio.ByteBuffer; - -public interface RpcResponse { - - - int getMessageType(); - - void 
serialize(ByteBuffer buffer); - - int RESPONSE_APPEND_ENTRIES = -1; - int RESPONSE_VOTE = -2; +public interface RpcResponse extends RpcMessage { } diff --git a/src/main/java/exchange/core2/revelator/raft/RpcService.java b/src/main/java/exchange/core2/revelator/raft/RpcService.java index f717c57..5fff099 100644 --- a/src/main/java/exchange/core2/revelator/raft/RpcService.java +++ b/src/main/java/exchange/core2/revelator/raft/RpcService.java @@ -1,5 +1,6 @@ package exchange.core2.revelator.raft; +import org.agrona.PrintBufferUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -13,8 +14,6 @@ import java.util.concurrent.CompletableFuture; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicLong; -import java.util.function.BiConsumer; -import java.util.function.BiFunction; public class RpcService implements AutoCloseable { @@ -25,14 +24,12 @@ public class RpcService implements AutoCloseable { private final Map socketMap; private final int serverPort; private final int serverNodeId; - private final BiFunction handler; - private final BiConsumer handlerResponses; + private final RpcHandler handler; private volatile boolean active = true; public RpcService(Map remoteNodes, - BiFunction handler, - BiConsumer handlerResponses, + RpcHandler handler, int serverNodeId) { final Map socketMap = new HashMap<>(); @@ -57,7 +54,6 @@ public RpcService(Map remoteNodes, this.socketMap = socketMap; this.handler = handler; - this.handlerResponses = handlerResponses; this.serverPort = socketMap.get(serverNodeId).port; this.serverNodeId = serverNodeId; @@ -81,23 +77,37 @@ public void run() { while (active) { - serverSocket.receive(receivePacket); - // String sentence = new String(receivePacket.getData(), 0, receivePacket.getLength()); + try { + serverSocket.receive(receivePacket); + final ByteBuffer bb = ByteBuffer.wrap(receivePacket.getData(), 0, receivePacket.getLength()); - final ByteBuffer bb = ByteBuffer.wrap(receivePacket.getData(), 0, 
receivePacket.getLength()); - - final int nodeId = bb.getInt(); - final int messageType = bb.getInt(); - final long correlationId = bb.getLong(); + final int nodeId = bb.getInt(); + final int messageType = bb.getInt(); + final long correlationId = bb.getLong(); // logger.debug("RECEIVED from {} mt={}: {}", nodeId, messageType, PrintBufferUtil.hexDump(receivePacket.getData(), 0, receivePacket.getLength())); - if (messageType < 0) { - processResponse(receivePacket, bb, nodeId, messageType, correlationId); - - } else { - processRequest(receivePacket, bb, nodeId, messageType, correlationId); + final RpcMessage msg = createByType(messageType, bb); + + if (messageType < 0) { + final CompletableFuture future = futureMap.remove(correlationId); + if (future != null) { + future.complete((RpcResponse) msg); + } else { + handler.handleResponse(nodeId, (RpcResponse) msg); + } + + } else { + final RpcResponse response = handler.handleRequest(nodeId, (RpcRequest) msg); + if (response != null) { + sendResponse(nodeId, correlationId, response); + } + } + + } catch (Exception ex) { + String message = PrintBufferUtil.hexDump(receivePacket.getData(), 0, receivePacket.getLength()); + logger.error("Failed to process message from {}: {}", receivePacket.getAddress().getHostAddress(), message, ex); } } @@ -110,54 +120,16 @@ public void run() { } - private void processRequest(DatagramPacket receivePacket, ByteBuffer bb, int nodeId, int messageType, long correlationId) { - - if (messageType == RpcRequest.REQUEST_APPEND_ENTRIES) { - - final CmdRaftAppendEntries request = CmdRaftAppendEntries.create(bb); - final RpcResponse response = handler.apply(nodeId, request); - if (response != null) { - sendResponse(nodeId, correlationId, response); - } - - } else if (messageType == RpcRequest.REQUEST_VOTE) { - - final CmdRaftVoteRequest request = CmdRaftVoteRequest.create(bb); - final RpcResponse response = handler.apply(nodeId, request); - if (response != null) { - sendResponse(nodeId, correlationId, 
response); - } - - } else { - logger.warn("Unsupported response type={} from {} correlationId={}", - messageType, receivePacket.getAddress().getHostAddress(), correlationId); - } - } - - private void processResponse(DatagramPacket receivePacket, ByteBuffer bb, int nodeId, int messageType, long correlationId) { - - final CompletableFuture future = futureMap.remove(correlationId); - - if (messageType == RpcResponse.RESPONSE_APPEND_ENTRIES) { - - final CmdRaftAppendEntriesResponse r = CmdRaftAppendEntriesResponse.create(bb); - if (future != null) { - future.complete(r); - } - handlerResponses.accept(nodeId, r); - - } else if (messageType == RpcResponse.RESPONSE_VOTE) { - - final CmdRaftVoteResponse r = CmdRaftVoteResponse.create(bb); - if (future != null) { - future.complete(r); - } - handlerResponses.accept(nodeId, r); - - } else { - logger.warn("Unsupported response type={} from {} correlationId={}", - messageType, receivePacket.getAddress().getHostAddress(), correlationId); - } + static RpcMessage createByType(int messageType, ByteBuffer buffer) { + return switch (messageType) { + case RpcMessage.REQUEST_APPEND_ENTRIES -> CmdRaftAppendEntries.create(buffer); + case RpcMessage.RESPONSE_APPEND_ENTRIES -> CmdRaftAppendEntriesResponse.create(buffer); + case RpcMessage.REQUEST_VOTE -> CmdRaftVoteRequest.create(buffer); + case RpcMessage.RESPONSE_VOTE -> CmdRaftVoteResponse.create(buffer); + case RpcMessage.REQUEST_CUSTOM -> CustomCommandRequest.create(buffer); + case RpcMessage.RESPONSE_CUSTOM -> CustomCommandResponse.create(buffer); + default -> throw new IllegalArgumentException("Unknown messageType: " + messageType); + }; } private void sendResponse(int callerNodeId, long correlationId, RpcResponse response) { From 9f0f148609a77ba3911685b6aff0cb8b76fc344e Mon Sep 17 00:00:00 2001 From: Maksim Zheravin Date: Sat, 29 Jan 2022 01:19:14 +0200 Subject: [PATCH 08/15] RAFT: implement client communication, entries transfer, log repository mock, replicated state machine (RSM) 
abstraction --- .../revelator/raft/CmdRaftAppendEntries.java | 16 +- .../raft/CmdRaftAppendEntriesResponse.java | 7 + .../revelator/raft/CustomCommandRequest.java | 1 + .../revelator/raft/CustomCommandResponse.java | 6 +- .../core2/revelator/raft/CustomRsm.java | 19 + .../core2/revelator/raft/RaftLogEntry.java | 21 +- .../revelator/raft/RaftLogRepository.java | 83 ++++ .../core2/revelator/raft/RaftNode.java | 392 +++++++++++------- .../raft/ReplicatedStateMachine.java | 10 + .../core2/revelator/raft/RpcHandler.java | 8 +- .../core2/revelator/raft/RpcService.java | 123 ++++-- .../revelator/raft/SerializableMessage.java | 9 + .../raft/SerializableMessageFactory.java | 9 + 13 files changed, 508 insertions(+), 196 deletions(-) create mode 100644 src/main/java/exchange/core2/revelator/raft/CustomRsm.java create mode 100644 src/main/java/exchange/core2/revelator/raft/RaftLogRepository.java create mode 100644 src/main/java/exchange/core2/revelator/raft/ReplicatedStateMachine.java create mode 100644 src/main/java/exchange/core2/revelator/raft/SerializableMessage.java create mode 100644 src/main/java/exchange/core2/revelator/raft/SerializableMessageFactory.java diff --git a/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntries.java b/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntries.java index 60a6b0b..17db74b 100644 --- a/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntries.java +++ b/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntries.java @@ -1,6 +1,7 @@ package exchange.core2.revelator.raft; import java.nio.ByteBuffer; +import java.util.ArrayList; import java.util.List; /** @@ -10,7 +11,7 @@ public record CmdRaftAppendEntries(int term, int leaderId, long prevLogIndex, int prevLogTerm, - List entries, + List entries, long leaderCommit) implements RpcRequest { @Override @@ -24,7 +25,8 @@ public void serialize(ByteBuffer buffer) { buffer.putInt(leaderId); buffer.putLong(prevLogIndex); buffer.putInt(prevLogTerm); -// 
buffer.put + buffer.putInt(entries.size()); + entries.forEach(entry -> entry.serialize(buffer)); buffer.putLong(leaderCommit); } @@ -34,10 +36,16 @@ public static CmdRaftAppendEntries create(ByteBuffer buffer) { final int leaderId = buffer.getInt(); final long prevLogIndex = buffer.getLong(); final int prevLogTerm = buffer.getInt(); - // todo entries + final int numEntries = buffer.getInt(); + + final List entries = new ArrayList<>(numEntries); + for (int i = 0; i < numEntries; i++) { + entries.add(RaftLogEntry.create(buffer)); + } + final long leaderCommit = buffer.getLong(); - return new CmdRaftAppendEntries(term, leaderId, prevLogIndex, prevLogTerm, List.of(), leaderCommit); + return new CmdRaftAppendEntries(term, leaderId, prevLogIndex, prevLogTerm, entries, leaderCommit); } @Override diff --git a/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntriesResponse.java b/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntriesResponse.java index 761b471..19ea7fd 100644 --- a/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntriesResponse.java +++ b/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntriesResponse.java @@ -4,6 +4,13 @@ /** * Invoked by leader to replicate log entries (5.3); also used as heartbeat (5.2). + *

+ * TODO If desired, the protocol can be optimized to reduce the number of rejected AppendEntries RPCs. For example, + * when rejecting an AppendEntries request, the follower can include the term of the conflicting entry and the first + * index it stores for that term. With this information, the leader can decrement nextIndex to bypass all of the + * conflicting entries in that term; one AppendEntries RPC will be required for each term with conflicting entries, rather + * than one RPC per entry. In practice, we doubt this optimization is necessary, since failures happen infrequently + * and it is unlikely that there will be many inconsistent entries. */ public record CmdRaftAppendEntriesResponse(int term, boolean success) implements RpcResponse { diff --git a/src/main/java/exchange/core2/revelator/raft/CustomCommandRequest.java b/src/main/java/exchange/core2/revelator/raft/CustomCommandRequest.java index 5ee7a74..50eb04c 100644 --- a/src/main/java/exchange/core2/revelator/raft/CustomCommandRequest.java +++ b/src/main/java/exchange/core2/revelator/raft/CustomCommandRequest.java @@ -2,6 +2,7 @@ import java.nio.ByteBuffer; +// TODO support batching public record CustomCommandRequest(long data) implements RpcRequest { @Override diff --git a/src/main/java/exchange/core2/revelator/raft/CustomCommandResponse.java b/src/main/java/exchange/core2/revelator/raft/CustomCommandResponse.java index 42cb8b7..ea449e2 100644 --- a/src/main/java/exchange/core2/revelator/raft/CustomCommandResponse.java +++ b/src/main/java/exchange/core2/revelator/raft/CustomCommandResponse.java @@ -2,7 +2,7 @@ import java.nio.ByteBuffer; -public record CustomCommandResponse(long hash, int leaderNodeId, boolean success) implements RpcResponse { +public record CustomCommandResponse(int hash, int leaderNodeId, boolean success) implements RpcResponse { @Override public int getMessageType() { @@ -11,14 +11,14 @@ public int getMessageType() { @Override public void serialize(ByteBuffer buffer) { - 
buffer.putLong(hash); + buffer.putInt(hash); buffer.putInt(leaderNodeId); buffer.put(success ? (byte) 1 : (byte) 0); } public static CustomCommandResponse create(ByteBuffer buffer) { - final long hash = buffer.getLong(); + final int hash = buffer.getInt(); final int leaderNodeId = buffer.getInt(); final boolean success = buffer.get() == 1; diff --git a/src/main/java/exchange/core2/revelator/raft/CustomRsm.java b/src/main/java/exchange/core2/revelator/raft/CustomRsm.java new file mode 100644 index 0000000..91421f8 --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/CustomRsm.java @@ -0,0 +1,19 @@ +package exchange.core2.revelator.raft; + +import org.agrona.collections.Hashing; + +public class CustomRsm implements ReplicatedStateMachine { + + int hash = 0; + + @Override + public int apply(long value) { + hash = Hashing.hash(hash ^ Hashing.hash(value)); + return hash; + } + + @Override + public int getState() { + return hash; + } +} diff --git a/src/main/java/exchange/core2/revelator/raft/RaftLogEntry.java b/src/main/java/exchange/core2/revelator/raft/RaftLogEntry.java index ef97bce..26cfe6e 100644 --- a/src/main/java/exchange/core2/revelator/raft/RaftLogEntry.java +++ b/src/main/java/exchange/core2/revelator/raft/RaftLogEntry.java @@ -1,16 +1,31 @@ package exchange.core2.revelator.raft; +import java.nio.ByteBuffer; + /** * each entry contains command for state machine, and term when entry was received by leader */ public class RaftLogEntry { // term when entry was received by leader - public final long term; - public final String cmd; + public final int term; + + // command + public final long cmd; - public RaftLogEntry(long term, String cmd) { + public RaftLogEntry(int term, long cmd) { this.term = term; this.cmd = cmd; } + + public void serialize(ByteBuffer buffer) { + buffer.putInt(term); + buffer.putLong(cmd); + } + + public static RaftLogEntry create(ByteBuffer buffer) { + final int term = buffer.getInt(); + final long cmd = buffer.getLong(); + 
return new RaftLogEntry(term, cmd); + } } diff --git a/src/main/java/exchange/core2/revelator/raft/RaftLogRepository.java b/src/main/java/exchange/core2/revelator/raft/RaftLogRepository.java new file mode 100644 index 0000000..5add984 --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/RaftLogRepository.java @@ -0,0 +1,83 @@ +package exchange.core2.revelator.raft; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; + +public class RaftLogRepository { + + + private static final Logger log = LoggerFactory.getLogger(RaftLogRepository.class); + + private final List logEntries = new ArrayList<>(); // TODO change to persistent storage with long-index + + public RaftLogEntry getEntry(long index) { + return logEntries.get((int) index - 1); + } + + public long getLastLogIndex() { + return logEntries.size(); // 0 = no records + } + + public int getLastLogTerm() { + if (logEntries.isEmpty()) { + return 0; // return term 0 by default + } else { + return logEntries.get(logEntries.size() - 1).term; + } + } + + public long append(final RaftLogEntry logEntry) { + logEntries.add(logEntry); + return logEntries.size(); // starting from index=1 + } + + + // size = 5 + // pos 0 1 2 3 4 5 6 7 <- array positions + // idx 1 2 3 4 5 <- existing records + + // last 5 + // new 6 7 8 + + // last 3 + // check 4 5 + // new 6 7 8 + + // TODO unittest + + public void appendOrOverride(final List newEntries, long prevLogIndex) { + + for (int i = 0; i < newEntries.size(); i++) { + final RaftLogEntry newEntry = newEntries.get(i); + if ((prevLogIndex + i) < logEntries.size()) { + final int pos = (int) prevLogIndex + i; + final int existingTerm = logEntries.get(pos).term; + + // 3. 
If an existing entry conflicts with a new one (same index but different terms), + // delete the existing entry and all that follow it + + if (newEntry.term != existingTerm) { + log.debug("Remove all records after index={}, because term is different: {} (old={})", pos + 1, newEntry.term, existingTerm); + int lastIdxToRemove = logEntries.size(); + if (lastIdxToRemove > pos + 1) { + logEntries.subList(pos + 1, lastIdxToRemove).clear(); + } + } + } else { + logEntries.add(newEntry); // TODO inefficient, because normally records are simply appended as batch + } + } + } + + // 1 + public List getEntriesStartingFrom(long nextIndex) { + if (getLastLogIndex() < nextIndex) { + return List.of(); + } + + return logEntries.subList((int) nextIndex - 1, logEntries.size()); + } +} diff --git a/src/main/java/exchange/core2/revelator/raft/RaftNode.java b/src/main/java/exchange/core2/revelator/raft/RaftNode.java index 0984e87..a99c6e0 100644 --- a/src/main/java/exchange/core2/revelator/raft/RaftNode.java +++ b/src/main/java/exchange/core2/revelator/raft/RaftNode.java @@ -1,21 +1,24 @@ package exchange.core2.revelator.raft; +import org.eclipse.collections.impl.map.mutable.primitive.LongObjectHashMap; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.ArrayList; +import java.net.InetAddress; +import java.util.Arrays; import java.util.List; import java.util.Map; -public class RaftNode { +public class RaftNode { - private static final Logger logger = LoggerFactory.getLogger(RaftNode.class); + private static final Logger log = LoggerFactory.getLogger(RaftNode.class); public static final int HEARTBEAT_TIMEOUT_MS = 2000 + (int) (Math.random() * 500); public static final int HEARTBEAT_LEADER_RATE_MS = 1000; public static final int ELECTION_TIMEOUT_MIN_MS = 2500; public static final int ELECTION_TIMEOUT_MAX_MS = 2800; + public static final int APPEND_REPLY_TIMEOUT_MAX_MS = 20; public static final int CLUSTER_SIZE = 3; public static final int VOTES_REQUIRED = 2; @@ 
-30,8 +33,7 @@ public class RaftNode { private int votedFor = -1; // log entries; each entry contains command for state machine, and term when entry was received by leader (first index is 1) - private final List log = new ArrayList<>(); // TODO change to persistent storage with long-index - + private final RaftLogRepository logRepository = new RaftLogRepository(); /* **** Volatile state on all servers: */ @@ -46,11 +48,28 @@ public class RaftNode { /* **** Volatile state on leaders: (Reinitialized after election) */ // for each server, index of the next log entry to send to that server (initialized to leader last log index + 1) + // + // The leader maintains a nextIndex for each follower, which is the index of the next log entry the leader will + // send to that follower. When a leader first comes to power, it initializes all nextIndex values to the index just after the + // last one in its log (11 in Figure 7). If a follower’s log is inconsistent with the leader’s, the AppendEntries consistency + // check will fail in the next AppendEntries RPC. After a rejection, the leader decrements nextIndex and retries + // the AppendEntries RPC. Eventually nextIndex will reach a point where the leader and follower logs match. When + // this happens, AppendEntries will succeed, which removes any conflicting entries in the follower’s log and appends + // entries from the leader’s log (if any). Once AppendEntries succeeds, the follower’s log is consistent with the leader’s, + // and it will remain that way for the rest of the term. 
private final long[] nextIndex = new long[3]; // for each server, index of the highest log entry known to be replicated on server (initialized to 0, increases monotonically) private final long[] matchIndex = new long[3]; + // EXTRA: ending only one addRecordsMessage to each server + private final long[] correlationIds = new long[3]; + private final long[] timeSent = new long[3]; + private final long[] sentUpTo = new long[3]; + + + private final LongObjectHashMap clientResponsesMap = new LongObjectHashMap<>(); + /* ********************************************* */ private final int currentNodeId; @@ -58,11 +77,14 @@ public class RaftNode { private final RpcService rpcService; + private final ReplicatedStateMachine rsm = new CustomRsm(); + // timers private long lastHeartBeatReceivedNs = System.nanoTime(); private long lastHeartBeatSentNs = System.nanoTime(); private long electionEndNs = System.nanoTime(); + public static void main(String[] args) { final int thisNodeId = Integer.parseInt(args[0]); @@ -84,8 +106,8 @@ public RaftNode(int thisNodeId) { RpcHandler handler = new RpcHandler() { @Override - public RpcResponse handleRequest(int fromNodeId, RpcRequest req) { - logger.debug("INCOMING REQ {} >>> {}", fromNodeId, req); + public RpcResponse handleNodeRequest(int fromNodeId, RpcRequest req) { + log.debug("INCOMING REQ {} >>> {}", fromNodeId, req); if (req instanceof CmdRaftVoteRequest voteRequest) { @@ -96,65 +118,97 @@ public RpcResponse handleRequest(int fromNodeId, RpcRequest req) { least as up-to-date as receiver’s log, grant vote (5.2, 5.4) */ if (voteRequest.term() < currentTerm) { - logger.debug("Reject vote for {} - term is old", fromNodeId); + log.debug("Reject vote for {} - term is old", fromNodeId); return new CmdRaftVoteResponse(currentTerm, false); } if (voteRequest.term() > currentTerm) { - logger.debug("received newer term {} with vote request", voteRequest.term()); + log.debug("received newer term {} with vote request", voteRequest.term()); 
currentTerm = voteRequest.term(); votedFor = -1; // never voted in newer term switchToFollower(); resetFollowerAppendTimer(); } - if (votedFor != -1 && votedFor != currentNodeId) { - logger.debug("Reject vote for {} - already voted for {}", fromNodeId, votedFor); + if (votedFor != -1) { +// if (votedFor != -1 && votedFor != currentNodeId) { + log.debug("Reject vote for {} - already voted for {}", fromNodeId, votedFor); return new CmdRaftVoteResponse(currentTerm, false); } - logger.debug("VOTE GRANTED for {}", fromNodeId); + log.debug("VOTE GRANTED for {}", fromNodeId); votedFor = fromNodeId; return new CmdRaftVoteResponse(currentTerm, true); } } - if (req instanceof CmdRaftAppendEntries appendEntriesCmd) { + if (req instanceof CmdRaftAppendEntries cmd) { synchronized (this) { - if (appendEntriesCmd.term() < currentTerm) { - logger.debug("Ignoring leader with older term {} (current={}", appendEntriesCmd.term(), currentTerm); + // 1. Reply false if term < currentTerm + if (cmd.term() < currentTerm) { + log.debug("Ignoring leader with older term {} (current={}", cmd.term(), currentTerm); return new CmdRaftAppendEntriesResponse(currentTerm, false); } if (currentState == RaftNodeState.CANDIDATE) { - /* While waiting for votes, a candidate may receive an AppendEntries RPC from another server claiming to be leader. - If the leader’s term (included in its RPC) is at least as large as the candidate’s current term, - then the candidate recognizes the leader as legitimate and returns to follower state. - If the term in the RPC is smaller than the candidate’s current term, - then the candidate rejects the RPC and continues in candidate state. */ + /* While waiting for votes, a candidate may receive an AppendEntries RPC from another server claiming to be leader. + If the leader’s term (included in its RPC) is at least as large as the candidate’s current term, + then the candidate recognizes the leader as legitimate and returns to follower state. 
+ If the term in the RPC is smaller than the candidate’s current term, + then the candidate rejects the RPC and continues in candidate state. */ - logger.debug("Switch from Candidate to follower"); + log.debug("Switch from Candidate to follower"); switchToFollower(); + } -// electionTimer.cancel(); + if (cmd.term() > currentTerm) { + log.info("Update term {}->{}", currentTerm, cmd.term()); + currentTerm = cmd.term(); + switchToFollower(); + } + if (currentState == RaftNodeState.FOLLOWER && votedFor != cmd.leaderId()) { + log.info("Changed votedFor to {}", cmd.leaderId()); + votedFor = cmd.leaderId(); // to inform client who accessing followers + } - } else { + resetFollowerAppendTimer(); - // TODO add records + if (cmd.entries().isEmpty()) { + return new CmdRaftAppendEntriesResponse(currentTerm, true); + } - if (appendEntriesCmd.term() > currentTerm) { - logger.info("Update term {}->{}", currentTerm, appendEntriesCmd.term()); - currentTerm = appendEntriesCmd.term(); - } + log.debug("Adding new records into the log"); - resetFollowerAppendTimer(); + // 2. Reply false if log doesn’t contain an entry at prevLogIndex whose term matches prevLogTerm + final long prevLogIndex = cmd.prevLogIndex(); + if (prevLogIndex >= logRepository.getLastLogIndex()) { + log.warn("Reject - log doesn’t contain an entry at prevLogIndex={}", logRepository.getLastLogIndex()); + return new CmdRaftAppendEntriesResponse(currentTerm, false); + } + final int lastLogTerm = logRepository.getLastLogTerm(); + if (cmd.prevLogTerm() != lastLogTerm) { + log.warn("Reject - log last record has different term {}, expected prevLogTerm={}", lastLogTerm, cmd.prevLogTerm()); + return new CmdRaftAppendEntriesResponse(currentTerm, false); } + + // 3. If an existing entry conflicts with a new one (same index but different terms), + // delete the existing entry and all that follow it + // 4. Append any new entries not already in the log + logRepository.appendOrOverride(cmd.entries(), prevLogIndex); + + // 5. 
If leaderCommit > commitIndex, set commitIndex = min(leaderCommit, index of last new entry) + if (cmd.leaderCommit() > commitIndex) { + commitIndex = Math.min(cmd.leaderCommit(), logRepository.getLastLogIndex()); + log.debug("set commitIndex to {}", commitIndex); + } + + applyPendingEntriesToStateMachine(); } } @@ -162,8 +216,8 @@ public RpcResponse handleRequest(int fromNodeId, RpcRequest req) { } @Override - public void handleResponse(int fromNodeId, RpcResponse resp) { - logger.debug("INCOMING RESP {} >>> {}", fromNodeId, resp); + public void handleNodeResponse(int fromNodeId, RpcResponse resp, long correlationId) { + log.debug("INCOMING RESP {} >>> {}", fromNodeId, resp); /* A candidate wins an election if it receives votes from a majority of the servers in the full cluster for the same @@ -177,23 +231,68 @@ public void handleResponse(int fromNodeId, RpcResponse resp) { switchToLeader(); } } + } else if (resp instanceof final CmdRaftAppendEntriesResponse appendResponse) { + synchronized (this) { + if (appendResponse.success() && correlationId == correlationIds[fromNodeId]) { + + timeSent[fromNodeId] = 0L; + matchIndex[fromNodeId] = sentUpTo[fromNodeId]; + nextIndex[fromNodeId] = sentUpTo[fromNodeId] + 1; + } + + } + } + + } + + + @Override + public CustomCommandResponse handleClientRequest(final InetAddress address, + final int port, + final long correlationId, + final CustomCommandRequest request) { + + synchronized (this) { + + if (currentState == RaftNodeState.LEADER) { + // If command received from client: append entry to local log, + // respond after entry applied to state machine (5.3) + + final int prevLogTerm = logRepository.getLastLogTerm(); + final long prevLogIndex = logRepository.getLastLogIndex(); + + // adding new record into the local log + final RaftLogEntry logEntry = new RaftLogEntry(currentTerm, request.data()); + final long index = logRepository.append(logEntry); + + // remember client request (TODO !! 
on batch migration - should refer to the last record) + clientResponsesMap.put(index, new ClientAddress(address, port, correlationId)); + + } else { + // inform client about different leader + return new CustomCommandResponse(0, votedFor, false); + } } + + return null; } + }; // todo remove from constructor rpcService = new RpcService(remoteNodes, handler, thisNodeId); - logger.info("HEARTBEAT_TIMEOUT_MS={}", HEARTBEAT_TIMEOUT_MS); - logger.info("ELECTION_TIMEOUT_MS={}..{}", ELECTION_TIMEOUT_MIN_MS, ELECTION_TIMEOUT_MAX_MS); + log.info("HEARTBEAT_TIMEOUT_MS={}", HEARTBEAT_TIMEOUT_MS); + log.info("ELECTION_TIMEOUT_MS={}..{}", ELECTION_TIMEOUT_MIN_MS, ELECTION_TIMEOUT_MAX_MS); - logger.info("Starting node {} as follower...", thisNodeId); + log.info("Starting node {} as follower...", thisNodeId); resetFollowerAppendTimer(); new Thread(this::workerThread).start(); } + private void workerThread() { try { @@ -221,22 +320,58 @@ private void workerThread() { if (currentState == RaftNodeState.LEADER) { - final long t = System.nanoTime(); - if (t > lastHeartBeatSentNs + HEARTBEAT_LEADER_RATE_MS * 1_000_000L) { - - lastHeartBeatSentNs = t; - - logger.info("Sending heartbeats to {}, term={}", otherNodes, currentTerm); - final CmdRaftAppendEntries heartBeatReq = new CmdRaftAppendEntries( - currentTerm, - currentNodeId, - lastApplied, - 429384628, - List.of(), - commitIndex); - rpcService.callRpcAsync(heartBeatReq, otherNodes[0]); - rpcService.callRpcAsync(heartBeatReq, otherNodes[1]); - } + + // If last log index >= nextIndex for a follower: send AppendEntries RPC with log entries starting at nextIndex + // If successful: update nextIndex and matchIndex for follower (5.3) + // If AppendEntries fails because of log inconsistency: decrement nextIndex and retry (5.3) + + + final int prevLogTerm = logRepository.getLastLogTerm(); + final long prevLogIndex = logRepository.getLastLogIndex(); + + Arrays.stream(otherNodes).forEach(targetNodeId -> { + + final long nextIndexForNode = 
nextIndex[targetNodeId]; + + final long t = System.nanoTime(); + final boolean timeToSendHeartbeat = t > lastHeartBeatSentNs + HEARTBEAT_LEADER_RATE_MS * 1_000_000L; + + // have records and did not send batch recently + final boolean canRetry = prevLogIndex >= nextIndexForNode + && (correlationIds[targetNodeId] == 0L || t > timeSent[targetNodeId] + APPEND_REPLY_TIMEOUT_MAX_MS * 1_000_000L); + + if (canRetry || timeToSendHeartbeat) { + + final List newEntries = logRepository.getEntriesStartingFrom(nextIndexForNode); + + final CmdRaftAppendEntries appendRequest = new CmdRaftAppendEntries( + currentTerm, + currentNodeId, + prevLogIndex, + prevLogTerm, + newEntries, + commitIndex); + + log.info("Sending {} entries to {}, term={}", newEntries.size(), otherNodes, currentTerm); + + final long corrId = rpcService.callRpcAsync(appendRequest, targetNodeId); + + if (!newEntries.isEmpty()) { + correlationIds[targetNodeId] = corrId; + timeSent[targetNodeId] = System.nanoTime(); + sentUpTo[targetNodeId] = nextIndexForNode + newEntries.size(); + } + } + }); + +// if (System.nanoTime() > lastHeartBeatSentNs + HEARTBEAT_LEADER_RATE_MS * 1_000_000L) { +// +// log.info("Sending heartbeats to {}, term={}", otherNodes, currentTerm); +// +// sendAppendEntries(prevLogTerm, prevLogIndex, List.of()); +// } + + } @@ -262,7 +397,7 @@ private void switchToFollower() { // } if (currentState != RaftNodeState.FOLLOWER) { - logger.debug("Switching to follower (reset votedFor, start append timer)"); + log.debug("Switching to follower (reset votedFor, start append timer)"); currentState = RaftNodeState.FOLLOWER; votedFor = -1; resetFollowerAppendTimer(); @@ -270,96 +405,6 @@ private void switchToFollower() { } -// /** -// * Receiver implementation:

-// * 1. Reply false if term < currentTerm (5.1)

-// * 2. Reply false if log doesn't contain an entry at prevLogIndex whose term matches prevLogTerm (5.3)

-// * 3. If an existing entry conflicts with a new one (same index but different terms), delete the existing entry and all that follow it (5.3)

-// * 4. Append any new entries not already in the log

-// * 5. If leaderCommit > commitIndex, set commitIndex = min(leaderCommit, index of last new entry)

-// */ -// public synchronized CmdRaftAppendEntriesResponse appendEntries(CmdRaftAppendEntries cmd) { -// -// // 1. Reply false if term < currentTerm -// // If the term in the RPC is smaller than the candidate’s current term, then the candidate rejects the RPC and continues in candidate state. -// if (cmd.term < currentTerm) { -// logger.debug("term < currentTerm"); -// return new CmdRaftAppendEntriesResponse(currentTerm, false); -// } -// -// // If the leader’s term (included in its RPC) is at least as large as the candidate’s current term, then the candidate -// // recognizes the leader as legitimate and returns to follower state. I -// checkTerm(cmd.term); -// -// // 2. Reply false if log doesn't contain an entry at prevLogIndex whose term matches prevLogTerm -// if (cmd.prevLogIndex >= log.size() || log.get((int) cmd.prevLogIndex).term != cmd.prevLogTerm) { -// logger.debug("log doesn't contain an entry at prevLogIndex whose term matches prevLogTerm"); -// return new CmdRaftAppendEntriesResponse(currentTerm, false); -// } -// -// // TODO 3. If an existing entry conflicts with a new one (same index but different terms), delete the existing entry and all that follow it (5.3) -// -// // Append any new entries not already in the log -// -// resetFollowerAppendTimer(); -// -// // If leaderCommit > commitIndex, set commitIndex = min(leaderCommit, index of last new entry) -// if (cmd.leaderCommit > commitIndex) { -// commitIndex = Math.min(cmd.leaderCommit, log.size() - 1); -// } -// -// return new CmdRaftAppendEntriesResponse(currentTerm, true); -// } -// -// /** -// * Receiver implementation: -// * 1. Reply false if term < currentTerm (5.1) -// * 2. 
If votedFor is null or candidateId, and candidate’s log is at least as up-to-date as receiver’s log, grant vote (5.2, 5.4) -// */ -// public synchronized CmdRaftVoteResponse handleVoteRequest(CmdRaftVoteRequest request) { -// -// if (request.term < currentTerm) { -// return new CmdRaftVoteResponse(currentTerm, false); -// } -// -// checkTerm(request.term); -// -// -// final boolean notVotedYet = votedFor == -1 || votedFor == request.candidateId; -// final boolean logIsUpToDate = request.lastLogIndex >= commitIndex; // TODO commitIndex or lastApplied? -// if (notVotedYet && logIsUpToDate) { -// // vote for candidate -// votedFor = request.candidateId; -// return new CmdRaftVoteResponse(currentTerm, true); -// } else { -// // reject voting -// return new CmdRaftVoteResponse(currentTerm, false); -// } -// -// } -// -// -// private void checkTerm(int term) { -// // All servers: If RPC request or response contains term T > currentTerm: set currentTerm = T, convert to follower (5.1) -// if (term > currentTerm) { -// logger.info("Newer term={} received from new leader, switching to FOLLOWER", term); -// currentTerm = term; -// -// if (currentState == RaftNodeState.LEADER) { -// heartbeatLeaderExecutor.shutdown(); -// } -// -// currentState = RaftNodeState.FOLLOWER; -// } -// } - - - private void broadcast(RaftMessage message) { - - logger.debug("Sending: {}", message); - - } - private synchronized void resetFollowerAppendTimer() { // logger.debug("reset append timer"); lastHeartBeatReceivedNs = System.nanoTime(); @@ -388,40 +433,89 @@ private synchronized void appendTimeout() { // - Send RequestVote RPCs to all other servers currentTerm++; - logger.info("heartbeat timeout - switching to CANDIDATE, term={}", currentTerm); + log.info("heartbeat timeout - switching to CANDIDATE, term={}", currentTerm); votedFor = currentNodeId; + final int prevLogTerm = logRepository.getLastLogTerm(); + final long prevLogIndex = logRepository.getLastLogIndex(); final CmdRaftVoteRequest 
voteReq = new CmdRaftVoteRequest( currentTerm, currentNodeId, - lastApplied, - 429384628); // TODO extract from log! + prevLogIndex, + prevLogTerm); rpcService.callRpcAsync(voteReq, otherNodes[0]); rpcService.callRpcAsync(voteReq, otherNodes[1]); final int timeoutMs = ELECTION_TIMEOUT_MIN_MS + (int) (Math.random() * (ELECTION_TIMEOUT_MAX_MS - ELECTION_TIMEOUT_MIN_MS)); - logger.debug("ElectionTimeout: {}ms", timeoutMs); + log.debug("ElectionTimeout: {}ms", timeoutMs); electionEndNs = System.nanoTime() + timeoutMs * 1_000_000L; } private void switchToLeader() { - logger.info("Becoming a LEADER!"); + log.info("Becoming a LEADER!"); currentState = RaftNodeState.LEADER; - // TODO init // for each server, index of the next log entry to send to that server (initialized to leader last log index + 1) -// private final long[] nextIndex = new long[3]; + final long next = logRepository.getLastLogIndex() + 1; + Arrays.fill(nextIndex, next); // for each server, index of the highest log entry known to be replicated on server (initialized to 0, increases monotonically) -// private final long[] matchIndex = new long[3]; + Arrays.fill(matchIndex, 0); + } + + + private void sendAppendEntries(int prevLogTerm, long prevLogIndex, List logEntries) { + + final CmdRaftAppendEntries appendRequest = new CmdRaftAppendEntries( + currentTerm, + currentNodeId, + prevLogIndex, + prevLogTerm, + logEntries, + commitIndex); + rpcService.callRpcAsync(appendRequest, otherNodes[0]); + rpcService.callRpcAsync(appendRequest, otherNodes[1]); + + lastHeartBeatSentNs = System.nanoTime(); + } + + private void applyPendingEntriesToStateMachine() { + + /* + All Servers: If commitIndex > lastApplied: increment lastApplied, apply log[lastApplied] to state machine (5.3) + */ + while (lastApplied < commitIndex) { + lastApplied++; + final RaftLogEntry raftLogEntry = logRepository.getEntry(lastApplied); + final int result = rsm.apply(raftLogEntry.cmd); + + if (currentState == RaftNodeState.LEADER) { + + // respond 
to client that batch has applied + final ClientAddress c = clientResponsesMap.get(lastApplied); + rpcService.respondToClient( + c.address, + c.port, + c.correlationId, + new CustomCommandResponse(result, currentNodeId, true)); + } + + } + + } + public enum RaftNodeState { FOLLOWER, CANDIDATE, LEADER } + + // TODO can move to RpcService + private record ClientAddress(InetAddress address, int port, long correlationId) { + } } \ No newline at end of file diff --git a/src/main/java/exchange/core2/revelator/raft/ReplicatedStateMachine.java b/src/main/java/exchange/core2/revelator/raft/ReplicatedStateMachine.java new file mode 100644 index 0000000..c5783c6 --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/ReplicatedStateMachine.java @@ -0,0 +1,10 @@ +package exchange.core2.revelator.raft; + +public interface ReplicatedStateMachine { + + // TODO switch to custom messages + int apply(long value); + + int getState(); + +} diff --git a/src/main/java/exchange/core2/revelator/raft/RpcHandler.java b/src/main/java/exchange/core2/revelator/raft/RpcHandler.java index 9ca57de..74d639b 100644 --- a/src/main/java/exchange/core2/revelator/raft/RpcHandler.java +++ b/src/main/java/exchange/core2/revelator/raft/RpcHandler.java @@ -1,9 +1,13 @@ package exchange.core2.revelator.raft; +import java.net.InetAddress; + public interface RpcHandler { - RpcResponse handleRequest(int nodeId, RpcRequest request); + RpcResponse handleNodeRequest(int nodeId, RpcRequest request); + + void handleNodeResponse(int nodeId, RpcResponse response, long correlationId); - void handleResponse(int nodeId, RpcResponse response); + CustomCommandResponse handleClientRequest(InetAddress address, int port, long correlationId, CustomCommandRequest request); } diff --git a/src/main/java/exchange/core2/revelator/raft/RpcService.java b/src/main/java/exchange/core2/revelator/raft/RpcService.java index 5fff099..64e7d42 100644 --- a/src/main/java/exchange/core2/revelator/raft/RpcService.java +++ 
b/src/main/java/exchange/core2/revelator/raft/RpcService.java @@ -5,9 +5,7 @@ import org.slf4j.LoggerFactory; import java.io.IOException; -import java.net.DatagramPacket; -import java.net.DatagramSocket; -import java.net.InetAddress; +import java.net.*; import java.nio.ByteBuffer; import java.util.HashMap; import java.util.Map; @@ -15,17 +13,20 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicLong; -public class RpcService implements AutoCloseable { +public class RpcService implements AutoCloseable { private static final Logger logger = LoggerFactory.getLogger(RpcService.class); - private final AtomicLong correlationIdCounter = new AtomicLong(0L); + private final AtomicLong correlationIdCounter = new AtomicLong(1L); private final Map> futureMap = new ConcurrentHashMap<>(); private final Map socketMap; private final int serverPort; private final int serverNodeId; private final RpcHandler handler; + private DatagramSocket serverSocket; +// private final SerializableMessageFactory msgFactory; + private volatile boolean active = true; public RpcService(Map remoteNodes, @@ -57,67 +58,93 @@ public RpcService(Map remoteNodes, this.serverPort = socketMap.get(serverNodeId).port; this.serverNodeId = serverNodeId; + try { + this.serverSocket = new DatagramSocket(serverPort); + } catch (final SocketException ex) { + throw new RuntimeException(ex); + } + Thread t = new Thread(this::run); t.setDaemon(true); t.setName("ListenerUDP"); t.start(); + } public void run() { - try (final DatagramSocket serverSocket = new DatagramSocket(serverPort)) { - + try { logger.info("Listening at UDP {}:{}", InetAddress.getLocalHost().getHostAddress(), serverPort); + } catch (UnknownHostException ex) { + logger.warn("UnknownHostException: ", ex); + } - final byte[] receiveData = new byte[256]; // TODO set proper value + final byte[] receiveData = new byte[256]; // TODO set proper value - final DatagramPacket receivePacket = new DatagramPacket(receiveData, 
receiveData.length); + final DatagramPacket receivePacket = new DatagramPacket(receiveData, receiveData.length); - while (active) { + while (active) { - try { - serverSocket.receive(receivePacket); + try { + serverSocket.receive(receivePacket); - final ByteBuffer bb = ByteBuffer.wrap(receivePacket.getData(), 0, receivePacket.getLength()); + final ByteBuffer bb = ByteBuffer.wrap(receivePacket.getData(), 0, receivePacket.getLength()); - final int nodeId = bb.getInt(); - final int messageType = bb.getInt(); - final long correlationId = bb.getLong(); + final int nodeId = bb.getInt(); + final int messageType = bb.getInt(); + final long correlationId = bb.getLong(); // logger.debug("RECEIVED from {} mt={}: {}", nodeId, messageType, PrintBufferUtil.hexDump(receivePacket.getData(), 0, receivePacket.getLength())); - final RpcMessage msg = createByType(messageType, bb); + final RpcMessage msg = createByType(messageType, bb); + // TODO use msgFactory + + if (messageType < 0) { + // handler response - if (messageType < 0) { - final CompletableFuture future = futureMap.remove(correlationId); - if (future != null) { - future.complete((RpcResponse) msg); - } else { - handler.handleResponse(nodeId, (RpcResponse) msg); + final CompletableFuture future = futureMap.remove(correlationId); + if (future != null) { + // complete future for future-based-calls + future.complete((RpcResponse) msg); + } else { + // handle response for full-async-calls + handler.handleNodeResponse(nodeId, (RpcResponse) msg, correlationId); + } + + } else { + + if (msg instanceof CustomCommandRequest) { + + final InetAddress address = receivePacket.getAddress(); + final int port = receivePacket.getPort(); + + final CustomCommandResponse response = handler.handleClientRequest(address, port, correlationId, (CustomCommandRequest) msg); + if (response != null) { + respondToClient(address, port, correlationId, response); } } else { - final RpcResponse response = handler.handleRequest(nodeId, (RpcRequest) msg); + // 
handle request + final RpcResponse response = handler.handleNodeRequest(nodeId, (RpcRequest) msg); + // send response if (response != null) { sendResponse(nodeId, correlationId, response); } } - } catch (Exception ex) { - String message = PrintBufferUtil.hexDump(receivePacket.getData(), 0, receivePacket.getLength()); - logger.error("Failed to process message from {}: {}", receivePacket.getAddress().getHostAddress(), message, ex); - } - } - logger.info("UDP server shutdown"); + } - } catch (final IOException ex) { - logger.error("Error in service thread", ex); - throw new RuntimeException(ex); + } catch (final Exception ex) { + String message = PrintBufferUtil.hexDump(receivePacket.getData(), 0, receivePacket.getLength()); + logger.error("Failed to process message from {}: {}", receivePacket.getAddress().getHostAddress(), message, ex); + } } + logger.info("UDP server shutdown"); + serverSocket.close(); } static RpcMessage createByType(int messageType, ByteBuffer buffer) { @@ -145,10 +172,11 @@ private void sendResponse(int callerNodeId, long correlationId, RpcResponse resp } - public void callRpcAsync(RpcRequest request, int toNodeId) { + public long callRpcAsync(RpcRequest request, int toNodeId) { final long correlationId = correlationIdCounter.incrementAndGet(); callRpc(request, toNodeId, correlationId); + return correlationId; } public CompletableFuture callRpcSync(RpcRequest request, int toNodeId) { @@ -190,6 +218,31 @@ private void send(int nodeId, byte[] data, int length) { } } + public void respondToClient(InetAddress address, int port, long correlationId, RpcResponse response) { + + final byte[] array = new byte[64]; + ByteBuffer bb = ByteBuffer.wrap(array); + + // put only correlationId into the header + bb.putLong(correlationId); + response.serialize(bb); + + respondToClient(address, port, array, bb.position()); + } + + private void respondToClient(InetAddress address, int port, byte[] data, int length) { + + final DatagramPacket packet = new 
DatagramPacket(data, length, address, port); + + try { + + serverSocket.send(packet); + + } catch (IOException ex) { + throw new RuntimeException(ex); + } + } + @Override public void close() throws Exception { @@ -200,7 +253,7 @@ public void close() throws Exception { public static final class RemoteUdpSocket { - private final DatagramSocket socket; + private final DatagramSocket socket; // TODO remove private final InetAddress address; private final int port; diff --git a/src/main/java/exchange/core2/revelator/raft/SerializableMessage.java b/src/main/java/exchange/core2/revelator/raft/SerializableMessage.java new file mode 100644 index 0000000..cdd70a2 --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/SerializableMessage.java @@ -0,0 +1,9 @@ +package exchange.core2.revelator.raft; + +import java.nio.ByteBuffer; + +public interface SerializableMessage { + + void serialize(ByteBuffer buffer); + +} diff --git a/src/main/java/exchange/core2/revelator/raft/SerializableMessageFactory.java b/src/main/java/exchange/core2/revelator/raft/SerializableMessageFactory.java new file mode 100644 index 0000000..78b2042 --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/SerializableMessageFactory.java @@ -0,0 +1,9 @@ +package exchange.core2.revelator.raft; + +import java.nio.ByteBuffer; + +public interface SerializableMessageFactory { + + T create(ByteBuffer buffer); + +} From e34ccd7ae40097df7423cb7259953ee707745f3e Mon Sep 17 00:00:00 2001 From: Maksim Zheravin Date: Sat, 29 Jan 2022 14:20:12 +0200 Subject: [PATCH 09/15] RAFT: bugfixes, update commitIndex --- .../revelator/raft/RaftLogRepository.java | 19 +++++- .../core2/revelator/raft/RaftNode.java | 60 ++++++++----------- 2 files changed, 42 insertions(+), 37 deletions(-) diff --git a/src/main/java/exchange/core2/revelator/raft/RaftLogRepository.java b/src/main/java/exchange/core2/revelator/raft/RaftLogRepository.java index 5add984..36ecc15 100644 --- 
a/src/main/java/exchange/core2/revelator/raft/RaftLogRepository.java +++ b/src/main/java/exchange/core2/revelator/raft/RaftLogRepository.java @@ -5,6 +5,7 @@ import java.util.ArrayList; import java.util.List; +import java.util.Optional; public class RaftLogRepository { @@ -17,6 +18,22 @@ public RaftLogEntry getEntry(long index) { return logEntries.get((int) index - 1); } + public Optional getEntryOpt(long index) { + return Optional.ofNullable(logEntries.get((int) index - 1)); + } + + public long lastEntryInTerm(long indexAfter, long indexBeforeIncl, int term) { + + int idx = (int) indexAfter; + for (int i = (int) indexAfter + 1; i <= indexBeforeIncl; i++) { + if (logEntries.get(idx - 1).term == term) { + idx = i; + } + } + return idx; + } + + public long getLastLogIndex() { return logEntries.size(); // 0 = no records } @@ -78,6 +95,6 @@ public List getEntriesStartingFrom(long nextIndex) { return List.of(); } - return logEntries.subList((int) nextIndex - 1, logEntries.size()); + return logEntries.subList((int) nextIndex - 1, logEntries.size()); } } diff --git a/src/main/java/exchange/core2/revelator/raft/RaftNode.java b/src/main/java/exchange/core2/revelator/raft/RaftNode.java index a99c6e0..e0c29c2 100644 --- a/src/main/java/exchange/core2/revelator/raft/RaftNode.java +++ b/src/main/java/exchange/core2/revelator/raft/RaftNode.java @@ -20,9 +20,6 @@ public class RaftNode { public static final int ELECTION_TIMEOUT_MAX_MS = 2800; public static final int APPEND_REPLY_TIMEOUT_MAX_MS = 20; - public static final int CLUSTER_SIZE = 3; - public static final int VOTES_REQUIRED = 2; - /* **** Persistent state on all servers: (Updated on stable storage before responding to RPCs) */ @@ -66,6 +63,7 @@ public class RaftNode { private final long[] correlationIds = new long[3]; private final long[] timeSent = new long[3]; private final long[] sentUpTo = new long[3]; + private final long[] lastHeartBeatSentNs = new long[3]; private final LongObjectHashMap clientResponsesMap = new 
LongObjectHashMap<>(); @@ -81,7 +79,6 @@ public class RaftNode { // timers private long lastHeartBeatReceivedNs = System.nanoTime(); - private long lastHeartBeatSentNs = System.nanoTime(); private long electionEndNs = System.nanoTime(); @@ -238,6 +235,22 @@ public void handleNodeResponse(int fromNodeId, RpcResponse resp, long correlatio timeSent[fromNodeId] = 0L; matchIndex[fromNodeId] = sentUpTo[fromNodeId]; nextIndex[fromNodeId] = sentUpTo[fromNodeId] + 1; + + // If there exists an N such that + // N > commitIndex, a majority of matchIndex[i] >= N, and log[N].term == currentTerm: + // set commitIndex = N (5.3, 5.4). + + + if (matchIndex[fromNodeId] > commitIndex) { + final long newCommitIndex = Math.max( + Math.max(commitIndex, matchIndex[fromNodeId]), + logRepository.lastEntryInTerm(commitIndex, matchIndex[fromNodeId], currentTerm)); + + if (commitIndex != newCommitIndex) { + log.debug("updated commitIndex: {}->{}", commitIndex, newCommitIndex); + } + commitIndex = newCommitIndex; + } } } @@ -320,12 +333,10 @@ private void workerThread() { if (currentState == RaftNodeState.LEADER) { - // If last log index >= nextIndex for a follower: send AppendEntries RPC with log entries starting at nextIndex // If successful: update nextIndex and matchIndex for follower (5.3) // If AppendEntries fails because of log inconsistency: decrement nextIndex and retry (5.3) - final int prevLogTerm = logRepository.getLastLogTerm(); final long prevLogIndex = logRepository.getLastLogIndex(); @@ -334,12 +345,16 @@ private void workerThread() { final long nextIndexForNode = nextIndex[targetNodeId]; final long t = System.nanoTime(); - final boolean timeToSendHeartbeat = t > lastHeartBeatSentNs + HEARTBEAT_LEADER_RATE_MS * 1_000_000L; + final boolean timeToSendHeartbeat = t > lastHeartBeatSentNs[targetNodeId] + HEARTBEAT_LEADER_RATE_MS * 1_000_000L; + + //log.debug("timeToSendHeartbeat={}",timeToSendHeartbeat); // have records and did not send batch recently final boolean canRetry = 
prevLogIndex >= nextIndexForNode && (correlationIds[targetNodeId] == 0L || t > timeSent[targetNodeId] + APPEND_REPLY_TIMEOUT_MAX_MS * 1_000_000L); + //log.debug("canRetry={}",canRetry); + if (canRetry || timeToSendHeartbeat) { final List newEntries = logRepository.getEntriesStartingFrom(nextIndexForNode); @@ -361,17 +376,11 @@ private void workerThread() { timeSent[targetNodeId] = System.nanoTime(); sentUpTo[targetNodeId] = nextIndexForNode + newEntries.size(); } + + lastHeartBeatSentNs[targetNodeId] = System.nanoTime(); } }); -// if (System.nanoTime() > lastHeartBeatSentNs + HEARTBEAT_LEADER_RATE_MS * 1_000_000L) { -// -// log.info("Sending heartbeats to {}, term={}", otherNodes, currentTerm); -// -// sendAppendEntries(prevLogTerm, prevLogIndex, List.of()); -// } - - } @@ -391,11 +400,6 @@ private void workerThread() { private void switchToFollower() { -// if (currentState == RaftNodeState.CANDIDATE && electionTimer != null) { -// logger.debug("cancelled elevtion timer"); -// electionTimer.cancel(); -// } - if (currentState != RaftNodeState.FOLLOWER) { log.debug("Switching to follower (reset votedFor, start append timer)"); currentState = RaftNodeState.FOLLOWER; @@ -466,22 +470,6 @@ private void switchToLeader() { Arrays.fill(matchIndex, 0); } - - private void sendAppendEntries(int prevLogTerm, long prevLogIndex, List logEntries) { - - final CmdRaftAppendEntries appendRequest = new CmdRaftAppendEntries( - currentTerm, - currentNodeId, - prevLogIndex, - prevLogTerm, - logEntries, - commitIndex); - rpcService.callRpcAsync(appendRequest, otherNodes[0]); - rpcService.callRpcAsync(appendRequest, otherNodes[1]); - - lastHeartBeatSentNs = System.nanoTime(); - } - private void applyPendingEntriesToStateMachine() { /* From f9c31570733e1c1004718ddc5de6dbc09588692f Mon Sep 17 00:00:00 2001 From: Maksim Zheravin Date: Sun, 30 Jan 2022 00:37:13 +0200 Subject: [PATCH 10/15] RAFT: bugfixes, rpc client implementation and testing tool --- .../core2/revelator/raft/RaftClient.java | 
44 ++-- .../core2/revelator/raft/RaftLogEntry.java | 8 + .../revelator/raft/RaftLogRepository.java | 22 +- .../core2/revelator/raft/RaftNode.java | 79 +++++--- .../core2/revelator/raft/RaftUtils.java | 59 ++++++ .../core2/revelator/raft/RpcClient.java | 190 ++++++++++++++++++ .../core2/revelator/raft/RpcService.java | 64 +----- 7 files changed, 364 insertions(+), 102 deletions(-) create mode 100644 src/main/java/exchange/core2/revelator/raft/RaftUtils.java create mode 100644 src/main/java/exchange/core2/revelator/raft/RpcClient.java diff --git a/src/main/java/exchange/core2/revelator/raft/RaftClient.java b/src/main/java/exchange/core2/revelator/raft/RaftClient.java index d986fad..9827503 100644 --- a/src/main/java/exchange/core2/revelator/raft/RaftClient.java +++ b/src/main/java/exchange/core2/revelator/raft/RaftClient.java @@ -5,7 +5,8 @@ import org.slf4j.LoggerFactory; import java.io.IOException; -import java.net.*; +import java.util.Map; +import java.util.Random; public class RaftClient { @@ -15,35 +16,34 @@ public class RaftClient { public static void main(String[] args) throws IOException, InterruptedException { final RaftClient raftClient = new RaftClient(); + Random random = new Random(1L); while (true) { - raftClient.sendEcho("TEST123"); - Thread.sleep(1000); + raftClient.sendEcho(random.nextLong()); + Thread.sleep(2000); } } - private DatagramSocket socket; - private InetAddress address; + private RpcClient rpcClient; - private byte[] buf; + public RaftClient() { - public RaftClient() throws SocketException, UnknownHostException { - socket = new DatagramSocket(); - address = InetAddress.getByName("localhost"); - } + // localhost:3778, localhost:3779, localhost:3780 + final Map remoteNodes = Map.of( + 0, "localhost:3778", + 1, "localhost:3779", + 2, "localhost:3780"); - public String sendEcho(String msg) throws IOException { - buf = msg.getBytes(); - DatagramPacket packet = new DatagramPacket(buf, buf.length, address, 3778); - log.debug(">> {}", msg); - 
socket.send(packet); - packet = new DatagramPacket(buf, buf.length); - socket.receive(packet); - String received = new String(packet.getData(), 0, packet.getLength()); - log.debug("<< {}", received); - return received; + this.rpcClient = new RpcClient(remoteNodes); } - public void close() { - socket.close(); + public void sendEcho(long data) { + try { + log.info("send >>> data={}", data); + final int hash = rpcClient.callRpcSync(data, 500); + log.info("recv <<< hash={}", hash); + } catch (Exception ex) { + log.warn("Exception: ", ex); + } } + } \ No newline at end of file diff --git a/src/main/java/exchange/core2/revelator/raft/RaftLogEntry.java b/src/main/java/exchange/core2/revelator/raft/RaftLogEntry.java index 26cfe6e..5a48d19 100644 --- a/src/main/java/exchange/core2/revelator/raft/RaftLogEntry.java +++ b/src/main/java/exchange/core2/revelator/raft/RaftLogEntry.java @@ -23,6 +23,14 @@ public void serialize(ByteBuffer buffer) { buffer.putLong(cmd); } + @Override + public String toString() { + return "RLE{" + + "t" + term + + " cmd=" + cmd + + '}'; + } + public static RaftLogEntry create(ByteBuffer buffer) { final int term = buffer.getInt(); final long cmd = buffer.getLong(); diff --git a/src/main/java/exchange/core2/revelator/raft/RaftLogRepository.java b/src/main/java/exchange/core2/revelator/raft/RaftLogRepository.java index 36ecc15..d280507 100644 --- a/src/main/java/exchange/core2/revelator/raft/RaftLogRepository.java +++ b/src/main/java/exchange/core2/revelator/raft/RaftLogRepository.java @@ -19,14 +19,20 @@ public RaftLogEntry getEntry(long index) { } public Optional getEntryOpt(long index) { - return Optional.ofNullable(logEntries.get((int) index - 1)); + if (index < 1 || index > logEntries.size()) { + return Optional.empty(); + } + + return Optional.of(logEntries.get((int) index - 1)); } public long lastEntryInTerm(long indexAfter, long indexBeforeIncl, int term) { int idx = (int) indexAfter; + for (int i = (int) indexAfter + 1; i <= indexBeforeIncl; 
i++) { - if (logEntries.get(idx - 1).term == term) { + log.debug("i={}", i); + if (logEntries.get(i - 1).term == term) { idx = i; } } @@ -67,12 +73,19 @@ public long append(final RaftLogEntry logEntry) { public void appendOrOverride(final List newEntries, long prevLogIndex) { + log.debug("appendOrOverride(newEntries={} , prevLogIndex={}", newEntries, prevLogIndex); + for (int i = 0; i < newEntries.size(); i++) { final RaftLogEntry newEntry = newEntries.get(i); + if ((prevLogIndex + i) < logEntries.size()) { + + final int pos = (int) prevLogIndex + i; final int existingTerm = logEntries.get(pos).term; + log.debug("Validating older record with index={}: existingTerm={} newEntry.term={}", pos + 1, existingTerm, newEntry.term); + // 3. If an existing entry conflicts with a new one (same index but different terms), // delete the existing entry and all that follow it @@ -84,6 +97,7 @@ public void appendOrOverride(final List newEntries, long prevLogIn } } } else { + log.debug("appendOrOverride - added {}", newEntry); logEntries.add(newEntry); // TODO inefficient, because normally records are simply appended as batch } } @@ -95,6 +109,8 @@ public List getEntriesStartingFrom(long nextIndex) { return List.of(); } - return logEntries.subList((int) nextIndex - 1, logEntries.size()); + log.debug("getEntriesStartingFrom({}): logEntries: {}", nextIndex, logEntries); + + return new ArrayList<>(logEntries.subList((int) nextIndex - 1, logEntries.size())); } } diff --git a/src/main/java/exchange/core2/revelator/raft/RaftNode.java b/src/main/java/exchange/core2/revelator/raft/RaftNode.java index e0c29c2..86a1c77 100644 --- a/src/main/java/exchange/core2/revelator/raft/RaftNode.java +++ b/src/main/java/exchange/core2/revelator/raft/RaftNode.java @@ -18,7 +18,7 @@ public class RaftNode { public static final int HEARTBEAT_LEADER_RATE_MS = 1000; public static final int ELECTION_TIMEOUT_MIN_MS = 2500; public static final int ELECTION_TIMEOUT_MAX_MS = 2800; - public static final int 
APPEND_REPLY_TIMEOUT_MAX_MS = 20; + public static final int APPEND_REPLY_TIMEOUT_MAX_MS = 1000; /* **** Persistent state on all servers: (Updated on stable storage before responding to RPCs) */ @@ -179,12 +179,12 @@ public RpcResponse handleNodeRequest(int fromNodeId, RpcRequest req) { return new CmdRaftAppendEntriesResponse(currentTerm, true); } - log.debug("Adding new records into the log"); + log.debug("Adding new records into the log..."); // 2. Reply false if log doesn’t contain an entry at prevLogIndex whose term matches prevLogTerm final long prevLogIndex = cmd.prevLogIndex(); - if (prevLogIndex >= logRepository.getLastLogIndex()) { - log.warn("Reject - log doesn’t contain an entry at prevLogIndex={}", logRepository.getLastLogIndex()); + if (prevLogIndex > 0 && prevLogIndex != logRepository.getLastLogIndex()) { + log.warn("Reject - log doesn’t contain an entry at prevLogIndex={} (last is {}))", prevLogIndex, logRepository.getLastLogIndex()); return new CmdRaftAppendEntriesResponse(currentTerm, false); } @@ -199,13 +199,18 @@ public RpcResponse handleNodeRequest(int fromNodeId, RpcRequest req) { // 4. Append any new entries not already in the log logRepository.appendOrOverride(cmd.entries(), prevLogIndex); + log.debug("Added to log repository: {}", cmd.entries()); + // 5. 
If leaderCommit > commitIndex, set commitIndex = min(leaderCommit, index of last new entry) if (cmd.leaderCommit() > commitIndex) { commitIndex = Math.min(cmd.leaderCommit(), logRepository.getLastLogIndex()); log.debug("set commitIndex to {}", commitIndex); } + // todo can do in different thread applyPendingEntriesToStateMachine(); + + return new CmdRaftAppendEntriesResponse(currentTerm, true); } } @@ -230,29 +235,49 @@ public void handleNodeResponse(int fromNodeId, RpcResponse resp, long correlatio } } else if (resp instanceof final CmdRaftAppendEntriesResponse appendResponse) { synchronized (this) { - if (appendResponse.success() && correlationId == correlationIds[fromNodeId]) { + if (correlationId == correlationIds[fromNodeId]) { + timeSent[fromNodeId] = 0L; - matchIndex[fromNodeId] = sentUpTo[fromNodeId]; - nextIndex[fromNodeId] = sentUpTo[fromNodeId] + 1; - // If there exists an N such that - // N > commitIndex, a majority of matchIndex[i] >= N, and log[N].term == currentTerm: - // set commitIndex = N (5.3, 5.4). + if (appendResponse.success()) { + + log.debug("current sentUpTo[{}]={}", fromNodeId, sentUpTo[fromNodeId]); + + matchIndex[fromNodeId] = sentUpTo[fromNodeId]; + nextIndex[fromNodeId] = sentUpTo[fromNodeId] + 1; + + // If there exists an N such that + // N > commitIndex, a majority of matchIndex[i] >= N, and log[N].term == currentTerm: + // set commitIndex = N (5.3, 5.4). 
+ + if (matchIndex[fromNodeId] > commitIndex) { + log.debug("lastEntryInTerm({}, {}, {});", commitIndex, matchIndex[fromNodeId], currentTerm); + final long lastEntryInTerm = logRepository.lastEntryInTerm(commitIndex, matchIndex[fromNodeId], currentTerm); + + final long newCommitIndex = Math.max( + Math.max(commitIndex, matchIndex[fromNodeId]), + lastEntryInTerm); + if (commitIndex != newCommitIndex) { + log.debug("updated commitIndex: {}->{}", commitIndex, newCommitIndex); + } - if (matchIndex[fromNodeId] > commitIndex) { - final long newCommitIndex = Math.max( - Math.max(commitIndex, matchIndex[fromNodeId]), - logRepository.lastEntryInTerm(commitIndex, matchIndex[fromNodeId], currentTerm)); + commitIndex = newCommitIndex; - if (commitIndex != newCommitIndex) { - log.debug("updated commitIndex: {}->{}", commitIndex, newCommitIndex); + // TODO another thread + applyPendingEntriesToStateMachine(); + } + + } else { + if (nextIndex[fromNodeId] > 1) { + log.debug("decrementing nextIndex[{}] to {}", fromNodeId, nextIndex[fromNodeId] - 1); + nextIndex[fromNodeId]--; + } else { + log.warn("Can not decrement nextIndex[{}]", fromNodeId); } - commitIndex = newCommitIndex; } } - } } @@ -282,6 +307,7 @@ public CustomCommandResponse handleClientRequest(final InetAddress address, clientResponsesMap.put(index, new ClientAddress(address, port, correlationId)); } else { + log.debug("Redirecting client to leader nodeId={}", votedFor); // inform client about different leader return new CustomCommandResponse(0, votedFor, false); } @@ -337,7 +363,7 @@ private void workerThread() { // If successful: update nextIndex and matchIndex for follower (5.3) // If AppendEntries fails because of log inconsistency: decrement nextIndex and retry (5.3) - final int prevLogTerm = logRepository.getLastLogTerm(); + final long prevLogIndex = logRepository.getLastLogIndex(); Arrays.stream(otherNodes).forEach(targetNodeId -> { @@ -358,23 +384,28 @@ private void workerThread() { if (canRetry || 
timeToSendHeartbeat) { final List newEntries = logRepository.getEntriesStartingFrom(nextIndexForNode); + final int prevLogTerm = logRepository.getEntryOpt(nextIndexForNode - 1).map(e -> e.term).orElse(0); + + log.debug("node {} : nextIndexForNode={} newEntries={} prevLogTerm={}", targetNodeId, nextIndexForNode, newEntries, prevLogTerm); + final CmdRaftAppendEntries appendRequest = new CmdRaftAppendEntries( currentTerm, currentNodeId, - prevLogIndex, + nextIndexForNode - 1, prevLogTerm, newEntries, commitIndex); - log.info("Sending {} entries to {}, term={}", newEntries.size(), otherNodes, currentTerm); - final long corrId = rpcService.callRpcAsync(appendRequest, targetNodeId); + log.info("Sent {} entries to {}: {} corrId={}", newEntries.size(), targetNodeId, appendRequest, corrId); if (!newEntries.isEmpty()) { correlationIds[targetNodeId] = corrId; timeSent[targetNodeId] = System.nanoTime(); - sentUpTo[targetNodeId] = nextIndexForNode + newEntries.size(); + sentUpTo[targetNodeId] = nextIndexForNode - 1 + newEntries.size(); + log.debug("correlationIds[{}]={}", targetNodeId, corrId); + log.debug("set sentUpTo[{}]={}", targetNodeId, sentUpTo[targetNodeId]); } lastHeartBeatSentNs[targetNodeId] = System.nanoTime(); @@ -478,12 +509,14 @@ private void applyPendingEntriesToStateMachine() { while (lastApplied < commitIndex) { lastApplied++; final RaftLogEntry raftLogEntry = logRepository.getEntry(lastApplied); + log.debug("Applying to RSM: {}", raftLogEntry); final int result = rsm.apply(raftLogEntry.cmd); if (currentState == RaftNodeState.LEADER) { // respond to client that batch has applied final ClientAddress c = clientResponsesMap.get(lastApplied); + log.debug("Replying to client lastApplied={} c={}", lastApplied, c); rpcService.respondToClient( c.address, c.port, diff --git a/src/main/java/exchange/core2/revelator/raft/RaftUtils.java b/src/main/java/exchange/core2/revelator/raft/RaftUtils.java new file mode 100644 index 0000000..37f4e6c --- /dev/null +++ 
b/src/main/java/exchange/core2/revelator/raft/RaftUtils.java @@ -0,0 +1,59 @@ +package exchange.core2.revelator.raft; + +import java.net.InetAddress; +import java.nio.ByteBuffer; +import java.util.HashMap; +import java.util.Map; + +public class RaftUtils { + + + public static RpcMessage createMessageByType(int messageType, ByteBuffer buffer) { + return switch (messageType) { + case RpcMessage.REQUEST_APPEND_ENTRIES -> CmdRaftAppendEntries.create(buffer); + case RpcMessage.RESPONSE_APPEND_ENTRIES -> CmdRaftAppendEntriesResponse.create(buffer); + case RpcMessage.REQUEST_VOTE -> CmdRaftVoteRequest.create(buffer); + case RpcMessage.RESPONSE_VOTE -> CmdRaftVoteResponse.create(buffer); + case RpcMessage.REQUEST_CUSTOM -> CustomCommandRequest.create(buffer); + case RpcMessage.RESPONSE_CUSTOM -> CustomCommandResponse.create(buffer); + default -> throw new IllegalArgumentException("Unknown messageType: " + messageType); + }; + } + + public static Map createHostMap(Map remoteNodes) { + + final Map socketMap = new HashMap<>(); + + remoteNodes.forEach((id, address) -> { + + try { + final String[] split = address.split(":"); + + final InetAddress host = InetAddress.getByName(split[0]); + final int port = Integer.parseInt(split[1]); + + RemoteUdpSocket remoteUdpSocket = new RemoteUdpSocket(host, port); + + socketMap.put(id, remoteUdpSocket); + + } catch (Exception ex) { + throw new RuntimeException(ex); + } + }); + + return socketMap; + } + + + public static final class RemoteUdpSocket { + + public final InetAddress address; + public final int port; + + public RemoteUdpSocket(InetAddress address, int port) { + this.address = address; + this.port = port; + } + } + +} diff --git a/src/main/java/exchange/core2/revelator/raft/RpcClient.java b/src/main/java/exchange/core2/revelator/raft/RpcClient.java new file mode 100644 index 0000000..dc8f281 --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/RpcClient.java @@ -0,0 +1,190 @@ +package exchange.core2.revelator.raft; + 
+import org.agrona.PrintBufferUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.net.DatagramPacket; +import java.net.DatagramSocket; +import java.net.SocketException; +import java.nio.ByteBuffer; +import java.util.LinkedList; +import java.util.Map; +import java.util.Queue; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Collectors; + +public class RpcClient { + + private static final Logger logger = LoggerFactory.getLogger(RpcClient.class); + + private final AtomicLong correlationIdCounter = new AtomicLong(1L); + private final Map> futureMap = new ConcurrentHashMap<>(); + private final Map socketMap; + + private volatile int leaderNodeId = 0; + + private final DatagramSocket serverSocket; + + private volatile boolean active = true; + + + public RpcClient(final Map remoteNodes) { + + this.socketMap = RaftUtils.createHostMap(remoteNodes); + + + try { + this.serverSocket = new DatagramSocket(); + } catch (final SocketException ex) { + throw new RuntimeException(ex); + } + + Thread t = new Thread(this::run); + t.setDaemon(true); + t.setName("ListenerUDP"); + t.start(); + + } + + public void run() { + + final byte[] receiveData = new byte[256]; // TODO set proper value + + final DatagramPacket receivePacket = new DatagramPacket(receiveData, receiveData.length); + + while (active) { + + try { + serverSocket.receive(receivePacket); + + final ByteBuffer bb = ByteBuffer.wrap(receivePacket.getData(), 0, receivePacket.getLength()); + + final long correlationId = bb.getLong(); + + logger.debug("RECEIVED from {} (c={}): {}", receivePacket.getAddress(), correlationId, PrintBufferUtil.hexDump(receivePacket.getData(), 0, receivePacket.getLength())); + + final CustomCommandResponse msg = CustomCommandResponse.create(bb); 
+ + final CompletableFuture future = futureMap.remove(correlationId); + if (future != null) { + // complete future for future-based-calls + future.complete(msg); + } else { + logger.warn("Unexpected response with correlationId={}", correlationId); + } + + } catch (final Exception ex) { + String message = PrintBufferUtil.hexDump(receivePacket.getData(), 0, receivePacket.getLength()); + logger.error("Failed to process message from {}: {}", receivePacket.getAddress().getHostAddress(), message, ex); + } + } + + logger.info("UDP server shutdown"); + serverSocket.close(); + } + + public int callRpcSync(final long data, final int timeoutMs) throws TimeoutException { + + final int leaderNodeIdInitial = leaderNodeId; + int leaderNodeIdLocal = leaderNodeIdInitial; + + final Queue remainingServers = socketMap.keySet().stream() + .filter(id -> id != leaderNodeIdInitial) + .collect(Collectors.toCollection(LinkedList::new)); + + for (int i = 0; i < 5; i++) { + + final long correlationId = correlationIdCounter.incrementAndGet(); + final CompletableFuture future = new CompletableFuture<>(); + futureMap.put(correlationId, future); + + final CustomCommandRequest request = new CustomCommandRequest(data); + + // send request to last known leader + callRpc(request, leaderNodeIdLocal, correlationId); + + try { + + // block waiting for response + final CustomCommandResponse response = future.get(timeoutMs, TimeUnit.MILLISECONDS); + + if (response.success()) { + + // update only if changed (volatile write) + if (leaderNodeIdInitial != leaderNodeIdLocal) { + leaderNodeId = leaderNodeIdLocal; + } + + return response.hash(); + + } else { + + // can be redirected + if (response.leaderNodeId() != leaderNodeIdLocal) { + logger.info("Redirected to new leader {}->{}", leaderNodeIdLocal, response.leaderNodeId()); + leaderNodeIdLocal = response.leaderNodeId(); + } + } + + } catch (TimeoutException ex) { + + logger.info("Timeout from " + leaderNodeIdLocal); + + final Integer nextNode = 
remainingServers.poll(); + if (nextNode != null) { + leaderNodeIdLocal = nextNode; + } else { + throw ex; + } + + + } catch (Exception ex) { + + logger.info("Request failed ({})", ex.getMessage()); + throw new RuntimeException(ex); + } finally { + // double-check if correlationId removed + futureMap.remove(correlationId); + } + } + + throw new TimeoutException(); + } + + private void callRpc(CustomCommandRequest request, int toNodeId, long correlationId) { + + final byte[] array = new byte[64]; + ByteBuffer bb = ByteBuffer.wrap(array); + + bb.putInt(-1); + bb.putInt(request.getMessageType()); + bb.putLong(correlationId); + + request.serialize(bb); + + send(toNodeId, array, bb.position()); + } + + + private void send(int nodeId, byte[] data, int length) { + + final RaftUtils.RemoteUdpSocket remoteUdpSocket = socketMap.get(nodeId); + + DatagramPacket packet = new DatagramPacket(data, length, remoteUdpSocket.address, remoteUdpSocket.port); + + try { + serverSocket.send(packet); + + } catch (IOException ex) { + throw new RuntimeException(ex); + } + } + + +} diff --git a/src/main/java/exchange/core2/revelator/raft/RpcService.java b/src/main/java/exchange/core2/revelator/raft/RpcService.java index 64e7d42..bfc2ec1 100644 --- a/src/main/java/exchange/core2/revelator/raft/RpcService.java +++ b/src/main/java/exchange/core2/revelator/raft/RpcService.java @@ -7,7 +7,6 @@ import java.io.IOException; import java.net.*; import java.nio.ByteBuffer; -import java.util.HashMap; import java.util.Map; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ConcurrentHashMap; @@ -19,7 +18,7 @@ public class RpcService implements AutoCloseable { private final AtomicLong correlationIdCounter = new AtomicLong(1L); private final Map> futureMap = new ConcurrentHashMap<>(); - private final Map socketMap; + private final Map socketMap; private final int serverPort; private final int serverNodeId; private final RpcHandler handler; @@ -33,27 +32,7 @@ public RpcService(Map 
remoteNodes, RpcHandler handler, int serverNodeId) { - final Map socketMap = new HashMap<>(); - remoteNodes.forEach((id, address) -> { - - try { - final String[] split = address.split(":"); - - final DatagramSocket socket = new DatagramSocket(); - final InetAddress host = InetAddress.getByName(split[0]); - final int port = Integer.parseInt(split[1]); - - RemoteUdpSocket remoteUdpSocket = new RemoteUdpSocket(socket, host, port); - - socketMap.put(id, remoteUdpSocket); - - } catch (Exception ex) { - throw new RuntimeException(ex); - } - }); - - - this.socketMap = socketMap; + this.socketMap = RaftUtils.createHostMap(remoteNodes);; this.handler = handler; this.serverPort = socketMap.get(serverNodeId).port; this.serverNodeId = serverNodeId; @@ -96,9 +75,9 @@ public void run() { final int messageType = bb.getInt(); final long correlationId = bb.getLong(); -// logger.debug("RECEIVED from {} mt={}: {}", nodeId, messageType, PrintBufferUtil.hexDump(receivePacket.getData(), 0, receivePacket.getLength())); + logger.debug("RECEIVED from {} mt={}: {}", nodeId, messageType, PrintBufferUtil.hexDump(receivePacket.getData(), 0, receivePacket.getLength())); - final RpcMessage msg = createByType(messageType, bb); + final RpcMessage msg = RaftUtils.createMessageByType(messageType, bb); // TODO use msgFactory if (messageType < 0) { @@ -116,6 +95,7 @@ public void run() { } else { if (msg instanceof CustomCommandRequest) { + // request from client final InetAddress address = receivePacket.getAddress(); final int port = receivePacket.getPort(); @@ -147,17 +127,7 @@ public void run() { serverSocket.close(); } - static RpcMessage createByType(int messageType, ByteBuffer buffer) { - return switch (messageType) { - case RpcMessage.REQUEST_APPEND_ENTRIES -> CmdRaftAppendEntries.create(buffer); - case RpcMessage.RESPONSE_APPEND_ENTRIES -> CmdRaftAppendEntriesResponse.create(buffer); - case RpcMessage.REQUEST_VOTE -> CmdRaftVoteRequest.create(buffer); - case RpcMessage.RESPONSE_VOTE -> 
CmdRaftVoteResponse.create(buffer); - case RpcMessage.REQUEST_CUSTOM -> CustomCommandRequest.create(buffer); - case RpcMessage.RESPONSE_CUSTOM -> CustomCommandResponse.create(buffer); - default -> throw new IllegalArgumentException("Unknown messageType: " + messageType); - }; - } + private void sendResponse(int callerNodeId, long correlationId, RpcResponse response) { final byte[] array = new byte[64]; @@ -193,7 +163,7 @@ public CompletableFuture callRpcSync(RpcRequest request, int toNode private void callRpc(RpcRequest request, int toNodeId, long correlationId) { - final byte[] array = new byte[64]; + final byte[] array = new byte[256]; ByteBuffer bb = ByteBuffer.wrap(array); bb.putInt(serverNodeId); @@ -208,11 +178,11 @@ private void callRpc(RpcRequest request, int toNodeId, long correlationId) { private void send(int nodeId, byte[] data, int length) { - final RemoteUdpSocket remoteUdpSocket = socketMap.get(nodeId); + final RaftUtils.RemoteUdpSocket remoteUdpSocket = socketMap.get(nodeId); final DatagramPacket packet = new DatagramPacket(data, length, remoteUdpSocket.address, remoteUdpSocket.port); try { - remoteUdpSocket.socket.send(packet); + serverSocket.send(packet); } catch (IOException ex) { throw new RuntimeException(ex); } @@ -221,7 +191,7 @@ private void send(int nodeId, byte[] data, int length) { public void respondToClient(InetAddress address, int port, long correlationId, RpcResponse response) { final byte[] array = new byte[64]; - ByteBuffer bb = ByteBuffer.wrap(array); + final ByteBuffer bb = ByteBuffer.wrap(array); // put only correlationId into the header bb.putLong(correlationId); @@ -251,18 +221,4 @@ public void close() throws Exception { } - public static final class RemoteUdpSocket { - - private final DatagramSocket socket; // TODO remove - private final InetAddress address; - private final int port; - - public RemoteUdpSocket(DatagramSocket socket, InetAddress address, int port) { - this.socket = socket; - this.address = address; - this.port 
= port; - } - } - - } From 42c1955c82ee1a914f3097f50bee7dc167669d78 Mon Sep 17 00:00:00 2001 From: Maksim Zheravin Date: Sun, 30 Jan 2022 02:18:56 +0200 Subject: [PATCH 11/15] RAFT: add generics and custom types --- .../revelator/raft/CmdRaftAppendEntries.java | 23 +++++---- .../revelator/raft/CustomCommandRequest.java | 13 ++--- .../revelator/raft/CustomCommandResponse.java | 16 +++--- .../core2/revelator/raft/CustomRsm.java | 42 +++++++++++++--- .../revelator/raft/CustomRsmCommand.java | 21 ++++++++ .../revelator/raft/CustomRsmResponse.java | 21 ++++++++ .../core2/revelator/raft/RaftClient.java | 8 +-- .../core2/revelator/raft/RaftLogEntry.java | 15 +++--- .../revelator/raft/RaftLogRepository.java | 16 +++--- .../core2/revelator/raft/RaftMessage.java | 4 -- .../core2/revelator/raft/RaftNode.java | 49 ++++++++++--------- .../core2/revelator/raft/RaftUtils.java | 14 ++++-- .../raft/ReplicatedStateMachine.java | 10 ++-- .../core2/revelator/raft/RpcClient.java | 28 ++++++----- .../core2/revelator/raft/RpcHandler.java | 4 +- .../core2/revelator/raft/RpcService.java | 29 +++++++---- .../core2/revelator/raft/RsmRequest.java | 7 +++ .../core2/revelator/raft/RsmResponse.java | 7 +++ .../raft/SerializableMessageFactory.java | 7 ++- 19 files changed, 222 insertions(+), 112 deletions(-) create mode 100644 src/main/java/exchange/core2/revelator/raft/CustomRsmCommand.java create mode 100644 src/main/java/exchange/core2/revelator/raft/CustomRsmResponse.java delete mode 100644 src/main/java/exchange/core2/revelator/raft/RaftMessage.java create mode 100644 src/main/java/exchange/core2/revelator/raft/RsmRequest.java create mode 100644 src/main/java/exchange/core2/revelator/raft/RsmResponse.java diff --git a/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntries.java b/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntries.java index 17db74b..b8182da 100644 --- a/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntries.java +++ 
b/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntries.java @@ -7,12 +7,12 @@ /** * Invoked by leader to replicate log entries (5.3); also used as heartbeat (5.2). */ -public record CmdRaftAppendEntries(int term, - int leaderId, - long prevLogIndex, - int prevLogTerm, - List entries, - long leaderCommit) implements RpcRequest { +public record CmdRaftAppendEntries(int term, + int leaderId, + long prevLogIndex, + int prevLogTerm, + List> entries, + long leaderCommit) implements RpcRequest { @Override public int getMessageType() { @@ -30,7 +30,9 @@ public void serialize(ByteBuffer buffer) { buffer.putLong(leaderCommit); } - public static CmdRaftAppendEntries create(ByteBuffer buffer) { + public static CmdRaftAppendEntries create( + ByteBuffer buffer, + SerializableMessageFactory factory) { final int term = buffer.getInt(); final int leaderId = buffer.getInt(); @@ -38,14 +40,15 @@ public static CmdRaftAppendEntries create(ByteBuffer buffer) { final int prevLogTerm = buffer.getInt(); final int numEntries = buffer.getInt(); - final List entries = new ArrayList<>(numEntries); + final List> entries = new ArrayList<>(numEntries); for (int i = 0; i < numEntries; i++) { - entries.add(RaftLogEntry.create(buffer)); + + entries.add(RaftLogEntry.create(buffer, factory)); } final long leaderCommit = buffer.getLong(); - return new CmdRaftAppendEntries(term, leaderId, prevLogIndex, prevLogTerm, entries, leaderCommit); + return new CmdRaftAppendEntries<>(term, leaderId, prevLogIndex, prevLogTerm, entries, leaderCommit); } @Override diff --git a/src/main/java/exchange/core2/revelator/raft/CustomCommandRequest.java b/src/main/java/exchange/core2/revelator/raft/CustomCommandRequest.java index 50eb04c..530d2ab 100644 --- a/src/main/java/exchange/core2/revelator/raft/CustomCommandRequest.java +++ b/src/main/java/exchange/core2/revelator/raft/CustomCommandRequest.java @@ -2,8 +2,9 @@ import java.nio.ByteBuffer; -// TODO support batching -public record CustomCommandRequest(long 
data) implements RpcRequest { +// TODO support batching !! + +public record CustomCommandRequest(T rsmRequest) implements RpcRequest { @Override public int getMessageType() { @@ -12,11 +13,11 @@ public int getMessageType() { @Override public void serialize(ByteBuffer buffer) { - buffer.putLong(data); + rsmRequest.serialize(buffer); } - public static CustomCommandRequest create(ByteBuffer buffer) { - final long data = buffer.getLong(); - return new CustomCommandRequest(data); + public static CustomCommandRequest create(ByteBuffer buffer, SerializableMessageFactory factory) { + + return new CustomCommandRequest<>(factory.createRequest(buffer)); } } diff --git a/src/main/java/exchange/core2/revelator/raft/CustomCommandResponse.java b/src/main/java/exchange/core2/revelator/raft/CustomCommandResponse.java index ea449e2..581b5e4 100644 --- a/src/main/java/exchange/core2/revelator/raft/CustomCommandResponse.java +++ b/src/main/java/exchange/core2/revelator/raft/CustomCommandResponse.java @@ -2,7 +2,9 @@ import java.nio.ByteBuffer; -public record CustomCommandResponse(int hash, int leaderNodeId, boolean success) implements RpcResponse { +public record CustomCommandResponse(S rsmResponse, + int leaderNodeId, + boolean success) implements RpcResponse { @Override public int getMessageType() { @@ -11,18 +13,18 @@ public int getMessageType() { @Override public void serialize(ByteBuffer buffer) { - buffer.putInt(hash); buffer.putInt(leaderNodeId); - buffer.put(success ? (byte) 1 : (byte) 0); + buffer.putInt(success ? 
1 : 0); + rsmResponse.serialize(buffer); } - public static CustomCommandResponse create(ByteBuffer buffer) { + public static CustomCommandResponse create(ByteBuffer buffer, SerializableMessageFactory factory) { - final int hash = buffer.getInt(); final int leaderNodeId = buffer.getInt(); - final boolean success = buffer.get() == 1; + final boolean success = buffer.getInt() == 1; + final S rsmResponse = factory.createResponse(buffer); - return new CustomCommandResponse(hash, leaderNodeId, success); + return new CustomCommandResponse<>(rsmResponse, leaderNodeId, success); } } diff --git a/src/main/java/exchange/core2/revelator/raft/CustomRsm.java b/src/main/java/exchange/core2/revelator/raft/CustomRsm.java index 91421f8..8d6fdc2 100644 --- a/src/main/java/exchange/core2/revelator/raft/CustomRsm.java +++ b/src/main/java/exchange/core2/revelator/raft/CustomRsm.java @@ -2,18 +2,46 @@ import org.agrona.collections.Hashing; -public class CustomRsm implements ReplicatedStateMachine { +import java.nio.ByteBuffer; - int hash = 0; +public class CustomRsm implements + ReplicatedStateMachine, + SerializableMessageFactory { + + public static final CustomRsmResponse EMPTY_RSM_RESPONSE = new CustomRsmResponse(0); + + // state + private int hash = 0; + + @Override + public CustomRsmResponse applyCommand(CustomRsmCommand cmd) { + hash = Hashing.hash(hash ^ Hashing.hash(cmd.data)); + return new CustomRsmResponse(hash); + } + + @Override + public CustomRsmResponse applyQuery(CustomRsmCommand query) { + // can not change anything + return new CustomRsmResponse(hash); + } + + @Override + public CustomRsmResponse getState() { + return new CustomRsmResponse(hash); + } + + @Override + public CustomRsmCommand createRequest(ByteBuffer buffer) { + return CustomRsmCommand.create(buffer); + } @Override - public int apply(long value) { - hash = Hashing.hash(hash ^ Hashing.hash(value)); - return hash; + public CustomRsmResponse createResponse(ByteBuffer buffer) { + return 
CustomRsmResponse.create(buffer); } @Override - public int getState() { - return hash; + public CustomRsmResponse emptyResponse() { + return EMPTY_RSM_RESPONSE; } } diff --git a/src/main/java/exchange/core2/revelator/raft/CustomRsmCommand.java b/src/main/java/exchange/core2/revelator/raft/CustomRsmCommand.java new file mode 100644 index 0000000..840d7af --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/CustomRsmCommand.java @@ -0,0 +1,21 @@ +package exchange.core2.revelator.raft; + +import java.nio.ByteBuffer; + +public class CustomRsmCommand implements RsmRequest { + + final long data; + + public CustomRsmCommand(long data) { + this.data = data; + } + + @Override + public void serialize(ByteBuffer buffer) { + buffer.putLong(data); + } + + public static CustomRsmCommand create(ByteBuffer buffer) { + return new CustomRsmCommand(buffer.getLong()); + } +} diff --git a/src/main/java/exchange/core2/revelator/raft/CustomRsmResponse.java b/src/main/java/exchange/core2/revelator/raft/CustomRsmResponse.java new file mode 100644 index 0000000..03003ed --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/CustomRsmResponse.java @@ -0,0 +1,21 @@ +package exchange.core2.revelator.raft; + +import java.nio.ByteBuffer; + +public class CustomRsmResponse implements RsmResponse { + + final int hash; + + public CustomRsmResponse(int hash) { + this.hash = hash; + } + + @Override + public void serialize(ByteBuffer buffer) { + buffer.putInt(hash); + } + + public static CustomRsmResponse create(ByteBuffer buffer) { + return new CustomRsmResponse(buffer.getInt()); + } +} diff --git a/src/main/java/exchange/core2/revelator/raft/RaftClient.java b/src/main/java/exchange/core2/revelator/raft/RaftClient.java index 9827503..ee79744 100644 --- a/src/main/java/exchange/core2/revelator/raft/RaftClient.java +++ b/src/main/java/exchange/core2/revelator/raft/RaftClient.java @@ -23,7 +23,7 @@ public static void main(String[] args) throws IOException, InterruptedException } } - 
private RpcClient rpcClient; + private final RpcClient rpcClient; public RaftClient() { @@ -33,14 +33,14 @@ public RaftClient() { 1, "localhost:3779", 2, "localhost:3780"); - this.rpcClient = new RpcClient(remoteNodes); + this.rpcClient = new RpcClient<>(remoteNodes, new CustomRsm()); } public void sendEcho(long data) { try { log.info("send >>> data={}", data); - final int hash = rpcClient.callRpcSync(data, 500); - log.info("recv <<< hash={}", hash); + final CustomRsmResponse res = rpcClient.callRpcSync(new CustomRsmCommand(data), 500); + log.info("recv <<< hash={}", res.hash); } catch (Exception ex) { log.warn("Exception: ", ex); } diff --git a/src/main/java/exchange/core2/revelator/raft/RaftLogEntry.java b/src/main/java/exchange/core2/revelator/raft/RaftLogEntry.java index 5a48d19..3dc031a 100644 --- a/src/main/java/exchange/core2/revelator/raft/RaftLogEntry.java +++ b/src/main/java/exchange/core2/revelator/raft/RaftLogEntry.java @@ -5,22 +5,22 @@ /** * each entry contains command for state machine, and term when entry was received by leader */ -public class RaftLogEntry { +public class RaftLogEntry { // term when entry was received by leader public final int term; // command - public final long cmd; + public final T cmd; - public RaftLogEntry(int term, long cmd) { + public RaftLogEntry(int term, T cmd) { this.term = term; this.cmd = cmd; } public void serialize(ByteBuffer buffer) { buffer.putInt(term); - buffer.putLong(cmd); + cmd.serialize(buffer); } @Override @@ -31,9 +31,10 @@ public String toString() { '}'; } - public static RaftLogEntry create(ByteBuffer buffer) { + public static RaftLogEntry create(ByteBuffer buffer, + SerializableMessageFactory factory) { final int term = buffer.getInt(); - final long cmd = buffer.getLong(); - return new RaftLogEntry(term, cmd); + final T cmd = factory.createRequest(buffer); + return new RaftLogEntry(term, cmd); } } diff --git a/src/main/java/exchange/core2/revelator/raft/RaftLogRepository.java 
b/src/main/java/exchange/core2/revelator/raft/RaftLogRepository.java index d280507..eb9cea3 100644 --- a/src/main/java/exchange/core2/revelator/raft/RaftLogRepository.java +++ b/src/main/java/exchange/core2/revelator/raft/RaftLogRepository.java @@ -7,18 +7,18 @@ import java.util.List; import java.util.Optional; -public class RaftLogRepository { +public class RaftLogRepository { private static final Logger log = LoggerFactory.getLogger(RaftLogRepository.class); - private final List logEntries = new ArrayList<>(); // TODO change to persistent storage with long-index + private final List> logEntries = new ArrayList<>(); // TODO change to persistent storage with long-index - public RaftLogEntry getEntry(long index) { + public RaftLogEntry getEntry(long index) { return logEntries.get((int) index - 1); } - public Optional getEntryOpt(long index) { + public Optional> getEntryOpt(long index) { if (index < 1 || index > logEntries.size()) { return Optional.empty(); } @@ -52,7 +52,7 @@ public int getLastLogTerm() { } } - public long append(final RaftLogEntry logEntry) { + public long append(final RaftLogEntry logEntry) { logEntries.add(logEntry); return logEntries.size(); // starting from index=1 } @@ -71,12 +71,12 @@ public long append(final RaftLogEntry logEntry) { // TODO unittest - public void appendOrOverride(final List newEntries, long prevLogIndex) { + public void appendOrOverride(final List> newEntries, long prevLogIndex) { log.debug("appendOrOverride(newEntries={} , prevLogIndex={}", newEntries, prevLogIndex); for (int i = 0; i < newEntries.size(); i++) { - final RaftLogEntry newEntry = newEntries.get(i); + final RaftLogEntry newEntry = newEntries.get(i); if ((prevLogIndex + i) < logEntries.size()) { @@ -104,7 +104,7 @@ public void appendOrOverride(final List newEntries, long prevLogIn } // 1 - public List getEntriesStartingFrom(long nextIndex) { + public List> getEntriesStartingFrom(long nextIndex) { if (getLastLogIndex() < nextIndex) { return List.of(); } diff 
--git a/src/main/java/exchange/core2/revelator/raft/RaftMessage.java b/src/main/java/exchange/core2/revelator/raft/RaftMessage.java deleted file mode 100644 index f92a5b6..0000000 --- a/src/main/java/exchange/core2/revelator/raft/RaftMessage.java +++ /dev/null @@ -1,4 +0,0 @@ -package exchange.core2.revelator.raft; - -public interface RaftMessage { -} diff --git a/src/main/java/exchange/core2/revelator/raft/RaftNode.java b/src/main/java/exchange/core2/revelator/raft/RaftNode.java index 86a1c77..00d5960 100644 --- a/src/main/java/exchange/core2/revelator/raft/RaftNode.java +++ b/src/main/java/exchange/core2/revelator/raft/RaftNode.java @@ -10,7 +10,7 @@ import java.util.List; import java.util.Map; -public class RaftNode { +public class RaftNode { private static final Logger log = LoggerFactory.getLogger(RaftNode.class); @@ -30,7 +30,7 @@ public class RaftNode { private int votedFor = -1; // log entries; each entry contains command for state machine, and term when entry was received by leader (first index is 1) - private final RaftLogRepository logRepository = new RaftLogRepository(); + private final RaftLogRepository logRepository = new RaftLogRepository<>(); /* **** Volatile state on all servers: */ @@ -73,9 +73,9 @@ public class RaftNode { private final int currentNodeId; private final int[] otherNodes; - private final RpcService rpcService; + private final RpcService rpcService; - private final ReplicatedStateMachine rsm = new CustomRsm(); + private final ReplicatedStateMachine rsm; // timers private long lastHeartBeatReceivedNs = System.nanoTime(); @@ -86,10 +86,14 @@ public static void main(String[] args) { final int thisNodeId = Integer.parseInt(args[0]); - new RaftNode(thisNodeId); + final CustomRsm customRsm = new CustomRsm(); + + new RaftNode<>(thisNodeId, customRsm, customRsm); } - public RaftNode(int thisNodeId) { + public RaftNode(int thisNodeId, + ReplicatedStateMachine rsm, + SerializableMessageFactory msgFactory) { // localhost:3778, localhost:3779, 
localhost:3780 final Map remoteNodes = Map.of( @@ -98,10 +102,10 @@ public RaftNode(int thisNodeId) { 2, "localhost:3780"); this.currentNodeId = thisNodeId; - + this.rsm = rsm; this.otherNodes = remoteNodes.keySet().stream().mapToInt(x -> x).filter(nodeId -> nodeId != thisNodeId).toArray(); - RpcHandler handler = new RpcHandler() { + final RpcHandler handler = new RpcHandler<>() { @Override public RpcResponse handleNodeRequest(int fromNodeId, RpcRequest req) { log.debug("INCOMING REQ {} >>> {}", fromNodeId, req); @@ -197,7 +201,8 @@ public RpcResponse handleNodeRequest(int fromNodeId, RpcRequest req) { // 3. If an existing entry conflicts with a new one (same index but different terms), // delete the existing entry and all that follow it // 4. Append any new entries not already in the log - logRepository.appendOrOverride(cmd.entries(), prevLogIndex); + final List> entries = cmd.entries(); + logRepository.appendOrOverride(entries, prevLogIndex); log.debug("Added to log repository: {}", cmd.entries()); @@ -285,22 +290,18 @@ public void handleNodeResponse(int fromNodeId, RpcResponse resp, long correlatio @Override - public CustomCommandResponse handleClientRequest(final InetAddress address, - final int port, - final long correlationId, - final CustomCommandRequest request) { - + public CustomCommandResponse handleClientRequest(final InetAddress address, + final int port, + final long correlationId, + final CustomCommandRequest request) { synchronized (this) { if (currentState == RaftNodeState.LEADER) { // If command received from client: append entry to local log, // respond after entry applied to state machine (5.3) - final int prevLogTerm = logRepository.getLastLogTerm(); - final long prevLogIndex = logRepository.getLastLogIndex(); - // adding new record into the local log - final RaftLogEntry logEntry = new RaftLogEntry(currentTerm, request.data()); + final RaftLogEntry logEntry = new RaftLogEntry<>(currentTerm, request.rsmRequest()); final long index = 
logRepository.append(logEntry); // remember client request (TODO !! on batch migration - should refer to the last record) @@ -309,7 +310,7 @@ public CustomCommandResponse handleClientRequest(final InetAddress address, } else { log.debug("Redirecting client to leader nodeId={}", votedFor); // inform client about different leader - return new CustomCommandResponse(0, votedFor, false); + return new CustomCommandResponse<>(msgFactory.emptyResponse(), votedFor, false); } } @@ -319,7 +320,7 @@ public CustomCommandResponse handleClientRequest(final InetAddress address, }; // todo remove from constructor - rpcService = new RpcService(remoteNodes, handler, thisNodeId); + rpcService = new RpcService<>(remoteNodes, handler, msgFactory, thisNodeId); log.info("HEARTBEAT_TIMEOUT_MS={}", HEARTBEAT_TIMEOUT_MS); log.info("ELECTION_TIMEOUT_MS={}..{}", ELECTION_TIMEOUT_MIN_MS, ELECTION_TIMEOUT_MAX_MS); @@ -383,7 +384,7 @@ private void workerThread() { if (canRetry || timeToSendHeartbeat) { - final List newEntries = logRepository.getEntriesStartingFrom(nextIndexForNode); + final List> newEntries = logRepository.getEntriesStartingFrom(nextIndexForNode); final int prevLogTerm = logRepository.getEntryOpt(nextIndexForNode - 1).map(e -> e.term).orElse(0); log.debug("node {} : nextIndexForNode={} newEntries={} prevLogTerm={}", targetNodeId, nextIndexForNode, newEntries, prevLogTerm); @@ -508,9 +509,9 @@ private void applyPendingEntriesToStateMachine() { */ while (lastApplied < commitIndex) { lastApplied++; - final RaftLogEntry raftLogEntry = logRepository.getEntry(lastApplied); + final RaftLogEntry raftLogEntry = logRepository.getEntry(lastApplied); log.debug("Applying to RSM: {}", raftLogEntry); - final int result = rsm.apply(raftLogEntry.cmd); + final S result = rsm.applyCommand(raftLogEntry.cmd); if (currentState == RaftNodeState.LEADER) { @@ -521,7 +522,7 @@ private void applyPendingEntriesToStateMachine() { c.address, c.port, c.correlationId, - new CustomCommandResponse(result, 
currentNodeId, true)); + new CustomCommandResponse<>(result, currentNodeId, true)); } } diff --git a/src/main/java/exchange/core2/revelator/raft/RaftUtils.java b/src/main/java/exchange/core2/revelator/raft/RaftUtils.java index 37f4e6c..dff6e1d 100644 --- a/src/main/java/exchange/core2/revelator/raft/RaftUtils.java +++ b/src/main/java/exchange/core2/revelator/raft/RaftUtils.java @@ -1,5 +1,7 @@ package exchange.core2.revelator.raft; +import com.sun.jna.platform.win32.COM.util.Factory; + import java.net.InetAddress; import java.nio.ByteBuffer; import java.util.HashMap; @@ -8,14 +10,18 @@ public class RaftUtils { - public static RpcMessage createMessageByType(int messageType, ByteBuffer buffer) { + public static RpcMessage createMessageByType( + int messageType, + ByteBuffer buffer, + SerializableMessageFactory factory) { + return switch (messageType) { - case RpcMessage.REQUEST_APPEND_ENTRIES -> CmdRaftAppendEntries.create(buffer); + case RpcMessage.REQUEST_APPEND_ENTRIES -> CmdRaftAppendEntries.create(buffer, factory); case RpcMessage.RESPONSE_APPEND_ENTRIES -> CmdRaftAppendEntriesResponse.create(buffer); case RpcMessage.REQUEST_VOTE -> CmdRaftVoteRequest.create(buffer); case RpcMessage.RESPONSE_VOTE -> CmdRaftVoteResponse.create(buffer); - case RpcMessage.REQUEST_CUSTOM -> CustomCommandRequest.create(buffer); - case RpcMessage.RESPONSE_CUSTOM -> CustomCommandResponse.create(buffer); + case RpcMessage.REQUEST_CUSTOM -> CustomCommandRequest.create(buffer, factory); + case RpcMessage.RESPONSE_CUSTOM -> CustomCommandResponse.create(buffer, factory); default -> throw new IllegalArgumentException("Unknown messageType: " + messageType); }; } diff --git a/src/main/java/exchange/core2/revelator/raft/ReplicatedStateMachine.java b/src/main/java/exchange/core2/revelator/raft/ReplicatedStateMachine.java index c5783c6..4b4804d 100644 --- a/src/main/java/exchange/core2/revelator/raft/ReplicatedStateMachine.java +++ 
b/src/main/java/exchange/core2/revelator/raft/ReplicatedStateMachine.java @@ -1,10 +1,12 @@ package exchange.core2.revelator.raft; -public interface ReplicatedStateMachine { +public interface ReplicatedStateMachine { - // TODO switch to custom messages - int apply(long value); + S applyCommand(T command); - int getState(); + // TODO query + S applyQuery(T query); + + S getState(); } diff --git a/src/main/java/exchange/core2/revelator/raft/RpcClient.java b/src/main/java/exchange/core2/revelator/raft/RpcClient.java index dc8f281..4dd04da 100644 --- a/src/main/java/exchange/core2/revelator/raft/RpcClient.java +++ b/src/main/java/exchange/core2/revelator/raft/RpcClient.java @@ -19,25 +19,27 @@ import java.util.concurrent.atomic.AtomicLong; import java.util.stream.Collectors; -public class RpcClient { +public class RpcClient { private static final Logger logger = LoggerFactory.getLogger(RpcClient.class); private final AtomicLong correlationIdCounter = new AtomicLong(1L); - private final Map> futureMap = new ConcurrentHashMap<>(); + private final Map>> futureMap = new ConcurrentHashMap<>(); private final Map socketMap; + private final SerializableMessageFactory msgFactory; private volatile int leaderNodeId = 0; private final DatagramSocket serverSocket; - private volatile boolean active = true; + private volatile boolean active = true; // TODO implement - public RpcClient(final Map remoteNodes) { + public RpcClient(final Map remoteNodes, + final SerializableMessageFactory msgFactory) { this.socketMap = RaftUtils.createHostMap(remoteNodes); - + this.msgFactory = msgFactory; try { this.serverSocket = new DatagramSocket(); @@ -69,9 +71,9 @@ public void run() { logger.debug("RECEIVED from {} (c={}): {}", receivePacket.getAddress(), correlationId, PrintBufferUtil.hexDump(receivePacket.getData(), 0, receivePacket.getLength())); - final CustomCommandResponse msg = CustomCommandResponse.create(bb); + final CustomCommandResponse msg = CustomCommandResponse.create(bb, msgFactory); 
- final CompletableFuture future = futureMap.remove(correlationId); + final CompletableFuture> future = futureMap.remove(correlationId); if (future != null) { // complete future for future-based-calls future.complete(msg); @@ -89,7 +91,7 @@ public void run() { serverSocket.close(); } - public int callRpcSync(final long data, final int timeoutMs) throws TimeoutException { + public S callRpcSync(final T data, final int timeoutMs) throws TimeoutException { final int leaderNodeIdInitial = leaderNodeId; int leaderNodeIdLocal = leaderNodeIdInitial; @@ -101,10 +103,10 @@ public int callRpcSync(final long data, final int timeoutMs) throws TimeoutExcep for (int i = 0; i < 5; i++) { final long correlationId = correlationIdCounter.incrementAndGet(); - final CompletableFuture future = new CompletableFuture<>(); + final CompletableFuture> future = new CompletableFuture<>(); futureMap.put(correlationId, future); - final CustomCommandRequest request = new CustomCommandRequest(data); + final CustomCommandRequest request = new CustomCommandRequest<>(data); // send request to last known leader callRpc(request, leaderNodeIdLocal, correlationId); @@ -112,7 +114,7 @@ public int callRpcSync(final long data, final int timeoutMs) throws TimeoutExcep try { // block waiting for response - final CustomCommandResponse response = future.get(timeoutMs, TimeUnit.MILLISECONDS); + final CustomCommandResponse response = future.get(timeoutMs, TimeUnit.MILLISECONDS); if (response.success()) { @@ -121,7 +123,7 @@ public int callRpcSync(final long data, final int timeoutMs) throws TimeoutExcep leaderNodeId = leaderNodeIdLocal; } - return response.hash(); + return response.rsmResponse(); } else { @@ -157,7 +159,7 @@ public int callRpcSync(final long data, final int timeoutMs) throws TimeoutExcep throw new TimeoutException(); } - private void callRpc(CustomCommandRequest request, int toNodeId, long correlationId) { + private void callRpc(CustomCommandRequest request, int toNodeId, long correlationId) { 
final byte[] array = new byte[64]; ByteBuffer bb = ByteBuffer.wrap(array); diff --git a/src/main/java/exchange/core2/revelator/raft/RpcHandler.java b/src/main/java/exchange/core2/revelator/raft/RpcHandler.java index 74d639b..ed88f75 100644 --- a/src/main/java/exchange/core2/revelator/raft/RpcHandler.java +++ b/src/main/java/exchange/core2/revelator/raft/RpcHandler.java @@ -2,12 +2,12 @@ import java.net.InetAddress; -public interface RpcHandler { +public interface RpcHandler { RpcResponse handleNodeRequest(int nodeId, RpcRequest request); void handleNodeResponse(int nodeId, RpcResponse response, long correlationId); - CustomCommandResponse handleClientRequest(InetAddress address, int port, long correlationId, CustomCommandRequest request); + CustomCommandResponse handleClientRequest(InetAddress address, int port, long correlationId, CustomCommandRequest request); } diff --git a/src/main/java/exchange/core2/revelator/raft/RpcService.java b/src/main/java/exchange/core2/revelator/raft/RpcService.java index bfc2ec1..ce664a1 100644 --- a/src/main/java/exchange/core2/revelator/raft/RpcService.java +++ b/src/main/java/exchange/core2/revelator/raft/RpcService.java @@ -12,7 +12,7 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicLong; -public class RpcService implements AutoCloseable { +public class RpcService implements AutoCloseable { private static final Logger logger = LoggerFactory.getLogger(RpcService.class); @@ -21,21 +21,24 @@ public class RpcService implements AutoCloseable { private final Map socketMap; private final int serverPort; private final int serverNodeId; - private final RpcHandler handler; + private final RpcHandler handler; + private final SerializableMessageFactory msgFactory; - private DatagramSocket serverSocket; -// private final SerializableMessageFactory msgFactory; + private final DatagramSocket serverSocket; private volatile boolean active = true; public RpcService(Map remoteNodes, - RpcHandler handler, + 
RpcHandler handler, + SerializableMessageFactory msgFactory, int serverNodeId) { - this.socketMap = RaftUtils.createHostMap(remoteNodes);; + this.socketMap = RaftUtils.createHostMap(remoteNodes); + ; this.handler = handler; this.serverPort = socketMap.get(serverNodeId).port; this.serverNodeId = serverNodeId; + this.msgFactory = msgFactory; try { this.serverSocket = new DatagramSocket(serverPort); @@ -77,7 +80,7 @@ public void run() { logger.debug("RECEIVED from {} mt={}: {}", nodeId, messageType, PrintBufferUtil.hexDump(receivePacket.getData(), 0, receivePacket.getLength())); - final RpcMessage msg = RaftUtils.createMessageByType(messageType, bb); + final RpcMessage msg = RaftUtils.createMessageByType(messageType, bb, msgFactory); // TODO use msgFactory if (messageType < 0) { @@ -100,7 +103,14 @@ public void run() { final InetAddress address = receivePacket.getAddress(); final int port = receivePacket.getPort(); - final CustomCommandResponse response = handler.handleClientRequest(address, port, correlationId, (CustomCommandRequest) msg); + final CustomCommandRequest msgT = (CustomCommandRequest) msg; + + final CustomCommandResponse response = handler.handleClientRequest( + address, + port, + correlationId, + msgT); + if (response != null) { respondToClient(address, port, correlationId, response); } @@ -128,7 +138,6 @@ public void run() { } - private void sendResponse(int callerNodeId, long correlationId, RpcResponse response) { final byte[] array = new byte[64]; ByteBuffer bb = ByteBuffer.wrap(array); @@ -215,7 +224,7 @@ private void respondToClient(InetAddress address, int port, byte[] data, int len @Override - public void close() throws Exception { + public void close() { active = false; } diff --git a/src/main/java/exchange/core2/revelator/raft/RsmRequest.java b/src/main/java/exchange/core2/revelator/raft/RsmRequest.java new file mode 100644 index 0000000..e18b252 --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/RsmRequest.java @@ -0,0 +1,7 @@ 
+package exchange.core2.revelator.raft; + +public interface RsmRequest extends SerializableMessage { + + + +} diff --git a/src/main/java/exchange/core2/revelator/raft/RsmResponse.java b/src/main/java/exchange/core2/revelator/raft/RsmResponse.java new file mode 100644 index 0000000..a9a8aeb --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/RsmResponse.java @@ -0,0 +1,7 @@ +package exchange.core2.revelator.raft; + +public interface RsmResponse extends SerializableMessage { + + + +} diff --git a/src/main/java/exchange/core2/revelator/raft/SerializableMessageFactory.java b/src/main/java/exchange/core2/revelator/raft/SerializableMessageFactory.java index 78b2042..b746ba4 100644 --- a/src/main/java/exchange/core2/revelator/raft/SerializableMessageFactory.java +++ b/src/main/java/exchange/core2/revelator/raft/SerializableMessageFactory.java @@ -2,8 +2,11 @@ import java.nio.ByteBuffer; -public interface SerializableMessageFactory { +public interface SerializableMessageFactory { - T create(ByteBuffer buffer); + T createRequest(ByteBuffer buffer); + S createResponse(ByteBuffer buffer); + + S emptyResponse(); } From dc4355827ce066d3c493add12b0c4a1ea3fa8b89 Mon Sep 17 00:00:00 2001 From: Maksim Zheravin Date: Sun, 30 Jan 2022 02:32:48 +0200 Subject: [PATCH 12/15] RAFT: change packages structure --- .../revelator/raft/CustomRsmCommand.java | 21 ---------------- .../revelator/raft/CustomRsmResponse.java | 21 ---------------- .../revelator/raft/RaftLogRepository.java | 14 ++++++----- .../core2/revelator/raft/RaftNode.java | 16 ++++--------- .../core2/revelator/raft/RaftUtils.java | 4 ++-- .../raft/ReplicatedStateMachine.java | 3 +++ .../core2/revelator/raft/RpcClient.java | 8 +++++-- .../core2/revelator/raft/RpcHandler.java | 2 ++ .../core2/revelator/raft/RpcService.java | 5 ++-- .../revelator/raft/RsmMessageFactory.java | 15 ++++++++++++ .../raft/SerializableMessageFactory.java | 12 ---------- .../core2/revelator/raft/demo/CustomNode.java | 16 +++++++++++++ 
.../revelator/raft/{ => demo}/CustomRsm.java | 8 ++++--- .../revelator/raft/demo/CustomRsmCommand.java | 24 +++++++++++++++++++ .../raft/demo/CustomRsmResponse.java | 24 +++++++++++++++++++ .../revelator/raft/{ => demo}/RaftClient.java | 5 ++-- .../{ => messages}/CmdRaftAppendEntries.java | 6 +++-- .../CmdRaftAppendEntriesResponse.java | 2 +- .../{ => messages}/CmdRaftVoteRequest.java | 2 +- .../{ => messages}/CmdRaftVoteResponse.java | 2 +- .../{ => messages}/CustomCommandRequest.java | 6 +++-- .../{ => messages}/CustomCommandResponse.java | 6 +++-- .../raft/{ => messages}/RaftLogEntry.java | 19 ++++----------- .../raft/{ => messages}/RpcMessage.java | 2 +- .../raft/{ => messages}/RpcRequest.java | 2 +- .../raft/{ => messages}/RpcResponse.java | 2 +- .../raft/{ => messages}/RsmRequest.java | 2 +- .../raft/{ => messages}/RsmResponse.java | 2 +- .../{ => messages}/SerializableMessage.java | 2 +- 29 files changed, 141 insertions(+), 112 deletions(-) delete mode 100644 src/main/java/exchange/core2/revelator/raft/CustomRsmCommand.java delete mode 100644 src/main/java/exchange/core2/revelator/raft/CustomRsmResponse.java create mode 100644 src/main/java/exchange/core2/revelator/raft/RsmMessageFactory.java delete mode 100644 src/main/java/exchange/core2/revelator/raft/SerializableMessageFactory.java create mode 100644 src/main/java/exchange/core2/revelator/raft/demo/CustomNode.java rename src/main/java/exchange/core2/revelator/raft/{ => demo}/CustomRsm.java (78%) create mode 100644 src/main/java/exchange/core2/revelator/raft/demo/CustomRsmCommand.java create mode 100644 src/main/java/exchange/core2/revelator/raft/demo/CustomRsmResponse.java rename src/main/java/exchange/core2/revelator/raft/{ => demo}/RaftClient.java (89%) rename src/main/java/exchange/core2/revelator/raft/{ => messages}/CmdRaftAppendEntries.java (93%) rename src/main/java/exchange/core2/revelator/raft/{ => messages}/CmdRaftAppendEntriesResponse.java (97%) rename 
src/main/java/exchange/core2/revelator/raft/{ => messages}/CmdRaftVoteRequest.java (96%) rename src/main/java/exchange/core2/revelator/raft/{ => messages}/CmdRaftVoteResponse.java (94%) rename src/main/java/exchange/core2/revelator/raft/{ => messages}/CustomCommandRequest.java (74%) rename src/main/java/exchange/core2/revelator/raft/{ => messages}/CustomCommandResponse.java (83%) rename src/main/java/exchange/core2/revelator/raft/{ => messages}/RaftLogEntry.java (66%) rename src/main/java/exchange/core2/revelator/raft/{ => messages}/RpcMessage.java (86%) rename src/main/java/exchange/core2/revelator/raft/{ => messages}/RpcRequest.java (52%) rename src/main/java/exchange/core2/revelator/raft/{ => messages}/RpcResponse.java (52%) rename src/main/java/exchange/core2/revelator/raft/{ => messages}/RsmRequest.java (57%) rename src/main/java/exchange/core2/revelator/raft/{ => messages}/RsmResponse.java (57%) rename src/main/java/exchange/core2/revelator/raft/{ => messages}/SerializableMessage.java (70%) diff --git a/src/main/java/exchange/core2/revelator/raft/CustomRsmCommand.java b/src/main/java/exchange/core2/revelator/raft/CustomRsmCommand.java deleted file mode 100644 index 840d7af..0000000 --- a/src/main/java/exchange/core2/revelator/raft/CustomRsmCommand.java +++ /dev/null @@ -1,21 +0,0 @@ -package exchange.core2.revelator.raft; - -import java.nio.ByteBuffer; - -public class CustomRsmCommand implements RsmRequest { - - final long data; - - public CustomRsmCommand(long data) { - this.data = data; - } - - @Override - public void serialize(ByteBuffer buffer) { - buffer.putLong(data); - } - - public static CustomRsmCommand create(ByteBuffer buffer) { - return new CustomRsmCommand(buffer.getLong()); - } -} diff --git a/src/main/java/exchange/core2/revelator/raft/CustomRsmResponse.java b/src/main/java/exchange/core2/revelator/raft/CustomRsmResponse.java deleted file mode 100644 index 03003ed..0000000 --- a/src/main/java/exchange/core2/revelator/raft/CustomRsmResponse.java 
+++ /dev/null @@ -1,21 +0,0 @@ -package exchange.core2.revelator.raft; - -import java.nio.ByteBuffer; - -public class CustomRsmResponse implements RsmResponse { - - final int hash; - - public CustomRsmResponse(int hash) { - this.hash = hash; - } - - @Override - public void serialize(ByteBuffer buffer) { - buffer.putInt(hash); - } - - public static CustomRsmResponse create(ByteBuffer buffer) { - return new CustomRsmResponse(buffer.getInt()); - } -} diff --git a/src/main/java/exchange/core2/revelator/raft/RaftLogRepository.java b/src/main/java/exchange/core2/revelator/raft/RaftLogRepository.java index eb9cea3..50dfa5b 100644 --- a/src/main/java/exchange/core2/revelator/raft/RaftLogRepository.java +++ b/src/main/java/exchange/core2/revelator/raft/RaftLogRepository.java @@ -1,5 +1,7 @@ package exchange.core2.revelator.raft; +import exchange.core2.revelator.raft.messages.RaftLogEntry; +import exchange.core2.revelator.raft.messages.RsmRequest; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -32,7 +34,7 @@ public long lastEntryInTerm(long indexAfter, long indexBeforeIncl, int term) { for (int i = (int) indexAfter + 1; i <= indexBeforeIncl; i++) { log.debug("i={}", i); - if (logEntries.get(i - 1).term == term) { + if (logEntries.get(i - 1).term() == term) { idx = i; } } @@ -48,7 +50,7 @@ public int getLastLogTerm() { if (logEntries.isEmpty()) { return 0; // return term 0 by default } else { - return logEntries.get(logEntries.size() - 1).term; + return logEntries.get(logEntries.size() - 1).term(); } } @@ -82,15 +84,15 @@ public void appendOrOverride(final List> newEntries, long prevLo final int pos = (int) prevLogIndex + i; - final int existingTerm = logEntries.get(pos).term; + final int existingTerm = logEntries.get(pos).term(); - log.debug("Validating older record with index={}: existingTerm={} newEntry.term={}", pos + 1, existingTerm, newEntry.term); + log.debug("Validating older record with index={}: existingTerm={} newEntry.term={}", pos + 1, existingTerm, 
newEntry.term()); // 3. If an existing entry conflicts with a new one (same index but different terms), // delete the existing entry and all that follow it - if (newEntry.term != existingTerm) { - log.debug("Remove all records after index={}, because term is different: {} (old={})", pos + 1, newEntry.term, existingTerm); + if (newEntry.term() != existingTerm) { + log.debug("Remove all records after index={}, because term is different: {} (old={})", pos + 1, newEntry.term(), existingTerm); int lastIdxToRemove = logEntries.size(); if (lastIdxToRemove > pos + 1) { logEntries.subList(pos + 1, lastIdxToRemove).clear(); diff --git a/src/main/java/exchange/core2/revelator/raft/RaftNode.java b/src/main/java/exchange/core2/revelator/raft/RaftNode.java index 00d5960..a800b1b 100644 --- a/src/main/java/exchange/core2/revelator/raft/RaftNode.java +++ b/src/main/java/exchange/core2/revelator/raft/RaftNode.java @@ -1,6 +1,7 @@ package exchange.core2.revelator.raft; +import exchange.core2.revelator.raft.messages.*; import org.eclipse.collections.impl.map.mutable.primitive.LongObjectHashMap; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -82,18 +83,9 @@ public class RaftNode { private long electionEndNs = System.nanoTime(); - public static void main(String[] args) { - - final int thisNodeId = Integer.parseInt(args[0]); - - final CustomRsm customRsm = new CustomRsm(); - - new RaftNode<>(thisNodeId, customRsm, customRsm); - } - public RaftNode(int thisNodeId, ReplicatedStateMachine rsm, - SerializableMessageFactory msgFactory) { + RsmMessageFactory msgFactory) { // localhost:3778, localhost:3779, localhost:3780 final Map remoteNodes = Map.of( @@ -385,7 +377,7 @@ private void workerThread() { if (canRetry || timeToSendHeartbeat) { final List> newEntries = logRepository.getEntriesStartingFrom(nextIndexForNode); - final int prevLogTerm = logRepository.getEntryOpt(nextIndexForNode - 1).map(e -> e.term).orElse(0); + final int prevLogTerm = 
logRepository.getEntryOpt(nextIndexForNode - 1).map(RaftLogEntry::term).orElse(0); log.debug("node {} : nextIndexForNode={} newEntries={} prevLogTerm={}", targetNodeId, nextIndexForNode, newEntries, prevLogTerm); @@ -511,7 +503,7 @@ private void applyPendingEntriesToStateMachine() { lastApplied++; final RaftLogEntry raftLogEntry = logRepository.getEntry(lastApplied); log.debug("Applying to RSM: {}", raftLogEntry); - final S result = rsm.applyCommand(raftLogEntry.cmd); + final S result = rsm.applyCommand(raftLogEntry.cmd()); if (currentState == RaftNodeState.LEADER) { diff --git a/src/main/java/exchange/core2/revelator/raft/RaftUtils.java b/src/main/java/exchange/core2/revelator/raft/RaftUtils.java index dff6e1d..4ed6896 100644 --- a/src/main/java/exchange/core2/revelator/raft/RaftUtils.java +++ b/src/main/java/exchange/core2/revelator/raft/RaftUtils.java @@ -1,6 +1,6 @@ package exchange.core2.revelator.raft; -import com.sun.jna.platform.win32.COM.util.Factory; +import exchange.core2.revelator.raft.messages.*; import java.net.InetAddress; import java.nio.ByteBuffer; @@ -13,7 +13,7 @@ public class RaftUtils { public static RpcMessage createMessageByType( int messageType, ByteBuffer buffer, - SerializableMessageFactory factory) { + RsmMessageFactory factory) { return switch (messageType) { case RpcMessage.REQUEST_APPEND_ENTRIES -> CmdRaftAppendEntries.create(buffer, factory); diff --git a/src/main/java/exchange/core2/revelator/raft/ReplicatedStateMachine.java b/src/main/java/exchange/core2/revelator/raft/ReplicatedStateMachine.java index 4b4804d..4cf0edb 100644 --- a/src/main/java/exchange/core2/revelator/raft/ReplicatedStateMachine.java +++ b/src/main/java/exchange/core2/revelator/raft/ReplicatedStateMachine.java @@ -1,5 +1,8 @@ package exchange.core2.revelator.raft; +import exchange.core2.revelator.raft.messages.RsmRequest; +import exchange.core2.revelator.raft.messages.RsmResponse; + public interface ReplicatedStateMachine { S applyCommand(T command); diff --git 
a/src/main/java/exchange/core2/revelator/raft/RpcClient.java b/src/main/java/exchange/core2/revelator/raft/RpcClient.java index 4dd04da..2093652 100644 --- a/src/main/java/exchange/core2/revelator/raft/RpcClient.java +++ b/src/main/java/exchange/core2/revelator/raft/RpcClient.java @@ -1,5 +1,9 @@ package exchange.core2.revelator.raft; +import exchange.core2.revelator.raft.messages.CustomCommandRequest; +import exchange.core2.revelator.raft.messages.CustomCommandResponse; +import exchange.core2.revelator.raft.messages.RsmRequest; +import exchange.core2.revelator.raft.messages.RsmResponse; import org.agrona.PrintBufferUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -26,7 +30,7 @@ public class RpcClient { private final AtomicLong correlationIdCounter = new AtomicLong(1L); private final Map>> futureMap = new ConcurrentHashMap<>(); private final Map socketMap; - private final SerializableMessageFactory msgFactory; + private final RsmMessageFactory msgFactory; private volatile int leaderNodeId = 0; @@ -36,7 +40,7 @@ public class RpcClient { public RpcClient(final Map remoteNodes, - final SerializableMessageFactory msgFactory) { + final RsmMessageFactory msgFactory) { this.socketMap = RaftUtils.createHostMap(remoteNodes); this.msgFactory = msgFactory; diff --git a/src/main/java/exchange/core2/revelator/raft/RpcHandler.java b/src/main/java/exchange/core2/revelator/raft/RpcHandler.java index ed88f75..5825a5c 100644 --- a/src/main/java/exchange/core2/revelator/raft/RpcHandler.java +++ b/src/main/java/exchange/core2/revelator/raft/RpcHandler.java @@ -1,5 +1,7 @@ package exchange.core2.revelator.raft; +import exchange.core2.revelator.raft.messages.*; + import java.net.InetAddress; public interface RpcHandler { diff --git a/src/main/java/exchange/core2/revelator/raft/RpcService.java b/src/main/java/exchange/core2/revelator/raft/RpcService.java index ce664a1..30b7007 100644 --- a/src/main/java/exchange/core2/revelator/raft/RpcService.java +++ 
b/src/main/java/exchange/core2/revelator/raft/RpcService.java @@ -1,5 +1,6 @@ package exchange.core2.revelator.raft; +import exchange.core2.revelator.raft.messages.*; import org.agrona.PrintBufferUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -22,7 +23,7 @@ public class RpcService implements private final int serverPort; private final int serverNodeId; private final RpcHandler handler; - private final SerializableMessageFactory msgFactory; + private final RsmMessageFactory msgFactory; private final DatagramSocket serverSocket; @@ -30,7 +31,7 @@ public class RpcService implements public RpcService(Map remoteNodes, RpcHandler handler, - SerializableMessageFactory msgFactory, + RsmMessageFactory msgFactory, int serverNodeId) { this.socketMap = RaftUtils.createHostMap(remoteNodes); diff --git a/src/main/java/exchange/core2/revelator/raft/RsmMessageFactory.java b/src/main/java/exchange/core2/revelator/raft/RsmMessageFactory.java new file mode 100644 index 0000000..e7bfac4 --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/RsmMessageFactory.java @@ -0,0 +1,15 @@ +package exchange.core2.revelator.raft; + +import exchange.core2.revelator.raft.messages.RsmRequest; +import exchange.core2.revelator.raft.messages.RsmResponse; + +import java.nio.ByteBuffer; + +public interface RsmMessageFactory { + + T createRequest(ByteBuffer buffer); + + S createResponse(ByteBuffer buffer); + + S emptyResponse(); +} diff --git a/src/main/java/exchange/core2/revelator/raft/SerializableMessageFactory.java b/src/main/java/exchange/core2/revelator/raft/SerializableMessageFactory.java deleted file mode 100644 index b746ba4..0000000 --- a/src/main/java/exchange/core2/revelator/raft/SerializableMessageFactory.java +++ /dev/null @@ -1,12 +0,0 @@ -package exchange.core2.revelator.raft; - -import java.nio.ByteBuffer; - -public interface SerializableMessageFactory { - - T createRequest(ByteBuffer buffer); - - S createResponse(ByteBuffer buffer); - - S emptyResponse(); -} 
diff --git a/src/main/java/exchange/core2/revelator/raft/demo/CustomNode.java b/src/main/java/exchange/core2/revelator/raft/demo/CustomNode.java new file mode 100644 index 0000000..1fd5f66 --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/demo/CustomNode.java @@ -0,0 +1,16 @@ +package exchange.core2.revelator.raft.demo; + +import exchange.core2.revelator.raft.RaftNode; + +public class CustomNode { + + public static void main(String[] args) { + + final int thisNodeId = Integer.parseInt(args[0]); + + final CustomRsm customRsm = new CustomRsm(); + + new RaftNode<>(thisNodeId, customRsm, customRsm); + } + +} diff --git a/src/main/java/exchange/core2/revelator/raft/CustomRsm.java b/src/main/java/exchange/core2/revelator/raft/demo/CustomRsm.java similarity index 78% rename from src/main/java/exchange/core2/revelator/raft/CustomRsm.java rename to src/main/java/exchange/core2/revelator/raft/demo/CustomRsm.java index 8d6fdc2..acb7682 100644 --- a/src/main/java/exchange/core2/revelator/raft/CustomRsm.java +++ b/src/main/java/exchange/core2/revelator/raft/demo/CustomRsm.java @@ -1,12 +1,14 @@ -package exchange.core2.revelator.raft; +package exchange.core2.revelator.raft.demo; +import exchange.core2.revelator.raft.ReplicatedStateMachine; +import exchange.core2.revelator.raft.RsmMessageFactory; import org.agrona.collections.Hashing; import java.nio.ByteBuffer; public class CustomRsm implements ReplicatedStateMachine, - SerializableMessageFactory { + RsmMessageFactory { public static final CustomRsmResponse EMPTY_RSM_RESPONSE = new CustomRsmResponse(0); @@ -15,7 +17,7 @@ public class CustomRsm implements @Override public CustomRsmResponse applyCommand(CustomRsmCommand cmd) { - hash = Hashing.hash(hash ^ Hashing.hash(cmd.data)); + hash = Hashing.hash(hash ^ Hashing.hash(cmd.data())); return new CustomRsmResponse(hash); } diff --git a/src/main/java/exchange/core2/revelator/raft/demo/CustomRsmCommand.java 
b/src/main/java/exchange/core2/revelator/raft/demo/CustomRsmCommand.java new file mode 100644 index 0000000..0cea635 --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/demo/CustomRsmCommand.java @@ -0,0 +1,24 @@ +package exchange.core2.revelator.raft.demo; + +import exchange.core2.revelator.raft.messages.RsmRequest; + +import java.nio.ByteBuffer; + +public record CustomRsmCommand(long data) implements RsmRequest { + + @Override + public void serialize(ByteBuffer buffer) { + buffer.putLong(data); + } + + @Override + public String toString() { + return "CRC{" + + "data=" + data + + '}'; + } + + public static CustomRsmCommand create(ByteBuffer buffer) { + return new CustomRsmCommand(buffer.getLong()); + } +} diff --git a/src/main/java/exchange/core2/revelator/raft/demo/CustomRsmResponse.java b/src/main/java/exchange/core2/revelator/raft/demo/CustomRsmResponse.java new file mode 100644 index 0000000..ea03f2d --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/demo/CustomRsmResponse.java @@ -0,0 +1,24 @@ +package exchange.core2.revelator.raft.demo; + +import exchange.core2.revelator.raft.messages.RsmResponse; + +import java.nio.ByteBuffer; + +public record CustomRsmResponse(int hash) implements RsmResponse { + + @Override + public void serialize(ByteBuffer buffer) { + buffer.putInt(hash); + } + + @Override + public String toString() { + return "CRR{" + + "hash=" + hash + + '}'; + } + + public static CustomRsmResponse create(ByteBuffer buffer) { + return new CustomRsmResponse(buffer.getInt()); + } +} diff --git a/src/main/java/exchange/core2/revelator/raft/RaftClient.java b/src/main/java/exchange/core2/revelator/raft/demo/RaftClient.java similarity index 89% rename from src/main/java/exchange/core2/revelator/raft/RaftClient.java rename to src/main/java/exchange/core2/revelator/raft/demo/RaftClient.java index ee79744..62d88d8 100644 --- a/src/main/java/exchange/core2/revelator/raft/RaftClient.java +++ 
b/src/main/java/exchange/core2/revelator/raft/demo/RaftClient.java @@ -1,6 +1,7 @@ -package exchange.core2.revelator.raft; +package exchange.core2.revelator.raft.demo; +import exchange.core2.revelator.raft.RpcClient; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -40,7 +41,7 @@ public void sendEcho(long data) { try { log.info("send >>> data={}", data); final CustomRsmResponse res = rpcClient.callRpcSync(new CustomRsmCommand(data), 500); - log.info("recv <<< hash={}", res.hash); + log.info("recv <<< hash={}", res.hash()); } catch (Exception ex) { log.warn("Exception: ", ex); } diff --git a/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntries.java b/src/main/java/exchange/core2/revelator/raft/messages/CmdRaftAppendEntries.java similarity index 93% rename from src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntries.java rename to src/main/java/exchange/core2/revelator/raft/messages/CmdRaftAppendEntries.java index b8182da..873b700 100644 --- a/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntries.java +++ b/src/main/java/exchange/core2/revelator/raft/messages/CmdRaftAppendEntries.java @@ -1,4 +1,6 @@ -package exchange.core2.revelator.raft; +package exchange.core2.revelator.raft.messages; + +import exchange.core2.revelator.raft.RsmMessageFactory; import java.nio.ByteBuffer; import java.util.ArrayList; @@ -32,7 +34,7 @@ public void serialize(ByteBuffer buffer) { public static CmdRaftAppendEntries create( ByteBuffer buffer, - SerializableMessageFactory factory) { + RsmMessageFactory factory) { final int term = buffer.getInt(); final int leaderId = buffer.getInt(); diff --git a/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntriesResponse.java b/src/main/java/exchange/core2/revelator/raft/messages/CmdRaftAppendEntriesResponse.java similarity index 97% rename from src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntriesResponse.java rename to 
src/main/java/exchange/core2/revelator/raft/messages/CmdRaftAppendEntriesResponse.java index 19ea7fd..21b2520 100644 --- a/src/main/java/exchange/core2/revelator/raft/CmdRaftAppendEntriesResponse.java +++ b/src/main/java/exchange/core2/revelator/raft/messages/CmdRaftAppendEntriesResponse.java @@ -1,4 +1,4 @@ -package exchange.core2.revelator.raft; +package exchange.core2.revelator.raft.messages; import java.nio.ByteBuffer; diff --git a/src/main/java/exchange/core2/revelator/raft/CmdRaftVoteRequest.java b/src/main/java/exchange/core2/revelator/raft/messages/CmdRaftVoteRequest.java similarity index 96% rename from src/main/java/exchange/core2/revelator/raft/CmdRaftVoteRequest.java rename to src/main/java/exchange/core2/revelator/raft/messages/CmdRaftVoteRequest.java index c0af7f7..bd2aad1 100644 --- a/src/main/java/exchange/core2/revelator/raft/CmdRaftVoteRequest.java +++ b/src/main/java/exchange/core2/revelator/raft/messages/CmdRaftVoteRequest.java @@ -1,4 +1,4 @@ -package exchange.core2.revelator.raft; +package exchange.core2.revelator.raft.messages; import java.nio.ByteBuffer; diff --git a/src/main/java/exchange/core2/revelator/raft/CmdRaftVoteResponse.java b/src/main/java/exchange/core2/revelator/raft/messages/CmdRaftVoteResponse.java similarity index 94% rename from src/main/java/exchange/core2/revelator/raft/CmdRaftVoteResponse.java rename to src/main/java/exchange/core2/revelator/raft/messages/CmdRaftVoteResponse.java index c2e17c7..d9dcd59 100644 --- a/src/main/java/exchange/core2/revelator/raft/CmdRaftVoteResponse.java +++ b/src/main/java/exchange/core2/revelator/raft/messages/CmdRaftVoteResponse.java @@ -1,4 +1,4 @@ -package exchange.core2.revelator.raft; +package exchange.core2.revelator.raft.messages; import java.nio.ByteBuffer; diff --git a/src/main/java/exchange/core2/revelator/raft/CustomCommandRequest.java b/src/main/java/exchange/core2/revelator/raft/messages/CustomCommandRequest.java similarity index 74% rename from 
src/main/java/exchange/core2/revelator/raft/CustomCommandRequest.java rename to src/main/java/exchange/core2/revelator/raft/messages/CustomCommandRequest.java index 530d2ab..d223338 100644 --- a/src/main/java/exchange/core2/revelator/raft/CustomCommandRequest.java +++ b/src/main/java/exchange/core2/revelator/raft/messages/CustomCommandRequest.java @@ -1,4 +1,6 @@ -package exchange.core2.revelator.raft; +package exchange.core2.revelator.raft.messages; + +import exchange.core2.revelator.raft.RsmMessageFactory; import java.nio.ByteBuffer; @@ -16,7 +18,7 @@ public void serialize(ByteBuffer buffer) { rsmRequest.serialize(buffer); } - public static CustomCommandRequest create(ByteBuffer buffer, SerializableMessageFactory factory) { + public static CustomCommandRequest create(ByteBuffer buffer, RsmMessageFactory factory) { return new CustomCommandRequest<>(factory.createRequest(buffer)); } diff --git a/src/main/java/exchange/core2/revelator/raft/CustomCommandResponse.java b/src/main/java/exchange/core2/revelator/raft/messages/CustomCommandResponse.java similarity index 83% rename from src/main/java/exchange/core2/revelator/raft/CustomCommandResponse.java rename to src/main/java/exchange/core2/revelator/raft/messages/CustomCommandResponse.java index 581b5e4..5345333 100644 --- a/src/main/java/exchange/core2/revelator/raft/CustomCommandResponse.java +++ b/src/main/java/exchange/core2/revelator/raft/messages/CustomCommandResponse.java @@ -1,4 +1,6 @@ -package exchange.core2.revelator.raft; +package exchange.core2.revelator.raft.messages; + +import exchange.core2.revelator.raft.RsmMessageFactory; import java.nio.ByteBuffer; @@ -18,7 +20,7 @@ public void serialize(ByteBuffer buffer) { rsmResponse.serialize(buffer); } - public static CustomCommandResponse create(ByteBuffer buffer, SerializableMessageFactory factory) { + public static CustomCommandResponse create(ByteBuffer buffer, RsmMessageFactory factory) { final int leaderNodeId = buffer.getInt(); final boolean success = 
buffer.getInt() == 1; diff --git a/src/main/java/exchange/core2/revelator/raft/RaftLogEntry.java b/src/main/java/exchange/core2/revelator/raft/messages/RaftLogEntry.java similarity index 66% rename from src/main/java/exchange/core2/revelator/raft/RaftLogEntry.java rename to src/main/java/exchange/core2/revelator/raft/messages/RaftLogEntry.java index 3dc031a..71a1229 100644 --- a/src/main/java/exchange/core2/revelator/raft/RaftLogEntry.java +++ b/src/main/java/exchange/core2/revelator/raft/messages/RaftLogEntry.java @@ -1,22 +1,13 @@ -package exchange.core2.revelator.raft; +package exchange.core2.revelator.raft.messages; + +import exchange.core2.revelator.raft.RsmMessageFactory; import java.nio.ByteBuffer; /** * each entry contains command for state machine, and term when entry was received by leader */ -public class RaftLogEntry { - - // term when entry was received by leader - public final int term; - - // command - public final T cmd; - - public RaftLogEntry(int term, T cmd) { - this.term = term; - this.cmd = cmd; - } +public record RaftLogEntry(int term, T cmd) { public void serialize(ByteBuffer buffer) { buffer.putInt(term); @@ -32,7 +23,7 @@ public String toString() { } public static RaftLogEntry create(ByteBuffer buffer, - SerializableMessageFactory factory) { + RsmMessageFactory factory) { final int term = buffer.getInt(); final T cmd = factory.createRequest(buffer); return new RaftLogEntry(term, cmd); diff --git a/src/main/java/exchange/core2/revelator/raft/RpcMessage.java b/src/main/java/exchange/core2/revelator/raft/messages/RpcMessage.java similarity index 86% rename from src/main/java/exchange/core2/revelator/raft/RpcMessage.java rename to src/main/java/exchange/core2/revelator/raft/messages/RpcMessage.java index 9830cff..e8a4993 100644 --- a/src/main/java/exchange/core2/revelator/raft/RpcMessage.java +++ b/src/main/java/exchange/core2/revelator/raft/messages/RpcMessage.java @@ -1,4 +1,4 @@ -package exchange.core2.revelator.raft; +package 
exchange.core2.revelator.raft.messages; import java.nio.ByteBuffer; diff --git a/src/main/java/exchange/core2/revelator/raft/RpcRequest.java b/src/main/java/exchange/core2/revelator/raft/messages/RpcRequest.java similarity index 52% rename from src/main/java/exchange/core2/revelator/raft/RpcRequest.java rename to src/main/java/exchange/core2/revelator/raft/messages/RpcRequest.java index 829a924..c56b61a 100644 --- a/src/main/java/exchange/core2/revelator/raft/RpcRequest.java +++ b/src/main/java/exchange/core2/revelator/raft/messages/RpcRequest.java @@ -1,4 +1,4 @@ -package exchange.core2.revelator.raft; +package exchange.core2.revelator.raft.messages; public interface RpcRequest extends RpcMessage { diff --git a/src/main/java/exchange/core2/revelator/raft/RpcResponse.java b/src/main/java/exchange/core2/revelator/raft/messages/RpcResponse.java similarity index 52% rename from src/main/java/exchange/core2/revelator/raft/RpcResponse.java rename to src/main/java/exchange/core2/revelator/raft/messages/RpcResponse.java index ec66efd..55ad821 100644 --- a/src/main/java/exchange/core2/revelator/raft/RpcResponse.java +++ b/src/main/java/exchange/core2/revelator/raft/messages/RpcResponse.java @@ -1,4 +1,4 @@ -package exchange.core2.revelator.raft; +package exchange.core2.revelator.raft.messages; public interface RpcResponse extends RpcMessage { diff --git a/src/main/java/exchange/core2/revelator/raft/RsmRequest.java b/src/main/java/exchange/core2/revelator/raft/messages/RsmRequest.java similarity index 57% rename from src/main/java/exchange/core2/revelator/raft/RsmRequest.java rename to src/main/java/exchange/core2/revelator/raft/messages/RsmRequest.java index e18b252..8d3d91c 100644 --- a/src/main/java/exchange/core2/revelator/raft/RsmRequest.java +++ b/src/main/java/exchange/core2/revelator/raft/messages/RsmRequest.java @@ -1,4 +1,4 @@ -package exchange.core2.revelator.raft; +package exchange.core2.revelator.raft.messages; public interface RsmRequest extends 
SerializableMessage { diff --git a/src/main/java/exchange/core2/revelator/raft/RsmResponse.java b/src/main/java/exchange/core2/revelator/raft/messages/RsmResponse.java similarity index 57% rename from src/main/java/exchange/core2/revelator/raft/RsmResponse.java rename to src/main/java/exchange/core2/revelator/raft/messages/RsmResponse.java index a9a8aeb..df21fcc 100644 --- a/src/main/java/exchange/core2/revelator/raft/RsmResponse.java +++ b/src/main/java/exchange/core2/revelator/raft/messages/RsmResponse.java @@ -1,4 +1,4 @@ -package exchange.core2.revelator.raft; +package exchange.core2.revelator.raft.messages; public interface RsmResponse extends SerializableMessage { diff --git a/src/main/java/exchange/core2/revelator/raft/SerializableMessage.java b/src/main/java/exchange/core2/revelator/raft/messages/SerializableMessage.java similarity index 70% rename from src/main/java/exchange/core2/revelator/raft/SerializableMessage.java rename to src/main/java/exchange/core2/revelator/raft/messages/SerializableMessage.java index cdd70a2..394fa07 100644 --- a/src/main/java/exchange/core2/revelator/raft/SerializableMessage.java +++ b/src/main/java/exchange/core2/revelator/raft/messages/SerializableMessage.java @@ -1,4 +1,4 @@ -package exchange.core2.revelator.raft; +package exchange.core2.revelator.raft.messages; import java.nio.ByteBuffer; From 83674ce81e330c860c3c21a40be9b557d81ebdc7 Mon Sep 17 00:00:00 2001 From: Maksim Zheravin Date: Sun, 6 Feb 2022 23:44:14 +0200 Subject: [PATCH 13/15] RAFT: Added Disk repository implementation, refactoring, api is broken --- pom.xml | 12 + .../core2/revelator/raft/RaftNode.java | 39 +- .../core2/revelator/raft/RaftUtils.java | 5 +- .../raft/ReplicatedStateMachine.java | 18 +- .../core2/revelator/raft/RpcClient.java | 4 +- .../core2/revelator/raft/RpcHandler.java | 6 +- .../core2/revelator/raft/RpcService.java | 13 +- .../revelator/raft/RsmRequestFactory.java | 15 + ...geFactory.java => RsmResponseFactory.java} | 5 +- 
.../core2/revelator/raft/demo/CustomNode.java | 2 +- .../core2/revelator/raft/demo/CustomRsm.java | 23 +- .../revelator/raft/demo/CustomRsmCommand.java | 7 + .../raft/messages/CmdRaftAppendEntries.java | 4 +- .../raft/messages/CustomCommandRequest.java | 16 +- .../raft/messages/CustomCommandResponse.java | 4 +- .../revelator/raft/messages/RaftLogEntry.java | 22 +- .../raft/repository/IRaftLogRepository.java | 44 ++ .../raft/repository/JournalDescriptor.java | 21 + .../repository/RaftDiskLogRepository.java | 485 ++++++++++++++++++ .../RaftMemLogRepository.java} | 58 ++- .../raft/repository/SnapshotDescriptor.java | 58 +++ 21 files changed, 791 insertions(+), 70 deletions(-) create mode 100644 src/main/java/exchange/core2/revelator/raft/RsmRequestFactory.java rename src/main/java/exchange/core2/revelator/raft/{RsmMessageFactory.java => RsmResponseFactory.java} (52%) create mode 100644 src/main/java/exchange/core2/revelator/raft/repository/IRaftLogRepository.java create mode 100644 src/main/java/exchange/core2/revelator/raft/repository/JournalDescriptor.java create mode 100644 src/main/java/exchange/core2/revelator/raft/repository/RaftDiskLogRepository.java rename src/main/java/exchange/core2/revelator/raft/{RaftLogRepository.java => repository/RaftMemLogRepository.java} (72%) create mode 100644 src/main/java/exchange/core2/revelator/raft/repository/SnapshotDescriptor.java diff --git a/pom.xml b/pom.xml index c9b5fd7..ebcbfe8 100644 --- a/pom.xml +++ b/pom.xml @@ -269,6 +269,18 @@ commons-math3 + + org.lz4 + lz4-java + 1.8.0 + + + + net.openhft + chronicle-wire + 2.19.32 + + junit junit diff --git a/src/main/java/exchange/core2/revelator/raft/RaftNode.java b/src/main/java/exchange/core2/revelator/raft/RaftNode.java index a800b1b..881bc87 100644 --- a/src/main/java/exchange/core2/revelator/raft/RaftNode.java +++ b/src/main/java/exchange/core2/revelator/raft/RaftNode.java @@ -2,6 +2,8 @@ import exchange.core2.revelator.raft.messages.*; +import 
exchange.core2.revelator.raft.repository.IRaftLogRepository; +import exchange.core2.revelator.raft.repository.RaftMemLogRepository; import org.eclipse.collections.impl.map.mutable.primitive.LongObjectHashMap; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -21,6 +23,7 @@ public class RaftNode { public static final int ELECTION_TIMEOUT_MAX_MS = 2800; public static final int APPEND_REPLY_TIMEOUT_MAX_MS = 1000; + public static final int TRANSFER_ITEMS_NUM_LIMIT = 10; /* **** Persistent state on all servers: (Updated on stable storage before responding to RPCs) */ @@ -31,7 +34,7 @@ public class RaftNode { private int votedFor = -1; // log entries; each entry contains command for state machine, and term when entry was received by leader (first index is 1) - private final RaftLogRepository logRepository = new RaftLogRepository<>(); + private final IRaftLogRepository logRepository = new RaftMemLogRepository<>(); /* **** Volatile state on all servers: */ @@ -85,7 +88,8 @@ public class RaftNode { public RaftNode(int thisNodeId, ReplicatedStateMachine rsm, - RsmMessageFactory msgFactory) { + RsmRequestFactory msgFactory, + RsmResponseFactory respFactory) { // localhost:3778, localhost:3779, localhost:3780 final Map remoteNodes = Map.of( @@ -250,7 +254,7 @@ public void handleNodeResponse(int fromNodeId, RpcResponse resp, long correlatio if (matchIndex[fromNodeId] > commitIndex) { log.debug("lastEntryInTerm({}, {}, {});", commitIndex, matchIndex[fromNodeId], currentTerm); - final long lastEntryInTerm = logRepository.lastEntryInTerm(commitIndex, matchIndex[fromNodeId], currentTerm); + final long lastEntryInTerm = logRepository.findLastEntryInTerm(commitIndex, matchIndex[fromNodeId], currentTerm); final long newCommitIndex = Math.max( Math.max(commitIndex, matchIndex[fromNodeId]), @@ -285,6 +289,7 @@ public void handleNodeResponse(int fromNodeId, RpcResponse resp, long correlatio public CustomCommandResponse handleClientRequest(final InetAddress address, final int 
port, final long correlationId, + final long timeReceived, final CustomCommandRequest request) { synchronized (this) { @@ -293,8 +298,8 @@ public CustomCommandResponse handleClientRequest(final InetAddress address, // respond after entry applied to state machine (5.3) // adding new record into the local log - final RaftLogEntry logEntry = new RaftLogEntry<>(currentTerm, request.rsmRequest()); - final long index = logRepository.append(logEntry); + final RaftLogEntry logEntry = new RaftLogEntry<>(currentTerm, request.rsmRequest(), timeReceived); + final long index = logRepository.appendEntry(logEntry, true); // remember client request (TODO !! on batch migration - should refer to the last record) clientResponsesMap.put(index, new ClientAddress(address, port, correlationId)); @@ -302,7 +307,7 @@ public CustomCommandResponse handleClientRequest(final InetAddress address, } else { log.debug("Redirecting client to leader nodeId={}", votedFor); // inform client about different leader - return new CustomCommandResponse<>(msgFactory.emptyResponse(), votedFor, false); + return new CustomCommandResponse<>(respFactory.emptyResponse(), votedFor, false); } } @@ -312,7 +317,7 @@ public CustomCommandResponse handleClientRequest(final InetAddress address, }; // todo remove from constructor - rpcService = new RpcService<>(remoteNodes, handler, msgFactory, thisNodeId); + rpcService = new RpcService<>(remoteNodes, handler, msgFactory, respFactory, thisNodeId); log.info("HEARTBEAT_TIMEOUT_MS={}", HEARTBEAT_TIMEOUT_MS); log.info("ELECTION_TIMEOUT_MS={}..{}", ELECTION_TIMEOUT_MIN_MS, ELECTION_TIMEOUT_MAX_MS); @@ -376,8 +381,13 @@ private void workerThread() { if (canRetry || timeToSendHeartbeat) { - final List> newEntries = logRepository.getEntriesStartingFrom(nextIndexForNode); - final int prevLogTerm = logRepository.getEntryOpt(nextIndexForNode - 1).map(RaftLogEntry::term).orElse(0); + final List> newEntries = logRepository.getEntries(nextIndexForNode, TRANSFER_ITEMS_NUM_LIMIT); + + // 
avoid additional request + final int prevLogTerm = logRepository.getEntries(nextIndexForNode - 1, 1).stream() + .findFirst() + .map(RaftLogEntry::term) + .orElse(0); log.debug("node {} : nextIndexForNode={} newEntries={} prevLogTerm={}", targetNodeId, nextIndexForNode, newEntries, prevLogTerm); @@ -499,9 +509,18 @@ private void applyPendingEntriesToStateMachine() { /* All Servers: If commitIndex > lastApplied: increment lastApplied, apply log[lastApplied] to state machine (5.3) */ + + // TODO request range (up to commitIndex) + final List> entries = logRepository.getEntries(lastApplied, Integer.MAX_VALUE); + int idx = 0; + while (lastApplied < commitIndex) { lastApplied++; - final RaftLogEntry raftLogEntry = logRepository.getEntry(lastApplied); +// final RaftLogEntry raftLogEntry = logRepository.getEntryOpt(lastApplied) +// .orElseThrow(() -> new RuntimeException("Can not find pending entry index=" + lastApplied + " in the repository")); + + final RaftLogEntry raftLogEntry = entries.get(idx++); + log.debug("Applying to RSM: {}", raftLogEntry); final S result = rsm.applyCommand(raftLogEntry.cmd()); diff --git a/src/main/java/exchange/core2/revelator/raft/RaftUtils.java b/src/main/java/exchange/core2/revelator/raft/RaftUtils.java index 4ed6896..e5a3628 100644 --- a/src/main/java/exchange/core2/revelator/raft/RaftUtils.java +++ b/src/main/java/exchange/core2/revelator/raft/RaftUtils.java @@ -13,7 +13,8 @@ public class RaftUtils { public static RpcMessage createMessageByType( int messageType, ByteBuffer buffer, - RsmMessageFactory factory) { + RsmRequestFactory factory, + RsmResponseFactory responseFactory) { return switch (messageType) { case RpcMessage.REQUEST_APPEND_ENTRIES -> CmdRaftAppendEntries.create(buffer, factory); @@ -21,7 +22,7 @@ public static RpcMessage createMes case RpcMessage.REQUEST_VOTE -> CmdRaftVoteRequest.create(buffer); case RpcMessage.RESPONSE_VOTE -> CmdRaftVoteResponse.create(buffer); case RpcMessage.REQUEST_CUSTOM -> 
CustomCommandRequest.create(buffer, factory); - case RpcMessage.RESPONSE_CUSTOM -> CustomCommandResponse.create(buffer, factory); + case RpcMessage.RESPONSE_CUSTOM -> CustomCommandResponse.create(buffer, responseFactory); default -> throw new IllegalArgumentException("Unknown messageType: " + messageType); }; } diff --git a/src/main/java/exchange/core2/revelator/raft/ReplicatedStateMachine.java b/src/main/java/exchange/core2/revelator/raft/ReplicatedStateMachine.java index 4cf0edb..3c40b81 100644 --- a/src/main/java/exchange/core2/revelator/raft/ReplicatedStateMachine.java +++ b/src/main/java/exchange/core2/revelator/raft/ReplicatedStateMachine.java @@ -2,14 +2,26 @@ import exchange.core2.revelator.raft.messages.RsmRequest; import exchange.core2.revelator.raft.messages.RsmResponse; +import net.openhft.chronicle.bytes.WriteBytesMarshallable; -public interface ReplicatedStateMachine { +public interface ReplicatedStateMachine extends WriteBytesMarshallable { + /** + * Changes state of Replicated State Machine + * + * @param command command + * @return result + */ S applyCommand(T command); // TODO query - S applyQuery(T query); - S getState(); + /** + * Execute a query that does not change the state + * + * @param query query + * @return query result + */ + S applyQuery(T query); } diff --git a/src/main/java/exchange/core2/revelator/raft/RpcClient.java b/src/main/java/exchange/core2/revelator/raft/RpcClient.java index 2093652..c6f304e 100644 --- a/src/main/java/exchange/core2/revelator/raft/RpcClient.java +++ b/src/main/java/exchange/core2/revelator/raft/RpcClient.java @@ -30,7 +30,7 @@ public class RpcClient { private final AtomicLong correlationIdCounter = new AtomicLong(1L); private final Map>> futureMap = new ConcurrentHashMap<>(); private final Map socketMap; - private final RsmMessageFactory msgFactory; + private final RsmResponseFactory msgFactory; private volatile int leaderNodeId = 0; @@ -40,7 +40,7 @@ public class RpcClient { public RpcClient(final Map 
remoteNodes, - final RsmMessageFactory msgFactory) { + final RsmResponseFactory msgFactory) { this.socketMap = RaftUtils.createHostMap(remoteNodes); this.msgFactory = msgFactory; diff --git a/src/main/java/exchange/core2/revelator/raft/RpcHandler.java b/src/main/java/exchange/core2/revelator/raft/RpcHandler.java index 5825a5c..0f84586 100644 --- a/src/main/java/exchange/core2/revelator/raft/RpcHandler.java +++ b/src/main/java/exchange/core2/revelator/raft/RpcHandler.java @@ -10,6 +10,10 @@ public interface RpcHandler { void handleNodeResponse(int nodeId, RpcResponse response, long correlationId); - CustomCommandResponse handleClientRequest(InetAddress address, int port, long correlationId, CustomCommandRequest request); + CustomCommandResponse handleClientRequest(InetAddress address, + int port, + long correlationId, + long timeReceived, + CustomCommandRequest request); } diff --git a/src/main/java/exchange/core2/revelator/raft/RpcService.java b/src/main/java/exchange/core2/revelator/raft/RpcService.java index 30b7007..62fac90 100644 --- a/src/main/java/exchange/core2/revelator/raft/RpcService.java +++ b/src/main/java/exchange/core2/revelator/raft/RpcService.java @@ -23,7 +23,8 @@ public class RpcService implements private final int serverPort; private final int serverNodeId; private final RpcHandler handler; - private final RsmMessageFactory msgFactory; + private final RsmRequestFactory msgFactory; + private final RsmResponseFactory responseFactory; private final DatagramSocket serverSocket; @@ -31,15 +32,16 @@ public class RpcService implements public RpcService(Map remoteNodes, RpcHandler handler, - RsmMessageFactory msgFactory, + RsmRequestFactory msgFactory, + RsmResponseFactory responseFactory, int serverNodeId) { this.socketMap = RaftUtils.createHostMap(remoteNodes); - ; this.handler = handler; this.serverPort = socketMap.get(serverNodeId).port; this.serverNodeId = serverNodeId; this.msgFactory = msgFactory; + this.responseFactory = responseFactory; try { 
this.serverSocket = new DatagramSocket(serverPort); @@ -81,7 +83,7 @@ public void run() { logger.debug("RECEIVED from {} mt={}: {}", nodeId, messageType, PrintBufferUtil.hexDump(receivePacket.getData(), 0, receivePacket.getLength())); - final RpcMessage msg = RaftUtils.createMessageByType(messageType, bb, msgFactory); + final RpcMessage msg = RaftUtils.createMessageByType(messageType, bb, msgFactory, responseFactory); // TODO use msgFactory if (messageType < 0) { @@ -106,10 +108,13 @@ public void run() { final CustomCommandRequest msgT = (CustomCommandRequest) msg; + final long timeReceived = System.currentTimeMillis(); + final CustomCommandResponse response = handler.handleClientRequest( address, port, correlationId, + timeReceived, msgT); if (response != null) { diff --git a/src/main/java/exchange/core2/revelator/raft/RsmRequestFactory.java b/src/main/java/exchange/core2/revelator/raft/RsmRequestFactory.java new file mode 100644 index 0000000..97c5e94 --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/RsmRequestFactory.java @@ -0,0 +1,15 @@ +package exchange.core2.revelator.raft; + +import exchange.core2.revelator.raft.messages.RsmRequest; + +import java.io.DataInputStream; +import java.io.IOException; +import java.nio.ByteBuffer; + +public interface RsmRequestFactory { + + T createRequest(ByteBuffer buffer); + + T createRequest(DataInputStream dis) throws IOException; + +} diff --git a/src/main/java/exchange/core2/revelator/raft/RsmMessageFactory.java b/src/main/java/exchange/core2/revelator/raft/RsmResponseFactory.java similarity index 52% rename from src/main/java/exchange/core2/revelator/raft/RsmMessageFactory.java rename to src/main/java/exchange/core2/revelator/raft/RsmResponseFactory.java index e7bfac4..f0b91bf 100644 --- a/src/main/java/exchange/core2/revelator/raft/RsmMessageFactory.java +++ b/src/main/java/exchange/core2/revelator/raft/RsmResponseFactory.java @@ -1,13 +1,10 @@ package exchange.core2.revelator.raft; -import 
exchange.core2.revelator.raft.messages.RsmRequest; import exchange.core2.revelator.raft.messages.RsmResponse; import java.nio.ByteBuffer; -public interface RsmMessageFactory { - - T createRequest(ByteBuffer buffer); +public interface RsmResponseFactory { S createResponse(ByteBuffer buffer); diff --git a/src/main/java/exchange/core2/revelator/raft/demo/CustomNode.java b/src/main/java/exchange/core2/revelator/raft/demo/CustomNode.java index 1fd5f66..0ad6f4d 100644 --- a/src/main/java/exchange/core2/revelator/raft/demo/CustomNode.java +++ b/src/main/java/exchange/core2/revelator/raft/demo/CustomNode.java @@ -10,7 +10,7 @@ public static void main(String[] args) { final CustomRsm customRsm = new CustomRsm(); - new RaftNode<>(thisNodeId, customRsm, customRsm); + new RaftNode<>(thisNodeId, customRsm, customRsm, customRsm); } } diff --git a/src/main/java/exchange/core2/revelator/raft/demo/CustomRsm.java b/src/main/java/exchange/core2/revelator/raft/demo/CustomRsm.java index acb7682..68b29ef 100644 --- a/src/main/java/exchange/core2/revelator/raft/demo/CustomRsm.java +++ b/src/main/java/exchange/core2/revelator/raft/demo/CustomRsm.java @@ -1,14 +1,19 @@ package exchange.core2.revelator.raft.demo; import exchange.core2.revelator.raft.ReplicatedStateMachine; -import exchange.core2.revelator.raft.RsmMessageFactory; +import exchange.core2.revelator.raft.RsmRequestFactory; +import exchange.core2.revelator.raft.RsmResponseFactory; +import net.openhft.chronicle.bytes.BytesOut; import org.agrona.collections.Hashing; +import java.io.DataInputStream; +import java.io.IOException; import java.nio.ByteBuffer; public class CustomRsm implements ReplicatedStateMachine, - RsmMessageFactory { + RsmRequestFactory, + RsmResponseFactory { public static final CustomRsmResponse EMPTY_RSM_RESPONSE = new CustomRsmResponse(0); @@ -28,13 +33,14 @@ public CustomRsmResponse applyQuery(CustomRsmCommand query) { } @Override - public CustomRsmResponse getState() { - return new CustomRsmResponse(hash); + 
public CustomRsmCommand createRequest(ByteBuffer buffer) { + return CustomRsmCommand.create(buffer); } @Override - public CustomRsmCommand createRequest(ByteBuffer buffer) { - return CustomRsmCommand.create(buffer); + public CustomRsmCommand createRequest(DataInputStream dis) throws IOException { + + return CustomRsmCommand.create(dis); } @Override @@ -46,4 +52,9 @@ public CustomRsmResponse createResponse(ByteBuffer buffer) { public CustomRsmResponse emptyResponse() { return EMPTY_RSM_RESPONSE; } + + @Override + public void writeMarshallable(BytesOut bytes) { + bytes.append(hash); + } } diff --git a/src/main/java/exchange/core2/revelator/raft/demo/CustomRsmCommand.java b/src/main/java/exchange/core2/revelator/raft/demo/CustomRsmCommand.java index 0cea635..f26295f 100644 --- a/src/main/java/exchange/core2/revelator/raft/demo/CustomRsmCommand.java +++ b/src/main/java/exchange/core2/revelator/raft/demo/CustomRsmCommand.java @@ -2,6 +2,8 @@ import exchange.core2.revelator.raft.messages.RsmRequest; +import java.io.DataInputStream; +import java.io.IOException; import java.nio.ByteBuffer; public record CustomRsmCommand(long data) implements RsmRequest { @@ -21,4 +23,9 @@ public String toString() { public static CustomRsmCommand create(ByteBuffer buffer) { return new CustomRsmCommand(buffer.getLong()); } + + public static CustomRsmCommand create(DataInputStream dis) throws IOException { + return new CustomRsmCommand(dis.readLong()); + } + } diff --git a/src/main/java/exchange/core2/revelator/raft/messages/CmdRaftAppendEntries.java b/src/main/java/exchange/core2/revelator/raft/messages/CmdRaftAppendEntries.java index 873b700..61a436f 100644 --- a/src/main/java/exchange/core2/revelator/raft/messages/CmdRaftAppendEntries.java +++ b/src/main/java/exchange/core2/revelator/raft/messages/CmdRaftAppendEntries.java @@ -1,6 +1,6 @@ package exchange.core2.revelator.raft.messages; -import exchange.core2.revelator.raft.RsmMessageFactory; +import 
exchange.core2.revelator.raft.RsmRequestFactory; import java.nio.ByteBuffer; import java.util.ArrayList; @@ -34,7 +34,7 @@ public void serialize(ByteBuffer buffer) { public static CmdRaftAppendEntries create( ByteBuffer buffer, - RsmMessageFactory factory) { + RsmRequestFactory factory) { final int term = buffer.getInt(); final int leaderId = buffer.getInt(); diff --git a/src/main/java/exchange/core2/revelator/raft/messages/CustomCommandRequest.java b/src/main/java/exchange/core2/revelator/raft/messages/CustomCommandRequest.java index d223338..53e420e 100644 --- a/src/main/java/exchange/core2/revelator/raft/messages/CustomCommandRequest.java +++ b/src/main/java/exchange/core2/revelator/raft/messages/CustomCommandRequest.java @@ -1,7 +1,9 @@ package exchange.core2.revelator.raft.messages; -import exchange.core2.revelator.raft.RsmMessageFactory; +import exchange.core2.revelator.raft.RsmRequestFactory; +import java.io.DataInputStream; +import java.io.IOException; import java.nio.ByteBuffer; // TODO support batching !! 
@@ -18,8 +20,18 @@ public void serialize(ByteBuffer buffer) { rsmRequest.serialize(buffer); } - public static CustomCommandRequest create(ByteBuffer buffer, RsmMessageFactory factory) { + public static CustomCommandRequest create( + final ByteBuffer buffer, + final RsmRequestFactory factory) { return new CustomCommandRequest<>(factory.createRequest(buffer)); } + + + public static CustomCommandRequest create( + final DataInputStream dis, + final RsmRequestFactory factory) throws IOException { + + return new CustomCommandRequest<>(factory.createRequest(dis)); + } } diff --git a/src/main/java/exchange/core2/revelator/raft/messages/CustomCommandResponse.java b/src/main/java/exchange/core2/revelator/raft/messages/CustomCommandResponse.java index 5345333..372cdb3 100644 --- a/src/main/java/exchange/core2/revelator/raft/messages/CustomCommandResponse.java +++ b/src/main/java/exchange/core2/revelator/raft/messages/CustomCommandResponse.java @@ -1,6 +1,6 @@ package exchange.core2.revelator.raft.messages; -import exchange.core2.revelator.raft.RsmMessageFactory; +import exchange.core2.revelator.raft.RsmResponseFactory; import java.nio.ByteBuffer; @@ -20,7 +20,7 @@ public void serialize(ByteBuffer buffer) { rsmResponse.serialize(buffer); } - public static CustomCommandResponse create(ByteBuffer buffer, RsmMessageFactory factory) { + public static CustomCommandResponse create(ByteBuffer buffer, RsmResponseFactory factory) { final int leaderNodeId = buffer.getInt(); final boolean success = buffer.getInt() == 1; diff --git a/src/main/java/exchange/core2/revelator/raft/messages/RaftLogEntry.java b/src/main/java/exchange/core2/revelator/raft/messages/RaftLogEntry.java index 71a1229..20153af 100644 --- a/src/main/java/exchange/core2/revelator/raft/messages/RaftLogEntry.java +++ b/src/main/java/exchange/core2/revelator/raft/messages/RaftLogEntry.java @@ -1,31 +1,43 @@ package exchange.core2.revelator.raft.messages; -import exchange.core2.revelator.raft.RsmMessageFactory; +import 
exchange.core2.revelator.raft.RsmRequestFactory; +import java.io.DataInputStream; +import java.io.IOException; import java.nio.ByteBuffer; /** * each entry contains command for state machine, and term when entry was received by leader */ -public record RaftLogEntry(int term, T cmd) { +public record RaftLogEntry(int term, T cmd, long timestamp) { public void serialize(ByteBuffer buffer) { buffer.putInt(term); + buffer.putLong(timestamp); cmd.serialize(buffer); } @Override public String toString() { return "RLE{" + - "t" + term + + "term=" + term + " cmd=" + cmd + '}'; } public static RaftLogEntry create(ByteBuffer buffer, - RsmMessageFactory factory) { + RsmRequestFactory factory) { final int term = buffer.getInt(); + final long timestamp = buffer.getLong(); final T cmd = factory.createRequest(buffer); - return new RaftLogEntry(term, cmd); + return new RaftLogEntry<>(term, cmd, timestamp); + } + + public static RaftLogEntry create(DataInputStream dis, + RsmRequestFactory factory) throws IOException { + final int term = dis.readInt(); + final long timestamp = dis.readLong(); + final T cmd = factory.createRequest(dis); + return new RaftLogEntry<>(term, cmd, timestamp); } } diff --git a/src/main/java/exchange/core2/revelator/raft/repository/IRaftLogRepository.java b/src/main/java/exchange/core2/revelator/raft/repository/IRaftLogRepository.java new file mode 100644 index 0000000..ddd5773 --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/repository/IRaftLogRepository.java @@ -0,0 +1,44 @@ +package exchange.core2.revelator.raft.repository; + +import exchange.core2.revelator.raft.messages.RaftLogEntry; +import exchange.core2.revelator.raft.messages.RsmRequest; + +import java.util.List; + +/** + * RAFT log persistent storage repository + * + * @param - request records type for particular Replicated State Machine + */ +public interface IRaftLogRepository extends AutoCloseable { + + long getLastLogIndex(); + + int getLastLogTerm(); + + /** + * Get entry from log 
+ * + * @param indexFrom - RAFT record index (starting from 1) + * @param limit - max number of entries to retrieve + * @return records (if found) + */ + List> getEntries(long indexFrom, int limit); + + + long findLastEntryInTerm(long indexAfter, long indexBeforeIncl, int term); + + /** + * Append single entry + * + * @param logEntry - RAFT Replicated State Machine entry + * @param endOfBatch - force writing to disk + * @return index of added entry + */ + long appendEntry(RaftLogEntry logEntry, boolean endOfBatch); + + + void appendOrOverride(List> newEntries, long prevLogIndex); + + +} diff --git a/src/main/java/exchange/core2/revelator/raft/repository/JournalDescriptor.java b/src/main/java/exchange/core2/revelator/raft/repository/JournalDescriptor.java new file mode 100644 index 0000000..f6c65a1 --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/repository/JournalDescriptor.java @@ -0,0 +1,21 @@ +package exchange.core2.revelator.raft.repository; + +public class JournalDescriptor { + + private final long timestamp; + private final long seqFirst; + private long seqLast = -1; // -1 if not finished yet + + private final SnapshotDescriptor baseSnapshot; + + private final JournalDescriptor prev; // can be null + + private JournalDescriptor next = null; // can be null + + public JournalDescriptor(long timestamp, long seqFirst, SnapshotDescriptor baseSnapshot, JournalDescriptor prev) { + this.timestamp = timestamp; + this.seqFirst = seqFirst; + this.baseSnapshot = baseSnapshot; + this.prev = prev; + } +} diff --git a/src/main/java/exchange/core2/revelator/raft/repository/RaftDiskLogRepository.java b/src/main/java/exchange/core2/revelator/raft/repository/RaftDiskLogRepository.java new file mode 100644 index 0000000..0a6de69 --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/repository/RaftDiskLogRepository.java @@ -0,0 +1,485 @@ +package exchange.core2.revelator.raft.repository; + +import exchange.core2.revelator.raft.RsmRequestFactory; +import 
exchange.core2.revelator.raft.messages.RaftLogEntry; +import exchange.core2.revelator.raft.messages.RsmRequest; +import org.agrona.collections.MutableLong; +import org.eclipse.collections.api.tuple.primitive.LongLongPair; +import org.eclipse.collections.impl.tuple.primitive.PrimitiveTuples; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.*; +import java.nio.ByteBuffer; +import java.nio.channels.Channels; +import java.nio.channels.FileChannel; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.*; + +public class RaftDiskLogRepository implements IRaftLogRepository { + + private static final Logger log = LoggerFactory.getLogger(RaftDiskLogRepository.class); + + private final int journalBufferFlushTrigger = 65536; + private final long journalFileMaxSize = 2_000_000_000; + + private final int indexRecordEveryNBytes = 4096; + + + private final String exchangeId = "EC2R-TEST"; + private final Path folder = Path.of("./raftlogs"); + + private RandomAccessFile raf; + private FileChannel writeChannel; + private FileChannel readChannel; + + + private RandomAccessFile indexRaf; + private FileChannel indexWriteChannel; + + // index -> position in file // TODO keep term in the index? + private NavigableMap currentIndex = new TreeMap<>(); // TODO use ART ? 
+ + + private long baseSnapshotId; + + private int filesCounter = 0; + + private long writtenBytes = 0; + private long lastIndexWrittenAt = 0; + + private long lastIndex = 0L; + private int lastLogTerm = 0; + + private final ByteBuffer journalWriteBuffer = ByteBuffer.allocateDirect(512 * 1024); + private final ByteBuffer indexWriteBuffer = ByteBuffer.allocateDirect(512 * 1024); // TODO Limit index size + + + private SnapshotDescriptor lastSnapshotDescriptor = null; // todo implemnt + private JournalDescriptor lastJournalDescriptor; + + + private final RsmRequestFactory rsmRequestFactory; + + public RaftDiskLogRepository(RsmRequestFactory rsmRequestFactory) { + this.rsmRequestFactory = rsmRequestFactory; + } + + + @Override + public long findLastEntryInTerm(long indexAfter, long indexBeforeIncl, int term) { + // TODO Term index + + throw new UnsupportedOperationException(); + + //return 0; + } + + @Override + public long getLastLogIndex() { + return lastIndex; + } + + @Override + public int getLastLogTerm() { + return lastLogTerm; + } + + @Override + public long appendEntry(RaftLogEntry logEntry, boolean endOfBatch) { + + if (writeChannel == null) { + startNewFile(logEntry.timestamp()); + } + + final ByteBuffer buffer = journalWriteBuffer; + + lastIndex++; + lastLogTerm = logEntry.term(); + + buffer.putLong(logEntry.timestamp()); // 8 bytes + buffer.putInt(logEntry.term()); // 4 bytes + logEntry.serialize(buffer); + + + if (endOfBatch || buffer.position() >= journalBufferFlushTrigger) { + + // flushing on end of batch or when buffer is full + flushBufferSync(false, logEntry.timestamp()); + } + + return lastIndex; + } + + @Override + public void appendOrOverride(List> newEntries, long prevLogIndex) { + + log.debug("appendOrOverride(newEntries={} , prevLogIndex={}", newEntries, prevLogIndex); + + try { + + // check for missed records + if (prevLogIndex > lastIndex) { + throw new IllegalStateException("Can not accept prevLogIndex=" + prevLogIndex + " because=" + 
lastIndex); + } + + // check if leader is overriding some records + if (prevLogIndex < lastIndex) { + + // TODO loading just to compare term - can be done faster + final long removeAfter = verifyTerms(newEntries, prevLogIndex); + + if (removeAfter != -1) { + + final long position = findPosition(removeAfter); + log.debug("Removing after position: {}", position); + writeChannel.position(position); + writeChannel.truncate(position); + writtenBytes = position; + + truncateIndexRecords(removeAfter); + + lastIndex = removeAfter; + lastLogTerm = getEntries(lastIndex, 1).get(0).term(); + } + } + + // adding missing records + final int offset = (int) (lastIndex - prevLogIndex); + if (offset > 0) { + final int lastIndex = newEntries.size(); + for (int i = offset; i <= lastIndex; i++) { + appendEntry(newEntries.get(i), i == lastIndex); + } + } + + } catch (IOException ex) { + throw new RuntimeException(ex); + } + } + + private long findPosition(long removeAfter) throws IOException { + + final LongLongPair startingIndexPoint = findStartingIndexPoint(removeAfter); + final long startOffset = startingIndexPoint.getOne(); + final long floorIndex = startingIndexPoint.getTwo(); + + readChannel.position(startOffset); + + long idx = floorIndex; + + try (final InputStream is = Channels.newInputStream(readChannel); + final BufferedInputStream bis = new BufferedInputStream(is); + final DataInputStream dis = new DataInputStream(bis)) { + + while (dis.available() != 0) { + + RaftLogEntry.create(dis, rsmRequestFactory); + + idx++; + + if (idx == removeAfter) { + return readChannel.position(); + } + } + } + + throw new RuntimeException("Can not reach index " + removeAfter); + } + + private void truncateIndexRecords(long removeAfterIndex) throws IOException { + + // clean tail subtree + currentIndex.tailMap(removeAfterIndex, false).clear(); + + final Map.Entry lastIndexEntry = currentIndex.lastEntry(); + + indexWriteChannel.position(0L); + + if (lastIndexEntry == null) { + // empty tree - 
just clean all file + lastIndexWrittenAt = 0L; + indexWriteChannel.truncate(0L); + + } else { + + // set bytes offset to last known value (maybe not very exact?) + lastIndexWrittenAt = lastIndexEntry.getValue(); + + // remove all records after + try (final InputStream is = Channels.newInputStream(indexWriteChannel); + final BufferedInputStream bis = new BufferedInputStream(is); + final DataInputStream dis = new DataInputStream(bis)) { + + while (dis.available() != 0) { + + // read index record (16 bytes) + final long lastIndex = dis.readLong(); + dis.readLong(); + + if (lastIndex > lastIndexEntry.getKey()) { + final long pos = indexWriteChannel.position() - 16; + indexWriteChannel.position(pos); + indexWriteChannel.truncate(pos); + return; + } + } + } + } + } + + private long verifyTerms(List> newEntries, long prevLogIndex) { + final List> existingEntries = getEntries(prevLogIndex, newEntries.size()); + final int intersectionLength = Math.min(existingEntries.size(), newEntries.size()); + + for (int i = 0; i < intersectionLength; i++) { + if (existingEntries.get(i).term() != newEntries.get(i).term()) { + return prevLogIndex + i; + } + } + + return -1; + } + + /** + * @return offset+index to start looking from + */ + private LongLongPair findStartingIndexPoint(long indexFrom) { + final Map.Entry entry = currentIndex.floorEntry(indexFrom); + final long startOffset = (entry == null) ? 0L : entry.getValue(); + final long floorIndex = (entry == null) ? 
1L : entry.getKey(); + return PrimitiveTuples.pair(startOffset, floorIndex); + } + + @Override + public List> getEntries(long indexFrom, int limit) { + + if (indexFrom > lastIndex) { + return List.of(); + } + + final LongLongPair indexStartingIndex = findStartingIndexPoint(indexFrom); + final long startOffset = indexStartingIndex.getOne(); + final long floorIndex = indexStartingIndex.getTwo(); + + try { + log.debug("Reading {} - floor idx:{} offset:{}", indexFrom, floorIndex, startOffset); + readChannel.position(startOffset); + } catch (IOException ex) { + throw new RuntimeException("can not read log at offset " + startOffset, ex); + } + + final List> entries = new ArrayList<>(); + + final MutableLong indexCounter = new MutableLong(floorIndex); + + try (final InputStream is = Channels.newInputStream(readChannel); + final BufferedInputStream bis = new BufferedInputStream(is); + final DataInputStream dis = new DataInputStream(bis)) { + + final boolean allLoaded = readCommands(dis, entries, indexCounter, indexFrom, limit); + if (!allLoaded) { + throw new RuntimeException("not loaded everything"); + } + + return entries; + + } catch (IOException ex) { + throw new RuntimeException(ex); + } finally { + try { + readChannel.position(0); + } catch (IOException e) { + log.error("Can not rewind readChannel position to 0"); + } + } + } + + private void startNewFile(final long timestamp) { + + try { + + filesCounter++; + + closeCurrentFiles(); + + final Path logFileName = resolveJournalPath(filesCounter, baseSnapshotId); + final Path indexFileName = resolveIndexPath(filesCounter, baseSnapshotId); + log.debug("Starting new raft log file: {} index file: {}", logFileName, indexFileName); + + if (Files.exists(logFileName)) { + throw new IllegalStateException("File already exists: " + logFileName); + } + + if (Files.exists(indexFileName)) { + throw new IllegalStateException("File already exists: " + indexFileName); + } + + raf = new RandomAccessFile(logFileName.toString(), "rwd"); + 
writeChannel = raf.getChannel(); + readChannel = raf.getChannel(); + + indexRaf = new RandomAccessFile(indexFileName.toString(), "rwd"); + indexWriteChannel = raf.getChannel(); + + + registerNextJournal(baseSnapshotId, timestamp); // TODO fix time + + + } catch (IOException ex) { + throw new RuntimeException(ex); + } + } + + + /** + * call only from journal thread + */ + private void registerNextJournal(long seq, long timestamp) { + + lastJournalDescriptor = new JournalDescriptor(timestamp, seq, lastSnapshotDescriptor, lastJournalDescriptor); + } + + + private Path resolveJournalPath(int partitionId, long snapshotId) { + return folder.resolve(String.format("%s_log_%d_%04X.ecrl", exchangeId, snapshotId, partitionId)); + } + + private Path resolveIndexPath(int partitionId, long snapshotId) { + return folder.resolve(String.format("%s_idx_%d_%04X.ridx", exchangeId, snapshotId, partitionId)); + } + + private void flushBufferSync(final boolean forceStartNextFile, + final long timestampNs) { + + try { + +// log.debug("Flushing buffer position={}", buffer.position()); + + // uncompressed write for single messages or small batches + writtenBytes += journalWriteBuffer.position(); + journalWriteBuffer.flip(); + writeChannel.write(journalWriteBuffer); + journalWriteBuffer.clear(); + + // + if (writtenBytes > lastIndexWrittenAt + indexRecordEveryNBytes) { + + log.debug("Adding index record:{}->{}", lastIndex, writtenBytes); + + currentIndex.put(lastIndex, writtenBytes); + + indexWriteBuffer.putLong(lastIndex); + indexWriteBuffer.putLong(writtenBytes); + indexWriteBuffer.flip(); + indexWriteChannel.write(indexWriteBuffer); + indexWriteBuffer.clear(); + + } + + if (forceStartNextFile || writtenBytes >= journalFileMaxSize) { + +// log.info("RAW {}", LatencyTools.createLatencyReportFast(hdrRecorderRaw.getIntervalHistogram())); +// log.info("LZ4-compression {}", LatencyTools.createLatencyReportFast(hdrRecorderLz4.getIntervalHistogram())); + + // todo start preparing new file 
asynchronously, but ONLY ONCE + startNewFile(timestampNs); + writtenBytes = 0; + } + } catch (IOException ex) { + throw new RuntimeException(ex); + } + } + + + private List> readData(final long baseSnapshotId, + final long indexFrom, + final int limit) throws IOException { + + + final List> entries = new ArrayList<>(); + + final MutableLong currentIndex = new MutableLong(0L); + + int partitionCounter = 1; + + while (true) { + + final Path path = resolveJournalPath(partitionCounter, baseSnapshotId); + + // TODO Use index + + log.debug("Reading RAFT log file: {}", path.toFile()); + + try (final FileInputStream fis = new FileInputStream(path.toFile()); + final BufferedInputStream bis = new BufferedInputStream(fis); + final DataInputStream dis = new DataInputStream(bis)) { + + final boolean done = readCommands(dis, entries, currentIndex, indexFrom, limit - entries.size()); + if (done) { + return entries; + } + + + partitionCounter++; + log.debug("EOF reached, reading next partition {}...", partitionCounter); + + } catch (FileNotFoundException ex) { + log.debug("FileNotFoundException: currentIndex={}, {}", currentIndex, ex.getMessage()); + throw ex; + + } catch (EOFException ex) { + // partitionCounter++; + log.debug("File end reached through exception, currentIndex={} !!!", currentIndex); + throw ex; + } + } + + } + + + private boolean readCommands(final DataInputStream dis, + final List> collector, + final MutableLong indexCounter, + final long indexFrom, + final int limit) throws IOException { + + while (dis.available() != 0) { + + final long idx = indexCounter.incrementAndGet(); + + final RaftLogEntry logEntry = RaftLogEntry.create(dis, rsmRequestFactory); + + if (idx >= indexFrom) { + log.debug("Adding record into collection idx={} {}", idx, logEntry); + collector.add(logEntry); + } + + if (collector.size() == limit) { + return true; + } + } + + return false; + } + + + @Override + public void close() throws IOException { + closeCurrentFiles(); + } + + private void 
closeCurrentFiles() throws IOException { + if (writeChannel != null) { + writeChannel.close(); + readChannel.close(); + raf.close(); + } + + if (indexWriteChannel != null) { + indexWriteChannel.close(); + indexRaf.close(); + } + } +} diff --git a/src/main/java/exchange/core2/revelator/raft/RaftLogRepository.java b/src/main/java/exchange/core2/revelator/raft/repository/RaftMemLogRepository.java similarity index 72% rename from src/main/java/exchange/core2/revelator/raft/RaftLogRepository.java rename to src/main/java/exchange/core2/revelator/raft/repository/RaftMemLogRepository.java index 50dfa5b..5ad13a8 100644 --- a/src/main/java/exchange/core2/revelator/raft/RaftLogRepository.java +++ b/src/main/java/exchange/core2/revelator/raft/repository/RaftMemLogRepository.java @@ -1,4 +1,4 @@ -package exchange.core2.revelator.raft; +package exchange.core2.revelator.raft.repository; import exchange.core2.revelator.raft.messages.RaftLogEntry; import exchange.core2.revelator.raft.messages.RsmRequest; @@ -7,33 +7,20 @@ import java.util.ArrayList; import java.util.List; -import java.util.Optional; -public class RaftLogRepository { +public class RaftMemLogRepository implements IRaftLogRepository { - - private static final Logger log = LoggerFactory.getLogger(RaftLogRepository.class); + private static final Logger log = LoggerFactory.getLogger(RaftMemLogRepository.class); private final List> logEntries = new ArrayList<>(); // TODO change to persistent storage with long-index - public RaftLogEntry getEntry(long index) { - return logEntries.get((int) index - 1); - } - - public Optional> getEntryOpt(long index) { - if (index < 1 || index > logEntries.size()) { - return Optional.empty(); - } - - return Optional.of(logEntries.get((int) index - 1)); - } - - public long lastEntryInTerm(long indexAfter, long indexBeforeIncl, int term) { + @Override + public long findLastEntryInTerm(long indexAfter, long indexBeforeIncl, int term) { int idx = (int) indexAfter; for (int i = (int) indexAfter 
+ 1; i <= indexBeforeIncl; i++) { - log.debug("i={}", i); + //log.debug("i={}", i); if (logEntries.get(i - 1).term() == term) { idx = i; } @@ -41,11 +28,12 @@ public long lastEntryInTerm(long indexAfter, long indexBeforeIncl, int term) { return idx; } - + @Override public long getLastLogIndex() { return logEntries.size(); // 0 = no records } + @Override public int getLastLogTerm() { if (logEntries.isEmpty()) { return 0; // return term 0 by default @@ -54,9 +42,10 @@ public int getLastLogTerm() { } } - public long append(final RaftLogEntry logEntry) { + @Override + public long appendEntry(final RaftLogEntry logEntry, final boolean endOfBatch) { logEntries.add(logEntry); - return logEntries.size(); // starting from index=1 + return getLastLogIndex(); // starting from index=1 } @@ -73,6 +62,7 @@ public long append(final RaftLogEntry logEntry) { // TODO unittest + @Override public void appendOrOverride(final List> newEntries, long prevLogIndex) { log.debug("appendOrOverride(newEntries={} , prevLogIndex={}", newEntries, prevLogIndex); @@ -106,13 +96,29 @@ public void appendOrOverride(final List> newEntries, long prevLo } // 1 - public List> getEntriesStartingFrom(long nextIndex) { - if (getLastLogIndex() < nextIndex) { + @Override + public List> getEntries(long indexFrom, int limit) { + + indexFrom = Math.max(indexFrom, 1L); + + if (getLastLogIndex() < indexFrom) { return List.of(); } - log.debug("getEntriesStartingFrom({}): logEntries: {}", nextIndex, logEntries); + log.debug("getEntriesStartingFrom({}): logEntries: {}", indexFrom, logEntries); + + final long indexTo = indexFrom + limit; + + final int indexToMin = (int) Math.min(indexTo, logEntries.size()); + + log.debug("indexTo={} indexToMin={}", indexTo, indexToMin); + + final List> sublistView = logEntries.subList((int) indexFrom - 1, indexToMin); + return new ArrayList<>(sublistView); + } - return new ArrayList<>(logEntries.subList((int) nextIndex - 1, logEntries.size())); + @Override + public void close() { + // 
do nothing } } diff --git a/src/main/java/exchange/core2/revelator/raft/repository/SnapshotDescriptor.java b/src/main/java/exchange/core2/revelator/raft/repository/SnapshotDescriptor.java new file mode 100644 index 0000000..819ab41 --- /dev/null +++ b/src/main/java/exchange/core2/revelator/raft/repository/SnapshotDescriptor.java @@ -0,0 +1,58 @@ +package exchange.core2.revelator.raft.repository; + +import org.jetbrains.annotations.NotNull; + +import java.util.NavigableMap; +import java.util.TreeMap; + +public class SnapshotDescriptor implements Comparable{ + + + private final long snapshotId; // 0 means empty snapshot (clean start) + + // sequence when snapshot was made + private final long seq; + private final long timestampNs; + + // next and previous snapshots + private final SnapshotDescriptor prev; + private SnapshotDescriptor next = null; // TODO can be a list + + private final int numMatchingEngines; + private final int numRiskEngines; + + // all journals based on this snapshot + // mapping: startingSeq -> JournalDescriptor + private final NavigableMap journals = new TreeMap<>(); + + + public SnapshotDescriptor(long snapshotId, long seq, long timestampNs, SnapshotDescriptor prev, int numMatchingEngines, int numRiskEngines) { + this.snapshotId = snapshotId; + this.seq = seq; + this.timestampNs = timestampNs; + this.prev = prev; + this.numMatchingEngines = numMatchingEngines; + this.numRiskEngines = numRiskEngines; + } + + /** + * Create initial empty snapshot descriptor + * + * @param initialNumME - number of matching engine instances + * @param initialNumRE - number of risk engine instances + * @return new instance + */ + public static SnapshotDescriptor createEmpty(int initialNumME, int initialNumRE) { + return new SnapshotDescriptor(0, 0, 0, null, initialNumME, initialNumRE); + } + + public SnapshotDescriptor createNext(long snapshotId, long seq, long timestampNs) { + return new SnapshotDescriptor(snapshotId, seq, timestampNs, this, numMatchingEngines, 
numRiskEngines); + } + + @Override + public int compareTo(@NotNull SnapshotDescriptor o) { + return Long.compare(this.seq, o.seq); + } + +} From e0d6b3d1967b1070361e523c830b244d1cf4cd88 Mon Sep 17 00:00:00 2001 From: Maksim Zheravin Date: Mon, 7 Feb 2022 23:44:28 +0200 Subject: [PATCH 14/15] RAFT: Fix Node logic and mem-repository --- src/main/java/exchange/core2/revelator/raft/RaftNode.java | 2 +- .../core2/revelator/raft/repository/RaftMemLogRepository.java | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/main/java/exchange/core2/revelator/raft/RaftNode.java b/src/main/java/exchange/core2/revelator/raft/RaftNode.java index 881bc87..1fbfd71 100644 --- a/src/main/java/exchange/core2/revelator/raft/RaftNode.java +++ b/src/main/java/exchange/core2/revelator/raft/RaftNode.java @@ -511,7 +511,7 @@ private void applyPendingEntriesToStateMachine() { */ // TODO request range (up to commitIndex) - final List> entries = logRepository.getEntries(lastApplied, Integer.MAX_VALUE); + final List> entries = logRepository.getEntries(lastApplied + 1, Integer.MAX_VALUE); int idx = 0; while (lastApplied < commitIndex) { diff --git a/src/main/java/exchange/core2/revelator/raft/repository/RaftMemLogRepository.java b/src/main/java/exchange/core2/revelator/raft/repository/RaftMemLogRepository.java index 5ad13a8..97c2878 100644 --- a/src/main/java/exchange/core2/revelator/raft/repository/RaftMemLogRepository.java +++ b/src/main/java/exchange/core2/revelator/raft/repository/RaftMemLogRepository.java @@ -99,7 +99,9 @@ public void appendOrOverride(final List> newEntries, long prevLo @Override public List> getEntries(long indexFrom, int limit) { - indexFrom = Math.max(indexFrom, 1L); + if (indexFrom == 0L && limit == 1) { + return List.of(); + } if (getLastLogIndex() < indexFrom) { return List.of(); From 11d1fc5f15ef5953d33afdadd28a16d644b01b51 Mon Sep 17 00:00:00 2001 From: Maksim Zheravin Date: Fri, 20 May 2022 00:28:43 +0300 Subject: [PATCH 15/15] RAFT: raft disk 
journal fixes --- .../core2/revelator/raft/RaftNode.java | 4 +++- .../core2/revelator/raft/demo/CustomNode.java | 7 ++++++- .../repository/RaftDiskLogRepository.java | 20 +++++++++++++++++-- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/src/main/java/exchange/core2/revelator/raft/RaftNode.java b/src/main/java/exchange/core2/revelator/raft/RaftNode.java index 1fbfd71..9a4d10a 100644 --- a/src/main/java/exchange/core2/revelator/raft/RaftNode.java +++ b/src/main/java/exchange/core2/revelator/raft/RaftNode.java @@ -34,7 +34,7 @@ public class RaftNode { private int votedFor = -1; // log entries; each entry contains command for state machine, and term when entry was received by leader (first index is 1) - private final IRaftLogRepository logRepository = new RaftMemLogRepository<>(); + private final IRaftLogRepository logRepository; /* **** Volatile state on all servers: */ @@ -87,6 +87,7 @@ public class RaftNode { public RaftNode(int thisNodeId, + IRaftLogRepository logRepository, ReplicatedStateMachine rsm, RsmRequestFactory msgFactory, RsmResponseFactory respFactory) { @@ -97,6 +98,7 @@ public RaftNode(int thisNodeId, 1, "localhost:3779", 2, "localhost:3780"); + this.logRepository = logRepository; this.currentNodeId = thisNodeId; this.rsm = rsm; this.otherNodes = remoteNodes.keySet().stream().mapToInt(x -> x).filter(nodeId -> nodeId != thisNodeId).toArray(); diff --git a/src/main/java/exchange/core2/revelator/raft/demo/CustomNode.java b/src/main/java/exchange/core2/revelator/raft/demo/CustomNode.java index 0ad6f4d..52dbfcc 100644 --- a/src/main/java/exchange/core2/revelator/raft/demo/CustomNode.java +++ b/src/main/java/exchange/core2/revelator/raft/demo/CustomNode.java @@ -1,6 +1,8 @@ package exchange.core2.revelator.raft.demo; import exchange.core2.revelator.raft.RaftNode; +import exchange.core2.revelator.raft.repository.IRaftLogRepository; +import exchange.core2.revelator.raft.repository.RaftMemLogRepository; public class CustomNode { @@ -10,7 +12,10 
@@ public static void main(String[] args) { final CustomRsm customRsm = new CustomRsm(); - new RaftNode<>(thisNodeId, customRsm, customRsm, customRsm); + //final RaftDiskLogRepository repository = new RaftDiskLogRepository<>(customRsm, thisNodeId); + final IRaftLogRepository repository = new RaftMemLogRepository<>(); + + new RaftNode<>(thisNodeId, repository, customRsm, customRsm, customRsm); } } diff --git a/src/main/java/exchange/core2/revelator/raft/repository/RaftDiskLogRepository.java b/src/main/java/exchange/core2/revelator/raft/repository/RaftDiskLogRepository.java index 0a6de69..f684921 100644 --- a/src/main/java/exchange/core2/revelator/raft/repository/RaftDiskLogRepository.java +++ b/src/main/java/exchange/core2/revelator/raft/repository/RaftDiskLogRepository.java @@ -28,7 +28,9 @@ public class RaftDiskLogRepository implements IRaftLogRepo private final String exchangeId = "EC2R-TEST"; - private final Path folder = Path.of("./raftlogs"); + private final Path folder; + + private final int nodeId; private RandomAccessFile raf; private FileChannel writeChannel; @@ -62,8 +64,17 @@ public class RaftDiskLogRepository implements IRaftLogRepo private final RsmRequestFactory rsmRequestFactory; - public RaftDiskLogRepository(RsmRequestFactory rsmRequestFactory) { + public RaftDiskLogRepository(RsmRequestFactory rsmRequestFactory, int nodeId) { this.rsmRequestFactory = rsmRequestFactory; + this.nodeId = nodeId; + + this.folder = Path.of("./raftlogs/node" + nodeId); + + final long timestamp = System.currentTimeMillis(); + + baseSnapshotId = timestamp; + + startNewFile(timestamp); } @@ -255,6 +266,10 @@ private LongLongPair findStartingIndexPoint(long indexFrom) { @Override public List> getEntries(long indexFrom, int limit) { + if (indexFrom == 0L && limit == 1) { + return List.of(); + } + if (indexFrom > lastIndex) { return List.of(); } @@ -266,6 +281,7 @@ public List> getEntries(long indexFrom, int limit) { try { log.debug("Reading {} - floor idx:{} offset:{}", 
indexFrom, floorIndex, startOffset); readChannel.position(startOffset); + log.debug("Position ok"); } catch (IOException ex) { throw new RuntimeException("can not read log at offset " + startOffset, ex); }