fix issues when propose_value by client #10

Open
wants to merge 3 commits into base: master
67 changes: 64 additions & 3 deletions master_strategy.py
@@ -13,6 +13,31 @@


class DedicatedMasterStrategyMixin (object):
"""
:seealso: https://understandingpaxos.wordpress.com/

## Chain-Based Management of Master Leases

Using the chain to manage leases is accomplished by adding a layer of indirection to the application-level values
generated by the chain. Rather than a one-to-one correspondence between link values and application-level values,
each value in the chain may instead be a pair of elements in which one of the two elements is always null.
The pair may contain either the next application-level value or a new multi-paxos configuration,
but never both. The most recent configuration defines the master peer and the duration of time for which that peer
is guaranteed to be the master. The master may suggest configuration changes at any time, and must do so prior to
the expiry of each lease in order to continually maintain its master status. In contrast, all other peers must
wait for the lease to expire before suggesting changes.

## Mitigating the Single Point of Failure

Taking a step back from the details, there is a significant drawback to master peers: they introduce
a single point of failure into an otherwise robust model that can tolerate multiple failures.
The problems associated with this single point of failure cannot be entirely eliminated, but they can be mitigated
by allowing a new master to be elected when the current one fails.
The drawback is that read and write operations are completely blocked until the failure of the master peer is
detected, the subsequent reelection process completes, and clients are informed of the new master’s identity.
In most properly tuned implementations, reelections will be rare occurrences, so the performance benefits of
this approach will usually offset the occasional hiccups in availability.
"""

lease_window = 10.0 # seconds
lease_start = 0.0
Expand All @@ -25,6 +50,17 @@ class DedicatedMasterStrategyMixin (object):


def start_master_lease_timer(self):
"""
Handling Master Peer Failures

When the master lease expires, the peers assume that the master has failed and they work together to elect a
new one. With chain-based lease management, this is easily accomplished by each peer attempting to update the
multi-paxos configuration with itself set as the new master. Regardless of the implementation approach, there
should be a quick turnaround on the detection and re-election processes, since all read and write
operations must be suspended until the new master is established and clients are informed of its new identity.
This makes master peer failures fairly expensive operations so graceful transitions where the master
preemptively hands off its master status to another peer should be used whenever possible.
"""
self.lease_start = time.time()

if self.lease_expiry is not None and self.lease_expiry.active():
Expand All @@ -34,11 +70,25 @@ def start_master_lease_timer(self):


def update_lease(self, master_uid):
"""
Timing Considerations for Master Leases

When attempting to first gain a master lease and also when renewing it, the lease timer is started before the
first message to establish the lease is sent. Because there is at least some minimal delay required in achieving
consensus on the new lease, the master’s local lease timer is guaranteed to expire before the rest of the peers,
thus preventing any window of opportunity in which one of the other peers may gain master status while the
current master believes it still holds the lease. Faulty clock hardware on a peer could cause it to
think that a lease had elapsed early but this problem is mitigated by the requirement that all peers drop
Permission Request and Suggestion messages originating from all non-master peers while they believe the lease
is held. As long as a majority of peers do not have similarly faulty hardware, the master’s status is protected.
"""
self.master_uid = master_uid

# Reset the master lease timer on competitors, since the master triggers renewals before the competitors' timers expire.
if self.network_uid != master_uid:
self.start_master_lease_timer()

# Renew owned master lease
if master_uid == self.network_uid:
renew_delay = (self.lease_start + self.lease_window - 1) - time.time()

@@ -67,6 +117,7 @@ def propose_update(self, new_value, application_level=True):
else:
print 'IGNORING CLIENT REQUEST. Current master is: ', self.master_uid
else:
# request master lease
if (self.master_uid is None or self.master_uid == self.network_uid) and not self.master_attempt:
self.master_attempt = True
self.start_master_lease_timer()
@@ -78,6 +129,8 @@ def load_state(self):

if self._initial_load:
self._initial_load = False

# Try to acquire the master lease
self.update_lease(None)


@@ -89,9 +142,16 @@ def drive_to_resolution(self):
self.stop_driving()

if self.paxos.proposal_id.number == 1:
# Resolution in a single round trip
self.send_accept(self.paxos.proposal_id, self.paxos.proposed_value)
else:
self.paxos.prepare()
# Resolution in two round trips
self.prepare()

# ExponentialBackoffResolutionStrategyMixin.send_accept may have set retransmit_task,
# which would keep resending the Accept message, so stop it before replacing it
if self.retransmit_task is not None:
self.retransmit_task.stop()

self.retransmit_task = task.LoopingCall( lambda : self.send_prepare(self.paxos.proposal_id) )
self.retransmit_task.start( self.retransmit_interval/1000.0, now=False )
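The stop-before-replace pattern above matters because a Twisted `LoopingCall` may not be started while it is already running. The stub below is a hedged stand-in for `twisted.internet.task.LoopingCall`, written so the example runs without Twisted installed; it mimics only the start/stop semantics relevant here.

```python
# Minimal stand-in for twisted.internet.task.LoopingCall: starting an
# already-running task is an error, so a stale retransmit task must be
# stopped before a replacement is started.
class LoopingCallStub(object):
    def __init__(self, fn):
        self.fn = fn
        self.running = False

    def start(self, interval, now=True):
        assert not self.running, "already running"
        self.running = True

    def stop(self):
        self.running = False

old = LoopingCallStub(lambda: None)
old.start(0.1)
old.stop()                 # stop the stale Accept-retransmit task first
new = LoopingCallStub(lambda: None)
new.start(0.1, now=False)  # then start the Prepare-retransmit replacement
assert not old.running and new.running
```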
@@ -121,6 +181,7 @@ def advance_instance(self, new_instance_number, new_current_value, catchup=False

super(DedicatedMasterStrategyMixin,self).advance_instance(new_instance_number, new_current_value)

# Bypass the first phase of Paxos to prepare for resolution in a single round trip
if self.master_uid:

master_pid = ProposalID(1,self.master_uid)
@@ -138,14 +199,14 @@
def receive_prepare(self, from_uid, instance_number, proposal_id):

if self.master_uid and from_uid != self.master_uid:
return # Drop non-master requests
return # Drop non-master requests, enforcing the master-lease restrictions on message handling

super(DedicatedMasterStrategyMixin,self).receive_prepare(from_uid, instance_number, proposal_id)


def receive_accept(self, from_uid, instance_number, proposal_id, proposal_value):

if self.master_uid and from_uid != self.master_uid:
return # Drop non-master requests
return # Drop non-master requests, enforcing the master-lease restrictions on message handling

super(DedicatedMasterStrategyMixin,self).receive_accept(from_uid, instance_number, proposal_id, proposal_value)
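The drop rule applied in both `receive_prepare` and `receive_accept` reduces to one predicate, sketched here with illustrative peer names (not from this PR):

```python
# While a master lease is believed held, Prepare and Accept messages from any
# non-master peer are ignored. This protects the master's status even if a
# minority of peers have faulty clocks and think the lease expired early.
def should_drop(master_uid, from_uid):
    return master_uid is not None and from_uid != master_uid

assert should_drop('peer-A', 'peer-B') is True   # non-master: dropped
assert should_drop('peer-A', 'peer-A') is False  # master: handled
assert should_drop(None, 'peer-B') is False      # no lease held: handled
```

As long as a majority of peers apply this filter correctly, no competing proposal can reach quorum during the lease.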
6 changes: 3 additions & 3 deletions sync_strategy.py
@@ -10,17 +10,17 @@
class SimpleSynchronizationStrategyMixin (object):

sync_delay = 10.0

def set_messenger(self, messenger):
super(SimpleSynchronizationStrategyMixin,self).set_messenger(messenger)

def sync():
self.messenger.send_sync_request(random.choice(self.peers), self.instance_number)

self.sync_task = task.LoopingCall(sync)
self.sync_task.start(self.sync_delay)


def receive_sync_request(self, from_uid, instance_number):
if instance_number < self.instance_number:
self.messenger.send_catchup(from_uid, self.instance_number, self.current_value)