diff --git a/formal_verification/README.md b/formal_verification/README.md
new file mode 100644
index 0000000..420233b
--- /dev/null
+++ b/formal_verification/README.md
@@ -0,0 +1,21 @@
+# Formal Verification
+
+To ensure that Weave's synchronization data structures are free of
+concurrency bugs, deadlocks, and livelocks, they are formally verified via model checking.
+
+The event notifier, which parks idle threads and wakes them up when tasks arrive,
+has been formally verified via TLA+ (Temporal Logic of Actions).
+
+TLA+ is an industrial-strength formal specification language and model checker that can also plug into a proof assistant to prove properties of code.
+It is used at Microsoft and Amazon to validate that distributed protocols are bug-free, and at Intel to ensure that the memory subsystem of a CPU is free of cache-coherency bugs.
+
+Link: https://lamport.azurewebsites.net/tla/tla.html
+
+
+Weave's Multi-Producer Single-Consumer queue has been reimplemented in C++ and run through CDSChecker, a model-checking tool for C++11 atomics.
+
+It exhaustively checks all possible thread interleavings to ensure that no path leads to a bug.
+
+Note: because CDSChecker runs out of "snapshotting memory" (used to roll back to a previous program state) when using dlmalloc `mspace` functions, the checks are not complete.
+
+Link: http://plrg.ics.uci.edu/software_page/42-2/
diff --git a/formal_verification/mpsc_batch.cpp b/formal_verification/mpsc_batch.cpp
new file mode 100644
index 0000000..59dd75b
--- /dev/null
+++ b/formal_verification/mpsc_batch.cpp
@@ -0,0 +1,408 @@
+// Weave
+// Copyright (c) 2019 Mamy André-Ratsimbazafy
+// Licensed and distributed under either of
+//   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+//   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+// at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+#include <atomic>
+#include <cstdio>
+#include <cstdlib>
+#include <threads.h>
+
+#if 0
+// Normal C++
+#include <cassert>
+#define MODEL_ASSERT(...) assert(__VA_ARGS__)
+#define thrd_join(thr) thrd_join(thr, nullptr)
+#define user_main(...) main(__VA_ARGS__)
+#else
+// CDSChecker
+#include <model-assert.h>
+#endif
+
+static const int Padding = 64;
+
+#define LOG(...) {printf(__VA_ARGS__); fflush(stdout);}
+
+template <typename T>
+struct Enqueueable {
+  std::atomic<Enqueueable*> next;
+  T payload;
+};
+
+template <typename T>
+class ChannelMpscUnboundedBatch
+{
+  private:
+    alignas(Padding) std::atomic<Enqueueable<T>*> m_back;
+    std::atomic<int> m_count;
+    alignas(Padding) Enqueueable<T> m_front;
+
+  public:
+    void initialize(){
+      // Ensure no false positive
+      m_front.next.store(nullptr, std::memory_order_relaxed);
+      m_back.store(&m_front, std::memory_order_relaxed);
+      m_count.store(0, std::memory_order_relaxed);
+    }
+
+    bool trySend(Enqueueable<T>* src){
+      // Send an item to the back of the channel
+      // As the channel has unbounded capacity, this should never fail
+
+      m_count.fetch_add(1, std::memory_order_relaxed);
+      src->next.store(nullptr, std::memory_order_release);
+      auto oldBack = m_back.exchange(src, std::memory_order_acq_rel);
+      // Consumer can be blocked here, it doesn't see the (potentially growing)
+      // end of the queue until the next instruction.
+      oldBack->next.store(src, std::memory_order_release);
+
+      return true;
+    }
+
+    bool tryRecv(Enqueueable<T>** dst){
+      // Try receiving the next item buffered in the channel
+      // Returns true if successful (channel was not empty)
+      // This can fail spuriously on the last element if producer
+      // enqueues a new element while the consumer was dequeueing it
+
+      auto first = m_front.next.load(std::memory_order_acquire);
+      if (first == nullptr) {
+        // Apparently may read from uninitialized load here
+        // std::atomic_thread_fence(std::memory_order_acquire); // sync first.next.load(moRelaxed)
+        return false;
+      }
+
+      // Fast path
+      {
+        auto next = first->next.load(std::memory_order_acquire);
+        if (next != nullptr) {
+          // not competing with producers
+          __builtin_prefetch(first);
+          m_count.fetch_sub(1, std::memory_order_relaxed);
+          m_front.next.store(next, std::memory_order_relaxed);
+          *dst = first;
+          // std::atomic_thread_fence(std::memory_order_acquire); // sync first.next.load(moRelaxed)
+
+          MODEL_ASSERT(m_count.load(std::memory_order_relaxed) >= 0);
+          return true;
+        }
+      }
+      // end fast-path
+
+      // Competing with producers at the back
+      auto last = m_back.load(std::memory_order_acquire);
+      if (first != last) {
+        // We lose the competition before even trying
+        // std::atomic_thread_fence(std::memory_order_acquire); // sync first.next.load(moRelaxed)
+        return false;
+      }
+
+      m_front.next.store(nullptr, std::memory_order_acquire);
+      if (m_back.compare_exchange_strong(last, &m_front, std::memory_order_acq_rel)) {
+        // We won and replaced the last node with the channel front
+        __builtin_prefetch(first);
+        m_count.fetch_sub(1, std::memory_order_relaxed);
+        *dst = first;
+
+        MODEL_ASSERT(m_count.load(std::memory_order_relaxed) >= 0);
+        return true;
+      }
+
+      // We lost but now we know that there is an extra node coming very soon
+      auto next = first->next.load(std::memory_order_acquire);
+      while (next == nullptr) {
+        // spinlock
+        thrd_yield();
+        next = first->next.load(std::memory_order_acquire);
+      }
+
+      __builtin_prefetch(first);
+      m_count.fetch_sub(1, std::memory_order_relaxed);
+      m_front.next.store(next, std::memory_order_relaxed); // We are the only reader of next, no sync needed
+      // std::atomic_thread_fence(std::memory_order_acquire); // sync first.next.load(moRelaxed)
+      *dst = first;
+
+      MODEL_ASSERT(m_count.load(std::memory_order_relaxed) >= 0);
+      return true;
+    }
+
+    // Alternative impl - seems to lead to livelock in the runtime
+    //
+    // // We lost but now we know that there is an extra node coming very soon
+    // auto next = first->next.load(std::memory_order_relaxed);
+    // if (next != nullptr) {
+    //   // Extra node after this one, no competition with producers
+    //   __builtin_prefetch(first);
+    //   m_count.fetch_sub(1, std::memory_order_relaxed);
+    //   m_front.next.store(next, std::memory_order_relaxed);
+    //   atomic_thread_fence(std::memory_order_acquire);
+    //   *dst = first;
+    //   return true;
+    // }
+
+    // // The last item wasn't linked to the list yet, bail out
+    // return false;
+    // }
+
+    bool trySendBatch(Enqueueable<T>* first, Enqueueable<T>* last, int count){
+      // Send a list of items to the back of the channel
+      // They should be linked together by their next field
+      // As the channel has unbounded capacity this should never fail
+      m_count.fetch_add(count, std::memory_order_relaxed);
+      last->next.store(nullptr, std::memory_order_release);
+
+      auto oldBack = m_back.exchange(last, std::memory_order_acq_rel);
+      // Consumer can be blocked here, it doesn't see the (potentially growing)
+      // end of the queue until the next instruction.
+      oldBack->next.store(first, std::memory_order_release);
+
+      MODEL_ASSERT(m_count.load(std::memory_order_relaxed) >= 0);
+      return true;
+    }
+
+    int tryRecvBatch(Enqueueable<T>** bFirst, Enqueueable<T>** bLast){
+      // Try receiving all items buffered in the channel
+      // Returns true if at least some items are dequeued.
+      // There might be competition with producers for the last item
+      //
+      // Items are returned as a linked list
+      // Returns the number of items received
+      //
+      // If no items are returned bFirst and bLast are undefined
+      // and should not be used.
+      //
+      // ⚠️ This leaks the next field of the last item:
+      //    nil it or overwrite it before further use in linked lists
+      int result = 0;
+
+      auto front = m_front.next.load(std::memory_order_acquire);
+      *bFirst = front;
+      if (front == nullptr) {
+        return 0;
+      }
+
+      // Fast forward to the end of the channel
+      {
+        auto next = front->next.load(std::memory_order_acquire);
+        while (next != nullptr) {
+          result += 1;
+          *bLast = front;
+          front = next;
+          next = next->next.load(std::memory_order_acquire);
+        }
+      }
+
+      // Competing with producers at the back
+      auto last = m_back.load(std::memory_order_acquire);
+      if (front != last){
+        // We lose the competition, bail out
+        m_front.next.store(front, std::memory_order_release);
+        m_count.fetch_sub(result, std::memory_order_relaxed);
+
+        MODEL_ASSERT(m_count.load(std::memory_order_relaxed) >= 0);
+        return result;
+      }
+
+      // front == last
+      m_front.next.store(nullptr, std::memory_order_relaxed);
+      if (m_back.compare_exchange_strong(last, &m_front, std::memory_order_acq_rel)) {
+        // We won and replaced the last node with the channel front
+        __builtin_prefetch(front);
+        result += 1;
+        m_count.fetch_sub(result, std::memory_order_acq_rel);
+        *bLast = front;
+
+        MODEL_ASSERT(m_count.load(std::memory_order_relaxed) >= 0);
+        return result;
+      }
+
+      // We lost but now we know that there is an extra node coming very soon
+      auto next = front->next.load(std::memory_order_acquire);
+      while (next == nullptr) {
+        // spinlock; livelock issue at a higher level if the consumer never yields
+        thrd_yield();
+        next = front->next.load(std::memory_order_acquire);
+      }
+
+      __builtin_prefetch(front);
+      result += 1;
+      m_count.fetch_sub(result, std::memory_order_relaxed);
+      m_front.next.store(next, std::memory_order_relaxed);
+      // std::atomic_thread_fence(std::memory_order_acquire); // sync front.next.load(moRelaxed)
+      *bLast = front;
+
+      MODEL_ASSERT(m_count.load(std::memory_order_relaxed) >= 0);
+      return result;
+
+    }
+
+};
+
+// ----------------------------------------------------------------
+// Sanity checks
+
+struct thread_args {int ID; ChannelMpscUnboundedBatch<int>* chan;};
+
+#define sendLoop(chan, src) \
+{ \
+  while (!chan->trySend(src)) ; \
+}
+
+#define recvLoop(chan, dst) \
+{ \
+  while (!chan->tryRecv(dst)) ; \
+}
+
+static const int NumVals = 3;
+static const int Zeroes = 1000;
+static const int NumSenders = 1;
+
+void * thread_func_sender(void* args){
+  struct thread_args* a = static_cast<struct thread_args*>(args);
+  for (int i = 0; i < NumVals; ++i) {
+    Enqueueable<int>* val = static_cast<Enqueueable<int>*>(malloc(sizeof(Enqueueable<int>)));
+    val->payload = a->ID * Zeroes + i;
+    LOG(" 0x%.08x = %d\n", val, val->payload);
+    sendLoop(a->chan, val);
+  }
+  return nullptr;
+}
+
+void * thread_func_receiver(void* args){
+  struct thread_args* a = static_cast<struct thread_args*>(args);
+  int counts[NumSenders+1] = {0};
+  for (int i = 0; i < NumSenders * NumVals; ++i){
+    Enqueueable<int>* val;
+    recvLoop(a->chan, &val);
+
+    auto sender = val->payload / Zeroes;
+
+    LOG("recv: 0x%.08x = %d\n", val, val->payload);
+    MODEL_ASSERT(val->payload == counts[sender] + sender * Zeroes);
+
+    ++counts[sender];
+    free(val);
+  }
+
+  LOG("-----------------------------------\n");
+  for (int sender = 1; sender < NumSenders+1; ++sender){
+    LOG("counts[%d] = %d\n", sender, counts[sender]);
+    MODEL_ASSERT(counts[sender] == NumVals);
+  }
+  return nullptr;
+}
+
+int user_main_single(int argc, char **argv){
+  LOG("Running single receive test\n");
+
+  ChannelMpscUnboundedBatch<int>* chan;
+  chan = static_cast<ChannelMpscUnboundedBatch<int>*>(malloc(sizeof(ChannelMpscUnboundedBatch<int>)));
+  // printf("Size channel %lu\n", sizeof(ChannelMpscUnboundedBatch<int>));
+  chan->initialize();
+
+  thrd_t thr[NumSenders+1];
+  thread_args args[NumSenders+1];
+  for (int i = 0; i < NumSenders+1; ++i){
+    args[i].ID = i;
+    args[i].chan = chan;
+  }
+
+  thrd_create(&thr[0], (thrd_start_t)&thread_func_receiver, &args[0]);
+  for (int i = 1; i < NumSenders+1; ++i){
+    thrd_create(&thr[i], (thrd_start_t)&thread_func_sender, &args[i]);
+  }
+  for (int i = 0; i < NumSenders+1; ++i){
+    thrd_join(thr[i]);
+  }
+
+  free(chan);
+  printf("------------------------------------------------------------------------\n");
+  printf("Success\n");
+
+  return 0;
+}
+
+// ----------------------------------------------------------------
+// Batch
+
+void * thread_func_receiver_batch(void* args){
+  struct thread_args* a = static_cast<struct thread_args*>(args);
+  int counts[NumSenders+1] = {0};
+  int received = 0;
+  int batchID = 0;
+
+  while (received < NumSenders * NumVals){
+    Enqueueable<int>* first;
+    Enqueueable<int>* last;
+    int batchSize = a->chan->tryRecvBatch(&first, &last);
+    batchID += 1;
+    if (batchSize == 0){
+      continue;
+    }
+
+    auto cur = first;
+    int idx = 0;
+    while (idx < batchSize){
+      auto sender = cur->payload / Zeroes;
+      LOG("recv: 0x%.08x = %d\n", cur, cur->payload);
+      MODEL_ASSERT(cur->payload == counts[sender] + sender * Zeroes);
+
+      ++counts[sender];
+      ++received;
+
+      ++idx;
+      if (idx == batchSize){
+        MODEL_ASSERT(cur == last);
+      }
+
+      auto old = cur;
+      cur = cur->next.load(std::memory_order_acquire);
+      free(old);
+      LOG("Receiver processed batch id %d of size %d (received total %d) \n", batchID, batchSize, received);
+    }
+  }
+
+  LOG("-----------------------------------\n");
+  for (int sender = 1; sender < NumSenders+1; ++sender){
+    LOG("counts[%d] = %d\n", sender, counts[sender]);
+    MODEL_ASSERT(counts[sender] == NumVals);
+  }
+  return nullptr;
+}
+
+int user_main_batch(int argc, char **argv){
+  LOG("Running batch receive test\n");
+
+  ChannelMpscUnboundedBatch<int>* chan;
+  chan = static_cast<ChannelMpscUnboundedBatch<int>*>(malloc(sizeof(ChannelMpscUnboundedBatch<int>)));
+  // printf("Size channel %lu\n", sizeof(ChannelMpscUnboundedBatch<int>));
+  chan->initialize();
+
+  thrd_t thr[NumSenders+1];
+  thread_args args[NumSenders+1];
+  for (int i = 0; i < NumSenders+1; ++i){
+    args[i].ID = i;
+    args[i].chan = chan;
+  }
+
+  thrd_create(&thr[0], (thrd_start_t)&thread_func_receiver_batch, &args[0]);
+  for (int i = 1; i < NumSenders+1; ++i){
+    thrd_create(&thr[i], (thrd_start_t)&thread_func_sender, &args[i]);
+  }
+  for (int i = 0; i < NumSenders+1; ++i){
+    thrd_join(thr[i]);
+  }
+
+  free(chan);
+  printf("------------------------------------------------------------------------\n");
+  printf("Success\n");
+
+  return 0;
+}
+
+int user_main(int argc, char **argv){
+  user_main_single(argc, argv);
+  return 0;
+}
diff --git a/thread_collider/greenlet b/thread_collider/greenlet
new file mode 160000
index 0000000..aed081c
--- /dev/null
+++ b/thread_collider/greenlet
@@ -0,0 +1 @@
+Subproject commit aed081c38be407fa94cb3e05028c989722c4611d
diff --git a/weave.nimble b/weave.nimble
index 80a5f87..3f06663 100644
--- a/weave.nimble
+++ b/weave.nimble
@@ -61,7 +61,8 @@ task test, "Run Weave tests":
   test "", "benchmarks/bouncing_producer_consumer/weave_bpc.nim"
   when defined(i386) or defined(amd64):
     if not existsEnv"TEST_LANG" or getEnv"TEST_LANG" != "cpp":
-      test "", "benchmarks/matmul_gemm_blas/gemm_pure_nim/gemm_weave.nim"
+      # TODO: syncRoot doesn't block for Pledges - https://github.com/mratsim/weave/issues/97
+      # test "", "benchmarks/matmul_gemm_blas/gemm_pure_nim/gemm_weave.nim"
       test "", "benchmarks/matmul_gemm_blas/gemm_pure_nim/gemm_weave_nestable.nim"
 
   test "-d:WV_LazyFlowvar", "benchmarks/dfs/weave_dfs.nim"
@@ -74,7 +75,8 @@ task test, "Run Weave tests":
   test "-d:WV_LazyFlowvar", "benchmarks/bouncing_producer_consumer/weave_bpc.nim"
   when defined(i386) or defined(amd64):
     if not existsEnv"TEST_LANG" or getEnv"TEST_LANG" != "cpp":
-      test "-d:WV_LazyFlowvar", "benchmarks/matmul_gemm_blas/gemm_pure_nim/gemm_weave.nim"
+      # TODO: syncRoot doesn't block for Pledges - https://github.com/mratsim/weave/issues/97
+      # test "-d:WV_LazyFlowvar", "benchmarks/matmul_gemm_blas/gemm_pure_nim/gemm_weave.nim"
       test "-d:WV_LazyFlowvar", "benchmarks/matmul_gemm_blas/gemm_pure_nim/gemm_weave_nestable.nim"
 
   # Full test that combine everything:
@@ -121,7 +123,8 @@ task test_gc_arc, "Run Weave tests with --gc:arc":
   test "--gc:arc", "benchmarks/bouncing_producer_consumer/weave_bpc.nim"
   when defined(i386) or defined(amd64):
     if not existsEnv"TEST_LANG" or getEnv"TEST_LANG" != "cpp":
-      test "--gc:arc", "benchmarks/matmul_gemm_blas/gemm_pure_nim/gemm_weave.nim"
+      # TODO: syncRoot doesn't block for Pledges - https://github.com/mratsim/weave/issues/97
+      # test "--gc:arc", "benchmarks/matmul_gemm_blas/gemm_pure_nim/gemm_weave.nim"
       test "--gc:arc", "benchmarks/matmul_gemm_blas/gemm_pure_nim/gemm_weave_nestable.nim"
 
   test "--gc:arc -d:WV_LazyFlowvar", "benchmarks/dfs/weave_dfs.nim"
@@ -134,7 +137,8 @@ task test_gc_arc, "Run Weave tests with --gc:arc":
   test "--gc:arc -d:WV_LazyFlowvar", "benchmarks/bouncing_producer_consumer/weave_bpc.nim"
   when defined(i386) or defined(amd64):
    if not existsEnv"TEST_LANG" or getEnv"TEST_LANG" != "cpp":
-      test "--gc:arc -d:WV_LazyFlowvar", "benchmarks/matmul_gemm_blas/gemm_pure_nim/gemm_weave.nim"
+      # TODO: syncRoot doesn't block for Pledges - https://github.com/mratsim/weave/issues/97
+      # test "--gc:arc -d:WV_LazyFlowvar", "benchmarks/matmul_gemm_blas/gemm_pure_nim/gemm_weave.nim"
       test "--gc:arc -d:WV_LazyFlowvar", "benchmarks/matmul_gemm_blas/gemm_pure_nim/gemm_weave_nestable.nim"
 
   # Full test that combine everything:
diff --git a/weave/cross_thread_com/channels_mpsc_unbounded_batch.nim b/weave/cross_thread_com/channels_mpsc_unbounded_batch.nim
index abe10a3..07603ce 100644
--- a/weave/cross_thread_com/channels_mpsc_unbounded_batch.nim
+++ b/weave/cross_thread_com/channels_mpsc_unbounded_batch.nim
@@ -103,12 +103,11 @@ proc trySend*[T](chan: var ChannelMpscUnboundedBatch[T], src: sink T): bool {.in
   ## As the channel has unbounded capacity, this should never fail
 
   discard chan.count.fetchAdd(1, moRelaxed)
-  src.next.store(nil, moRelaxed)
-  fence(moRelease)
-  let oldBack = chan.back.exchange(src, moRelaxed)
+  src.next.store(nil, moRelease)
+  let oldBack = chan.back.exchange(src, moAcquireRelease)
   # Consumer can be blocked here, it doesn't see the (potentially growing) end of the queue
   # until the next instruction.
-  cast[T](oldBack).next.store(src, moRelaxed) # Workaround generic atomics bug: https://github.com/nim-lang/Nim/issues/12695
+  cast[T](oldBack).next.store(src, moRelease) # Workaround generic atomics bug: https://github.com/nim-lang/Nim/issues/12695
 
   return true
 
@@ -118,12 +117,11 @@ proc trySendBatch*[T](chan: var ChannelMpscUnboundedBatch[T], first, last: sink
   ## As the channel has unbounded capacity this should never fail
 
   discard chan.count.fetchAdd(int(count), moRelaxed)
-  last.next.store(nil, moRelaxed)
-  fence(moRelease)
-  let oldBack = chan.back.exchange(last, moRelaxed)
+  last.next.store(nil, moRelease)
+  let oldBack = chan.back.exchange(last, moAcquireRelease)
   # Consumer can be blocked here, it doesn't see the (potentially growing) end of the queue
   # until the next instruction.
-  cast[T](oldBack).next.store(first, moRelaxed) # Workaround generic atomics bug: https://github.com/nim-lang/Nim/issues/12695
+  cast[T](oldBack).next.store(first, moRelease) # Workaround generic atomics bug: https://github.com/nim-lang/Nim/issues/12695
 
   return true
 
@@ -133,38 +131,43 @@ proc tryRecv*[T](chan: var ChannelMpscUnboundedBatch[T], dst: var T): bool =
   ## This can fail spuriously on the last element if producer
   ## enqueues a new element while the consumer was dequeing it
 
-  let first = cast[T](chan.front.next.load(moRelaxed))
+  let first = cast[T](chan.front.next.load(moAcquire))
   if first.isNil:
+    # According to the model checker, we can't put "fence(moAcquire)"
+    # here and use relaxed semantics for "first" as it may read from an uninitialized load
+    # (due to compiler reordering?)
     return false
 
   # fast path
   block:
-    let next = first.next.load(moRelaxed)
+    let next = first.next.load(moAcquire)
     if not next.isNil:
       # Not competing with producers
       prefetch(first)
       discard chan.count.fetchSub(1, moRelaxed)
       chan.front.next.store(next, moRelaxed)
-      fence(moAcquire)
+      # fence(moAcquire) # Sync "first.next.load(moRelaxed)"
       dst = first
       return true
   # End fast-path
 
   # Competing with producers at the back
-  var last = chan.back.load(moRelaxed)
+  var last = chan.back.load(moAcquire)
   if first != last:
     # We lose the competition before even trying
+    # fence(moAcquire) # Sync "chan.back.load(moRelaxed)"
     return false
 
-  chan.front.next.store(nil, moRelaxed)
+  chan.front.next.store(nil, moAcquire)
   if compareExchange(chan.back, last, chan.front.addr, moAcquireRelease):
     # We won and replaced the last node with the channel front
+    prefetch(first)
     discard chan.count.fetchSub(1, moRelaxed)
     dst = first
     return true
 
   # We lost but now we know that there is an extra node coming very soon
-  var next = first.next.load(moRelaxed)
+  var next = first.next.load(moAcquire)
   while next.isNil:
     # We spinlock, unfortunately there seems to be a livelock potential
     # or contention issue if we don't use cpuRelax
@@ -173,15 +176,31 @@ proc tryRecv*[T](chan: var ChannelMpscUnboundedBatch[T], dst: var T): bool =
     # or fibonacci and the program will get stuck.
     # The queue should probably be model checked and/or run through Relacy
     cpuRelax()
-    next = first.next.load(moRelaxed)
+    next = first.next.load(moAcquire)
 
   prefetch(first)
   discard chan.count.fetchSub(1, moRelaxed)
   chan.front.next.store(next, moRelaxed)
-  fence(moAcquire)
+  # fence(moAcquire) # sync first.next.load(moRelaxed)
   dst = first
   return true
 
+  # # Alternative implementation
+  # #
+  # # We lost but now we know that there is an extra node coming very soon
+  # cpuRelax()
+  # let next = first.next.load(moAcquire)
+  # if not next.isNil:
+  #   # Extra nodes after this one, no more competition with producers
+  #   prefetch(first)
+  #   discard chan.count.fetchSub(1, moRelaxed)
+  #   chan.front.next.store(next, moRelease)
+  #   dst = first
+  #   return true
+
+  # # The last item wasn't linked to the list yet, bail out
+  # return false
+
 proc tryRecvBatch*[T](chan: var ChannelMpscUnboundedBatch[T], bFirst, bLast: var T): int32 =
   ## Try receiving all items buffered in the channel
   ## Returns true if at least some items are dequeued.
@@ -198,24 +217,24 @@ proc tryRecvBatch*[T](chan: var ChannelMpscUnboundedBatch[T], bFirst, bLast: var
 
   result = 0
 
-  var front = cast[T](chan.front.next.load(moRelaxed))
+  var front = cast[T](chan.front.next.load(moAcquire))
   bFirst = front
   if front.isNil:
     return
 
   # Fast-forward to the end of the channel
-  var next = cast[T](front.next.load(moRelaxed))
+  var next = cast[T](front.next.load(moAcquire))
   while not next.isNil:
     result += 1
     bLast = front
     front = next
-    next = cast[T](next.next.load(moRelaxed))
+    next = cast[T](next.next.load(moAcquire))
 
   # Competing with producers at the back
-  var last = chan.back.load(moRelaxed)
+  var last = chan.back.load(moAcquire)
   if front != last:
     # We lose the competition, bail out
-    chan.front.next.store(front, moRelaxed)
+    chan.front.next.store(front, moRelease)
     discard chan.count.fetchSub(result, moRelaxed)
     postCondition: chan.count.load(moRelaxed) >= 0 # TODO: somehow it can be negative
     return
@@ -235,7 +254,7 @@ proc tryRecvBatch*[T](chan: var ChannelMpscUnboundedBatch[T], bFirst, bLast: var
   # We don't spinlock unlike the single receive case
   # we assume that consumer has plenty of work to do with the
   # already retrived batch
-  next = cast[T](front.next.load(moRelaxed))
+  next = cast[T](front.next.load(moAcquire))
   while next.isNil:
     # We spinlock, unfortunately there seems to be a livelock potential
     # or contention issue if we don't use cpuRelax
@@ -244,13 +263,13 @@ proc tryRecvBatch*[T](chan: var ChannelMpscUnboundedBatch[T], bFirst, bLast: var
     # or fibonacci and the program will get stuck.
     # The queue should probably be model checked and/or run through Relacy
     cpuRelax()
-    next = cast[T](front.next.load(moRelaxed))
+    next = cast[T](front.next.load(moAcquire))
 
   prefetch(front)
   result += 1
   discard chan.count.fetchSub(result, moRelaxed)
   chan.front.next.store(next, moRelaxed)
-  fence(moAcquire)
+  # fence(moAcquire) # sync front.next.load(moRelaxed)
   bLast = front
 
   postCondition: chan.count.load(moRelaxed) >= 0
diff --git a/weave/parallel_tasks.nim b/weave/parallel_tasks.nim
index 6ce6374..6c78335 100644
--- a/weave/parallel_tasks.nim
+++ b/weave/parallel_tasks.nim
@@ -203,7 +203,9 @@ macro spawnDelayed*(pledges: varargs[typed], fnCall: typed): untyped =
 
 # --------------------------------------------------------
 
 when isMainModule:
-  import ./runtime, ./state_machines/[sync, sync_root], os, std/[times, monotimes]
+  import
+    ./runtime, ./state_machines/[sync, sync_root], os,
+    std/[times, monotimes]
 
   block: # Async without result
@@ -245,10 +247,29 @@ when isMainModule:
     main2()
 
   block: # isReady
-    proc sleepingLion(ms: int): int =
-      sleep(ms)
-      echo "--> Slept for ", ms, " ms"
-      return ms
+    template dummy_cpt(): untyped =
+      # Dummy computation
+      # Calculate fib(30) iteratively
+      var
+        fib = 0
+        f2 = 0
+        f1 = 1
+      for i in 2 .. 30:
+        fib = f1 + f2
+        f2 = f1
+        f1 = fib
+
+    proc sleepingLion(stop_ms: int64): int64 =
+      echo "Entering the Lion's Den"
+      let start = getMonoTime()
+
+      while true:
+        let elapsed = inMilliseconds(getMonoTime() - start)
+        if elapsed >= stop_ms:
+          echo "Exiting the Lion's Den"
+          return elapsed
+
+        dummy_cpt()
 
     proc main2() =
       echo "Sanity check 3: isReady"
@@ -258,8 +279,10 @@ when isMainModule:
       echo "Spawning sleeping thread for ", target, " ms"
       let start = getMonoTime()
       let f = spawn sleepingLion(123)
+      var spin_count: int64
       while not f.isReady():
-        cpuRelax()
+        loadBalance(Weave) # We need to send the task away, on OSX CI it seems like threads are not initialized fast enough
+        spin_count += 1
       let stopReady = getMonoTime()
       let res = sync(f)
       let stopSync = getMonoTime()
@@ -268,7 +291,7 @@ when isMainModule:
 
       let readyTime = inMilliseconds(stopReady-start)
       let syncTime = inMilliseconds(stopSync-stopReady)
-      echo "Retrieved: ", res, " (isReady: ", readyTime, " ms, sync: ", syncTime, " ms)"
+      echo "Retrieved: ", res, " (isReady: ", readyTime, " ms, sync: ", syncTime, " ms, spin_count: ", spin_count, ")"
       doAssert syncTime <= 1, "sync should be non-blocking"
       # doAssert readyTime in {target-1 .. target+1}, "asking to sleep for " & $target & " ms but slept for " & $readyTime
 
diff --git a/weave/state_machines/sync.dot b/weave/state_machines/sync.dot
index e3d856c..b65e6cb 100644
--- a/weave/state_machines/sync.dot
+++ b/weave/state_machines/sync.dot
@@ -15,7 +15,7 @@ digraph awaitFSA{
   AW_Steal_AWE_FutureReady -> AW_Steal_AWE_ReceivedTask[xlabel="normal flow"];
   AW_Steal_AWE_ReceivedTask -> AW_SuccessfulTheft [style=dashed, xlabel="true"];
   AW_Steal_AWE_ReceivedTask -> AW_Steal [xlabel="default"];
-  AW_SuccessfulTheft -> AW_OutOfDirectChildTasks [xlabel="default"];
+  AW_SuccessfulTheft -> AW_CheckTask [xlabel="default"];
   AW_CheckTask -> AW_CheckTask_AWE_FutureReady[style=bold, xlabel="always"];
   AW_CheckTask_AWE_FutureReady -> AW_Exit [color="coral", fontcolor="coral", xlabel="interrupted"];
   AW_CheckTask_AWE_FutureReady -> AW_CheckTask_AWE_HasChildTask[xlabel="normal flow"];
diff --git a/weave/state_machines/sync.nim b/weave/state_machines/sync.nim
index a5cd3d2..9291027 100644
--- a/weave/state_machines/sync.nim
+++ b/weave/state_machines/sync.nim
@@ -156,7 +156,7 @@ behavior(awaitFSA):
     profile(enq_deq_task):
       # The memory is reused but not zero-ed
       localCtx.taskCache.add(task)
-  fin: AW_OutOfDirectChildTasks
+  fin: AW_CheckTask
 
 # -------------------------------------------
 
diff --git a/weave/state_machines/sync.png b/weave/state_machines/sync.png
index ad4f962..5de4e83 100644
Binary files a/weave/state_machines/sync.png and b/weave/state_machines/sync.png differ
diff --git a/weave/state_machines/sync_scope.dot b/weave/state_machines/sync_scope.dot
index 493d5ea..2e25ac6 100644
--- a/weave/state_machines/sync_scope.dot
+++ b/weave/state_machines/sync_scope.dot
@@ -23,5 +23,5 @@ digraph syncScopeFSA{
   SB_Steal_SBE_NoDescendants -> SB_Steal_SBE_ReceivedTask[xlabel="normal flow"];
   SB_Steal_SBE_ReceivedTask -> SB_SuccessfulTheft [style=dashed, xlabel="true"];
   SB_Steal_SBE_ReceivedTask -> SB_Steal [xlabel="default"];
-  SB_SuccessfulTheft -> SB_OutOfTasks [xlabel="default"];
+  SB_SuccessfulTheft -> SB_CheckTask [xlabel="default"];
 }
\ No newline at end of file
diff --git a/weave/state_machines/sync_scope.nim b/weave/state_machines/sync_scope.nim
index b9e075b..abf19b3 100644
--- a/weave/state_machines/sync_scope.nim
+++ b/weave/state_machines/sync_scope.nim
@@ -163,7 +163,7 @@ behavior(syncScopeFSA):
     profile(enq_deq_task):
       # The memory is re-used but not zero-ed
      localCtx.taskCache.add(task)
-  fin: SB_OutOfTasks
+  fin: SB_CheckTask
 
 # -------------------------------------------
 
diff --git a/weave/state_machines/sync_scope.png b/weave/state_machines/sync_scope.png
index e0c488c..2f8f16f 100644
Binary files a/weave/state_machines/sync_scope.png and b/weave/state_machines/sync_scope.png differ
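
For readers unfamiliar with the queue, the producer-side ordering that `trySend`/`trySendBatch` rely on after this change (a release store to terminate the new node, an acq_rel exchange of the back pointer, then a release store to link the old back node) can be illustrated with a minimal, self-contained C++ sketch. The `Node`/`MpscBack` names and the `main` driver below are illustrative assumptions, not Weave's actual API:

```cpp
// Sketch only: publish ordering of an MPSC intrusive queue producer.
#include <atomic>
#include <cassert>
#include <thread>

struct Node {
  std::atomic<Node*> next{nullptr};
  int payload{0};
};

struct MpscBack {
  std::atomic<Node*> back;
  explicit MpscBack(Node* dummy) : back(dummy) {}

  void push(Node* src) {
    // 1. Terminate the new node; release makes the payload written before
    //    push() visible to a consumer that later acquires `next`.
    src->next.store(nullptr, std::memory_order_release);
    // 2. Publish the new back; acq_rel orders this against other producers.
    Node* oldBack = back.exchange(src, std::memory_order_acq_rel);
    // 3. Link the previous back node. Between steps 2 and 3 the consumer
    //    cannot see the growing tail yet and may spuriously bail out.
    oldBack->next.store(src, std::memory_order_release);
  }
};

int main() {
  Node dummy;
  MpscBack q(&dummy);
  Node n1, n2;
  n1.payload = 1;
  n2.payload = 2;

  std::thread producer([&] { q.push(&n1); q.push(&n2); });
  producer.join();

  // Single-threaded check after join: dummy -> n1 -> n2.
  assert(dummy.next.load(std::memory_order_acquire) == &n1);
  assert(n1.next.load(std::memory_order_acquire) == &n2);
  return 0;
}
```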
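Likewise, the consumer-side race on the last element that `tryRecv` has to handle (detach the head, CAS `back` from that last node to the dummy front, and if the CAS fails spin until the losing producer links `next`) is sketched below under the same illustrative assumptions; this is a simplified single-consumer `tryPop`, not Weave's implementation:

```cpp
// Sketch only: resolving the "last element" race on the consumer side.
#include <atomic>
#include <cstdio>

struct Node {
  std::atomic<Node*> next{nullptr};
  int payload{0};
};

struct Mpsc {
  Node front;                        // dummy node; front.next is the real head
  std::atomic<Node*> back{&front};

  bool tryPop(Node** dst) {
    Node* first = front.next.load(std::memory_order_acquire);
    if (first == nullptr) return false;

    Node* next = first->next.load(std::memory_order_acquire);
    if (next != nullptr) {           // more than one item: no race with producers
      front.next.store(next, std::memory_order_relaxed);
      *dst = first;
      return true;
    }

    Node* last = back.load(std::memory_order_acquire);
    if (first != last) return false; // a producer already grew the queue; retry later

    front.next.store(nullptr, std::memory_order_relaxed);
    if (back.compare_exchange_strong(last, &front, std::memory_order_acq_rel)) {
      *dst = first;                  // we won: the queue is empty again
      return true;
    }

    // We lost: a producer exchanged `back` but has not linked first->next yet.
    while ((next = first->next.load(std::memory_order_acquire)) == nullptr) {
      // spin until the producer finishes its final linking store
    }
    front.next.store(next, std::memory_order_relaxed);
    *dst = first;
    return true;
  }
};

int main() {
  Mpsc q;
  Node n1; n1.payload = 42;
  // Single-threaded smoke test: link one node the way a producer would.
  q.front.next.store(&n1, std::memory_order_release);
  q.back.store(&n1, std::memory_order_release);

  Node* out = nullptr;
  if (q.tryPop(&out)) std::printf("popped %d\n", out->payload);
  return 0;
}
```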