diff --git a/.github/workflows/build-macos-arm64.yaml b/.github/workflows/build-macos-arm64.yaml
new file mode 100644
index 00000000..a7f34bca
--- /dev/null
+++ b/.github/workflows/build-macos-arm64.yaml
@@ -0,0 +1,16 @@
+
+name: MacOS arm64
+on:
+  push:
+    branches: [master, dev]
+  pull_request:
+    branches: [master, dev]
+  workflow_dispatch:
+jobs:
+  build:
+    runs-on: macos-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+      - name: Make Mochimo Install
+        run: make mochimo -C ${{ github.workspace }}
diff --git a/.github/workflows/build-ubuntu-arm64.yaml b/.github/workflows/build-ubuntu-arm64.yaml
new file mode 100644
index 00000000..fa645c74
--- /dev/null
+++ b/.github/workflows/build-ubuntu-arm64.yaml
@@ -0,0 +1,16 @@
+
+name: Ubuntu arm64
+on:
+  push:
+    branches: [master, dev]
+  pull_request:
+    branches: [master, dev]
+  workflow_dispatch:
+jobs:
+  build:
+    runs-on: ubuntu-24.04-arm
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+      - name: Make Mochimo Install
+        run: make mochimo -C ${{ github.workspace }}
diff --git a/.github/workflows/build-ubuntu-x64.yaml b/.github/workflows/build-ubuntu-x64.yaml
new file mode 100644
index 00000000..4a8e2d47
--- /dev/null
+++ b/.github/workflows/build-ubuntu-x64.yaml
@@ -0,0 +1,16 @@
+
+name: Ubuntu x64
+on:
+  push:
+    branches: [master, dev]
+  pull_request:
+    branches: [master, dev]
+  workflow_dispatch:
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+      - name: Make Mochimo Install
+        run: make mochimo -C ${{ github.workspace }}
diff --git a/.github/workflows/builds.yaml b/.github/workflows/builds.yaml
deleted file mode 100644
index baa4aaa6..00000000
--- a/.github/workflows/builds.yaml
+++ /dev/null
@@ -1,19 +0,0 @@
-##
-# builds.yaml - Build workflow for testing software builds.
-# Copyright 2021-2025 Adequate Systems, LLC. All Rights Reserved.
-#
-
-name: Builds
-on:
-  push:
-    branches: [main, dev]
-  pull_request:
-    branches: [main, dev] # MUST be a subset of the branches above
-jobs:
-  mochimo:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v2
-      - name: Make Mochimo Install
-        run: make mochimo -C ${{ github.workspace }}
diff --git a/.github/workflows/codeql.yaml b/.github/workflows/codeql.yaml
index 04488a05..91335493 100644
--- a/.github/workflows/codeql.yaml
+++ b/.github/workflows/codeql.yaml
@@ -9,10 +9,10 @@ defaults:
     shell: bash
 on:
   push:
-    branches: [main, dev]
+    branches: [dev, master]
   pull_request:
     # pull_request branches MUST be a subset of push branches
-    branches: [main, dev]
+    branches: [dev, master]
 jobs:
   audit:
     runs-on: ubuntu-latest
diff --git a/CHANGELOG.md b/CHANGELOG.md
index b741c2b7..466ec800 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,37 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ***
 
+## [3.0.1] - February 10th, 2025
+
+The latest update focuses on improving GPU mining efficiency and network performance. Key improvements include moving (last half) nonce generation to GPU with parallel PRNG implementation, which significantly reduces CPU usage on multi-GPU systems. Network scanning was enhanced with better thread utilization and increased peer sharing capabilities. The update also includes several CUDA-specific improvements, such as error handling, device counting fixes, and migration to static runtime library for the miner. Development workflows were streamlined with individual build targets and updated CI/CD runners. Non-essential features like NVML support and testnet troubleshooting code were removed to improve codebase maintainability.
+
+### Added
+- Device launch parameters header for CUDA
+- Individual build target workflows in CI/CD
+- Links to Wallet and API releases in README
+- Manual execution instructions in README
+
+### Changed
+- Increased peer sharing capabilities beyond 32 peers
+- Improved CPU efficiency of GPU miner
+  - Moved nonce generation to GPU with parallel PRNG
+  - Better device handling and status output
+  - Stabilized hashrate display
+- Enhanced network scanning with better thread utilization
+- Updated CI/CD runners to latest versions
+- Changed CUDA runtime library to static version
+
+### Fixed
+- CUDA-specific error handling and checking
+- GPU miner (makefile) target dependencies
+- Sudo handling in setup script
+- CUDA device counting when no GPUs present
+- Bridge time check for GPU IDLE->WORK mode
+
+### Removed
+- NVML support (wasn't providing useful data)
+- Testnet troubleshooting from production code
+
 ## [3.0.0] - February 2nd, 2025
 
 Major improvements to Mochimo Addresses including Hash-based Leadger formatting, Base58 error checking perpetual account tags, and UX for account management. Improvements to transactions with the standard transaction capable of 256 destinations, each with their own reference, and easy implementation of additional Digital Signature Algortihms. Improvements to Merkle Root hash allowing for development of Transaction Receipts to validate a transaction was part of the chain without having access to the block data. Improvements to chain linkage and Tfile validation procedures. Improvements to network bandwidth with Variable (sized) Protocol Data Units.
@@ -234,13 +265,14 @@ October 27th, 2018
 - removed default maddr.dat
 - removed txq1.lck (process_tx() is now synchronous)
 
-[Unreleased]: https://github.com/adequatesystems/build-c/compare/v3.0.0...HEAD
-[3.0.0]: https://github.com/adequatesystems/build-c/compare/v2.4.3...v3.0.0
-[2.4.3]: https://github.com/adequatesystems/build-c/compare/v2.4.2...v2.4.3
-[2.4.2]: https://github.com/adequatesystems/build-c/compare/v2.4.1...v2.4.2
-[2.4.1]: https://github.com/adequatesystems/build-c/compare/v2.4...v2.4.1
-[2.4.0]: https://github.com/adequatesystems/build-c/compare/v2.3...v2.4
-[2.3.0]: https://github.com/adequatesystems/build-c/compare/v2.2...v2.3
-[2.2.0]: https://github.com/adequatesystems/build-c/compare/v2.1...v2.2
-[2.1.0]: https://github.com/adequatesystems/build-c/compare/v2.0...v2.1
-[2.0.0]: https://github.com/adequatesystems/build-c/releases/tag/v2.0
+[Unreleased]: https://github.com/mochimodev/mochimo/compare/v3.0.1...HEAD
+[3.0.1]: https://github.com/mochimodev/mochimo/compare/v3.0.0...v3.0.1
+[3.0.0]: https://github.com/mochimodev/mochimo/compare/v2.4.3...v3.0.0
+[2.4.3]: https://github.com/mochimodev/mochimo/compare/v2.4.2...v2.4.3
+[2.4.2]: https://github.com/mochimodev/mochimo/compare/v2.4.1...v2.4.2
+[2.4.1]: https://github.com/mochimodev/mochimo/compare/v2.4...v2.4.1
+[2.4.0]: https://github.com/mochimodev/mochimo/compare/v2.3...v2.4
+[2.3.0]: https://github.com/mochimodev/mochimo/compare/v2.2...v2.3
+[2.2.0]: https://github.com/mochimodev/mochimo/compare/v2.1...v2.2
+[2.1.0]: https://github.com/mochimodev/mochimo/compare/v2.0...v2.1
+[2.0.0]: https://github.com/mochimodev/mochimo/releases/tag/v2.0
diff --git a/README.md b/README.md
index 6f6a1d64..e3773863 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@
 [![GitHub release (latest by date)](https://img.shields.io/github/release/mochimodev/mochimo.svg?logo=github&logoColor=lightgrey&&labelColor=2d3339&label=&color=%230059ff)](https://github.com/mochimodev/mochimo/releases)
 ![GitHub commits since latest release (by date)](https://img.shields.io/github/commits-since/mochimodev/mochimo/latest?logo=github&logoColor=lightgrey&labelColor=2d3339&color=%230059ff)<br>
 [![Tests workflow](https://github.com/mochimodev/mochimo/actions/workflows/tests.yaml/badge.svg)](https://github.com/mochimodev/mochimo/actions/workflows/tests.yaml)
-[![Builds workflow](https://github.com/mochimodev/mochimo/actions/workflows/builds.yaml/badge.svg)](https://github.com/mochimodev/mochimo/actions/workflows/builds.yaml)
+[![Approved Build workflow](https://github.com/mochimodev/mochimo/actions/workflows/build-ubuntu-x64.yaml/badge.svg)](https://github.com/mochimodev/mochimo/actions/workflows/build-ubuntu-x64.yaml)
 [![CodeQL workflow](https://github.com/mochimodev/mochimo/actions/workflows/codeql.yaml/badge.svg)](https://github.com/mochimodev/mochimo/actions/workflows/codeql.yaml)  
 ***Mochimo Services***  
 [![Chrome Web Store Version](https://img.shields.io/chrome-web-store/v/fkogefgjocnflhankmffnibdofdiiiho?logo=chromewebstore&label=Chromium%20Wallet&logoColor=lightgrey&labelColor=2d3339)](https://chromewebstore.google.com/detail/mochimo-wallet/fkogefgjocnflhankmffnibdofdiiiho)
@@ -32,7 +32,12 @@ An API written in Go, intended to comply with the Rosetta Mesh API standard.
 <hr><hr>
 <h1 align="center"><strong>NODE REQUIREMENTS</strong></h1>
 
-## Minimum Hardware
+## Builds on Latest Target Runners
+- [![Ubuntu x64 Build](https://github.com/mochimodev/mochimo/actions/workflows/build-ubuntu-x64.yaml/badge.svg)](https://github.com/mochimodev/mochimo/actions/workflows/build-ubuntu-x64.yaml)
+- [![Ubuntu arm64 Build](https://github.com/mochimodev/mochimo/actions/workflows/build-ubuntu-arm64.yaml/badge.svg)](https://github.com/mochimodev/mochimo/actions/workflows/build-ubuntu-arm64.yaml)
+- [![MacOS arm64 Build](https://github.com/mochimodev/mochimo/actions/workflows/build-macos-arm64.yaml/badge.svg)](https://github.com/mochimodev/mochimo/actions/workflows/build-macos-arm64.yaml)
+
+## System Preconfiguration
 - (CPU) Dual-core Processor
 - (RAM) 2GB of Random Access Memory
 - (SSD) 64GB of Solid State Drive Storage
@@ -64,25 +69,8 @@ To uninstall a Mochimo Node installed as a service, find your mochimo repositori
 [sudo] make uninstall -C ~/.mcm/repo/master
 ```
 
-## Build Manually
-Whatever the reason, build manually with:
-```sh
-# clone repository, if not already, and change directory
-git clone https://github.com/mochimodev/mochimo.git
-# (optionally) select a version
-git -C mochimo/ checkout v3.0.0
-# build mochimo to mochimo/bin/
-make -C mochimo/ mochimo
-```
-
-## Run Manually
-
-```sh
-mochimo/bin/gomochi
-```
-
 <hr><hr>
-<h1 align="center"><strong>MINER BUILD/USAGE</strong></h1>
+<h1 align="center"><strong>MINER INSTRUCTIONS</strong></h1>
 
 ## Build GPU Miner
 *GPU Miner ONLY supports SOLO mining with NVIDIA cards*<br/>
diff --git a/src/bin/gpuminer.c b/src/bin/gpuminer.c
index d7c2e81f..a69814a8 100644
--- a/src/bin/gpuminer.c
+++ b/src/bin/gpuminer.c
@@ -913,7 +913,7 @@ MCM_DECL_UNUSED
                         if (get32(bt->bnum) != get32(BT_curr.bnum)) {
                            total = 0.0;
                            /* report block summary */
-                           plog("Old work stats; %u(0x%x), diff:%u",
+                           plog("Work summary; block %u(0x%x), difficulty %u",
                               get32(bt->bnum), get32(bt->bnum), bt->difficulty[0]);
                            /* print block work stats and hashrate per device */
                            for (int idx = 0; idx < device_count; idx++) {
diff --git a/src/bin/mochimo.c b/src/bin/mochimo.c
index cbacb7a5..ca8e2744 100644
--- a/src/bin/mochimo.c
+++ b/src/bin/mochimo.c
@@ -377,7 +377,7 @@ int init(void)
       shuffle32(Rplist, RPLISTLEN);
       /* scan network for quorum and highest hash/weight/bnum */
       plog("Init network...");
-      qlen = scan_network(quorum, MAXQUORUM, nethash, netweight, netbnum);
+      qlen = scan_quorum(quorum, MAXQUORUM, nethash, netweight, netbnum);
       plog(" - %d/%d 0x%s 0x...%s", qlen, MAXQUORUM,
          bnum2hex(netbnum, bnumhex), weight2hex(netweight, weighthex));
       if (qlen == 0) break; /* all alone... */
diff --git a/src/ledger.c b/src/ledger.c
index 3848128f..9abd71b4 100644
--- a/src/ledger.c
+++ b/src/ledger.c
@@ -709,23 +709,6 @@ int le_update(const char *ltfname)
             }
             /* check sort -- MUST BE ascending, NO duplicates */
             if (addr_compare(le_prev.addr, le.addr) >= 0) {
-               /* WTH IS HAPPENING */
-               perror("le_update() -- addr_compare()");
-               printf("-----------------------------\n");
-               printf("lefp: %p\n", (void *) lefp);
-               printf("ltfp: %p\n", (void *) ltfp);
-               printf("-----------------------------\n");
-               print_n_bytes("le_prev.addr ", le_prev.addr, ADDR_LEN);
-               print_n_bytes("le.addr      ", le.addr, ADDR_LEN);
-               printf("addr_compare(): %d\n", addr_compare(le_prev.addr, le.addr));
-               printf("condition: %d\n", addr_compare(le_prev.addr, le.addr) >= 0);
-               printf("-----------------------------\n");
-               print_n_bytes("lt_prev.addr ", lt_prev.addr, ADDR_LEN);
-               print_n_bytes("lt.addr      ", lt.addr, ADDR_LEN);
-               printf("addr_compare(): %d\n", addr_compare(lt_prev.addr, lt.addr));
-               printf("-----------------------------\n");
-               printf("addr_compare(le, lt): %d\n", addr_compare(le.addr, lt.addr));
-               /* end WTH IS HAPPENING */
                set_errno(EMCM_LESORT);
                goto ERROR_CLEANUP;
             }
diff --git a/src/network.c b/src/network.c
index 32cc88a4..3be006a0 100644
--- a/src/network.c
+++ b/src/network.c
@@ -449,9 +449,16 @@ int send_balance(NODE *np)
  */
 int send_ipl(NODE *np)
 {
-   int count = RPLISTLEN;
+   word32 count;
 
-   if (count > 32) count = 32;
+   /* count peers in Rplist */
+   for (count = 0; count < RPLISTLEN; count++) {
+      if (Rplist[count] == 0) break;
+   }
+   /* limit count to space available in buffer size */
+   if (count > (sizeof(np->tx.buffer) / sizeof(word32))) {
+      count = (sizeof(np->tx.buffer) / sizeof(word32));
+   }
    /* copy recent peer list to TX */
    memcpy(np->tx.buffer, Rplist, sizeof(word32) * count);
    put16(np->tx.len, sizeof(word32) * count);
@@ -901,69 +908,53 @@ bad2: pinklist(np->ip);
  * Qualifying Quorum members are placed in quorum[qlen].
  * Returns number of qualifying quorum members, or number of
  * consensus nodes on the highest chain, if quorum is NULL. */
-int scan_network
+int scan_quorum
 (word32 quorum[], word32 qlen, void *hash, void *weight, void *bnum)
 {
-   TX *tx;
    NODE node;
-   word32 done = 0;
-   word32 next = 0;
+   word32 peer;
+   word32 scanidx = 0;
    word32 qcount = 0;
-   word32 peer, *ipp;
-   word16 len;
+   int result;
    word8 highhash[HASHLEN] = { 0 };
    word8 highweight[32] = { 0 };
    word8 highbnum[8] = { 0 };
-   char weighthex[65], bnumhex[17];
-   int result;
-
-   plog("begin network scan... ");
+   char ipstr[16];
+   word16 len;
 
-   OMP_PARALLEL_(private(tx, node, peer, ipp, len, result))
-   {
-      while (Running && next < RPLISTLEN && Rplist[next]) {
-         OMP_CRITICAL_()
-         {
-            peer = Rplist[next];
-            if (peer) next++;
-         }
-         /* idle condition */
-         if (peer == 0) {
-            if (done < next) {
-               millisleep(100);
-               continue;
-            }
-            break;
-         }
-         /* get ip list from peer */
+   /* iterate through batches of peers */
+   plog("expand network peers... ");
+   while (Running && scanidx < Rplistidx) {
+      if (Rplistidx >= RPLISTLEN) break;
+      pdebug("scan progress %u/%u...", scanidx, Rplistidx);
+
+      /* prepare parallel processing scope, limit threads to 16 */
+      OMP_PARALLEL_(for num_threads(16) private(node, peer, len, ipstr))
+      for (word32 idx = scanidx; idx < Rplistidx; idx++) {
+         if (Rplistidx >= RPLISTLEN) continue;
+         /* get IP list from peer */
+         peer = Rplist[idx];
          if (get_ipl(&node, peer) == VEOK) {
-            /* get ip list from TX */
-            tx = &(node.tx);
-            len = get16(tx->len);
-            ipp = (word32 *) tx->buffer;
             OMP_CRITICAL_()
             {
-               /* iterate peerlist adding to recent peers */
-               for( ; len > 0; ipp++, len -= 4) {
-                  if (Rplistidx >= RPLISTLEN) break;
-                  if (*ipp == 0 || pinklisted(*ipp)) continue;
-                  addrecent(*ipp);
-               }
                /* check peer's chain weight against highweight */
-               result = cmp256(tx->weight, highweight);
-               if (result >= 0) {  /* higher or same chain detection */
-                  if (result > 0) {  /* higher chain detection */
+               result = cmp256(node.tx.weight, highweight);
+               if (result >= 0) {
+                  /* higher or same chain detected */
+                  if (result > 0) {
+                     /* higher chain detected */
                      pdebug("new highweight");
-                     memcpy(highhash, tx->cblockhash, HASHLEN);
-                     memcpy(highweight, tx->weight, 32);
-                     put64(highbnum, tx->cblock);
+                     memcpy(highhash, node.tx.cblockhash, HASHLEN);
+                     memcpy(highweight, node.tx.weight, 32);
+                     put64(highbnum, node.tx.cblock);
                      qcount = 0;
                      if (quorum) {
                         memset(quorum, 0, qlen);
                         pdebug("higher chain found, quourum reset...");
                      }
-                  }  /* check block hash and add to quorum */
-                  if (memcmp(tx->cblockhash, highhash, HASHLEN) >= 0) {
+                  }
+                  /* check block hash and add to quorum */
+                  if (memcmp(node.tx.cblockhash, highhash, HASHLEN) >= 0) {
                      /* add ip to quorum, or q consensus */
                      if (quorum && qcount < qlen) {
                         quorum[qcount++] = peer;
@@ -972,13 +963,26 @@ int scan_network
                   }
                }  /* end if higher or same chain */
             }  /* end OMP_CRITICAL_() */
-         }  /* end if get_ipl() == VEOK */
+            /* inspect peer list */
+            for (len = 0; len < get16(node.tx.len); len += 4) {
+               peer = *((word32 *) &node.tx.buffer[len]);
+               if (peer == 0 || pinklisted(peer)) continue;
+               /* add to recent list */
+               OMP_CRITICAL_()
+               if (Rplistidx < RPLISTLEN) {
+                  if (addpeer(peer, Rplist, RPLISTLEN, &Rplistidx)) {
+                     pdebug("Added %s to recent list", ntoa(&peer, ipstr));
+                  }
+               }
+            }
+         }  /* end if get_ipl() */
+         /* atomic increment scan index */
          OMP_ATOMIC_()
-            done++;
-      }  /* end while() */
-   }  /* end OMP_PARALLEL_() */
-   pdebug("qualifying weight 0x...%s", weight2hex(highweight, weighthex));
-   pdebug("qualifying block 0x%s", bnum2hex(highbnum, bnumhex));
+            scanidx++;
+      }  /* end OMP_PARALLEL_() */
+   }  /* end while() */
+   pdebug("qualifying weight 0x...%s", weight2hex(highweight, NULL));
+   pdebug("qualifying block 0x%s", bnum2hex(highbnum, NULL));
    pdebug("qualifying nodes %d...", qcount);
    print_ipl(quorum, qcount);
 
@@ -988,7 +992,7 @@ int scan_network
    if (bnum) put64(bnum, highbnum);
 
    return qcount;
-}  /* end scan_network() */
+}  /* end scan_quorum() */
 
 /* Refresh the ip list and send_found() to low-weight peer if needed.
  * Called from server().
diff --git a/src/network.h b/src/network.h
index 8be86b00..d1e050c6 100644
--- a/src/network.h
+++ b/src/network.h
@@ -69,8 +69,8 @@ int get_file(word32 ip, word8 *bnum, char *fname);
 int get_ipl(NODE *np, word32 ip);
 int get_hash(NODE *np, word32 ip, void *bnum, void *blockhash);
 int gettx(NODE *np, SOCKET sd);
-int scan_network
-(word32 quorum[], word32 qlen, void *hash, void *weight, void *bnum);
+int scan_quorum
+   (word32 quorum[], word32 qlen, void *hash, void *weight, void *bnum);
 int refresh_ipl(void);
 
 #ifdef __cplusplus
diff --git a/src/peach.cu b/src/peach.cu
index 9cfe1324..1a3d1d67 100644
--- a/src/peach.cu
+++ b/src/peach.cu
@@ -32,19 +32,6 @@
 #include "sha256.cu"
 #include "sha3.h"
 
-/**
- * @private
- * Definitions for embedding strings.
-*/
-#define cuSTRING(x) #x
-#define cuSTR(x) cuSTRING(x)
-
-#define cu__log_error(err) \
-   do { \
-      palert("CUDA ERROR: (%d) %s", (int) err, cudaGetErrorString(err)); \
-      set_errno(EMCM_CUDA); \
-   } while(0);
-
 /* sm_61 performs MUCH better with the __constant__ qualifier */
 #if __CUDA_ARCH__ == 610
    #define cuCONSTn860 __constant__
@@ -66,9 +53,6 @@ typedef struct {
    word32 *d_phash;                    /**< previous hash */
 } PEACH_CUDA_CTX;
 
-/* pointer to peach CUDA context/s */
-static PEACH_CUDA_CTX *PeachCudaCTX;
-
 __device__ cuCONSTn860 static word64 Z_ING[32] = {
    18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
    34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 23, 24, 31, 32, 33, 34
@@ -815,16 +799,16 @@ __global__ void kcu_peach_solve
 
    /* generate last half of nonce from seed (w/ largest known frame) */
    seed = cu_rand64(d_state);
-   nonce[2] =    WORD64_C(0x10000050000) | /* nonce8bit[2,5] */
-          Z_ING[(seed     )  & 31]       | /* nonce8bit[0] */
-         Z_PREP[(seed >> 5)  &  7] <<  8 | /* nonce8bit[1] */
-          Z_ADJ[(seed >> 8)  & 63] << 24 | /* nonce8bit[3] */
-           Z_NS[(seed >> 14) & 63] << 32 | /* nonce8bit[4] */
-         Z_MASS[(seed >> 20) & 31] << 48 | /* nonce8bit[6] */
-          Z_ING[(seed >> 25) & 31] << 56;        /* nonce8bit[7] */
-   nonce[3] =  WORD64_C(0x50103) | /* nonce8bit[8:10] */
-          Z_ADJ[(seed >> 30) & 63] << 24 | /* nonce8bit[11] */
-           Z_NS[(seed >> 36) & 63] << 32;  /* nonce8bit[12] */
+   nonce[2] = WORD64_C(0x10000050000) | /* nonce8bit[2,5] */
+       Z_ING[(seed     )  & 31]       | /* nonce8bit[0] */
+      Z_PREP[(seed >> 5)  &  7] <<  8 | /* nonce8bit[1] */
+       Z_ADJ[(seed >> 8)  & 63] << 24 | /* nonce8bit[3] */
+        Z_NS[(seed >> 14) & 63] << 32 | /* nonce8bit[4] */
+      Z_MASS[(seed >> 20) & 31] << 48 | /* nonce8bit[6] */
+       Z_ING[(seed >> 25) & 31] << 56;  /* nonce8bit[7] */
+   nonce[3] =       WORD64_C(0x50103) | /* nonce8bit[8:10] */
+       Z_ADJ[(seed >> 30) & 63] << 24 | /* nonce8bit[11] */
+        Z_NS[(seed >> 36) & 63] << 32;  /* nonce8bit[12] */
 
    /* sha256 hash trailer and nonce */
    cu_sha256_init(&ictx);
@@ -921,7 +905,10 @@ __global__ void kcu_peach_checkhash
  * @param count Number of block trailers to check
  * @param bt Pointer to block trailer array
  * @param out Pointer to final hash array, if non-null
- * @returns VEOK on success, else VERROR
+ * @returns (int) value representing the result of the operation
+ * @retval (-1) Error occurred during operation
+ * @retval 0 Evaluation successful
+ * @retval 1 Evaluation failed
 */
 int peach_checkhash_cuda(int count, BTRAILER bt[], void *out)
 {
@@ -937,8 +924,11 @@ int peach_checkhash_cuda(int count, BTRAILER bt[], void *out)
    do { \
       cudaError_t err = (cuFN); \
       if (err != cudaSuccess) { \
-         cu__log_error(err); \
-         return VERROR; \
+         const char *str = cudaGetErrorString(err); \
+         palert("CUDA ERROR: (%d) %s", (int) err, str); \
+         palert("... error returned by: %s", #cuFN); \
+         set_errno(EMCM_CUDA); \
+         return (-1); \
       } \
    } while(0)
 
@@ -956,7 +946,8 @@ int peach_checkhash_cuda(int count, BTRAILER bt[], void *out)
    cuCHK(cudaMemset(d_out, 0, outsz));
    cuCHK(cudaMemset(d_eval, 0, 1));
    /* launch kernel to check Peach */
-   kcu_peach_checkhash<<<1, count>>>(d_bt, d_out, d_eval);
+   CUDA_KERNEL(kcu_peach_checkhash, 1, count)
+      (d_bt, d_out, d_eval);
    cuCHK(cudaGetLastError());
    /* retrieve hash/eval data */
    cuCHK(cudaMemcpy(out, d_out, outsz, cudaMemcpyDeviceToHost));
@@ -971,39 +962,6 @@ int peach_checkhash_cuda(int count, BTRAILER bt[], void *out)
    return (int) eval;
 }  /* end peach_checkhash_cuda() */
 
-/**
- * Free CUDA memory allocated to a previously initialized device context.
- * @param devp Pointer to DEVICE_CTX to free
- * @returns VEOK on valid DEVICE_CTX pointer, else VERROR
-*/
-int peach_free_cuda_device(DEVICE_CTX *devp, int status)
-{
-   /* check device pointer */
-   if (devp == NULL) return VERROR;
-   /* set device status */
-   devp->status = status;
-   /* free pointers -- if set */
-   PEACH_CUDA_CTX *ctxp = &PeachCudaCTX[devp->id];
-   if (ctxp->stream[0]) cudaStreamDestroy(ctxp->stream[0]);
-   if (ctxp->stream[1]) cudaStreamDestroy(ctxp->stream[1]);
-   if (ctxp->h_solve[0]) cudaFreeHost(ctxp->h_solve[0]);
-   if (ctxp->h_solve[1]) cudaFreeHost(ctxp->h_solve[1]);
-   if (ctxp->h_bt[0]) cudaFreeHost(ctxp->h_bt[0]);
-   if (ctxp->h_bt[1]) cudaFreeHost(ctxp->h_bt[1]);
-   if (ctxp->d_solve[0]) cudaFree(ctxp->d_solve[0]);
-   if (ctxp->d_solve[1]) cudaFree(ctxp->d_solve[1]);
-   if (ctxp->d_state[0]) cudaFree(ctxp->d_state[0]);
-   if (ctxp->d_state[1]) cudaFree(ctxp->d_state[1]);
-   if (ctxp->d_bt[0]) cudaFree(ctxp->d_bt[0]);
-   if (ctxp->d_bt[1]) cudaFree(ctxp->d_bt[1]);
-   if (ctxp->d_phash) cudaFree(ctxp->d_phash);
-   if (ctxp->d_map) cudaFree(ctxp->d_map);
-   /* attempt to clear last error */
-   (void) cudaGetLastError();
-
-   return VEOK;
-}  /* end peach_free_cuda_device() */
-
 /**
  * (re)Initialize a device context with a CUDA device.
  * @param devp Pointer to DEVICE_CTX to initialize
@@ -1017,14 +975,17 @@ int peach_init_cuda_device(DEVICE_CTX *ctx)
 {
    PEACH_CUDA_CTX *p_ctx;
    size_t btsz, seedsz;
+   int grid, block;
 
 #undef cuCHK
 #define cuCHK(cuFN) \
    do { \
       cudaError_t err = (cuFN); \
       if (err != cudaSuccess) { \
+         const char *str = cudaGetErrorString(err); \
+         palert("CUDA ERROR on #(%d): (%d) %s", ctx->id, (int) err, str); \
+         palert("... error returned by: %s", #cuFN); \
          ctx->status = DEV_FAIL; \
-         cu__log_error(err); \
          return VERROR; \
       } \
    } while(0)
@@ -1042,19 +1003,14 @@ int peach_init_cuda_device(DEVICE_CTX *ctx)
 
    /* set context to CUDA id */
    cuCHK(cudaSetDevice(ctx->id));
-   /* set CUDA configuration for device */
-   if (cudaOccupancyMaxPotentialBlockSize(&(ctx->grid), &(ctx->block),
-         kcu_peach_solve, 0, 0) != cudaSuccess) {
-      pdebug("cudaOccupancy~BlockSize(%d) failed...", ctx->id);
-      pdebug("Using conservative defaults for <<<512/128>>>");
-      ctx->grid = 512;
-      ctx->block = 128;
-   }
-   /* calculate total threads */
+   /* determine CUDA occupancy for device */
+   cuCHK(cudaOccupancyMaxPotentialBlockSize(&grid, &block, kcu_peach_solve, 0, 0));
+   /* store grid/block and calculate threads and state sizes */
+   ctx->grid = grid;
+   ctx->block = block;
    ctx->threads = ctx->grid * ctx->block;
    seedsz = sizeof(word64) * ctx->threads;
    btsz = sizeof(BTRAILER);
-   /* create generator for unsigned long long */
    /* create streams for device */
    cuCHK(cudaStreamCreate(&(p_ctx->stream[0])));
    cuCHK(cudaStreamCreate(&(p_ctx->stream[1])));
@@ -1074,25 +1030,21 @@ int peach_init_cuda_device(DEVICE_CTX *ctx)
    cuCHK(cudaMalloc(&(p_ctx->d_phash), 32));
    cuCHK(cudaMalloc(&(p_ctx->d_map), PEACHMAPLEN));
    /* clear device/host allocated memory */
-   cuCHK(cudaMemsetAsync(p_ctx->d_bt[0], 0, btsz, cudaStreamDefault));
-   cuCHK(cudaMemsetAsync(p_ctx->d_bt[1], 0, btsz, cudaStreamDefault));
-   cuCHK(cudaMemsetAsync(p_ctx->d_state[0], 0, seedsz, cudaStreamDefault));
-   cuCHK(cudaMemsetAsync(p_ctx->d_state[1], 0, seedsz, cudaStreamDefault));
-   cuCHK(cudaMemsetAsync(p_ctx->d_solve[0], 0, 32, cudaStreamDefault));
-   cuCHK(cudaMemsetAsync(p_ctx->d_solve[1], 0, 32, cudaStreamDefault));
-   cuCHK(cudaMemsetAsync(p_ctx->d_phash, 0, 32, cudaStreamDefault));
+   cuCHK(cudaMemsetAsync(p_ctx->d_bt[0], 0, btsz, p_ctx->stream[0]));
+   cuCHK(cudaMemsetAsync(p_ctx->d_bt[1], 0, btsz, p_ctx->stream[1]));
+   cuCHK(cudaMemsetAsync(p_ctx->d_solve[0], 0, 32, p_ctx->stream[0]));
+   cuCHK(cudaMemsetAsync(p_ctx->d_solve[1], 0, 32, p_ctx->stream[1]));
+   cuCHK(cudaMemsetAsync(p_ctx->d_phash, 0, 32, p_ctx->stream[0]));
    memset(p_ctx->h_bt[0], 0, btsz);
    memset(p_ctx->h_bt[1], 0, btsz);
    memset(p_ctx->h_solve[0], 0, 32);
    memset(p_ctx->h_solve[1], 0, 32);
 
-   /* wait for all operations in cudaStreamDefault to complete */
-   cuCHK(cudaStreamSynchronize(cudaStreamDefault));
-
    /* generate prng state */
-   kcu_srand64<<<ctx->grid, ctx->block>>>(p_ctx->d_state[0], rand32());
-   kcu_srand64<<<ctx->grid, ctx->block>>>(p_ctx->d_state[1], rand32());
-   cudaDeviceSynchronize();
+   CUDA_KERNEL(kcu_srand64, grid, block, 0, p_ctx->stream[0])
+      (p_ctx->d_state[0], rand32());
+   CUDA_KERNEL(kcu_srand64, grid, block, 0, p_ctx->stream[1])
+      (p_ctx->d_state[1], rand32());
 
    /* set device as initialized */
    ctx->status = DEV_INIT;
@@ -1103,7 +1055,7 @@ int peach_init_cuda_device(DEVICE_CTX *ctx)
 /**
  * Try solve for a tokenized haiku as nonce output for Peach proof of work
  * on CUDA devices. Combine haiku protocols implemented in the Trigg
- * Algorithm with the memory intensive protocols of the Peach algorithm to
+ * Algorithm with the intensive protocols of the Peach algorithm to
  * generate haiku output as proof of work.
  * @param ctx Pointer to DEVICE_CTX to perform work with
  * @param bt Pointer to block trailer to solve for
@@ -1114,7 +1066,7 @@ int peach_init_cuda_device(DEVICE_CTX *ctx)
 */
 int peach_solve_cuda(DEVICE_CTX *ctx, BTRAILER *bt, word8 diff, BTRAILER *btout)
 {
-   int id, sid, grid, block, build;
+   int id, grid, block, build;
    PEACH_CUDA_CTX *P;
    cudaError_t err;
 
@@ -1123,32 +1075,40 @@ int peach_solve_cuda(DEVICE_CTX *ctx, BTRAILER *bt, word8 diff, BTRAILER *btout)
    do { \
       err = (cuFN); \
       if (err != cudaSuccess) { \
+         const char *str = cudaGetErrorString(err); \
+         palert("CUDA ERROR on #(%d): (%d) %s", ctx->id, (int) err, str); \
+         palert("... error returned by: %s", #cuFN); \
          ctx->status = DEV_FAIL; \
-         cu__log_error(err); \
          return VERROR; \
       } \
    } while(0)
 
    /* init */
-   id = ctx->id;
    P = (PEACH_CUDA_CTX *) ctx->peach;
    /* report unuseable GPUs */
    if (ctx->status < DEV_NULL) return VETIMEOUT;
 
-   /* set/check cuda device */
-   cuCHK(cudaSetDevice(id));
+   /* set cuda device */
+   cuCHK(cudaSetDevice(ctx->id));
+   /* check for previous (async) execution errors */
    cuCHK(cudaGetLastError());
 
    /* build peach map */
    if (ctx->status == DEV_INIT) {
       /* build peach map -- init */
-      if (ctx->work == 0) {
-         /* ensure both streams have finished */
-         if (cudaStreamQuery(P->stream[1]) == cudaSuccess
-            && cudaStreamQuery(P->stream[0]) == cudaSuccess) {
-            /* synchronize device before initializing new peach map */
-            cudaDeviceSynchronize();
-            /* clear any late solves */
+      for (build = id = 0; id < 2; id++) {
+         /* check stream is ready */
+         err = cudaStreamQuery(P->stream[id]);
+         if (err == cudaErrorNotReady) continue;
+         cuCHK(err);
+
+         /* check pre-build state */
+         if (ctx->work == 0 && build == 0) {
+            /* ensure secondary stream is ready */
+            err = cudaStreamQuery(P->stream[id ^ 1]);
+            if (err == cudaErrorNotReady) break;
+            cuCHK(err);
+            /* clear late solves */
             cuCHK(cudaMemset(P->d_solve[0], 0, 32));
             cuCHK(cudaMemset(P->d_solve[1], 0, 32));
             memset(P->h_solve[0], 0, 32);
@@ -1159,72 +1119,58 @@ int peach_solve_cuda(DEVICE_CTX *ctx, BTRAILER *bt, word8 diff, BTRAILER *btout)
             /* update device phash */
             cuCHK(cudaMemcpy(P->d_phash, P->h_bt[0]->phash, 32, cudaMemcpyHostToDevice));
             /* synchronize memory transfers before building peach map */
-            cudaDeviceSynchronize();
-            /* flag build ready */
+            cuCHK(cudaDeviceSynchronize());
+            /* flag build state */
             build = 1;
          }
-      }
-      /* build peach map -- build */
-      if (ctx->work < PEACHCACHELEN) {
-         for (sid = 0; sid < 2 && (build || ctx->work > 0); sid++) {
-            /* ensure stream is ready for next section of build */
-            if (cudaStreamQuery(P->stream[sid]) != cudaSuccess) continue;
-            /* set CUDA configuration for generating peach map */
-            if (cudaOccupancyMaxPotentialBlockSize(&grid, &block,
-                  kcu_peach_build, 0, 0) != cudaSuccess) {
-               pdebug("cudaOccupancy~BlockSize(%d) failed...", id);
-               pdebug("Using conservative defaults, <<<128/128>>>");
-               grid = 128;
-               block = 128;
+         /* check build state */
+         if (ctx->work > 0 || build) {
+            if (ctx->work < PEACHCACHELEN) {
+               /* prepare launch config and generate peach map */
+               cuCHK(cudaOccupancyMaxPotentialBlockSize(&grid, &block, kcu_peach_build, 0, 0));
+               CUDA_KERNEL(kcu_peach_build, grid, block, 0, P->stream[id])
+                  ((word32) ctx->work, P->d_map, P->d_phash);
+               cuCHK(cudaGetLastError());
+               /* update build progress */
+               ctx->work += grid * block;
+            } else {
+               /* ensure secondary stream is finished */
+               err = cudaStreamQuery(P->stream[id ^ 1]);
+               if (err == cudaErrorNotReady) break;
+               cuCHK(err);
+               /* build is complete */
+               ctx->last = time(NULL);
+               ctx->status = DEV_IDLE;
+               ctx->work = 0;
+               break;
             }
-            /* launch kernel to generate map */
-            kcu_peach_build<<<grid, block, 0, P->stream[sid]>>>
-               ((word32) ctx->work, P->d_map, P->d_phash);
-            cuCHK(cudaGetLastError());
-            /* update build progress */
-            ctx->work += grid * block;
-         }
-      } else {
-         /* ensure both streams have finished */
-         if (cudaStreamQuery(P->stream[1]) == cudaSuccess
-            && cudaStreamQuery(P->stream[0]) == cudaSuccess) {
-            /* build is complete */
-            ctx->last = time(NULL);
-            ctx->status = DEV_IDLE;
-            ctx->work = 0;
-         }
-      }
-   }
+         }  /* end if (ctx->work > 0... */
+      }  /* end for(build = id = 0... */
+   }  /* end if (ctx->status == DEV_INIT)... */
 
    /* switch to WORK mode when all conditions are satisfied:
     * - transactions to solve
     * - block NOT already solved
     * - block NOT expired
     */
-   switch (ctx->status) {
-      case DEV_IDLE:{
-         if (get32(bt->tcount) == 0) break;
-         if (cmp64(bt->bnum, btout->bnum) == 0) break;
-         if (difftime(time(NULL), get32(bt->time0)) >= BRIDGEv3) break;
-         ctx->last = time(NULL);
-         ctx->status = DEV_WORK;
-         ctx->work = 0;
-         break;
-      }
+   while (ctx->status == DEV_IDLE) {
+      if (get32(bt->tcount) == 0) break;
+      if (cmp64(bt->bnum, btout->bnum) == 0) break;
+      if (difftime(time(NULL), get32(bt->time0)) >= BRIDGEv3) break;
+      ctx->last = time(NULL);
+      ctx->status = DEV_WORK;
+      ctx->work = 0;
+      break;
    }
 
    /* solve work in block trailer */
    if (ctx->status == DEV_WORK) {
-      for(sid = 0; sid < 2; sid++) {
-         err = cudaStreamQuery(P->stream[sid]);
+      for(id = 0; id < 2; id++) {
+         err = cudaStreamQuery(P->stream[id]);
          if (err == cudaErrorNotReady) continue;
-         if (err != cudaSuccess) {
-            ctx->status = DEV_FAIL;
-            cu__log_error(err);
-            return VERROR;
-         }
+         cuCHK(err);
          /* check trailer for block update */
-         if (memcmp(P->h_bt[sid]->phash, bt->phash, HASHLEN)) {
+         if (memcmp(P->h_bt[id]->phash, bt->phash, HASHLEN)) {
             ctx->status = DEV_INIT;
             ctx->work = 0;
             break;
@@ -1241,41 +1187,37 @@ int peach_solve_cuda(DEVICE_CTX *ctx, BTRAILER *bt, word8 diff, BTRAILER *btout)
             break;
          }
          /* check for solves */
-         if (*(P->h_solve[sid])) {
-            /* combine solved nonce with bt */
-            memcpy(P->h_bt[sid]->nonce, P->h_solve[sid], 32);
-            /* clear solve from host/device */
-            cudaMemsetAsync(P->d_solve[sid], 0, 32, P->stream[sid]);
-            memset(P->h_solve[sid], 0, 32);
-            /* move solved block trailer to btout */
-            memcpy(btout, P->h_bt[sid], sizeof(BTRAILER));
-            /* return a solve */
+         if (*(P->h_solve[id])) {
+            /* combine solve with nonce and copy to output */
+            memcpy(P->h_bt[id]->nonce, P->h_solve[id], 32);
+            memcpy(btout, P->h_bt[id], sizeof(BTRAILER));
+            /* (async) clear solve */
+            cuCHK(cudaMemsetAsync(P->d_solve[id], 0, 32, P->stream[id]));
+            memset(P->h_solve[id], 0, 32);
+
             return VEOK;
          }
-         /* check for "on-the-fly" difficulty changes */
+         /* update block trailer (incl. half nonce) */
+         memcpy(P->h_bt[id], bt, 92);
+         trigg_generate(P->h_bt[id]->nonce);
+         /* (async) update trailer data (incl. half nonce) */
+         cuCHK(cudaMemcpyAsync(P->d_bt[id], P->h_bt[id],
+            92 + 16, cudaMemcpyHostToDevice, P->stream[id]));
+         /* (async) launch kernel to solve Peach (dynamic difficulty) */
          diff = diff && diff < bt->difficulty[0] ? diff : bt->difficulty[0];
-         /* ensure host block trailer is updated */
-         memcpy(P->h_bt[sid], bt, 92);
-         /* generate (first) nonce directly into block trailer */
-         trigg_generate(P->h_bt[sid]->nonce);
-         /* copy trailer updates w/ nonce ELSE just nonce */
-         cuCHK(cudaMemcpyAsync(P->d_bt[sid], P->h_bt[sid],
-            92 + 16, cudaMemcpyHostToDevice, P->stream[sid]));
-         cuCHK(cudaGetLastError());
-         /* launch kernel to solve Peach */
-         kcu_peach_solve<<<ctx->grid, ctx->block, 0, P->stream[sid]>>>
-            (P->d_map, P->d_bt[sid], P->d_state[sid], diff, P->d_solve[sid]);
-         cuCHK(cudaGetLastError());
-         /* retrieve solve seed */
-         cudaMemcpyAsync(P->h_solve[sid], P->d_solve[sid], 32,
-            cudaMemcpyDeviceToHost, P->stream[sid]);
+         CUDA_KERNEL(kcu_peach_solve, ctx->grid, ctx->block, 0, P->stream[id])
+            (P->d_map, P->d_bt[id], P->d_state[id], diff, P->d_solve[id]);
+         /* check kernel launch errors */
          cuCHK(cudaGetLastError());
+         /* (async) solve retrieval */
+         cuCHK(cudaMemcpyAsync(P->h_solve[id], P->d_solve[id], 32,
+            cudaMemcpyDeviceToHost, P->stream[id]));
          /* increment progress counters */
          ctx->work += ctx->threads;
          double delta = difftime(time(NULL), ctx->last);
          ctx->hps = ctx->work / (delta ? delta : 1);
-      }
-   }
+      }  /* end for(id = 0; id < 2; id++)... */
+   }  /* end if (ctx->status == DEV_WORK)... */
 
    return VERROR;
 }  /* end peach_solve_cuda() */
diff --git a/src/peach.cuh b/src/peach.cuh
index a43db3cc..872755fb 100644
--- a/src/peach.cuh
+++ b/src/peach.cuh
@@ -12,11 +12,24 @@
 
 
 #include <cuda_runtime.h>
+#include <device_launch_parameters.h>
 
 #include <stdint.h>
 #include <time.h>
 #include "peach.h"
 
+/* WORKAROUND for annoying limitations of intellisense, due to the arguably
+ * questionable choice of CUDA delimiter for Kernel Function Arguments.
+ * Based on contributions to a stackoverflow question here:
+ *    https://stackoverflow.com/a/63084481
+ */
+#ifdef __INTELLISENSE__
+#define CUDA_KERNEL(...)
+#else
+#define CUDA_KERNEL(FN, ...) FN <<< __VA_ARGS__ >>>
+#endif
+/* end WORKAROUND */
+
 __global__ void kcu_peach_build
    (word32 offset, word64 *d_map, word32 *d_phash);
 __global__ void kcu_peach_solve
diff --git a/src/peach.h b/src/peach.h
index 270d65f9..992085f9 100644
--- a/src/peach.h
+++ b/src/peach.h
@@ -121,7 +121,6 @@ int peach_solve(const BTRAILER *bt, word8 diff, void *out);
 
 /* CUDA functions */
 int peach_checkhash_cuda(int count, BTRAILER bt[], void *out);
-int peach_free_cuda_device(DEVICE_CTX *devp, int status);
 int peach_init_cuda_device(DEVICE_CTX *devp);
 int peach_solve_cuda(DEVICE_CTX *dev, BTRAILER *bt, word8 diff, BTRAILER *out);
 
diff --git a/src/test/peach-mining-cu.c b/src/test/peach-mining-cu.c
index 6b345bf5..78b3298d 100644
--- a/src/test/peach-mining-cu.c
+++ b/src/test/peach-mining-cu.c
@@ -35,14 +35,7 @@ static word8 Block1[sizeof(BTRAILER)] = {
    0x42, 0xd4, 0xba, 0x1c, 0xf7, 0x2f, 0x6e, 0x37, 0xff, 0x92,
    0x99, 0x9a, 0xa0, 0x32, 0x55, 0x51, 0xbc, 0xf1, 0x5f, 0x69
 };
-void print_32_bytes(word8 *data)
-{
-   int i;
-   for (i = 0; i < 32; i++) {
-      printf("%02x", data[i]);
-   }
-   printf("\n");
-}
+
 int main()
 {
    DEVICE_CTX D[GPUMAX] = { 0 };
@@ -95,10 +88,7 @@ int main()
       m = metric_reduce(&hps);
       ASSERT_DEBUG("Diff(%d) perf: ~%.2lf %sH/s\n", diff, hps, m);
       /* ensure solution is correct */
-      print_32_bytes(btout.nonce);
-      n = peach_checkhash(&btout, btout.difficulty[0], digest);
-      ASSERT_EQ(n, VEOK);
-      plog("assertion succeeded");
+      ASSERT_EQ(peach_checkhash(&btout, btout.difficulty[0], digest), 0);
    }
    /* check difficulty met requirement */
    ASSERT_GE_MSG(diff, 2, "should meet minimum diff requirement");