diff --git a/.vscode/settings.json b/.vscode/settings.json index 8d0be481..7e42f2d1 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -12,6 +12,7 @@ "xtree": "c", "xutility": "c", "tb_internal.h": "c", - "gc.h": "c" + "gc.h": "c", + "windows.h": "c" } } \ No newline at end of file diff --git a/c11threads/threads.h b/c11threads/threads.h new file mode 100644 index 00000000..ead6e87b --- /dev/null +++ b/c11threads/threads.h @@ -0,0 +1,151 @@ +/* + * C11 emulation library + * + * (C) Copyright yohhoy 2012. + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://www.boost.org/LICENSE_1_0.txt) + */ +#ifndef EMULATED_THREADS_H_INCLUDED_ +#define EMULATED_THREADS_H_INCLUDED_ + +#include + +#if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#include + +// check configuration +#if defined(EMULATED_THREADS_USE_NATIVE_CALL_ONCE) && (_WIN32_WINNT < 0x0600) +#error EMULATED_THREADS_USE_NATIVE_CALL_ONCE requires _WIN32_WINNT>=0x0600 +#endif + +#if defined(EMULATED_THREADS_USE_NATIVE_CV) && (_WIN32_WINNT < 0x0600) +#error EMULATED_THREADS_USE_NATIVE_CV requires _WIN32_WINNT>=0x0600 +#endif + +/*---------------------------- macros ----------------------------*/ +#ifdef EMULATED_THREADS_USE_NATIVE_CALL_ONCE +#define ONCE_FLAG_INIT INIT_ONCE_STATIC_INIT +#else +#define ONCE_FLAG_INIT \ +{ 0 } +#endif +#define TSS_DTOR_ITERATIONS 1 + +#ifndef thread_local +#define thread_local _Thread_local +#endif + +/*---------------------------- types ----------------------------*/ +typedef struct cnd_t { + #ifdef EMULATED_THREADS_USE_NATIVE_CV + CONDITION_VARIABLE condvar; + #else + int blocked; + int gone; + int to_unblock; + HANDLE sem_queue; + HANDLE sem_gate; + CRITICAL_SECTION monitor; + #endif +} cnd_t; + +typedef HANDLE thrd_t; + +typedef DWORD tss_t; + +typedef struct mtx_t { + CRITICAL_SECTION cs; +} mtx_t; + +#ifdef EMULATED_THREADS_USE_NATIVE_CALL_ONCE +typedef INIT_ONCE once_flag; +#else +typedef struct once_flag_t { + volatile LONG status; +} once_flag; +#endif + +#elif defined(__unix__) || defined(__unix) || defined(__APPLE__) +#include + +/*---------------------------- macros ----------------------------*/ +#define ONCE_FLAG_INIT PTHREAD_ONCE_INIT +#ifdef INIT_ONCE_STATIC_INIT +#define TSS_DTOR_ITERATIONS PTHREAD_DESTRUCTOR_ITERATIONS +#else +#define TSS_DTOR_ITERATIONS 1 // assume TSS dtor MAY be called at least once. +#endif + +/*---------------------------- types ----------------------------*/ +typedef pthread_cond_t cnd_t; +typedef pthread_t thrd_t; +typedef pthread_key_t tss_t; +typedef pthread_mutex_t mtx_t; +typedef pthread_once_t once_flag; + +#else +#error Not supported on this platform. 
+#endif + +/*---------------------------- types ----------------------------*/ +typedef void (*tss_dtor_t)(void*); +typedef int (*thrd_start_t)(void*); + +struct xtime { + time_t sec; + long nsec; +}; +typedef struct xtime xtime; + +/*-------------------- enumeration constants --------------------*/ +enum { + mtx_plain = 0, + mtx_try = 1, + mtx_timed = 2, + mtx_recursive = 4 +}; + +enum { + thrd_success = 0, // succeeded + thrd_timeout, // timeout + thrd_error, // failed + thrd_busy, // resource busy + thrd_nomem // out of memory +}; + +/*-------------------------- functions --------------------------*/ +void call_once(once_flag* flag, void (*func)(void)); + +int cnd_broadcast(cnd_t* cond); +void cnd_destroy(cnd_t* cond); +int cnd_init(cnd_t* cond); +int cnd_signal(cnd_t* cond); +int cnd_timedwait(cnd_t* cond, mtx_t* mtx, const xtime* xt); +int cnd_wait(cnd_t* cond, mtx_t* mtx); + +void mtx_destroy(mtx_t* mtx); +int mtx_init(mtx_t* mtx, int type); +int mtx_lock(mtx_t* mtx); +int mtx_timedlock(mtx_t* mtx, const xtime* xt); +int mtx_trylock(mtx_t* mtx); +int mtx_unlock(mtx_t* mtx); + +int thrd_create(thrd_t* thr, thrd_start_t func, void* arg); +thrd_t thrd_current(void); +int thrd_detach(thrd_t thr); +int thrd_equal(thrd_t thr0, thrd_t thr1); +void thrd_exit(int res); +int thrd_join(thrd_t thr, int* res); +void thrd_sleep(const xtime* xt); +void thrd_yield(void); + +int tss_create(tss_t* key, tss_dtor_t dtor); +void tss_delete(tss_t key); +void* tss_get(tss_t key); +int tss_set(tss_t key, void* val); + +int xtime_get(xtime* xt, int base); +#define TIME_UTC 1 + +#endif /* EMULATED_THREADS_H_INCLUDED_ */ diff --git a/c11threads/threads_msvc.c b/c11threads/threads_msvc.c new file mode 100644 index 00000000..6fc621ad --- /dev/null +++ b/c11threads/threads_msvc.c @@ -0,0 +1,460 @@ +/* + * C11 emulation library + * + * (C) Copyright yohhoy 2012. + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://www.boost.org/LICENSE_1_0.txt) + */ +#include +#include +#include +#include // MSVCRT +#include + +/* +Configuration macro: + + EMULATED_THREADS_USE_NATIVE_CALL_ONCE + Use native WindowsAPI one-time initialization function. + (requires WinVista or later) + Otherwise emulate by mtx_trylock() + *busy loop* for WinXP. + + EMULATED_THREADS_USE_NATIVE_CV + Use native WindowsAPI condition variable object. + (requires WinVista or later) + Otherwise use emulated implementation for WinXP. + + EMULATED_THREADS_TSS_DTOR_SLOTNUM + Max registerable TSS dtor number. +*/ +#if _WIN32_WINNT >= 0x0600 +// Prefer native WindowsAPI on newer environment. +#define EMULATED_THREADS_USE_NATIVE_CALL_ONCE +#define EMULATED_THREADS_USE_NATIVE_CV +#endif +#define EMULATED_THREADS_TSS_DTOR_SLOTNUM 64 // see TLS_MINIMUM_AVAILABLE + +#include "threads.h" + +/* +Implementation limits: + - Conditionally emulation for "Initialization functions" + (see EMULATED_THREADS_USE_NATIVE_CALL_ONCE macro) + - Emulated `mtx_timelock()' with mtx_trylock() + *busy loop* +*/ +static void impl_tss_dtor_invoke(); // forward decl. 
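/*
 * Quick usage sketch of the <threads.h>-style API declared above. This is
 * illustrative only: the `worker`, `counter` and `run` names are made up for
 * the example and are not part of the library.
 *
 *     static mtx_t lock;
 *     static int counter;
 *
 *     static int worker(void* arg) {
 *         (void)arg;
 *         mtx_lock(&lock);
 *         counter += 1;
 *         mtx_unlock(&lock);
 *         return 0;
 *     }
 *
 *     int run(void) {
 *         thrd_t t;
 *         int res;
 *         if (mtx_init(&lock, mtx_plain) != thrd_success) return -1;
 *         if (thrd_create(&t, worker, NULL) != thrd_success) return -1;
 *         worker(NULL);          // do some work on this thread too
 *         thrd_join(&t, &res);   // wait and collect the worker's return value
 *         mtx_destroy(&lock);
 *         return counter;        // 2 on success
 *     }
 */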
+ +struct impl_thrd_param { + thrd_start_t func; + void* arg; +}; + +static unsigned __stdcall impl_thrd_routine(void* p) { + struct impl_thrd_param pack; + int code; + memcpy(&pack, p, sizeof(struct impl_thrd_param)); + free(p); + code = pack.func(pack.arg); + impl_tss_dtor_invoke(); + return (unsigned)code; +} + +static DWORD impl_xtime2msec(const xtime* xt) { + return (DWORD)((xt->sec * 1000u) + (xt->nsec / 1000000)); +} + +#ifdef EMULATED_THREADS_USE_NATIVE_CALL_ONCE +struct impl_call_once_param { + void (*func)(void); +}; +static BOOL CALLBACK impl_call_once_callback(PINIT_ONCE InitOnce, PVOID Parameter, PVOID* Context) { + struct impl_call_once_param* param = (struct impl_call_once_param*)Parameter; + (param->func)(); + ((void)InitOnce); + ((void)Context); // suppress warning + return TRUE; +} +#endif // ifdef EMULATED_THREADS_USE_NATIVE_CALL_ONCE + +#ifndef EMULATED_THREADS_USE_NATIVE_CV +/* +Note: + The implementation of condition variable is ported from Boost.Interprocess + See http://www.boost.org/boost/interprocess/sync/windows/condition.hpp +*/ +static void impl_cond_do_signal(cnd_t* cond, int broadcast) { + int nsignal = 0; + + EnterCriticalSection(&cond->monitor); + if (cond->to_unblock != 0) { + if (cond->blocked == 0) { + LeaveCriticalSection(&cond->monitor); + return; + } + if (broadcast) { + cond->to_unblock += nsignal = cond->blocked; + cond->blocked = 0; + } else { + nsignal = 1; + cond->to_unblock++; + cond->blocked--; + } + } else if (cond->blocked > cond->gone) { + WaitForSingleObject(cond->sem_gate, INFINITE); + if (cond->gone != 0) { + cond->blocked -= cond->gone; + cond->gone = 0; + } + if (broadcast) { + nsignal = cond->to_unblock = cond->blocked; + cond->blocked = 0; + } else { + nsignal = cond->to_unblock = 1; + cond->blocked--; + } + } + LeaveCriticalSection(&cond->monitor); + + if (0 < nsignal) + ReleaseSemaphore(cond->sem_queue, nsignal, NULL); +} + +static int impl_cond_do_wait(cnd_t* cond, mtx_t* mtx, const xtime* xt) { + int nleft = 0; + int ngone = 0; + int timeout = 0; + DWORD w; + + WaitForSingleObject(cond->sem_gate, INFINITE); + cond->blocked++; + ReleaseSemaphore(cond->sem_gate, 1, NULL); + + mtx_unlock(mtx); + + w = WaitForSingleObject(cond->sem_queue, xt ? impl_xtime2msec(xt) : INFINITE); + timeout = (w == WAIT_TIMEOUT); + + EnterCriticalSection(&cond->monitor); + if ((nleft = cond->to_unblock) != 0) { + if (timeout) { + if (cond->blocked != 0) { + cond->blocked--; + } else { + cond->gone++; + } + } + if (--cond->to_unblock == 0) { + if (cond->blocked != 0) { + ReleaseSemaphore(cond->sem_gate, 1, NULL); + nleft = 0; + } else if ((ngone = cond->gone) != 0) { + cond->gone = 0; + } + } + } else if (++cond->gone == INT_MAX / 2) { + WaitForSingleObject(cond->sem_gate, INFINITE); + cond->blocked -= cond->gone; + ReleaseSemaphore(cond->sem_gate, 1, NULL); + cond->gone = 0; + } + LeaveCriticalSection(&cond->monitor); + + if (nleft == 1) { + while (ngone--) + WaitForSingleObject(cond->sem_queue, INFINITE); + ReleaseSemaphore(cond->sem_gate, 1, NULL); + } + + mtx_lock(mtx); + return timeout ? 
thrd_busy : thrd_success; +} +#endif // ifndef EMULATED_THREADS_USE_NATIVE_CV + +static struct impl_tss_dtor_entry { + tss_t key; + tss_dtor_t dtor; +} impl_tss_dtor_tbl[EMULATED_THREADS_TSS_DTOR_SLOTNUM]; + +static int impl_tss_dtor_register(tss_t key, tss_dtor_t dtor) { + int i; + for (i = 0; i < EMULATED_THREADS_TSS_DTOR_SLOTNUM; i++) { + if (!impl_tss_dtor_tbl[i].dtor) + break; + } + if (i == EMULATED_THREADS_TSS_DTOR_SLOTNUM) + return 1; + impl_tss_dtor_tbl[i].key = key; + impl_tss_dtor_tbl[i].dtor = dtor; + return 0; +} + +static void impl_tss_dtor_invoke() { + int i; + for (i = 0; i < EMULATED_THREADS_TSS_DTOR_SLOTNUM; i++) { + if (impl_tss_dtor_tbl[i].dtor) { + void* val = tss_get(impl_tss_dtor_tbl[i].key); + if (val) + (impl_tss_dtor_tbl[i].dtor)(val); + } + } +} + +/*--------------- 7.25.2 Initialization functions ---------------*/ +// 7.25.2.1 +void call_once(once_flag* flag, void (*func)(void)) { + assert(flag && func); +#ifdef EMULATED_THREADS_USE_NATIVE_CALL_ONCE + { + struct impl_call_once_param param; + param.func = func; + InitOnceExecuteOnce(flag, impl_call_once_callback, (PVOID)¶m, NULL); + } +#else + if (InterlockedCompareExchange(&flag->status, 1, 0) == 0) { + (func)(); + InterlockedExchange(&flag->status, 2); + } else { + while (flag->status == 1) { + // busy loop! + thrd_yield(); + } + } +#endif +} + +/*------------- 7.25.3 Condition variable functions -------------*/ +// 7.25.3.1 +int cnd_broadcast(cnd_t* cond) { + if (!cond) return thrd_error; +#ifdef EMULATED_THREADS_USE_NATIVE_CV + WakeAllConditionVariable(&cond->condvar); +#else + impl_cond_do_signal(cond, 1); +#endif + return thrd_success; +} + +// 7.25.3.2 +void cnd_destroy(cnd_t* cond) { + assert(cond); +#ifdef EMULATED_THREADS_USE_NATIVE_CV + // do nothing +#else + CloseHandle(cond->sem_queue); + CloseHandle(cond->sem_gate); + DeleteCriticalSection(&cond->monitor); +#endif +} + +// 7.25.3.3 +int cnd_init(cnd_t* cond) { + if (!cond) return thrd_error; +#ifdef EMULATED_THREADS_USE_NATIVE_CV + InitializeConditionVariable(&cond->condvar); +#else + cond->blocked = 0; + cond->gone = 0; + cond->to_unblock = 0; + cond->sem_queue = CreateSemaphore(NULL, 0, LONG_MAX, NULL); + cond->sem_gate = CreateSemaphore(NULL, 1, 1, NULL); + InitializeCriticalSection(&cond->monitor); +#endif + return thrd_success; +} + +// 7.25.3.4 +int cnd_signal(cnd_t* cond) { + if (!cond) return thrd_error; +#ifdef EMULATED_THREADS_USE_NATIVE_CV + WakeConditionVariable(&cond->condvar); +#else + impl_cond_do_signal(cond, 0); +#endif + return thrd_success; +} + +// 7.25.3.5 +int cnd_timedwait(cnd_t* cond, mtx_t* mtx, const xtime* xt) { + if (!cond || !mtx || !xt) return thrd_error; +#ifdef EMULATED_THREADS_USE_NATIVE_CV + if (SleepConditionVariableCS(&cond->condvar, &mtx->cs, impl_xtime2msec(xt))) + return thrd_success; + return (GetLastError() == ERROR_TIMEOUT) ? 
thrd_busy : thrd_error; +#else + return impl_cond_do_wait(cond, mtx, xt); +#endif +} + +// 7.25.3.6 +int cnd_wait(cnd_t* cond, mtx_t* mtx) { + if (!cond || !mtx) return thrd_error; +#ifdef EMULATED_THREADS_USE_NATIVE_CV + SleepConditionVariableCS(&cond->condvar, &mtx->cs, INFINITE); +#else + impl_cond_do_wait(cond, mtx, NULL); +#endif + return thrd_success; +} + +/*-------------------- 7.25.4 Mutex functions --------------------*/ +// 7.25.4.1 +void mtx_destroy(mtx_t* mtx) { + assert(mtx); + DeleteCriticalSection(&mtx->cs); +} + +// 7.25.4.2 +int mtx_init(mtx_t* mtx, int type) { + if (!mtx) return thrd_error; + if (type != mtx_plain && type != mtx_timed && type != mtx_try && type != (mtx_plain | mtx_recursive) && type != (mtx_timed | mtx_recursive) && type != (mtx_try | mtx_recursive)) + return thrd_error; + InitializeCriticalSection(&mtx->cs); + return thrd_success; +} + +// 7.25.4.3 +int mtx_lock(mtx_t* mtx) { + if (!mtx) return thrd_error; + EnterCriticalSection(&mtx->cs); + return thrd_success; +} + +// 7.25.4.4 +int mtx_timedlock(mtx_t* mtx, const xtime* xt) { + time_t expire, now; + if (!mtx || !xt) return thrd_error; + expire = time(NULL); + expire += xt->sec; + while (mtx_trylock(mtx) != thrd_success) { + now = time(NULL); + if (expire < now) + return thrd_busy; + // busy loop! + thrd_yield(); + } + return thrd_success; +} + +// 7.25.4.5 +int mtx_trylock(mtx_t* mtx) { + if (!mtx) return thrd_error; + return TryEnterCriticalSection(&mtx->cs) ? thrd_success : thrd_busy; +} + +// 7.25.4.6 +int mtx_unlock(mtx_t* mtx) { + if (!mtx) return thrd_error; + LeaveCriticalSection(&mtx->cs); + return thrd_success; +} + +/*------------------- 7.25.5 Thread functions -------------------*/ +// 7.25.5.1 +int thrd_create(thrd_t* thr, thrd_start_t func, void* arg) { + struct impl_thrd_param* pack; + uintptr_t handle; + if (!thr) return thrd_error; + pack = malloc(sizeof(struct impl_thrd_param)); + if (!pack) return thrd_nomem; + pack->func = func; + pack->arg = arg; + handle = _beginthreadex(NULL, 0, impl_thrd_routine, pack, 0, NULL); + if (handle == 0) { + if (errno == EAGAIN || errno == EACCES) + return thrd_nomem; + return thrd_error; + } + *thr = (thrd_t)handle; + return thrd_success; +} + +// 7.25.5.2 +thrd_t thrd_current(void) { + return GetCurrentThread(); +} + +// 7.25.5.3 +int thrd_detach(thrd_t thr) { + CloseHandle(thr); + return thrd_success; +} + +// 7.25.5.4 +int thrd_equal(thrd_t thr0, thrd_t thr1) { + return (thr0 == thr1); +} + +// 7.25.5.5 +void thrd_exit(int res) { + impl_tss_dtor_invoke(); + _endthreadex((unsigned)res); +} + +// 7.25.5.6 +int thrd_join(thrd_t thr, int* res) { + DWORD w, code; + w = WaitForSingleObject(thr, INFINITE); + if (w != WAIT_OBJECT_0) + return thrd_error; + if (res) { + if (!GetExitCodeThread(thr, &code)) { + CloseHandle(thr); + return thrd_error; + } + *res = (int)code; + } + CloseHandle(thr); + return thrd_success; +} + +// 7.25.5.7 +void thrd_sleep(const xtime* xt) { + assert(xt); + Sleep(impl_xtime2msec(xt)); +} + +// 7.25.5.8 +void thrd_yield(void) { + SwitchToThread(); +} + +/*----------- 7.25.6 Thread-specific storage functions -----------*/ +// 7.25.6.1 +int tss_create(tss_t* key, tss_dtor_t dtor) { + if (!key) return thrd_error; + *key = TlsAlloc(); + if (dtor) { + if (impl_tss_dtor_register(*key, dtor)) { + TlsFree(*key); + return thrd_error; + } + } + return (*key != 0xFFFFFFFF) ? 
thrd_success : thrd_error; +} + +// 7.25.6.2 +void tss_delete(tss_t key) { + TlsFree(key); +} + +// 7.25.6.3 +void* tss_get(tss_t key) { + return TlsGetValue(key); +} + +// 7.25.6.4 +int tss_set(tss_t key, void* val) { + return TlsSetValue(key, val) ? thrd_success : thrd_error; +} + +/*-------------------- 7.25.7 Time functions --------------------*/ +// 7.25.6.1 +int xtime_get(xtime* xt, int base) { + if (!xt) return 0; + if (base == TIME_UTC) { + xt->sec = time(NULL); + xt->nsec = 0; + return base; + } + return 0; +} diff --git a/c11threads/threads_posix.c b/c11threads/threads_posix.c new file mode 100644 index 00000000..c206c5cd --- /dev/null +++ b/c11threads/threads_posix.c @@ -0,0 +1,271 @@ +/* + * C11 emulation library + * + * (C) Copyright yohhoy 2012. + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://www.boost.org/LICENSE_1_0.txt) + */ +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* +Configuration macro: + + EMULATED_THREADS_USE_NATIVE_TIMEDLOCK + Use pthread_mutex_timedlock() for `mtx_timedlock()' + Otherwise use mtx_trylock() + *busy loop* emulation. +*/ +#if !defined(__CYGWIN__) && !defined(__APPLE__) +#define EMULATED_THREADS_USE_NATIVE_TIMEDLOCK +#endif + + +#include "threads.h" + +/* +Implementation limits: + - Conditionally emulation for "mutex with timeout" + (see EMULATED_THREADS_USE_NATIVE_TIMEDLOCK macro) +*/ +struct impl_thrd_param { + thrd_start_t func; + void* arg; +}; + +void* impl_thrd_routine(void* p) { + struct impl_thrd_param pack = *((struct impl_thrd_param*)p); + free(p); + return (void*)((size_t)pack.func(pack.arg)); +} + +/*--------------- 7.25.2 Initialization functions ---------------*/ +// 7.25.2.1 +void call_once(once_flag* flag, void (*func)(void)) { + pthread_once(flag, func); +} + +/*------------- 7.25.3 Condition variable functions -------------*/ +// 7.25.3.1 +int cnd_broadcast(cnd_t* cond) { + if (!cond) return thrd_error; + pthread_cond_broadcast(cond); + return thrd_success; +} + +// 7.25.3.2 +void cnd_destroy(cnd_t* cond) { + assert(cond); + pthread_cond_destroy(cond); +} + +// 7.25.3.3 +int cnd_init(cnd_t* cond) { + if (!cond) return thrd_error; + pthread_cond_init(cond, NULL); + return thrd_success; +} + +// 7.25.3.4 +int cnd_signal(cnd_t* cond) { + if (!cond) return thrd_error; + pthread_cond_signal(cond); + return thrd_success; +} + +// 7.25.3.5 +int cnd_timedwait(cnd_t* cond, mtx_t* mtx, const xtime* xt) { + struct timespec abs_time; + int rt; + if (!cond || !mtx || !xt) return thrd_error; + rt = pthread_cond_timedwait(cond, mtx, &abs_time); + if (rt == ETIMEDOUT) + return thrd_busy; + return (rt == 0) ? 
thrd_success : thrd_error; +} + +// 7.25.3.6 +int cnd_wait(cnd_t* cond, mtx_t* mtx) { + if (!cond || !mtx) return thrd_error; + pthread_cond_wait(cond, mtx); + return thrd_success; +} + +/*-------------------- 7.25.4 Mutex functions --------------------*/ +// 7.25.4.1 +void mtx_destroy(mtx_t* mtx) { + assert(mtx); + pthread_mutex_destroy(mtx); +} + +// 7.25.4.2 +int mtx_init(mtx_t* mtx, int type) { + pthread_mutexattr_t attr; + if (!mtx) return thrd_error; + if (type != mtx_plain && type != mtx_timed && type != mtx_try && type != (mtx_plain | mtx_recursive) && type != (mtx_timed | mtx_recursive) && type != (mtx_try | mtx_recursive)) + return thrd_error; + pthread_mutexattr_init(&attr); + if ((type & mtx_recursive) != 0) { +#if defined(__linux__) || defined(__linux) + pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE_NP); +#else + pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE); +#endif + } + pthread_mutex_init(mtx, &attr); + pthread_mutexattr_destroy(&attr); + return thrd_success; +} + +// 7.25.4.3 +int mtx_lock(mtx_t* mtx) { + if (!mtx) return thrd_error; + pthread_mutex_lock(mtx); + return thrd_success; +} + +// 7.25.4.4 +int mtx_timedlock(mtx_t* mtx, const xtime* xt) { + if (!mtx || !xt) return thrd_error; + { +#ifdef EMULATED_THREADS_USE_NATIVE_TIMEDLOCK + struct timespec ts; + int rt; + ts.tv_sec = xt->sec; + ts.tv_nsec = xt->nsec; + rt = pthread_mutex_timedlock(mtx, &ts); + if (rt == 0) + return thrd_success; + return (rt == ETIMEDOUT) ? thrd_busy : thrd_error; +#else + time_t expire = time(NULL); + expire += xt->sec; + while (mtx_trylock(mtx) != thrd_success) { + time_t now = time(NULL); + if (expire < now) + return thrd_busy; + // busy loop! + thrd_yield(); + } + return thrd_success; +#endif + } +} + +// 7.25.4.5 +int mtx_trylock(mtx_t* mtx) { + if (!mtx) return thrd_error; + return (pthread_mutex_trylock(mtx) == 0) ? thrd_success : thrd_busy; +} + +// 7.25.4.6 +int mtx_unlock(mtx_t* mtx) { + if (!mtx) return thrd_error; + pthread_mutex_unlock(mtx); + return thrd_success; +} + +/*------------------- 7.25.5 Thread functions -------------------*/ +// 7.25.5.1 +int thrd_create(thrd_t* thr, thrd_start_t func, void* arg) { + struct impl_thrd_param* pack; + if (!thr) return thrd_error; + pack = malloc(sizeof(struct impl_thrd_param)); + if (!pack) return thrd_nomem; + pack->func = func; + pack->arg = arg; + + pthread_attr_t attr; + pthread_attr_init(&attr); + + if (pthread_create(thr, &attr, impl_thrd_routine, pack) != 0) { + free(pack); + return thrd_error; + } + return thrd_success; +} + +// 7.25.5.2 +thrd_t thrd_current(void) { + return pthread_self(); +} + +// 7.25.5.3 +int thrd_detach(thrd_t thr) { + return (pthread_detach(thr) == 0) ? thrd_success : thrd_error; +} + +// 7.25.5.4 +int thrd_equal(thrd_t thr0, thrd_t thr1) { + return pthread_equal(thr0, thr1); +} + +// 7.25.5.5 +void thrd_exit(int res) { + pthread_exit((void*)((size_t)res)); +} + +// 7.25.5.6 +int thrd_join(thrd_t thr, int* res) { + void* code; + if (pthread_join(thr, &code) != 0) + return thrd_error; + if (res) + *res = (int)((size_t)code); + return thrd_success; +} + +// 7.25.5.7 +void thrd_sleep(const xtime* xt) { + struct timespec req; + assert(xt); + req.tv_sec = xt->sec; + req.tv_nsec = xt->nsec; + nanosleep(&req, NULL); +} + +// 7.25.5.8 +void thrd_yield(void) { + sched_yield(); +} + +/*----------- 7.25.6 Thread-specific storage functions -----------*/ +// 7.25.6.1 +int tss_create(tss_t* key, tss_dtor_t dtor) { + if (!key) return thrd_error; + return (pthread_key_create(key, dtor) == 0) ? 
thrd_success : thrd_error; +} + +// 7.25.6.2 +void tss_delete(tss_t key) { + pthread_key_delete(key); +} + +// 7.25.6.3 +void* tss_get(tss_t key) { + return pthread_getspecific(key); +} + +// 7.25.6.4 +int tss_set(tss_t key, void* val) { + return (pthread_setspecific(key, val) == 0) ? thrd_success : thrd_error; +} + +/*-------------------- 7.25.7 Time functions --------------------*/ +// 7.25.6.1 +int xtime_get(xtime* xt, int base) { + if (!xt) return 0; + if (base == TIME_UTC) { + xt->sec = time(NULL); + xt->nsec = 0; + return base; + } + return 0; +} diff --git a/common/common.c b/common/common.c index acee5cee..90060589 100644 --- a/common/common.c +++ b/common/common.c @@ -53,17 +53,13 @@ void* cuik__valloc(size_t size) { // round size to page size size = (size + cuik__page_mask) & ~cuik__page_mask; - void *ret = VirtualAlloc(NULL, size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); + return VirtualAlloc(NULL, size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); #else cuik__page_size = 4096; cuik__page_mask = 4095; - void *ret = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + return mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); #endif - - // GC_add_roots(ret, (char *) ret + size); - - return ret; } void cuik__vfree(void* ptr, size_t size) { diff --git a/common/common.h b/common/common.h index b1b3ccd7..069bba12 100644 --- a/common/common.h +++ b/common/common.h @@ -6,20 +6,27 @@ #include #include +// Cuik currently uses mimalloc so we wrap those calls here +#ifdef CUIK_USE_MIMALLOC +#include + +#define cuik_malloc(size) mi_malloc(size) +#define cuik_calloc(count, size) mi_calloc(count, size) +#define cuik_free(ptr) mi_free(ptr) +#define cuik_realloc(ptr, size) mi_realloc(ptr, size) +#define cuik_strdup(x) mi_strdup(x) +#else +#define cuik_malloc(size) malloc(size) +#define cuik_calloc(count, size) calloc(count, size) +#define cuik_free(size) free(size) +#define cuik_realloc(ptr, size) realloc(ptr, size) + #ifdef _WIN32 -#define WIN32_LEAN_AND_MEAN -#include -#include +#define cuik_strdup(x) _strdup(x) +#else +#define cuik_strdup(x) strdup(x) +#endif #endif - -#include "../bdwgc/private/gc/gc.h" - -// Cuik currently uses mimalloc so we wrap those calls here -#define cuik_malloc(size) GC_malloc(size) -#define cuik_calloc(count, size) GC_malloc((count) * (size)) -#define cuik_free(ptr) GC_free(ptr) -#define cuik_realloc(ptr, size) GC_realloc(ptr, size) -#define cuik_strdup(x) GC_strdup(x) #if defined(__amd64) || defined(__amd64__) || defined(_M_AMD64) || defined(__x86_64__) || defined(__x86_64) #define CUIK__IS_X64 1 @@ -50,13 +57,6 @@ #define LIKELY(x) __builtin_expect(!!(x), 1) #define UNLIKELY(x) __builtin_expect(!!(x), 0) -#ifndef _MSC_VER -#include -#if defined(__debugbreak) -#define __debugbreak() raise(5 /* SIGTRAP */) -#endif -#endif - #ifdef NDEBUG #define TODO() __builtin_unreachable() #else diff --git a/common/hash_map.h b/common/hash_map.h index 9f042c6d..7c92ab30 100644 --- a/common/hash_map.h +++ b/common/hash_map.h @@ -9,13 +9,19 @@ #include #include -#include "../bdwgc/private/gc/gc.h" - -#define cuik_malloc(size) GC_malloc(size) -#define cuik_calloc(count, size) GC_malloc((count) * (size)) -#define cuik_free(ptr) GC_free(ptr) -#define cuik_realloc(ptr, size) GC_realloc(ptr, size) -#define cuik_strdup(x) GC_strdup(x) +#if defined(TB_USE_MIMALLOC) || defined(CUIK_USE_MIMALLOC) +#include + +#define NL_MALLOC(s) mi_malloc(s) +#define NL_CALLOC(c, s) mi_calloc(c, s) +#define NL_REALLOC(p, s) mi_realloc(p, s) +#define 
NL_FREE(p) mi_free(p) +#else +#define NL_MALLOC(s) malloc(s) +#define NL_CALLOC(c, s) calloc(c, s) +#define NL_REALLOC(p, s) realloc(p, s) +#define NL_FREE(p) free(p) +#endif #define NL_Map(K, V) struct { K k; V v; }* #define NL_Strmap(T) struct { NL_Slice k; T v; }* @@ -134,7 +140,7 @@ inline static uint32_t nl_map__raw_hash(size_t len, const void *key) { } void nl_map__free(NL_MapHeader* restrict table) { - cuik_free(table); + NL_FREE(table); } NL_MapHeader* nl_map__alloc(size_t cap, size_t entry_size) { @@ -150,7 +156,7 @@ NL_MapHeader* nl_map__alloc(size_t cap, size_t entry_size) { cap = (cap == 1 ? 1 : 1 << exp); - NL_MapHeader* table = cuik_calloc(1, sizeof(NL_MapHeader) + (cap * entry_size)); + NL_MapHeader* table = NL_CALLOC(1, sizeof(NL_MapHeader) + (cap * entry_size)); table->exp = exp; table->count = 0; return table; diff --git a/common/hash_set.h b/common/hash_set.h index 5124bf98..fc91aee7 100644 --- a/common/hash_set.h +++ b/common/hash_set.h @@ -26,12 +26,14 @@ bool nl_hashset_remove(NL_HashSet* restrict hs, void* ptr); bool nl_hashset_put(NL_HashSet* restrict hs, void* ptr); size_t nl_hashset_lookup(NL_HashSet* restrict hs, void* ptr); +void* nl_hashset_get2(NL_HashSet* restrict hs, void* ptr, NL_HashFunc hash, NL_CompareFunc cmp); + // this one takes a custom hash function void* nl_hashset_put2(NL_HashSet* restrict hs, void* ptr, NL_HashFunc hash, NL_CompareFunc cmp); void nl_hashset_remove2(NL_HashSet* restrict hs, void* ptr, NL_HashFunc hash, NL_CompareFunc cmp); #define nl_hashset_capacity(hs) (1ull << (hs)->exp) -#define nl_hashset_for(it, hs) for (void **it = (hs)->data, **_end_ = &it[nl_hashset_capacity(hs)]; it != _end_; it++) if (*it != NULL) +#define nl_hashset_for(it, hs) for (void **it = (hs)->data, **_end_ = &it[nl_hashset_capacity(hs)]; it != _end_; it++) if (*it != NULL && *it != NL_HASHSET_TOMB) #endif /* NL_HASH_SET_H */ @@ -159,6 +161,27 @@ void nl_hashset_remove2(NL_HashSet* restrict hs, void* ptr, NL_HashFunc hash, NL } while (i != first); } +void* nl_hashset_get2(NL_HashSet* restrict hs, void* ptr, NL_HashFunc hash, NL_CompareFunc cmp) { + uint32_t h = hash(ptr); + + size_t mask = (1 << hs->exp) - 1; + size_t first = h & mask, i = first; + + do { + if (hs->data[i] == NULL) { + return NULL; + } else if (hs->data[i] == NL_HASHSET_TOMB) { + // go past it + } else if (hs->data[i] == ptr || cmp(hs->data[i], ptr)) { + return hs->data[i]; + } + + i = (i + 1) & mask; + } while (i != first); + + return NULL; +} + // returns old value void* nl_hashset_put2(NL_HashSet* restrict hs, void* ptr, NL_HashFunc hash, NL_CompareFunc cmp) { uint32_t h = hash(ptr); diff --git a/common/hashes.h b/common/hashes.h index 2f757eff..8508e73b 100644 --- a/common/hashes.h +++ b/common/hashes.h @@ -4,12 +4,13 @@ // murmur3 32-bit without UB unaligned accesses // https://github.com/demetri/scribbles/blob/master/hashing/ub_aware_hash_functions.c static uint32_t tb__murmur3_32(const void* key, size_t len) { + const uint32_t* key32 = key; uint32_t h = 0; // main body, work on 32-bit blocks at a time for (size_t i=0;i> 17))*0x1b873593; diff --git a/common/log.c b/common/log.c index 14d9119f..5c72cc37 100644 --- a/common/log.c +++ b/common/log.c @@ -25,7 +25,13 @@ #include "log.h" #include -#if defined(_POSIX_C_SOURCE) +#ifdef _WIN32 +#ifdef _POSIX_C_SOURCE +__declspec(dllimport) unsigned int GetCurrentThreadId(void); +#else +__declspec(dllimport) unsigned long GetCurrentThreadId(void); +#endif +#else #include #endif diff --git a/makefile b/makefile index 4fc36488..6a26e170 100644 --- 
a/makefile +++ b/makefile @@ -15,7 +15,7 @@ GC_OBJS = $(GC_SRCS:%.c=$(OBJ_DIR)/%.o) STD_SRCS := $(shell find vm/std/libs -name '*.c') OPT_SRCS := $(shell find vm/opt -name '*.c') -ALL_SRCS = vm/ir.c vm/std/std.c vm/lib.c vm/type.c vm/lang/paka.c vm/obj.c vm/jit/tb.c $(STD_SRCS) $(OPT_SRCS) $(EXTRA_SRCS) +ALL_SRCS = vm/ir.c vm/std/std.c vm/lib.c vm/type.c vm/lang/paka.c vm/obj.c vm/jit/tb.c $(STD_SRCS) $(OPT_SRCS) $(EXTRA_SRCS) ALL_OBJS = $(ALL_SRCS:%.c=$(OBJ_DIR)/%.o) # TB_SRCS := common/common.c common/perf.c tb/src/libtb.c tb/src/x64/x64.c c11threads/threads_msvc.c diff --git a/tb/.editorconfig b/tb/.editorconfig new file mode 100644 index 00000000..70fdd549 --- /dev/null +++ b/tb/.editorconfig @@ -0,0 +1,11 @@ +# EditorConfig is awesome: https://EditorConfig.org + +# top-most EditorConfig file +root = true + +# Unix-style newlines with a newline ending every file +[*] +end_of_line = crlf +insert_final_newline = true +indent_style = tab +indent_size = 4 diff --git a/tb/.gitignore b/tb/.gitignore new file mode 100644 index 00000000..cca3fcc3 --- /dev/null +++ b/tb/.gitignore @@ -0,0 +1,26 @@ +.vs/ +.tup/ +bin/ +negate/ + +tildebackend.lib +tildebackend.a + +*.cache +*.swp +*.out +*.o +*.obj +*.exe +*.pdb + +tup.config +debug.bat +build.bat +run.bat +run4coder.bat +project.4coder +tb.rdbg +build.ninja +.ninja_deps +.ninja_log diff --git a/tb/LICENSE.txt b/tb/LICENSE.txt new file mode 100644 index 00000000..b731914f --- /dev/null +++ b/tb/LICENSE.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Yasser Arguelles Snape + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/tb/NOTES.txt b/tb/NOTES.txt new file mode 100644 index 00000000..cc0208e8 --- /dev/null +++ b/tb/NOTES.txt @@ -0,0 +1,17 @@ +# Optimizer crap + + some of the optimizations i should probably worry about are proving when numbers can't + overflow, like induction vars: + +``` +for { + i = phi(0, j) + // even if n is TOP, i must be at least TOP and + // after all additions on the PHI... which means + // no overflow + if i >= n break + ... + // next + j = i + 1 +} +``` diff --git a/tb/README.txt b/tb/README.txt new file mode 100644 index 00000000..7307205f --- /dev/null +++ b/tb/README.txt @@ -0,0 +1,22 @@ +TildeBackend (Tilde or TB for short) + + TB is compiler backend in the form of a reasonable C library. This is built as an alternative to other larger compiler toolchains while providing the optimizations, machine code generation and object file export functionality necessary for the development of compilers. 
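  To give a feel for the API, here is a minimal sketch of the AOT flow, mirroring the
  snippets in man/GUIDE.md (error handling and the actual function building are elided):

```c
#include "tb.h"

int main(void) {
    TB_FeatureSet features = { 0 };
    TB_Module* mod = tb_module_create_for_host(TB_ARCH_X86_64, TB_SYSTEM_WINDOWS,
                                               TB_DEBUGFMT_NONE, &features);

    // ... build functions with the builder API, run the optimizer ...

    TB_ModuleExporter* e = tb_make_exporter(mod, TB_FLAVOR_OBJECT);
    if (!tb_exporter_to_file(e, mod, "hello.obj")) {
        return 1; // failed to export
    }
    return 0;
}
```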
+ + # Roadmap + + Code generation: + We're starting with x64 but will be moving focus to Aarch64 soon. + + Optimizer: + It's almost complete with all the -O1 level passes (mostly missing inlining). + After that we can move towards -O2 level stuff (the goal is to compete with + LLVM so we need to be a bit ambitious). + + Debug info: + Codeview support and DWARF has not yet been started, there's plans on making a + new debug info format eventually. + + Output targets: + We currently have basic ELF64, COFF64, some current work is being done for + PE and Macho-O. We got exporting object files but wanna go further because + linkers ain't supposed to be separate programs. diff --git a/tb/include/tb.h b/tb/include/tb.h index b193f3c0..da021f81 100644 --- a/tb/include/tb.h +++ b/tb/include/tb.h @@ -4,9 +4,13 @@ // SSA - single static assignment // GVN - global value numbering // CSE - common subexpression elimination +// CFG - control flow graph // DSE - dead store elimination // GCM - global code motion // SROA - scalar replacement of aggregates +// SCCP - sparse conditional constant propagation +// RPO - reverse postorder +// BB - basic block #ifndef TB_CORE_H #define TB_CORE_H @@ -21,7 +25,7 @@ // https://semver.org/ #define TB_VERSION_MAJOR 0 -#define TB_VERSION_MINOR 2 +#define TB_VERSION_MINOR 3 #define TB_VERSION_PATCH 0 #ifndef TB_API @@ -154,14 +158,14 @@ typedef enum TB_DataTypeEnum { TB_FLOAT, // Pointers TB_PTR, - // Tuples, these cannot be used in memory ops, just accessed via projections - TB_TUPLE, // represents control flow for REGION, BRANCH TB_CONTROL, // represents memory (and I/O) TB_MEMORY, // continuation (usually just return addresses :p) TB_CONT, + // Tuples, these cannot be used in memory ops, just accessed via projections + TB_TUPLE, } TB_DataTypeEnum; typedef enum TB_FloatFormat { @@ -171,15 +175,13 @@ typedef enum TB_FloatFormat { typedef union TB_DataType { struct { - uint8_t type; - // Only integers and floats can be wide. - uint8_t width; + uint16_t type : 4; // for integers it's the bitwidth - uint16_t data; + uint16_t data : 12; }; - uint32_t raw; + uint16_t raw; } TB_DataType; -static_assert(sizeof(TB_DataType) == 4, "im expecting this to be a uint32_t"); +static_assert(sizeof(TB_DataType) == 2, "im expecting this to be a uint16_t"); // classify data types #define TB_IS_VOID_TYPE(x) ((x).type == TB_INT && (x).data == 0) @@ -259,7 +261,11 @@ typedef enum TB_NodeTypeEnum { // trap will not be continuable but will stop execution. TB_TRAP, // (Control) -> (Control) // unreachable means it won't trap or be continuable. - TB_UNREACHABLE, // (Control) -> (Control) + TB_UNREACHABLE, // (Control) -> () + // this is generated when a path becomes disconnected + // from the main IR, it'll be reduced by the monotonic + // rewrites. + TB_DEAD, // () -> (Control) //////////////////////////////// // CONTROL + MEMORY @@ -278,31 +284,31 @@ typedef enum TB_NodeTypeEnum { //////////////////////////////// // MERGEMEM will join multiple non-aliasing memory effects, because // they don't alias there's no ordering guarentee. - TB_MERGEMEM,// (Memory...) -> Memory + TB_MERGEMEM, // (Memory...) -> Memory // LOAD and STORE are standard memory accesses, they can be folded away. - TB_LOAD, // (Memory, Ptr) -> Data - TB_STORE, // (Memory, Ptr, Data) -> Memory + TB_LOAD, // (Control?, Memory, Ptr) -> Data + TB_STORE, // (Control, Memory, Ptr, Data) -> Memory // bulk memory ops. 
- TB_MEMCPY, // (Memory, Ptr, Ptr, Size) -> Memory - TB_MEMSET, // (Memory, Ptr, Int8, Size) -> Memory + TB_MEMCPY, // (Control, Memory, Ptr, Ptr, Size) -> Memory + TB_MEMSET, // (Control, Memory, Ptr, Int8, Size) -> Memory // these memory accesses represent "volatile" which means // they may produce side effects and thus cannot be eliminated. - TB_READ, // (Memory, Ptr) -> (Memory, Data) - TB_WRITE, // (Memory, Ptr, Data) -> (Memory, Data) + TB_READ, // (Control, Memory, Ptr) -> (Memory, Data) + TB_WRITE, // (Control, Memory, Ptr, Data) -> (Memory, Data) // atomics have multiple observers (if not they wouldn't need to // be atomic) and thus produce side effects everywhere just like // volatiles except they have synchronization guarentees. the atomic // data ops will return the value before the operation is performed. // Atomic CAS return the old value and a boolean for success (true if // the value was changed) - TB_ATOMIC_LOAD, // (Memory, Ptr) -> (Memory, Data) - TB_ATOMIC_XCHG, // (Memory, Ptr, Data) -> (Memory, Data) - TB_ATOMIC_ADD, // (Memory, Ptr, Data) -> (Memory, Data) - TB_ATOMIC_SUB, // (Memory, Ptr, Data) -> (Memory, Data) - TB_ATOMIC_AND, // (Memory, Ptr, Data) -> (Memory, Data) - TB_ATOMIC_XOR, // (Memory, Ptr, Data) -> (Memory, Data) - TB_ATOMIC_OR, // (Memory, Ptr, Data) -> (Memory, Data) - TB_ATOMIC_CAS, // (Memory, Data, Data) -> (Memory, Data, Bool) + TB_ATOMIC_LOAD, // (Control, Memory, Ptr) -> (Memory, Data) + TB_ATOMIC_XCHG, // (Control, Memory, Ptr, Data) -> (Memory, Data) + TB_ATOMIC_ADD, // (Control, Memory, Ptr, Data) -> (Memory, Data) + TB_ATOMIC_SUB, // (Control, Memory, Ptr, Data) -> (Memory, Data) + TB_ATOMIC_AND, // (Control, Memory, Ptr, Data) -> (Memory, Data) + TB_ATOMIC_XOR, // (Control, Memory, Ptr, Data) -> (Memory, Data) + TB_ATOMIC_OR, // (Control, Memory, Ptr, Data) -> (Memory, Data) + TB_ATOMIC_CAS, // (Control, Memory, Data, Data) -> (Memory, Data, Bool) //////////////////////////////// // POINTERS @@ -496,11 +502,11 @@ struct User { struct TB_Node { TB_NodeType type; - uint16_t input_count; // number of node inputs. + uint16_t input_count; TB_DataType dt; // makes it easier to track in graph walks - size_t gvn; + uint32_t gvn; // only value while inside of a TB_Passes, // these are unordered and usually just @@ -522,8 +528,6 @@ struct TB_Node { // this represents switch (many targets), if (one target) and goto (only default) logic. typedef struct { // TB_BRANCH size_t succ_count; - TB_Node** succ; - int64_t keys[]; } TB_NodeBranch; @@ -603,15 +607,8 @@ typedef struct { } TB_NodeSafepoint; typedef struct { - TB_Node* end; const char* tag; - // position in a postorder walk - int postorder_id; - // immediate dominator (can be approximate) - int dom_depth; - TB_Node* dom; - // used for IR building only, stale after that. 
TB_Node *mem_in, *mem_out; } TB_NodeRegion; @@ -666,37 +663,37 @@ typedef enum { #define TB_TYPE_TUPLE TB_DataType{ { TB_TUPLE } } #define TB_TYPE_CONTROL TB_DataType{ { TB_CONTROL } } -#define TB_TYPE_VOID TB_DataType{ { TB_INT, 0, 0 } } -#define TB_TYPE_I8 TB_DataType{ { TB_INT, 0, 8 } } -#define TB_TYPE_I16 TB_DataType{ { TB_INT, 0, 16 } } -#define TB_TYPE_I32 TB_DataType{ { TB_INT, 0, 32 } } -#define TB_TYPE_I64 TB_DataType{ { TB_INT, 0, 64 } } -#define TB_TYPE_F32 TB_DataType{ { TB_FLOAT, 0, TB_FLT_32 } } -#define TB_TYPE_F64 TB_DataType{ { TB_FLOAT, 0, TB_FLT_64 } } -#define TB_TYPE_BOOL TB_DataType{ { TB_INT, 0, 1 } } -#define TB_TYPE_PTR TB_DataType{ { TB_PTR, 0, 0 } } -#define TB_TYPE_MEMORY TB_DataType{ { TB_MEMORY,0, 0 } } -#define TB_TYPE_CONT TB_DataType{ { TB_CONT, 0, 0 } } -#define TB_TYPE_INTN(N) TB_DataType{ { TB_INT, 0, (N) } } -#define TB_TYPE_PTRN(N) TB_DataType{ { TB_PTR, 0, (N) } } +#define TB_TYPE_VOID TB_DataType{ { TB_INT, 0 } } +#define TB_TYPE_I8 TB_DataType{ { TB_INT, 8 } } +#define TB_TYPE_I16 TB_DataType{ { TB_INT, 16 } } +#define TB_TYPE_I32 TB_DataType{ { TB_INT, 32 } } +#define TB_TYPE_I64 TB_DataType{ { TB_INT, 64 } } +#define TB_TYPE_F32 TB_DataType{ { TB_FLOAT, TB_FLT_32 } } +#define TB_TYPE_F64 TB_DataType{ { TB_FLOAT, TB_FLT_64 } } +#define TB_TYPE_BOOL TB_DataType{ { TB_INT, 1 } } +#define TB_TYPE_PTR TB_DataType{ { TB_PTR, 0 } } +#define TB_TYPE_MEMORY TB_DataType{ { TB_MEMORY,0 } } +#define TB_TYPE_CONT TB_DataType{ { TB_CONT, 0 } } +#define TB_TYPE_INTN(N) TB_DataType{ { TB_INT, (N) } } +#define TB_TYPE_PTRN(N) TB_DataType{ { TB_PTR, (N) } } #else #define TB_TYPE_TUPLE (TB_DataType){ { TB_TUPLE } } #define TB_TYPE_CONTROL (TB_DataType){ { TB_CONTROL } } -#define TB_TYPE_VOID (TB_DataType){ { TB_INT, 0, 0 } } -#define TB_TYPE_I8 (TB_DataType){ { TB_INT, 0, 8 } } -#define TB_TYPE_I16 (TB_DataType){ { TB_INT, 0, 16 } } -#define TB_TYPE_I32 (TB_DataType){ { TB_INT, 0, 32 } } -#define TB_TYPE_I64 (TB_DataType){ { TB_INT, 0, 64 } } -#define TB_TYPE_F32 (TB_DataType){ { TB_FLOAT, 0, TB_FLT_32 } } -#define TB_TYPE_F64 (TB_DataType){ { TB_FLOAT, 0, TB_FLT_64 } } -#define TB_TYPE_BOOL (TB_DataType){ { TB_INT, 0, 1 } } -#define TB_TYPE_PTR (TB_DataType){ { TB_PTR, 0, 0 } } -#define TB_TYPE_CONT (TB_DataType){ { TB_CONT, 0, 0 } } -#define TB_TYPE_MEMORY (TB_DataType){ { TB_MEMORY,0, 0 } } -#define TB_TYPE_INTN(N) (TB_DataType){ { TB_INT, 0, (N) } } -#define TB_TYPE_PTRN(N) (TB_DataType){ { TB_PTR, 0, (N) } } +#define TB_TYPE_VOID (TB_DataType){ { TB_INT, 0 } } +#define TB_TYPE_I8 (TB_DataType){ { TB_INT, 8 } } +#define TB_TYPE_I16 (TB_DataType){ { TB_INT, 16 } } +#define TB_TYPE_I32 (TB_DataType){ { TB_INT, 32 } } +#define TB_TYPE_I64 (TB_DataType){ { TB_INT, 64 } } +#define TB_TYPE_F32 (TB_DataType){ { TB_FLOAT, TB_FLT_32 } } +#define TB_TYPE_F64 (TB_DataType){ { TB_FLOAT, TB_FLT_64 } } +#define TB_TYPE_BOOL (TB_DataType){ { TB_INT, 1 } } +#define TB_TYPE_PTR (TB_DataType){ { TB_PTR, 0 } } +#define TB_TYPE_CONT (TB_DataType){ { TB_CONT, 0 } } +#define TB_TYPE_MEMORY (TB_DataType){ { TB_MEMORY,0 } } +#define TB_TYPE_INTN(N) (TB_DataType){ { TB_INT, (N) } } +#define TB_TYPE_PTRN(N) (TB_DataType){ { TB_PTR, (N) } } #endif @@ -1020,8 +1017,6 @@ TB_API const char* tb_symbol_get_name(TB_Symbol* s); TB_API void tb_function_set_prototype(TB_Function* f, TB_ModuleSectionHandle section, TB_FunctionPrototype* p, TB_Arena* arena); TB_API TB_FunctionPrototype* tb_function_get_prototype(TB_Function* f); -TB_API void tb_function_print(TB_Function* f, TB_PrintCallback 
callback, void* user_data); - TB_API void tb_inst_set_control(TB_Function* f, TB_Node* control); TB_API TB_Node* tb_inst_get_control(TB_Function* f); @@ -1224,11 +1219,11 @@ TB_API bool tb_pass_mem2reg(TB_Passes* opt); // this just runs the optimizer in the default configuration TB_API void tb_pass_optimize(TB_Passes* opt); -TB_API void tb_pass_schedule(TB_Passes* opt); - // analysis // print: prints IR in a flattened text form. TB_API bool tb_pass_print(TB_Passes* opt); +// print-dot: prints IR as DOT +TB_API void tb_pass_print_dot(TB_Passes* opt, TB_PrintCallback callback, void* user_data); // codegen TB_API TB_FunctionOutput* tb_pass_codegen(TB_Passes* opt, bool emit_asm); @@ -1240,8 +1235,6 @@ TB_API void tb_pass_mark_users(TB_Passes* opt, TB_Node* n); //////////////////////////////// // IR access //////////////////////////////// -TB_API bool tb_is_dominated_by(TB_Node* expected_dom, TB_Node* bb); - TB_API const char* tb_node_get_name(TB_Node* n); TB_API TB_Node* tb_get_parent_region(TB_Node* n); diff --git a/tb/man/GUIDE.md b/tb/man/GUIDE.md new file mode 100644 index 00000000..fe66fa73 --- /dev/null +++ b/tb/man/GUIDE.md @@ -0,0 +1,60 @@ +# Module creation + +Modules are the largest logical unit of code in TB, they contain functions and globals which can be exported. Get started by writing: + +```c + // this will use the host machine for the target architecture and system, this is + // helpful when doing JIT or non-cross AOT compilation + TB_Module* module = tb_module_create_for_host(arch, TB_SYSTEM_WINDOWS, TB_DEBUGFMT_NONE, NULL); +``` + +```c + // See TB_Arch, TB_System + TB_Arch arch = TB_ARCH_X86_64; + TB_System sys = TB_SYSTEM_WINDOWS; + + // See TB_DebugFormat. When exporting the binary this decides + // how the line info, type table and other debug information is + // encoded. + TB_DebugFormat debug_fmt = TB_DEBUGFMT_CODEVIEW; + + // See TB_FeatureSet, this allows us to tell the code generator + // what extensions are active in the platform, an example is enabling + // AVX or BF16 + TB_FeatureSet features = { 0 }; + + TB_Module* module = tb_module_create_for_host(arch, sys, debug_fmt, &features); +``` + +# Exporter API + +The exporting API allows for packaging compiled code into objects, shared/static or executable form. Once you've compiled all your functions in TB you may export to a file using: + +```c + // see TB_OutputFlavor for the full list + TB_ModuleExporter* e = tb_make_exporter(module, TB_FLAVOR_OBJECT); + if (!tb_exporter_to_file(e, module, "hello.obj")) { + /* failed to export */ + } + + /* file has been exported! */ +``` + +If instead you need to output a buffer in memory: + +```c + // see TB_OutputFlavor for the full list + TB_ModuleExporter* e = tb_make_exporter(module, TB_FLAVOR_OBJECT); + + ptrdiff_t length; + uint8_t* buffer = tb_exporter_to_buffer(e, module, &length); + if (length < 0) { + /* failed to export */ + } + + ... + + tb_exporter_free_buffer(buffer); +``` + +# Builder API \ No newline at end of file diff --git a/tb/man/IR.txt b/tb/man/IR.txt new file mode 100644 index 00000000..c95e7c88 --- /dev/null +++ b/tb/man/IR.txt @@ -0,0 +1,58 @@ +# Sea of Nodes (SoN) + + https://www.oracle.com/technetwork/java/javase/tech/c2-ir95-150110.pdf) + + SoN is an SSA where ordering is relaxed in the form of explicit dependencies + as opposed to local ordering inside basic blocks, for instance pure operations + like addition will not have an exact placement only the constraint that it must + be resolved after it's inputs. 
This makes it easier to perform local optimizations + without a care for scheduling, this is especially helpful because of how many + optimizations we've moved to peepholes. + + note: edges going down from A to B means B is dependent on A. + + Reassociation + + x+2+4 + + x 2 2 4 + \ / \ / + + 4 => x + + \ / \ / + + + + + GVN + + A*B + A*B + + A B A B + |\ /| \ / + | X | * + |/ \| => / \ + * * \ / + \ / + + + + Load elimination + + *x = 16 + return *x + + x_ + | \ + | \ + | \ x + | | | + memory | 16 | => memory | 16 + \ | | | \ | / | + Store | Store | + | | / + | / / + | / / + | / / + Load | + | | + V V + + note: we're not showing the control edge memory operations have for simplicit but + both of these are sharing a control edge. Stores produce more memory but don't produce + more control flow and Loads use memory but don't produce more (these are both non-volatile) diff --git a/tb/man/TYPES.md b/tb/man/TYPES.md new file mode 100644 index 00000000..defc2f05 --- /dev/null +++ b/tb/man/TYPES.md @@ -0,0 +1,32 @@ +# Types + +The TBIR data types are used to represent the structure and certain proofs +about the data itself. + +## Void (TB_VOID) + +void type is a unit type and thus cannot hold data. + +## Boolean (TB_BOOL) + +booleans represent either true or false and the conversion is defined as: + +`((x != 0) ? true : false) where x is a data-holding type` + +this is important to note because in float types NaN comparisons always return +false which means that NaN is considered false in a (NaN -> bool) conversion. + +## Integers (TB_I8, TB_I16, TB_I32, TB_I64) + +integer types come in a few basic sizes (i8, i16, i32, i64) and represent numerical +data and raw data. Integer operations can come in two forms: signed and unsigned. + +## Floats (TB_F32, TB_F64) + +floating point types are IEEE-754-2008 compliant with f32, and f64 mapping to binary32, +and binary64 respectively. 
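
As a small illustration, this is how the types discussed so far are spelled in C via the
`TB_TYPE_*` macros from `tb.h`; the comments reflect the packed 16-bit `TB_DataType`
layout introduced in this patch (a 4-bit tag plus 12 bits of payload):

```c
#include "tb.h"

void example(void) {
    TB_DataType t_bool = TB_TYPE_BOOL;  // TB_INT,   data == 1 (bit width)
    TB_DataType t_i32  = TB_TYPE_I32;   // TB_INT,   data == 32
    TB_DataType t_f64  = TB_TYPE_F64;   // TB_FLOAT, data == TB_FLT_64
    (void)t_bool; (void)t_i32; (void)t_f64;

    // the whole descriptor packs into 16 bits: 4-bit tag + 12-bit payload
    _Static_assert(sizeof(TB_DataType) == 2, "see the static_assert in tb.h");
}
```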
+ +## Pointers (TB_PTR) + +Pointers refer to memory objects in the global address space (see MEMORY.md) +TODO: currently TB only supports on address space but this might be subject to change diff --git a/tb/src/abi.c b/tb/src/abi.c index 5e0ba963..c48b5deb 100644 --- a/tb/src/abi.c +++ b/tb/src/abi.c @@ -1,6 +1,5 @@ // This is gonna get complicated but we can push through :p - //////////////////////////////// // x86-64 //////////////////////////////// @@ -93,7 +92,7 @@ static TB_DataType debug_type_to_tb(TB_DebugType* t) { case TB_DEBUG_TYPE_ARRAY: return TB_TYPE_PTR; case TB_DEBUG_TYPE_POINTER: return TB_TYPE_PTR; - case TB_DEBUG_TYPE_FLOAT: return (TB_DataType){ { TB_FLOAT, 0, t->float_fmt } }; + case TB_DEBUG_TYPE_FLOAT: return (TB_DataType){ { TB_FLOAT, t->float_fmt } }; default: tb_assert(0, "todo"); return TB_TYPE_VOID; } @@ -109,7 +108,7 @@ static TB_DataType reg_class_to_tb(TB_ABI abi, RegClass rg, TB_DebugType* type) case RG_SSE: { assert(type->tag == TB_DEBUG_TYPE_FLOAT); - return (TB_DataType){ { TB_FLOAT, 0, type->float_fmt } }; + return (TB_DataType){ { TB_FLOAT, type->float_fmt } }; } default: tb_assert(0, "todo"); return TB_TYPE_VOID; diff --git a/tb/src/codegen/emitter.h b/tb/src/codegen/emitter.h index dde46185..2cc9b756 100644 --- a/tb/src/codegen/emitter.h +++ b/tb/src/codegen/emitter.h @@ -28,7 +28,6 @@ typedef struct { uint8_t* data; NL_Map(TB_Node*, uint32_t) labels; - uint32_t return_label; } TB_CGEmitter; // Helper macros @@ -37,13 +36,19 @@ typedef struct { #define EMIT2(e, b) do { uint16_t _b = (b); memcpy(tb_cgemit_reserve(e, 2), &_b, 2); (e)->count += 2; } while (0) #define EMIT4(e, b) do { uint32_t _b = (b); memcpy(tb_cgemit_reserve(e, 4), &_b, 4); (e)->count += 4; } while (0) #define EMIT8(e, b) do { uint64_t _b = (b); memcpy(tb_cgemit_reserve(e, 8), &_b, 8); (e)->count += 8; } while (0) -#define RELOC4(e, p, b) do { void *_ptr = &(e)->data[p]; \ - uint32_t _b = (b), _temp; \ - memcpy(&_temp, _ptr, 4); \ - _temp += _b; \ - memcpy(_ptr, &_temp, 4); } while (0) #define PATCH4(e, p, b) do { uint32_t _b = (b); memcpy(&(e)->data[p], &_b, 4); } while (0) #define GET_CODE_POS(e) ((e)->count) +#define RELOC4(e, p, b) tb_reloc4(e, p, b) + +static void tb_reloc4(TB_CGEmitter* restrict e, uint32_t p, uint32_t b) { + void* ptr = &e->data[p]; + + // i love UBsan... + uint32_t tmp; + memcpy(&tmp, ptr, 4); + tmp += b; + memcpy(ptr, &tmp, 4); +} static void tb_asm_print(TB_CGEmitter* restrict e, const char* fmt, ...) { // let's hope the optimizer can hoist this early-out outside of the call diff --git a/tb/src/codegen/generic_cg.h b/tb/src/codegen/generic_cg.h index 18dff0e9..2225283f 100644 --- a/tb/src/codegen/generic_cg.h +++ b/tb/src/codegen/generic_cg.h @@ -4,6 +4,7 @@ #include static thread_local bool reg_alloc_log; +static TB_CFG* muh_______cfg; enum { CG_VAL_UNRESOLVED = 0, @@ -40,6 +41,9 @@ _Static_assert(sizeof(TB_PhysicalReg) == sizeof(RegIndex), "these should be the typedef struct MachineBB { Inst* first; + // what's the terminator, it helps us walk successors + TB_Node* end_node; + int start, end; int terminator; @@ -73,8 +77,11 @@ typedef struct { TB_Passes* p; + int bb_count; + int* bb_order; + // Scheduling - size_t block_count; + TB_CFG cfg; Worklist worklist; // reusing from TB_Passes. ValueDesc* values; // the indices match the GVN. 
@@ -117,7 +124,7 @@ static int classify_reg_class(TB_DataType dt); static void isel(Ctx* restrict ctx, TB_Node* n, int dst); static bool should_rematerialize(TB_Node* n); -static void emit_code(Ctx* restrict ctx, TB_FunctionOutput* restrict func_out); +static void emit_code(Ctx* restrict ctx, TB_FunctionOutput* restrict func_out, int end); static void mark_callee_saved_constraints(Ctx* restrict ctx, uint64_t callee_saved[CG_REGISTER_CLASSES]); static void add_debug_local(Ctx* restrict ctx, TB_Node* n, int pos) { @@ -204,6 +211,8 @@ struct Inst { }; // generic instructions +static Inst* inst_jmp(TB_Node* target); + static Inst* inst_label(TB_Node* n) { Inst* i = TB_ARENA_ALLOC(tmp_arena, Inst); *i = (Inst){ .type = INST_LABEL, .flags = INST_NODE, .n = n }; @@ -393,8 +402,11 @@ static int alloc_vreg(Ctx* restrict ctx, TB_Node* n, TB_DataType dt) { dyn_array_put(ctx->intervals, (LiveInterval){ .reg_class = classify_reg_class(dt), .n = n, .reg = -1, .hint = -1, .assigned = -1, - .dt = legalize(dt), .start = INT_MAX, .split_kid = -1 + .dt = legalize(dt), .split_kid = -1 }); + + LiveRange r = { INT_MAX, INT_MAX }; + dyn_array_put(ctx->intervals[i].ranges, r); return i; } @@ -413,17 +425,23 @@ static int liveness(Ctx* restrict ctx, TB_Function* f) { // find BB boundaries in sequences MachineBBs seq_bb = NULL; - nl_map_create(seq_bb, ctx->block_count); + nl_map_create(seq_bb, ctx->bb_count); - FOREACH_N(i, 0, ctx->block_count) { - MachineBB bb = { + TB_Node** bbs = ctx->worklist.items; + int* bb_order = ctx->bb_order; + FOREACH_N(i, 0, ctx->bb_count) { + TB_Node* n = bbs[bb_order[i]]; + TB_BasicBlock* bb = &nl_map_get_checked(ctx->cfg.node_to_block, n); + + MachineBB mbb = { + .end_node = bb->end, .gen = set_create_in_arena(arena, interval_count), .kill = set_create_in_arena(arena, interval_count), .live_in = set_create_in_arena(arena, interval_count), .live_out = set_create_in_arena(arena, interval_count) }; - nl_map_put(seq_bb, ctx->worklist.items[i], bb); + nl_map_put(seq_bb, n, mbb); } // generate local live sets @@ -435,13 +453,13 @@ static int liveness(Ctx* restrict ctx, TB_Function* f) { assert(inst->type == INST_LABEL); // initial label - MachineBB* mbb = &nl_map_get_checked(seq_bb, f->start_node); + TB_Node* bb = ctx->worklist.items[0]; + MachineBB* mbb = &nl_map_get_checked(seq_bb, bb); mbb->first = inst; mbb->start = 2; inst->time = 2; inst = inst->next; - TB_Node* bb = f->start_node; for (; inst; inst = inst->next) { if (inst->type == INST_LABEL) { nl_map_get_checked(seq_bb, bb).end = timeline; @@ -483,36 +501,32 @@ static int liveness(Ctx* restrict ctx, TB_Function* f) { // generate global live sets size_t base = dyn_array_length(ctx->worklist.items); - assert(base == ctx->block_count); // all nodes go into the worklist - FOREACH_N(i, 0, ctx->block_count) { - TB_Node* bb = ctx->worklist.items[i]; - assert(bb->type == TB_START || bb->type == TB_REGION); - - dyn_array_put(ctx->worklist.items, bb); + FOREACH_REVERSE_N(i, 0, ctx->bb_count) { + TB_Node* n = bbs[bb_order[i]]; + dyn_array_put(ctx->worklist.items, n); // in(bb) = use(bb) - MachineBB* mbb = &nl_map_get_checked(seq_bb, bb); + MachineBB* mbb = &nl_map_get_checked(seq_bb, n); set_copy(&mbb->live_in, &mbb->gen); } while (dyn_array_length(ctx->worklist.items) > base) // CUIK_TIMED_BLOCK("global iter") { TB_Node* bb = dyn_array_pop(ctx->worklist.items); - TB_NodeRegion* r = TB_NODE_GET_EXTRA(bb); MachineBB* mbb = &nl_map_get_checked(seq_bb, bb); - // walk all successors Set* restrict live_out = &mbb->live_out; set_clear(live_out); - 
if (r->end->type == TB_BRANCH) { - TB_NodeBranch* br = TB_NODE_GET_EXTRA(r->end); - FOREACH_N(i, 0, br->succ_count) { + // walk all successors + TB_Node* end = mbb->end_node; + for (User* u = end->users; u; u = u->next) { + if (cfg_is_control(u->n)) { // union with successor's lives - MachineBB* succ = &nl_map_get_checked(seq_bb, br->succ[i]); - set_union(live_out, &succ->live_in); + TB_Node* succ = cfg_get_fallthru(u->n); + set_union(live_out, &nl_map_get_checked(seq_bb, succ).live_in); } } @@ -533,26 +547,23 @@ static int liveness(Ctx* restrict ctx, TB_Function* f) { // if we have changes, mark the predeccesors if (changes) { FOREACH_N(i, 0, bb->input_count) { - dyn_array_put(ctx->worklist.items, tb_get_parent_region(bb->inputs[i])); + dyn_array_put(ctx->worklist.items, get_block_begin(bb->inputs[i])); } } } - dyn_array_set_length(ctx->worklist.items, ctx->block_count); - - /*FOREACH_REVERSE_N(i, 0, ctx->block_count) { - MachineBB* mbb = &nl_map_get_checked(seq_bb, ctx->worklist.items[i]); - int j = 120; - - printf("v%zu:", i); - if (set_get(&mbb->gen, j)) printf("GEN "); - if (set_get(&mbb->kill, j)) printf("KILL "); - if (set_get(&mbb->live_in, j)) printf("IN "); - if (set_get(&mbb->live_out, j)) printf("OUT "); - printf("\n"); + + /*if (!strcmp(f->super.name, "WinMain")) { + FOREACH_N(i, 0, ctx->bb_count) { + TB_Node* n = bbs[bb_order[i]]; + MachineBB* mbb = &nl_map_get_checked(seq_bb, n); + + bool in = set_get(&mbb->live_in, 83); + bool out = set_get(&mbb->live_out, 83); + printf(".bb%d: %s %s\n", bb_order[i], in?"in":"", out?"out":""); + } }*/ ctx->machine_bbs = seq_bb; - assert(epilogue >= 0); return epilogue; } @@ -630,24 +641,36 @@ static void isel_set_location(Ctx* restrict ctx, TB_Node* n) { } } -static void isel_region(Ctx* restrict ctx, TB_Node* bb, TB_Node* end) { - assert(dyn_array_length(ctx->worklist.items) == ctx->block_count); +static void isel_region(Ctx* restrict ctx, TB_Node* bb_start, TB_Node* end, size_t rpo_index) { + assert(dyn_array_length(ctx->worklist.items) == ctx->cfg.block_count); + TB_Scheduled scheduled = ctx->p->scheduled; + TB_BasicBlock* bb = nl_map_get_checked(scheduled, bb_start); // phase 1: logical schedule DynArray(PhiVal) phi_vals = ctx->phi_vals; CUIK_TIMED_BLOCK("phase 1") { - sched_walk(ctx->p, &ctx->worklist, &phi_vals, bb, end); + sched_walk(ctx->p, &ctx->worklist, &phi_vals, bb, end, true); + + // schedule params + if (rpo_index == 0) { + for (User* use = ctx->f->start_node->users; use; use = use->next) { + TB_Node* use_n = use->n; + if (use_n->type == TB_PROJ && !worklist_test_n_set(&ctx->worklist, use_n)) { + dyn_array_put(ctx->worklist.items, use_n); + } + } + } } // phase 2: define all the nodes in this BB CUIK_TIMED_BLOCK("phase 2") { - FOREACH_REVERSE_N(i, ctx->block_count, dyn_array_length(ctx->worklist.items)) { + FOREACH_N(i, ctx->cfg.block_count, dyn_array_length(ctx->worklist.items)) { TB_Node* n = ctx->worklist.items[i]; - // track use count + // track non-dead users size_t use_count = 0; for (User* use = find_users(ctx->p, n); use; use = use->next) { - if (use->n->inputs[0] != NULL) use_count++; + if (nl_map_get(scheduled, use->n) >= 0) use_count++; } // we don't have to worry about resizing here which is really nice @@ -660,7 +683,6 @@ static void isel_region(Ctx* restrict ctx, TB_Node* bb, TB_Node* end) { // not the new one we're producing. 
size_t our_phis = dyn_array_length(phi_vals); CUIK_TIMED_BLOCK("phase 3") { - TB_Node* top = ctx->worklist.items[ctx->block_count]; FOREACH_N(i, 0, our_phis) { PhiVal* v = &phi_vals[i]; @@ -668,25 +690,28 @@ static void isel_region(Ctx* restrict ctx, TB_Node* bb, TB_Node* end) { v->dst = input_reg(ctx, v->phi); } - for (User* use = find_users(ctx->p, top); use; use = use->next) { - if (use->n->type == TB_PHI && use->n->dt.type != TB_MEMORY) { - ValueDesc* val = &ctx->values[use->n->gvn]; + if (bb_start->type == TB_REGION) { + for (User* use = find_users(ctx->p, bb_start); use; use = use->next) { + if (use->n->type == TB_PHI && use->n->dt.type != TB_MEMORY) { + ValueDesc* val = &ctx->values[use->n->gvn]; - // copy PHI into temporary - PhiVal p = { .phi = use->n, .dst = input_reg(ctx, use->n) }; - dyn_array_put(phi_vals, p); + // copy PHI into temporary + PhiVal p = { .phi = use->n, .dst = input_reg(ctx, use->n) }; + dyn_array_put(phi_vals, p); - TB_DataType dt = p.phi->dt; - int tmp = DEF(NULL, dt); - SUBMIT(inst_move(dt, tmp, p.dst)); + TB_DataType dt = p.phi->dt; + int tmp = DEF(NULL, dt); + SUBMIT(inst_move(dt, tmp, p.dst)); - // assign temporary as the PHI until the end of the BB - val->vreg = tmp; + // assign temporary as the PHI until the end of the BB + val->vreg = tmp; + } } } - assert(top->type == TB_START || top->type == TB_REGION); - isel(ctx, top, -1); + if (rpo_index == 0) { + isel(ctx, ctx->f->start_node, -1); + } } // phase 4: walk all nodes (we're allowed to fold nodes into those which appear later) @@ -694,19 +719,24 @@ static void isel_region(Ctx* restrict ctx, TB_Node* bb, TB_Node* end) { // isel is emitting start->end but we're iterating in reverse order so we need // to reverse the instruction stream as we go, it's a linked list so it's not // hard. 
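
Aside: the comment above explains that phase 4 visits nodes in reverse while isel emits each node's instructions start->end, so every per-node run is spliced onto the front of the stream to recover forward order. A tiny sketch of that prepend splice on a singly linked list, using a hypothetical ToyInst in place of the real Inst:

#include <stdio.h>
#include <stdlib.h>

// Hypothetical stand-in for an emitted instruction; the real Inst also carries
// operands, a data type, timing info, etc.
typedef struct ToyInst {
    int id;
    struct ToyInst* next;
} ToyInst;

static ToyInst* toy_inst(int id) {
    ToyInst* i = malloc(sizeof(ToyInst));
    i->id = id, i->next = NULL;
    return i;
}

int main(void) {
    // Suppose selection visits nodes in reverse (C, B, A) but each node emits
    // its instructions in forward order. Prepending each node's run onto the
    // head of the stream yields A, B, C order without a second pass.
    ToyInst* head = NULL;
    int runs[3][2] = { { 30, 31 }, { 20, 21 }, { 10, 11 } }; // node C, B, A

    for (int r = 0; r < 3; r++) {
        // emit this node's run in forward order into a local two-entry list
        ToyInst* first = toy_inst(runs[r][0]);
        first->next = toy_inst(runs[r][1]);

        // splice: the run goes in front of everything emitted so far
        first->next->next = head;
        head = first;
    }

    for (ToyInst* i = head; i; i = i->next) printf("%d ", i->id); // 10 11 20 21 30 31
    printf("\n");
    return 0;
}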
- DO_IF(TB_OPTDEBUG_CODEGEN)(printf("BB %p\n", bb)); + int bbid = nl_map_get_checked(ctx->cfg.node_to_block, bb_start).id; + TB_OPTDEBUG(CODEGEN)(printf("BB %d\n", bbid)); CUIK_TIMED_BLOCK("phase 4") { Inst *head = ctx->head, *last = NULL; TB_Node* prev_effect = NULL; - FOREACH_REVERSE_N(i, ctx->block_count + 1, dyn_array_length(ctx->worklist.items)) { + FOREACH_REVERSE_N(i, ctx->cfg.block_count, dyn_array_length(ctx->worklist.items)) { TB_Node* n = ctx->worklist.items[i]; + if (n->type == TB_START) { + continue; + } + ValueDesc* val = lookup_val(ctx, n); // if the value hasn't been asked for yet and - if (val->vreg < 0 && should_rematerialize(n)) { + if (n != end && val->vreg < 0 && should_rematerialize(n)) { DO_IF(TB_OPTDEBUG_CODEGEN)( - printf(" DISCARD %zu: ", n->gvn), + printf(" DISCARD %u: ", n->gvn), print_node_sexpr(n, 0), printf("\n") ); @@ -719,28 +749,16 @@ static void isel_region(Ctx* restrict ctx, TB_Node* bb, TB_Node* end) { ctx->head = &dummy; if (n->dt.type == TB_TUPLE || n->dt.type == TB_CONTROL || n->dt.type == TB_MEMORY) { - DO_IF(TB_OPTDEBUG_CODEGEN)( - printf(" EFFECT %zu: ", n->gvn), + TB_OPTDEBUG(CODEGEN)( + printf(" EFFECT %u: ", n->gvn), print_node_sexpr(n, 0), printf("\n") ); - if (n->type == TB_BRANCH) { - // writeback PHIs - FOREACH_N(i, 0, our_phis) { - PhiVal* v = &phi_vals[i]; - TB_DataType dt = v->phi->dt; - - int src = input_reg(ctx, v->n); - - hint_reg(ctx, v->dst, src); - SUBMIT(inst_move(dt, v->dst, src)); - } - } isel(ctx, n, val->vreg); - if (n->inputs[0]->type == TB_START || n->type != TB_PROJ) { + if ((n->input_count > 0 && n->inputs[0]->type == TB_START) || n->type != TB_PROJ) { if (prev_effect != NULL) { isel_set_location(ctx, prev_effect); } @@ -759,16 +777,16 @@ static void isel_region(Ctx* restrict ctx, TB_Node* bb, TB_Node* end) { val->vreg = DEF(n, n->dt); } - DO_IF(TB_OPTDEBUG_CODEGEN)( - printf(" DATA %zu: ", n->gvn), + TB_OPTDEBUG(CODEGEN)( + printf(" DATA %u: ", n->gvn), print_node_sexpr(n, 0), printf("\n") ); isel(ctx, n, val->vreg); } else { - DO_IF(TB_OPTDEBUG_CODEGEN)( - printf(" DEAD %zu: ", n->gvn), + TB_OPTDEBUG(CODEGEN)( + printf(" DEAD %u: ", n->gvn), print_node_sexpr(n, 0), printf("\n") ); @@ -824,9 +842,35 @@ static void isel_region(Ctx* restrict ctx, TB_Node* bb, TB_Node* end) { dyn_array_clear(phi_vals); ctx->phi_vals = phi_vals; ctx->head = last ? 
last : head; + + if (end->type != TB_END && end->type != TB_TRAP && + end->type != TB_BRANCH && end->type != TB_UNREACHABLE) { + TB_OPTDEBUG(CODEGEN)( + printf(" TERMINATOR %u: ", end->gvn), + print_node_sexpr(end, 0), + printf("\n") + ); + + // writeback PHIs + FOREACH_N(i, 0, our_phis) { + PhiVal* v = &phi_vals[i]; + TB_DataType dt = v->phi->dt; + + int src = input_reg(ctx, v->n); + + hint_reg(ctx, v->dst, src); + SUBMIT(inst_move(dt, v->dst, src)); + } + + // implicit goto + TB_Node* succ = cfg_next_control(end); + if (ctx->fallthrough != succ) { + SUBMIT(inst_jmp(succ)); + } + } } - dyn_array_set_length(ctx->worklist.items, ctx->block_count); + dyn_array_set_length(ctx->worklist.items, ctx->cfg.block_count); } // Codegen through here is done in phases @@ -836,15 +880,12 @@ static void compile_function(TB_Passes* restrict p, TB_FunctionOutput* restrict TB_Function* restrict f = p->f; DO_IF(TB_OPTDEBUG_PEEP)(log_debug("%s: starting codegen with %d nodes", f->super.name, f->node_count)); - tb_pass_schedule(p); - #if 0 - reg_alloc_log = strcmp(f->super.name, "main_wnd_proc") == 0; - if (reg_alloc_log) { - printf("\n\n\n"); + if (!strcmp(f->super.name, "stbi__parse_png_file")) { + reg_alloc_log = true; tb_pass_print(p); } else { - emit_asm = false; + reg_alloc_log = false; } #endif @@ -870,16 +911,18 @@ static void compile_function(TB_Passes* restrict p, TB_FunctionOutput* restrict ctx.worklist = p->worklist; ctx.values = tb_arena_alloc(tmp_arena, f->node_count * sizeof(ValueDesc)); + // We need to generate a CFG + ctx.cfg = tb_compute_rpo(f, p); + muh_______cfg = &ctx.cfg; + + // And perform global scheduling + tb_pass_schedule(p, ctx.cfg); + // allocate more stuff now that we've run stats on the IR - ctx.emit.return_label = 0; - nl_map_create(ctx.emit.labels, ctx.block_count); + nl_map_create(ctx.emit.labels, ctx.cfg.block_count); nl_map_create(ctx.stack_slots, 8); dyn_array_create(ctx.debug_stack_slots, 8); - // We need to generate a CFG - ctx.block_count = tb_push_postorder(f, &p->worklist); - assert(p->worklist.items[ctx.block_count - 1] == f->start_node && "Codegen must always schedule entry BB first"); - worklist_clear_visited(&p->worklist); // Instruction selection: @@ -887,11 +930,15 @@ static void compile_function(TB_Passes* restrict p, TB_FunctionOutput* restrict // fixed and which need allocation. For now regalloc is handled // immediately but in theory it could be delayed until all selection // is done. + ctx.bb_count = 0; + int* bb_order = ctx.bb_order = tb_arena_alloc(tmp_arena, ctx.cfg.block_count * sizeof(int)); + CUIK_TIMED_BLOCK("isel") { - assert(dyn_array_length(ctx.worklist.items) == ctx.block_count); + assert(dyn_array_length(ctx.worklist.items) == ctx.cfg.block_count); - // define all PHIs early - FOREACH_REVERSE_N(i, 0, ctx.block_count) { + // define all PHIs early and sort BB order + int stop_bb = -1; + FOREACH_N(i, 0, ctx.cfg.block_count) { TB_Node* bb = ctx.worklist.items[i]; for (User* use = find_users(p, bb); use; use = use->next) { @@ -902,54 +949,38 @@ static void compile_function(TB_Passes* restrict p, TB_FunctionOutput* restrict ctx.values[n->gvn].vreg = -1; } } + + TB_Node* end = nl_map_get_checked(ctx.cfg.node_to_block, bb).end; + if (end->type == TB_END) { + stop_bb = i; + } else { + bb_order[ctx.bb_count++] = i; + } } - // compile all nodes which aren't the STOP node - TB_Node* stop_node = f->stop_node; - TB_Node* stop_bb = tb_get_parent_region(stop_node); + // enter END block at the... 
end + if (stop_bb >= 0) { + bb_order[ctx.bb_count++] = stop_bb; + } - bool has_stop = false; - FOREACH_REVERSE_N(i, 0, ctx.block_count) { - TB_Node* bb = ctx.worklist.items[i]; - assert(bb->type == TB_START || bb->type == TB_REGION); + TB_Node** bbs = ctx.worklist.items; + FOREACH_N(i, 0, ctx.bb_count) { + TB_Node* bb = bbs[bb_order[i]]; nl_map_put(ctx.emit.labels, bb, 0); - if (bb != stop_bb) { - // mark fallthrough - ctx.fallthrough = i > 0 ? ctx.worklist.items[i - 1] : NULL; - if (ctx.fallthrough == stop_bb) ctx.fallthrough = NULL; - - Inst* label = inst_label(bb); - if (ctx.first == NULL) { - ctx.first = ctx.head = label; - } else { - append_inst(&ctx, label); - } - - TB_Node* end = TB_NODE_GET_EXTRA_T(bb, TB_NodeRegion)->end; - isel_region(&ctx, bb, end); - } else { - has_stop = true; - } - } - // always schedule the STOP node here - if (has_stop) { - // mark fallthrough - ctx.fallthrough = NULL; + // find next BB + ctx.fallthrough = i + 1 < ctx.bb_count ? bbs[bb_order[i + 1]] : NULL; - Inst* label = inst_label(stop_bb); + Inst* label = inst_label(bb); if (ctx.first == NULL) { ctx.first = ctx.head = label; } else { append_inst(&ctx, label); } - TB_Node* end = TB_NODE_GET_EXTRA_T(stop_bb, TB_NodeRegion)->end; - isel_region(&ctx, stop_bb, end); - } else { - // liveness expects one but we don't really have shit to put down there... it's never reached - append_inst(&ctx, alloc_inst(INST_EPILOGUE, TB_TYPE_VOID, 0, 0, 0)); + TB_Node* end = nl_map_get_checked(ctx.cfg.node_to_block, bb).end; + isel_region(&ctx, bb, end, i); } } p->worklist = ctx.worklist; @@ -967,10 +998,11 @@ static void compile_function(TB_Passes* restrict p, TB_FunctionOutput* restrict // Arch-specific: convert instruction buffer into actual instructions CUIK_TIMED_BLOCK("emit code") { - emit_code(&ctx, func_out); + emit_code(&ctx, func_out, end); } } + tb_free_cfg(&ctx.cfg); nl_map_free(ctx.emit.labels); nl_map_free(ctx.machine_bbs); dyn_array_destroy(ctx.intervals); @@ -999,7 +1031,7 @@ static void get_data_type_size(TB_DataType dt, size_t* out_size, size_t* out_ali // round up bits to a byte int bits = is_big_int ? ((dt.data + 7) / 8) : tb_next_pow2(dt.data - 1); - *out_size = ((bits+7) / 8) << dt.width; + *out_size = ((bits+7) / 8); *out_align = is_big_int ? 
8 : ((dt.data + 7) / 8); break; } @@ -1009,7 +1041,7 @@ static void get_data_type_size(TB_DataType dt, size_t* out_size, size_t* out_ali else if (dt.data == TB_FLT_64) s = 8; else tb_unreachable(); - *out_size = s << dt.width; + *out_size = s; *out_align = s; break; } diff --git a/tb/src/codegen/reg_alloc.h b/tb/src/codegen/reg_alloc.h index 15cc9524..db20ea3f 100644 --- a/tb/src/codegen/reg_alloc.h +++ b/tb/src/codegen/reg_alloc.h @@ -40,9 +40,6 @@ struct LiveInterval { // register num, -1 if the interval isn't a physical reg int reg, hint; - // whole interval - int start, end; - // spill point, -1 if there's none int spill, split_kid; @@ -91,7 +88,6 @@ static void add_use_pos(LiveInterval* interval, int t, int kind) { dyn_array_put(interval->uses, u); } -// interval->start is filled in by the definition static void add_range(LiveInterval* interval, int start, int end) { assert(start <= end); size_t count = dyn_array_length(interval->ranges); @@ -105,8 +101,6 @@ static void add_range(LiveInterval* interval, int start, int end) { LiveRange r = { start, end }; dyn_array_put(interval->ranges, r); } - - if (end > interval->end) interval->end = end; } static void reverse_bb_walk(LSRA* restrict ra, MachineBB* bb, Inst* inst) { @@ -127,13 +121,12 @@ static void reverse_bb_walk(LSRA* restrict ra, MachineBB* bb, Inst* inst) { assert(*ops >= 0); LiveInterval* interval = &ra->intervals[*ops++]; - if (interval->ranges == NULL) { + if (dyn_array_length(interval->ranges) == 1) { add_range(interval, inst->time, inst->time); } else { interval->ranges[dyn_array_length(interval->ranges) - 1].start = inst->time; } - interval->start = inst->time; add_use_pos(interval, inst->time, dst_use_reg ? USE_REG : USE_OUT); } @@ -158,33 +151,19 @@ static void reverse_bb_walk(LSRA* restrict ra, MachineBB* bb, Inst* inst) { } } -static int range_intersect(int start, int end, LiveRange* b) { - if (b->start <= end && start <= b->end) { - return start > b->start ? start : b->start; +static int range_intersect(LiveRange* a, LiveRange* b) { + if (b->start <= a->end && a->start <= b->end) { + return a->start > b->start ? a->start : b->start; } else { return -1; } } static int interval_intersect(LiveInterval* a, LiveInterval* b) { - if (!(b->start <= a->end && a->start <= b->end)) { - return -1; // don't intersect at all - } - - FOREACH_N(i, 0, dyn_array_length(a->ranges)) { - LiveRange a_range = a->ranges[i]; - - FOREACH_N(j, 0, dyn_array_length(b->ranges)) { - LiveRange b_range = b->ranges[j]; - - // if the end is greater than the start, then we've overshot - if (a_range.start >= b_range.end) { - break; - } - - if (a_range.end >= b_range.start) { - return b_range.start > a_range.start ? 
b_range.start : a_range.start; - } + dyn_array_for(i, a->ranges) { + int t = range_intersect(&a->ranges[i], &b->ranges[b->active_range]); + if (t >= 0) { + return t; } } @@ -192,8 +171,7 @@ static int interval_intersect(LiveInterval* a, LiveInterval* b) { } #define FOREACH_SET(it, set) \ -FOREACH_N(_i, 0, ((set).capacity + 63) / 64) \ -for (uint64_t bits = (set).data[_i], it = _i*64; bits; bits >>= 1, it++) if (bits & 1) +FOREACH_N(_i, 0, ((set).capacity + 63) / 64) FOREACH_BIT(it, _i*64, (set).data[_i]) static int next_use(LSRA* restrict ra, LiveInterval* interval, int time) { for (;;) { @@ -207,26 +185,9 @@ static int next_use(LSRA* restrict ra, LiveInterval* interval, int time) { interval = &ra->intervals[interval->split_kid]; continue; } - } - - return INT_MAX; -} - -// if < 0, then it's -x - 1 where x is the nearest starting point -static int covers(LiveInterval* it, int start, int end) { - size_t i = 0, count = dyn_array_length(it->ranges); - for (; i < count; i++) { - // if the end is greater than the start, then we've overshot - if (start > it->ranges[i].end) { - return -1; - } - if (end >= it->ranges[i].start) { - return i; - } + return INT_MAX; } - - return -1; } static LiveInterval* get_active(LSRA* restrict ra, int rc, int reg) { @@ -273,13 +234,16 @@ static void insert_split_move(LSRA* restrict ra, int t, int old_reg, int new_reg prev->next = new_inst; } +static int interval_start(LiveInterval* interval) { return interval->ranges[dyn_array_length(interval->ranges) - 1].start; } +static int interval_end(LiveInterval* interval) { return interval->ranges[1].end; } + static LiveInterval* split_interval_at(LSRA* restrict ra, LiveInterval* interval, int pos) { // skip past previous intervals - while (interval->split_kid >= 0 && pos > interval->end) { + while (interval->split_kid >= 0 && pos > interval_end(interval)) { interval = &ra->intervals[interval->split_kid]; } - assert(interval->reg >= 0 || pos <= interval->end); + assert(interval->reg >= 0 || pos <= interval_end(interval)); return interval; } @@ -295,13 +259,10 @@ static int split_intersecting(LSRA* restrict ra, int current_time, int pos, Live int size = 8; ra->stack_usage = align_up(ra->stack_usage + size, size); + // remove from active set + set_remove(&ra->active_set[interval->reg_class], interval->assigned); + REG_ALLOC_LOG printf(" \x1b[33m# v%d: spill %s to [RBP - %d] at t=%d\x1b[0m\n", ri, reg_name(interval->reg_class, interval->assigned), ra->stack_usage, pos); - if (current_time >= pos && interval->assigned >= 0) { - if (set_get(&ra->active_set[interval->reg_class], interval->assigned) && ra->active[interval->reg_class][interval->assigned] == ri) { - REG_ALLOC_LOG printf(" \x1b[33m# v%d: expired during split\x1b[0m\n", ri); - set_remove(&ra->active_set[interval->reg_class], interval->assigned); - } - } } // split lifetime @@ -313,18 +274,17 @@ static int split_intersecting(LSRA* restrict ra, int current_time, int pos, Live it.reg = -1; } it.assigned = it.reg = -1; - it.start = pos; - it.end = interval->end; it.uses = NULL; - it.ranges = NULL; + it.ranges = dyn_array_create(LiveRange, 4); it.n = NULL; it.split_kid = -1; - interval->end = pos; + assert(interval->split_kid < 0 && "cannot spill while spilled"); int old_reg = interval - ra->intervals; int new_reg = dyn_array_length(ra->intervals); interval->split_kid = new_reg; + dyn_array_put(it.ranges, (LiveRange){ INT_MAX, INT_MAX }); dyn_array_put(ra->intervals, it); interval = &ra->intervals[old_reg]; @@ -333,7 +293,7 @@ static int split_intersecting(LSRA* restrict 
ra, int current_time, int pos, Live // unhandled list... we can push this to the top wit no problem size_t i = 0, count = dyn_array_length(ra->unhandled); for (; i < count; i++) { - if (pos > ra->intervals[ra->unhandled[i]].start) break; + if (pos > interval_start(&ra->intervals[ra->unhandled[i]])) break; } // we know where to insert @@ -360,7 +320,7 @@ static int split_intersecting(LSRA* restrict ra, int current_time, int pos, Live } // split ranges - for (size_t i = 0; i < dyn_array_length(interval->ranges);) { + for (size_t i = 1; i < dyn_array_length(interval->ranges);) { LiveRange* range = &interval->ranges[i]; if (range->start > pos) { dyn_array_put(it.ranges, *range); @@ -372,6 +332,8 @@ static int split_intersecting(LSRA* restrict ra, int current_time, int pos, Live memmove(range, range + 1, shift * sizeof(LiveRange)); } dyn_array_pop(interval->ranges); + interval->active_range -= 1; + continue; } else if (range->end > pos) { // intersects pos, we need to split the range LiveRange r = { pos, range->end }; @@ -426,7 +388,7 @@ static ptrdiff_t allocate_free_reg(LSRA* restrict ra, LiveInterval* interval) { LiveInterval* it = &ra->intervals[ra->inactive[i]]; int fp = ra->free_pos[it->assigned]; if (fp > 0) { - int p = range_intersect(interval->start, interval->end, &it->ranges[it->active_range]); + int p = interval_intersect(interval, it); if (p >= 0 && p < fp) { ra->free_pos[it->assigned] = p; } @@ -448,7 +410,7 @@ static ptrdiff_t allocate_free_reg(LSRA* restrict ra, LiveInterval* interval) { assert(hint->reg_class == rc); hint_reg = hint->assigned; - if (interval->end <= ra->free_pos[hint_reg]) { + if (interval_end(interval) <= ra->free_pos[hint_reg]) { highest = hint_reg; } } @@ -488,14 +450,16 @@ static ptrdiff_t allocate_free_reg(LSRA* restrict ra, LiveInterval* interval) { dyn_array_put(ra->intervals, it); // insert spill and reload - insert_split_move(ra, 0, vreg, spill_slot); - insert_split_move(ra, ra->endpoint, spill_slot, vreg); + insert_split_move(ra, 0, vreg, spill_slot); + if (ra->endpoint) { + insert_split_move(ra, ra->endpoint, spill_slot, vreg); + } // adding to intervals might resized this interval = &ra->intervals[old_reg]; } - if (interval->end <= pos) { + if (interval_end(interval) <= pos) { // we can steal it completely REG_ALLOC_LOG printf(" # assign to %s", reg_name(rc, highest)); @@ -511,7 +475,7 @@ static ptrdiff_t allocate_free_reg(LSRA* restrict ra, LiveInterval* interval) { } else { // TODO(NeGate): split current at optimal position before current interval->assigned = highest; - split_intersecting(ra, interval->start, pos - 1, interval, true); + split_intersecting(ra, interval_start(interval), pos - 1, interval, true); } return highest; @@ -526,17 +490,18 @@ static ptrdiff_t allocate_blocked_reg(LSRA* restrict ra, LiveInterval* interval) FOREACH_N(i, 0, 16) use_pos[i] = INT_MAX; // mark non-fixed intervals + int start = interval_start(interval); FOREACH_SET(i, ra->active_set[rc]) { LiveInterval* it = &ra->intervals[ra->active[rc][i]]; if (it->reg_class == rc && it->reg < 0) { - use_pos[i] = next_use(ra, it, interval->start); + use_pos[i] = next_use(ra, it, start); } } dyn_array_for(i, ra->inactive) { LiveInterval* it = &ra->intervals[ra->inactive[i]]; if (it->reg_class == rc && it->reg < 0) { - use_pos[i] = next_use(ra, it, interval->start); + use_pos[i] = next_use(ra, it, start); } } @@ -554,7 +519,7 @@ static ptrdiff_t allocate_blocked_reg(LSRA* restrict ra, LiveInterval* interval) if (it->reg_class == rc && it->reg >= 0) { int bp = 
ra->block_pos[it->assigned]; if (bp > 0) { - int p = range_intersect(interval->start, interval->end, &it->ranges[it->active_range]); + int p = interval_intersect(interval, it); if (p >= 0 && p < bp) { ra->block_pos[it->assigned] = p; } @@ -575,7 +540,10 @@ static ptrdiff_t allocate_blocked_reg(LSRA* restrict ra, LiveInterval* interval) } int pos = use_pos[highest]; - int first_use = interval->uses[dyn_array_length(interval->uses) - 1].pos; + int first_use = INT_MAX; + if (dyn_array_length(interval->uses)) { + first_use = interval->uses[dyn_array_length(interval->uses) - 1].pos; + } bool spilled = false; if (first_use > pos) { @@ -588,19 +556,20 @@ static ptrdiff_t allocate_blocked_reg(LSRA* restrict ra, LiveInterval* interval) // split at optimal spot before first use that requires a register FOREACH_REVERSE_N(i, 0, dyn_array_length(interval->uses)) { if (interval->uses[i].pos >= pos && interval->uses[i].kind == USE_REG) { - split_intersecting(ra, interval->start, interval->uses[i].pos - 1, interval, false); + split_intersecting(ra, start, interval->uses[i].pos - 1, interval, false); break; } } spilled = true; } else { - int split_pos = (interval->start & ~1) - 1; + int start = interval_start(interval); + int split_pos = (start & ~1) - 1; // split active or inactive interval reg LiveInterval* to_split = get_active(ra, rc, highest); if (to_split != NULL) { - split_intersecting(ra, interval->start, split_pos, to_split, true); + split_intersecting(ra, start, split_pos, to_split, true); } // split any inactive interval for reg at the end of it's lifetime hole @@ -609,7 +578,7 @@ static ptrdiff_t allocate_blocked_reg(LSRA* restrict ra, LiveInterval* interval) LiveRange* r = &it->ranges[it->active_range]; if (it->reg_class == rc && it->assigned == highest && r->start <= pos+1 && pos <= r->end) { - split_intersecting(ra, interval->start, split_pos, it, true); + split_intersecting(ra, start, split_pos, it, true); } } } @@ -617,9 +586,9 @@ static ptrdiff_t allocate_blocked_reg(LSRA* restrict ra, LiveInterval* interval) // split active reg if it intersects with fixed interval LiveInterval* fix_interval = &ra->intervals[(rc ? 
FIRST_XMM : FIRST_GPR) + highest]; if (dyn_array_length(fix_interval->ranges)) { - int p = range_intersect(interval->start, interval->end, &fix_interval->ranges[fix_interval->active_range]); + int p = interval_intersect(interval, fix_interval); if (p >= 0) { - split_intersecting(ra, interval->start, p, fix_interval, true); + split_intersecting(ra, start, p, interval, true); } } @@ -631,22 +600,27 @@ static void move_to_active(LSRA* restrict ra, LiveInterval* interval) { int ri = interval - ra->intervals; if (set_get(&ra->active_set[rc], reg)) { - tb_panic("intervals should never be forced out, we should've accomodated them in the first place"); + tb_panic("v%d: interval v%d should never be forced out, we should've accomodated them in the first place", ri, ra->active[rc][reg]); } + assert(reg < 16); set_put(&ra->active_set[rc], reg); ra->active[rc][reg] = ri; } // update active range to match where the position is currently static bool update_interval(LSRA* restrict ra, LiveInterval* restrict interval, bool is_active, int time, int inactive_index) { - int ri = interval - ra->intervals; + /*while (interval->split_kid >= 0) { + interval = &ra->intervals[interval->split_kid]; + }*/ // get to the right range first while (interval->ranges[interval->active_range].end <= time) { + assert(interval->active_range > 0); interval->active_range -= 1; } + int ri = interval - ra->intervals; int hole_end = interval->ranges[interval->active_range].start; int active_end = interval->ranges[interval->active_range].end; bool is_now_active = time >= hole_end; @@ -654,7 +628,7 @@ static bool update_interval(LSRA* restrict ra, LiveInterval* restrict interval, int rc = interval->reg_class; int reg = interval->assigned; - if (time >= interval->end) { // expired + if (interval->active_range == 0) { // expired if (is_active) { REG_ALLOC_LOG printf(" # active %s has expired (v%d)\n", reg_name(rc, reg), ri); set_remove(&ra->active_set[rc], reg); @@ -694,8 +668,8 @@ static int linear_scan(Ctx* restrict ctx, TB_Function* f, int stack_usage, int e MachineBBs mbbs = ctx->machine_bbs; size_t interval_count = dyn_array_length(ra.intervals); CUIK_TIMED_BLOCK("build intervals") { - FOREACH_N(i, 0, ctx->block_count) { - TB_Node* bb = ctx->worklist.items[i]; + FOREACH_REVERSE_N(i, 0, ctx->bb_count) { + TB_Node* bb = ctx->worklist.items[ctx->bb_order[i]]; MachineBB* mbb = &nl_map_get_checked(mbbs, bb); int bb_start = mbb->start; @@ -721,21 +695,20 @@ static int linear_scan(Ctx* restrict ctx, TB_Function* f, int stack_usage, int e // we use every fixed interval at the very start to force them into // the inactive set. 
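
Aside: with the whole-interval start/end fields removed from LiveInterval (see the reg_alloc.h hunks above), intersection queries are now answered purely from the per-interval range lists. A simplified, self-contained illustration of that range-based test follows; ToyRange and the helper names are made up for the example, and the real code only probes the other interval's current active_range rather than every range.

#include <stdio.h>

// Simplified stand-in: a live interval is just a sorted list of [start, end]
// ranges; the real code stores them in reverse order and tracks active_range.
typedef struct { int start, end; } ToyRange;

// first position where two ranges overlap, or -1 if they don't
static int toy_range_intersect(ToyRange a, ToyRange b) {
    if (b.start <= a.end && a.start <= b.end) {
        return a.start > b.start ? a.start : b.start;
    }
    return -1;
}

// first point where any range of `a` overlaps any range of `b`, or -1
static int toy_interval_intersect(const ToyRange* a, int an, const ToyRange* b, int bn) {
    for (int i = 0; i < an; i++) {
        for (int j = 0; j < bn; j++) {
            int t = toy_range_intersect(a[i], b[j]);
            if (t >= 0) return t;
        }
    }
    return -1;
}

int main(void) {
    // v1 lives over [2,6] and [10,14], v2 lives over [7,12]:
    // they first collide at t=10, so a register shared with v2 only helps v1
    // until that point.
    ToyRange v1[] = { { 2, 6 }, { 10, 14 } };
    ToyRange v2[] = { { 7, 12 } };
    printf("first intersection at t=%d\n", toy_interval_intersect(v1, 2, v2, 1));
    return 0;
}

In the allocator above, this first-intersection position is what lowers free_pos for a candidate register, i.e. how long the register stays usable before the current interval would have to split.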
- FOREACH_N(i, 0, 32) if (ra.intervals[i].ranges) { - ra.intervals[i].start = 0; + FOREACH_N(i, 0, 32) { add_range(&ra.intervals[i], 0, 1); } - dyn_array_destroy(ra.intervals[RBP].ranges); - dyn_array_destroy(ra.intervals[RSP].ranges); - ra.endpoint = end; mark_callee_saved_constraints(ctx, ra.callee_saved); // generate unhandled interval list (sorted by starting point) ra.unhandled = dyn_array_create(LiveInterval*, (interval_count * 4) / 3); - FOREACH_N(i, 0, interval_count) dyn_array_put(ra.unhandled, i); - cuiksort_defs(ctx->intervals, 0, interval_count - 1, ra.unhandled); + FOREACH_N(i, 0, interval_count) { + ra.intervals[i].active_range = dyn_array_length(ra.intervals[i].ranges) - 1; + dyn_array_put(ra.unhandled, i); + } + cuiksort_defs(ra.intervals, 0, interval_count - 1, ra.unhandled); // only need enough to store for the biggest register class ra.free_pos = TB_ARENA_ARR_ALLOC(tmp_arena, 16, int); @@ -748,28 +721,24 @@ static int linear_scan(Ctx* restrict ctx, TB_Function* f, int stack_usage, int e LiveInterval* interval = &ra.intervals[ri]; // unused interval, skip - if (interval->ranges == NULL) continue; - - int time = interval->start; - - int before_next_time = interval->start; - if (dyn_array_length(ra.unhandled)) { - int before = ra.intervals[ra.unhandled[dyn_array_length(ra.unhandled) - 1]].start - 1; - if (before > before_next_time) { - before_next_time = before; - } + if (interval->reg >= 0) { + continue; } + int time = interval->ranges[interval->active_range].start; + assert(time != INT_MAX); + + int end = interval_end(interval); if (interval->reg >= 0) { - REG_ALLOC_LOG printf(" # %-5s t=[%-4d - %4d)\n", reg_name(interval->reg_class, interval->reg), time, interval->end); + REG_ALLOC_LOG printf(" # %-5s t=[%-4d - %4d)\n", reg_name(interval->reg_class, interval->reg), time, end); } else if (interval->spill > 0) { REG_ALLOC_LOG { - printf(" # v%-4d t=[%-4d - %4d) SPILLED [RBP - %d]\n", ri, time, interval->end, interval->spill); + printf(" # v%-4d t=[%-4d - %4d) SPILLED [RBP - %d]\n", ri, time, end, interval->spill); } continue; } else { REG_ALLOC_LOG { - printf(" # v%-4d t=[%-4d - %4d) ", ri, time, interval->end); + printf(" # v%-4d t=[%-4d - %4d) ", ri, time, end); if (interval->n != NULL) { print_node_sexpr(interval->n, 0); } @@ -818,7 +787,6 @@ static int linear_scan(Ctx* restrict ctx, TB_Function* f, int stack_usage, int e // add to active set if (reg >= 0) { interval->assigned = reg; - interval->active_range = dyn_array_length(interval->ranges) - 1; move_to_active(&ra, interval); } @@ -843,46 +811,36 @@ static int linear_scan(Ctx* restrict ctx, TB_Function* f, int stack_usage, int e // move resolver CUIK_TIMED_BLOCK("move resolver") { - FOREACH_N(i, 0, ctx->block_count) { - TB_Node* bb = ctx->worklist.items[i]; - assert(bb->type == TB_START || bb->type == TB_REGION); - + TB_Node** bbs = ctx->worklist.items; + int* bb_order = ctx->bb_order; + FOREACH_N(i, 0, ctx->bb_count) { + TB_Node* bb = bbs[bb_order[i]]; MachineBB* mbb = &nl_map_get_checked(mbbs, bb); - TB_NodeRegion* r = TB_NODE_GET_EXTRA(bb); - - if (r->end->type != TB_BRANCH) { - continue; - } - - TB_NodeBranch* br = TB_NODE_GET_EXTRA(r->end); - FOREACH_REVERSE_N(i, 0, br->succ_count) { - TB_Node* bb = br->succ[i]; - MachineBB* target = &nl_map_get_checked(mbbs, bb); - - // for all live-ins, we should check if we need to insert a move - FOREACH_SET(i, target->live_in) { - LiveInterval* interval = &ra.intervals[i]; - - // if the value changes across the edge, insert move - LiveInterval* start = 
split_interval_at(&ra, interval, mbb->end); - LiveInterval* end = split_interval_at(&ra, interval, target->start); - - if (start != end) { - if (start->spill > 0) { - assert(end->spill <= 0 && "TODO: both can't be spills yet"); - insert_split_move(&ra, target->start + 1, start - ra.intervals, end - ra.intervals); - } else { - insert_split_move(&ra, mbb->terminator - 1, start - ra.intervals, end - ra.intervals); + TB_Node* end_node = mbb->end_node; + + for (User* u = end_node->users; u; u = u->next) { + if (cfg_is_control(u->n)) { + TB_Node* succ = cfg_get_fallthru(u->n); + MachineBB* target = &nl_map_get_checked(mbbs, succ); + + // for all live-ins, we should check if we need to insert a move + FOREACH_SET(k, target->live_in) { + LiveInterval* interval = &ra.intervals[k]; + + // if the value changes across the edge, insert move + LiveInterval* start = split_interval_at(&ra, interval, mbb->end); + LiveInterval* end = split_interval_at(&ra, interval, target->start); + + if (start != end) { + if (start->spill > 0) { + assert(end->spill <= 0 && "TODO: both can't be spills yet"); + insert_split_move(&ra, target->start + 1, start - ra.intervals, end - ra.intervals); + } else { + insert_split_move(&ra, mbb->terminator - 1, start - ra.intervals, end - ra.intervals); + } } } } - - // the moves are inserted either at the end of block from or at the beginning of block to, - // depending on the control flow - // resolver.find_insert_position(from, to) - - // insert all moves in correct order (without overwriting registers that are used later) - // resolver.resolve_mappings() } } } @@ -916,17 +874,17 @@ static int linear_scan(Ctx* restrict ctx, TB_Function* f, int stack_usage, int e // Sorting unhandled list //////////////////////////////// static size_t partition(LiveInterval* intervals, ptrdiff_t lo, ptrdiff_t hi, RegIndex* arr) { - int pivot = intervals[arr[(hi - lo) / 2 + lo]].start; // middle + int pivot = interval_start(&intervals[arr[(hi - lo) / 2 + lo]]); // middle ptrdiff_t i = lo - 1, j = hi + 1; for (;;) { // Move the left index to the right at least once and while the element at // the left index is less than the pivot - do { i += 1; } while (intervals[arr[i]].start > pivot); + do { i += 1; } while (interval_start(&intervals[arr[i]]) > pivot); // Move the right index to the left at least once and while the element at // the right index is greater than the pivot - do { j -= 1; } while (intervals[arr[j]].start < pivot); + do { j -= 1; } while (interval_start(&intervals[arr[j]]) < pivot); // If the indices crossed, return if (i >= j) return j; diff --git a/tb/src/debug/cv.c b/tb/src/debug/cv.c index 8b1a37f6..8c4dbdbd 100644 --- a/tb/src/debug/cv.c +++ b/tb/src/debug/cv.c @@ -3,6 +3,15 @@ #include "cv_type_builder.c" +#include + +#if defined(_WIN32) && !defined(_POSIX_C_SOURCE) +#define fileno _fileno +#define fstat _fstat +#define stat _stat +#define strdup _strdup +#endif + // constant sized "hash map" which is used to // deduplicate types in the codeview #define MAX_TYPE_ENTRY_LOOKUP_SIZE 1024 @@ -35,7 +44,6 @@ static void md5sum_file(uint8_t out_bytes[16], const char* filepath) { } static uint16_t get_codeview_type(TB_DataType dt) { - assert(dt.width == 0 && "TODO: implement vector types in CodeView output"); switch (dt.type) { case TB_INT: { if (dt.data <= 0) return 0x0003; // T_VOID diff --git a/tb/src/ir_printer.c b/tb/src/ir_printer.c index b2eb6b80..61a1a35a 100644 --- a/tb/src/ir_printer.c +++ b/tb/src/ir_printer.c @@ -11,6 +11,7 @@ TB_API void tb_default_print_callback(void* 
user_data, const char* fmt, ...) { TB_API const char* tb_node_get_name(TB_Node* n) { switch (n->type) { case TB_NULL: return "BAD"; + case TB_DEAD: return "dead"; case TB_START: return "start"; case TB_END: return "end"; @@ -70,6 +71,8 @@ TB_API const char* tb_node_get_name(TB_Node* n) { case TB_ATOMIC_OR: return "atomic.or"; case TB_ATOMIC_CAS: return "atomic.cas"; + case TB_CLZ: return "clz"; + case TB_CTZ: return "ctz"; case TB_NEG: return "neg"; case TB_NOT: return "not"; case TB_AND: return "and"; @@ -110,8 +113,6 @@ TB_API const char* tb_node_get_name(TB_Node* n) { #define P(...) callback(user_data, __VA_ARGS__) static void tb_print_type(TB_DataType dt, TB_PrintCallback callback, void* user_data) { - assert(dt.width < 8 && "Vector width too big!"); - switch (dt.type) { case TB_INT: { if (dt.data == 0) P("void"); @@ -148,257 +149,145 @@ static void tb_print_type(TB_DataType dt, TB_PrintCallback callback, void* user_ } } -#if 0 -static void tb_print_node(TB_Function* f, NL_HashSet* visited, TB_PrintCallback callback, void* user_data, TB_Node* restrict n) { - if (!nl_hashset_put(visited, n)) { - return; - } - - bool is_effect = tb_has_effects(n); - - const char* fillcolor = is_effect ? "lightgrey" : "antiquewhite1"; - if (n->type == TB_PROJ) { - fillcolor = "lightblue"; - } - - P(" r%p [style=\"filled\"; ordering=in; shape=box; fillcolor=%s; label=\"", n, fillcolor); - switch (n->type) { - case TB_FLOAT32_CONST: { - TB_NodeFloat32* f = TB_NODE_GET_EXTRA(n); - P("f32 %f", f->value); - break; - } - - case TB_FLOAT64_CONST: { - TB_NodeFloat64* f = TB_NODE_GET_EXTRA(n); - P("f64 %f", f->value); - break; +static bool print_graph_node(TB_Function* f, NL_HashSet* visited, TB_PrintCallback callback, void* user_data, TB_Node* restrict n) { + do { + if (!nl_hashset_put(visited, n)) { + return false; } - case TB_INTEGER_CONST: { - P("%s ", tb_node_get_name(n)); + /*bool is_effect = tb_has_effects(n); + const char* fillcolor = is_effect ? "lightgrey" : "antiquewhite1"; + if (n->dt.type == TB_MEMORY) { + fillcolor = "lightblue"; + }*/ + + P(" r%u [ordering=in; shape=plaintext; label=\"", n->gvn); + switch (n->type) { + case TB_START: P("start"); break; + case TB_REGION: P("region"); break; + + case TB_LOAD: { + P("ld."); + tb_print_type(n->dt, callback, user_data); + break; + } + case TB_STORE: { + P("st."); + tb_print_type(n->inputs[2]->dt, callback, user_data); + break; + } - TB_NodeInt* num = TB_NODE_GET_EXTRA(n); - tb_print_type(n->dt, callback, user_data); + case TB_SYMBOL: { + TB_Symbol* sym = TB_NODE_GET_EXTRA_T(n, TB_NodeSymbol)->sym; + if (sym->name[0]) { + P("%s", sym->name); + } else { + P("sym%p", sym); + } + break; + } - if (num->value < 0xFFFF) { - int bits = n->dt.type == TB_PTR ? 64 : n->dt.data; - int64_t x = tb__sxt(num->value, bits, 64); + case TB_BITCAST: { + P("bitcast "); + tb_print_type(n->inputs[1]->dt, callback, user_data); + P(" -> "); + tb_print_type(n->dt, callback, user_data); + break; + } - P(" %"PRId64, x); - } else { - P("%#0"PRIx64, num->value); + case TB_INTEGER_CONST: { + TB_NodeInt* num = TB_NODE_GET_EXTRA(n); + if (num->value < 0xFFFF) { + P("%"PRId64, num->value); + } else { + P("%#0"PRIx64, num->value); + } + break; } - break; - } - case TB_MEMBER_ACCESS: { - TB_NodeMember* m = TB_NODE_GET_EXTRA(n); - P("member %"PRId64, m->offset); - break; - } + case TB_ARRAY_ACCESS: { + int64_t stride = TB_NODE_GET_EXTRA_T(n, TB_NodeArray)->stride; + P("*%td", stride); + break; + } - case TB_SYMBOL: { - TB_NodeSymbol* s = TB_NODE_GET_EXTRA(n); - P("symbol %s", s->sym->name ? 
s->sym->name : "???"); - break; - } + case TB_MEMBER_ACCESS: { + int64_t offset = TB_NODE_GET_EXTRA_T(n, TB_NodeMember)->offset; + P("+%td", offset); + break; + } - case TB_END: { - P("stop "); - FOREACH_N(i, 1, n->input_count) { - if (i != 1) P(", "); - tb_print_type(n->inputs[i]->dt, callback, user_data); + case TB_PROJ: { + int index = TB_NODE_GET_EXTRA_T(n, TB_NodeProj)->index; + if (n->inputs[0]->type == TB_START) { + if (index == 0) { + P("ctrl"); + } else if (index == 1) { + P("mem"); + } else if (index == 2) { + P("rpc"); + } else { + P("%c", 'a'+(index - 3)); + } + } else { + P("%d", index); + } + break; } - break; - } - case TB_STORE: { - P("store "); - tb_print_type(n->inputs[3]->dt, callback, user_data); + default: + P("%s", tb_node_get_name(n)); break; } + P("\"]"); - case TB_START: - case TB_REGION: - case TB_BRANCH: - P("%s", tb_node_get_name(n)); - break; + FOREACH_N(i, 0, n->input_count) if (n->inputs[i]) { + TB_Node* in = n->inputs[i]; - case TB_PROJ: { - int index = TB_NODE_GET_EXTRA_T(n, TB_NodeProj)->index; + const char* color = "black"; + TB_DataType dt = n->type == TB_PROJ ? n->dt : in->dt; - P("proj."); - tb_print_type(n->dt, callback, user_data); - P(" %zu", index); - break; - } - - case TB_CMP_EQ: - case TB_CMP_NE: - case TB_CMP_ULT: - case TB_CMP_ULE: - case TB_CMP_SLT: - case TB_CMP_SLE: - case TB_CMP_FLT: - case TB_CMP_FLE: - P("%s ", tb_node_get_name(n)); - tb_print_type(n->inputs[1]->dt, callback, user_data); - break; - - default: - P("%s ", tb_node_get_name(n)); - tb_print_type(n->dt, callback, user_data); - break; - } - P("\"];\n"); - - FOREACH_N(i, 0, n->input_count) if (n->inputs[i]) { - TB_Node* in = n->inputs[i]; - - if (in->type == TB_PROJ && (in->inputs[0]->type != TB_START || in->dt.type == TB_CONTROL)) { - // projections get treated as edges - TB_Node* src = in->inputs[0]; - int index = TB_NODE_GET_EXTRA_T(in, TB_NodeProj)->index; - - tb_print_node(f, visited, callback, user_data, src); - - P(" r%p -> r%p [label=\"", src, n); - if (src->type == TB_BRANCH) { - // branch projections can get nicer looking - TB_NodeBranch* br = TB_NODE_GET_EXTRA(src); - - TB_Node* key = src->input_count > 1 ? src->inputs[1] : NULL; - if (br->keys[0] == 0 && br->succ_count == 2 && key && key->dt.type == TB_INT) { - // boolean branch, we can use true and false - P(index ? "is false?" 
: "is true?"); - } else if (br->succ_count == 1) { - P(""); - } else if (index == 0) { - P("is default?"); - } else { - P("is %d?", br->keys[index - 1]); - } - } else if (in->dt.type == TB_CONTROL) { - P("cproj"); - } else { - P("%zu", index); - } - - if (in->dt.type == TB_CONTROL) { - P("\"] [color=\"red\"]\n"); - } else { - P("\"]\n"); - } - } else { - tb_print_node(f, visited, callback, user_data, in); - P(" r%p -> r%p", in, n); - if (i == 0 || n->type == TB_REGION) { - P(" [color=\"red\"]"); - } else if (in->dt.type == TB_MEMORY) { - P(" [color=\"blue\" style=\"dashed\"]"); + if (dt.type == TB_CONTROL) { + color = "red"; + } else if (dt.type == TB_CONT) { + color = "purple"; + } else if (dt.type == TB_MEMORY) { + color = "blue"; } - if (n->type == TB_CALL && i > 1) { - P(" [label=\"%zu\"];\n", i - 2); - } else if (n->type == TB_PHI && i > 0) { - P(" [label=\"%zu\"];\n", i - 1); - } else { - P("\n"); - } + P("; r%u -> r%u [color=%s]", in->gvn, n->gvn, color); } - } -} -#endif - -static bool print_graph_node(TB_Function* f, NL_HashSet* visited, TB_PrintCallback callback, void* user_data, TB_Node* restrict n) { - if (!nl_hashset_put(visited, n)) { - return false; - } - - bool is_effect = tb_has_effects(n); - if (is_effect) { - return false; - } + P("\n"); - const char* fillcolor = is_effect ? "lightgrey" : "antiquewhite1"; - P(" r%p [style=\"filled\"; ordering=in; shape=box; fillcolor=%s; label=\"", n, fillcolor); - P("%zu: %s", n->gvn, tb_node_get_name(n)); - P("\"];\n"); + // print all the inputs + FOREACH_N(i, 1, n->input_count) if (n->inputs[i]) { + print_graph_node(f, visited, callback, user_data, n->inputs[i]); + } - FOREACH_N(i, 0, n->input_count) if (n->inputs[i]) { - TB_Node* in = n->inputs[i]; - P(" r%p -> r%p\n", in, n); - print_graph_node(f, visited, callback, user_data, in); - } + if (n->input_count == 0) break; + n = n->inputs[0]; + } while (n != NULL); return true; } -static void print_graph_bb(TB_Function* f, NL_HashSet* visited, TB_PrintCallback callback, void* user_data, TB_Node* restrict bb) { - if (!nl_hashset_put(visited, bb)) { - return; - } - - // walk control edges (aka predecessors) - TB_NodeRegion* r = TB_NODE_GET_EXTRA(bb); - if (r->end->type == TB_BRANCH) { - TB_NodeBranch* br = TB_NODE_GET_EXTRA(r->end); - FOREACH_REVERSE_N(i, 0, br->succ_count) { - print_graph_bb(f, visited, callback, user_data, br->succ[i]); - } - } - - P(" subgraph {\n", bb->gvn); - TB_Node* curr = r->end; - do { - nl_hashset_put(visited, curr); - P(" r%p [style=\"filled\"; shape=box; fillcolor=antiquewhite1; label=\"%zu: ", curr, curr->gvn); - if (curr->type == TB_END) { - P("END"); - } else { - P("EFFECT"); - } - P("\"]\n r%p -> r%p\n", curr->inputs[0], curr); - curr = curr->inputs[0]; - } while (curr != bb); - - // basic block header - P(" r%p [style=\"filled\"; shape=box; fillcolor=antiquewhite1; label=\"%zu: %s\"]\n", bb, bb->gvn, bb->type == TB_START ? "START" : "REGION"); - if (bb->type == TB_START) { - P(" { rank=min; r%p }\n", bb); - } else if (r->end->type == TB_END) { - P(" { rank=max; r%p }\n", r->end); - } - P(" }\n"); - - // write predecessor edges - FOREACH_N(i, 0, bb->input_count) { - TB_Node* pred = bb->inputs[i]; - if (pred->type == TB_PROJ) { - P(" r%p -> r%p\n", pred->inputs[0], bb); - } else { - P(" r%p -> r%p\n", bb->inputs[i], bb); - } - } +TB_API void tb_pass_print_dot(TB_Passes* opt, TB_PrintCallback callback, void* user_data) { + TB_Function* f = opt->f; + P("digraph %s {\n", f->super.name ? 
f->super.name : "unnamed"); - // process adjacent nodes - curr = r->end; - do { - FOREACH_N(i, 1, curr->input_count) { - print_graph_node(f, visited, callback, user_data, curr->inputs[i]); - P(" r%p -> r%p\n", curr->inputs[i], curr); - } - curr = curr->inputs[0]; - } while (curr != bb); -} + Worklist tmp_ws = { 0 }; + worklist_alloc(&tmp_ws, f->node_count); -TB_API void tb_function_print(TB_Function* f, TB_PrintCallback callback, void* user_data) { - P("digraph %s {\n rankdir=TB\n", f->super.name ? f->super.name : "unnamed"); + TB_CFG cfg = tb_compute_rpo2(f, &tmp_ws, &opt->stack); NL_HashSet visited = nl_hashset_alloc(f->node_count); - print_graph_bb(f, &visited, callback, user_data, f->start_node); + FOREACH_N(i, 0, cfg.block_count) { + TB_BasicBlock* bb = &nl_map_get_checked(cfg.node_to_block, tmp_ws.items[i]); + print_graph_node(f, &visited, callback, user_data, bb->end); + } nl_hashset_free(visited); + worklist_free(&tmp_ws); + tb_free_cfg(&cfg); - P("}\n\n"); + P("}\n"); } diff --git a/tb/src/jit.c b/tb/src/jit.c index 95073e2d..603c4f3e 100644 --- a/tb/src/jit.c +++ b/tb/src/jit.c @@ -1,6 +1,9 @@ #include "tb_internal.h" #include "host.h" +#define WIN32_LEAN_AND_MEAN +#include + enum { ALLOC_GRANULARITY = 16, diff --git a/tb/src/libtb.c b/tb/src/libtb.c index c3ec043f..40ace594 100644 --- a/tb/src/libtb.c +++ b/tb/src/libtb.c @@ -33,32 +33,7 @@ #include "linker/elf.c" // Platform layer -#if defined(_POSIX_C_SOURCE) && !defined(_WIN32) -void* tb_platform_valloc(size_t size) { - return mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); -} - -void* tb_platform_valloc_guard(size_t size) { - return mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); -} - -void tb_platform_vfree(void* ptr, size_t size) { - munmap(ptr, size); -} - -bool tb_platform_vprotect(void* ptr, size_t size, TB_MemProtect prot) { - uint32_t protect; - switch (prot) { - case TB_PAGE_RO: protect = PROT_READ; break; - case TB_PAGE_RW: protect = PROT_READ | PROT_WRITE; break; - case TB_PAGE_RX: protect = PROT_READ | PROT_EXEC; break; - case TB_PAGE_RXW: protect = PROT_READ | PROT_WRITE | PROT_EXEC; break; - default: return false; - } - - return mprotect(ptr, size, protect) == 0; -} -#elif defined(_WIN32) +#if defined(_WIN32) #pragma comment(lib, "onecore.lib") void* tb_platform_valloc(size_t size) { @@ -135,4 +110,29 @@ void* tb_jit_create_stack(size_t* out_size) { return VirtualAlloc2(GetCurrentProcess(), NULL, size, MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES, PAGE_READWRITE, ¶m, 1); } #endif /* NTDDI_VERSION >= NTDDI_WIN10_RS4 */ +#elif defined(_POSIX_C_SOURCE) +void* tb_platform_valloc(size_t size) { + return mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); +} + +void* tb_platform_valloc_guard(size_t size) { + return mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); +} + +void tb_platform_vfree(void* ptr, size_t size) { + munmap(ptr, size); +} + +bool tb_platform_vprotect(void* ptr, size_t size, TB_MemProtect prot) { + uint32_t protect; + switch (prot) { + case TB_PAGE_RO: protect = PROT_READ; break; + case TB_PAGE_RW: protect = PROT_READ | PROT_WRITE; break; + case TB_PAGE_RX: protect = PROT_READ | PROT_EXEC; break; + case TB_PAGE_RXW: protect = PROT_READ | PROT_WRITE | PROT_EXEC; break; + default: return false; + } + + return mprotect(ptr, size, protect) == 0; +} #endif diff --git a/tb/src/objects/coff.h b/tb/src/objects/coff.h index 4f26e74e..85d52468 100644 --- a/tb/src/objects/coff.h +++ 
b/tb/src/objects/coff.h @@ -1,14 +1,6 @@ // https://github.com/dotnet/runtime/blob/main/docs/design/specs/PE-COFF.md #pragma once #include "../tb_internal.h" -#include - -#if defined(_WIN32) && !defined(_POSIX_C_SOURCE) -#define fileno _fileno -#define fstat _fstat -#define stat _stat -#define strdup _strdup -#endif /*#if TB_HOST_ARCH == TB_HOST_X86_64 #include diff --git a/tb/src/objects/elf64.c b/tb/src/objects/elf64.c index e10ba616..2d638d36 100644 --- a/tb/src/objects/elf64.c +++ b/tb/src/objects/elf64.c @@ -22,7 +22,7 @@ static int put_symbol(TB_Emitter* stab, uint32_t name, uint8_t sym_info, uint16_ .size = size }; tb_outs(stab, sizeof(sym), (uint8_t*)&sym); - return stab->count / sizeof(TB_Elf64_Sym); + return (stab->count / sizeof(TB_Elf64_Sym)) - 1; } static void put_section_symbols(DynArray(TB_ModuleSection) sections, TB_Emitter* strtbl, TB_Emitter* stab, int t) { @@ -39,8 +39,12 @@ static void put_section_symbols(DynArray(TB_ModuleSection) sections, TB_Emitter* out_f->parent->super.symbol_id = put_symbol(stab, name, TB_ELF64_ST_INFO(t, TB_ELF64_STT_FUNC), sec_num, out_f->code_pos, out_f->code_size); } + int acceptable = t == TB_ELF64_STB_GLOBAL ? TB_LINKAGE_PUBLIC : TB_LINKAGE_PRIVATE; dyn_array_for(i, globals) { TB_Global* g = globals[i]; + if (g->linkage != acceptable) { + continue; + } uint32_t name = 0; if (g->super.name) { @@ -143,8 +147,8 @@ TB_ExportBuffer tb_elf64obj_write_output(TB_Module* m, const IDebugFormat* dbg) assert(dbg_section_count == 0); - put_section_symbols(sections, &strtbl, &local_symtab, TB_ELF64_STB_GLOBAL); - put_section_symbols(sections, &strtbl, &global_symtab, TB_ELF64_STB_LOCAL); + put_section_symbols(sections, &strtbl, &local_symtab, TB_ELF64_STB_LOCAL); + put_section_symbols(sections, &strtbl, &global_symtab, TB_ELF64_STB_GLOBAL); FOREACH_N(i, 0, exports.count) { TB_External* ext = exports.data[i]; @@ -207,7 +211,7 @@ TB_ExportBuffer tb_elf64obj_write_output(TB_Module* m, const IDebugFormat* dbg) dyn_array_for(j, funcs) { TB_FunctionOutput* func_out = funcs[j]; - size_t source_offset = func_out->prologue_length + func_out->code_pos; + size_t source_offset = func_out->code_pos; for (TB_SymbolPatch* p = func_out->last_patch; p; p = p->prev) { if (p->internal) continue; diff --git a/tb/src/opt/branches.h b/tb/src/opt/branches.h index e79d68a7..52094a6f 100644 --- a/tb/src/opt/branches.h +++ b/tb/src/opt/branches.h @@ -2,11 +2,13 @@ static TB_Node* ideal_region(TB_Passes* restrict p, TB_Function* f, TB_Node* n) { TB_NodeRegion* r = TB_NODE_GET_EXTRA(n); - // if there's one predecessor and it's to an unconditional branch, merge them. - if (n->input_count == 1 && n->inputs[0]->type == TB_PROJ && - n->inputs[0]->inputs[0]->type == TB_BRANCH && - n->inputs[0]->inputs[0]->input_count == 1) { - // check for any phi nodes + // if a region is dead, start a violent death chain + if (n->input_count == 0) { + n->type = TB_DEAD; + return n; + } else if (n->input_count == 1) { + // single entry regions are useless... 
+ // check for any phi nodes, because we're single entry they're all degens User* use = n->users; while (use != NULL) { User* next = use->next; @@ -17,69 +19,34 @@ static TB_Node* ideal_region(TB_Passes* restrict p, TB_Function* f, TB_Node* n) use = next; } - TB_Node* top_node = unsafe_get_region(n->inputs[0]); - TB_NodeRegion* top_region = TB_NODE_GET_EXTRA(top_node); - - // set new terminator - top_region->end = r->end; - TB_Node* parent = n->inputs[0]->inputs[0]->inputs[0]; - - tb_pass_kill_node(p, n->inputs[0]->inputs[0]); - tb_pass_kill_node(p, n->inputs[0]); + // we might want this as an identity + return n->inputs[0]; + } else { + // remove dead predeccessors + bool changes = false; - return parent; - } - - // if a region is dead, dettach it's succesors - if (n->input_count == 0 && r->end->type == TB_BRANCH) { - TB_NodeBranch* br = TB_NODE_GET_EXTRA(r->end); size_t i = 0; - while (i < br->succ_count) { - TB_Node* succ = br->succ[i]; - if (remove_pred(p, f, n, succ)) { - tb_pass_mark(p, succ); - tb_pass_mark_users(p, succ); - - br->succ_count -= 1; - } else { - i += 1; + while (i < n->input_count) { + if (n->inputs[i]->type == TB_DEAD) { + changes = true; + remove_input(p, f, n, i); + + // update PHIs + for (User* use = n->users; use; use = use->next) { + if (use->n->type == TB_PHI && use->slot == 0) { + remove_input(p, f, use->n, i + 1); + } + } + continue; } - } - - assert(br->succ_count == 0); - } - - return NULL; -} - -static void transmute_goto(TB_Passes* restrict opt, TB_Function* f, TB_Node* br, TB_Node* dst) { - assert(br->type == TB_BRANCH && dst->input_count >= 1); - // convert to unconditional branch - set_input(opt, br, NULL, 1); - br->input_count = 1; - - // remove predecessor from other branches - TB_Node* bb = unsafe_get_region(br); - TB_NodeBranch* br_info = TB_NODE_GET_EXTRA(br); - - size_t i = 0; - while (i < br_info->succ_count) { - if (br_info->succ[i] != dst) { - if (remove_pred(opt, f, bb, br_info->succ[i])) { - br_info->succ_count -= 1; - } - } else { i += 1; } + + return changes ? n : NULL; } - assert(br_info->succ[0] == dst); - // we need to mark the changes to that jump - // threading can clean it up - tb_pass_mark(opt, bb); - tb_pass_mark(opt, dst); - tb_pass_mark_users(opt, bb); + return NULL; } static TB_Node* ideal_phi(TB_Passes* restrict opt, TB_Function* f, TB_Node* n) { @@ -90,6 +57,10 @@ static TB_Node* ideal_phi(TB_Passes* restrict opt, TB_Function* f, TB_Node* n) { } // if branch, both paths are empty => select(cond, t, f) + // + // TODO(NeGate): we can make this diamond trick work for bigger + // branches, we should support a lookup instruction similar to + // "switch" logic for data. TB_DataType dt = n->dt; TB_Node* region = n->inputs[0]; if (region->input_count == 2) { @@ -101,47 +72,51 @@ static TB_Node* ideal_phi(TB_Passes* restrict opt, TB_Function* f, TB_Node* n) { } } - // guarentee paths are effectless - if (!is_empty_bb(opt, region->inputs[0]->inputs[0])) { return NULL; } - if (!is_empty_bb(opt, region->inputs[1]->inputs[0])) { return NULL; } - - // these don't have directions, i just need names - TB_Node* left = region->inputs[0]->inputs[0]->inputs[0]; - TB_Node* right = region->inputs[1]->inputs[0]->inputs[0]; - - // is it a proper if-diamond? 
- if (left->input_count == 1 && right->input_count == 1 && - left->inputs[0]->type == TB_PROJ && - left->inputs[0]->type == TB_PROJ && - left->inputs[0]->inputs[0]->type == TB_BRANCH && - left->inputs[0]->inputs[0] == right->inputs[0]->inputs[0]) { - TB_Node* branch = left->inputs[0]->inputs[0]; + // guarentee paths are effectless (there's only one data phi and no control nodes) + // + // If + // / \ + // CProjT CProjF Region[0][0] == Region[1][0] + // \ / + // Region + // + TB_Node* left = region->inputs[0]; + TB_Node* right = region->inputs[1]; + if (left->inputs[0]->type == TB_BRANCH && left->inputs[0] == right->inputs[0]) { + TB_Node* branch = left->inputs[0]; TB_NodeBranch* header_br = TB_NODE_GET_EXTRA(branch); if (header_br->succ_count == 2) { - assert(left->inputs[0]->inputs[0]->input_count == 2); - TB_Node* cond = branch->inputs[1]; - TB_Node* left_v = n->inputs[1]; - TB_Node* right_v = n->inputs[2]; + assert(branch->input_count == 2); + + TB_Node *values[2]; + for (User* u = branch->users; u; u = u->next) { + TB_Node* proj = u->n; + if (proj->type == TB_PROJ) { + int index = TB_NODE_GET_EXTRA_T(proj, TB_NodeProj)->index; + // the projection needs to exclusively refer to the region, + // if not we can't elide those effects here. + if (proj->users->next != NULL || proj->users->n != region) { + return NULL; + } + + int phi_i = proj->users->slot; + assert(phi_i + 1 < n->input_count); + values[index] = n->inputs[1 + phi_i]; + } + } - bool right_false = header_br->succ[0] == right; uint64_t falsey = TB_NODE_GET_EXTRA_T(branch, TB_NodeBranch)->keys[0]; + TB_Node* cond = branch->inputs[1]; // TODO(NeGate): handle non-zero falseys if (falsey == 0) { - // kill both successors, since they were unique we can properly murder em' - tb_pass_kill_node(opt, left->inputs[0]); - tb_pass_kill_node(opt, left); - tb_pass_kill_node(opt, right->inputs[0]); - tb_pass_kill_node(opt, right); - // header -> merge { TB_Node* parent = branch->inputs[0]; tb_pass_kill_node(opt, branch); - - TB_NodeRegion* header = TB_NODE_GET_EXTRA(unsafe_get_region(parent)); - header->end = TB_NODE_GET_EXTRA_T(region, TB_NodeRegion)->end; + tb_pass_kill_node(opt, left); + tb_pass_kill_node(opt, right); // attach the header and merge to each other tb_pass_mark(opt, parent); @@ -151,8 +126,8 @@ static TB_Node* ideal_phi(TB_Passes* restrict opt, TB_Function* f, TB_Node* n) { TB_Node* selector = tb_alloc_node(f, TB_SELECT, dt, 4, 0); set_input(opt, selector, cond, 1); - set_input(opt, selector, left_v, 2 + right_false); - set_input(opt, selector, right_v, 2 + !right_false); + set_input(opt, selector, values[0], 2); + set_input(opt, selector, values[1], 3); return selector; } } @@ -174,7 +149,7 @@ static TB_Node* ideal_branch(TB_Passes* restrict opt, TB_Function* f, TB_Node* n // if (a && b) A else B => if (a ? 
b : 0) A else B // // TODO(NeGate): implement form which works on an arbitrary falsey - if (n->inputs[0]->type == TB_REGION && n->inputs[0]->input_count == 2 && is_empty_bb(opt, n)) { + /*if (n->inputs[0]->type == TB_REGION && n->inputs[0]->input_count == 2 && is_empty_bb(opt, n)) { TB_Node* bb = n->inputs[0]; uint64_t falsey = br->keys[0]; @@ -227,7 +202,7 @@ static TB_Node* ideal_branch(TB_Passes* restrict opt, TB_Function* f, TB_Node* n return n; } } - } + }*/ // br ((y <= x)) => br (x < y) flipped conditions if (cmp_type == TB_CMP_SLE || cmp_type == TB_CMP_ULE) { @@ -236,7 +211,12 @@ static TB_Node* ideal_branch(TB_Passes* restrict opt, TB_Function* f, TB_Node* n set_input(opt, new_cmp, cmp_node->inputs[1], 2); TB_NODE_SET_EXTRA(new_cmp, TB_NodeCompare, .cmp_dt = TB_NODE_GET_EXTRA_T(cmp_node, TB_NodeCompare)->cmp_dt); - SWAP(TB_Node*, br->succ[0], br->succ[1]); + // flip + for (User* u = n->users; u; u = u->next) { + TB_NodeProj* p = TB_NODE_GET_EXTRA(u->n); + p->index = !p->index; + } + set_input(opt, n, new_cmp, 1); tb_pass_mark(opt, new_cmp); return n; @@ -250,13 +230,17 @@ static TB_Node* ideal_branch(TB_Passes* restrict opt, TB_Function* f, TB_Node* n // flip successors if (cmp_type == TB_CMP_EQ) { - SWAP(TB_Node*, br->succ[0], br->succ[1]); + for (User* u = n->users; u; u = u->next) { + TB_NodeProj* p = TB_NODE_GET_EXTRA(u->n); + p->index = !p->index; + } } + return n; } // check if we're dominated by a branch that already checked it - TB_Node* bb = unsafe_get_region(n->inputs[0]); + /*TB_Node* bb = get_block_begin(n->inputs[0]); for (User* u = find_users(opt, cmp_node); u; u = u->next) { if (u->n != n && u->slot == 1 && u->n->type == TB_BRANCH) { TB_NodeBranch* dom_branch = TB_NODE_GET_EXTRA(u->n); @@ -266,7 +250,7 @@ static TB_Node* ideal_branch(TB_Passes* restrict opt, TB_Function* f, TB_Node* n ptrdiff_t match = -1; FOREACH_N(i, 0, dom_branch->succ_count) { TB_Node* target = dom_branch->succ[i]; - if (tb_is_dominated_by(target, bb)) { + if (tb_is_dominated_by(opt->cfg, target, bb)) { match = i; break; } @@ -278,63 +262,69 @@ static TB_Node* ideal_branch(TB_Passes* restrict opt, TB_Function* f, TB_Node* n } } } - } + }*/ } } // constant fold branch - /*if (n->input_count == 2) { - uint64_t key; - if (get_int_const(n->inputs[1], &key)) { + if (n->input_count == 2) { + Lattice* key = lattice_universe_get(&opt->universe, n->inputs[1]); + + // we can walk the dominator tree to see if the condition is already + // been checked. 
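
Aside: this constant fold asks the lattice for the branch key and, when it is a single integer value (the min == max check just below), scans the case keys to pick the one taken successor, with index 0 reserved for the default edge; every other projection is then subsumed by a DEAD node and the degenerate phis behind it are collapsed. A stand-alone sketch of just the taken-edge selection, with a hypothetical toy_fold_branch standing in for the real node/lattice machinery:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

// A branch with `succ_count` successors stores `succ_count - 1` case keys;
// successor 0 is the default. Given a known-constant key, return the index of
// the single successor that can actually be taken.
static size_t toy_fold_branch(int64_t key_const, const int64_t* keys, size_t succ_count) {
    size_t taken = 0; // default edge unless a case key matches
    for (size_t i = 0; i < succ_count - 1; i++) {
        if (key_const == keys[i]) {
            taken = i + 1;
            break;
        }
    }
    return taken;
}

int main(void) {
    int64_t keys[] = { 10, 20, 30 };                                     // default + 3 cases
    printf("key=20  -> taken succ %zu\n", toy_fold_branch(20, keys, 4)); // 2
    printf("key=999 -> taken succ %zu\n", toy_fold_branch(999, keys, 4)); // 0 (default)
    return 0;
}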
+ + if (key->tag == LATTICE_INT && key->_int.min == key->_int.max) { + int64_t key_const = key->_int.max; + size_t taken = 0; FOREACH_N(i, 0, br->succ_count - 1) { - uint64_t case_key = br->keys[i]; - if (key == case_key) { taken = i + 1; break; } + int64_t case_key = br->keys[i]; + if (key_const == case_key) { + taken = i + 1; + break; + } } - TB_Node* dead = make_dead(f, opt); + TB_Node* dead = make_dead_node(f, opt); // convert dead projections into DEAD and convert live projection into index 0 - for (User* use = find_users(opt, n); use; use = use->next) { - if (use->n->type == TB_PROJ) { - int index = TB_NODE_GET_EXTRA_T(use->n, TB_NodeProj)->index; + for (User* u = n->users; u; u = u->next) { + TB_Node* proj = u->n; + if (proj->type == TB_PROJ) { + int index = TB_NODE_GET_EXTRA_T(proj, TB_NodeProj)->index; if (index != taken) { - subsume_node(opt, f, use->n, dead); + subsume_node(opt, f, proj, dead); } else { - TB_NODE_GET_EXTRA_T(use->n, TB_NodeProj)->index = 0; - - User* proj_use = find_users(opt, use->n); - assert(proj_use->next == NULL && "control projection has conflicts?"); - assert(proj_use->n->type == TB_REGION); + TB_NODE_GET_EXTRA_T(proj, TB_NodeProj)->index = 0; + + // if we folded away from a region, then we should subsume + // the degen phis. + assert(proj->users->next == NULL); + TB_Node* succ = proj->users->n; + if (succ->type == TB_REGION) { + int phi_i = proj->users->slot; + + User* u = succ->users; + while (u != NULL) { + User* next = u->next; + if (u->n->type == TB_PHI) { + tb_pass_mark_users(opt, u->n); + subsume_node(opt, f, u->n, u->n->inputs[phi_i + 1]); + } + u = next; + } + } - br->succ_count = 1; - br->succ[0] = proj_use->n; + tb_pass_kill_node(opt, proj); + set_input(opt, succ, n->inputs[0], 0); } } } - assert(br->succ_count == 1); // remove condition - set_input(opt, n, NULL, 1); - n->input_count = 1; - return n; + return dead; } } - // check if it's a dead region - TB_Node* parent = unsafe_get_region(n); - if (parent->input_count == 0 && br->succ_count != 0) { - // remove predecessor from successors - TB_Node* dead = make_dead(f, opt); - for (User* use = find_users(opt, n); use; use = use->next) { - if (use->n->type == TB_PROJ) { - subsume_node(opt, f, use->n, dead); - } - } - - br->succ_count = 0; - return n; - }*/ - return NULL; } diff --git a/tb/src/opt/cfg.h b/tb/src/opt/cfg.h index 9d58401a..2cd0201a 100644 --- a/tb/src/opt/cfg.h +++ b/tb/src/opt/cfg.h @@ -1,79 +1,159 @@ -typedef struct { - TB_Function* f; - - size_t block_count; - TB_Node** blocks; -} DomContext; - -// we'll be walking backwards from the end node -static void postorder(Worklist* restrict ws, TB_Node* n) { - if (!worklist_test_n_set(ws, n)) { - // walk control edges (aka predecessors) - TB_NodeRegion* r = TB_NODE_GET_EXTRA(n); - if (r->end->type == TB_BRANCH) { - TB_NodeBranch* br = TB_NODE_GET_EXTRA(r->end); - FOREACH_REVERSE_N(i, 0, br->succ_count) { - postorder(ws, br->succ[i]); - } - } +void tb_free_cfg(TB_CFG* cfg) { + nl_map_for(i, cfg->node_to_block) { + nl_hashset_free(cfg->node_to_block[i].v.items); + } + nl_map_free(cfg->node_to_block); +} + +TB_CFG tb_compute_rpo(TB_Function* f, TB_Passes* p) { + return tb_compute_rpo2(f, &p->worklist, &p->stack); +} + +static TB_Node* next_control(Worklist* ws, TB_Node* n) { + // unless it's a branch (aka a terminator), it'll have one successor + TB_Node* next = NULL; + for (User* u = n->users; u; u = u->next) { + TB_Node* succ = u->n; + + // we can't treat regions in the chain + if (succ->type == TB_REGION) break; - 
dyn_array_put(ws->items, n); + // we've found the next step in control flow + if (cfg_is_control(succ) && !worklist_test_n_set(ws, succ)) { + return succ; + } } + + return NULL; } -size_t tb_push_postorder(TB_Function* f, Worklist* restrict ws) { +TB_CFG tb_compute_rpo2(TB_Function* f, Worklist* ws, DynArray(TB_Node*)* tmp_stack) { assert(dyn_array_length(ws->items) == 0); - postorder(ws, f->start_node); - return dyn_array_length(ws->items); + + TB_CFG cfg = { 0 }; + DynArray(TB_Node*) stack = *tmp_stack; + if (stack == NULL) { + stack = dyn_array_create(TB_Node*, 1024); + } + + dyn_array_put(stack, f->start_node); + worklist_test_n_set(ws, f->start_node); + + // depth-first search + int order = 0; + while (dyn_array_length(stack)) { + TB_Node* n = dyn_array_pop(stack); + + // we've spotted a BB entry + if (cfg_is_bb_entry(n)) { + // proj BB's will prefer to be REGION BB's + if (n->inputs[0]->type != TB_START && n->type == TB_PROJ && n->users->n->type == TB_REGION) { + // we've already seen this BB, let's skip it + if (worklist_test_n_set(ws, n->users->n)) { + continue; + } + + n = n->users->n; + } + + // walk until terminator + TB_Node* entry = n; + TB_BasicBlock bb = { .id = cfg.block_count++ }; + while (!cfg_is_terminator(n)) { + TB_Node* next = next_control(ws, n); + if (next == NULL) { + break; + } + n = next; + } + + // the start node always has it's dom depth filled + if (bb.id == 0) { + bb.dom = entry; + bb.dom_depth = 0; + } else { + bb.dom_depth = -1; + } + + bb.end = n; + dyn_array_put(ws->items, entry); + nl_map_put(cfg.node_to_block, entry, bb); + } + + // add successors (could be multi-way like a branch) + if (n->type == TB_BRANCH) { + size_t succ_count = TB_NODE_GET_EXTRA_T(n, TB_NodeBranch)->succ_count; + + dyn_array_put_uninit(stack, succ_count); + TB_Node** top = &stack[dyn_array_length(stack) - 1]; + + for (User* u = n->users; u; u = u->next) { + TB_Node* succ = u->n; + if (cfg_is_control(succ) && !worklist_test_n_set(ws, succ)) { + assert(succ->type == TB_PROJ); + int index = TB_NODE_GET_EXTRA_T(succ, TB_NodeProj)->index; + top[-index] = succ; + } + } + } else { + for (User* u = n->users; u; u = u->next) { + TB_Node* succ = u->n; + if (cfg_is_control(succ) && !worklist_test_n_set(ws, succ)) { + dyn_array_put(stack, succ); + } + } + } + } + + *tmp_stack = stack; + return cfg; } -static int find_traversal_index(TB_Node* n) { - assert(n->type == TB_REGION || n->type == TB_START); - assert(TB_NODE_GET_EXTRA_T(n, TB_NodeRegion)->postorder_id >= 0); - return TB_NODE_GET_EXTRA_T(n, TB_NodeRegion)->postorder_id; +static int find_traversal_index(TB_CFG* cfg, TB_Node* n) { + return nl_map_get_checked(cfg->node_to_block, n).id; } -static int try_find_traversal_index(TB_Node* n) { - assert(n->type == TB_REGION || n->type == TB_START); - return TB_NODE_GET_EXTRA_T(n, TB_NodeRegion)->postorder_id; +static int try_find_traversal_index(TB_CFG* cfg, TB_Node* n) { + ptrdiff_t search = nl_map_get(cfg->node_to_block, n); + return search >= 0 ? 
cfg->node_to_block[search].v.id : -1; } -static int resolve_dom_depth(TB_Node* bb) { - if (dom_depth(bb) >= 0) { - return dom_depth(bb); +static int resolve_dom_depth(TB_CFG* cfg, TB_Node* bb) { + if (dom_depth(cfg, bb) >= 0) { + return dom_depth(cfg, bb); } - int parent = resolve_dom_depth(idom(bb)); + int parent = resolve_dom_depth(cfg, idom(cfg, bb)); // it's one more than it's parent - TB_NODE_GET_EXTRA_T(bb, TB_NodeRegion)->dom_depth = parent + 1; + nl_map_get_checked(cfg->node_to_block, bb).dom_depth = parent + 1; return parent + 1; } -TB_DominanceFrontiers* tb_get_dominance_frontiers(TB_Function* f, size_t count, TB_Node** blocks) { - size_t stride = (count + 63) / 64; - size_t elems = stride * count; +TB_DominanceFrontiers* tb_get_dominance_frontiers(TB_Function* f, TB_Passes* restrict p, TB_CFG cfg, TB_Node** blocks) { + size_t stride = (cfg.block_count + 63) / 64; + size_t elems = stride * cfg.block_count; size_t size = sizeof(TB_DominanceFrontiers) + sizeof(uint64_t)*elems; TB_DominanceFrontiers* df = tb_platform_heap_alloc(size); memset(df, 0, size); df->stride = stride; - FOREACH_REVERSE_N(i, 0, count) { + FOREACH_N(i, 0, cfg.block_count) { TB_Node* bb = blocks[i]; - assert(TB_NODE_GET_EXTRA_T(bb, TB_NodeRegion)->postorder_id == i); + assert(find_traversal_index(&cfg, bb) == i); - if (bb->input_count >= 2) { + if (bb->type == TB_REGION && bb->input_count >= 2) { FOREACH_N(k, 0, bb->input_count) { - TB_Node* runner = unsafe_get_region(bb->inputs[k]); + TB_Node* runner = get_pred(bb, k); - while (runner->input_count > 0 && runner != idom(bb)) { + while (!(runner->type == TB_PROJ && runner->inputs[0]->type == TB_START) && runner != idom(&cfg, bb)) { // add to frontier set - TB_NodeRegion* r = TB_NODE_GET_EXTRA(runner); - tb_dommy_fronts_put(df, r->postorder_id, i); + int id = nl_map_get_checked(cfg.node_to_block, runner).id; + tb_dommy_fronts_put(df, id, i); - runner = idom(runner); + runner = idom(&cfg, runner); } } } @@ -87,68 +167,58 @@ TB_API void tb_free_dominance_frontiers(TB_DominanceFrontiers* df) { } // https://www.cs.rice.edu/~keith/EMBED/dom.pdf -void tb_compute_dominators(TB_Function* f, size_t count, TB_Node** blocks) { - DomContext ctx = { .f = f, .block_count = count, .blocks = blocks }; - - FOREACH_N(i, 0, count) { - TB_NodeRegion* r = TB_NODE_GET_EXTRA(blocks[i]); - r->dom_depth = -1; // unresolved - r->dom = NULL; - r->postorder_id = i; - } - - // entry dominates itself - TB_NodeRegion* r = TB_NODE_GET_EXTRA(f->start_node); - r->dom_depth = 0; - r->dom = f->start_node; - - // identify post order traversal order - int entry_dom = ctx.block_count - 1; +void tb_compute_dominators(TB_Function* f, TB_Passes* restrict p, TB_CFG cfg) { + tb_compute_dominators2(f, &p->worklist, cfg); +} +void tb_compute_dominators2(TB_Function* f, Worklist* ws, TB_CFG cfg) { + TB_Node** blocks = ws->items; bool changed = true; while (changed) { changed = false; // for all nodes, b, in reverse postorder (except start node) - FOREACH_REVERSE_N(i, 0, count - 1) { + FOREACH_REVERSE_N(i, 1, cfg.block_count) { TB_Node* b = blocks[i]; - TB_Node* new_idom = unsafe_get_region(b->inputs[0]); - - // for all other predecessors, p, of b - FOREACH_N(j, 1, b->input_count) { - TB_Node* p = unsafe_get_region(b->inputs[j]); - - // if doms[p] already calculated - TB_Node* idom_p = TB_NODE_GET_EXTRA_T(p, TB_NodeRegion)->dom; - if (idom_p == NULL && p->input_count > 0) { - int a = try_find_traversal_index(p); - if (a >= 0) { - int b = find_traversal_index(new_idom); - while (a != b) { - // while (finger1 < 
finger2) - // finger1 = doms[finger1] - while (a < b) { - TB_Node* d = idom(blocks[a]); - a = d ? find_traversal_index(d) : entry_dom; + TB_Node* new_idom = get_pred(b, 0); + + if (b->type == TB_REGION) { + // for all other predecessors, p, of b + FOREACH_N(j, 1, b->input_count) { + TB_Node* p = get_pred(b, j); + + // if doms[p] already calculated + TB_Node* idom_p = idom(&cfg, p); + if (idom_p == NULL && p->input_count > 0) { + int a = try_find_traversal_index(&cfg, p); + if (a >= 0) { + int b = find_traversal_index(&cfg, new_idom); + while (a != b) { + // while (finger1 > finger2) + // finger1 = doms[finger1] + while (a > b) { + TB_Node* d = idom(&cfg, blocks[a]); + a = d ? find_traversal_index(&cfg, d) : 0; + } + + // while (finger2 > finger1) + // finger2 = doms[finger2] + while (b > a) { + TB_Node* d = idom(&cfg, blocks[b]); + b = d ? find_traversal_index(&cfg, d) : 0; + } } - // while (finger2 < finger1) - // finger2 = doms[finger2] - while (b < a) { - TB_Node* d = idom(blocks[b]); - b = d ? find_traversal_index(d) : entry_dom; - } + new_idom = blocks[a]; } - - new_idom = blocks[a]; } } } assert(new_idom != NULL); - TB_NodeRegion* region_b = TB_NODE_GET_EXTRA_T(b, TB_NodeRegion); - if (region_b->dom != new_idom) { - region_b->dom = new_idom; + TB_Node** dom_ptr = &nl_map_get_checked(cfg.node_to_block, b).dom; + if (*dom_ptr != new_idom) { + *dom_ptr = new_idom; changed = true; } } @@ -156,8 +226,8 @@ void tb_compute_dominators(TB_Function* f, size_t count, TB_Node** blocks) { // generate depth values CUIK_TIMED_BLOCK("generate dom tree") { - FOREACH_N(i, 0, count - 1) { - resolve_dom_depth(blocks[i]); + FOREACH_REVERSE_N(i, 1, cfg.block_count) { + resolve_dom_depth(&cfg, blocks[i]); } } } @@ -171,9 +241,9 @@ TB_Node* tb_get_parent_region(TB_Node* n) { return n; } -bool tb_is_dominated_by(TB_Node* expected_dom, TB_Node* bb) { +bool tb_is_dominated_by(TB_CFG cfg, TB_Node* expected_dom, TB_Node* bb) { while (expected_dom != bb) { - TB_Node* new_bb = idom(bb); + TB_Node* new_bb = idom(&cfg, bb); if (bb == new_bb) { return false; } diff --git a/tb/src/opt/fold.h b/tb/src/opt/fold.h index fa2dd27e..0e645720 100644 --- a/tb/src/opt/fold.h +++ b/tb/src/opt/fold.h @@ -49,26 +49,16 @@ static bool get_int_const(TB_Node* n, uint64_t* imm) { //////////////////////////////// // Integer idealizations //////////////////////////////// -static TB_Node* ideal_truncate(TB_Passes* restrict opt, TB_Function* f, TB_Node* n) { +static TB_Node* ideal_bitcast(TB_Passes* restrict opt, TB_Function* f, TB_Node* n) { TB_Node* src = n->inputs[1]; - if (src->type != TB_INTEGER_CONST || n->dt.type != TB_INT) { - return NULL; - } - TB_NodeInt* src_i = TB_NODE_GET_EXTRA(src); - - uint64_t mask = n->dt.data == 64 ? 
UINT64_MAX : (1ull << n->dt.data) - 1; - return make_int_node(f, opt, n->dt, src_i->value & mask); -} - -static TB_Node* ideal_int2ptr(TB_Passes* restrict opt, TB_Function* f, TB_Node* n) { - TB_Node* src = n->inputs[1]; - if (src->type != TB_INTEGER_CONST) { - return NULL; + // int -> smaller int means truncate + if (src->dt.type == TB_INT && n->dt.type == TB_INT && src->dt.data > n->dt.data) { + n->type = TB_TRUNCATE; + return n; } - TB_NodeInt* src_i = TB_NODE_GET_EXTRA(src); - return make_int_node(f, opt, n->dt, src_i->value); + return NULL; } // cmp.slt(a, 0) => is_sign(a) @@ -100,6 +90,279 @@ static bool inverted_cmp(TB_Node* n, TB_Node* n2) { } } +static Lattice* dataflow_sext(TB_Passes* restrict opt, LatticeUniverse* uni, TB_Node* n) { + Lattice* a = lattice_universe_get(uni, n->inputs[1]); + int old_bits = n->inputs[1]->dt.data; + + int64_t min = tb__sxt(a->_int.min, old_bits, n->dt.data); + int64_t max = tb__sxt(a->_int.max, old_bits, n->dt.data); + uint64_t zeros = a->_int.known_zeros; + uint64_t ones = a->_int.known_ones; + + // if we know the sign bit then we can know what the extended bits look like + uint64_t mask = tb__mask(n->dt.data) & ~tb__mask(old_bits); + if (zeros >> (old_bits - 1)) { + zeros |= mask; + } else if (ones >> (old_bits - 1)) { + ones |= mask; + } + + return lattice_intern(uni, (Lattice){ LATTICE_INT, ._int = { min, max, zeros, ones } }); +} + +static Lattice* dataflow_zext(TB_Passes* restrict opt, LatticeUniverse* uni, TB_Node* n) { + Lattice* a = lattice_universe_get(uni, n->inputs[1]); + uint64_t mask = tb__mask(n->dt.data) & ~tb__mask(n->inputs[1]->dt.data); + + int64_t min = a->_int.min; + int64_t max = a->_int.max; + uint64_t zeros = a->_int.known_zeros | mask; // we know the top bits must be zero + uint64_t ones = a->_int.known_ones; + + return lattice_intern(uni, (Lattice){ LATTICE_INT, ._int = { min, max, zeros, ones } }); +} + +static Lattice* dataflow_trunc(TB_Passes* restrict opt, LatticeUniverse* uni, TB_Node* n) { + Lattice* a = lattice_universe_get(uni, n->inputs[1]); + + int64_t mask = tb__mask(n->dt.data); + int64_t min = a->_int.min & mask; + int64_t max = a->_int.max & mask; + if (min > max) { + min = lattice_int_min(n->dt.data); + max = lattice_int_max(n->dt.data); + } + + uint64_t zeros = a->_int.known_zeros | ~mask; + uint64_t ones = a->_int.known_ones & mask; + return lattice_intern(uni, (Lattice){ LATTICE_INT, ._int = { min, max, zeros, ones } }); +} + +static int64_t wrapped_int_add(int64_t x, int64_t y) { return (uint64_t)x + (uint64_t)y; } +static int64_t wrapped_int_sub(int64_t x, int64_t y) { return (uint64_t)x - (uint64_t)y; } +static int64_t wrapped_int_mul(int64_t x, int64_t y) { return (uint64_t)x * (uint64_t)y; } +static bool wrapped_int_lt(int64_t x, int64_t y, int bits) { return (int64_t)tb__sxt(x, bits, 64) < (int64_t)tb__sxt(y, bits, 64); } + +static bool sub_overflow(uint64_t x, uint64_t y, uint64_t xy, int bits) { + uint64_t v = (x ^ y) & (xy ^ x); + // check the sign bit + return (v >> (bits - 1)) & 1; +} + +static Lattice* dataflow_arith(TB_Passes* restrict opt, LatticeUniverse* uni, TB_Node* n) { + Lattice* a = lattice_universe_get(uni, n->inputs[1]); + Lattice* b = lattice_universe_get(uni, n->inputs[2]); + assert(a->tag == LATTICE_INT && b->tag == LATTICE_INT); + + int64_t mask = tb__mask(n->dt.data); + int64_t min, max; + switch (n->type) { + case TB_ADD: + min = wrapped_int_add(a->_int.min, b->_int.min); + max = wrapped_int_add(a->_int.max, b->_int.max); + break; + + case TB_SUB: + min = 
wrapped_int_sub(a->_int.min, b->_int.min); + max = wrapped_int_sub(a->_int.max, b->_int.max); + break; + + case TB_MUL: + min = wrapped_int_mul(a->_int.min, b->_int.min); + max = wrapped_int_mul(a->_int.max, b->_int.max); + break; + } + + // truncate to the size of the raw DataType + min &= mask, max &= mask; + + if (!lattice_is_const_int(a) || !lattice_is_const_int(b)) { + // if we overflow, default to the full range + if (n->type == TB_SUB) { + // subtraction does overflow check different from add or mul + if (sub_overflow(a->_int.min, b->_int.min, min, n->dt.data) || + sub_overflow(a->_int.max, b->_int.max, max, n->dt.data) + ) { + min = lattice_int_min(n->dt.data); + max = lattice_int_max(n->dt.data); + } + } else { + if (((a->_int.min & b->_int.min) < 0 && min >= 0) || + (~(a->_int.max | b->_int.max) < 0 && max < 0) || + wrapped_int_lt(max, min, n->dt.data) + ) { + min = lattice_int_min(n->dt.data); + max = lattice_int_max(n->dt.data); + } + } + } + + return lattice_intern(uni, (Lattice){ LATTICE_INT, ._int = { min, max } }); +} + +static Lattice* dataflow_int2ptr(TB_Passes* restrict opt, LatticeUniverse* uni, TB_Node* n) { + Lattice* a = lattice_universe_get(uni, n->inputs[1]); + assert(a->tag == LATTICE_INT); + + if (a->_int.min == a->_int.max) { + // int2ptr with a constant leads to fun cool stuff (usually we get constant + // zeros) + LatticeTrifecta t = a->_int.min ? LATTICE_KNOWN_NOT_NULL : LATTICE_KNOWN_NULL; + return lattice_intern(uni, (Lattice){ LATTICE_POINTER, ._ptr = { t } }); + } + + return NULL; +} + +static Lattice* dataflow_unary(TB_Passes* restrict opt, LatticeUniverse* uni, TB_Node* n) { + Lattice* a = lattice_universe_get(uni, n->inputs[1]); + if (a->tag == LATTICE_INT) { + uint64_t mask = tb__mask(n->dt.data); + uint64_t min = ~a->_int.min & mask; + uint64_t max = ~a->_int.max & mask; + + if ((int64_t)min > (int64_t)max) { + SWAP(int64_t, min, max); + } + + uint64_t zeros = 0, ones = 0; + if (n->type == TB_NEG) { + // -x => ~x + 1 + // because of this addition we can technically + // overflow... umm? glhf? 
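+                // e.g. with an 8-bit mask, ~x = [0xFF, 0xFF] (x == 0) increments to 0x00
+                // on both bounds, the wrap check below trips, and we fall back to the
+                // full range for the width.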
+ uint64_t min_inc = (min+1) & mask; + uint64_t max_inc = (max+1) & mask; + + if (min_inc < min || max_inc < min) { + min = lattice_int_min(n->dt.data); + max = lattice_int_max(n->dt.data); + } else { + min = min_inc; + max = max_inc; + } + } else { + zeros = ~a->_int.known_zeros; + ones = ~a->_int.known_ones; + } + + return lattice_intern(uni, (Lattice){ LATTICE_INT, ._int = { min, max, zeros, ones } }); + } else { + return NULL; + } +} + +static Lattice* dataflow_bits(TB_Passes* restrict opt, LatticeUniverse* uni, TB_Node* n) { + Lattice* a = lattice_universe_get(uni, n->inputs[1]); + Lattice* b = lattice_universe_get(uni, n->inputs[2]); + + uint64_t zeros, ones; + switch (n->type) { + case TB_AND: + // 0 if either is zero, 1 if both are 1 + zeros = a->_int.known_zeros | b->_int.known_zeros; + ones = a->_int.known_ones & b->_int.known_ones; + break; + + case TB_OR: + // 0 if both are 0, 1 if either is 1 + zeros = a->_int.known_zeros & b->_int.known_zeros; + ones = a->_int.known_ones | b->_int.known_ones; + break; + + case TB_XOR: + // 0 if both bits are known to be equal + // 1 if both bits are known to differ + zeros = (a->_int.known_zeros & b->_int.known_zeros) | (a->_int.known_ones & b->_int.known_ones); + ones = (a->_int.known_zeros & b->_int.known_ones) | (a->_int.known_ones & b->_int.known_zeros); + break; + + default: tb_todo(); + } + + uint64_t mask = tb__mask(n->dt.data); + zeros &= mask, ones &= mask; + + // we can deduce a min and max by assuming the unknown bits are either zeros or ones + int64_t min = ones, max = ~zeros; + if (wrapped_int_lt(max, min, n->dt.data)) { + min = lattice_int_min(n->dt.data); + max = lattice_int_max(n->dt.data); + } + min &= mask, max &= mask; + + return lattice_intern(uni, (Lattice){ LATTICE_INT, ._int = { min, max, zeros, ones } }); +} + +static Lattice* dataflow_shift(TB_Passes* restrict opt, LatticeUniverse* uni, TB_Node* n) { + Lattice* a = lattice_universe_get(uni, n->inputs[1]); + Lattice* b = lattice_universe_get(uni, n->inputs[2]); + + uint64_t bits = n->dt.data; + uint64_t mask = tb__mask(n->dt.data); + + // shift that's in-bounds can tell us quite a few nice details + if (b->_int.max <= bits) { + uint64_t min, max, zeros, ones = 0; + switch (n->type) { + case TB_SHL: + min = a->_int.min << b->_int.min; + max = a->_int.max << b->_int.max; + min &= mask, max &= mask; + + if (((a->_int.min & b->_int.min) < 0 && min >= 0) || + (~(a->_int.max | b->_int.max) < 0 && max < 0) || + wrapped_int_lt(max, min, n->dt.data) + ) { + min = lattice_int_min(n->dt.data); + max = lattice_int_max(n->dt.data); + } + + // we at least shifted this many bits therefore we + // at least have this many zeros at the bottom + zeros = (1ull << b->_int.min) - 1ull; + // if we know how many bits we shifted then we know where + // our known ones went + if (b->_int.min == b->_int.max) { + ones <<= b->_int.min; + } + break; + + case TB_SHR: + // perform shift logic as unsigned + min = a->_int.min; + max = a->_int.max; + if (min > max) { + min = 0, max = mask; + } + + // the largest value is caused by the lowest shift amount + min >>= b->_int.max; + max >>= b->_int.min; + + // convert range back into signed + if (wrapped_int_lt(max, min, n->dt.data)) { + min = lattice_int_min(n->dt.data); + max = lattice_int_max(n->dt.data); + } + + // TODO(NeGate): we can technically guarantee the top bits are zero + zeros = 0; + // if we know how many bits we shifted then we know where + // our known ones went + if (b->_int.min == b->_int.max) { + ones >>= b->_int.min; + } + break; + + default:
tb_todo(); + } + + return lattice_intern(uni, (Lattice){ LATTICE_INT, ._int = { min, max, zeros, ones } }); + } else { + return NULL; + } +} + static TB_Node* ideal_select(TB_Passes* restrict opt, TB_Function* f, TB_Node* n) { TB_Node* src = n->inputs[1]; @@ -214,21 +477,18 @@ static TB_Node* identity_extension(TB_Passes* restrict opt, TB_Function* f, TB_N } } -static TB_Node* ideal_int_unary(TB_Passes* restrict opt, TB_Function* f, TB_Node* n) { - assert(n->type == TB_NOT || n->type == TB_NEG); - TB_Node* src = n->inputs[1]; - if (src->type == TB_INTEGER_CONST) { - assert(src->dt.type == TB_INT && src->dt.data > 0); - uint64_t src_i = ~TB_NODE_GET_EXTRA_T(src, TB_NodeInt)->value; +static int node_pos(TB_Node* n) { + switch (n->type) { + case TB_PHI: + return 1; - if (n->type == TB_NEG) { - // -x => ~x + 1 - src_i += 1; - } + case TB_INTEGER_CONST: + case TB_FLOAT32_CONST: + case TB_FLOAT64_CONST: + return 2; - return make_int_node(f, opt, n->dt, src_i); - } else { - return NULL; + default: + return 3; } } @@ -236,8 +496,7 @@ static TB_Node* ideal_int_binop(TB_Passes* restrict opt, TB_Function* f, TB_Node TB_NodeTypeEnum type = n->type; if (is_commutative(type)) { // if it's commutative: we wanna have a canonical form. - // lower types to the right (constants are basically the lowest things) - if (n->inputs[1]->type < n->inputs[2]->type) { + if (node_pos(n->inputs[1]) > node_pos(n->inputs[2])) { TB_Node* tmp = n->inputs[1]; set_input(opt, n, n->inputs[2], 1); set_input(opt, n, tmp, 2); @@ -319,41 +578,7 @@ static TB_Node* ideal_int_binop(TB_Passes* restrict opt, TB_Function* f, TB_Node } } - if (a->type != TB_INTEGER_CONST || b->type != TB_INTEGER_CONST) { - return NULL; - } - - // fully fold - uint64_t ai = TB_NODE_GET_EXTRA_T(a, TB_NodeInt)->value; - uint64_t bi = TB_NODE_GET_EXTRA_T(b, TB_NodeInt)->value; - if (type >= TB_CMP_EQ && type <= TB_CMP_ULE) { - bool result = false; - switch (type) { - case TB_CMP_EQ: result = ai == bi; break; - case TB_CMP_NE: result = ai != bi; break; - case TB_CMP_ULT: result = ai < bi; break; - case TB_CMP_ULE: result = ai <= bi; break; - default: tb_unreachable(); - } - - return make_int_node(f, opt, n->dt, result); - } else if (type >= TB_AND && type <= TB_MUL) { - uint64_t dst; - switch (type) { - case TB_AND: dst = ai & bi; break; - case TB_OR: dst = ai | bi; break; - case TB_XOR: dst = ai ^ bi; break; - case TB_ADD: dst = ai + bi; break; - case TB_SUB: dst = ai - bi; break; - case TB_MUL: dst = ai * bi; break; - default: tb_unreachable(); - } - - // truncate - return make_int_node(f, opt, n->dt, dst & tb__mask(n->dt.data)); - } else { - return NULL; - } + return NULL; } static TB_Node* ideal_int_div(TB_Passes* restrict opt, TB_Function* f, TB_Node* n) { @@ -468,7 +693,7 @@ static TB_Node* identity_int_binop(TB_Passes* restrict opt, TB_Function* f, TB_N case TB_UDIV: case TB_SDIV: - return tb_inst_poison(f); + return make_poison(f, opt, n->dt); // (cmp.ne a 0) => a case TB_CMP_NE: { diff --git a/tb/src/opt/gcm.h b/tb/src/opt/gcm.h index 95dbf2f6..45e43ab9 100644 --- a/tb/src/opt/gcm.h +++ b/tb/src/opt/gcm.h @@ -1,45 +1,50 @@ // Scheduling: "Global Code Motion Global Value Numbering", Cliff Click 1995 // https://courses.cs.washington.edu/courses/cse501/06wi/reading/click-pldi95.pdf +static uint32_t node_hash(void* a) { return ((TB_Node*) a)->gvn; } +static bool node_compare(void* a, void* b) { return a == b; } //////////////////////////////// // Early scheduling //////////////////////////////// -static void schedule_early(TB_Passes* passes, TB_Node* n) 
{ +static void schedule_early(TB_Passes* p, TB_Node* n) { // already visited - if (worklist_test_n_set(&passes->worklist, n)) { + if (worklist_test_n_set(&p->worklist, n)) { return; } - // track leaf nodes - if (n->input_count <= 2) { - dyn_array_put(passes->worklist.items, n); - } + // push node, late scheduling will process this list + dyn_array_put(p->worklist.items, n); // schedule inputs first FOREACH_N(i, 0, n->input_count) if (n->inputs[i]) { - schedule_early(passes, n->inputs[i]); + schedule_early(p, n->inputs[i]); } - if (!is_pinned(n)) { - TB_Node* best = passes->f->start_node; + // schedule unpinned nodes + if (!is_pinned(n) || n->input_count == 0) { + // start at the entry point + TB_BasicBlock* best = nl_map_get_checked(p->scheduled, p->worklist.items[0]); int best_depth = 0; // choose deepest block - FOREACH_N(i, 0, n->input_count) if (n->inputs[i] && n->inputs[i]->inputs[0]) { - TB_Node* bb = unsafe_get_region(n->inputs[i]); + FOREACH_N(i, 0, n->input_count) if (n->inputs[i]) { + ptrdiff_t search = nl_map_get(p->scheduled, n->inputs[i]); + if (search < 0) { + // input has no scheduling... weird? + continue; + } - int bb_depth = TB_NODE_GET_EXTRA_T(bb, TB_NodeRegion)->dom_depth; - if (best_depth < bb_depth) { + TB_BasicBlock* bb = p->scheduled[search].v; + if (best_depth < bb->dom_depth) { + best_depth = bb->dom_depth; best = bb; - best_depth = bb_depth; } } - if (passes->f->start_node == best) { - best = passes->f->params[0]; - } + DO_IF(TB_OPTDEBUG_GCM)(printf("%s: v%u into .bb%d\n", p->f->super.name, n->gvn, best->id)); - set_input(passes, n, best, 0); + nl_hashset_put2(&best->items, n, node_hash, node_compare); + nl_map_put(p->scheduled, n, best); } } @@ -48,109 +53,158 @@ static void schedule_early(TB_Passes* passes, TB_Node* n) { //////////////////////////////// // schedule nodes such that they appear the least common // ancestor to all their users -static TB_Node* find_lca(TB_Node* a, TB_Node* b) { +static TB_BasicBlock* find_lca(TB_Passes* p, TB_BasicBlock* a, TB_BasicBlock* b) { if (a == NULL) return b; // line both up - while (dom_depth(a) > dom_depth(b)) a = idom(a); - while (dom_depth(b) > dom_depth(a)) b = idom(b); + while (a->dom_depth > b->dom_depth) a = nl_map_get_checked(p->scheduled, a->dom); + while (b->dom_depth > a->dom_depth) b = nl_map_get_checked(p->scheduled, b->dom); while (a != b) { - b = idom(b); - a = idom(a); + b = idom_bb(p, b); + a = idom_bb(p, a); } return a; } -static void schedule_late(TB_Passes* passes, TB_Node* n) { - // already visited - if (worklist_test_n_set(&passes->worklist, n)) { - return; - } +static void schedule_late(TB_Passes* p, TB_Node* n) { + // pinned nodes can't be rescheduled + if (!is_pinned(n)) { + DO_IF(TB_OPTDEBUG_GCM)(printf("%s: try late v%u\n", p->f->super.name, n->gvn)); - // schedule all users first - for (User* use = find_users(passes, n); use; use = use->next) { - schedule_late(passes, use->n); - } + // we're gonna find the least common ancestor + TB_BasicBlock* lca = NULL; + for (User* use = n->users; use; use = use->next) { + TB_Node* y = use->n; - // pinned nodes can't be rescheduled - if (is_pinned(n)) { - return; - } + ptrdiff_t search = nl_map_get(p->scheduled, y); + if (search < 0) continue; // dead - // we're gonna find the least common ancestor - TB_Node* lca = NULL; - for (User* use = find_users(passes, n); use; use = use->next) { - TB_Node* y = use->n; - if (y->inputs[0] == NULL) continue; // dead + TB_BasicBlock* use_block = p->scheduled[search].v; + if (y->type == TB_PHI) { + TB_Node* use_node = 
y->inputs[0]; + assert(use_node->type == TB_REGION); - TB_Node* use_block = tb_get_parent_region(y->inputs[0]); - if (y->type == TB_PHI) { - if (y->input_count != use_block->input_count + 1) { - tb_panic("phi has parent with mismatched predecessors"); - } + if (y->input_count != use_node->input_count + 1) { + tb_panic("phi has parent with mismatched predecessors"); + } - ptrdiff_t j = 1; - for (; j < y->input_count; j++) { - if (y->inputs[j] == n) { - break; + ptrdiff_t j = 1; + for (; j < y->input_count; j++) { + if (y->inputs[j] == n) { + break; + } } + assert(j >= 0); + + use_block = nl_map_get_checked(p->scheduled, use_node->inputs[j - 1]); } - assert(j >= 0); - use_block = get_block_begin(use_block->inputs[j - 1]); + lca = find_lca(p, lca, use_block); } - lca = find_lca(lca, use_block); - } + // tb_assert(lca, "missing least common ancestor"); + if (lca != NULL) { + TB_OPTDEBUG(GCM)( + printf(" LATE v%u into .bb%d: ", n->gvn, lca->id), + print_node_sexpr(n, 0), + printf("\n") + ); + + ptrdiff_t search = nl_map_get(p->scheduled, n); + if (search >= 0) { + // replace old + TB_BasicBlock* old = p->scheduled[search].v; + p->scheduled[search].v = lca; + nl_hashset_remove2(&old->items, n, node_hash, node_compare); + } else { + nl_map_put(p->scheduled, n, lca); + } - if (passes->f->start_node == lca) { - lca = passes->f->params[0]; + nl_hashset_put2(&lca->items, n, node_hash, node_compare); + } } - - // tb_assert(lca, "missing least common ancestor"); - set_input(passes, n, lca, 0); } -void tb_pass_schedule(TB_Passes* p) { - if (p->scheduled) { - return; +void tb_pass_schedule(TB_Passes* p, TB_CFG cfg) { + if (p->scheduled != NULL) { + nl_map_free(p->scheduled); } CUIK_TIMED_BLOCK("schedule") { Worklist* restrict ws = &p->worklist; - p->scheduled = true; + nl_map_create(p->scheduled, 256); - size_t block_count; CUIK_TIMED_BLOCK("dominators") { - worklist_clear(ws); + // jarvis pull up the dommies + tb_compute_dominators(p->f, p, cfg); + + worklist_clear_visited(ws); + FOREACH_N(i, 0, cfg.block_count) { + TB_BasicBlock* best = &nl_map_get_checked(cfg.node_to_block, ws->items[i]); + if (i == 0) { + worklist_test_n_set(ws, p->f->start_node); + nl_map_put(p->scheduled, p->f->start_node, best); + } - block_count = tb_push_postorder(p->f, ws); - tb_compute_dominators(p->f, block_count, &ws->items[0]); + best->items = nl_hashset_alloc(32); + nl_map_put(p->scheduled, ws->items[i], best); + worklist_test_n_set(ws, ws->items[i]); + } } - CUIK_TIMED_BLOCK("early schedule") { - worklist_clear_visited(ws); - FOREACH_N(i, 0, block_count) { - TB_Node* bb = ws->items[i]; - assert(bb->type == TB_START || bb->type == TB_REGION); + CUIK_TIMED_BLOCK("pinned schedule") { + FOREACH_REVERSE_N(i, 0, cfg.block_count) { + TB_Node* bb_node = ws->items[i]; + TB_BasicBlock* bb = &nl_map_get_checked(cfg.node_to_block, bb_node); + + if (i == 0) { + // schedule START node + TB_Node* start = p->f->start_node; + nl_hashset_put2(&bb->items, start, node_hash, node_compare); + nl_map_put(p->scheduled, start, bb); + } - TB_NodeRegion* r = TB_NODE_GET_EXTRA(bb); - schedule_early(p, r->end); + // schedule top of BB + nl_hashset_put2(&bb->items, bb_node, node_hash, node_compare); + nl_map_put(p->scheduled, bb_node, bb); + + TB_Node* n = bb->end; + while (n != bb_node) { + DO_IF(TB_OPTDEBUG_GCM)(printf("%s: v%u pinned to .bb%d\n", p->f->super.name, n->gvn, bb->id)); + nl_hashset_put2(&bb->items, n, node_hash, node_compare); + nl_map_put(p->scheduled, n, bb); + + // mark projections into the same block + for (User* use = n->users; 
use; use = use->next) { + TB_Node* proj = use->n; + if (proj->type == TB_PROJ) { + nl_hashset_put2(&bb->items, proj, node_hash, node_compare); + nl_map_put(p->scheduled, proj, bb); + } + } + + n = n->inputs[0]; + } + } + } + + CUIK_TIMED_BLOCK("early schedule") { + FOREACH_REVERSE_N(i, 0, cfg.block_count) { + TB_Node* end = nl_map_get_checked(cfg.node_to_block, ws->items[i]).end; + schedule_early(p, end); } } // move nodes closer to their usage site CUIK_TIMED_BLOCK("late schedule") { - worklist_clear_visited(ws); - - // schedule late on leaves - FOREACH_N(i, 0, dyn_array_length(ws->items)) { + FOREACH_REVERSE_N(i, cfg.block_count, dyn_array_length(ws->items)) { schedule_late(p, ws->items[i]); } - schedule_late(p, p->f->start_node); + worklist_clear_visited(ws); + dyn_array_set_length(ws->items, cfg.block_count); } } } diff --git a/tb/src/opt/cse.h b/tb/src/opt/gvn.h similarity index 96% rename from tb/src/opt/cse.h rename to tb/src/opt/gvn.h index 5d1e6b46..f9519ea9 100644 --- a/tb/src/opt/cse.h +++ b/tb/src/opt/gvn.h @@ -46,6 +46,8 @@ static size_t extra_bytes(TB_Node* n) { case TB_TRUNCATE: case TB_INT2PTR: case TB_PTR2INT: + case TB_UINT2FLOAT: + case TB_FLOAT2UINT: case TB_INT2FLOAT: case TB_FLOAT2INT: case TB_FLOAT_EXT: @@ -63,10 +65,13 @@ static size_t extra_bytes(TB_Node* n) { case TB_END: case TB_PROJ: case TB_PHI: + case TB_CLZ: + case TB_CTZ: case TB_VA_START: case TB_POISON: case TB_SELECT: case TB_MERGEMEM: + case TB_DEAD: return 0; case TB_START: @@ -109,7 +114,7 @@ static size_t extra_bytes(TB_Node* n) { } } -uint32_t cse_hash(void* a) { +uint32_t gvn_hash(void* a) { TB_Node* n = a; size_t extra = extra_bytes(n); @@ -130,7 +135,7 @@ uint32_t cse_hash(void* a) { return h; } -bool cse_compare(void* a, void* b) { +bool gvn_compare(void* a, void* b) { TB_Node *x = a, *y = b; // early outs @@ -222,6 +227,8 @@ bool cse_compare(void* a, void* b) { case TB_FMUL: case TB_FDIV: case TB_PHI: + case TB_CLZ: + case TB_CTZ: case TB_MERGEMEM: return true; diff --git a/tb/src/opt/lattice.h b/tb/src/opt/lattice.h index aecf9cbe..99bc6e76 100644 --- a/tb/src/opt/lattice.h +++ b/tb/src/opt/lattice.h @@ -1,58 +1,6 @@ #include -// TODO(NeGate): implement dual? from there i can do join with -// -// dual(dual(x) ^ dual(y)) = join(x, y) -typedef struct { - uint64_t bot, top; - - // for known bit analysis - uint64_t known_zeros; - uint64_t known_ones; -} LatticeInt; - -// a simplification of the set of all pointers (or floats) -typedef enum { - LATTICE_UNKNOWN, // top aka {nan, non-nan} or for pointers {null, non-null} - - LATTICE_KNOWN_NAN = 1, // {nan} - LATTICE_KNOWN_NOT_NAN, // {non-nan} - - LATTICE_KNOWN_NULL = 1, // {null} - LATTICE_KNOWN_NOT_NULL // {non-null} -} LatticeTrifecta; - -typedef struct { - LatticeTrifecta trifecta; -} LatticeFloat; - -// TODO(NeGate): we might wanna store more info like aliasing, ownership and alignment. -typedef struct { - LatticeTrifecta trifecta; -} LatticePointer; - -// Represents the fancier type system within the optimizer, it's -// all backed by my shitty understanding of lattice theory -typedef struct { - enum { - LATTICE_INT, - LATTICE_FLOAT32, - LATTICE_FLOAT64, - LATTICE_POINTER, - } tag; - uint32_t pad; - union { - LatticeInt _int; - LatticeFloat _float; - LatticePointer _ptr; - }; -} Lattice; - -// hash-consing because there's a lot of -// redundant types we might construct. 
-typedef struct { - NL_HashSet pool; -} LatticeUniverse; +static Lattice* lattice_top(LatticeUniverse* uni, TB_DataType dt); static uint32_t lattice_hash(void* a) { return tb__murmur3_32(a, sizeof(Lattice)); @@ -63,24 +11,82 @@ static bool lattice_cmp(void* a, void* b) { return aa->tag == bb->tag ? memcmp(aa, bb, sizeof(Lattice)) == 0 : false; } +static bool lattice_is_const_int(Lattice* l) { return l->_int.min == l->_int.max; } + +static void lattice_universe_map(LatticeUniverse* uni, TB_Node* n, Lattice* l) { + // reserve cap, slow path :p + if (UNLIKELY(n->gvn >= uni->type_cap)) { + size_t new_cap = tb_next_pow2(n->gvn + 16); + uni->types = tb_platform_heap_realloc(uni->types, new_cap * sizeof(Lattice*)); + + // clear new space + FOREACH_N(i, uni->type_cap, new_cap) { + uni->types[i] = NULL; + } + + uni->type_cap = new_cap; + } + + uni->types[n->gvn] = l; +} + +static Lattice* lattice_universe_get(LatticeUniverse* uni, TB_Node* n) { + // reserve cap, slow path :p + if (UNLIKELY(n->gvn >= uni->type_cap)) { + size_t new_cap = tb_next_pow2(n->gvn + 16); + uni->types = tb_platform_heap_realloc(uni->types, new_cap * sizeof(Lattice*)); + + // clear new space + FOREACH_N(i, uni->type_cap, new_cap) { + uni->types[i] = NULL; + } + + uni->type_cap = new_cap; + } + + if (uni->types[n->gvn] == NULL) { + return uni->types[n->gvn] = lattice_top(uni, n->dt); + } else { + return uni->types[n->gvn]; + } +} + +static Lattice* lattice_intern(LatticeUniverse* uni, Lattice l) { + Lattice* k = nl_hashset_get2(&uni->pool, &l, lattice_hash, lattice_cmp); + if (k != NULL) { + return k; + } + + // allocate new node + k = tb_arena_alloc(uni->arena, sizeof(Lattice)); + memcpy(k, &l, sizeof(l)); + nl_hashset_put2(&uni->pool, k, lattice_hash, lattice_cmp); + return k; +} + +static int64_t lattice_int_min(int bits) { return 1ll << (bits - 1); } +static int64_t lattice_int_max(int bits) { return (1ll << (bits - 1)) - 1; } + +// constructs a type for a CONTROL node +static Lattice* lattice_ctrl(LatticeUniverse* uni, TB_Node* dom) { + return lattice_intern(uni, (Lattice){ LATTICE_CONTROL, ._ctrl = { dom } }); +} + // maximal subset -static Lattice lattice_top(TB_DataType dt) { +static Lattice* lattice_top(LatticeUniverse* uni, TB_DataType dt) { switch (dt.type) { case TB_INT: { assert(dt.data <= 64); - uint64_t max_bits = UINT64_MAX >> dt.data; - tb_todo(); - - return (Lattice){ LATTICE_INT, ._int = { 0, max_bits } }; + return lattice_intern(uni, (Lattice){ LATTICE_INT, ._int = { lattice_int_min(dt.data), lattice_int_max(dt.data) } }); } case TB_FLOAT: { assert(dt.data == TB_FLT_32 || dt.data == TB_FLT_64); - return (Lattice){ dt.data == TB_FLT_64 ? LATTICE_FLOAT64 : LATTICE_FLOAT32, ._float = { LATTICE_UNKNOWN } }; + return lattice_intern(uni, (Lattice){ dt.data == TB_FLT_64 ? LATTICE_FLOAT64 : LATTICE_FLOAT32, ._float = { LATTICE_UNKNOWN } }); } case TB_PTR: { - return (Lattice){ LATTICE_POINTER, ._ptr = { LATTICE_UNKNOWN } }; + return lattice_intern(uni, (Lattice){ LATTICE_POINTER, ._ptr = { LATTICE_UNKNOWN } }); } default: @@ -93,7 +99,7 @@ static Lattice lattice_top(TB_DataType dt) { #define TRIFECTA_MEET(a, b) ((a).trifecta == (b).trifecta ? 
(a).trifecta : LATTICE_UNKNOWN) // generates the greatest lower bound between a and b -static Lattice lattice_meet(const Lattice* a, const Lattice* b) { +static Lattice* lattice_meet(LatticeUniverse* uni, Lattice* a, Lattice* b) { assert(a->tag == b->tag); switch (a->tag) { case LATTICE_INT: { @@ -101,24 +107,24 @@ static Lattice lattice_meet(const Lattice* a, const Lattice* b) { LatticeInt aa = a->_int; LatticeInt bb = b->_int; - LatticeInt i = { aa.bot, aa.top }; - if (i.bot > bb.bot) i.bot = bb.bot; - if (i.top < bb.top) i.top = bb.top; + LatticeInt i = { aa.min, aa.max }; + if (i.min > bb.min) i.min = bb.min; + if (i.max < bb.max) i.max = bb.max; i.known_zeros = aa.known_zeros & bb.known_zeros; i.known_ones = aa.known_ones & bb.known_ones; - return (Lattice){ LATTICE_INT, ._int = i }; + return lattice_intern(uni, (Lattice){ LATTICE_INT, ._int = i }); } case LATTICE_FLOAT32: case LATTICE_FLOAT64: { LatticeFloat f = { .trifecta = TRIFECTA_MEET(a->_float, b->_float) }; - return (Lattice){ a->tag, ._float = f }; + return lattice_intern(uni, (Lattice){ a->tag, ._float = f }); } case LATTICE_POINTER: { LatticePointer p = { .trifecta = TRIFECTA_MEET(a->_ptr, b->_ptr) }; - return (Lattice){ LATTICE_POINTER, ._ptr = p }; + return lattice_intern(uni, (Lattice){ LATTICE_POINTER, ._ptr = p }); } default: tb_todo(); diff --git a/tb/src/opt/mem2reg.h b/tb/src/opt/mem2reg.h index 109f19db..068038e0 100644 --- a/tb/src/opt/mem2reg.h +++ b/tb/src/opt/mem2reg.h @@ -23,7 +23,7 @@ typedef struct Mem2Reg_Ctx { TB_Function* f; TB_Passes* p; - size_t block_count; + TB_CFG cfg; TB_Node** blocks; // Stack slots we're going to convert into @@ -61,7 +61,7 @@ static TB_Node* new_phi(Mem2Reg_Ctx* restrict c, TB_Function* f, int var, TB_Nod break; }*/ - DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("%p: insert new PHI node (in %p)", n, block)); + DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("v%u: insert new PHI node (in v%u)", n->gvn, block->gvn)); tb_pass_mark(c->p, n); return n; } @@ -79,7 +79,7 @@ static void add_phi_operand(Mem2Reg_Ctx* restrict c, TB_Function* f, TB_Node* ph assert(phi_node->type == TB_PHI); TB_Node* phi_region = phi_node->inputs[0]; - DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("%p: adding %p to PHI", phi_node, node)); + DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("v%u: adding v%u to PHI", phi_node->gvn, node->gvn)); // the slot to fill is based on the predecessor list of the region FOREACH_N(i, 0, phi_region->input_count) { @@ -145,70 +145,9 @@ static bool is_effect_tuple(TB_Node* n) { n->type == TB_MACHINE_OP; } -static void ssa_rename_node(Mem2Reg_Ctx* c, TB_Node* bb, TB_Node* n, DynArray(TB_Node*)* stack) { - TB_Node* parent = (n->type == TB_PROJ ? n->inputs[0] : n); - if (parent->input_count >= 2 && unsafe_get_region(parent->inputs[1]) == bb) { - assert(parent->inputs[1]->dt.type == TB_MEMORY); - ssa_rename_node(c, bb, parent->inputs[1], stack); - } - - // find promoted stack slots - bool kill = false; - if (n->type == TB_STORE) { - int var = get_variable_id(c, n->inputs[2]); - if (var >= 0) { - // push new store value onto the stack - dyn_array_put(stack[var], n->inputs[3]); - kill = true; - } - } - - // check for any loads and replace them - for (User* u = find_users(c->p, n); u; u = u->next) { - TB_Node* use = u->n; - - if (u->slot == 1 && use->type == TB_LOAD) { - int var = get_variable_id(c, use->inputs[2]); - if (var >= 0) { - TB_Node* val; - if (dyn_array_length(stack[var]) == 0) { - // this is UB since it implies we've read before initializing the - // stack slot. 
- val = make_poison(c->f, c->p, TB_TYPE_VOID); - log_warn("%p: found load-before-init in mem2reg, this is UB", use); - } else { - val = stack[var][dyn_array_length(stack[var]) - 1]; - } - - // make sure it's the right type - if (use->dt.raw != val->dt.raw) { - TB_Node* cast = tb_alloc_node(c->f, TB_BITCAST, use->dt, 2, 0); - tb_pass_mark(c->p, cast); - set_input(c->p, cast, val, 1); - - val = cast; - } - - tb_pass_mark_users(c->p, use); - set_input(c->p, use, NULL, 1); // unlink first - subsume_node(c->p, c->f, use, val); - } - } - } - - // we can remove the effect now - if (kill) { - // log_info("%p: pass to %p", n, n->inputs[0]); - TB_Node* into = n->inputs[1]; - tb_pass_mark(c->p, into); - tb_pass_mark(c->p, n); - set_input(c->p, n, NULL, 1); - subsume_node(c->p, c->f, n, into); - } -} - static void ssa_rename(Mem2Reg_Ctx* c, TB_Function* f, TB_Node* bb, DynArray(TB_Node*)* stack) { assert(bb); + TB_Passes* p = c->p; // push phi nodes size_t* old_len = tb_tls_push(c->tls, sizeof(size_t) * c->to_promote_count); @@ -222,28 +161,97 @@ static void ssa_rename(Mem2Reg_Ctx* c, TB_Function* f, TB_Node* bb, DynArray(TB_ } // rewrite operations - TB_NodeRegion* r = TB_NODE_GET_EXTRA(bb); - TB_Node* end = r->end; + TB_BasicBlock* bb_info = &nl_map_get_checked(c->cfg.node_to_block, bb); + TB_Node* end = bb_info->end; + + tb_pass_mark(p, bb); + tb_pass_mark_users(p, bb); - tb_pass_mark(c->p, bb); - tb_pass_mark_users(c->p, bb); + DO_IF(TB_OPTDEBUG_MEM2REG)( + printf(" FORST %u: ", bb->gvn), + print_node_sexpr(bb, 0), + printf("\n") + ); // go through all uses and replace their accessors - if (r->mem_out) { - ssa_rename_node(c, bb, r->mem_out, stack); + TB_Node* n = bb_info->mem_in; + if (n != NULL) { + do { + DO_IF(TB_OPTDEBUG_MEM2REG)( + printf(" SIGMA %u: ", n->gvn), + print_node_sexpr(n, 0), + printf("\n") + ); + + // if we spot a store, we push to the stack + bool kill = false; + if (n->type == TB_STORE) { + int var = get_variable_id(c, n->inputs[2]); + if (var >= 0) { + // push new store value onto the stack + dyn_array_put(stack[var], n->inputs[3]); + kill = true; + } + } + + // check for any loads and replace them + for (User* u = n->users; u; u = u->next) { + TB_Node* use = u->n; + + if (u->slot == 1 && use->type == TB_LOAD) { + int var = get_variable_id(c, use->inputs[2]); + if (var >= 0) { + TB_Node* val; + if (dyn_array_length(stack[var]) == 0) { + // this is UB since it implies we've read before initializing the + // stack slot. + val = make_poison(f, p, use->dt); + log_warn("v%u: found load-before-init in mem2reg, this is UB", use->gvn); + } else { + val = stack[var][dyn_array_length(stack[var]) - 1]; + } + + // make sure it's the right type + if (use->dt.raw != val->dt.raw) { + TB_Node* cast = tb_alloc_node(c->f, TB_BITCAST, use->dt, 2, 0); + tb_pass_mark(c->p, cast); + set_input(c->p, cast, val, 1); + + val = cast; + } + + tb_pass_mark_users(p, use); + set_input(p, use, NULL, 1); // unlink first + subsume_node(p, f, use, val); + } + } + } + + // next memory has to be decided before we kill the node since + // murder will dettach the users. 
+ TB_Node* next = mem_user(p, n, 1); + + // we can remove the effect now + if (kill) { + TB_Node* into = n->inputs[1]; + tb_pass_mark(c->p, into); + tb_pass_mark(c->p, n); + set_input(p, n, NULL, 1); + subsume_node(p, c->f, n, into); + } + + n = next; + } while (n != NULL && get_block_begin(n) == bb); } // replace phi arguments on successor if (end != NULL) { - if (end->type == TB_NULL || end->type == TB_END || end->type == TB_TRAP || end->type == TB_UNREACHABLE) { - /* RET can't do shit in this context */ - } else if (end->type == TB_BRANCH) { - TB_NodeBranch* br_info = TB_NODE_GET_EXTRA(end); - FOREACH_N(i, 0, br_info->succ_count) { - ssa_replace_phi_arg(c, f, bb, br_info->succ[i], stack); + // fill successors + for (User* u = end->users; u; u = u->next) { + if (cfg_is_control(u->n)) { + TB_Node* succ = cfg_next_region_control(u->n); + ssa_replace_phi_arg(c, f, bb, succ, stack); } - } else { - tb_todo(); } } @@ -252,9 +260,9 @@ static void ssa_rename(Mem2Reg_Ctx* c, TB_Function* f, TB_Node* bb, DynArray(TB_ // // TODO(NeGate): maybe we want a data structure for this because it'll // be "kinda" slow. - FOREACH_N(i, 0, c->block_count) { + FOREACH_N(i, 0, c->cfg.block_count) { TB_Node* k = c->blocks[i]; - TB_Node* v = idom(k); + TB_Node* v = idom(&c->cfg, k); if (v == bb && k != bb) { ssa_rename(c, f, k, stack); @@ -267,57 +275,30 @@ static void ssa_rename(Mem2Reg_Ctx* c, TB_Function* f, TB_Node* bb, DynArray(TB_ tb_tls_restore(c->tls, old_len); } -typedef struct { - TB_Node* old_n; - - int64_t offset; - TB_CharUnits size; - TB_DataType dt; -} AggregateConfig; - -static ptrdiff_t find_config(size_t config_count, AggregateConfig* configs, int64_t offset) { - FOREACH_N(i, 0, config_count) { - if (configs[i].offset == offset) return i; - } - - tb_unreachable(); - return -1; -} - -// -1 is a bad match -// -2 is no match, so we can add a new config -static ptrdiff_t compatible_with_configs(size_t config_count, AggregateConfig* configs, int64_t offset, TB_CharUnits size, TB_DataType dt) { - int64_t max = offset + size; - - FOREACH_N(i, 0, config_count) { - int64_t max2 = configs[i].offset + configs[i].size; - - if (offset >= configs[i].offset && max <= max2) { - // they overlap... but is it a clean overlap? - if (offset == configs[i].offset && max == max2 && TB_DATA_TYPE_EQUALS(dt, configs[i].dt)) { - return i; +static void insert_phis(Mem2Reg_Ctx* restrict ctx, TB_Node* bb, TB_Node* n) { + DO_IF(TB_OPTDEBUG_MEM2REG)( + printf(" FORST %u: ", bb->gvn), + print_node_sexpr(bb, 0), + printf("\n") + ); + + do { + DO_IF(TB_OPTDEBUG_MEM2REG)( + printf(" OMEGA %u: ", n->gvn), + print_node_sexpr(n, 0), + printf("\n") + ); + + if (n->type == TB_STORE) { + int var = get_variable_id(ctx, n->inputs[2]); + if (var >= 0) { + write_variable(ctx, var, bb, n->inputs[3]); } - - return -1; } - } - return -2; -} - -static void insert_phis(Mem2Reg_Ctx* restrict ctx, TB_Node* bb, TB_Node* n) { - TB_Node* parent = (n->type == TB_PROJ ? 
n->inputs[0] : n); - if (parent->input_count >= 2 && unsafe_get_region(parent->inputs[1]) == bb) { - assert(parent->inputs[1]->dt.type == TB_MEMORY); - insert_phis(ctx, bb, parent->inputs[1]); - } - - if (n->type == TB_STORE) { - int var = get_variable_id(ctx, n->inputs[2]); - if (var >= 0) { - write_variable(ctx, var, bb, n->inputs[3]); - } - } + // next memory + n = mem_user(ctx->p, n, 1); + } while (n != NULL && get_block_begin(n) == bb); } bool tb_pass_mem2reg(TB_Passes* p) { @@ -329,7 +310,6 @@ bool tb_pass_mem2reg(TB_Passes* p) { //////////////////////////////// size_t to_promote_count = 0; TB_Node** to_promote = tb_tls_push(tls, sizeof(TB_Node*) * dyn_array_length(p->locals)); - dyn_array_for(i, p->locals) { TB_Node* n = p->locals[i]; @@ -338,28 +318,26 @@ bool tb_pass_mem2reg(TB_Passes* p) { switch (coherence) { case COHERENCY_GOOD: { - tb_tls_push(tls, sizeof(TB_Node*)); to_promote[to_promote_count++] = n; - n->dt = dt; - DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("%s: %p promoting to IR register", f->super.name, n)); + DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("%s: v%u promoting to IR register", f->super.name, n->gvn)); break; } case COHERENCY_UNINITIALIZED: { - DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("%s: %p could not mem2reg (uninitialized)", f->super.name, n)); + DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("%s: v%u could not mem2reg (uninitialized)", f->super.name, n->gvn)); break; } case COHERENCY_VOLATILE: { - DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("%s: %p could not mem2reg (volatile load/store)", f->super.name, n)); + DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("%s: v%u could not mem2reg (volatile load/store)", f->super.name, n->gvn)); break; } case COHERENCY_USES_ADDRESS: { - DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("%s: %p could not mem2reg (uses address directly)", f->super.name)); + DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("%s: v%u could not mem2reg (uses address directly)", f->super.name, n->gvn)); break; } case COHERENCY_BAD_DATA_TYPE: { - DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("%s: %p could not mem2reg (data type is too inconsistent)", f->super.name, n)); + DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("%s: v%u could not mem2reg (data type is too inconsistent)", f->super.name, n->gvn)); break; } default: tb_todo(); @@ -382,50 +360,70 @@ bool tb_pass_mem2reg(TB_Passes* p) { c.defs = tb_tls_push(c.tls, to_promote_count * sizeof(Mem2Reg_Def)); memset(c.defs, 0, to_promote_count * sizeof(Mem2Reg_Def)); - c.block_count = tb_push_postorder(f, &p->worklist); + c.cfg = tb_compute_rpo(f, p); c.blocks = &p->worklist.items[0]; - tb_compute_dominators(f, c.block_count, p->worklist.items); + tb_compute_dominators(f, p, c.cfg); - TB_DominanceFrontiers* df = tb_get_dominance_frontiers(f, c.block_count, c.blocks); + TB_DominanceFrontiers* df = tb_get_dominance_frontiers(f, p, c.cfg, c.blocks); //////////////////////////////// // Phase 1: Insert phi functions //////////////////////////////// // Identify the final value of all the variables in the function per basic block - FOREACH_REVERSE_N(i, 0, c.block_count) { - TB_Node* end = TB_NODE_GET_EXTRA_T(c.blocks[i], TB_NodeRegion)->end; + FOREACH_N(i, 0, c.cfg.block_count) { + TB_Node* bb = c.blocks[i]; + TB_BasicBlock* bb_info = &nl_map_get_checked(c.cfg.node_to_block, bb); + + if (i == 0) { + // start block can use the input memory as the earliest point + insert_phis(&c, bb, f->params[1]); + bb_info->mem_in = f->params[1]; + continue; + } + + TB_Node* end = bb_info->end; - TB_Node* ctrl = end->inputs[0]; - TB_Node* latest_mem = NULL; + // find memory phi + TB_Node* n = bb; + TB_Node* 
mem = NULL; do { - latest_mem = mem_user(p, ctrl, 0); - ctrl = ctrl->inputs[0]; - } while (latest_mem == NULL && ctrl->type != TB_START && ctrl->type != TB_REGION); - - if (latest_mem) { - for (;;) { - TB_Node* next = mem_user(p, latest_mem, 1); - if (next == NULL || next->inputs[0] != latest_mem->inputs[0]) break; - latest_mem = next; + for (User* u = n->users; u; u = u->next) { + if (is_mem_out_op(u->n)) { + mem = u->n; + goto done; + } + } + + n = cfg_next_control(n); + } while (n != NULL && n != end); + + done: + // find earliest memory in the BB: + // note this doesn't account for multiple memory streams + // but that's fine for now... + if (mem) { + while (mem->inputs[1]->inputs[0]->type != TB_START && get_block_begin(mem->inputs[1]->inputs[0]) == bb) { + mem = mem->inputs[1]; } - insert_phis(&c, c.blocks[i], latest_mem); + insert_phis(&c, bb, mem); } - TB_NODE_GET_EXTRA_T(c.blocks[i], TB_NodeRegion)->mem_out = latest_mem; + + bb_info->mem_in = mem; } // for each global name we'll insert phi nodes - TB_Node** phi_p = tb_tls_push(tls, c.block_count * sizeof(TB_Node*)); + TB_Node** phi_p = tb_tls_push(tls, c.cfg.block_count * sizeof(TB_Node*)); - NL_HashSet ever_worked = nl_hashset_alloc(c.block_count); - NL_HashSet has_already = nl_hashset_alloc(c.block_count); + NL_HashSet ever_worked = nl_hashset_alloc(c.cfg.block_count); + NL_HashSet has_already = nl_hashset_alloc(c.cfg.block_count); FOREACH_N(var, 0, c.to_promote_count) { nl_hashset_clear(&ever_worked); nl_hashset_clear(&has_already); size_t p_count = 0; - FOREACH_REVERSE_N(i, 0, c.block_count) { + FOREACH_N(i, 0, c.cfg.block_count) { TB_Node* bb = c.blocks[i]; ptrdiff_t search = nl_map_get(c.defs[var], bb); @@ -444,7 +442,7 @@ bool tb_pass_mem2reg(TB_Passes* p) { TB_DataType dt = value->dt; // for all DFs of BB, insert PHI - int bb_id = TB_NODE_GET_EXTRA_T(bb, TB_NodeRegion)->postorder_id; + int bb_id = nl_map_get_checked(c.cfg.node_to_block, bb).id; uint64_t* frontier = &df->arr[bb_id * df->stride]; FOREACH_N(j, 0, df->stride) FOREACH_BIT(k, j*64, frontier[j]) { TB_Node* l = c.blocks[k]; @@ -488,15 +486,18 @@ bool tb_pass_mem2reg(TB_Passes* p) { stack[var] = dyn_array_create(TB_Node*, 16); } - ssa_rename(&c, f, f->start_node, stack); + ssa_rename(&c, f, c.blocks[0], stack); + // tb_function_print(f, tb_default_print_callback, stdout); // don't need these anymore FOREACH_N(var, 0, c.to_promote_count) { + assert(c.to_promote[var]->users == NULL); tb_pass_kill_node(c.p, c.to_promote[var]); } tb_tls_restore(tls, to_promote); + tb_free_cfg(&c.cfg); cuikperf_region_end(); return true; @@ -505,100 +506,6 @@ bool tb_pass_mem2reg(TB_Passes* p) { return false; } -static bool sane_writer(TB_Node* n) { - return n->type == TB_STORE || n->type == TB_MEMCPY || n->type == TB_MEMSET; -} - -// false means failure to SROA -static bool add_configs(TB_Passes* p, TB_TemporaryStorage* tls, User* use, TB_Node* base_address, size_t base_offset, size_t* config_count, AggregateConfig* configs, int pointer_size) { - for (; use; use = use->next) { - TB_Node* n = use->n; - - if (n->type == TB_MEMBER_ACCESS && use->slot == 1) { - // same rules, different offset - int64_t offset = TB_NODE_GET_EXTRA_T(n, TB_NodeMember)->offset; - if (!add_configs(p, tls, find_users(p, n), base_address, base_offset + offset, config_count, configs, pointer_size)) { - return false; - } - continue; - } - - // we can only SROA if we know we're not using the - // address for anything but direct memory ops or TB_MEMBERs. 
- if (use->slot != 2) { - return false; - } - - // find direct memory op - if (n->type != TB_LOAD && n->type != TB_STORE) { - return false; - } - - TB_DataType dt = n->type == TB_LOAD ? n->dt : n->inputs[3]->dt; - TB_Node* address = n->inputs[2]; - int size = (bits_in_data_type(pointer_size, dt) + 7) / 8; - - // see if it's a compatible configuration - int match = compatible_with_configs(*config_count, configs, base_offset, size, dt); - if (match == -1) { - return false; - } else if (match == -2) { - // add new config - tb_tls_push(tls, sizeof(AggregateConfig)); - configs[(*config_count)++] = (AggregateConfig){ address, base_offset, size, dt }; - } else if (configs[match].old_n != address) { - log_warn("%s: %p SROA config matches but reaches so via a different node, please idealize nodes before mem2reg", p->f->super.name, address); - return false; - } - } - - return true; -} - -void tb_pass_sroa(TB_Passes* p) { - cuikperf_region_start("sroa", NULL); - verify_tmp_arena(p); - - TB_Function* f = p->f; - TB_TemporaryStorage* tls = tb_tls_steal(); - int pointer_size = tb__find_code_generator(f->super.module)->pointer_size; - - for (size_t i = dyn_array_length(p->locals); i--;) retry: { - TB_Node* address = p->locals[i]; - void* mark = tb_tls_push(tls, 0); - - size_t config_count = 0; - AggregateConfig* configs = tb_tls_push(tls, 0); - if (!add_configs(p, tls, find_users(p, address), address, 0, &config_count, configs, pointer_size)) { - TB_NODE_GET_EXTRA_T(address, TB_NodeLocal)->alias_index = 0; - continue; - } - - // split allocation into pieces - if (config_count > 1) { - DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("%s: v%zu was able to SROA into %zu pieces", f->super.name, address->gvn, config_count)); - - uint32_t alignment = TB_NODE_GET_EXTRA_T(address, TB_NodeLocal)->align; - FOREACH_N(i, 0, config_count) { - TB_Node* new_n = tb_alloc_node(f, TB_LOCAL, TB_TYPE_PTR, 1, sizeof(TB_NodeLocal)); - set_input(p, new_n, f->start_node, 0); - TB_NODE_SET_EXTRA(new_n, TB_NodeLocal, .size = configs[i].size, .align = alignment); - - // mark all users, there may be some fun new opts now - tb_pass_mark_users(p, configs[i].old_n); - - // replace old pointer with new fancy - subsume_node(p, f, configs[i].old_n, new_n); - dyn_array_put(p->locals, new_n); - } - tb_tls_restore(tls, mark); - goto retry; // retry but don't go the next int - } - } - - cuikperf_region_end(); -} - // NOTE(NeGate): a stack slot is coherent when all loads and stores share // the same type and alignment along with not needing any address usage. 
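// (i.e. every use of the slot's address is a direct TB_LOAD/TB_STORE of one
// consistent data type; pointer arithmetic or any other escape of the address
// reports COHERENCY_USES_ADDRESS and blocks promotion)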
static Coherency tb_get_stack_slot_coherency(TB_Passes* p, TB_Function* f, TB_Node* address, TB_DataType* out_dt) { @@ -634,7 +541,7 @@ static Coherency tb_get_stack_slot_coherency(TB_Passes* p, TB_Function* f, TB_No dt_bits = bits; } } else { - DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("%p uses pointer arithmatic (%s)", address, tb_node_get_name(n))); + DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("v%u uses pointer arithmetic (%s)", address->gvn, tb_node_get_name(n))); return COHERENCY_USES_ADDRESS; } } diff --git a/tb/src/opt/mem_opt.h b/tb/src/opt/mem_opt.h index 9acdfc3d..d56d67cf 100644 --- a/tb/src/opt/mem_opt.h +++ b/tb/src/opt/mem_opt.h @@ -6,6 +6,16 @@ typedef struct { int64_t offset; } KnownPointer; +static bool is_local_ptr(TB_Node* n) { + // skip past ptr arith + retry: { + if (n->type == TB_MEMBER_ACCESS) { n = n->inputs[1]; goto retry; } + if (n->type == TB_ARRAY_ACCESS) { n = n->inputs[1]; goto retry; } + } + + return n->type == TB_LOCAL; +} + static KnownPointer known_pointer(TB_Node* n) { if (n->type == TB_MEMBER_ACCESS) { return (KnownPointer){ n->inputs[1], TB_NODE_GET_EXTRA_T(n, TB_NodeMember)->offset }; @@ -71,61 +81,50 @@ static TB_Node* ideal_load(TB_Passes* restrict p, TB_Function* f, TB_Node* n) { TB_Node* mem = n->inputs[1]; TB_Node* addr = n->inputs[2]; if (n->inputs[0] != NULL) { - TB_Node* base = addr; - while (base->type == TB_MEMBER_ACCESS || base->type == TB_ARRAY_ACCESS) { - base = base->inputs[1]; - } - - // loads based on LOCALs don't need control-dependence, it's actually kinda annoying - if (base->type == TB_LOCAL) { + // we're dependent on code which must always be run (START.mem) + if (n->inputs[0]->type == TB_PROJ && n->inputs[0]->inputs[0]->type == TB_START) { set_input(p, n, NULL, 0); return n; + } else { + TB_Node* base = addr; + while (base->type == TB_MEMBER_ACCESS || base->type == TB_ARRAY_ACCESS) { + base = base->inputs[1]; + } + + // loads based on LOCALs don't need control-dependence, it's actually kinda annoying + if (base->type == TB_LOCAL) { + set_input(p, n, NULL, 0); + return n; + } } } // if LOAD has already been safely accessed we can relax our control dependency - if (n->inputs[0] != NULL && n->inputs[0]->type == TB_REGION && n->inputs[0]->input_count == 1) { - TB_Node* parent_bb = get_block_begin(n->inputs[0]->inputs[0]); - - for (User* u = find_users(p, parent_bb); u; u = u->next) { + if (n->inputs[0] != NULL) { + TB_Node* parent_bb = get_block_begin(n->inputs[0]); + for (User* u = addr->users; u; u = u->next) { TB_Node* use = u->n; - if (use != n && use->type == TB_LOAD && use->inputs[2] == addr) { - tb_pass_mark_users(p, get_block_begin(n->inputs[0])); + if (use != n && use->type == TB_LOAD && u->slot == 2) { + // if the other load has no control deps we don't need any + // either... if they're the same type (really it just needs + // to read the same bytes or less) + if (use->dt.raw == n->dt.raw) { + set_input(p, n, NULL, 0); + return n; + } - set_input(p, n, use->inputs[0], 0); - return n; + // if we're dominated by some previous load then we can inherit + // its control dep. + TB_Node* bb = get_block_begin(use->inputs[0]); + if (lattice_dommy(&p->universe, bb, parent_bb)) { + set_input(p, n, use->inputs[0], 0); + return n; + } } } } return NULL; - - // loads based on PHIs may be reduced into data PHIs - /*if (n->inputs[1]->type == TB_PHI) { - return data_phi_from_memory_phi(p, f, n->dt, n->inputs[1], addr, NULL); - }*/ - - // if a load is control dependent on a store and it doesn't alias we can move the - // dependency up a bit.
- /*if (n->inputs[1]->type != TB_STORE) return NULL; - - KnownPointer ld_ptr = known_pointer(n->inputs[2]); - KnownPointer st_ptr = known_pointer(n->inputs[1]->inputs[2]); - if (ld_ptr.base != st_ptr.base) return NULL; - - // it's probably not the fastest way to grab this value ngl... - ICodeGen* cg = tb__find_code_generator(f->super.module); - ld_ptr.offset *= cg->minimum_addressable_size; - st_ptr.offset *= cg->minimum_addressable_size; - - size_t loaded_end = ld_ptr.offset + bits_in_data_type(cg->pointer_size, n->dt); - size_t stored_end = st_ptr.offset + bits_in_data_type(cg->pointer_size, n->inputs[0]->inputs[2]->dt); - - // both bases match so if the effective ranges don't intersect, they don't alias. - if (ld_ptr.offset <= stored_end && st_ptr.offset <= loaded_end) return NULL; - - set_input(p, n, n->inputs[1]->inputs[1], 1); - return n;*/ } static TB_Node* identity_load(TB_Passes* restrict p, TB_Function* f, TB_Node* n) { @@ -160,9 +159,15 @@ static TB_Node* ideal_store(TB_Passes* restrict p, TB_Function* f, TB_Node* n) { } static TB_Node* ideal_end(TB_Passes* restrict p, TB_Function* f, TB_Node* n) { + // remove dead local store + if (n->inputs[1]->type == TB_STORE && is_local_ptr(n->inputs[1]->inputs[2])) { + set_input(p, n, n->inputs[1]->inputs[1], 1); + return n; + } + return NULL; } static TB_Node* ideal_memcpy(TB_Passes* restrict p, TB_Function* f, TB_Node* n) { return NULL; -} +} \ No newline at end of file diff --git a/tb/src/opt/optimizer.c b/tb/src/opt/optimizer.c index 43a8c68f..b67ff5ab 100644 --- a/tb/src/opt/optimizer.c +++ b/tb/src/opt/optimizer.c @@ -8,7 +8,7 @@ // set_input(opt, n, in, slot) // basically `n->inputs[slot] = in` except it correctly updates the user set // -// # Implement peepholes +// # How to implement peepholes // TODO // #include "../passes.h" @@ -31,12 +31,13 @@ static void subsume_node(TB_Passes* restrict p, TB_Function* f, TB_Node* n, TB_N static TB_Node* clone_node(TB_Passes* restrict p, TB_Function* f, TB_Node* region, TB_Node* n, bool* new_node); // node creation helpers -TB_Node* make_dead(TB_Function* f, TB_Passes* restrict p); TB_Node* make_poison(TB_Function* f, TB_Passes* restrict p, TB_DataType dt); TB_Node* make_int_node(TB_Function* f, TB_Passes* restrict p, TB_DataType dt, uint64_t x); +TB_Node* make_dead_node(TB_Function* f, TB_Passes* restrict p); TB_Node* make_proj_node(TB_Function* f, TB_Passes* restrict p, TB_DataType dt, TB_Node* src, int i); static bool remove_pred(TB_Passes* restrict p, TB_Function* f, TB_Node* src, TB_Node* dst); +static bool lattice_dommy(LatticeUniverse* uni, TB_Node* expected_dom, TB_Node* bb); //////////////////////////////// // Worklist @@ -68,6 +69,14 @@ void worklist_clear(Worklist* restrict ws) { } } +void worklist_remove(Worklist* restrict ws, TB_Node* n) { + uint64_t gvn_word = n->gvn / 64; // which word this ID is at + if (gvn_word >= ws->visited_cap) return; + + uint64_t gvn_mask = 1ull << (n->gvn % 64); + ws->visited[gvn_word] &= ~gvn_mask; +} + // checks if node is visited but doesn't push item bool worklist_test(Worklist* restrict ws, TB_Node* n) { uint64_t gvn_word = n->gvn / 64; // which word this ID is at @@ -172,7 +181,10 @@ static char* lil_name(TB_Function* f, const char* fmt, ...) 
{ static TB_Node* mem_user(TB_Passes* restrict p, TB_Node* n, int slot) { for (User* u = find_users(p, n); u; u = u->next) { - if (u->slot == slot && is_mem_out_op(u->n)) return u->n; + if ((u->n->type == TB_PROJ && u->n->dt.type == TB_MEMORY) || + (u->slot == slot && is_mem_out_op(u->n))) { + return u->n; + } } return NULL; @@ -185,7 +197,7 @@ static TB_Node* single_user(TB_Passes* restrict p, TB_Node* n) { } static bool single_use(TB_Passes* restrict p, TB_Node* n) { - return find_users(p, n)->next == NULL; + return n->users->next == NULL; } static bool is_same_align(TB_Node* a, TB_Node* b) { @@ -196,7 +208,7 @@ static bool is_same_align(TB_Node* a, TB_Node* b) { static bool is_empty_bb(TB_Passes* restrict p, TB_Node* end) { assert(end->type == TB_BRANCH || end->type == TB_UNREACHABLE); - if (!is_block_begin(end->inputs[0])) { + if (!cfg_is_bb_entry(end->inputs[0])) { return false; } @@ -221,10 +233,11 @@ static bool is_if_branch(TB_Node* n, uint64_t* falsey) { // unity build with all the passes #include "lattice.h" #include "cfg.h" -#include "cse.h" +#include "gvn.h" #include "dce.h" #include "fold.h" #include "mem_opt.h" +#include "sroa.h" #include "loop.h" #include "branches.h" #include "print.h" @@ -233,9 +246,24 @@ static bool is_if_branch(TB_Node* n, uint64_t* falsey) { #include "libcalls.h" #include "scheduler.h" +static bool lattice_dommy(LatticeUniverse* uni, TB_Node* expected_dom, TB_Node* bb) { + while (bb != NULL && expected_dom != bb) { + Lattice* l = lattice_universe_get(uni, bb); + assert(l->tag == LATTICE_CONTROL); + + TB_Node* new_bb = l->_ctrl.idom; + if (bb == new_bb) { + return false; + } + bb = new_bb; + } + + return true; +} + static TB_Node* gvn(TB_Passes* restrict p, TB_Node* n, size_t extra) { // try CSE, if we succeed, just delete the node and use the old copy - TB_Node* k = nl_hashset_put2(&p->cse_nodes, n, cse_hash, cse_compare); + TB_Node* k = nl_hashset_put2(&p->gvn_nodes, n, gvn_hash, gvn_compare); if (k != NULL) { // try free tb_arena_free(p->f->arena, n->inputs, sizeof(TB_Node*)); @@ -250,10 +278,21 @@ TB_Node* make_poison(TB_Function* f, TB_Passes* restrict p, TB_DataType dt) { return gvn(p, tb_alloc_node(f, TB_POISON, dt, 1, 0), 0); } +TB_Node* make_dead_node(TB_Function* f, TB_Passes* restrict p) { + return gvn(p, tb_alloc_node(f, TB_DEAD, TB_TYPE_CONTROL, 1, 0), 0); +} + TB_Node* make_int_node(TB_Function* f, TB_Passes* restrict p, TB_DataType dt, uint64_t x) { + uint64_t mask = tb__mask(dt.data); + x &= mask; + TB_Node* n = tb_alloc_node(f, TB_INTEGER_CONST, dt, 1, sizeof(TB_NodeInt)); TB_NodeInt* i = TB_NODE_GET_EXTRA(n); i->value = x; + + Lattice* l = lattice_intern(&p->universe, (Lattice){ LATTICE_INT, ._int = { x, x, ~x & mask, x } }); + lattice_universe_map(&p->universe, n, l); + return gvn(p, n, sizeof(TB_NodeInt)); } @@ -304,7 +343,7 @@ static bool remove_pred(TB_Passes* restrict p, TB_Function* f, TB_Node* src, TB_ void tb_pass_kill_node(TB_Passes* restrict p, TB_Node* n) { // remove from CSE if we're murdering it - nl_hashset_remove2(&p->cse_nodes, n, cse_hash, cse_compare); + nl_hashset_remove2(&p->gvn_nodes, n, gvn_hash, gvn_compare); if (n->type == TB_LOCAL) { // remove from local list @@ -319,8 +358,7 @@ void tb_pass_kill_node(TB_Passes* restrict p, TB_Node* n) { n->inputs[i] = NULL; } - n->users = NULL; - + // assert(n->users == NULL && "we can't kill nodes with users, that's fucking rude"); n->input_count = 0; n->type = TB_NULL; } @@ -385,80 +423,54 @@ void tb_pass_mark_users(TB_Passes* restrict p, TB_Node* n) { TB_NodeTypeEnum type = 
use->n->type; // tuples changing means their projections did too. - if (use->n->dt.type == TB_TUPLE || type == TB_PROJ) { + if (type == TB_PROJ || type == TB_DEAD) { tb_pass_mark_users(p, use->n); } - // if the store is changed, the users (potential loads) should be notified. - // (br (cmp ...)) - if (type == TB_CMP_NE || type == TB_CMP_EQ || type == TB_STORE) { + // (br (cmp a b)) => ... + if (type >= TB_CMP_EQ && type <= TB_CMP_FLE) { tb_pass_mark_users_raw(p, use->n); } - - if (type == TB_REGION) { - tb_pass_mark_users_raw(p, use->n); - - TB_NodeRegion* r = TB_NODE_GET_EXTRA(use->n); - TB_Node* end = r->end; - if (end->type == TB_BRANCH) { - tb_pass_mark(p, end); - - // mark direct successors - TB_NodeBranch* br_info = TB_NODE_GET_EXTRA(end); - FOREACH_N(i, 0, br_info->succ_count) { - tb_pass_mark(p, br_info->succ[i]); - } - } - } } } -static void push_all_bb(Worklist* restrict ws, DynArray(TB_Node*)* stack_ptr, TB_Node* root) { - if (worklist_test_n_set(ws, root)) { - return; - } - - // walk control edges (aka predecessors) - assert(root->type == TB_START || root->type == TB_REGION); - TB_NodeRegion* r = TB_NODE_GET_EXTRA(root); - TB_Node* end = r->end; - - if (end->type == TB_BRANCH) { - TB_NodeBranch* br = TB_NODE_GET_EXTRA(end); - FOREACH_REVERSE_N(i, 0, br->succ_count) { - push_all_bb(ws, stack_ptr, br->succ[i]); +static void push_all_nodes(TB_Passes* restrict p, Worklist* restrict ws, TB_Function* f) { + CUIK_TIMED_BLOCK("push_all_nodes") { + DynArray(TB_Node*) stack = p->stack; + if (stack == NULL) { + stack = dyn_array_create(TB_Node*, 1024); } - } - DynArray(TB_Node*) stack = *stack_ptr; + // push all nodes using the terminator list + DynArray(TB_Node*) terminators = f->terminators; + dyn_array_for(i, terminators) { + TB_Node* end = terminators[i]; - // place endpoint, we'll construct the rest from there - worklist_test_n_set(ws, end); - dyn_array_put(stack, end); + // place endpoint, we'll construct the rest from there + if (worklist_test_n_set(ws, end)) { + // already processed + continue; + } - while (dyn_array_length(stack)) { - TB_Node* n = dyn_array_pop(stack); + dyn_array_put(stack, end); - // place self first - dyn_array_put(ws->items, n); + while (dyn_array_length(stack)) { + TB_Node* n = dyn_array_pop(stack); - // push inputs - FOREACH_N(i, 0, n->input_count) { - TB_Node* in = n->inputs[i]; - if (in && !worklist_test_n_set(ws, in)) { - dyn_array_put(stack, in); + // place self first + dyn_array_put(ws->items, n); + + // push inputs + FOREACH_N(i, 0, n->input_count) { + TB_Node* in = n->inputs[i]; + if (in && !worklist_test_n_set(ws, in)) { + dyn_array_put(stack, in); + } + } } } - } - *stack_ptr = stack; -} - -static void push_all_nodes(Worklist* restrict ws, TB_Node* root) { - CUIK_TIMED_BLOCK("push_all_nodes") { - DynArray(TB_Node*) stack = dyn_array_create(TB_Node*, 1024); - push_all_bb(ws, &stack, root); - dyn_array_destroy(stack); + p->stack = stack; } } @@ -491,13 +503,13 @@ void print_node_sexpr(TB_Node* n, int depth) { printf("sym%p", sym); } } else if (depth >= 1) { - printf("(v%zu: %s", n->gvn, tb_node_get_name(n)); + printf("(v%u: %s", n->gvn, tb_node_get_name(n)); cool_print_type(n); printf(" ...)"); } else { depth -= (n->type == TB_PROJ); - printf("(%s", tb_node_get_name(n)); + printf("(v%u: %s", n->gvn, tb_node_get_name(n)); cool_print_type(n); FOREACH_N(i, 0, n->input_count) if (n->inputs[i]) { if (i == 0) printf(" @"); @@ -526,10 +538,6 @@ void print_node_sexpr(TB_Node* n, int depth) { // Returns NULL or a modified node (could be the same node, we can 
stitch it back into place) static TB_Node* idealize(TB_Passes* restrict p, TB_Function* f, TB_Node* n, TB_PeepholeFlags flags) { switch (n->type) { - case TB_NOT: - case TB_NEG: - return ideal_int_unary(p, f, n); - // integer ops case TB_AND: case TB_OR: @@ -574,13 +582,8 @@ static TB_Node* idealize(TB_Passes* restrict p, TB_Function* f, TB_Node* n, TB_P case TB_SIGN_EXT: case TB_ZERO_EXT: return ideal_extension(p, f, n); - - case TB_INT2PTR: - return ideal_int2ptr(p, f, n); - - // truncate - case TB_TRUNCATE: - return ideal_truncate(p, f, n); + case TB_BITCAST: + return ideal_bitcast(p, f, n); case TB_CALL: return ideal_libcall(p, f, n); @@ -664,6 +667,88 @@ static TB_Node* identity(TB_Passes* restrict p, TB_Function* f, TB_Node* n, TB_P } } +// computes the type of a node based on it's inputs +static Lattice* dataflow(TB_Passes* restrict p, LatticeUniverse* uni, TB_Node* n) { + switch (n->type) { + case TB_INTEGER_CONST: { + TB_NodeInt* num = TB_NODE_GET_EXTRA(n); + return lattice_intern(&p->universe, (Lattice){ LATTICE_INT, ._int = { num->value, num->value, ~num->value, num->value } }); + } + + case TB_LOCAL: + case TB_SYMBOL: + return lattice_intern(uni, (Lattice){ LATTICE_POINTER, ._ptr = { LATTICE_KNOWN_NOT_NULL } }); + + case TB_INT2PTR: + return dataflow_int2ptr(p, uni, n); + + case TB_TRUNCATE: + return dataflow_trunc(p, uni, n); + + case TB_ZERO_EXT: + return dataflow_zext(p, uni, n); + + case TB_SIGN_EXT: + return dataflow_sext(p, uni, n); + + case TB_NEG: + case TB_NOT: + return dataflow_unary(p, uni, n); + + case TB_AND: + case TB_OR: + case TB_XOR: + return dataflow_bits(p, uni, n); + + case TB_ADD: + case TB_SUB: + case TB_MUL: + return dataflow_arith(p, uni, n); + + case TB_SHL: + case TB_SHR: + return dataflow_shift(p, uni, n); + + // meet all inputs + case TB_PHI: { + Lattice* l = lattice_universe_get(uni, n->inputs[1]); + FOREACH_N(i, 2, n->input_count) { + l = lattice_meet(uni, l, lattice_universe_get(uni, n->inputs[i])); + } + return l; + } + + default: return NULL; + } +} + +// converts constant Lattice into constant node +static TB_Node* try_as_const(TB_Passes* restrict p, TB_Node* n, Lattice* l) { + // already a constant? 
+ if (n->type == TB_INTEGER_CONST || n->type == TB_FLOAT32_CONST || n->type == TB_FLOAT64_CONST) { + return NULL; + } + + switch (l->tag) { + case LATTICE_INT: { + // degenerate range + if (l->_int.min == l->_int.max) { + return make_int_node(p->f, p, n->dt, l->_int.max); + } + + // all bits are known + uint64_t mask = tb__mask(n->dt.data); + if ((l->_int.known_zeros | l->_int.known_ones) == mask) { + return make_int_node(p->f, p, n->dt, l->_int.known_ones); + } + + return NULL; + } + + default: return NULL; + } +} + static bool is_terminator(TB_Node* n) { return n->type == TB_BRANCH || n->type == TB_END || n->type == TB_TRAP || n->type == TB_UNREACHABLE; } @@ -676,15 +761,52 @@ static TB_Node* unsafe_get_region(TB_Node* n) { return n; } +static void validate_node_users(TB_Node* n) { + if (n != NULL) { + for (User* use = n->users; use; use = use->next) { + tb_assert(use->n->inputs[use->slot] == n, "Mismatch between def-use and use-def data"); + } + } +} + +static void print_lattice(Lattice* l, TB_DataType dt) { + switch (l->tag) { + case LATTICE_INT: + assert(dt.type == TB_INT); + printf("[%"PRId64, tb__sxt(l->_int.min, dt.data, 64)); + // printf("[%#"PRIx64, l->_int.min); + if (l->_int.min != l->_int.max) { + // printf(" - %#"PRIx64, l->_int.max); + printf(" - %"PRId64, tb__sxt(l->_int.max, dt.data, 64)); + } + + uint64_t known = l->_int.known_zeros | l->_int.known_ones; + if (known && known != UINT64_MAX) { + printf("; zeros=%#"PRIx64", ones=%#"PRIx64, l->_int.known_zeros, l->_int.known_ones); + } + printf("]"); + break; + + case LATTICE_POINTER: { + static const char* tri[] = { "unknown", "null", "~null" }; + printf("[%s]", tri[l->_ptr.trifecta]); + break; + } + + default: + break; + } +} + static bool peephole(TB_Passes* restrict p, TB_Function* f, TB_Node* n, TB_PeepholeFlags flags) { // must've dead sometime between getting scheduled and getting // here. - if (n->type != TB_END && find_users(p, n) == NULL) { + if (n->type != TB_END && n->users == NULL) { return false; } DO_IF(TB_OPTDEBUG_STATS)(p->stats.peeps++); - DO_IF(TB_OPTDEBUG_PEEP)(printf("peep v%zu? ", n->gvn), print_node_sexpr(n, 0)); + DO_IF(TB_OPTDEBUG_PEEP)(printf("peep t=%d? ", p->stats.time++), print_node_sexpr(n, 0)); // idealize node (in a loop of course) TB_Node* k = idealize(p, f, n, flags); @@ -698,7 +820,6 @@ static bool peephole(TB_Passes* restrict p, TB_Function* f, TB_Node* n, TB_Peeph // transfer users from n -> k if (n != k) { - tb_assert(!is_terminator(n), "can't peephole a branch into a new branch"); subsume_node(p, f, n, k); n = k; } @@ -708,6 +829,34 @@ static bool peephole(TB_Passes* restrict p, TB_Function* f, TB_Node* n, TB_Peeph DO_IF(TB_OPTDEBUG_PEEP)(if (++loop_count > 10) { log_warn("%p: we looping a lil too much dawg...", n); }); } + // generate fancier type + if (n->dt.type >= TB_INT && n->dt.type <= TB_PTR) { + // no type provided? just make a not-so-form fitting TOP + Lattice* new_type = dataflow(p, &p->universe, n); + if (new_type == NULL) { + new_type = lattice_top(&p->universe, n->dt); + DO_IF(TB_OPTDEBUG_PEEP)(printf(" => \x1b[93mTOP\x1b[0m")); + } else { + // print fancy type + DO_IF(TB_OPTDEBUG_PEEP)(printf(" => \x1b[93m"), print_lattice(new_type, n->dt), printf("\x1b[0m")); + } + + // types that consist of one possible value are made into value constants. 
+ k = try_as_const(p, n, new_type); + if (k != NULL) { + DO_IF(TB_OPTDEBUG_PEEP)(printf(" => \x1b[96m"), print_node_sexpr(k, 0), printf("\x1b[0m")); + + subsume_node(p, f, n, k); + + // because certain optimizations apply when things are merged + // we mark ALL users including the ones who didn't get changed. + tb_pass_mark_users(p, k); + return k; + } else { + lattice_universe_map(&p->universe, n, new_type); + } + } + // convert into matching identity k = identity(p, f, n, flags); if (n != k) { @@ -719,11 +868,11 @@ static bool peephole(TB_Passes* restrict p, TB_Function* f, TB_Node* n, TB_Peeph return k; } - // common subexpression elim - k = nl_hashset_put2(&p->cse_nodes, n, cse_hash, cse_compare); + // global value numbering + k = nl_hashset_put2(&p->gvn_nodes, n, gvn_hash, gvn_compare); if (k && (k != n)) { - DO_IF(TB_OPTDEBUG_STATS)(p->stats.cse_hit++); - DO_IF(TB_OPTDEBUG_PEEP)(printf(" => \x1b[31mCSE\x1b[0m")); + DO_IF(TB_OPTDEBUG_STATS)(p->stats.gvn_hit++); + DO_IF(TB_OPTDEBUG_PEEP)(printf(" => \x1b[31mGVN\x1b[0m")); subsume_node(p, f, n, k); @@ -732,14 +881,14 @@ static bool peephole(TB_Passes* restrict p, TB_Function* f, TB_Node* n, TB_Peeph tb_pass_mark_users(p, k); return k; } else { - DO_IF(TB_OPTDEBUG_STATS)(p->stats.cse_miss++); + DO_IF(TB_OPTDEBUG_STATS)(p->stats.gvn_miss++); } return n; } static void subsume_node(TB_Passes* restrict p, TB_Function* f, TB_Node* n, TB_Node* new_n) { - User* use = find_users(p, n); + User* use = n->users; while (use != NULL) { tb_assert(use->n->inputs[use->slot] == n, "Mismatch between def-use and use-def data"); @@ -763,8 +912,8 @@ static void generate_use_lists(TB_Passes* restrict p, TB_Function* f) { dyn_array_put(p->locals, n); } - FOREACH_N(i, 0, n->input_count) if (n->inputs[i]) { - add_user(p, n, n->inputs[i], i, NULL); + FOREACH_N(j, 0, n->input_count) if (n->inputs[j]) { + add_user(p, n, n->inputs[j], j, NULL); } } } @@ -782,16 +931,10 @@ TB_Passes* tb_pass_enter(TB_Function* f, TB_Arena* arena) { worklist_alloc(&p->worklist, f->node_count); - // generate early doms - CUIK_TIMED_BLOCK("doms") { - size_t block_count = tb_push_postorder(f, &p->worklist); - tb_compute_dominators(f, block_count, p->worklist.items); - worklist_clear(&p->worklist); - } - // generate work list (put everything) CUIK_TIMED_BLOCK("gen worklist") { - push_all_nodes(&p->worklist, f->start_node); + push_all_nodes(p, &p->worklist, f); + DO_IF(TB_OPTDEBUG_STATS)(p->stats.initial = worklist_popcount(&p->worklist)); } @@ -805,6 +948,23 @@ TB_Passes* tb_pass_enter(TB_Function* f, TB_Arena* arena) { return p; } +void tb_pass_sroa(TB_Passes* p) { + cuikperf_region_start("sroa", NULL); + verify_tmp_arena(p); + + TB_Function* f = p->f; + + int pointer_size = tb__find_code_generator(f->super.module)->pointer_size; + TB_Node* start = f->start_node; + + size_t i = 0; + while (i < dyn_array_length(p->locals)) { + i += sroa_rewrite(p, pointer_size, start, p->locals[i]); + } + + cuikperf_region_end(); +} + void tb_pass_optimize(TB_Passes* p) { tb_pass_peephole(p, TB_PEEPHOLE_ALL); tb_pass_sroa(p); @@ -816,8 +976,51 @@ void tb_pass_optimize(TB_Passes* p) { void tb_pass_peephole(TB_Passes* p, TB_PeepholeFlags flags) { verify_tmp_arena(p); - if (p->cse_nodes.data == NULL) { - p->cse_nodes = nl_hashset_alloc(p->f->node_count); + if (p->gvn_nodes.data == NULL) { + p->gvn_nodes = nl_hashset_alloc(p->f->node_count); + } + + // make sure we have space for the lattice universe + if (p->universe.arena == NULL) { + TB_ThreadInfo* info = tb_thread_info(p->f->super.module); + if 
(info->type_arena.chunk_size == 0) { + // make new arena + tb_arena_create(&info->type_arena, TB_ARENA_LARGE_CHUNK_SIZE); + } + + size_t count = p->f->node_count; + p->universe.arena = &info->type_arena; + p->universe.pool = nl_hashset_alloc(64); + p->universe.type_cap = count; + p->universe.types = tb_platform_heap_alloc(count * sizeof(Lattice*)); + memset(p->universe.types, 0, count * sizeof(Lattice*)); + + // generate early doms + CUIK_TIMED_BLOCK("doms") { + TB_Function* f = p->f; + + Worklist tmp_ws = { 0 }; + worklist_alloc(&tmp_ws, (f->node_count / 4) + 4); + + TB_CFG cfg = tb_compute_rpo2(f, &tmp_ws, &p->stack); + tb_compute_dominators2(f, &tmp_ws, cfg); + + // mark IDOM for each "BB" node + FOREACH_N(i, 0, cfg.block_count) { + // entry block should be marked as dominated by NULL, to make it easy + // to end the iteration of a dom chain. + TB_Node* dom = NULL; + if (i != 0) { + dom = nl_map_get_checked(cfg.node_to_block, tmp_ws.items[i]).dom; + } + + Lattice* l = lattice_ctrl(&p->universe, dom); + lattice_universe_map(&p->universe, tmp_ws.items[i], l); + } + + worklist_free(&tmp_ws); + tb_free_cfg(&cfg); + } } TB_Function* f = p->f; @@ -832,25 +1035,39 @@ void tb_pass_peephole(TB_Passes* p, TB_PeepholeFlags flags) { } void tb_pass_exit(TB_Passes* p) { + verify_tmp_arena(p); + TB_Function* f = p->f; + // terminators will be made obselete by the optimizer + dyn_array_destroy(f->terminators); + + // tb_function_print(f, tb_default_print_callback, stdout); + #if TB_OPTDEBUG_STATS - push_all_nodes(&p->worklist, f->start_node); + push_all_nodes(p, &p->worklist, f); int final_count = worklist_popcount(&p->worklist); double factor = ((double) final_count / (double) p->stats.initial) * 100.0; printf("%s: stats:\n", f->super.name); printf(" %4d -> %4d nodes (%.2f%%)\n", p->stats.initial, final_count, factor); - printf(" %4d CSE hit %4d CSE miss\n", p->stats.cse_hit, p->stats.cse_miss); + printf(" %4d GVN hit %4d GVN miss\n", p->stats.gvn_hit, p->stats.gvn_miss); printf(" %4d peepholes %4d rewrites %4d identities\n", p->stats.peeps, p->stats.rewrites, p->stats.identities); #endif - verify_tmp_arena(p); - + nl_map_free(p->scheduled); worklist_free(&p->worklist); - nl_hashset_free(p->cse_nodes); + nl_hashset_free(p->gvn_nodes); + dyn_array_destroy(p->stack); dyn_array_destroy(p->locals); + if (p->universe.arena != NULL) { + tb_arena_clear(p->universe.arena); + nl_hashset_free(p->universe.pool); + tb_platform_heap_free(p->universe.types); + } + tb_arena_clear(tmp_arena); + tb_platform_heap_free(p); } diff --git a/tb/src/opt/print.h b/tb/src/opt/print.h index ef41b6c9..f12e8808 100644 --- a/tb/src/opt/print.h +++ b/tb/src/opt/print.h @@ -2,12 +2,10 @@ typedef struct { TB_Passes* opt; TB_Function* f; - size_t block_count; + TB_CFG cfg; } PrinterCtx; static void print_type(TB_DataType dt) { - assert(dt.width < 8 && "Vector width too big!"); - switch (dt.type) { case TB_INT: { if (dt.data == 0) printf("void"); @@ -53,13 +51,13 @@ static void print_ref_to_node(PrinterCtx* ctx, TB_Node* n, bool def) { if (def) { printf("("); TB_Node** params = ctx->f->params; - FOREACH_N(i, 1, 1 + ctx->f->param_count) { + FOREACH_N(i, 1, 3 + ctx->f->param_count) { if (i > 1) printf(", "); if (params[i] == NULL) { printf("_"); } else { - printf("v%zu: ", params[i]->gvn); + printf("v%u: ", params[i]->gvn); print_type(params[i]->dt); } } @@ -70,7 +68,7 @@ static void print_ref_to_node(PrinterCtx* ctx, TB_Node* n, bool def) { if (r->tag != NULL) { printf(".%s", r->tag); } else { - ptrdiff_t i = 
try_find_traversal_index(n); + ptrdiff_t i = try_find_traversal_index(&ctx->cfg, n); if (i >= 0) { printf(".bb%zu", i); } else { @@ -90,6 +88,17 @@ static void print_ref_to_node(PrinterCtx* ctx, TB_Node* n, bool def) { } else { printf("sym%p", sym); } + } else if (n->type == TB_PROJ && n->dt.type == TB_CONTROL) { + if (n->inputs[0]->type == TB_START) { + print_ref_to_node(ctx, n->inputs[0], def); + } else { + ptrdiff_t i = try_find_traversal_index(&ctx->cfg, n); + if (i >= 0) { + printf(".bb%zu", i); + } else { + printf("*DEAD*"); + } + } } else if (n->type == TB_ZERO_EXT) { printf("(zxt."); print_type(n->dt); @@ -111,7 +120,7 @@ static void print_ref_to_node(PrinterCtx* ctx, TB_Node* n, bool def) { printf("%#0"PRIx64, num->value); } } else { - printf("v%llu", (long long unsigned) n->gvn); + printf("v%u", n->gvn); } } @@ -129,40 +138,39 @@ static void print_location(TB_Function* f, TB_Node* n) { } } -static void print_bb(PrinterCtx* ctx, TB_Node* bb) { - assert(bb->type == TB_START || bb->type == TB_REGION); - print_ref_to_node(ctx, bb, true); +static void print_bb(PrinterCtx* ctx, TB_Node* bb_start) { + print_ref_to_node(ctx, bb_start, true); printf(":"); // print predecessors - if (bb->input_count > 0) { + if (!(bb_start->type == TB_PROJ && bb_start->inputs[0]->type == TB_START) && bb_start->input_count > 0) { printf(" # preds: "); - FOREACH_N(j, 0, bb->input_count) { - print_ref_to_node(ctx, tb_get_parent_region(bb->inputs[j]), false); + FOREACH_N(j, 0, bb_start->input_count) { + print_ref_to_node(ctx, get_pred(bb_start, j), false); printf(" "); } } - if (ctx->opt->error_n == bb) { + if (ctx->opt->error_n == bb_start) { printf("\x1b[31m <-- ERROR\x1b[0m"); } printf("\n"); - TB_Node* end = TB_NODE_GET_EXTRA_T(bb, TB_NodeRegion)->end; + TB_BasicBlock* bb = nl_map_get_checked(ctx->opt->scheduled, bb_start); Worklist* ws = &ctx->opt->worklist; - sched_walk(ctx->opt, ws, NULL, bb, end); - assert(ws->items[ctx->block_count] == bb); + sched_walk(ctx->opt, ws, NULL, bb, bb->end, true); TB_Node* prev_effect = NULL; - FOREACH_N(i, ctx->block_count + 1, dyn_array_length(ws->items)) { + FOREACH_N(i, ctx->cfg.block_count, dyn_array_length(ws->items)) { TB_Node* n = ws->items[i]; // skip these if (n->type == TB_INTEGER_CONST || n->type == TB_FLOAT32_CONST || n->type == TB_FLOAT64_CONST || n->type == TB_SYMBOL || n->type == TB_SIGN_EXT || n->type == TB_ZERO_EXT || - n->type == TB_PROJ) { + n->type == TB_PROJ || n->type == TB_START || + n->type == TB_REGION || n->type == TB_NULL) { continue; } @@ -176,9 +184,20 @@ static void print_bb(PrinterCtx* ctx, TB_Node* bb) { case TB_BRANCH: { TB_NodeBranch* br = TB_NODE_GET_EXTRA(n); + TB_ArenaSavepoint sp = tb_arena_save(tmp_arena); + TB_Node** restrict succ = tb_arena_alloc(tmp_arena, br->succ_count * sizeof(TB_Node**)); + + // fill successors + for (User* u = n->users; u; u = u->next) { + if (u->n->type == TB_PROJ) { + int index = TB_NODE_GET_EXTRA_T(u->n, TB_NodeProj)->index; + succ[index] = cfg_next_bb_after_cproj(u->n); + } + } + if (br->succ_count == 1) { printf(" goto "); - print_ref_to_node(ctx, br->succ[0], false); + print_ref_to_node(ctx, succ[0], false); } else if (br->succ_count == 2) { printf(" if "); FOREACH_N(i, 1, n->input_count) { @@ -190,9 +209,9 @@ static void print_bb(PrinterCtx* ctx, TB_Node* bb) { } else { printf(" != %"PRId64" then ", br->keys[0]); } - print_ref_to_node(ctx, br->succ[0], false); + print_ref_to_node(ctx, succ[0], false); printf(" else "); - print_ref_to_node(ctx, br->succ[1], false); + print_ref_to_node(ctx, succ[1], 
false); } else { printf(" br "); FOREACH_N(i, 1, n->input_count) { @@ -205,11 +224,12 @@ static void print_bb(PrinterCtx* ctx, TB_Node* bb) { if (i != 0) printf(" %"PRId64": ", br->keys[i - 1]); else printf(" default: "); - print_ref_to_node(ctx, br->succ[i], false); + print_ref_to_node(ctx, succ[i], false); printf("\n"); } printf(" }"); } + tb_arena_restore(tmp_arena, sp); break; } @@ -233,7 +253,7 @@ static void print_bb(PrinterCtx* ctx, TB_Node* bb) { TB_Node* projs[4]; for (size_t i = 0; i < 4; i++) projs[i] = NULL; - for (User* use = find_users(ctx->opt, n); use; use = use->next) { + for (User* use = n->users; use; use = use->next) { if (use->n->type == TB_PROJ) { int index = TB_NODE_GET_EXTRA_T(use->n, TB_NodeProj)->index; projs[index] = use->n; @@ -246,7 +266,7 @@ static void print_bb(PrinterCtx* ctx, TB_Node* bb) { FOREACH_N(i, first, 4) { if (projs[i] == NULL) break; if (i > first) printf(", "); - printf("v%zu", projs[i]->gvn); + printf("v%u", projs[i]->gvn); } printf(" = %s.(", tb_node_get_name(n)); FOREACH_N(i, first, 4) { @@ -260,7 +280,7 @@ static void print_bb(PrinterCtx* ctx, TB_Node* bb) { if (n->dt.type == TB_INT && n->dt.data == 0) { printf(" %s.", tb_node_get_name(n)); } else { - printf(" v%zu = %s.", n->gvn, tb_node_get_name(n)); + printf(" v%u = %s.", n->gvn, tb_node_get_name(n)); } TB_DataType dt = n->dt; @@ -394,37 +414,34 @@ static void print_bb(PrinterCtx* ctx, TB_Node* bb) { printf("\n"); } - dyn_array_set_length(ws->items, ctx->block_count); + dyn_array_set_length(ws->items, ctx->cfg.block_count); + + if (bb->end->type != TB_END && + bb->end->type != TB_TRAP && + bb->end->type != TB_BRANCH && + bb->end->type != TB_UNREACHABLE) { + printf(" goto "); + print_ref_to_node(ctx, cfg_next_control(bb->end), false); + printf("\n"); + } } bool tb_pass_print(TB_Passes* opt) { TB_Function* f = opt->f; - - // schedule nodes - tb_pass_schedule(opt); - - PrinterCtx ctx = { opt, f }; worklist_clear(&opt->worklist); - ctx.block_count = tb_push_postorder(f, &opt->worklist); - TB_Node* stop_bb = get_block_begin(f->stop_node); + PrinterCtx ctx = { opt, f }; + ctx.cfg = tb_compute_rpo(f, opt); + // schedule nodes + tb_pass_schedule(opt, ctx.cfg); worklist_clear_visited(&opt->worklist); - bool has_stop = false; - FOREACH_REVERSE_N(i, 0, ctx.block_count) { - TB_Node* bb = opt->worklist.items[i]; - if (bb != stop_bb) { - print_bb(&ctx, bb); - } else { - has_stop = true; - } - } - - if (has_stop) { - print_bb(&ctx, stop_bb); + FOREACH_N(i, 0, ctx.cfg.block_count) { + print_bb(&ctx, opt->worklist.items[i]); } + tb_free_cfg(&ctx.cfg); ctx.opt->error_n = NULL; return false; } diff --git a/tb/src/opt/scheduler.h b/tb/src/opt/scheduler.h index 452f80e7..29195b5b 100644 --- a/tb/src/opt/scheduler.h +++ b/tb/src/opt/scheduler.h @@ -2,19 +2,7 @@ // sort which is anti-dependency aware, a future TB could implement multiple schedulers. // // Once the worklist is filled, you can walk backwards and generate instructions accordingly. 
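As a rough illustration of the comment above, this is a self-contained sketch of the greedy walk (ToyNode and ToySchedule are invented stand-ins, not TB types): a node is appended only after all of its inputs have been appended, so the resulting list is a valid topological order of the block. The real sched_walk below layers PHI scheduling and anti-dependency handling on top of this skeleton.

#include <stdbool.h>
#include <stddef.h>

enum { TOY_MAX_NODES = 64 };

typedef struct ToyNode {
    int id;                      // 0 .. TOY_MAX_NODES-1
    int input_count;
    struct ToyNode* inputs[4];
} ToyNode;

typedef struct {
    ToyNode* items[TOY_MAX_NODES];
    int count;
    bool visited[TOY_MAX_NODES];
} ToySchedule;

static void toy_sched_walk(ToySchedule* s, ToyNode* n) {
    if (n == NULL || s->visited[n->id]) return;
    s->visited[n->id] = true;

    // operands first: everything a node reads must already be in the list
    for (int i = 0; i < n->input_count; i++) {
        toy_sched_walk(s, n->inputs[i]);
    }

    // then the node itself, so items[] ends up in topological order
    s->items[s->count++] = n;
}

Walking the finished items list backwards then yields a legal instruction emission order, which is what the header comment alludes to.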
-static bool is_same_bb(TB_Node* bb, TB_Node* n) { - if (n->type != TB_START && n->inputs[0] == NULL) { - return false; - } - - while (n->type != TB_START && n->type != TB_REGION) { - n = n->inputs[0]; - } - - return n == bb; -} - -static void sched_walk_phi(TB_Passes* passes, Worklist* ws, DynArray(PhiVal)* phi_vals, TB_Node* bb, TB_Node* phi, size_t phi_index) { +static void sched_walk_phi(TB_Passes* passes, Worklist* ws, DynArray(PhiVal)* phi_vals, TB_BasicBlock* bb, TB_Node* phi, size_t phi_index) { TB_Node* val = phi->inputs[1 + phi_index]; // reserve PHI space @@ -27,26 +15,25 @@ static void sched_walk_phi(TB_Passes* passes, Worklist* ws, DynArray(PhiVal)* ph dyn_array_put(*phi_vals, p); } - sched_walk(passes, ws, phi_vals, bb, val); + sched_walk(passes, ws, phi_vals, bb, val, false); } -void sched_walk(TB_Passes* passes, Worklist* ws, DynArray(PhiVal)* phi_vals, TB_Node* bb, TB_Node* n) { - if (!is_same_bb(bb, n) || worklist_test_n_set(ws, n)) { +void sched_walk(TB_Passes* passes, Worklist* ws, DynArray(PhiVal)* phi_vals, TB_BasicBlock* bb, TB_Node* n, bool is_end) { + ptrdiff_t search = nl_map_get(passes->scheduled, n); + if (search < 0 || passes->scheduled[search].v != bb || worklist_test_n_set(ws, n)) { return; } // if we're a branch, push our PHI nodes - if (n->type == TB_BRANCH) { - TB_NodeBranch* br = TB_NODE_GET_EXTRA(n); - TB_Node** succ = br->succ; - - FOREACH_N(i, 0, br->succ_count) { - TB_Node* dst = br->succ[i]; + if (is_end) { + for (User* u = n->users; u; u = u->next) { + if (!cfg_is_control(u->n)) continue; + TB_Node* dst = cfg_next_region_control(u->n); // find predecessor index and do that edge ptrdiff_t phi_index = -1; FOREACH_N(j, 0, dst->input_count) { - TB_Node* pred = unsafe_get_region(dst->inputs[j]); + TB_BasicBlock* pred = nl_map_get_checked(passes->scheduled, dst->inputs[j]); if (pred == bb) { phi_index = j; @@ -56,7 +43,7 @@ void sched_walk(TB_Passes* passes, Worklist* ws, DynArray(PhiVal)* phi_vals, TB_ if (phi_index < 0) continue; // schedule memory PHIs - for (User* use = find_users(passes, dst); use; use = use->next) { + for (User* use = dst->users; use; use = use->next) { TB_Node* phi = use->n; if (phi->type == TB_PHI && phi->dt.type == TB_MEMORY) { sched_walk_phi(passes, ws, phi_vals, bb, phi, phi_index); @@ -64,7 +51,7 @@ void sched_walk(TB_Passes* passes, Worklist* ws, DynArray(PhiVal)* phi_vals, TB_ } // schedule data PHIs, we schedule these afterwards because it's "generally" better - for (User* use = find_users(passes, dst); use; use = use->next) { + for (User* use = dst->users; use; use = use->next) { TB_Node* phi = use->n; if (phi->type == TB_PHI && phi->dt.type != TB_MEMORY) { sched_walk_phi(passes, ws, phi_vals, bb, phi, phi_index); @@ -75,25 +62,24 @@ void sched_walk(TB_Passes* passes, Worklist* ws, DynArray(PhiVal)* phi_vals, TB_ // push inputs FOREACH_REVERSE_N(i, 0, n->input_count) if (n->inputs[i]) { - sched_walk(passes, ws, phi_vals, bb, n->inputs[i]); + sched_walk(passes, ws, phi_vals, bb, n->inputs[i], false); } // before the terminator we should eval leftovers that GCM linked here - if (is_block_end(n)) { - TB_Node* parent = get_block_begin(n); - for (User* use = find_users(passes, parent); use; use = use->next) { - sched_walk(passes, ws, phi_vals, bb, use->n); + if (is_end) { + nl_hashset_for(entry, &bb->items) { + sched_walk(passes, ws, phi_vals, bb, *entry, false); } } dyn_array_put(ws->items, n); - if (is_mem_out_op(n)) { + if (is_mem_out_op(n) && n->type != TB_PHI && n->type != TB_PROJ) { // memory effects have 
anti-dependencies, the previous loads // must finish before the next memory effect is applied. for (User* use = find_users(passes, n->inputs[1]); use; use = use->next) { if (use->slot == 1 && use->n != n) { - sched_walk(passes, ws, phi_vals, bb, use->n); + sched_walk(passes, ws, phi_vals, bb, use->n, false); } } } @@ -103,7 +89,7 @@ void sched_walk(TB_Passes* passes, Worklist* ws, DynArray(PhiVal)* phi_vals, TB_ for (User* use = find_users(passes, n); use; use = use->next) { TB_Node* use_n = use->n; if (use_n->type == TB_PROJ) { - sched_walk(passes, ws, phi_vals, bb, use_n); + sched_walk(passes, ws, phi_vals, bb, use_n, false); } } } diff --git a/tb/src/opt/sroa.h b/tb/src/opt/sroa.h new file mode 100644 index 00000000..56f7f1ce --- /dev/null +++ b/tb/src/opt/sroa.h @@ -0,0 +1,122 @@ + + +typedef struct { + TB_Node* old_n; + + int64_t offset; + TB_CharUnits size; + TB_DataType dt; +} AggregateConfig; + +static ptrdiff_t find_config(size_t config_count, AggregateConfig* configs, int64_t offset) { + FOREACH_N(i, 0, config_count) { + if (configs[i].offset == offset) return i; + } + + tb_unreachable(); + return -1; +} + +// -1 is a bad match +// -2 is no match, so we can add a new config +static ptrdiff_t compatible_with_configs(size_t config_count, AggregateConfig* configs, int64_t offset, TB_CharUnits size, TB_DataType dt) { + int64_t max = offset + size; + + FOREACH_N(i, 0, config_count) { + int64_t max2 = configs[i].offset + configs[i].size; + + if (offset >= configs[i].offset && max <= max2) { + // they overlap... but is it a clean overlap? + if (offset == configs[i].offset && max == max2 && TB_DATA_TYPE_EQUALS(dt, configs[i].dt)) { + return i; + } + + return -1; + } + } + + return -2; +} + +// false means failure to SROA +static bool add_configs(TB_Passes* p, TB_TemporaryStorage* tls, User* use, TB_Node* base_address, size_t base_offset, size_t* config_count, AggregateConfig* configs, int pointer_size) { + for (; use; use = use->next) { + TB_Node* n = use->n; + + if (n->type == TB_MEMBER_ACCESS && use->slot == 1) { + // same rules, different offset + int64_t offset = TB_NODE_GET_EXTRA_T(n, TB_NodeMember)->offset; + if (!add_configs(p, tls, find_users(p, n), base_address, base_offset + offset, config_count, configs, pointer_size)) { + return false; + } + continue; + } + + // we can only SROA if we know we're not using the + // address for anything but direct memory ops or TB_MEMBERs. + if (use->slot != 2) { + return false; + } + + // find direct memory op + if (n->type != TB_LOAD && n->type != TB_STORE) { + return false; + } + + TB_DataType dt = n->type == TB_LOAD ? 
n->dt : n->inputs[3]->dt; + TB_Node* address = n->inputs[2]; + int size = (bits_in_data_type(pointer_size, dt) + 7) / 8; + + // see if it's a compatible configuration + int match = compatible_with_configs(*config_count, configs, base_offset, size, dt); + if (match == -1) { + return false; + } else if (match == -2) { + // add new config + tb_tls_push(tls, sizeof(AggregateConfig)); + configs[(*config_count)++] = (AggregateConfig){ address, base_offset, size, dt }; + } else if (configs[match].old_n != address) { + log_warn("%s: v%u SROA config matches but reaches so via a different node, please idealize nodes before mem2reg", p->f->super.name, address->gvn); + return false; + } + } + + return true; +} + +static size_t sroa_rewrite(TB_Passes* restrict p, int pointer_size, TB_Node* start, TB_Node* n) { + TB_TemporaryStorage* tls = tb_tls_steal(); + void* mark = tb_tls_push(tls, 0); + + size_t config_count = 0; + AggregateConfig* configs = tb_tls_push(tls, 0); + if (!add_configs(p, tls, n->users, n, 0, &config_count, configs, pointer_size)) { + return 1; + } + + // split allocation into pieces + if (config_count > 1) { + DO_IF(TB_OPTDEBUG_SROA)(printf("sroa v%u => SROA to %zu pieces", n->gvn, config_count)); + + uint32_t alignment = TB_NODE_GET_EXTRA_T(n, TB_NodeLocal)->align; + FOREACH_N(i, 0, config_count) { + TB_Node* new_n = tb_alloc_node(p->f, TB_LOCAL, TB_TYPE_PTR, 1, sizeof(TB_NodeLocal)); + set_input(p, new_n, start, 0); + TB_NODE_SET_EXTRA(new_n, TB_NodeLocal, .size = configs[i].size, .align = alignment); + + // mark all users, there may be some fun new opts now + tb_pass_mark_users(p, configs[i].old_n); + + // replace old pointer with new fancy + subsume_node(p, p->f, configs[i].old_n, new_n); + dyn_array_put(p->locals, new_n); + } + + // we marked the changes else where which is cheating the peephole + // but still doing all the progress it needs to. + tb_pass_mark_users(p, n); + } + + tb_tls_restore(tls, mark); + return config_count > 1 ? 1 + config_count : 1; +} diff --git a/tb/src/passes.h b/tb/src/passes.h index 3eda6fd6..c7787181 100644 --- a/tb/src/passes.h +++ b/tb/src/passes.h @@ -1,17 +1,91 @@ #pragma once #include "tb_internal.h" -#define TB_OPTDEBUG_STATS 0 - -#define TB_OPTDEBUG_PEEP 0 -#define TB_OPTDEBUG_LOOP 0 +#define TB_OPTDEBUG_STATS 0 +#define TB_OPTDEBUG_PEEP 0 +#define TB_OPTDEBUG_LOOP 0 +#define TB_OPTDEBUG_SROA 0 +#define TB_OPTDEBUG_GCM 0 #define TB_OPTDEBUG_MEM2REG 0 #define TB_OPTDEBUG_CODEGEN 0 +#define TB_OPTDEBUG(cond) CONCAT(DO_IF_, CONCAT(TB_OPTDEBUG_, cond)) + #define DO_IF(cond) CONCAT(DO_IF_, cond) #define DO_IF_0(...) #define DO_IF_1(...) __VA_ARGS__ +//////////////////////////////// +// SCCP +//////////////////////////////// +// TODO(NeGate): implement dual? from there i can do join with +// dual(dual(x) ^ dual(y)) = join(x, y) +typedef struct { + int64_t min, max; + + // for known bit analysis + uint64_t known_zeros; + uint64_t known_ones; +} LatticeInt; + +// a simplification of the set of all pointers (or floats) +typedef enum { + LATTICE_UNKNOWN, // top aka {nan, non-nan} or for pointers {null, non-null} + + LATTICE_KNOWN_NAN = 1, // {nan} + LATTICE_KNOWN_NOT_NAN, // {non-nan} + + LATTICE_KNOWN_NULL = 1, // {null} + LATTICE_KNOWN_NOT_NULL // {non-null} +} LatticeTrifecta; + +typedef struct { + LatticeTrifecta trifecta; +} LatticeFloat; + +// TODO(NeGate): we might wanna store more info like aliasing, ownership and alignment. 
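To make the LatticeInt shape above concrete, here is a hedged sketch of how such values behave (my reading of the fields, using toy types rather than TB's actual routines): a constant k is the single-point element [k, k] with every bit known, and meeting two elements widens the range while keeping only the bit facts both sides agree on. This is why try_as_const earlier in this diff can rematerialize a constant once min == max or all bits are pinned.

#include <stdint.h>

typedef struct {
    int64_t  min, max;
    uint64_t known_zeros, known_ones;
} ToyLatticeInt;

// a constant k (masked to the type's width) is a single-point lattice element
static ToyLatticeInt toy_int_const(int64_t k, uint64_t mask) {
    uint64_t x = (uint64_t)k & mask;
    return (ToyLatticeInt){ (int64_t)x, (int64_t)x, ~x & mask, x };
}

// meet: the range can only grow, known bits shrink to what both sides share
static ToyLatticeInt toy_int_meet(ToyLatticeInt a, ToyLatticeInt b) {
    ToyLatticeInt r;
    r.min = a.min < b.min ? a.min : b.min;
    r.max = a.max > b.max ? a.max : b.max;
    r.known_zeros = a.known_zeros & b.known_zeros;
    r.known_ones  = a.known_ones  & b.known_ones;
    return r;
}

// folds back to a constant once the range degenerates or every bit is pinned
static int toy_int_is_const(ToyLatticeInt l, uint64_t mask) {
    return l.min == l.max || (l.known_zeros | l.known_ones) == mask;
}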
+typedef struct { + LatticeTrifecta trifecta; +} LatticePointer; + +typedef struct { + TB_Node* idom; +} LatticeControl; + +// Represents the fancier type system within the optimizer, it's +// all backed by my shitty understanding of lattice theory +typedef struct { + enum { + LATTICE_INT, + LATTICE_FLOAT32, + LATTICE_FLOAT64, + LATTICE_POINTER, + LATTICE_CONTROL, + } tag; + uint32_t pad; + union { + LatticeInt _int; + LatticeFloat _float; + LatticePointer _ptr; + LatticeControl _ctrl; + }; +} Lattice; + +// hash-consing because there's a lot of +// redundant types we might construct. +typedef struct { + TB_Arena* arena; + NL_HashSet pool; + + // track a lattice per node (basically all get one + // so a non-sparse array works) + size_t type_cap; + Lattice** types; +} LatticeUniverse; + +//////////////////////////////// +// CFG +//////////////////////////////// typedef struct { size_t stride; uint64_t arr[]; @@ -32,6 +106,25 @@ typedef struct { int dst, src; } PhiVal; +typedef struct TB_BasicBlock { + TB_Node* dom; + TB_Node* end; + int id, dom_depth; + + TB_Node* mem_in; + NL_HashSet items; +} TB_BasicBlock; + +typedef struct TB_CFG { + size_t block_count; + NL_Map(TB_Node*, TB_BasicBlock) node_to_block; +} TB_CFG; + +typedef NL_Map(TB_Node*, TB_BasicBlock*) TB_Scheduled; + +//////////////////////////////// +// Core optimizer +//////////////////////////////// typedef struct { DynArray(TB_Node*) items; @@ -42,7 +135,7 @@ typedef struct { struct TB_Passes { TB_Function* f; - bool scheduled; + TB_Scheduled scheduled; // we use this to verify that we're on the same thread // for the entire duration of the TB_Passes. @@ -50,43 +143,78 @@ struct TB_Passes { Worklist worklist; + // sometimes we be using arrays of nodes, let's just keep one around for a bit + DynArray(TB_Node*) stack; + // we wanna track locals because it's nice and easy DynArray(TB_Node*) locals; - // this is used to do CSE - NL_HashSet cse_nodes; + // tracks the fancier type system + LatticeUniverse universe; + + // this is used to do GVN + NL_HashSet gvn_nodes; // debug shit: TB_Node* error_n; // nice stats struct { + #if TB_OPTDEBUG_PEEP + int time; + #endif + #if TB_OPTDEBUG_STATS int initial; - int cse_hit, cse_miss; + int gvn_hit, gvn_miss; int peeps, identities, rewrites; #endif } stats; }; -// it's either START, REGION or control node with CONTROL PROJ predecessor -static bool is_block_begin(TB_Node* n) { - // regions also have a CONTROL PROJ so we - // don't need to check them explicitly. 
- return n->type == TB_REGION || (n->type == TB_PROJ && n->inputs[0]->type == TB_START); +static bool cfg_is_terminator(TB_Node* n) { + return n->type == TB_BRANCH || n->type == TB_UNREACHABLE || n->type == TB_TRAP || n->type == TB_END; +} + +// includes tuples which have control flow +static bool cfg_is_control(TB_Node* n) { + // easy case + if (n->dt.type == TB_CONTROL) return true; + if (n->dt.type != TB_TUPLE) return false; + + // harder case is figuring out which tuples have control outputs (without manually + // checking which is annoying and slow) + // + // branch, debugbreak, trap, unreachable, dead OR call, syscall, safepoint + return (n->type >= TB_BRANCH && n->type <= TB_DEAD) || (n->type >= TB_CALL && n->type <= TB_SAFEPOINT_POLL); } -static bool is_block_end(TB_Node* n) { - return n->type == TB_BRANCH; +static bool cfg_is_bb_entry(TB_Node* n) { + if (n->type == TB_REGION) { + return true; + } else if (n->type == TB_PROJ && (n->inputs[0]->type == TB_START || n->inputs[0]->type == TB_BRANCH)) { + // Start's control proj or a branch target + return true; + } else { + return false; + } +} + +static TB_Node* cfg_get_fallthru(TB_Node* n) { + if (n->type == TB_PROJ && n->dt.type == TB_CONTROL) { + // if it's single user and that user is the terminator we can skip it in the fallthrough logic + return n->users->next == NULL && n->users->n->type == TB_REGION ? n->users->n : n; + } else { + return n; + } } static bool is_mem_out_op(TB_Node* n) { - return n->type == TB_END || (n->type >= TB_STORE && n->type <= TB_ATOMIC_CAS) || (n->type == TB_PHI && n->dt.type == TB_MEMORY); + return n->dt.type == TB_MEMORY || (n->type >= TB_STORE && n->type <= TB_ATOMIC_CAS) || (n->type >= TB_CALL && n->type <= TB_SAFEPOINT_POLL); } -// schedule nodes below any of their pinned dependencies static bool is_pinned(TB_Node* n) { - return (n->type >= TB_START && n->type <= TB_SAFEPOINT_POLL) || n->type == TB_PROJ || n->type == TB_LOCAL; + return (n->type >= TB_START && n->type <= TB_SAFEPOINT_POLL) || n->type == TB_PROJ; } static bool is_mem_in_op(TB_Node* n) { @@ -96,31 +224,76 @@ static bool is_mem_in_op(TB_Node* n) { //////////////////////////////// // CFG analysis //////////////////////////////// -static TB_Node* get_block_begin(TB_Node* n) { - while (!is_block_begin(n)) { - n = n->inputs[0]; +// if we see a branch projection, it may either be a BB itself +// or if it enters a REGION directly, then that region is the BB. +static TB_Node* cfg_next_bb_after_cproj(TB_Node* n) { + assert(n->type == TB_PROJ); + return n->users->n->type == TB_REGION ? 
n->users->n : n; +} + +static TB_Node* cfg_next_region_control(TB_Node* n) { + if (n->type != TB_REGION) { + for (User* u = n->users; u; u = u->next) { + if (u->n->type == TB_REGION && u->n->input_count == 1) { + return u->n; + } + } } + return n; } -// shorthand because we use it a lot -static TB_Node* idom(TB_Node* n) { - if (n->type == TB_PROJ) n = n->inputs[0]; +static TB_Node* cfg_next_control(TB_Node* n) { + for (User* u = n->users; u; u = u->next) { + if (cfg_is_control(u->n)) { + return u->n; + } + } - assert(n->type == TB_START || n->type == TB_REGION); - return TB_NODE_GET_EXTRA_T(n, TB_NodeRegion)->dom; + return NULL; } -static int dom_depth(TB_Node* n) { - if (n == NULL) { - return 0; +static TB_Node* get_pred(TB_Node* n, int i) { + TB_Node* base = n; + n = n->inputs[i]; + + if (base->type == TB_REGION && n->type == TB_PROJ) { + TB_Node* parent = n->inputs[0]; + + // start or cprojs with multiple users (it's a BB) will just exit + if (parent->type == TB_START || (parent->type == TB_REGION && n->users->next == NULL)) { + return n; + } + n = parent; } - while (n->type != TB_REGION && n->type != TB_START) { + while (!cfg_is_bb_entry(n)) { n = n->inputs[0]; } + return n; +} - return TB_NODE_GET_EXTRA_T(n, TB_NodeRegion)->dom_depth; +static TB_Node* get_block_begin(TB_Node* n) { + while (!cfg_is_bb_entry(n)) { + n = n->inputs[0]; + } + return n; +} + +static TB_BasicBlock* idom_bb(TB_Passes* p, TB_BasicBlock* bb) { + ptrdiff_t search = nl_map_get(p->scheduled, bb->dom); + return search >= 0 ? p->scheduled[search].v : NULL; +} + +// shorthand because we use it a lot +static TB_Node* idom(TB_CFG* cfg, TB_Node* n) { + if (cfg->node_to_block == NULL) return NULL; + ptrdiff_t search = nl_map_get(cfg->node_to_block, n); + return search >= 0 ? cfg->node_to_block[search].v.dom : NULL; +} + +static int dom_depth(TB_CFG* cfg, TB_Node* n) { + return nl_map_get_checked(cfg->node_to_block, n).dom_depth; } extern thread_local TB_Arena* tmp_arena; @@ -134,10 +307,12 @@ static User* find_users(TB_Passes* restrict p, TB_Node* n) { // CFG // pushes postorder walk into worklist items, also modifies the visited set. 
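Several of the new peepholes earlier in this diff (lattice_dommy, the load control-dependency relaxation) reduce to one query: does block A dominate block B? A tiny standalone sketch of that walk follows, with an invented ToyBlock type standing in for the per-block dominator info kept in TB_BasicBlock / LatticeControl.

typedef struct ToyBlock {
    struct ToyBlock* idom;   // immediate dominator; the entry's idom is itself or NULL
    int dom_depth;
} ToyBlock;

// climbs B's idom chain; B is dominated by A iff the walk reaches A
// before it runs off the top of the tree at the entry block
static int toy_dominates(ToyBlock* a, ToyBlock* b) {
    while (b != 0 && b != a) {
        ToyBlock* up = b->idom;
        if (up == b) return 0;   // reached the entry without meeting a
        b = up;
    }
    return b == a;
}

The dom_depth field kept per block lets the real code bound or balance walks like this; the sketch simply climbs until it hits A or the entry.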
-// some entries will not be START or REGION, instead you'll see -size_t tb_push_postorder(TB_Function* f, Worklist* restrict ws); +TB_CFG tb_compute_rpo(TB_Function* f, TB_Passes* restrict p); +TB_CFG tb_compute_rpo2(TB_Function* f, Worklist* ws, DynArray(TB_Node*)* tmp_stack); +void tb_free_cfg(TB_CFG* cfg); // postorder walk -> dominators -void tb_compute_dominators(TB_Function* f, size_t count, TB_Node** blocks); +void tb_compute_dominators(TB_Function* f, TB_Passes* restrict p, TB_CFG cfg); +void tb_compute_dominators2(TB_Function* f, Worklist* ws, TB_CFG cfg); // Worklist API void worklist_alloc(Worklist* restrict ws, size_t initial_cap); @@ -151,6 +326,8 @@ int worklist_popcount(Worklist* ws); TB_Node* worklist_pop(Worklist* ws); // Local scheduler -void sched_walk(TB_Passes* passes, Worklist* ws, DynArray(PhiVal)* phi_vals, TB_Node* bb, TB_Node* n); +void sched_walk(TB_Passes* passes, Worklist* ws, DynArray(PhiVal)* phi_vals, TB_BasicBlock* bb, TB_Node* n, bool is_end); + +static void push_all_nodes(TB_Passes* restrict passes, Worklist* restrict ws, TB_Function* f); -static void push_all_nodes(Worklist* restrict ws, TB_Node* n); +void tb_pass_schedule(TB_Passes* opt, TB_CFG cfg); diff --git a/tb/src/tb.c b/tb/src/tb.c index 5d045e2b..d52201c9 100644 --- a/tb/src/tb.c +++ b/tb/src/tb.c @@ -350,10 +350,9 @@ void tb_function_set_prototype(TB_Function* f, TB_ModuleSectionHandle section, T f->node_count = 0; f->start_node = tb_alloc_node(f, TB_START, TB_TYPE_TUPLE, 0, extra_size); + f->terminators = dyn_array_create(TB_Node*, 4); + TB_NodeRegion* start = TB_NODE_GET_EXTRA(f->start_node); - start->dom_depth = 0; - start->dom = f->start_node; - start->tag = f->super.name; f->param_count = param_count; f->params = tb_arena_alloc(f->arena, (3+param_count) * sizeof(TB_Node*)); diff --git a/tb/src/tb_builder.c b/tb/src/tb_builder.c index df5cfd6f..5316dc6a 100644 --- a/tb/src/tb_builder.c +++ b/tb/src/tb_builder.c @@ -157,7 +157,6 @@ TB_Node* tb_inst_ptr2int(TB_Function* f, TB_Node* src, TB_DataType dt) { TB_Node* tb_inst_int2float(TB_Function* f, TB_Node* src, TB_DataType dt, bool is_signed) { assert(dt.type == TB_FLOAT); assert(src->dt.type == TB_INT); - assert(src->dt.width == dt.width); if (src->type == TB_INTEGER_CONST) { uint64_t y = TB_NODE_GET_EXTRA_T(src, TB_NodeInt)->value; @@ -222,38 +221,23 @@ TB_Node* tb_inst_get_control(TB_Function* f) { } void tb_inst_unreachable(TB_Function* f) { - TB_Node* n = tb_alloc_node(f, TB_UNREACHABLE, TB_TYPE_VOID, 1, 0); + TB_Node* n = tb_alloc_node(f, TB_UNREACHABLE, TB_TYPE_CONTROL, 1, 0); n->inputs[0] = f->active_control_node; - - TB_Node* bb = tb_get_parent_region(f->active_control_node); - TB_NODE_GET_EXTRA_T(bb, TB_NodeRegion)->end = n; - f->active_control_node = n; - - // return afterwards - tb_inst_ret(f, 0, NULL); + f->active_control_node = NULL; + dyn_array_put(f->terminators, n); } void tb_inst_debugbreak(TB_Function* f) { - TB_Node* n = tb_alloc_node(f, TB_DEBUGBREAK, TB_TYPE_VOID, 1, 0); + TB_Node* n = tb_alloc_node(f, TB_DEBUGBREAK, TB_TYPE_CONTROL, 1, 0); n->inputs[0] = f->active_control_node; f->active_control_node = n; } void tb_inst_trap(TB_Function* f) { - TB_Node* n = tb_alloc_node(f, TB_TRAP, TB_TYPE_VOID, 1, 0); + TB_Node* n = tb_alloc_node(f, TB_TRAP, TB_TYPE_CONTROL, 1, 0); n->inputs[0] = f->active_control_node; - - TB_Node* bb = tb_get_parent_region(f->active_control_node); - TB_NODE_GET_EXTRA_T(bb, TB_NodeRegion)->end = n; - f->active_control_node = n; - - // return afterwards - tb_inst_ret(f, 0, NULL); -} - -TB_Node* 
tb_inst_poison(TB_Function* f) { - TB_Node* n = tb_alloc_node(f, TB_POISON, TB_TYPE_VOID, 1, 0); - return n; + f->active_control_node = NULL; + dyn_array_put(f->terminators, n); } TB_Node* tb_inst_local(TB_Function* f, TB_CharUnits size, TB_CharUnits alignment) { @@ -496,27 +480,21 @@ TB_Node* tb_inst_bswap(TB_Function* f, TB_Node* src) { TB_Node* tb_inst_clz(TB_Function* f, TB_Node* src) { assert(TB_IS_INTEGER_TYPE(src->dt)); - uint64_t bits = tb_ffs(src->dt.data) - 1; - - TB_Node* n = tb_alloc_node(f, TB_CLZ, TB_TYPE_INTN(bits), 2, 0); + TB_Node* n = tb_alloc_node(f, TB_CLZ, TB_TYPE_I32, 2, 0); n->inputs[1] = src; return n; } TB_Node* tb_inst_ctz(TB_Function* f, TB_Node* src) { assert(TB_IS_INTEGER_TYPE(src->dt)); - uint64_t bits = tb_ffs(src->dt.data) - 1; - - TB_Node* n = tb_alloc_node(f, TB_CTZ, TB_TYPE_INTN(bits), 2, 0); + TB_Node* n = tb_alloc_node(f, TB_CTZ, TB_TYPE_I32, 2, 0); n->inputs[1] = src; return n; } TB_Node* tb_inst_popcount(TB_Function* f, TB_Node* src) { assert(TB_IS_INTEGER_TYPE(src->dt)); - uint64_t bits = tb_ffs(src->dt.data) - 1; - - TB_Node* n = tb_alloc_node(f, TB_POPCNT, TB_TYPE_INTN(bits), 2, 0); + TB_Node* n = tb_alloc_node(f, TB_POPCNT, TB_TYPE_I32, 2, 0); n->inputs[1] = src; return n; } @@ -823,9 +801,6 @@ TB_Node* tb_inst_phi2(TB_Function* f, TB_Node* region, TB_Node* a, TB_Node* b) { TB_Node* tb_inst_region(TB_Function* f) { TB_Node* n = tb_alloc_node(f, TB_REGION, TB_TYPE_CONTROL, 0, sizeof(TB_NodeRegion)); TB_NodeRegion* r = TB_NODE_GET_EXTRA(n); - r->postorder_id = -1; - r->dom_depth = -1; // unresolved - r->dom = NULL; TB_Node* phi = tb_alloc_node(f, TB_PHI, TB_TYPE_MEMORY, 1, 0); phi->inputs[0] = n; @@ -850,24 +825,15 @@ static void add_input_late(TB_Function* f, TB_Node* n, TB_Node* in) { size_t old_count = n->input_count; TB_Node** new_inputs = alloc_from_node_arena(f, (old_count + 1) * sizeof(TB_Node*)); - if (n->inputs != NULL) + if (n->inputs != NULL) { memcpy(new_inputs, n->inputs, old_count * sizeof(TB_Node*)); + } new_inputs[old_count] = in; n->inputs = new_inputs; n->input_count = old_count + 1; } -static TB_Node** add_successors(TB_Function* f, TB_Node* terminator, size_t count) { - TB_NodeRegion* bb = TB_NODE_GET_EXTRA(tb_get_parent_region(f->active_control_node)); - bb->end = terminator; - - TB_NodeBranch* br = TB_NODE_GET_EXTRA(terminator); - br->succ_count = count; - br->succ = alloc_from_node_arena(f, count * sizeof(TB_Node*)); - return br->succ; -} - static void add_memory_edge(TB_Function* f, TB_Node* n, TB_Node* mem_state, TB_Node* target) { assert(target->type == TB_REGION); TB_NodeRegion* r = TB_NODE_GET_EXTRA(target); @@ -878,18 +844,15 @@ static void add_memory_edge(TB_Function* f, TB_Node* n, TB_Node* mem_state, TB_N void tb_inst_goto(TB_Function* f, TB_Node* target) { TB_Node* mem_state = peek_mem(f, f->active_control_node); - TB_Node* n = tb_alloc_node(f, TB_BRANCH, TB_TYPE_TUPLE, 1, sizeof(TB_NodeBranch)); - n->inputs[0] = f->active_control_node; // control edge - - TB_Node** succ = add_successors(f, n, 1); - succ[0] = target; + // there's no need for a branch if the path isn't diverging. + TB_Node* n = f->active_control_node; + dyn_array_put(f->terminators, n); f->active_control_node = NULL; - { - TB_Node* cproj = tb__make_proj(f, TB_TYPE_CONTROL, n, 0); - add_input_late(f, target, cproj); - add_memory_edge(f, n, mem_state, target); - } + // just add the edge directly. 
+ assert(n->dt.type == TB_CONTROL); + add_input_late(f, target, n); + add_memory_edge(f, n, mem_state, target); } void tb_inst_if(TB_Function* f, TB_Node* cond, TB_Node* if_true, TB_Node* if_false) { @@ -909,11 +872,10 @@ void tb_inst_if(TB_Function* f, TB_Node* cond, TB_Node* if_true, TB_Node* if_fal } TB_NodeBranch* br = TB_NODE_GET_EXTRA(n); + br->succ_count = 2; br->keys[0] = 0; - TB_Node** succ = add_successors(f, n, 2); - succ[0] = if_true; - succ[1] = if_false; + dyn_array_put(f->terminators, n); f->active_control_node = NULL; } @@ -934,16 +896,12 @@ void tb_inst_branch(TB_Function* f, TB_DataType dt, TB_Node* key, TB_Node* defau } TB_NodeBranch* br = TB_NODE_GET_EXTRA(n); + br->succ_count = 1 + entry_count; FOREACH_N(i, 0, entry_count) { br->keys[i] = entries[i].key; } - TB_Node** succ = add_successors(f, n, 1 + entry_count); - succ[0] = default_label; - FOREACH_N(i, 0, entry_count) { - succ[1 + i] = entries[i].value; - } - + dyn_array_put(f->terminators, n); f->active_control_node = NULL; } @@ -978,7 +936,9 @@ void tb_inst_ret(TB_Function* f, size_t count, TB_Node** values) { } f->stop_node = end; - TB_NODE_SET_EXTRA(region, TB_NodeRegion, .mem_in = mem_phi, .mem_out = mem_phi, .end = end, .tag = "ret"); + TB_NODE_SET_EXTRA(region, TB_NodeRegion, .mem_in = mem_phi, .mem_out = mem_phi, .tag = "ret"); + + dyn_array_put(f->terminators, end); } else { // add to PHIs assert(end->input_count >= 3 + count); @@ -1001,13 +961,7 @@ void tb_inst_ret(TB_Function* f, size_t count, TB_Node** values) { } // basically just tb_inst_goto without the memory PHI (we did it earlier) - TB_Node* region = end->inputs[0]; - TB_Node* n = tb_alloc_node(f, TB_BRANCH, TB_TYPE_TUPLE, 1, sizeof(TB_NodeBranch)); - n->inputs[0] = f->active_control_node; // control edge - - TB_Node** succ = add_successors(f, n, 1); - succ[0] = region; + TB_Node* n = f->active_control_node; f->active_control_node = NULL; - - add_input_late(f, region, tb__make_proj(f, TB_TYPE_CONTROL, n, 0)); + add_input_late(f, end->inputs[0], n); } diff --git a/tb/src/tb_internal.h b/tb/src/tb_internal.h index 1e6e8773..b2582982 100644 --- a/tb/src/tb_internal.h +++ b/tb/src/tb_internal.h @@ -295,6 +295,9 @@ struct TB_Function { // IR allocation TB_Arena* arena; + // used for CFG walk in TB_Passes + DynArray(TB_Node*) terminators; + // IR building TB_Node* active_control_node; TB_Attrib exit_attrib; @@ -357,6 +360,7 @@ struct TB_ThreadInfo { TB_Arena perm_arena; TB_Arena tmp_arena; + TB_Arena type_arena; // live symbols (globals, functions and externals) // we'll be iterating these during object/executable @@ -370,6 +374,11 @@ struct TB_ThreadInfo { TB_CodeRegion* code; // compiled output }; +typedef struct { + size_t count; + TB_External** data; +} ExportList; + struct TB_Module { bool is_jit; @@ -390,6 +399,7 @@ struct TB_Module { TB_Arch target_arch; TB_System target_system; TB_FeatureSet features; + ExportList exports; // This is a hack for windows since they've got this idea // of a _tls_index @@ -567,11 +577,6 @@ inline static bool tb_is_power_of_two(uint64_t x) { TB_Node* tb_alloc_node(TB_Function* f, int type, TB_DataType dt, int input_count, size_t extra); TB_Node* tb__make_proj(TB_Function* f, TB_DataType dt, TB_Node* src, int index); -typedef struct { - size_t count; - TB_External** data; -} ExportList; - ExportList tb_module_layout_sections(TB_Module* m); //////////////////////////////// @@ -609,6 +614,10 @@ static TB_Arena* get_temporary_arena(TB_Module* m) { return &tb_thread_info(m)->tmp_arena; } +static TB_Arena* 
get_type_arena(TB_Module* m) { + return &tb_thread_info(m)->type_arena; +} + static TB_Arena* get_permanent_arena(TB_Module* m) { return &tb_thread_info(m)->perm_arena; } diff --git a/tb/src/tb_platform.h b/tb/src/tb_platform.h index 2401b9e9..3d633df2 100644 --- a/tb/src/tb_platform.h +++ b/tb/src/tb_platform.h @@ -1,13 +1,18 @@ // If you're trying to port TB on to a new platform you'll need to fill in these // functions with their correct behavior. #pragma once - #include -#include "../bdwgc/private/gc/gc.h" -#define tb_platform_heap_alloc(size) GC_malloc(size) -#define tb_platform_heap_realloc(ptr, size) GC_realloc(ptr, size) -#define tb_platform_heap_free(ptr) GC_free(ptr) +#if defined(TB_USE_MIMALLOC) +#include +#define tb_platform_heap_alloc(size) mi_malloc(size) +#define tb_platform_heap_realloc(ptr, size) mi_realloc(ptr, size) +#define tb_platform_heap_free(ptr) mi_free(ptr) +#else +#define tb_platform_heap_alloc(size) malloc(size) +#define tb_platform_heap_free(ptr) free(ptr) +#define tb_platform_heap_realloc(ptr, size) realloc(ptr, size) +#endif //////////////////////////////// // Virtual memory management diff --git a/tb/src/x64/x64.c b/tb/src/x64/x64.c index 961e1a7d..2d72b406 100644 --- a/tb/src/x64/x64.c +++ b/tb/src/x64/x64.c @@ -47,12 +47,19 @@ static size_t emit_epilogue(Ctx* restrict ctx, TB_Node* stop); // initialize register allocator state static void init_regalloc(Ctx* restrict ctx) { // Generate intervals for physical registers - FOREACH_N(i, 0, 16) { - dyn_array_put(ctx->intervals, (LiveInterval){ .reg_class = REG_CLASS_GPR, .dt = TB_X86_TYPE_QWORD, .reg = i, .assigned = i, .hint = -1, .start = INT_MAX, .split_kid = -1 }); - } - - FOREACH_N(i, 0, 16) { - dyn_array_put(ctx->intervals, (LiveInterval){ .reg_class = REG_CLASS_XMM, .dt = TB_X86_TYPE_XMMWORD, .reg = i, .assigned = i, .hint = -1, .start = INT_MAX, .split_kid = -1 }); + FOREACH_N(i, 0, 32) { + DynArray(LiveRange) ranges = dyn_array_create(LiveRange, 8); + dyn_array_put(ranges, (LiveRange){ INT_MAX, INT_MAX }); + + bool is_gpr = i < 16; + int reg = i % 16; + + dyn_array_put(ctx->intervals, (LiveInterval){ + .reg_class = is_gpr ? REG_CLASS_GPR : REG_CLASS_XMM, + .dt = is_gpr ? TB_X86_TYPE_QWORD : TB_X86_TYPE_XMMWORD, + .reg = reg, .assigned = reg, .hint = -1, .split_kid = -1, + .ranges = ranges + }); } } @@ -100,17 +107,7 @@ static TB_X86_DataType legalize_int2(TB_DataType dt) { static TB_X86_DataType legalize_float(TB_DataType dt) { assert(dt.type == TB_FLOAT); - TB_X86_DataType t = (dt.data == TB_FLT_64 ? TB_X86_TYPE_SSE_SD : TB_X86_TYPE_SSE_SS); - - if (dt.data == TB_FLT_64) { - assert(dt.width == 0 || dt.width == 1); - } else if (dt.data == TB_FLT_32) { - assert(dt.width == 0 || dt.width == 2); - } else { - tb_unreachable(); - } - - return t + (dt.width ? 2 : 0); + return (dt.data == TB_FLT_64 ? 
TB_X86_TYPE_SSE_SD : TB_X86_TYPE_SSE_SS); } static TB_X86_DataType legalize(TB_DataType dt) { @@ -137,7 +134,7 @@ static bool is_terminator(int t) { static bool try_for_imm32(Ctx* restrict ctx, TB_Node* n, int32_t* out_x) { if (n->type == TB_INTEGER_CONST) { TB_NodeInt* i = TB_NODE_GET_EXTRA(n); - if (fits_into_int32(i->value)) { + if (i->value == (int32_t)i->value) { *out_x = i->value; return true; } @@ -362,7 +359,6 @@ static Cond isel_cmp(Ctx* restrict ctx, TB_Node* n) { if (n->type >= TB_CMP_EQ && n->type <= TB_CMP_FLE) { TB_DataType cmp_dt = TB_NODE_GET_EXTRA_T(n, TB_NodeCompare)->cmp_dt; - assert(cmp_dt.width == 0 && "TODO: Implement vector compares"); Cond cc = -1; use(ctx, n); @@ -424,7 +420,7 @@ static Cond isel_cmp(Ctx* restrict ctx, TB_Node* n) { } static bool should_rematerialize(TB_Node* n) { - if (n->type == TB_INT2PTR && n->inputs[0]->type == TB_INTEGER_CONST) { + if ((n->type == TB_INT2FLOAT || n->type == TB_INT2PTR) && n->inputs[1]->type == TB_INTEGER_CONST) { return true; } @@ -440,6 +436,13 @@ static void isel(Ctx* restrict ctx, TB_Node* n, const int dst) { case TB_PHI: break; case TB_REGION: break; + case TB_POISON: { + Inst* inst = alloc_inst(INST_INLINE, TB_TYPE_VOID, 1, 0, 0); + inst->operands[0] = dst; + append_inst(ctx, inst); + break; + } + case TB_START: { TB_NodeRegion* start = TB_NODE_GET_EXTRA(n); const TB_FunctionPrototype* restrict proto = ctx->f->prototype; @@ -570,13 +573,16 @@ static void isel(Ctx* restrict ctx, TB_Node* n, const int dst) { x &= (1ull << bits_in_type) - 1; } - if (!fits_into_int32(x)) { - // movabs reg, imm64 - SUBMIT(inst_op_abs(MOVABS, n->dt, dst, x)); - } else if (x == 0) { + if (x == 0) { SUBMIT(inst_op_zero(n->dt, dst)); - } else { + } else if (x == (int32_t) x) { SUBMIT(inst_op_imm(MOV, n->dt, dst, x)); + } else if ((x >> 32ull) == UINT32_MAX) { + // mov but zero ext + SUBMIT(inst_op_imm(MOV, TB_TYPE_I32, dst, x)); + } else { + // movabs reg, imm64 + SUBMIT(inst_op_abs(MOVABS, n->dt, dst, x)); } break; } @@ -604,7 +610,16 @@ static void isel(Ctx* restrict ctx, TB_Node* n, const int dst) { hint_reg(ctx, dst, lhs); int32_t x; - if (try_for_imm32(ctx, n->inputs[2], &x)) { + if (n->inputs[2]->type == TB_LOAD && on_last_use(ctx, n->inputs[2])) { + use(ctx, n->inputs[2]); + + SUBMIT(inst_move(n->dt, dst, lhs)); + + Inst* inst = isel_addr2(ctx, n->inputs[2]->inputs[2], dst, -1, dst); + inst->type = op; + inst->dt = legalize(n->dt); + SUBMIT(inst); + } else if (try_for_imm32(ctx, n->inputs[2], &x)) { use(ctx, n->inputs[2]); SUBMIT(inst_move(n->dt, dst, lhs)); @@ -666,6 +681,38 @@ static void isel(Ctx* restrict ctx, TB_Node* n, const int dst) { break; } + // bit magic + case TB_CTZ: + case TB_CLZ: { + int op = type == TB_CLZ ? 
BSR : BSF; + int lhs = input_reg(ctx, n->inputs[1]); + hint_reg(ctx, dst, lhs); + + // we only wanna deal with 32 or 64 ops for + // this (16 is annoying and 8 is unavailable) + TB_DataType dt = n->dt; + if (dt.data < 64) { + // make sure the bits are zero'd above + if (dt.data < 32) { + assert(type == TB_CLZ && "clz is different, and im stupid"); + SUBMIT(inst_op_zero(TB_TYPE_I32, dst)); + } + + dt.data = 32; + } + + Inst* inst = inst_op_rr(op, dt, dst, lhs); + if (type == TB_CLZ) { + // the difference between bsf and tzcnt + inst->flags |= INST_REP; + } + SUBMIT(inst); + + // flip bits to make CLZ instead of bitscanreverse + SUBMIT(inst_op_rri(XOR, dt, dst, dst, 63)); + break; + } + // bit shifts case TB_SHL: case TB_SHR: @@ -753,7 +800,7 @@ static void isel(Ctx* restrict ctx, TB_Node* n, const int dst) { } case TB_FLOAT32_CONST: { - assert(n->dt.type == TB_FLOAT && n->dt.width == 0); + assert(n->dt.type == TB_FLOAT); uint32_t imm = (Cvt_F32U32) { .f = TB_NODE_GET_EXTRA_T(n, TB_NodeFloat32)->value }.i; if (imm == 0) { @@ -765,7 +812,7 @@ static void isel(Ctx* restrict ctx, TB_Node* n, const int dst) { break; } case TB_FLOAT64_CONST: { - assert(n->dt.type == TB_FLOAT && n->dt.width == 0); + assert(n->dt.type == TB_FLOAT); uint64_t imm = (Cvt_F64U64){ .f = TB_NODE_GET_EXTRA_T(n, TB_NodeFloat64)->value }.i; if (imm == 0) { @@ -819,7 +866,7 @@ static void isel(Ctx* restrict ctx, TB_Node* n, const int dst) { hint_reg(ctx, dst, lhs); SUBMIT(inst_move(n->dt, dst, lhs)); - if (n->inputs[2]->type == TB_LOAD) { + if (n->inputs[2]->type == TB_LOAD && on_last_use(ctx, n->inputs[2])) { use(ctx, n->inputs[2]); Inst* inst = isel_addr2(ctx, n->inputs[2]->inputs[2], dst, -1, dst); @@ -989,7 +1036,7 @@ static void isel(Ctx* restrict ctx, TB_Node* n, const int dst) { TB_Node* param = n->inputs[i]; TB_DataType param_dt = param->dt; - bool use_xmm = TB_IS_FLOAT_TYPE(param_dt) || param_dt.width; + bool use_xmm = TB_IS_FLOAT_TYPE(param_dt); int reg = use_xmm ? 
xmms_used : gprs_used; if (is_sysv) { if (use_xmm) { @@ -1031,7 +1078,7 @@ static void isel(Ctx* restrict ctx, TB_Node* n, const int dst) { FOREACH_N(i, 0, in_count) { TB_DataType dt = n->inputs[3 + i]->dt; - bool use_xmm = TB_IS_FLOAT_TYPE(dt) || dt.width; + bool use_xmm = TB_IS_FLOAT_TYPE(dt); SUBMIT(inst_move(dt, ins[i], param_srcs[i])); // in win64, float params past the vararg cutoff are @@ -1066,7 +1113,7 @@ static void isel(Ctx* restrict ctx, TB_Node* n, const int dst) { } } - bool use_xmm_ret = TB_IS_FLOAT_TYPE(ret_dt) || ret_dt.width; + bool use_xmm_ret = TB_IS_FLOAT_TYPE(ret_dt); if (ret_node != NULL) { if (use_xmm_ret) { caller_saved_xmms &= ~(1ull << XMM0); @@ -1140,7 +1187,20 @@ static void isel(Ctx* restrict ctx, TB_Node* n, const int dst) { case TB_BRANCH: { TB_Node* bb = tb_get_parent_region(n); TB_NodeBranch* br = TB_NODE_GET_EXTRA(n); - TB_Node** succ = br->succ; + + // the arena on the function should also be available at this time, we're + // in the TB_Passes + TB_Arena* arena = ctx->f->arena; + TB_ArenaSavepoint sp = tb_arena_save(arena); + TB_Node** restrict succ = tb_arena_alloc(arena, br->succ_count * sizeof(TB_Node**)); + + // fill successors + for (User* u = n->users; u; u = u->next) { + if (u->n->type == TB_PROJ) { + int index = TB_NODE_GET_EXTRA_T(u->n, TB_NodeProj)->index; + succ[index] = cfg_get_fallthru(u->n); + } + } SUBMIT(alloc_inst(INST_TERMINATOR, TB_TYPE_VOID, 0, 0, 0)); if (br->succ_count == 1) { @@ -1212,6 +1272,7 @@ static void isel(Ctx* restrict ctx, TB_Node* n, const int dst) { } SUBMIT(inst_jmp(succ[0])); } + tb_arena_restore(arena, sp); break; } @@ -1233,7 +1294,7 @@ static void isel(Ctx* restrict ctx, TB_Node* n, const int dst) { } case TB_LOAD: case TB_ATOMIC_LOAD: { - int mov_op = (TB_IS_FLOAT_TYPE(n->dt) || n->dt.width) ? FP_MOV : MOV; + int mov_op = TB_IS_FLOAT_TYPE(n->dt) ? FP_MOV : MOV; TB_Node* addr = n->inputs[2]; Inst* ld_inst = isel_addr2(ctx, addr, dst, -1, -1); @@ -1284,7 +1345,7 @@ static void isel(Ctx* restrict ctx, TB_Node* n, const int dst) { src = src->inputs[2]; } else { - store_op = (TB_IS_FLOAT_TYPE(store_dt) || store_dt.width) ? FP_MOV : MOV; + store_op = TB_IS_FLOAT_TYPE(store_dt) ? FP_MOV : MOV; } int32_t imm; @@ -1512,8 +1573,9 @@ static void print_operand(TB_CGEmitter* restrict e, Val* v, TB_X86_DataType dt) EMITA(e, ".ret"); } else { TB_Node* n = v->target; - assert(n->type == TB_START || n->type == TB_REGION); - EMITA(e, "L%d", TB_NODE_GET_EXTRA_T(n, TB_NodeRegion)->postorder_id); + + int id = nl_map_get_checked(muh_______cfg->node_to_block, n).id; + EMITA(e, ".bb%d", id); } break; } @@ -1605,7 +1667,7 @@ static int resolve_interval(Ctx* restrict ctx, Inst* inst, int i, Val* val) { return 1; } -static void emit_code(Ctx* restrict ctx, TB_FunctionOutput* restrict func_out) { +static void emit_code(Ctx* restrict ctx, TB_FunctionOutput* restrict func_out, int end) { TB_CGEmitter* e = &ctx->emit; // resolve stack usage @@ -1627,10 +1689,11 @@ static void emit_code(Ctx* restrict ctx, TB_FunctionOutput* restrict func_out) { Inst* prev_line = NULL; for (Inst* restrict inst = ctx->first; inst; inst = inst->next) { size_t in_base = inst->out_count; - InstCategory cat = inst->type >= (sizeof inst_table / sizeof *inst_table) ? INST_BINOP : inst_table[inst->type].cat; + size_t inst_table_size = sizeof(inst_table) / sizeof(*inst_table); + InstCategory cat = inst->type >= inst_table_size ? INST_BINOP : inst_table[inst->type].cat; if (0) { - EMITA(e, " \x1b[32m# %s t=%d { outs:", inst->type < sizeof inst_table / sizeof *inst_table ? 
inst_table[inst->type].mnemonic : "???", inst->time); + EMITA(e, " \x1b[32m# %s t=%d { outs:", inst->type < inst_table_size ? inst_table[inst->type].mnemonic : "???", inst->time); FOREACH_N(i, 0, inst->out_count) { EMITA(e, " v%d", inst->operands[i]); } @@ -1648,22 +1711,22 @@ static void emit_code(Ctx* restrict ctx, TB_FunctionOutput* restrict func_out) { uint32_t pos = GET_CODE_POS(&ctx->emit); tb_resolve_rel32(&ctx->emit, &nl_map_get_checked(ctx->emit.labels, bb), pos); - if (bb != ctx->f->start_node) { - assert(bb->type == TB_REGION); - EMITA(e, "L%d:\n", TB_NODE_GET_EXTRA_T(bb, TB_NodeRegion)->postorder_id); + int id = nl_map_get_checked(ctx->cfg.node_to_block, bb).id; + if (id > 0) { + EMITA(e, ".bb%d:\n", id); } } else if (inst->type == INST_INLINE) { - TB_NodeMachineOp* mach = TB_NODE_GET_EXTRA(inst->n); + if (inst->n) { + TB_NodeMachineOp* mach = TB_NODE_GET_EXTRA(inst->n); - EMITA(&ctx->emit, " INLINE MACHINE CODE:"); - FOREACH_N(i, 0, mach->length) { - EMITA(&ctx->emit, " %#02x", mach->data[i]); + EMITA(&ctx->emit, " INLINE MACHINE CODE:"); + FOREACH_N(i, 0, mach->length) { + EMITA(&ctx->emit, " %#02x", mach->data[i]); + } + EMITA(&ctx->emit, "\n"); } - EMITA(&ctx->emit, "\n"); } else if (inst->type == INST_EPILOGUE) { - // return label goes here - EMITA(&ctx->emit, ".ret:\n"); - tb_resolve_rel32(&ctx->emit, &ctx->emit.return_label, GET_CODE_POS(&ctx->emit)); + // just a marker for regalloc } else if (inst->type == INST_LINE) { TB_Function* f = ctx->f; TB_Attrib* loc = inst->a; @@ -1734,6 +1797,11 @@ static void emit_code(Ctx* restrict ctx, TB_FunctionOutput* restrict func_out) { EMIT1(e, 0xF0); } + if (inst->flags & INST_REP) { + EMITA(e, " REP"); + EMIT1(e, 0xF3); + } + // resolve output Val out; int i = 0; @@ -1809,7 +1877,9 @@ static void emit_code(Ctx* restrict ctx, TB_FunctionOutput* restrict func_out) { } } - emit_epilogue(ctx, ctx->f->stop_node); + if (end >= 0) { + emit_epilogue(ctx, ctx->f->stop_node); + } // pad to 16bytes static const uint8_t nops[8][8] = { diff --git a/tb/src/x64/x64_disasm.c b/tb/src/x64/x64_disasm.c index e6d835d7..c17e7479 100644 --- a/tb/src/x64/x64_disasm.c +++ b/tb/src/x64/x64_disasm.c @@ -167,7 +167,7 @@ bool tb_x86_disasm(TB_X86_Inst* restrict inst, size_t length, const uint8_t* dat // immediates might use RX for an extended opcode if (uses_imm) { - __debugbreak(); + tb_todo(); } else { int8_t real_rx = ((rex & 4 ? 
8 : 0) | rx); if (rex == 0 && inst->data_type == TB_X86_TYPE_BYTE && real_rx >= 4) { diff --git a/tb/src/x64/x64_emitter.h b/tb/src/x64/x64_emitter.h index 3dafbbde..f0e60951 100644 --- a/tb/src/x64/x64_emitter.h +++ b/tb/src/x64/x64_emitter.h @@ -124,11 +124,8 @@ static void inst1(TB_CGEmitter* restrict e, InstType type, const Val* r, TB_X86_ EMIT1(e, inst->op); EMIT4(e, 0); - if (r->target != NULL) { - tb_emit_rel32(e, &nl_map_get_checked(e->labels, r->target), GET_CODE_POS(e) - 4); - } else { - tb_emit_rel32(e, &e->return_label, GET_CODE_POS(e) - 4); - } + assert(r->target != NULL); + tb_emit_rel32(e, &nl_map_get_checked(e->labels, r->target), GET_CODE_POS(e) - 4); } else { tb_unreachable(); } @@ -142,7 +139,7 @@ static void inst2(TB_CGEmitter* restrict e, InstType type, const Val* a, const V if (type == MOVABS) { assert(a->type == VAL_GPR && b->type == VAL_ABS); - EMIT1(e, rex(true, a->reg, 0, 0)); + EMIT1(e, rex(true, 0, a->reg, 0)); EMIT1(e, inst->op + (a->reg & 0b111)); EMIT8(e, b->abs); return; diff --git a/tb/src/x64/x64_insts.inc b/tb/src/x64/x64_insts.inc index 0497a271..b39ffb23 100644 --- a/tb/src/x64/x64_insts.inc +++ b/tb/src/x64/x64_insts.inc @@ -72,6 +72,10 @@ X(CMOVGE, "cmovge", BINOP_EXT, 0x4D) X(CMOVLE, "cmovle", BINOP_EXT, 0x4E) X(CMOVG, "cmovg", BINOP_EXT, 0x4F) +// bitmagic +X(BSF, "bsf", BINOP_EXT, 0xBC) +X(BSR, "bsr", BINOP_EXT, 0xBD) + // binary ops but they have an implicit CL on the righthand side X(SHL, "shl", BINOP_CL, 0xD2, 0xC0, 0x04) X(SHR, "shr", BINOP_CL, 0xD2, 0xC0, 0x05) diff --git a/tb/unittests/tb_test_exit_status.inc b/tb/unittests/tb_test_exit_status.inc new file mode 100644 index 00000000..eac737bb --- /dev/null +++ b/tb/unittests/tb_test_exit_status.inc @@ -0,0 +1,11 @@ +#include "util.inc" + +static int test_exit_status(void) { + TB_TEST_MODULE_BEGIN_; + + TB_Node *exit_value = tb_inst_sint(f_main, TB_TYPE_I32, 42); + EXIT_WITH_(exit_value); + + TB_TEST_MODULE_END_(test_exit_status, 42, 1); + return status; +} diff --git a/tb/unittests/tb_test_int_arith.inc b/tb/unittests/tb_test_int_arith.inc new file mode 100644 index 00000000..e44c6307 --- /dev/null +++ b/tb/unittests/tb_test_int_arith.inc @@ -0,0 +1,67 @@ +#include "util.inc" + +#define TEST_INT_ARITH_(prefix_, type_, inst_type_, inst_op_, arg0_, \ + arg1_, res_) \ + static int test_##prefix_##_##inst_op_(void) { \ + TB_TEST_MODULE_BEGIN_; \ + \ + TB_Node *foo = tb_inst_##inst_type_(f_main, TB_TYPE_##type_, \ + (arg0_)); \ + TB_Node *bar = tb_inst_##inst_type_(f_main, TB_TYPE_##type_, \ + (arg1_)); \ + TB_Node *sum = tb_inst_##inst_op_(f_main, foo, bar, \ + TB_ARITHMATIC_NONE); \ + \ + EXIT_WITH_(sum); \ + \ + TB_TEST_MODULE_END_(test_##prefix_##_##inst_op_, (res_), 0); \ + return status; \ + } + +TEST_INT_ARITH_(i8, I8, sint, add, 50, -8, 42) +TEST_INT_ARITH_(i8, I8, sint, sub, 20, -10, 30) +TEST_INT_ARITH_(i8, I8, sint, mul, 7, 9, 63) +TEST_INT_ARITH_(i8, I8, sint, div, 100, 11, 9) +TEST_INT_ARITH_(i8, I8, sint, mod, 100, 11, 1) + +TEST_INT_ARITH_(i16, I16, sint, add, 300, -240, 60) +TEST_INT_ARITH_(i16, I16, sint, sub, 1000, 934, 66) +TEST_INT_ARITH_(i16, I16, sint, mul, 9, 8, 72) +TEST_INT_ARITH_(i16, I16, sint, div, 999, 112, 8) +TEST_INT_ARITH_(i16, I16, sint, mod, 999, 112, 103) + +TEST_INT_ARITH_(i32, I32, sint, add, 300, -240, 60) +TEST_INT_ARITH_(i32, I32, sint, sub, 1000, 934, 66) +TEST_INT_ARITH_(i32, I32, sint, mul, 9, 8, 72) +TEST_INT_ARITH_(i32, I32, sint, div, 999, 112, 8) +TEST_INT_ARITH_(i32, I32, sint, mod, 999, 112, 103) + +TEST_INT_ARITH_(i64, I64, sint, add, 300, -240, 
60) +TEST_INT_ARITH_(i64, I64, sint, sub, 1000, 934, 66) +TEST_INT_ARITH_(i64, I64, sint, mul, 9, 8, 72) +TEST_INT_ARITH_(i64, I64, sint, div, 999, 112, 8) +TEST_INT_ARITH_(i64, I64, sint, mod, 999, 112, 103) + +TEST_INT_ARITH_(u8, I8, uint, add, 50, 8, 58) +TEST_INT_ARITH_(u8, I8, uint, sub, 30, 10, 20) +TEST_INT_ARITH_(u8, I8, uint, mul, 7, 9, 63) +TEST_INT_ARITH_(u8, I8, uint, div, 100, 11, 9) +TEST_INT_ARITH_(u8, I8, uint, mod, 100, 11, 1) + +TEST_INT_ARITH_(u16, I16, uint, add, 30, 50, 80) +TEST_INT_ARITH_(u16, I16, uint, sub, 1000, 934, 66) +TEST_INT_ARITH_(u16, I16, uint, mul, 9, 8, 72) +TEST_INT_ARITH_(u16, I16, uint, div, 999, 112, 8) +TEST_INT_ARITH_(u16, I16, uint, mod, 999, 112, 103) + +TEST_INT_ARITH_(u32, I32, uint, add, 50, 40, 90) +TEST_INT_ARITH_(u32, I32, uint, sub, 1000, 934, 66) +TEST_INT_ARITH_(u32, I32, uint, mul, 9, 8, 72) +TEST_INT_ARITH_(u32, I32, uint, div, 999, 112, 8) +TEST_INT_ARITH_(u32, I32, uint, mod, 999, 112, 103) + +TEST_INT_ARITH_(u64, I64, uint, add, 20, 25, 45) +TEST_INT_ARITH_(u64, I64, uint, sub, 1000, 934, 66) +TEST_INT_ARITH_(u64, I64, uint, mul, 9, 8, 72) +TEST_INT_ARITH_(u64, I64, uint, div, 999, 112, 8) +TEST_INT_ARITH_(u64, I64, uint, mod, 999, 112, 103) diff --git a/tb/unittests/tb_test_regressions.inc b/tb/unittests/tb_test_regressions.inc new file mode 100644 index 00000000..f21bbcf9 --- /dev/null +++ b/tb/unittests/tb_test_regressions.inc @@ -0,0 +1,26 @@ +#include "util.inc" + +static int test_regression_module_arena(void) { + tb_module_destroy(tb_module_create(tb_test_arch, tb_test_system, + &tb_test_feature_set, 0)); + tb_module_destroy(tb_module_create(tb_test_arch, tb_test_system, + &tb_test_feature_set, 0)); + + // We're testing for segfault. + return 1; +} + +static int test_regression_link_global(void) { + TB_Module *module = tb_module_create(tb_test_arch, tb_test_system, + &tb_test_feature_set, 0); + TB_Global *global = tb_global_create(module, -1, "global", NULL, + TB_LINKAGE_PRIVATE); + tb_global_set_storage(module, tb_module_get_rdata(module), global, + 8, 8, 1); + TB_Linker *linker = tb_linker_create(tb_test_exe_type, + tb_test_arch); + tb_linker_append_module(linker, module); + tb_module_destroy(module); + tb_linker_destroy(linker); + return 1; +} diff --git a/tb/unittests/tb_unittests.c b/tb/unittests/tb_unittests.c new file mode 100644 index 00000000..bed9d4e6 --- /dev/null +++ b/tb/unittests/tb_unittests.c @@ -0,0 +1,88 @@ +#include +#include "tb_test_regressions.inc" +#include "tb_test_exit_status.inc" +#include "tb_test_int_arith.inc" + +#define TEST(proc_) \ +do { \ + fflush(stdout); \ + printf("%s\r", #proc_); \ + fflush(stdout); \ + int status_ = test_##proc_(); \ + fflush(stdout); \ + printf("%s%.*s ", #proc_, (int) (41 - sizeof(#proc_)), \ + " ........................................"); \ + if (status_) \ + printf("OK\n"); \ + else { \ + printf("FAILED\n"); \ + failed++; \ + } \ + total++; \ + fflush(stdout); \ +} while (0) + +int main(int argc, char **argv) { + int failed = 0, total = 0; + + TEST(regression_module_arena); + TEST(regression_link_global); + TEST(exit_status); + + TEST(i8_add); + TEST(i8_sub); + TEST(i8_mul); + TEST(i8_div); + TEST(i8_mod); + + TEST(i16_add); + TEST(i16_sub); + TEST(i16_mul); + TEST(i16_div); + TEST(i16_mod); + + TEST(i32_add); + TEST(i32_sub); + TEST(i32_mul); + TEST(i32_div); + TEST(i32_mod); + + TEST(i64_add); + TEST(i64_sub); + TEST(i64_mul); + TEST(i64_div); + TEST(i64_mod); + + TEST(u8_add); + TEST(u8_sub); + TEST(u8_mul); + TEST(u8_div); + TEST(u8_mod); + + TEST(u16_add); + 
TEST(u16_sub); + TEST(u16_mul); + TEST(u16_div); + TEST(u16_mod); + + TEST(u32_add); + TEST(u32_sub); + TEST(u32_mul); + TEST(u32_div); + TEST(u32_mod); + + TEST(u64_add); + TEST(u64_sub); + TEST(u64_mul); + TEST(u64_div); + TEST(u64_mod); + + fflush(stdout); + if (failed > 0) + printf("\n%d of %d tests failed.\n", failed, total); + else + printf("\nAll %d tests succeeded.\n", total); + fflush(stdout); + + return failed; +} diff --git a/tb/unittests/util.inc b/tb/unittests/util.inc new file mode 100644 index 00000000..5b3fdb7c --- /dev/null +++ b/tb/unittests/util.inc @@ -0,0 +1,209 @@ +#ifndef TB_TEST_UTIL_INC +#define TB_TEST_UTIL_INC + +#if defined(__GCC__) || defined(__clang__) +# pragma GCC diagnostic ignored "-Wunknown-pragmas" +# pragma GCC diagnostic ignored "-Wunused-function" +#endif + +#include "../include/tb.h" + +#include +#include + +#if defined(_WIN32) && !defined(__CYGWIN__) +# define TB_TEST_IS_WINDOWS_ 1 +# define WEXITSTATUS(x) x + +static TB_System tb_test_system = TB_SYSTEM_WINDOWS; +static TB_ExecutableType tb_test_exe_type = TB_EXECUTABLE_PE; +#else +# define TB_TEST_IS_WINDOWS_ 0 + +# include + +static TB_System tb_test_system = TB_SYSTEM_LINUX; +static TB_ExecutableType tb_test_exe_type = TB_EXECUTABLE_ELF; +#endif + +static TB_Arch tb_test_arch = TB_ARCH_X86_64; +static TB_FeatureSet tb_test_feature_set = { .x64 = 0 }; + +static void tb_test_link_library(TB_Linker *linker, + char const *name) { +#if TB_TEST_IS_WINDOWS_ + // FIXME + // Find and link library on Windows. + // + + char full_path[200] = "C:\\Program Files (x86)\\Windows " + "Kits\\10\\Lib\\10.0.14393.0\\um\\x64\\"; + + ptrdiff_t folder_len = strlen(full_path); + ptrdiff_t name_len = strlen(name); + + assert(folder_len + name_len + 1 < sizeof full_path); + memcpy(full_path + folder_len, name, name_len); + full_path[folder_len + name_len] = '\0'; + + FILE *f = fopen(full_path, "rb"); + + assert(f != NULL); + if (f == NULL) + return; + + ptrdiff_t chunk_size = 100000; + ptrdiff_t data_size = 0; + + uint8_t *data = NULL; + uint8_t *p = data; + + while (!feof(f)) { + data = realloc(data, data_size + chunk_size); + assert(data != NULL); + if (data == NULL) + return; + + ptrdiff_t n = fread(data + data_size, 1, chunk_size, f); + if (n <= 0) + break; + data_size += n; + } + + fclose(f); + + TB_Slice sl_name = { .length = name_len, + .data = (uint8_t const *) name }; + TB_Slice sl_data = { .length = data_size, .data = data }; + tb_linker_append_library(linker, sl_name, sl_data); + + free(data); +#endif + + // NOTE + // We don't need to link libraries for unit-testing on Linux yet. 
+ // +} + +#define ERROR(x) \ + do { \ + printf("Error in %s (\"%s\" line %d): " #x "\n", __FUNCTION__, \ + __FILE__, (int) __LINE__); \ + status = 0; \ + goto _final; \ + } while (0) + +#if TB_TEST_IS_WINDOWS_ + +# define EXIT_WITH_(node_status_) \ + do { \ + TB_PrototypeParam param0 = { .dt = TB_TYPE_I32, \ + .name = "uExitCode" }; \ + TB_FunctionPrototype *fp_ExitProcess = tb_prototype_create( \ + module, TB_STDCALL, 1, ¶m0, 0, NULL, 0); \ + \ + TB_Node *addr_ExitProcess = tb_inst_get_symbol_address( \ + f_main, \ + (TB_Symbol *) tb_extern_create(module, -1, "ExitProcess", \ + TB_EXTERNAL_SO_LOCAL)); \ + \ + tb_inst_call(f_main, fp_ExitProcess, addr_ExitProcess, 1, \ + &(node_status_)); \ + tb_inst_ret(f_main, 0, NULL); \ + } while (0) + +#else + +# define EXIT_WITH_(node_status_) \ + do { \ + TB_Node *num_ = tb_inst_sint(f_main, TB_TYPE_I32, 60); \ + tb_inst_syscall(f_main, TB_TYPE_I64, num_, 1, \ + &(node_status_)); \ + tb_inst_ret(f_main, 0, NULL); \ + } while (0) + +#endif + +#define TB_TEST_MODULE_BEGIN_ \ + int status = 1; \ + int ret = 0; \ + \ + TB_Module *module = NULL; \ + TB_Linker *linker = NULL; \ + \ + module = tb_module_create(tb_test_arch, tb_test_system, \ + &tb_test_feature_set, 0); \ + \ + if (module == NULL) \ + ERROR("tb_module_create failed."); \ + \ + TB_FunctionPrototype *fp_main = tb_prototype_create( \ + module, TB_CDECL, 0, NULL, 0, NULL, false); \ + \ + TB_Function *f_main = tb_function_create( \ + module, -1, "main", TB_LINKAGE_PUBLIC); \ + \ + TB_ModuleSectionHandle text = tb_module_get_text(module); \ + tb_function_set_prototype(f_main, text, fp_main, NULL); \ + \ + if (f_main == NULL) \ + ERROR("tb_function_create failed."); + +#define TB_TEST_MODULE_END_(name_, result_, print_asm_) \ + { \ + TB_SymbolIter it = tb_symbol_iter(module); \ + TB_Symbol* sym; \ + while (sym = tb_symbol_iter_next(&it), sym) { \ + if (sym->tag == TB_SYMBOL_FUNCTION) { \ + TB_Function *f = (TB_Function*) sym; \ + TB_Passes *passes = tb_pass_enter(f, NULL); \ + \ + if (passes == NULL) \ + ERROR("tb_pass_enter failed."); \ + \ + TB_FunctionOutput *asm_out = tb_pass_codegen(passes, 1); \ + \ + if ((print_asm_) && asm_out != NULL) { \ + printf("\n"); \ + tb_output_print_asm(asm_out, stdout); \ + } \ + \ + tb_pass_exit(passes); \ + } \ + } \ + } \ + \ + linker = tb_linker_create(tb_test_exe_type, tb_test_arch); \ + \ + if (linker == NULL) \ + ERROR("tb_linker_create failed."); \ + \ + tb_linker_append_module(linker, module); \ + \ + tb_linker_set_entrypoint(linker, "main"); \ + \ + if (TB_TEST_IS_WINDOWS_) \ + tb_test_link_library(linker, "kernel32.lib"); \ + \ + TB_ExportBuffer buf = tb_linker_export(linker); \ + tb_export_buffer_to_file(buf, "bin/" #name_); \ + tb_export_buffer_free(buf); \ + \ + if (!TB_TEST_IS_WINDOWS_) { \ + (void) system("chmod a+x bin/" #name_); \ + ret = WEXITSTATUS(system("./bin/" #name_)); \ + } else \ + ret = WEXITSTATUS(system("start bin\\" #name_)); \ + \ + if (ret != (result_)) { \ + printf("Got %d, expected %d\n", (int) ret, (int) (result_)); \ + status = 0; \ + } \ + \ +_final: \ + if (module != NULL) \ + tb_module_destroy(module); \ + if (linker != NULL) \ + tb_linker_destroy(linker); + +#endif diff --git a/test/fib40.lua b/test/fib40.lua new file mode 100644 index 00000000..faae15f0 --- /dev/null +++ b/test/fib40.lua @@ -0,0 +1,10 @@ + +local function fib(n) + if n < 2 then + return n + else + return fib(n-1) + fib(n-2) + end +end + +print(fib(40)) diff --git a/test/fib5.paka b/test/fib5.paka new file mode 100644 index 00000000..c0105f34 --- 
/dev/null +++ b/test/fib5.paka @@ -0,0 +1,10 @@ + +def fib(n) { + if n < 2 { + return n + } else { + return fib(n-1) + fib(n-2) + } +} + +env.io.debug(fib(5)) diff --git a/vm/jit/tb.c b/vm/jit/tb.c index 6a3c8a94..604f015b 100644 --- a/vm/jit/tb.c +++ b/vm/jit/tb.c @@ -296,7 +296,7 @@ TB_Node *vm_tb_func_body(vm_tb_state_t *state, TB_Function *fun, TB_Node **args, func, rblock->block->nargs + 1, call_args); - + tb_inst_ret(fun, 0, NULL); tb_inst_set_control(fun, ctrl); @@ -305,19 +305,6 @@ TB_Node *vm_tb_func_body(vm_tb_state_t *state, TB_Function *fun, TB_Node **args, } TB_Node *vm_tb_func_body_call(vm_tb_state_t *state, TB_Function *fun, TB_Node **args, vm_rblock_t *rblock) { - TB_Module *module = state->module; - - TB_PrototypeParam comp_args[2] = { - {TB_TYPE_PTR}, - {TB_TYPE_PTR}, - }; - - TB_PrototypeParam comp_ret[1] = { - {TB_TYPE_PTR}, - }; - - TB_FunctionPrototype *comp_proto = tb_prototype_create(state->module, VM_TB_CC, 2, comp_args, 1, comp_ret, false); - TB_Node *comp_params[2]; comp_params[0] = tb_inst_uint(fun, TB_TYPE_PTR, (uint64_t)state); @@ -681,7 +668,6 @@ TB_Node *vm_tb_func_body_once(vm_tb_state_t *state, TB_Function *fun, TB_Node ** TB_SwitchEntry keys[VM_TAG_MAX - 1]; for (size_t i = 1; i < VM_TAG_MAX; i++) { - keys[i - 1].key = i; // vm_block_t *next_block = vm_tb_rblock_version(branch.rtargets[i]); TB_Node **next_args = vm_malloc(sizeof(TB_Node *) * branch.targets[0]->nargs); for (size_t j = 0; j < branch.targets[0]->nargs; j++) { @@ -700,6 +686,7 @@ TB_Node *vm_tb_func_body_once(vm_tb_state_t *state, TB_Function *fun, TB_Node ** next_args[j] = vm_tb_func_read_arg(fun, regs, next_arg); } } + keys[i - 1].key = i; keys[i - 1].value = vm_tb_func_body(state, fun, next_args, branch.rtargets[i]); } @@ -862,7 +849,7 @@ void *vm_tb_rfunc_comp(vm_tb_state_t *state, vm_rblock_t *rblock) { fprintf(stdout, "\n--- tb ---\n"); tb_pass_print(passes); #endif - tb_pass_mem2reg(passes); + // tb_pass_mem2reg(passes); tb_pass_optimize(passes); #if defined(VM_DUMP_TB_OPT) fprintf(stdout, "\n--- opt tb ---\n");
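
For reference, the TEST_INT_ARITH_ macro added in tb/unittests/tb_test_int_arith.inc is purely mechanical: each instantiation defines one self-contained test function that builds a tiny module, applies a single arithmetic op to two constants, and exits the generated program with the result. A rough expansion of TEST_INT_ARITH_(i8, I8, sint, add, 50, -8, 42) should look like the sketch below; this is only an illustrative expansion, and names such as f_main, status, EXIT_WITH_ and the TB_TEST_MODULE_* helpers come from util.inc above.

static int test_i8_add(void) {
    TB_TEST_MODULE_BEGIN_;

    // build the two constant operands as signed 8-bit integers
    TB_Node *foo = tb_inst_sint(f_main, TB_TYPE_I8, (50));
    TB_Node *bar = tb_inst_sint(f_main, TB_TYPE_I8, (-8));

    // 50 + (-8) == 42; the harness compares this against the exit status
    TB_Node *sum = tb_inst_add(f_main, foo, bar, TB_ARITHMATIC_NONE);

    EXIT_WITH_(sum);

    TB_TEST_MODULE_END_(test_i8_add, (42), 0);
    return status;
}

The driver in tb/unittests/tb_unittests.c then runs it through TEST(i8_add), which prints the test name and counts the returned status toward the pass/fail totals.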