diff --git a/.vscode/settings.json b/.vscode/settings.json index 8d0be481..7e42f2d1 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -12,6 +12,7 @@ "xtree": "c", "xutility": "c", "tb_internal.h": "c", - "gc.h": "c" + "gc.h": "c", + "windows.h": "c" } } \ No newline at end of file diff --git a/c11threads/threads.h b/c11threads/threads.h new file mode 100644 index 00000000..ead6e87b --- /dev/null +++ b/c11threads/threads.h @@ -0,0 +1,151 @@ +/* + * C11 emulation library + * + * (C) Copyright yohhoy 2012. + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://www.boost.org/LICENSE_1_0.txt) + */ +#ifndef EMULATED_THREADS_H_INCLUDED_ +#define EMULATED_THREADS_H_INCLUDED_ + +#include + +#if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#include + +// check configuration +#if defined(EMULATED_THREADS_USE_NATIVE_CALL_ONCE) && (_WIN32_WINNT < 0x0600) +#error EMULATED_THREADS_USE_NATIVE_CALL_ONCE requires _WIN32_WINNT>=0x0600 +#endif + +#if defined(EMULATED_THREADS_USE_NATIVE_CV) && (_WIN32_WINNT < 0x0600) +#error EMULATED_THREADS_USE_NATIVE_CV requires _WIN32_WINNT>=0x0600 +#endif + +/*---------------------------- macros ----------------------------*/ +#ifdef EMULATED_THREADS_USE_NATIVE_CALL_ONCE +#define ONCE_FLAG_INIT INIT_ONCE_STATIC_INIT +#else +#define ONCE_FLAG_INIT \ +{ 0 } +#endif +#define TSS_DTOR_ITERATIONS 1 + +#ifndef thread_local +#define thread_local _Thread_local +#endif + +/*---------------------------- types ----------------------------*/ +typedef struct cnd_t { + #ifdef EMULATED_THREADS_USE_NATIVE_CV + CONDITION_VARIABLE condvar; + #else + int blocked; + int gone; + int to_unblock; + HANDLE sem_queue; + HANDLE sem_gate; + CRITICAL_SECTION monitor; + #endif +} cnd_t; + +typedef HANDLE thrd_t; + +typedef DWORD tss_t; + +typedef struct mtx_t { + CRITICAL_SECTION cs; +} mtx_t; + +#ifdef EMULATED_THREADS_USE_NATIVE_CALL_ONCE +typedef INIT_ONCE once_flag; +#else +typedef struct once_flag_t { + volatile LONG status; +} once_flag; +#endif + +#elif defined(__unix__) || defined(__unix) || defined(__APPLE__) +#include + +/*---------------------------- macros ----------------------------*/ +#define ONCE_FLAG_INIT PTHREAD_ONCE_INIT +#ifdef INIT_ONCE_STATIC_INIT +#define TSS_DTOR_ITERATIONS PTHREAD_DESTRUCTOR_ITERATIONS +#else +#define TSS_DTOR_ITERATIONS 1 // assume TSS dtor MAY be called at least once. +#endif + +/*---------------------------- types ----------------------------*/ +typedef pthread_cond_t cnd_t; +typedef pthread_t thrd_t; +typedef pthread_key_t tss_t; +typedef pthread_mutex_t mtx_t; +typedef pthread_once_t once_flag; + +#else +#error Not supported on this platform. 
+#endif + +/*---------------------------- types ----------------------------*/ +typedef void (*tss_dtor_t)(void*); +typedef int (*thrd_start_t)(void*); + +struct xtime { + time_t sec; + long nsec; +}; +typedef struct xtime xtime; + +/*-------------------- enumeration constants --------------------*/ +enum { + mtx_plain = 0, + mtx_try = 1, + mtx_timed = 2, + mtx_recursive = 4 +}; + +enum { + thrd_success = 0, // succeeded + thrd_timeout, // timeout + thrd_error, // failed + thrd_busy, // resource busy + thrd_nomem // out of memory +}; + +/*-------------------------- functions --------------------------*/ +void call_once(once_flag* flag, void (*func)(void)); + +int cnd_broadcast(cnd_t* cond); +void cnd_destroy(cnd_t* cond); +int cnd_init(cnd_t* cond); +int cnd_signal(cnd_t* cond); +int cnd_timedwait(cnd_t* cond, mtx_t* mtx, const xtime* xt); +int cnd_wait(cnd_t* cond, mtx_t* mtx); + +void mtx_destroy(mtx_t* mtx); +int mtx_init(mtx_t* mtx, int type); +int mtx_lock(mtx_t* mtx); +int mtx_timedlock(mtx_t* mtx, const xtime* xt); +int mtx_trylock(mtx_t* mtx); +int mtx_unlock(mtx_t* mtx); + +int thrd_create(thrd_t* thr, thrd_start_t func, void* arg); +thrd_t thrd_current(void); +int thrd_detach(thrd_t thr); +int thrd_equal(thrd_t thr0, thrd_t thr1); +void thrd_exit(int res); +int thrd_join(thrd_t thr, int* res); +void thrd_sleep(const xtime* xt); +void thrd_yield(void); + +int tss_create(tss_t* key, tss_dtor_t dtor); +void tss_delete(tss_t key); +void* tss_get(tss_t key); +int tss_set(tss_t key, void* val); + +int xtime_get(xtime* xt, int base); +#define TIME_UTC 1 + +#endif /* EMULATED_THREADS_H_INCLUDED_ */ diff --git a/c11threads/threads_msvc.c b/c11threads/threads_msvc.c new file mode 100644 index 00000000..6fc621ad --- /dev/null +++ b/c11threads/threads_msvc.c @@ -0,0 +1,460 @@ +/* + * C11 emulation library + * + * (C) Copyright yohhoy 2012. + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://www.boost.org/LICENSE_1_0.txt) + */ +#include +#include +#include +#include // MSVCRT +#include + +/* +Configuration macro: + + EMULATED_THREADS_USE_NATIVE_CALL_ONCE + Use native WindowsAPI one-time initialization function. + (requires WinVista or later) + Otherwise emulate by mtx_trylock() + *busy loop* for WinXP. + + EMULATED_THREADS_USE_NATIVE_CV + Use native WindowsAPI condition variable object. + (requires WinVista or later) + Otherwise use emulated implementation for WinXP. + + EMULATED_THREADS_TSS_DTOR_SLOTNUM + Max registerable TSS dtor number. +*/ +#if _WIN32_WINNT >= 0x0600 +// Prefer native WindowsAPI on newer environment. +#define EMULATED_THREADS_USE_NATIVE_CALL_ONCE +#define EMULATED_THREADS_USE_NATIVE_CV +#endif +#define EMULATED_THREADS_TSS_DTOR_SLOTNUM 64 // see TLS_MINIMUM_AVAILABLE + +#include "threads.h" + +/* +Implementation limits: + - Conditionally emulation for "Initialization functions" + (see EMULATED_THREADS_USE_NATIVE_CALL_ONCE macro) + - Emulated `mtx_timelock()' with mtx_trylock() + *busy loop* +*/ +static void impl_tss_dtor_invoke(); // forward decl. 
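/*
 * Quick usage sketch of the <threads.h>-style API declared above. This is
 * illustrative only: the `worker`, `counter` and `run` names are made up for
 * the example and are not part of the library.
 *
 *     static mtx_t lock;
 *     static int counter;
 *
 *     static int worker(void* arg) {
 *         (void)arg;
 *         mtx_lock(&lock);
 *         counter += 1;
 *         mtx_unlock(&lock);
 *         return 0;
 *     }
 *
 *     int run(void) {
 *         thrd_t t;
 *         int res;
 *         if (mtx_init(&lock, mtx_plain) != thrd_success) return -1;
 *         if (thrd_create(&t, worker, NULL) != thrd_success) return -1;
 *         worker(NULL);          // do some work on this thread too
 *         thrd_join(&t, &res);   // wait and collect the worker's return value
 *         mtx_destroy(&lock);
 *         return counter;        // 2 on success
 *     }
 */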
+ +struct impl_thrd_param { + thrd_start_t func; + void* arg; +}; + +static unsigned __stdcall impl_thrd_routine(void* p) { + struct impl_thrd_param pack; + int code; + memcpy(&pack, p, sizeof(struct impl_thrd_param)); + free(p); + code = pack.func(pack.arg); + impl_tss_dtor_invoke(); + return (unsigned)code; +} + +static DWORD impl_xtime2msec(const xtime* xt) { + return (DWORD)((xt->sec * 1000u) + (xt->nsec / 1000000)); +} + +#ifdef EMULATED_THREADS_USE_NATIVE_CALL_ONCE +struct impl_call_once_param { + void (*func)(void); +}; +static BOOL CALLBACK impl_call_once_callback(PINIT_ONCE InitOnce, PVOID Parameter, PVOID* Context) { + struct impl_call_once_param* param = (struct impl_call_once_param*)Parameter; + (param->func)(); + ((void)InitOnce); + ((void)Context); // suppress warning + return TRUE; +} +#endif // ifdef EMULATED_THREADS_USE_NATIVE_CALL_ONCE + +#ifndef EMULATED_THREADS_USE_NATIVE_CV +/* +Note: + The implementation of condition variable is ported from Boost.Interprocess + See http://www.boost.org/boost/interprocess/sync/windows/condition.hpp +*/ +static void impl_cond_do_signal(cnd_t* cond, int broadcast) { + int nsignal = 0; + + EnterCriticalSection(&cond->monitor); + if (cond->to_unblock != 0) { + if (cond->blocked == 0) { + LeaveCriticalSection(&cond->monitor); + return; + } + if (broadcast) { + cond->to_unblock += nsignal = cond->blocked; + cond->blocked = 0; + } else { + nsignal = 1; + cond->to_unblock++; + cond->blocked--; + } + } else if (cond->blocked > cond->gone) { + WaitForSingleObject(cond->sem_gate, INFINITE); + if (cond->gone != 0) { + cond->blocked -= cond->gone; + cond->gone = 0; + } + if (broadcast) { + nsignal = cond->to_unblock = cond->blocked; + cond->blocked = 0; + } else { + nsignal = cond->to_unblock = 1; + cond->blocked--; + } + } + LeaveCriticalSection(&cond->monitor); + + if (0 < nsignal) + ReleaseSemaphore(cond->sem_queue, nsignal, NULL); +} + +static int impl_cond_do_wait(cnd_t* cond, mtx_t* mtx, const xtime* xt) { + int nleft = 0; + int ngone = 0; + int timeout = 0; + DWORD w; + + WaitForSingleObject(cond->sem_gate, INFINITE); + cond->blocked++; + ReleaseSemaphore(cond->sem_gate, 1, NULL); + + mtx_unlock(mtx); + + w = WaitForSingleObject(cond->sem_queue, xt ? impl_xtime2msec(xt) : INFINITE); + timeout = (w == WAIT_TIMEOUT); + + EnterCriticalSection(&cond->monitor); + if ((nleft = cond->to_unblock) != 0) { + if (timeout) { + if (cond->blocked != 0) { + cond->blocked--; + } else { + cond->gone++; + } + } + if (--cond->to_unblock == 0) { + if (cond->blocked != 0) { + ReleaseSemaphore(cond->sem_gate, 1, NULL); + nleft = 0; + } else if ((ngone = cond->gone) != 0) { + cond->gone = 0; + } + } + } else if (++cond->gone == INT_MAX / 2) { + WaitForSingleObject(cond->sem_gate, INFINITE); + cond->blocked -= cond->gone; + ReleaseSemaphore(cond->sem_gate, 1, NULL); + cond->gone = 0; + } + LeaveCriticalSection(&cond->monitor); + + if (nleft == 1) { + while (ngone--) + WaitForSingleObject(cond->sem_queue, INFINITE); + ReleaseSemaphore(cond->sem_gate, 1, NULL); + } + + mtx_lock(mtx); + return timeout ? 
thrd_busy : thrd_success; +} +#endif // ifndef EMULATED_THREADS_USE_NATIVE_CV + +static struct impl_tss_dtor_entry { + tss_t key; + tss_dtor_t dtor; +} impl_tss_dtor_tbl[EMULATED_THREADS_TSS_DTOR_SLOTNUM]; + +static int impl_tss_dtor_register(tss_t key, tss_dtor_t dtor) { + int i; + for (i = 0; i < EMULATED_THREADS_TSS_DTOR_SLOTNUM; i++) { + if (!impl_tss_dtor_tbl[i].dtor) + break; + } + if (i == EMULATED_THREADS_TSS_DTOR_SLOTNUM) + return 1; + impl_tss_dtor_tbl[i].key = key; + impl_tss_dtor_tbl[i].dtor = dtor; + return 0; +} + +static void impl_tss_dtor_invoke() { + int i; + for (i = 0; i < EMULATED_THREADS_TSS_DTOR_SLOTNUM; i++) { + if (impl_tss_dtor_tbl[i].dtor) { + void* val = tss_get(impl_tss_dtor_tbl[i].key); + if (val) + (impl_tss_dtor_tbl[i].dtor)(val); + } + } +} + +/*--------------- 7.25.2 Initialization functions ---------------*/ +// 7.25.2.1 +void call_once(once_flag* flag, void (*func)(void)) { + assert(flag && func); +#ifdef EMULATED_THREADS_USE_NATIVE_CALL_ONCE + { + struct impl_call_once_param param; + param.func = func; + InitOnceExecuteOnce(flag, impl_call_once_callback, (PVOID)¶m, NULL); + } +#else + if (InterlockedCompareExchange(&flag->status, 1, 0) == 0) { + (func)(); + InterlockedExchange(&flag->status, 2); + } else { + while (flag->status == 1) { + // busy loop! + thrd_yield(); + } + } +#endif +} + +/*------------- 7.25.3 Condition variable functions -------------*/ +// 7.25.3.1 +int cnd_broadcast(cnd_t* cond) { + if (!cond) return thrd_error; +#ifdef EMULATED_THREADS_USE_NATIVE_CV + WakeAllConditionVariable(&cond->condvar); +#else + impl_cond_do_signal(cond, 1); +#endif + return thrd_success; +} + +// 7.25.3.2 +void cnd_destroy(cnd_t* cond) { + assert(cond); +#ifdef EMULATED_THREADS_USE_NATIVE_CV + // do nothing +#else + CloseHandle(cond->sem_queue); + CloseHandle(cond->sem_gate); + DeleteCriticalSection(&cond->monitor); +#endif +} + +// 7.25.3.3 +int cnd_init(cnd_t* cond) { + if (!cond) return thrd_error; +#ifdef EMULATED_THREADS_USE_NATIVE_CV + InitializeConditionVariable(&cond->condvar); +#else + cond->blocked = 0; + cond->gone = 0; + cond->to_unblock = 0; + cond->sem_queue = CreateSemaphore(NULL, 0, LONG_MAX, NULL); + cond->sem_gate = CreateSemaphore(NULL, 1, 1, NULL); + InitializeCriticalSection(&cond->monitor); +#endif + return thrd_success; +} + +// 7.25.3.4 +int cnd_signal(cnd_t* cond) { + if (!cond) return thrd_error; +#ifdef EMULATED_THREADS_USE_NATIVE_CV + WakeConditionVariable(&cond->condvar); +#else + impl_cond_do_signal(cond, 0); +#endif + return thrd_success; +} + +// 7.25.3.5 +int cnd_timedwait(cnd_t* cond, mtx_t* mtx, const xtime* xt) { + if (!cond || !mtx || !xt) return thrd_error; +#ifdef EMULATED_THREADS_USE_NATIVE_CV + if (SleepConditionVariableCS(&cond->condvar, &mtx->cs, impl_xtime2msec(xt))) + return thrd_success; + return (GetLastError() == ERROR_TIMEOUT) ? 
thrd_busy : thrd_error; +#else + return impl_cond_do_wait(cond, mtx, xt); +#endif +} + +// 7.25.3.6 +int cnd_wait(cnd_t* cond, mtx_t* mtx) { + if (!cond || !mtx) return thrd_error; +#ifdef EMULATED_THREADS_USE_NATIVE_CV + SleepConditionVariableCS(&cond->condvar, &mtx->cs, INFINITE); +#else + impl_cond_do_wait(cond, mtx, NULL); +#endif + return thrd_success; +} + +/*-------------------- 7.25.4 Mutex functions --------------------*/ +// 7.25.4.1 +void mtx_destroy(mtx_t* mtx) { + assert(mtx); + DeleteCriticalSection(&mtx->cs); +} + +// 7.25.4.2 +int mtx_init(mtx_t* mtx, int type) { + if (!mtx) return thrd_error; + if (type != mtx_plain && type != mtx_timed && type != mtx_try && type != (mtx_plain | mtx_recursive) && type != (mtx_timed | mtx_recursive) && type != (mtx_try | mtx_recursive)) + return thrd_error; + InitializeCriticalSection(&mtx->cs); + return thrd_success; +} + +// 7.25.4.3 +int mtx_lock(mtx_t* mtx) { + if (!mtx) return thrd_error; + EnterCriticalSection(&mtx->cs); + return thrd_success; +} + +// 7.25.4.4 +int mtx_timedlock(mtx_t* mtx, const xtime* xt) { + time_t expire, now; + if (!mtx || !xt) return thrd_error; + expire = time(NULL); + expire += xt->sec; + while (mtx_trylock(mtx) != thrd_success) { + now = time(NULL); + if (expire < now) + return thrd_busy; + // busy loop! + thrd_yield(); + } + return thrd_success; +} + +// 7.25.4.5 +int mtx_trylock(mtx_t* mtx) { + if (!mtx) return thrd_error; + return TryEnterCriticalSection(&mtx->cs) ? thrd_success : thrd_busy; +} + +// 7.25.4.6 +int mtx_unlock(mtx_t* mtx) { + if (!mtx) return thrd_error; + LeaveCriticalSection(&mtx->cs); + return thrd_success; +} + +/*------------------- 7.25.5 Thread functions -------------------*/ +// 7.25.5.1 +int thrd_create(thrd_t* thr, thrd_start_t func, void* arg) { + struct impl_thrd_param* pack; + uintptr_t handle; + if (!thr) return thrd_error; + pack = malloc(sizeof(struct impl_thrd_param)); + if (!pack) return thrd_nomem; + pack->func = func; + pack->arg = arg; + handle = _beginthreadex(NULL, 0, impl_thrd_routine, pack, 0, NULL); + if (handle == 0) { + if (errno == EAGAIN || errno == EACCES) + return thrd_nomem; + return thrd_error; + } + *thr = (thrd_t)handle; + return thrd_success; +} + +// 7.25.5.2 +thrd_t thrd_current(void) { + return GetCurrentThread(); +} + +// 7.25.5.3 +int thrd_detach(thrd_t thr) { + CloseHandle(thr); + return thrd_success; +} + +// 7.25.5.4 +int thrd_equal(thrd_t thr0, thrd_t thr1) { + return (thr0 == thr1); +} + +// 7.25.5.5 +void thrd_exit(int res) { + impl_tss_dtor_invoke(); + _endthreadex((unsigned)res); +} + +// 7.25.5.6 +int thrd_join(thrd_t thr, int* res) { + DWORD w, code; + w = WaitForSingleObject(thr, INFINITE); + if (w != WAIT_OBJECT_0) + return thrd_error; + if (res) { + if (!GetExitCodeThread(thr, &code)) { + CloseHandle(thr); + return thrd_error; + } + *res = (int)code; + } + CloseHandle(thr); + return thrd_success; +} + +// 7.25.5.7 +void thrd_sleep(const xtime* xt) { + assert(xt); + Sleep(impl_xtime2msec(xt)); +} + +// 7.25.5.8 +void thrd_yield(void) { + SwitchToThread(); +} + +/*----------- 7.25.6 Thread-specific storage functions -----------*/ +// 7.25.6.1 +int tss_create(tss_t* key, tss_dtor_t dtor) { + if (!key) return thrd_error; + *key = TlsAlloc(); + if (dtor) { + if (impl_tss_dtor_register(*key, dtor)) { + TlsFree(*key); + return thrd_error; + } + } + return (*key != 0xFFFFFFFF) ? 
thrd_success : thrd_error; +} + +// 7.25.6.2 +void tss_delete(tss_t key) { + TlsFree(key); +} + +// 7.25.6.3 +void* tss_get(tss_t key) { + return TlsGetValue(key); +} + +// 7.25.6.4 +int tss_set(tss_t key, void* val) { + return TlsSetValue(key, val) ? thrd_success : thrd_error; +} + +/*-------------------- 7.25.7 Time functions --------------------*/ +// 7.25.6.1 +int xtime_get(xtime* xt, int base) { + if (!xt) return 0; + if (base == TIME_UTC) { + xt->sec = time(NULL); + xt->nsec = 0; + return base; + } + return 0; +} diff --git a/c11threads/threads_posix.c b/c11threads/threads_posix.c new file mode 100644 index 00000000..c206c5cd --- /dev/null +++ b/c11threads/threads_posix.c @@ -0,0 +1,271 @@ +/* + * C11 emulation library + * + * (C) Copyright yohhoy 2012. + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://www.boost.org/LICENSE_1_0.txt) + */ +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* +Configuration macro: + + EMULATED_THREADS_USE_NATIVE_TIMEDLOCK + Use pthread_mutex_timedlock() for `mtx_timedlock()' + Otherwise use mtx_trylock() + *busy loop* emulation. +*/ +#if !defined(__CYGWIN__) && !defined(__APPLE__) +#define EMULATED_THREADS_USE_NATIVE_TIMEDLOCK +#endif + + +#include "threads.h" + +/* +Implementation limits: + - Conditionally emulation for "mutex with timeout" + (see EMULATED_THREADS_USE_NATIVE_TIMEDLOCK macro) +*/ +struct impl_thrd_param { + thrd_start_t func; + void* arg; +}; + +void* impl_thrd_routine(void* p) { + struct impl_thrd_param pack = *((struct impl_thrd_param*)p); + free(p); + return (void*)((size_t)pack.func(pack.arg)); +} + +/*--------------- 7.25.2 Initialization functions ---------------*/ +// 7.25.2.1 +void call_once(once_flag* flag, void (*func)(void)) { + pthread_once(flag, func); +} + +/*------------- 7.25.3 Condition variable functions -------------*/ +// 7.25.3.1 +int cnd_broadcast(cnd_t* cond) { + if (!cond) return thrd_error; + pthread_cond_broadcast(cond); + return thrd_success; +} + +// 7.25.3.2 +void cnd_destroy(cnd_t* cond) { + assert(cond); + pthread_cond_destroy(cond); +} + +// 7.25.3.3 +int cnd_init(cnd_t* cond) { + if (!cond) return thrd_error; + pthread_cond_init(cond, NULL); + return thrd_success; +} + +// 7.25.3.4 +int cnd_signal(cnd_t* cond) { + if (!cond) return thrd_error; + pthread_cond_signal(cond); + return thrd_success; +} + +// 7.25.3.5 +int cnd_timedwait(cnd_t* cond, mtx_t* mtx, const xtime* xt) { + struct timespec abs_time; + int rt; + if (!cond || !mtx || !xt) return thrd_error; + rt = pthread_cond_timedwait(cond, mtx, &abs_time); + if (rt == ETIMEDOUT) + return thrd_busy; + return (rt == 0) ? 
thrd_success : thrd_error; +} + +// 7.25.3.6 +int cnd_wait(cnd_t* cond, mtx_t* mtx) { + if (!cond || !mtx) return thrd_error; + pthread_cond_wait(cond, mtx); + return thrd_success; +} + +/*-------------------- 7.25.4 Mutex functions --------------------*/ +// 7.25.4.1 +void mtx_destroy(mtx_t* mtx) { + assert(mtx); + pthread_mutex_destroy(mtx); +} + +// 7.25.4.2 +int mtx_init(mtx_t* mtx, int type) { + pthread_mutexattr_t attr; + if (!mtx) return thrd_error; + if (type != mtx_plain && type != mtx_timed && type != mtx_try && type != (mtx_plain | mtx_recursive) && type != (mtx_timed | mtx_recursive) && type != (mtx_try | mtx_recursive)) + return thrd_error; + pthread_mutexattr_init(&attr); + if ((type & mtx_recursive) != 0) { +#if defined(__linux__) || defined(__linux) + pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE_NP); +#else + pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE); +#endif + } + pthread_mutex_init(mtx, &attr); + pthread_mutexattr_destroy(&attr); + return thrd_success; +} + +// 7.25.4.3 +int mtx_lock(mtx_t* mtx) { + if (!mtx) return thrd_error; + pthread_mutex_lock(mtx); + return thrd_success; +} + +// 7.25.4.4 +int mtx_timedlock(mtx_t* mtx, const xtime* xt) { + if (!mtx || !xt) return thrd_error; + { +#ifdef EMULATED_THREADS_USE_NATIVE_TIMEDLOCK + struct timespec ts; + int rt; + ts.tv_sec = xt->sec; + ts.tv_nsec = xt->nsec; + rt = pthread_mutex_timedlock(mtx, &ts); + if (rt == 0) + return thrd_success; + return (rt == ETIMEDOUT) ? thrd_busy : thrd_error; +#else + time_t expire = time(NULL); + expire += xt->sec; + while (mtx_trylock(mtx) != thrd_success) { + time_t now = time(NULL); + if (expire < now) + return thrd_busy; + // busy loop! + thrd_yield(); + } + return thrd_success; +#endif + } +} + +// 7.25.4.5 +int mtx_trylock(mtx_t* mtx) { + if (!mtx) return thrd_error; + return (pthread_mutex_trylock(mtx) == 0) ? thrd_success : thrd_busy; +} + +// 7.25.4.6 +int mtx_unlock(mtx_t* mtx) { + if (!mtx) return thrd_error; + pthread_mutex_unlock(mtx); + return thrd_success; +} + +/*------------------- 7.25.5 Thread functions -------------------*/ +// 7.25.5.1 +int thrd_create(thrd_t* thr, thrd_start_t func, void* arg) { + struct impl_thrd_param* pack; + if (!thr) return thrd_error; + pack = malloc(sizeof(struct impl_thrd_param)); + if (!pack) return thrd_nomem; + pack->func = func; + pack->arg = arg; + + pthread_attr_t attr; + pthread_attr_init(&attr); + + if (pthread_create(thr, &attr, impl_thrd_routine, pack) != 0) { + free(pack); + return thrd_error; + } + return thrd_success; +} + +// 7.25.5.2 +thrd_t thrd_current(void) { + return pthread_self(); +} + +// 7.25.5.3 +int thrd_detach(thrd_t thr) { + return (pthread_detach(thr) == 0) ? thrd_success : thrd_error; +} + +// 7.25.5.4 +int thrd_equal(thrd_t thr0, thrd_t thr1) { + return pthread_equal(thr0, thr1); +} + +// 7.25.5.5 +void thrd_exit(int res) { + pthread_exit((void*)((size_t)res)); +} + +// 7.25.5.6 +int thrd_join(thrd_t thr, int* res) { + void* code; + if (pthread_join(thr, &code) != 0) + return thrd_error; + if (res) + *res = (int)((size_t)code); + return thrd_success; +} + +// 7.25.5.7 +void thrd_sleep(const xtime* xt) { + struct timespec req; + assert(xt); + req.tv_sec = xt->sec; + req.tv_nsec = xt->nsec; + nanosleep(&req, NULL); +} + +// 7.25.5.8 +void thrd_yield(void) { + sched_yield(); +} + +/*----------- 7.25.6 Thread-specific storage functions -----------*/ +// 7.25.6.1 +int tss_create(tss_t* key, tss_dtor_t dtor) { + if (!key) return thrd_error; + return (pthread_key_create(key, dtor) == 0) ? 
thrd_success : thrd_error; +} + +// 7.25.6.2 +void tss_delete(tss_t key) { + pthread_key_delete(key); +} + +// 7.25.6.3 +void* tss_get(tss_t key) { + return pthread_getspecific(key); +} + +// 7.25.6.4 +int tss_set(tss_t key, void* val) { + return (pthread_setspecific(key, val) == 0) ? thrd_success : thrd_error; +} + +/*-------------------- 7.25.7 Time functions --------------------*/ +// 7.25.6.1 +int xtime_get(xtime* xt, int base) { + if (!xt) return 0; + if (base == TIME_UTC) { + xt->sec = time(NULL); + xt->nsec = 0; + return base; + } + return 0; +} diff --git a/common/common.c b/common/common.c index acee5cee..90060589 100644 --- a/common/common.c +++ b/common/common.c @@ -53,17 +53,13 @@ void* cuik__valloc(size_t size) { // round size to page size size = (size + cuik__page_mask) & ~cuik__page_mask; - void *ret = VirtualAlloc(NULL, size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); + return VirtualAlloc(NULL, size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); #else cuik__page_size = 4096; cuik__page_mask = 4095; - void *ret = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + return mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); #endif - - // GC_add_roots(ret, (char *) ret + size); - - return ret; } void cuik__vfree(void* ptr, size_t size) { diff --git a/common/common.h b/common/common.h index b1b3ccd7..069bba12 100644 --- a/common/common.h +++ b/common/common.h @@ -6,20 +6,27 @@ #include #include +// Cuik currently uses mimalloc so we wrap those calls here +#ifdef CUIK_USE_MIMALLOC +#include + +#define cuik_malloc(size) mi_malloc(size) +#define cuik_calloc(count, size) mi_calloc(count, size) +#define cuik_free(ptr) mi_free(ptr) +#define cuik_realloc(ptr, size) mi_realloc(ptr, size) +#define cuik_strdup(x) mi_strdup(x) +#else +#define cuik_malloc(size) malloc(size) +#define cuik_calloc(count, size) calloc(count, size) +#define cuik_free(size) free(size) +#define cuik_realloc(ptr, size) realloc(ptr, size) + #ifdef _WIN32 -#define WIN32_LEAN_AND_MEAN -#include -#include +#define cuik_strdup(x) _strdup(x) +#else +#define cuik_strdup(x) strdup(x) +#endif #endif - -#include "../bdwgc/private/gc/gc.h" - -// Cuik currently uses mimalloc so we wrap those calls here -#define cuik_malloc(size) GC_malloc(size) -#define cuik_calloc(count, size) GC_malloc((count) * (size)) -#define cuik_free(ptr) GC_free(ptr) -#define cuik_realloc(ptr, size) GC_realloc(ptr, size) -#define cuik_strdup(x) GC_strdup(x) #if defined(__amd64) || defined(__amd64__) || defined(_M_AMD64) || defined(__x86_64__) || defined(__x86_64) #define CUIK__IS_X64 1 @@ -50,13 +57,6 @@ #define LIKELY(x) __builtin_expect(!!(x), 1) #define UNLIKELY(x) __builtin_expect(!!(x), 0) -#ifndef _MSC_VER -#include -#if defined(__debugbreak) -#define __debugbreak() raise(5 /* SIGTRAP */) -#endif -#endif - #ifdef NDEBUG #define TODO() __builtin_unreachable() #else diff --git a/common/hash_map.h b/common/hash_map.h index 9f042c6d..7c92ab30 100644 --- a/common/hash_map.h +++ b/common/hash_map.h @@ -9,13 +9,19 @@ #include #include -#include "../bdwgc/private/gc/gc.h" - -#define cuik_malloc(size) GC_malloc(size) -#define cuik_calloc(count, size) GC_malloc((count) * (size)) -#define cuik_free(ptr) GC_free(ptr) -#define cuik_realloc(ptr, size) GC_realloc(ptr, size) -#define cuik_strdup(x) GC_strdup(x) +#if defined(TB_USE_MIMALLOC) || defined(CUIK_USE_MIMALLOC) +#include + +#define NL_MALLOC(s) mi_malloc(s) +#define NL_CALLOC(c, s) mi_calloc(c, s) +#define NL_REALLOC(p, s) mi_realloc(p, s) +#define 
NL_FREE(p) mi_free(p) +#else +#define NL_MALLOC(s) malloc(s) +#define NL_CALLOC(c, s) calloc(c, s) +#define NL_REALLOC(p, s) realloc(p, s) +#define NL_FREE(p) free(p) +#endif #define NL_Map(K, V) struct { K k; V v; }* #define NL_Strmap(T) struct { NL_Slice k; T v; }* @@ -134,7 +140,7 @@ inline static uint32_t nl_map__raw_hash(size_t len, const void *key) { } void nl_map__free(NL_MapHeader* restrict table) { - cuik_free(table); + NL_FREE(table); } NL_MapHeader* nl_map__alloc(size_t cap, size_t entry_size) { @@ -150,7 +156,7 @@ NL_MapHeader* nl_map__alloc(size_t cap, size_t entry_size) { cap = (cap == 1 ? 1 : 1 << exp); - NL_MapHeader* table = cuik_calloc(1, sizeof(NL_MapHeader) + (cap * entry_size)); + NL_MapHeader* table = NL_CALLOC(1, sizeof(NL_MapHeader) + (cap * entry_size)); table->exp = exp; table->count = 0; return table; diff --git a/common/hash_set.h b/common/hash_set.h index 5124bf98..fc91aee7 100644 --- a/common/hash_set.h +++ b/common/hash_set.h @@ -26,12 +26,14 @@ bool nl_hashset_remove(NL_HashSet* restrict hs, void* ptr); bool nl_hashset_put(NL_HashSet* restrict hs, void* ptr); size_t nl_hashset_lookup(NL_HashSet* restrict hs, void* ptr); +void* nl_hashset_get2(NL_HashSet* restrict hs, void* ptr, NL_HashFunc hash, NL_CompareFunc cmp); + // this one takes a custom hash function void* nl_hashset_put2(NL_HashSet* restrict hs, void* ptr, NL_HashFunc hash, NL_CompareFunc cmp); void nl_hashset_remove2(NL_HashSet* restrict hs, void* ptr, NL_HashFunc hash, NL_CompareFunc cmp); #define nl_hashset_capacity(hs) (1ull << (hs)->exp) -#define nl_hashset_for(it, hs) for (void **it = (hs)->data, **_end_ = &it[nl_hashset_capacity(hs)]; it != _end_; it++) if (*it != NULL) +#define nl_hashset_for(it, hs) for (void **it = (hs)->data, **_end_ = &it[nl_hashset_capacity(hs)]; it != _end_; it++) if (*it != NULL && *it != NL_HASHSET_TOMB) #endif /* NL_HASH_SET_H */ @@ -159,6 +161,27 @@ void nl_hashset_remove2(NL_HashSet* restrict hs, void* ptr, NL_HashFunc hash, NL } while (i != first); } +void* nl_hashset_get2(NL_HashSet* restrict hs, void* ptr, NL_HashFunc hash, NL_CompareFunc cmp) { + uint32_t h = hash(ptr); + + size_t mask = (1 << hs->exp) - 1; + size_t first = h & mask, i = first; + + do { + if (hs->data[i] == NULL) { + return NULL; + } else if (hs->data[i] == NL_HASHSET_TOMB) { + // go past it + } else if (hs->data[i] == ptr || cmp(hs->data[i], ptr)) { + return hs->data[i]; + } + + i = (i + 1) & mask; + } while (i != first); + + return NULL; +} + // returns old value void* nl_hashset_put2(NL_HashSet* restrict hs, void* ptr, NL_HashFunc hash, NL_CompareFunc cmp) { uint32_t h = hash(ptr); diff --git a/common/hashes.h b/common/hashes.h index 2f757eff..8508e73b 100644 --- a/common/hashes.h +++ b/common/hashes.h @@ -4,12 +4,13 @@ // murmur3 32-bit without UB unaligned accesses // https://github.com/demetri/scribbles/blob/master/hashing/ub_aware_hash_functions.c static uint32_t tb__murmur3_32(const void* key, size_t len) { + const uint32_t* key32 = key; uint32_t h = 0; // main body, work on 32-bit blocks at a time for (size_t i=0;i> 17))*0x1b873593; diff --git a/common/log.c b/common/log.c index 14d9119f..5c72cc37 100644 --- a/common/log.c +++ b/common/log.c @@ -25,7 +25,13 @@ #include "log.h" #include -#if defined(_POSIX_C_SOURCE) +#ifdef _WIN32 +#ifdef _POSIX_C_SOURCE +__declspec(dllimport) unsigned int GetCurrentThreadId(void); +#else +__declspec(dllimport) unsigned long GetCurrentThreadId(void); +#endif +#else #include #endif diff --git a/makefile b/makefile index 4fc36488..6a26e170 100644 --- 
a/makefile +++ b/makefile @@ -15,7 +15,7 @@ GC_OBJS = $(GC_SRCS:%.c=$(OBJ_DIR)/%.o) STD_SRCS := $(shell find vm/std/libs -name '*.c') OPT_SRCS := $(shell find vm/opt -name '*.c') -ALL_SRCS = vm/ir.c vm/std/std.c vm/lib.c vm/type.c vm/lang/paka.c vm/obj.c vm/jit/tb.c $(STD_SRCS) $(OPT_SRCS) $(EXTRA_SRCS) +ALL_SRCS = vm/ir.c vm/std/std.c vm/lib.c vm/type.c vm/lang/paka.c vm/obj.c vm/jit/tb.c $(STD_SRCS) $(OPT_SRCS) $(EXTRA_SRCS) ALL_OBJS = $(ALL_SRCS:%.c=$(OBJ_DIR)/%.o) # TB_SRCS := common/common.c common/perf.c tb/src/libtb.c tb/src/x64/x64.c c11threads/threads_msvc.c diff --git a/tb/.editorconfig b/tb/.editorconfig new file mode 100644 index 00000000..70fdd549 --- /dev/null +++ b/tb/.editorconfig @@ -0,0 +1,11 @@ +# EditorConfig is awesome: https://EditorConfig.org + +# top-most EditorConfig file +root = true + +# Unix-style newlines with a newline ending every file +[*] +end_of_line = crlf +insert_final_newline = true +indent_style = tab +indent_size = 4 diff --git a/tb/.gitignore b/tb/.gitignore new file mode 100644 index 00000000..cca3fcc3 --- /dev/null +++ b/tb/.gitignore @@ -0,0 +1,26 @@ +.vs/ +.tup/ +bin/ +negate/ + +tildebackend.lib +tildebackend.a + +*.cache +*.swp +*.out +*.o +*.obj +*.exe +*.pdb + +tup.config +debug.bat +build.bat +run.bat +run4coder.bat +project.4coder +tb.rdbg +build.ninja +.ninja_deps +.ninja_log diff --git a/tb/LICENSE.txt b/tb/LICENSE.txt new file mode 100644 index 00000000..b731914f --- /dev/null +++ b/tb/LICENSE.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Yasser Arguelles Snape + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/tb/NOTES.txt b/tb/NOTES.txt new file mode 100644 index 00000000..cc0208e8 --- /dev/null +++ b/tb/NOTES.txt @@ -0,0 +1,17 @@ +# Optimizer crap + + some of the optimizations i should probably worry about are proving when numbers can't + overflow, like induction vars: + +``` +for { + i = phi(0, j) + // even if n is TOP, i must be at least TOP and + // after all additions on the PHI... which means + // no overflow + if i >= n break + ... + // next + j = i + 1 +} +``` diff --git a/tb/README.txt b/tb/README.txt new file mode 100644 index 00000000..7307205f --- /dev/null +++ b/tb/README.txt @@ -0,0 +1,22 @@ +TildeBackend (Tilde or TB for short) + + TB is compiler backend in the form of a reasonable C library. This is built as an alternative to other larger compiler toolchains while providing the optimizations, machine code generation and object file export functionality necessary for the development of compilers. 
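  To give a feel for the API, here is a minimal sketch of the AOT flow, mirroring the
  snippets in man/GUIDE.md (error handling and the actual function building are elided):

```c
#include "tb.h"

int main(void) {
    TB_FeatureSet features = { 0 };
    TB_Module* mod = tb_module_create_for_host(TB_ARCH_X86_64, TB_SYSTEM_WINDOWS,
                                               TB_DEBUGFMT_NONE, &features);

    // ... build functions with the builder API, run the optimizer ...

    TB_ModuleExporter* e = tb_make_exporter(mod, TB_FLAVOR_OBJECT);
    if (!tb_exporter_to_file(e, mod, "hello.obj")) {
        return 1; // failed to export
    }
    return 0;
}
```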
+ + # Roadmap + + Code generation: + We're starting with x64 but will be moving focus to Aarch64 soon. + + Optimizer: + It's almost complete with all the -O1 level passes (mostly missing inlining). + After that we can move towards -O2 level stuff (the goal is to compete with + LLVM so we need to be a bit ambitious). + + Debug info: + Codeview support and DWARF has not yet been started, there's plans on making a + new debug info format eventually. + + Output targets: + We currently have basic ELF64, COFF64, some current work is being done for + PE and Macho-O. We got exporting object files but wanna go further because + linkers ain't supposed to be separate programs. diff --git a/tb/include/tb.h b/tb/include/tb.h index b193f3c0..da021f81 100644 --- a/tb/include/tb.h +++ b/tb/include/tb.h @@ -4,9 +4,13 @@ // SSA - single static assignment // GVN - global value numbering // CSE - common subexpression elimination +// CFG - control flow graph // DSE - dead store elimination // GCM - global code motion // SROA - scalar replacement of aggregates +// SCCP - sparse conditional constant propagation +// RPO - reverse postorder +// BB - basic block #ifndef TB_CORE_H #define TB_CORE_H @@ -21,7 +25,7 @@ // https://semver.org/ #define TB_VERSION_MAJOR 0 -#define TB_VERSION_MINOR 2 +#define TB_VERSION_MINOR 3 #define TB_VERSION_PATCH 0 #ifndef TB_API @@ -154,14 +158,14 @@ typedef enum TB_DataTypeEnum { TB_FLOAT, // Pointers TB_PTR, - // Tuples, these cannot be used in memory ops, just accessed via projections - TB_TUPLE, // represents control flow for REGION, BRANCH TB_CONTROL, // represents memory (and I/O) TB_MEMORY, // continuation (usually just return addresses :p) TB_CONT, + // Tuples, these cannot be used in memory ops, just accessed via projections + TB_TUPLE, } TB_DataTypeEnum; typedef enum TB_FloatFormat { @@ -171,15 +175,13 @@ typedef enum TB_FloatFormat { typedef union TB_DataType { struct { - uint8_t type; - // Only integers and floats can be wide. - uint8_t width; + uint16_t type : 4; // for integers it's the bitwidth - uint16_t data; + uint16_t data : 12; }; - uint32_t raw; + uint16_t raw; } TB_DataType; -static_assert(sizeof(TB_DataType) == 4, "im expecting this to be a uint32_t"); +static_assert(sizeof(TB_DataType) == 2, "im expecting this to be a uint16_t"); // classify data types #define TB_IS_VOID_TYPE(x) ((x).type == TB_INT && (x).data == 0) @@ -259,7 +261,11 @@ typedef enum TB_NodeTypeEnum { // trap will not be continuable but will stop execution. TB_TRAP, // (Control) -> (Control) // unreachable means it won't trap or be continuable. - TB_UNREACHABLE, // (Control) -> (Control) + TB_UNREACHABLE, // (Control) -> () + // this is generated when a path becomes disconnected + // from the main IR, it'll be reduced by the monotonic + // rewrites. + TB_DEAD, // () -> (Control) //////////////////////////////// // CONTROL + MEMORY @@ -278,31 +284,31 @@ typedef enum TB_NodeTypeEnum { //////////////////////////////// // MERGEMEM will join multiple non-aliasing memory effects, because // they don't alias there's no ordering guarentee. - TB_MERGEMEM,// (Memory...) -> Memory + TB_MERGEMEM, // (Memory...) -> Memory // LOAD and STORE are standard memory accesses, they can be folded away. - TB_LOAD, // (Memory, Ptr) -> Data - TB_STORE, // (Memory, Ptr, Data) -> Memory + TB_LOAD, // (Control?, Memory, Ptr) -> Data + TB_STORE, // (Control, Memory, Ptr, Data) -> Memory // bulk memory ops. 
- TB_MEMCPY, // (Memory, Ptr, Ptr, Size) -> Memory - TB_MEMSET, // (Memory, Ptr, Int8, Size) -> Memory + TB_MEMCPY, // (Control, Memory, Ptr, Ptr, Size) -> Memory + TB_MEMSET, // (Control, Memory, Ptr, Int8, Size) -> Memory // these memory accesses represent "volatile" which means // they may produce side effects and thus cannot be eliminated. - TB_READ, // (Memory, Ptr) -> (Memory, Data) - TB_WRITE, // (Memory, Ptr, Data) -> (Memory, Data) + TB_READ, // (Control, Memory, Ptr) -> (Memory, Data) + TB_WRITE, // (Control, Memory, Ptr, Data) -> (Memory, Data) // atomics have multiple observers (if not they wouldn't need to // be atomic) and thus produce side effects everywhere just like // volatiles except they have synchronization guarentees. the atomic // data ops will return the value before the operation is performed. // Atomic CAS return the old value and a boolean for success (true if // the value was changed) - TB_ATOMIC_LOAD, // (Memory, Ptr) -> (Memory, Data) - TB_ATOMIC_XCHG, // (Memory, Ptr, Data) -> (Memory, Data) - TB_ATOMIC_ADD, // (Memory, Ptr, Data) -> (Memory, Data) - TB_ATOMIC_SUB, // (Memory, Ptr, Data) -> (Memory, Data) - TB_ATOMIC_AND, // (Memory, Ptr, Data) -> (Memory, Data) - TB_ATOMIC_XOR, // (Memory, Ptr, Data) -> (Memory, Data) - TB_ATOMIC_OR, // (Memory, Ptr, Data) -> (Memory, Data) - TB_ATOMIC_CAS, // (Memory, Data, Data) -> (Memory, Data, Bool) + TB_ATOMIC_LOAD, // (Control, Memory, Ptr) -> (Memory, Data) + TB_ATOMIC_XCHG, // (Control, Memory, Ptr, Data) -> (Memory, Data) + TB_ATOMIC_ADD, // (Control, Memory, Ptr, Data) -> (Memory, Data) + TB_ATOMIC_SUB, // (Control, Memory, Ptr, Data) -> (Memory, Data) + TB_ATOMIC_AND, // (Control, Memory, Ptr, Data) -> (Memory, Data) + TB_ATOMIC_XOR, // (Control, Memory, Ptr, Data) -> (Memory, Data) + TB_ATOMIC_OR, // (Control, Memory, Ptr, Data) -> (Memory, Data) + TB_ATOMIC_CAS, // (Control, Memory, Data, Data) -> (Memory, Data, Bool) //////////////////////////////// // POINTERS @@ -496,11 +502,11 @@ struct User { struct TB_Node { TB_NodeType type; - uint16_t input_count; // number of node inputs. + uint16_t input_count; TB_DataType dt; // makes it easier to track in graph walks - size_t gvn; + uint32_t gvn; // only value while inside of a TB_Passes, // these are unordered and usually just @@ -522,8 +528,6 @@ struct TB_Node { // this represents switch (many targets), if (one target) and goto (only default) logic. typedef struct { // TB_BRANCH size_t succ_count; - TB_Node** succ; - int64_t keys[]; } TB_NodeBranch; @@ -603,15 +607,8 @@ typedef struct { } TB_NodeSafepoint; typedef struct { - TB_Node* end; const char* tag; - // position in a postorder walk - int postorder_id; - // immediate dominator (can be approximate) - int dom_depth; - TB_Node* dom; - // used for IR building only, stale after that. 
TB_Node *mem_in, *mem_out; } TB_NodeRegion; @@ -666,37 +663,37 @@ typedef enum { #define TB_TYPE_TUPLE TB_DataType{ { TB_TUPLE } } #define TB_TYPE_CONTROL TB_DataType{ { TB_CONTROL } } -#define TB_TYPE_VOID TB_DataType{ { TB_INT, 0, 0 } } -#define TB_TYPE_I8 TB_DataType{ { TB_INT, 0, 8 } } -#define TB_TYPE_I16 TB_DataType{ { TB_INT, 0, 16 } } -#define TB_TYPE_I32 TB_DataType{ { TB_INT, 0, 32 } } -#define TB_TYPE_I64 TB_DataType{ { TB_INT, 0, 64 } } -#define TB_TYPE_F32 TB_DataType{ { TB_FLOAT, 0, TB_FLT_32 } } -#define TB_TYPE_F64 TB_DataType{ { TB_FLOAT, 0, TB_FLT_64 } } -#define TB_TYPE_BOOL TB_DataType{ { TB_INT, 0, 1 } } -#define TB_TYPE_PTR TB_DataType{ { TB_PTR, 0, 0 } } -#define TB_TYPE_MEMORY TB_DataType{ { TB_MEMORY,0, 0 } } -#define TB_TYPE_CONT TB_DataType{ { TB_CONT, 0, 0 } } -#define TB_TYPE_INTN(N) TB_DataType{ { TB_INT, 0, (N) } } -#define TB_TYPE_PTRN(N) TB_DataType{ { TB_PTR, 0, (N) } } +#define TB_TYPE_VOID TB_DataType{ { TB_INT, 0 } } +#define TB_TYPE_I8 TB_DataType{ { TB_INT, 8 } } +#define TB_TYPE_I16 TB_DataType{ { TB_INT, 16 } } +#define TB_TYPE_I32 TB_DataType{ { TB_INT, 32 } } +#define TB_TYPE_I64 TB_DataType{ { TB_INT, 64 } } +#define TB_TYPE_F32 TB_DataType{ { TB_FLOAT, TB_FLT_32 } } +#define TB_TYPE_F64 TB_DataType{ { TB_FLOAT, TB_FLT_64 } } +#define TB_TYPE_BOOL TB_DataType{ { TB_INT, 1 } } +#define TB_TYPE_PTR TB_DataType{ { TB_PTR, 0 } } +#define TB_TYPE_MEMORY TB_DataType{ { TB_MEMORY,0 } } +#define TB_TYPE_CONT TB_DataType{ { TB_CONT, 0 } } +#define TB_TYPE_INTN(N) TB_DataType{ { TB_INT, (N) } } +#define TB_TYPE_PTRN(N) TB_DataType{ { TB_PTR, (N) } } #else #define TB_TYPE_TUPLE (TB_DataType){ { TB_TUPLE } } #define TB_TYPE_CONTROL (TB_DataType){ { TB_CONTROL } } -#define TB_TYPE_VOID (TB_DataType){ { TB_INT, 0, 0 } } -#define TB_TYPE_I8 (TB_DataType){ { TB_INT, 0, 8 } } -#define TB_TYPE_I16 (TB_DataType){ { TB_INT, 0, 16 } } -#define TB_TYPE_I32 (TB_DataType){ { TB_INT, 0, 32 } } -#define TB_TYPE_I64 (TB_DataType){ { TB_INT, 0, 64 } } -#define TB_TYPE_F32 (TB_DataType){ { TB_FLOAT, 0, TB_FLT_32 } } -#define TB_TYPE_F64 (TB_DataType){ { TB_FLOAT, 0, TB_FLT_64 } } -#define TB_TYPE_BOOL (TB_DataType){ { TB_INT, 0, 1 } } -#define TB_TYPE_PTR (TB_DataType){ { TB_PTR, 0, 0 } } -#define TB_TYPE_CONT (TB_DataType){ { TB_CONT, 0, 0 } } -#define TB_TYPE_MEMORY (TB_DataType){ { TB_MEMORY,0, 0 } } -#define TB_TYPE_INTN(N) (TB_DataType){ { TB_INT, 0, (N) } } -#define TB_TYPE_PTRN(N) (TB_DataType){ { TB_PTR, 0, (N) } } +#define TB_TYPE_VOID (TB_DataType){ { TB_INT, 0 } } +#define TB_TYPE_I8 (TB_DataType){ { TB_INT, 8 } } +#define TB_TYPE_I16 (TB_DataType){ { TB_INT, 16 } } +#define TB_TYPE_I32 (TB_DataType){ { TB_INT, 32 } } +#define TB_TYPE_I64 (TB_DataType){ { TB_INT, 64 } } +#define TB_TYPE_F32 (TB_DataType){ { TB_FLOAT, TB_FLT_32 } } +#define TB_TYPE_F64 (TB_DataType){ { TB_FLOAT, TB_FLT_64 } } +#define TB_TYPE_BOOL (TB_DataType){ { TB_INT, 1 } } +#define TB_TYPE_PTR (TB_DataType){ { TB_PTR, 0 } } +#define TB_TYPE_CONT (TB_DataType){ { TB_CONT, 0 } } +#define TB_TYPE_MEMORY (TB_DataType){ { TB_MEMORY,0 } } +#define TB_TYPE_INTN(N) (TB_DataType){ { TB_INT, (N) } } +#define TB_TYPE_PTRN(N) (TB_DataType){ { TB_PTR, (N) } } #endif @@ -1020,8 +1017,6 @@ TB_API const char* tb_symbol_get_name(TB_Symbol* s); TB_API void tb_function_set_prototype(TB_Function* f, TB_ModuleSectionHandle section, TB_FunctionPrototype* p, TB_Arena* arena); TB_API TB_FunctionPrototype* tb_function_get_prototype(TB_Function* f); -TB_API void tb_function_print(TB_Function* f, TB_PrintCallback 
callback, void* user_data); - TB_API void tb_inst_set_control(TB_Function* f, TB_Node* control); TB_API TB_Node* tb_inst_get_control(TB_Function* f); @@ -1224,11 +1219,11 @@ TB_API bool tb_pass_mem2reg(TB_Passes* opt); // this just runs the optimizer in the default configuration TB_API void tb_pass_optimize(TB_Passes* opt); -TB_API void tb_pass_schedule(TB_Passes* opt); - // analysis // print: prints IR in a flattened text form. TB_API bool tb_pass_print(TB_Passes* opt); +// print-dot: prints IR as DOT +TB_API void tb_pass_print_dot(TB_Passes* opt, TB_PrintCallback callback, void* user_data); // codegen TB_API TB_FunctionOutput* tb_pass_codegen(TB_Passes* opt, bool emit_asm); @@ -1240,8 +1235,6 @@ TB_API void tb_pass_mark_users(TB_Passes* opt, TB_Node* n); //////////////////////////////// // IR access //////////////////////////////// -TB_API bool tb_is_dominated_by(TB_Node* expected_dom, TB_Node* bb); - TB_API const char* tb_node_get_name(TB_Node* n); TB_API TB_Node* tb_get_parent_region(TB_Node* n); diff --git a/tb/man/GUIDE.md b/tb/man/GUIDE.md new file mode 100644 index 00000000..fe66fa73 --- /dev/null +++ b/tb/man/GUIDE.md @@ -0,0 +1,60 @@ +# Module creation + +Modules are the largest logical unit of code in TB, they contain functions and globals which can be exported. Get started by writing: + +```c + // this will use the host machine for the target architecture and system, this is + // helpful when doing JIT or non-cross AOT compilation + TB_Module* module = tb_module_create_for_host(arch, TB_SYSTEM_WINDOWS, TB_DEBUGFMT_NONE, NULL); +``` + +```c + // See TB_Arch, TB_System + TB_Arch arch = TB_ARCH_X86_64; + TB_System sys = TB_SYSTEM_WINDOWS; + + // See TB_DebugFormat. When exporting the binary this decides + // how the line info, type table and other debug information is + // encoded. + TB_DebugFormat debug_fmt = TB_DEBUGFMT_CODEVIEW; + + // See TB_FeatureSet, this allows us to tell the code generator + // what extensions are active in the platform, an example is enabling + // AVX or BF16 + TB_FeatureSet features = { 0 }; + + TB_Module* module = tb_module_create_for_host(arch, sys, debug_fmt, &features); +``` + +# Exporter API + +The exporting API allows for packaging compiled code into objects, shared/static or executable form. Once you've compiled all your functions in TB you may export to a file using: + +```c + // see TB_OutputFlavor for the full list + TB_ModuleExporter* e = tb_make_exporter(module, TB_FLAVOR_OBJECT); + if (!tb_exporter_to_file(e, module, "hello.obj")) { + /* failed to export */ + } + + /* file has been exported! */ +``` + +If instead you need to output a buffer in memory: + +```c + // see TB_OutputFlavor for the full list + TB_ModuleExporter* e = tb_make_exporter(module, TB_FLAVOR_OBJECT); + + ptrdiff_t length; + uint8_t* buffer = tb_exporter_to_buffer(e, module, &length); + if (length < 0) { + /* failed to export */ + } + + ... + + tb_exporter_free_buffer(buffer); +``` + +# Builder API \ No newline at end of file diff --git a/tb/man/IR.txt b/tb/man/IR.txt new file mode 100644 index 00000000..c95e7c88 --- /dev/null +++ b/tb/man/IR.txt @@ -0,0 +1,58 @@ +# Sea of Nodes (SoN) + + https://www.oracle.com/technetwork/java/javase/tech/c2-ir95-150110.pdf) + + SoN is an SSA where ordering is relaxed in the form of explicit dependencies + as opposed to local ordering inside basic blocks, for instance pure operations + like addition will not have an exact placement only the constraint that it must + be resolved after it's inputs. 
This makes it easier to perform local optimizations + without a care for scheduling, this is especially helpful because of how many + optimizations we've moved to peepholes. + + note: edges going down from A to B means B is dependent on A. + + Reassociation + + x+2+4 + + x 2 2 4 + \ / \ / + + 4 => x + + \ / \ / + + + + + GVN + + A*B + A*B + + A B A B + |\ /| \ / + | X | * + |/ \| => / \ + * * \ / + \ / + + + + Load elimination + + *x = 16 + return *x + + x_ + | \ + | \ + | \ x + | | | + memory | 16 | => memory | 16 + \ | | | \ | / | + Store | Store | + | | / + | / / + | / / + | / / + Load | + | | + V V + + note: we're not showing the control edge memory operations have for simplicit but + both of these are sharing a control edge. Stores produce more memory but don't produce + more control flow and Loads use memory but don't produce more (these are both non-volatile) diff --git a/tb/man/TYPES.md b/tb/man/TYPES.md new file mode 100644 index 00000000..defc2f05 --- /dev/null +++ b/tb/man/TYPES.md @@ -0,0 +1,32 @@ +# Types + +The TBIR data types are used to represent the structure and certain proofs +about the data itself. + +## Void (TB_VOID) + +void type is a unit type and thus cannot hold data. + +## Boolean (TB_BOOL) + +booleans represent either true or false and the conversion is defined as: + +`((x != 0) ? true : false) where x is a data-holding type` + +this is important to note because in float types NaN comparisons always return +false which means that NaN is considered false in a (NaN -> bool) conversion. + +## Integers (TB_I8, TB_I16, TB_I32, TB_I64) + +integer types come in a few basic sizes (i8, i16, i32, i64) and represent numerical +data and raw data. Integer operations can come in two forms: signed and unsigned. + +## Floats (TB_F32, TB_F64) + +floating point types are IEEE-754-2008 compliant with f32, and f64 mapping to binary32, +and binary64 respectively. 
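
As a small illustration, this is how the types discussed so far are spelled in C via the
`TB_TYPE_*` macros from `tb.h`; the comments reflect the packed 16-bit `TB_DataType`
layout introduced in this patch (a 4-bit tag plus 12 bits of payload):

```c
#include "tb.h"

void example(void) {
    TB_DataType t_bool = TB_TYPE_BOOL;  // TB_INT,   data == 1 (bit width)
    TB_DataType t_i32  = TB_TYPE_I32;   // TB_INT,   data == 32
    TB_DataType t_f64  = TB_TYPE_F64;   // TB_FLOAT, data == TB_FLT_64
    (void)t_bool; (void)t_i32; (void)t_f64;

    // the whole descriptor packs into 16 bits: 4-bit tag + 12-bit payload
    _Static_assert(sizeof(TB_DataType) == 2, "see the static_assert in tb.h");
}
```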
+ +## Pointers (TB_PTR) + +Pointers refer to memory objects in the global address space (see MEMORY.md) +TODO: currently TB only supports on address space but this might be subject to change diff --git a/tb/src/abi.c b/tb/src/abi.c index 5e0ba963..c48b5deb 100644 --- a/tb/src/abi.c +++ b/tb/src/abi.c @@ -1,6 +1,5 @@ // This is gonna get complicated but we can push through :p - //////////////////////////////// // x86-64 //////////////////////////////// @@ -93,7 +92,7 @@ static TB_DataType debug_type_to_tb(TB_DebugType* t) { case TB_DEBUG_TYPE_ARRAY: return TB_TYPE_PTR; case TB_DEBUG_TYPE_POINTER: return TB_TYPE_PTR; - case TB_DEBUG_TYPE_FLOAT: return (TB_DataType){ { TB_FLOAT, 0, t->float_fmt } }; + case TB_DEBUG_TYPE_FLOAT: return (TB_DataType){ { TB_FLOAT, t->float_fmt } }; default: tb_assert(0, "todo"); return TB_TYPE_VOID; } @@ -109,7 +108,7 @@ static TB_DataType reg_class_to_tb(TB_ABI abi, RegClass rg, TB_DebugType* type) case RG_SSE: { assert(type->tag == TB_DEBUG_TYPE_FLOAT); - return (TB_DataType){ { TB_FLOAT, 0, type->float_fmt } }; + return (TB_DataType){ { TB_FLOAT, type->float_fmt } }; } default: tb_assert(0, "todo"); return TB_TYPE_VOID; diff --git a/tb/src/codegen/emitter.h b/tb/src/codegen/emitter.h index dde46185..2cc9b756 100644 --- a/tb/src/codegen/emitter.h +++ b/tb/src/codegen/emitter.h @@ -28,7 +28,6 @@ typedef struct { uint8_t* data; NL_Map(TB_Node*, uint32_t) labels; - uint32_t return_label; } TB_CGEmitter; // Helper macros @@ -37,13 +36,19 @@ typedef struct { #define EMIT2(e, b) do { uint16_t _b = (b); memcpy(tb_cgemit_reserve(e, 2), &_b, 2); (e)->count += 2; } while (0) #define EMIT4(e, b) do { uint32_t _b = (b); memcpy(tb_cgemit_reserve(e, 4), &_b, 4); (e)->count += 4; } while (0) #define EMIT8(e, b) do { uint64_t _b = (b); memcpy(tb_cgemit_reserve(e, 8), &_b, 8); (e)->count += 8; } while (0) -#define RELOC4(e, p, b) do { void *_ptr = &(e)->data[p]; \ - uint32_t _b = (b), _temp; \ - memcpy(&_temp, _ptr, 4); \ - _temp += _b; \ - memcpy(_ptr, &_temp, 4); } while (0) #define PATCH4(e, p, b) do { uint32_t _b = (b); memcpy(&(e)->data[p], &_b, 4); } while (0) #define GET_CODE_POS(e) ((e)->count) +#define RELOC4(e, p, b) tb_reloc4(e, p, b) + +static void tb_reloc4(TB_CGEmitter* restrict e, uint32_t p, uint32_t b) { + void* ptr = &e->data[p]; + + // i love UBsan... + uint32_t tmp; + memcpy(&tmp, ptr, 4); + tmp += b; + memcpy(ptr, &tmp, 4); +} static void tb_asm_print(TB_CGEmitter* restrict e, const char* fmt, ...) { // let's hope the optimizer can hoist this early-out outside of the call diff --git a/tb/src/codegen/generic_cg.h b/tb/src/codegen/generic_cg.h index 18dff0e9..2225283f 100644 --- a/tb/src/codegen/generic_cg.h +++ b/tb/src/codegen/generic_cg.h @@ -4,6 +4,7 @@ #include static thread_local bool reg_alloc_log; +static TB_CFG* muh_______cfg; enum { CG_VAL_UNRESOLVED = 0, @@ -40,6 +41,9 @@ _Static_assert(sizeof(TB_PhysicalReg) == sizeof(RegIndex), "these should be the typedef struct MachineBB { Inst* first; + // what's the terminator, it helps us walk successors + TB_Node* end_node; + int start, end; int terminator; @@ -73,8 +77,11 @@ typedef struct { TB_Passes* p; + int bb_count; + int* bb_order; + // Scheduling - size_t block_count; + TB_CFG cfg; Worklist worklist; // reusing from TB_Passes. ValueDesc* values; // the indices match the GVN. 
@@ -117,7 +124,7 @@ static int classify_reg_class(TB_DataType dt); static void isel(Ctx* restrict ctx, TB_Node* n, int dst); static bool should_rematerialize(TB_Node* n); -static void emit_code(Ctx* restrict ctx, TB_FunctionOutput* restrict func_out); +static void emit_code(Ctx* restrict ctx, TB_FunctionOutput* restrict func_out, int end); static void mark_callee_saved_constraints(Ctx* restrict ctx, uint64_t callee_saved[CG_REGISTER_CLASSES]); static void add_debug_local(Ctx* restrict ctx, TB_Node* n, int pos) { @@ -204,6 +211,8 @@ struct Inst { }; // generic instructions +static Inst* inst_jmp(TB_Node* target); + static Inst* inst_label(TB_Node* n) { Inst* i = TB_ARENA_ALLOC(tmp_arena, Inst); *i = (Inst){ .type = INST_LABEL, .flags = INST_NODE, .n = n }; @@ -393,8 +402,11 @@ static int alloc_vreg(Ctx* restrict ctx, TB_Node* n, TB_DataType dt) { dyn_array_put(ctx->intervals, (LiveInterval){ .reg_class = classify_reg_class(dt), .n = n, .reg = -1, .hint = -1, .assigned = -1, - .dt = legalize(dt), .start = INT_MAX, .split_kid = -1 + .dt = legalize(dt), .split_kid = -1 }); + + LiveRange r = { INT_MAX, INT_MAX }; + dyn_array_put(ctx->intervals[i].ranges, r); return i; } @@ -413,17 +425,23 @@ static int liveness(Ctx* restrict ctx, TB_Function* f) { // find BB boundaries in sequences MachineBBs seq_bb = NULL; - nl_map_create(seq_bb, ctx->block_count); + nl_map_create(seq_bb, ctx->bb_count); - FOREACH_N(i, 0, ctx->block_count) { - MachineBB bb = { + TB_Node** bbs = ctx->worklist.items; + int* bb_order = ctx->bb_order; + FOREACH_N(i, 0, ctx->bb_count) { + TB_Node* n = bbs[bb_order[i]]; + TB_BasicBlock* bb = &nl_map_get_checked(ctx->cfg.node_to_block, n); + + MachineBB mbb = { + .end_node = bb->end, .gen = set_create_in_arena(arena, interval_count), .kill = set_create_in_arena(arena, interval_count), .live_in = set_create_in_arena(arena, interval_count), .live_out = set_create_in_arena(arena, interval_count) }; - nl_map_put(seq_bb, ctx->worklist.items[i], bb); + nl_map_put(seq_bb, n, mbb); } // generate local live sets @@ -435,13 +453,13 @@ static int liveness(Ctx* restrict ctx, TB_Function* f) { assert(inst->type == INST_LABEL); // initial label - MachineBB* mbb = &nl_map_get_checked(seq_bb, f->start_node); + TB_Node* bb = ctx->worklist.items[0]; + MachineBB* mbb = &nl_map_get_checked(seq_bb, bb); mbb->first = inst; mbb->start = 2; inst->time = 2; inst = inst->next; - TB_Node* bb = f->start_node; for (; inst; inst = inst->next) { if (inst->type == INST_LABEL) { nl_map_get_checked(seq_bb, bb).end = timeline; @@ -483,36 +501,32 @@ static int liveness(Ctx* restrict ctx, TB_Function* f) { // generate global live sets size_t base = dyn_array_length(ctx->worklist.items); - assert(base == ctx->block_count); // all nodes go into the worklist - FOREACH_N(i, 0, ctx->block_count) { - TB_Node* bb = ctx->worklist.items[i]; - assert(bb->type == TB_START || bb->type == TB_REGION); - - dyn_array_put(ctx->worklist.items, bb); + FOREACH_REVERSE_N(i, 0, ctx->bb_count) { + TB_Node* n = bbs[bb_order[i]]; + dyn_array_put(ctx->worklist.items, n); // in(bb) = use(bb) - MachineBB* mbb = &nl_map_get_checked(seq_bb, bb); + MachineBB* mbb = &nl_map_get_checked(seq_bb, n); set_copy(&mbb->live_in, &mbb->gen); } while (dyn_array_length(ctx->worklist.items) > base) // CUIK_TIMED_BLOCK("global iter") { TB_Node* bb = dyn_array_pop(ctx->worklist.items); - TB_NodeRegion* r = TB_NODE_GET_EXTRA(bb); MachineBB* mbb = &nl_map_get_checked(seq_bb, bb); - // walk all successors Set* restrict live_out = &mbb->live_out; set_clear(live_out); - 
if (r->end->type == TB_BRANCH) { - TB_NodeBranch* br = TB_NODE_GET_EXTRA(r->end); - FOREACH_N(i, 0, br->succ_count) { + // walk all successors + TB_Node* end = mbb->end_node; + for (User* u = end->users; u; u = u->next) { + if (cfg_is_control(u->n)) { // union with successor's lives - MachineBB* succ = &nl_map_get_checked(seq_bb, br->succ[i]); - set_union(live_out, &succ->live_in); + TB_Node* succ = cfg_get_fallthru(u->n); + set_union(live_out, &nl_map_get_checked(seq_bb, succ).live_in); } } @@ -533,26 +547,23 @@ static int liveness(Ctx* restrict ctx, TB_Function* f) { // if we have changes, mark the predeccesors if (changes) { FOREACH_N(i, 0, bb->input_count) { - dyn_array_put(ctx->worklist.items, tb_get_parent_region(bb->inputs[i])); + dyn_array_put(ctx->worklist.items, get_block_begin(bb->inputs[i])); } } } - dyn_array_set_length(ctx->worklist.items, ctx->block_count); - - /*FOREACH_REVERSE_N(i, 0, ctx->block_count) { - MachineBB* mbb = &nl_map_get_checked(seq_bb, ctx->worklist.items[i]); - int j = 120; - - printf("v%zu:", i); - if (set_get(&mbb->gen, j)) printf("GEN "); - if (set_get(&mbb->kill, j)) printf("KILL "); - if (set_get(&mbb->live_in, j)) printf("IN "); - if (set_get(&mbb->live_out, j)) printf("OUT "); - printf("\n"); + + /*if (!strcmp(f->super.name, "WinMain")) { + FOREACH_N(i, 0, ctx->bb_count) { + TB_Node* n = bbs[bb_order[i]]; + MachineBB* mbb = &nl_map_get_checked(seq_bb, n); + + bool in = set_get(&mbb->live_in, 83); + bool out = set_get(&mbb->live_out, 83); + printf(".bb%d: %s %s\n", bb_order[i], in?"in":"", out?"out":""); + } }*/ ctx->machine_bbs = seq_bb; - assert(epilogue >= 0); return epilogue; } @@ -630,24 +641,36 @@ static void isel_set_location(Ctx* restrict ctx, TB_Node* n) { } } -static void isel_region(Ctx* restrict ctx, TB_Node* bb, TB_Node* end) { - assert(dyn_array_length(ctx->worklist.items) == ctx->block_count); +static void isel_region(Ctx* restrict ctx, TB_Node* bb_start, TB_Node* end, size_t rpo_index) { + assert(dyn_array_length(ctx->worklist.items) == ctx->cfg.block_count); + TB_Scheduled scheduled = ctx->p->scheduled; + TB_BasicBlock* bb = nl_map_get_checked(scheduled, bb_start); // phase 1: logical schedule DynArray(PhiVal) phi_vals = ctx->phi_vals; CUIK_TIMED_BLOCK("phase 1") { - sched_walk(ctx->p, &ctx->worklist, &phi_vals, bb, end); + sched_walk(ctx->p, &ctx->worklist, &phi_vals, bb, end, true); + + // schedule params + if (rpo_index == 0) { + for (User* use = ctx->f->start_node->users; use; use = use->next) { + TB_Node* use_n = use->n; + if (use_n->type == TB_PROJ && !worklist_test_n_set(&ctx->worklist, use_n)) { + dyn_array_put(ctx->worklist.items, use_n); + } + } + } } // phase 2: define all the nodes in this BB CUIK_TIMED_BLOCK("phase 2") { - FOREACH_REVERSE_N(i, ctx->block_count, dyn_array_length(ctx->worklist.items)) { + FOREACH_N(i, ctx->cfg.block_count, dyn_array_length(ctx->worklist.items)) { TB_Node* n = ctx->worklist.items[i]; - // track use count + // track non-dead users size_t use_count = 0; for (User* use = find_users(ctx->p, n); use; use = use->next) { - if (use->n->inputs[0] != NULL) use_count++; + if (nl_map_get(scheduled, use->n) >= 0) use_count++; } // we don't have to worry about resizing here which is really nice @@ -660,7 +683,6 @@ static void isel_region(Ctx* restrict ctx, TB_Node* bb, TB_Node* end) { // not the new one we're producing. 
size_t our_phis = dyn_array_length(phi_vals); CUIK_TIMED_BLOCK("phase 3") { - TB_Node* top = ctx->worklist.items[ctx->block_count]; FOREACH_N(i, 0, our_phis) { PhiVal* v = &phi_vals[i]; @@ -668,25 +690,28 @@ static void isel_region(Ctx* restrict ctx, TB_Node* bb, TB_Node* end) { v->dst = input_reg(ctx, v->phi); } - for (User* use = find_users(ctx->p, top); use; use = use->next) { - if (use->n->type == TB_PHI && use->n->dt.type != TB_MEMORY) { - ValueDesc* val = &ctx->values[use->n->gvn]; + if (bb_start->type == TB_REGION) { + for (User* use = find_users(ctx->p, bb_start); use; use = use->next) { + if (use->n->type == TB_PHI && use->n->dt.type != TB_MEMORY) { + ValueDesc* val = &ctx->values[use->n->gvn]; - // copy PHI into temporary - PhiVal p = { .phi = use->n, .dst = input_reg(ctx, use->n) }; - dyn_array_put(phi_vals, p); + // copy PHI into temporary + PhiVal p = { .phi = use->n, .dst = input_reg(ctx, use->n) }; + dyn_array_put(phi_vals, p); - TB_DataType dt = p.phi->dt; - int tmp = DEF(NULL, dt); - SUBMIT(inst_move(dt, tmp, p.dst)); + TB_DataType dt = p.phi->dt; + int tmp = DEF(NULL, dt); + SUBMIT(inst_move(dt, tmp, p.dst)); - // assign temporary as the PHI until the end of the BB - val->vreg = tmp; + // assign temporary as the PHI until the end of the BB + val->vreg = tmp; + } } } - assert(top->type == TB_START || top->type == TB_REGION); - isel(ctx, top, -1); + if (rpo_index == 0) { + isel(ctx, ctx->f->start_node, -1); + } } // phase 4: walk all nodes (we're allowed to fold nodes into those which appear later) @@ -694,19 +719,24 @@ static void isel_region(Ctx* restrict ctx, TB_Node* bb, TB_Node* end) { // isel is emitting start->end but we're iterating in reverse order so we need // to reverse the instruction stream as we go, it's a linked list so it's not // hard. 
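
Aside: the comment above explains that phase 4 visits nodes in reverse while isel emits each node's instructions start->end, so every per-node run is spliced onto the front of the stream to recover forward order. A tiny sketch of that prepend splice on a singly linked list, using a hypothetical ToyInst in place of the real Inst:

#include <stdio.h>
#include <stdlib.h>

// Hypothetical stand-in for an emitted instruction; the real Inst also carries
// operands, a data type, timing info, etc.
typedef struct ToyInst {
    int id;
    struct ToyInst* next;
} ToyInst;

static ToyInst* toy_inst(int id) {
    ToyInst* i = malloc(sizeof(ToyInst));
    i->id = id, i->next = NULL;
    return i;
}

int main(void) {
    // Suppose selection visits nodes in reverse (C, B, A) but each node emits
    // its instructions in forward order. Prepending each node's run onto the
    // head of the stream yields A, B, C order without a second pass.
    ToyInst* head = NULL;
    int runs[3][2] = { { 30, 31 }, { 20, 21 }, { 10, 11 } }; // node C, B, A

    for (int r = 0; r < 3; r++) {
        // emit this node's run in forward order into a local two-entry list
        ToyInst* first = toy_inst(runs[r][0]);
        first->next = toy_inst(runs[r][1]);

        // splice: the run goes in front of everything emitted so far
        first->next->next = head;
        head = first;
    }

    for (ToyInst* i = head; i; i = i->next) printf("%d ", i->id); // 10 11 20 21 30 31
    printf("\n");
    return 0;
}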
- DO_IF(TB_OPTDEBUG_CODEGEN)(printf("BB %p\n", bb)); + int bbid = nl_map_get_checked(ctx->cfg.node_to_block, bb_start).id; + TB_OPTDEBUG(CODEGEN)(printf("BB %d\n", bbid)); CUIK_TIMED_BLOCK("phase 4") { Inst *head = ctx->head, *last = NULL; TB_Node* prev_effect = NULL; - FOREACH_REVERSE_N(i, ctx->block_count + 1, dyn_array_length(ctx->worklist.items)) { + FOREACH_REVERSE_N(i, ctx->cfg.block_count, dyn_array_length(ctx->worklist.items)) { TB_Node* n = ctx->worklist.items[i]; + if (n->type == TB_START) { + continue; + } + ValueDesc* val = lookup_val(ctx, n); // if the value hasn't been asked for yet and - if (val->vreg < 0 && should_rematerialize(n)) { + if (n != end && val->vreg < 0 && should_rematerialize(n)) { DO_IF(TB_OPTDEBUG_CODEGEN)( - printf(" DISCARD %zu: ", n->gvn), + printf(" DISCARD %u: ", n->gvn), print_node_sexpr(n, 0), printf("\n") ); @@ -719,28 +749,16 @@ static void isel_region(Ctx* restrict ctx, TB_Node* bb, TB_Node* end) { ctx->head = &dummy; if (n->dt.type == TB_TUPLE || n->dt.type == TB_CONTROL || n->dt.type == TB_MEMORY) { - DO_IF(TB_OPTDEBUG_CODEGEN)( - printf(" EFFECT %zu: ", n->gvn), + TB_OPTDEBUG(CODEGEN)( + printf(" EFFECT %u: ", n->gvn), print_node_sexpr(n, 0), printf("\n") ); - if (n->type == TB_BRANCH) { - // writeback PHIs - FOREACH_N(i, 0, our_phis) { - PhiVal* v = &phi_vals[i]; - TB_DataType dt = v->phi->dt; - - int src = input_reg(ctx, v->n); - - hint_reg(ctx, v->dst, src); - SUBMIT(inst_move(dt, v->dst, src)); - } - } isel(ctx, n, val->vreg); - if (n->inputs[0]->type == TB_START || n->type != TB_PROJ) { + if ((n->input_count > 0 && n->inputs[0]->type == TB_START) || n->type != TB_PROJ) { if (prev_effect != NULL) { isel_set_location(ctx, prev_effect); } @@ -759,16 +777,16 @@ static void isel_region(Ctx* restrict ctx, TB_Node* bb, TB_Node* end) { val->vreg = DEF(n, n->dt); } - DO_IF(TB_OPTDEBUG_CODEGEN)( - printf(" DATA %zu: ", n->gvn), + TB_OPTDEBUG(CODEGEN)( + printf(" DATA %u: ", n->gvn), print_node_sexpr(n, 0), printf("\n") ); isel(ctx, n, val->vreg); } else { - DO_IF(TB_OPTDEBUG_CODEGEN)( - printf(" DEAD %zu: ", n->gvn), + TB_OPTDEBUG(CODEGEN)( + printf(" DEAD %u: ", n->gvn), print_node_sexpr(n, 0), printf("\n") ); @@ -824,9 +842,35 @@ static void isel_region(Ctx* restrict ctx, TB_Node* bb, TB_Node* end) { dyn_array_clear(phi_vals); ctx->phi_vals = phi_vals; ctx->head = last ? 
last : head; + + if (end->type != TB_END && end->type != TB_TRAP && + end->type != TB_BRANCH && end->type != TB_UNREACHABLE) { + TB_OPTDEBUG(CODEGEN)( + printf(" TERMINATOR %u: ", end->gvn), + print_node_sexpr(end, 0), + printf("\n") + ); + + // writeback PHIs + FOREACH_N(i, 0, our_phis) { + PhiVal* v = &phi_vals[i]; + TB_DataType dt = v->phi->dt; + + int src = input_reg(ctx, v->n); + + hint_reg(ctx, v->dst, src); + SUBMIT(inst_move(dt, v->dst, src)); + } + + // implicit goto + TB_Node* succ = cfg_next_control(end); + if (ctx->fallthrough != succ) { + SUBMIT(inst_jmp(succ)); + } + } } - dyn_array_set_length(ctx->worklist.items, ctx->block_count); + dyn_array_set_length(ctx->worklist.items, ctx->cfg.block_count); } // Codegen through here is done in phases @@ -836,15 +880,12 @@ static void compile_function(TB_Passes* restrict p, TB_FunctionOutput* restrict TB_Function* restrict f = p->f; DO_IF(TB_OPTDEBUG_PEEP)(log_debug("%s: starting codegen with %d nodes", f->super.name, f->node_count)); - tb_pass_schedule(p); - #if 0 - reg_alloc_log = strcmp(f->super.name, "main_wnd_proc") == 0; - if (reg_alloc_log) { - printf("\n\n\n"); + if (!strcmp(f->super.name, "stbi__parse_png_file")) { + reg_alloc_log = true; tb_pass_print(p); } else { - emit_asm = false; + reg_alloc_log = false; } #endif @@ -870,16 +911,18 @@ static void compile_function(TB_Passes* restrict p, TB_FunctionOutput* restrict ctx.worklist = p->worklist; ctx.values = tb_arena_alloc(tmp_arena, f->node_count * sizeof(ValueDesc)); + // We need to generate a CFG + ctx.cfg = tb_compute_rpo(f, p); + muh_______cfg = &ctx.cfg; + + // And perform global scheduling + tb_pass_schedule(p, ctx.cfg); + // allocate more stuff now that we've run stats on the IR - ctx.emit.return_label = 0; - nl_map_create(ctx.emit.labels, ctx.block_count); + nl_map_create(ctx.emit.labels, ctx.cfg.block_count); nl_map_create(ctx.stack_slots, 8); dyn_array_create(ctx.debug_stack_slots, 8); - // We need to generate a CFG - ctx.block_count = tb_push_postorder(f, &p->worklist); - assert(p->worklist.items[ctx.block_count - 1] == f->start_node && "Codegen must always schedule entry BB first"); - worklist_clear_visited(&p->worklist); // Instruction selection: @@ -887,11 +930,15 @@ static void compile_function(TB_Passes* restrict p, TB_FunctionOutput* restrict // fixed and which need allocation. For now regalloc is handled // immediately but in theory it could be delayed until all selection // is done. + ctx.bb_count = 0; + int* bb_order = ctx.bb_order = tb_arena_alloc(tmp_arena, ctx.cfg.block_count * sizeof(int)); + CUIK_TIMED_BLOCK("isel") { - assert(dyn_array_length(ctx.worklist.items) == ctx.block_count); + assert(dyn_array_length(ctx.worklist.items) == ctx.cfg.block_count); - // define all PHIs early - FOREACH_REVERSE_N(i, 0, ctx.block_count) { + // define all PHIs early and sort BB order + int stop_bb = -1; + FOREACH_N(i, 0, ctx.cfg.block_count) { TB_Node* bb = ctx.worklist.items[i]; for (User* use = find_users(p, bb); use; use = use->next) { @@ -902,54 +949,38 @@ static void compile_function(TB_Passes* restrict p, TB_FunctionOutput* restrict ctx.values[n->gvn].vreg = -1; } } + + TB_Node* end = nl_map_get_checked(ctx.cfg.node_to_block, bb).end; + if (end->type == TB_END) { + stop_bb = i; + } else { + bb_order[ctx.bb_count++] = i; + } } - // compile all nodes which aren't the STOP node - TB_Node* stop_node = f->stop_node; - TB_Node* stop_bb = tb_get_parent_region(stop_node); + // enter END block at the... 
end + if (stop_bb >= 0) { + bb_order[ctx.bb_count++] = stop_bb; + } - bool has_stop = false; - FOREACH_REVERSE_N(i, 0, ctx.block_count) { - TB_Node* bb = ctx.worklist.items[i]; - assert(bb->type == TB_START || bb->type == TB_REGION); + TB_Node** bbs = ctx.worklist.items; + FOREACH_N(i, 0, ctx.bb_count) { + TB_Node* bb = bbs[bb_order[i]]; nl_map_put(ctx.emit.labels, bb, 0); - if (bb != stop_bb) { - // mark fallthrough - ctx.fallthrough = i > 0 ? ctx.worklist.items[i - 1] : NULL; - if (ctx.fallthrough == stop_bb) ctx.fallthrough = NULL; - - Inst* label = inst_label(bb); - if (ctx.first == NULL) { - ctx.first = ctx.head = label; - } else { - append_inst(&ctx, label); - } - - TB_Node* end = TB_NODE_GET_EXTRA_T(bb, TB_NodeRegion)->end; - isel_region(&ctx, bb, end); - } else { - has_stop = true; - } - } - // always schedule the STOP node here - if (has_stop) { - // mark fallthrough - ctx.fallthrough = NULL; + // find next BB + ctx.fallthrough = i + 1 < ctx.bb_count ? bbs[bb_order[i + 1]] : NULL; - Inst* label = inst_label(stop_bb); + Inst* label = inst_label(bb); if (ctx.first == NULL) { ctx.first = ctx.head = label; } else { append_inst(&ctx, label); } - TB_Node* end = TB_NODE_GET_EXTRA_T(stop_bb, TB_NodeRegion)->end; - isel_region(&ctx, stop_bb, end); - } else { - // liveness expects one but we don't really have shit to put down there... it's never reached - append_inst(&ctx, alloc_inst(INST_EPILOGUE, TB_TYPE_VOID, 0, 0, 0)); + TB_Node* end = nl_map_get_checked(ctx.cfg.node_to_block, bb).end; + isel_region(&ctx, bb, end, i); } } p->worklist = ctx.worklist; @@ -967,10 +998,11 @@ static void compile_function(TB_Passes* restrict p, TB_FunctionOutput* restrict // Arch-specific: convert instruction buffer into actual instructions CUIK_TIMED_BLOCK("emit code") { - emit_code(&ctx, func_out); + emit_code(&ctx, func_out, end); } } + tb_free_cfg(&ctx.cfg); nl_map_free(ctx.emit.labels); nl_map_free(ctx.machine_bbs); dyn_array_destroy(ctx.intervals); @@ -999,7 +1031,7 @@ static void get_data_type_size(TB_DataType dt, size_t* out_size, size_t* out_ali // round up bits to a byte int bits = is_big_int ? ((dt.data + 7) / 8) : tb_next_pow2(dt.data - 1); - *out_size = ((bits+7) / 8) << dt.width; + *out_size = ((bits+7) / 8); *out_align = is_big_int ? 
8 : ((dt.data + 7) / 8); break; } @@ -1009,7 +1041,7 @@ static void get_data_type_size(TB_DataType dt, size_t* out_size, size_t* out_ali else if (dt.data == TB_FLT_64) s = 8; else tb_unreachable(); - *out_size = s << dt.width; + *out_size = s; *out_align = s; break; } diff --git a/tb/src/codegen/reg_alloc.h b/tb/src/codegen/reg_alloc.h index 15cc9524..db20ea3f 100644 --- a/tb/src/codegen/reg_alloc.h +++ b/tb/src/codegen/reg_alloc.h @@ -40,9 +40,6 @@ struct LiveInterval { // register num, -1 if the interval isn't a physical reg int reg, hint; - // whole interval - int start, end; - // spill point, -1 if there's none int spill, split_kid; @@ -91,7 +88,6 @@ static void add_use_pos(LiveInterval* interval, int t, int kind) { dyn_array_put(interval->uses, u); } -// interval->start is filled in by the definition static void add_range(LiveInterval* interval, int start, int end) { assert(start <= end); size_t count = dyn_array_length(interval->ranges); @@ -105,8 +101,6 @@ static void add_range(LiveInterval* interval, int start, int end) { LiveRange r = { start, end }; dyn_array_put(interval->ranges, r); } - - if (end > interval->end) interval->end = end; } static void reverse_bb_walk(LSRA* restrict ra, MachineBB* bb, Inst* inst) { @@ -127,13 +121,12 @@ static void reverse_bb_walk(LSRA* restrict ra, MachineBB* bb, Inst* inst) { assert(*ops >= 0); LiveInterval* interval = &ra->intervals[*ops++]; - if (interval->ranges == NULL) { + if (dyn_array_length(interval->ranges) == 1) { add_range(interval, inst->time, inst->time); } else { interval->ranges[dyn_array_length(interval->ranges) - 1].start = inst->time; } - interval->start = inst->time; add_use_pos(interval, inst->time, dst_use_reg ? USE_REG : USE_OUT); } @@ -158,33 +151,19 @@ static void reverse_bb_walk(LSRA* restrict ra, MachineBB* bb, Inst* inst) { } } -static int range_intersect(int start, int end, LiveRange* b) { - if (b->start <= end && start <= b->end) { - return start > b->start ? start : b->start; +static int range_intersect(LiveRange* a, LiveRange* b) { + if (b->start <= a->end && a->start <= b->end) { + return a->start > b->start ? a->start : b->start; } else { return -1; } } static int interval_intersect(LiveInterval* a, LiveInterval* b) { - if (!(b->start <= a->end && a->start <= b->end)) { - return -1; // don't intersect at all - } - - FOREACH_N(i, 0, dyn_array_length(a->ranges)) { - LiveRange a_range = a->ranges[i]; - - FOREACH_N(j, 0, dyn_array_length(b->ranges)) { - LiveRange b_range = b->ranges[j]; - - // if the end is greater than the start, then we've overshot - if (a_range.start >= b_range.end) { - break; - } - - if (a_range.end >= b_range.start) { - return b_range.start > a_range.start ? 
b_range.start : a_range.start; - } + dyn_array_for(i, a->ranges) { + int t = range_intersect(&a->ranges[i], &b->ranges[b->active_range]); + if (t >= 0) { + return t; } } @@ -192,8 +171,7 @@ static int interval_intersect(LiveInterval* a, LiveInterval* b) { } #define FOREACH_SET(it, set) \ -FOREACH_N(_i, 0, ((set).capacity + 63) / 64) \ -for (uint64_t bits = (set).data[_i], it = _i*64; bits; bits >>= 1, it++) if (bits & 1) +FOREACH_N(_i, 0, ((set).capacity + 63) / 64) FOREACH_BIT(it, _i*64, (set).data[_i]) static int next_use(LSRA* restrict ra, LiveInterval* interval, int time) { for (;;) { @@ -207,26 +185,9 @@ static int next_use(LSRA* restrict ra, LiveInterval* interval, int time) { interval = &ra->intervals[interval->split_kid]; continue; } - } - - return INT_MAX; -} - -// if < 0, then it's -x - 1 where x is the nearest starting point -static int covers(LiveInterval* it, int start, int end) { - size_t i = 0, count = dyn_array_length(it->ranges); - for (; i < count; i++) { - // if the end is greater than the start, then we've overshot - if (start > it->ranges[i].end) { - return -1; - } - if (end >= it->ranges[i].start) { - return i; - } + return INT_MAX; } - - return -1; } static LiveInterval* get_active(LSRA* restrict ra, int rc, int reg) { @@ -273,13 +234,16 @@ static void insert_split_move(LSRA* restrict ra, int t, int old_reg, int new_reg prev->next = new_inst; } +static int interval_start(LiveInterval* interval) { return interval->ranges[dyn_array_length(interval->ranges) - 1].start; } +static int interval_end(LiveInterval* interval) { return interval->ranges[1].end; } + static LiveInterval* split_interval_at(LSRA* restrict ra, LiveInterval* interval, int pos) { // skip past previous intervals - while (interval->split_kid >= 0 && pos > interval->end) { + while (interval->split_kid >= 0 && pos > interval_end(interval)) { interval = &ra->intervals[interval->split_kid]; } - assert(interval->reg >= 0 || pos <= interval->end); + assert(interval->reg >= 0 || pos <= interval_end(interval)); return interval; } @@ -295,13 +259,10 @@ static int split_intersecting(LSRA* restrict ra, int current_time, int pos, Live int size = 8; ra->stack_usage = align_up(ra->stack_usage + size, size); + // remove from active set + set_remove(&ra->active_set[interval->reg_class], interval->assigned); + REG_ALLOC_LOG printf(" \x1b[33m# v%d: spill %s to [RBP - %d] at t=%d\x1b[0m\n", ri, reg_name(interval->reg_class, interval->assigned), ra->stack_usage, pos); - if (current_time >= pos && interval->assigned >= 0) { - if (set_get(&ra->active_set[interval->reg_class], interval->assigned) && ra->active[interval->reg_class][interval->assigned] == ri) { - REG_ALLOC_LOG printf(" \x1b[33m# v%d: expired during split\x1b[0m\n", ri); - set_remove(&ra->active_set[interval->reg_class], interval->assigned); - } - } } // split lifetime @@ -313,18 +274,17 @@ static int split_intersecting(LSRA* restrict ra, int current_time, int pos, Live it.reg = -1; } it.assigned = it.reg = -1; - it.start = pos; - it.end = interval->end; it.uses = NULL; - it.ranges = NULL; + it.ranges = dyn_array_create(LiveRange, 4); it.n = NULL; it.split_kid = -1; - interval->end = pos; + assert(interval->split_kid < 0 && "cannot spill while spilled"); int old_reg = interval - ra->intervals; int new_reg = dyn_array_length(ra->intervals); interval->split_kid = new_reg; + dyn_array_put(it.ranges, (LiveRange){ INT_MAX, INT_MAX }); dyn_array_put(ra->intervals, it); interval = &ra->intervals[old_reg]; @@ -333,7 +293,7 @@ static int split_intersecting(LSRA* restrict 
ra, int current_time, int pos, Live // unhandled list... we can push this to the top wit no problem size_t i = 0, count = dyn_array_length(ra->unhandled); for (; i < count; i++) { - if (pos > ra->intervals[ra->unhandled[i]].start) break; + if (pos > interval_start(&ra->intervals[ra->unhandled[i]])) break; } // we know where to insert @@ -360,7 +320,7 @@ static int split_intersecting(LSRA* restrict ra, int current_time, int pos, Live } // split ranges - for (size_t i = 0; i < dyn_array_length(interval->ranges);) { + for (size_t i = 1; i < dyn_array_length(interval->ranges);) { LiveRange* range = &interval->ranges[i]; if (range->start > pos) { dyn_array_put(it.ranges, *range); @@ -372,6 +332,8 @@ static int split_intersecting(LSRA* restrict ra, int current_time, int pos, Live memmove(range, range + 1, shift * sizeof(LiveRange)); } dyn_array_pop(interval->ranges); + interval->active_range -= 1; + continue; } else if (range->end > pos) { // intersects pos, we need to split the range LiveRange r = { pos, range->end }; @@ -426,7 +388,7 @@ static ptrdiff_t allocate_free_reg(LSRA* restrict ra, LiveInterval* interval) { LiveInterval* it = &ra->intervals[ra->inactive[i]]; int fp = ra->free_pos[it->assigned]; if (fp > 0) { - int p = range_intersect(interval->start, interval->end, &it->ranges[it->active_range]); + int p = interval_intersect(interval, it); if (p >= 0 && p < fp) { ra->free_pos[it->assigned] = p; } @@ -448,7 +410,7 @@ static ptrdiff_t allocate_free_reg(LSRA* restrict ra, LiveInterval* interval) { assert(hint->reg_class == rc); hint_reg = hint->assigned; - if (interval->end <= ra->free_pos[hint_reg]) { + if (interval_end(interval) <= ra->free_pos[hint_reg]) { highest = hint_reg; } } @@ -488,14 +450,16 @@ static ptrdiff_t allocate_free_reg(LSRA* restrict ra, LiveInterval* interval) { dyn_array_put(ra->intervals, it); // insert spill and reload - insert_split_move(ra, 0, vreg, spill_slot); - insert_split_move(ra, ra->endpoint, spill_slot, vreg); + insert_split_move(ra, 0, vreg, spill_slot); + if (ra->endpoint) { + insert_split_move(ra, ra->endpoint, spill_slot, vreg); + } // adding to intervals might resized this interval = &ra->intervals[old_reg]; } - if (interval->end <= pos) { + if (interval_end(interval) <= pos) { // we can steal it completely REG_ALLOC_LOG printf(" # assign to %s", reg_name(rc, highest)); @@ -511,7 +475,7 @@ static ptrdiff_t allocate_free_reg(LSRA* restrict ra, LiveInterval* interval) { } else { // TODO(NeGate): split current at optimal position before current interval->assigned = highest; - split_intersecting(ra, interval->start, pos - 1, interval, true); + split_intersecting(ra, interval_start(interval), pos - 1, interval, true); } return highest; @@ -526,17 +490,18 @@ static ptrdiff_t allocate_blocked_reg(LSRA* restrict ra, LiveInterval* interval) FOREACH_N(i, 0, 16) use_pos[i] = INT_MAX; // mark non-fixed intervals + int start = interval_start(interval); FOREACH_SET(i, ra->active_set[rc]) { LiveInterval* it = &ra->intervals[ra->active[rc][i]]; if (it->reg_class == rc && it->reg < 0) { - use_pos[i] = next_use(ra, it, interval->start); + use_pos[i] = next_use(ra, it, start); } } dyn_array_for(i, ra->inactive) { LiveInterval* it = &ra->intervals[ra->inactive[i]]; if (it->reg_class == rc && it->reg < 0) { - use_pos[i] = next_use(ra, it, interval->start); + use_pos[i] = next_use(ra, it, start); } } @@ -554,7 +519,7 @@ static ptrdiff_t allocate_blocked_reg(LSRA* restrict ra, LiveInterval* interval) if (it->reg_class == rc && it->reg >= 0) { int bp = 
ra->block_pos[it->assigned]; if (bp > 0) { - int p = range_intersect(interval->start, interval->end, &it->ranges[it->active_range]); + int p = interval_intersect(interval, it); if (p >= 0 && p < bp) { ra->block_pos[it->assigned] = p; } @@ -575,7 +540,10 @@ static ptrdiff_t allocate_blocked_reg(LSRA* restrict ra, LiveInterval* interval) } int pos = use_pos[highest]; - int first_use = interval->uses[dyn_array_length(interval->uses) - 1].pos; + int first_use = INT_MAX; + if (dyn_array_length(interval->uses)) { + first_use = interval->uses[dyn_array_length(interval->uses) - 1].pos; + } bool spilled = false; if (first_use > pos) { @@ -588,19 +556,20 @@ static ptrdiff_t allocate_blocked_reg(LSRA* restrict ra, LiveInterval* interval) // split at optimal spot before first use that requires a register FOREACH_REVERSE_N(i, 0, dyn_array_length(interval->uses)) { if (interval->uses[i].pos >= pos && interval->uses[i].kind == USE_REG) { - split_intersecting(ra, interval->start, interval->uses[i].pos - 1, interval, false); + split_intersecting(ra, start, interval->uses[i].pos - 1, interval, false); break; } } spilled = true; } else { - int split_pos = (interval->start & ~1) - 1; + int start = interval_start(interval); + int split_pos = (start & ~1) - 1; // split active or inactive interval reg LiveInterval* to_split = get_active(ra, rc, highest); if (to_split != NULL) { - split_intersecting(ra, interval->start, split_pos, to_split, true); + split_intersecting(ra, start, split_pos, to_split, true); } // split any inactive interval for reg at the end of it's lifetime hole @@ -609,7 +578,7 @@ static ptrdiff_t allocate_blocked_reg(LSRA* restrict ra, LiveInterval* interval) LiveRange* r = &it->ranges[it->active_range]; if (it->reg_class == rc && it->assigned == highest && r->start <= pos+1 && pos <= r->end) { - split_intersecting(ra, interval->start, split_pos, it, true); + split_intersecting(ra, start, split_pos, it, true); } } } @@ -617,9 +586,9 @@ static ptrdiff_t allocate_blocked_reg(LSRA* restrict ra, LiveInterval* interval) // split active reg if it intersects with fixed interval LiveInterval* fix_interval = &ra->intervals[(rc ? 
FIRST_XMM : FIRST_GPR) + highest]; if (dyn_array_length(fix_interval->ranges)) { - int p = range_intersect(interval->start, interval->end, &fix_interval->ranges[fix_interval->active_range]); + int p = interval_intersect(interval, fix_interval); if (p >= 0) { - split_intersecting(ra, interval->start, p, fix_interval, true); + split_intersecting(ra, start, p, interval, true); } } @@ -631,22 +600,27 @@ static void move_to_active(LSRA* restrict ra, LiveInterval* interval) { int ri = interval - ra->intervals; if (set_get(&ra->active_set[rc], reg)) { - tb_panic("intervals should never be forced out, we should've accomodated them in the first place"); + tb_panic("v%d: interval v%d should never be forced out, we should've accomodated them in the first place", ri, ra->active[rc][reg]); } + assert(reg < 16); set_put(&ra->active_set[rc], reg); ra->active[rc][reg] = ri; } // update active range to match where the position is currently static bool update_interval(LSRA* restrict ra, LiveInterval* restrict interval, bool is_active, int time, int inactive_index) { - int ri = interval - ra->intervals; + /*while (interval->split_kid >= 0) { + interval = &ra->intervals[interval->split_kid]; + }*/ // get to the right range first while (interval->ranges[interval->active_range].end <= time) { + assert(interval->active_range > 0); interval->active_range -= 1; } + int ri = interval - ra->intervals; int hole_end = interval->ranges[interval->active_range].start; int active_end = interval->ranges[interval->active_range].end; bool is_now_active = time >= hole_end; @@ -654,7 +628,7 @@ static bool update_interval(LSRA* restrict ra, LiveInterval* restrict interval, int rc = interval->reg_class; int reg = interval->assigned; - if (time >= interval->end) { // expired + if (interval->active_range == 0) { // expired if (is_active) { REG_ALLOC_LOG printf(" # active %s has expired (v%d)\n", reg_name(rc, reg), ri); set_remove(&ra->active_set[rc], reg); @@ -694,8 +668,8 @@ static int linear_scan(Ctx* restrict ctx, TB_Function* f, int stack_usage, int e MachineBBs mbbs = ctx->machine_bbs; size_t interval_count = dyn_array_length(ra.intervals); CUIK_TIMED_BLOCK("build intervals") { - FOREACH_N(i, 0, ctx->block_count) { - TB_Node* bb = ctx->worklist.items[i]; + FOREACH_REVERSE_N(i, 0, ctx->bb_count) { + TB_Node* bb = ctx->worklist.items[ctx->bb_order[i]]; MachineBB* mbb = &nl_map_get_checked(mbbs, bb); int bb_start = mbb->start; @@ -721,21 +695,20 @@ static int linear_scan(Ctx* restrict ctx, TB_Function* f, int stack_usage, int e // we use every fixed interval at the very start to force them into // the inactive set. 
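
Aside: with the whole-interval start/end fields removed from LiveInterval (see the reg_alloc.h hunks above), intersection queries are now answered purely from the per-interval range lists. A simplified, self-contained illustration of that range-based test follows; ToyRange and the helper names are made up for the example, and the real code only probes the other interval's current active_range rather than every range.

#include <stdio.h>

// Simplified stand-in: a live interval is just a sorted list of [start, end]
// ranges; the real code stores them in reverse order and tracks active_range.
typedef struct { int start, end; } ToyRange;

// first position where two ranges overlap, or -1 if they don't
static int toy_range_intersect(ToyRange a, ToyRange b) {
    if (b.start <= a.end && a.start <= b.end) {
        return a.start > b.start ? a.start : b.start;
    }
    return -1;
}

// first point where any range of `a` overlaps any range of `b`, or -1
static int toy_interval_intersect(const ToyRange* a, int an, const ToyRange* b, int bn) {
    for (int i = 0; i < an; i++) {
        for (int j = 0; j < bn; j++) {
            int t = toy_range_intersect(a[i], b[j]);
            if (t >= 0) return t;
        }
    }
    return -1;
}

int main(void) {
    // v1 lives over [2,6] and [10,14], v2 lives over [7,12]:
    // they first collide at t=10, so a register shared with v2 only helps v1
    // until that point.
    ToyRange v1[] = { { 2, 6 }, { 10, 14 } };
    ToyRange v2[] = { { 7, 12 } };
    printf("first intersection at t=%d\n", toy_interval_intersect(v1, 2, v2, 1));
    return 0;
}

In the allocator above, this first-intersection position is what lowers free_pos for a candidate register, i.e. how long the register stays usable before the current interval would have to split.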
- FOREACH_N(i, 0, 32) if (ra.intervals[i].ranges) { - ra.intervals[i].start = 0; + FOREACH_N(i, 0, 32) { add_range(&ra.intervals[i], 0, 1); } - dyn_array_destroy(ra.intervals[RBP].ranges); - dyn_array_destroy(ra.intervals[RSP].ranges); - ra.endpoint = end; mark_callee_saved_constraints(ctx, ra.callee_saved); // generate unhandled interval list (sorted by starting point) ra.unhandled = dyn_array_create(LiveInterval*, (interval_count * 4) / 3); - FOREACH_N(i, 0, interval_count) dyn_array_put(ra.unhandled, i); - cuiksort_defs(ctx->intervals, 0, interval_count - 1, ra.unhandled); + FOREACH_N(i, 0, interval_count) { + ra.intervals[i].active_range = dyn_array_length(ra.intervals[i].ranges) - 1; + dyn_array_put(ra.unhandled, i); + } + cuiksort_defs(ra.intervals, 0, interval_count - 1, ra.unhandled); // only need enough to store for the biggest register class ra.free_pos = TB_ARENA_ARR_ALLOC(tmp_arena, 16, int); @@ -748,28 +721,24 @@ static int linear_scan(Ctx* restrict ctx, TB_Function* f, int stack_usage, int e LiveInterval* interval = &ra.intervals[ri]; // unused interval, skip - if (interval->ranges == NULL) continue; - - int time = interval->start; - - int before_next_time = interval->start; - if (dyn_array_length(ra.unhandled)) { - int before = ra.intervals[ra.unhandled[dyn_array_length(ra.unhandled) - 1]].start - 1; - if (before > before_next_time) { - before_next_time = before; - } + if (interval->reg >= 0) { + continue; } + int time = interval->ranges[interval->active_range].start; + assert(time != INT_MAX); + + int end = interval_end(interval); if (interval->reg >= 0) { - REG_ALLOC_LOG printf(" # %-5s t=[%-4d - %4d)\n", reg_name(interval->reg_class, interval->reg), time, interval->end); + REG_ALLOC_LOG printf(" # %-5s t=[%-4d - %4d)\n", reg_name(interval->reg_class, interval->reg), time, end); } else if (interval->spill > 0) { REG_ALLOC_LOG { - printf(" # v%-4d t=[%-4d - %4d) SPILLED [RBP - %d]\n", ri, time, interval->end, interval->spill); + printf(" # v%-4d t=[%-4d - %4d) SPILLED [RBP - %d]\n", ri, time, end, interval->spill); } continue; } else { REG_ALLOC_LOG { - printf(" # v%-4d t=[%-4d - %4d) ", ri, time, interval->end); + printf(" # v%-4d t=[%-4d - %4d) ", ri, time, end); if (interval->n != NULL) { print_node_sexpr(interval->n, 0); } @@ -818,7 +787,6 @@ static int linear_scan(Ctx* restrict ctx, TB_Function* f, int stack_usage, int e // add to active set if (reg >= 0) { interval->assigned = reg; - interval->active_range = dyn_array_length(interval->ranges) - 1; move_to_active(&ra, interval); } @@ -843,46 +811,36 @@ static int linear_scan(Ctx* restrict ctx, TB_Function* f, int stack_usage, int e // move resolver CUIK_TIMED_BLOCK("move resolver") { - FOREACH_N(i, 0, ctx->block_count) { - TB_Node* bb = ctx->worklist.items[i]; - assert(bb->type == TB_START || bb->type == TB_REGION); - + TB_Node** bbs = ctx->worklist.items; + int* bb_order = ctx->bb_order; + FOREACH_N(i, 0, ctx->bb_count) { + TB_Node* bb = bbs[bb_order[i]]; MachineBB* mbb = &nl_map_get_checked(mbbs, bb); - TB_NodeRegion* r = TB_NODE_GET_EXTRA(bb); - - if (r->end->type != TB_BRANCH) { - continue; - } - - TB_NodeBranch* br = TB_NODE_GET_EXTRA(r->end); - FOREACH_REVERSE_N(i, 0, br->succ_count) { - TB_Node* bb = br->succ[i]; - MachineBB* target = &nl_map_get_checked(mbbs, bb); - - // for all live-ins, we should check if we need to insert a move - FOREACH_SET(i, target->live_in) { - LiveInterval* interval = &ra.intervals[i]; - - // if the value changes across the edge, insert move - LiveInterval* start = 
split_interval_at(&ra, interval, mbb->end); - LiveInterval* end = split_interval_at(&ra, interval, target->start); - - if (start != end) { - if (start->spill > 0) { - assert(end->spill <= 0 && "TODO: both can't be spills yet"); - insert_split_move(&ra, target->start + 1, start - ra.intervals, end - ra.intervals); - } else { - insert_split_move(&ra, mbb->terminator - 1, start - ra.intervals, end - ra.intervals); + TB_Node* end_node = mbb->end_node; + + for (User* u = end_node->users; u; u = u->next) { + if (cfg_is_control(u->n)) { + TB_Node* succ = cfg_get_fallthru(u->n); + MachineBB* target = &nl_map_get_checked(mbbs, succ); + + // for all live-ins, we should check if we need to insert a move + FOREACH_SET(k, target->live_in) { + LiveInterval* interval = &ra.intervals[k]; + + // if the value changes across the edge, insert move + LiveInterval* start = split_interval_at(&ra, interval, mbb->end); + LiveInterval* end = split_interval_at(&ra, interval, target->start); + + if (start != end) { + if (start->spill > 0) { + assert(end->spill <= 0 && "TODO: both can't be spills yet"); + insert_split_move(&ra, target->start + 1, start - ra.intervals, end - ra.intervals); + } else { + insert_split_move(&ra, mbb->terminator - 1, start - ra.intervals, end - ra.intervals); + } } } } - - // the moves are inserted either at the end of block from or at the beginning of block to, - // depending on the control flow - // resolver.find_insert_position(from, to) - - // insert all moves in correct order (without overwriting registers that are used later) - // resolver.resolve_mappings() } } } @@ -916,17 +874,17 @@ static int linear_scan(Ctx* restrict ctx, TB_Function* f, int stack_usage, int e // Sorting unhandled list //////////////////////////////// static size_t partition(LiveInterval* intervals, ptrdiff_t lo, ptrdiff_t hi, RegIndex* arr) { - int pivot = intervals[arr[(hi - lo) / 2 + lo]].start; // middle + int pivot = interval_start(&intervals[arr[(hi - lo) / 2 + lo]]); // middle ptrdiff_t i = lo - 1, j = hi + 1; for (;;) { // Move the left index to the right at least once and while the element at // the left index is less than the pivot - do { i += 1; } while (intervals[arr[i]].start > pivot); + do { i += 1; } while (interval_start(&intervals[arr[i]]) > pivot); // Move the right index to the left at least once and while the element at // the right index is greater than the pivot - do { j -= 1; } while (intervals[arr[j]].start < pivot); + do { j -= 1; } while (interval_start(&intervals[arr[j]]) < pivot); // If the indices crossed, return if (i >= j) return j; diff --git a/tb/src/debug/cv.c b/tb/src/debug/cv.c index 8b1a37f6..8c4dbdbd 100644 --- a/tb/src/debug/cv.c +++ b/tb/src/debug/cv.c @@ -3,6 +3,15 @@ #include "cv_type_builder.c" +#include + +#if defined(_WIN32) && !defined(_POSIX_C_SOURCE) +#define fileno _fileno +#define fstat _fstat +#define stat _stat +#define strdup _strdup +#endif + // constant sized "hash map" which is used to // deduplicate types in the codeview #define MAX_TYPE_ENTRY_LOOKUP_SIZE 1024 @@ -35,7 +44,6 @@ static void md5sum_file(uint8_t out_bytes[16], const char* filepath) { } static uint16_t get_codeview_type(TB_DataType dt) { - assert(dt.width == 0 && "TODO: implement vector types in CodeView output"); switch (dt.type) { case TB_INT: { if (dt.data <= 0) return 0x0003; // T_VOID diff --git a/tb/src/ir_printer.c b/tb/src/ir_printer.c index b2eb6b80..61a1a35a 100644 --- a/tb/src/ir_printer.c +++ b/tb/src/ir_printer.c @@ -11,6 +11,7 @@ TB_API void tb_default_print_callback(void* 
user_data, const char* fmt, ...) { TB_API const char* tb_node_get_name(TB_Node* n) { switch (n->type) { case TB_NULL: return "BAD"; + case TB_DEAD: return "dead"; case TB_START: return "start"; case TB_END: return "end"; @@ -70,6 +71,8 @@ TB_API const char* tb_node_get_name(TB_Node* n) { case TB_ATOMIC_OR: return "atomic.or"; case TB_ATOMIC_CAS: return "atomic.cas"; + case TB_CLZ: return "clz"; + case TB_CTZ: return "ctz"; case TB_NEG: return "neg"; case TB_NOT: return "not"; case TB_AND: return "and"; @@ -110,8 +113,6 @@ TB_API const char* tb_node_get_name(TB_Node* n) { #define P(...) callback(user_data, __VA_ARGS__) static void tb_print_type(TB_DataType dt, TB_PrintCallback callback, void* user_data) { - assert(dt.width < 8 && "Vector width too big!"); - switch (dt.type) { case TB_INT: { if (dt.data == 0) P("void"); @@ -148,257 +149,145 @@ static void tb_print_type(TB_DataType dt, TB_PrintCallback callback, void* user_ } } -#if 0 -static void tb_print_node(TB_Function* f, NL_HashSet* visited, TB_PrintCallback callback, void* user_data, TB_Node* restrict n) { - if (!nl_hashset_put(visited, n)) { - return; - } - - bool is_effect = tb_has_effects(n); - - const char* fillcolor = is_effect ? "lightgrey" : "antiquewhite1"; - if (n->type == TB_PROJ) { - fillcolor = "lightblue"; - } - - P(" r%p [style=\"filled\"; ordering=in; shape=box; fillcolor=%s; label=\"", n, fillcolor); - switch (n->type) { - case TB_FLOAT32_CONST: { - TB_NodeFloat32* f = TB_NODE_GET_EXTRA(n); - P("f32 %f", f->value); - break; - } - - case TB_FLOAT64_CONST: { - TB_NodeFloat64* f = TB_NODE_GET_EXTRA(n); - P("f64 %f", f->value); - break; +static bool print_graph_node(TB_Function* f, NL_HashSet* visited, TB_PrintCallback callback, void* user_data, TB_Node* restrict n) { + do { + if (!nl_hashset_put(visited, n)) { + return false; } - case TB_INTEGER_CONST: { - P("%s ", tb_node_get_name(n)); + /*bool is_effect = tb_has_effects(n); + const char* fillcolor = is_effect ? "lightgrey" : "antiquewhite1"; + if (n->dt.type == TB_MEMORY) { + fillcolor = "lightblue"; + }*/ + + P(" r%u [ordering=in; shape=plaintext; label=\"", n->gvn); + switch (n->type) { + case TB_START: P("start"); break; + case TB_REGION: P("region"); break; + + case TB_LOAD: { + P("ld."); + tb_print_type(n->dt, callback, user_data); + break; + } + case TB_STORE: { + P("st."); + tb_print_type(n->inputs[2]->dt, callback, user_data); + break; + } - TB_NodeInt* num = TB_NODE_GET_EXTRA(n); - tb_print_type(n->dt, callback, user_data); + case TB_SYMBOL: { + TB_Symbol* sym = TB_NODE_GET_EXTRA_T(n, TB_NodeSymbol)->sym; + if (sym->name[0]) { + P("%s", sym->name); + } else { + P("sym%p", sym); + } + break; + } - if (num->value < 0xFFFF) { - int bits = n->dt.type == TB_PTR ? 64 : n->dt.data; - int64_t x = tb__sxt(num->value, bits, 64); + case TB_BITCAST: { + P("bitcast "); + tb_print_type(n->inputs[1]->dt, callback, user_data); + P(" -> "); + tb_print_type(n->dt, callback, user_data); + break; + } - P(" %"PRId64, x); - } else { - P("%#0"PRIx64, num->value); + case TB_INTEGER_CONST: { + TB_NodeInt* num = TB_NODE_GET_EXTRA(n); + if (num->value < 0xFFFF) { + P("%"PRId64, num->value); + } else { + P("%#0"PRIx64, num->value); + } + break; } - break; - } - case TB_MEMBER_ACCESS: { - TB_NodeMember* m = TB_NODE_GET_EXTRA(n); - P("member %"PRId64, m->offset); - break; - } + case TB_ARRAY_ACCESS: { + int64_t stride = TB_NODE_GET_EXTRA_T(n, TB_NodeArray)->stride; + P("*%td", stride); + break; + } - case TB_SYMBOL: { - TB_NodeSymbol* s = TB_NODE_GET_EXTRA(n); - P("symbol %s", s->sym->name ? 
s->sym->name : "???"); - break; - } + case TB_MEMBER_ACCESS: { + int64_t offset = TB_NODE_GET_EXTRA_T(n, TB_NodeMember)->offset; + P("+%td", offset); + break; + } - case TB_END: { - P("stop "); - FOREACH_N(i, 1, n->input_count) { - if (i != 1) P(", "); - tb_print_type(n->inputs[i]->dt, callback, user_data); + case TB_PROJ: { + int index = TB_NODE_GET_EXTRA_T(n, TB_NodeProj)->index; + if (n->inputs[0]->type == TB_START) { + if (index == 0) { + P("ctrl"); + } else if (index == 1) { + P("mem"); + } else if (index == 2) { + P("rpc"); + } else { + P("%c", 'a'+(index - 3)); + } + } else { + P("%d", index); + } + break; } - break; - } - case TB_STORE: { - P("store "); - tb_print_type(n->inputs[3]->dt, callback, user_data); + default: + P("%s", tb_node_get_name(n)); break; } + P("\"]"); - case TB_START: - case TB_REGION: - case TB_BRANCH: - P("%s", tb_node_get_name(n)); - break; + FOREACH_N(i, 0, n->input_count) if (n->inputs[i]) { + TB_Node* in = n->inputs[i]; - case TB_PROJ: { - int index = TB_NODE_GET_EXTRA_T(n, TB_NodeProj)->index; + const char* color = "black"; + TB_DataType dt = n->type == TB_PROJ ? n->dt : in->dt; - P("proj."); - tb_print_type(n->dt, callback, user_data); - P(" %zu", index); - break; - } - - case TB_CMP_EQ: - case TB_CMP_NE: - case TB_CMP_ULT: - case TB_CMP_ULE: - case TB_CMP_SLT: - case TB_CMP_SLE: - case TB_CMP_FLT: - case TB_CMP_FLE: - P("%s ", tb_node_get_name(n)); - tb_print_type(n->inputs[1]->dt, callback, user_data); - break; - - default: - P("%s ", tb_node_get_name(n)); - tb_print_type(n->dt, callback, user_data); - break; - } - P("\"];\n"); - - FOREACH_N(i, 0, n->input_count) if (n->inputs[i]) { - TB_Node* in = n->inputs[i]; - - if (in->type == TB_PROJ && (in->inputs[0]->type != TB_START || in->dt.type == TB_CONTROL)) { - // projections get treated as edges - TB_Node* src = in->inputs[0]; - int index = TB_NODE_GET_EXTRA_T(in, TB_NodeProj)->index; - - tb_print_node(f, visited, callback, user_data, src); - - P(" r%p -> r%p [label=\"", src, n); - if (src->type == TB_BRANCH) { - // branch projections can get nicer looking - TB_NodeBranch* br = TB_NODE_GET_EXTRA(src); - - TB_Node* key = src->input_count > 1 ? src->inputs[1] : NULL; - if (br->keys[0] == 0 && br->succ_count == 2 && key && key->dt.type == TB_INT) { - // boolean branch, we can use true and false - P(index ? "is false?" 
: "is true?"); - } else if (br->succ_count == 1) { - P(""); - } else if (index == 0) { - P("is default?"); - } else { - P("is %d?", br->keys[index - 1]); - } - } else if (in->dt.type == TB_CONTROL) { - P("cproj"); - } else { - P("%zu", index); - } - - if (in->dt.type == TB_CONTROL) { - P("\"] [color=\"red\"]\n"); - } else { - P("\"]\n"); - } - } else { - tb_print_node(f, visited, callback, user_data, in); - P(" r%p -> r%p", in, n); - if (i == 0 || n->type == TB_REGION) { - P(" [color=\"red\"]"); - } else if (in->dt.type == TB_MEMORY) { - P(" [color=\"blue\" style=\"dashed\"]"); + if (dt.type == TB_CONTROL) { + color = "red"; + } else if (dt.type == TB_CONT) { + color = "purple"; + } else if (dt.type == TB_MEMORY) { + color = "blue"; } - if (n->type == TB_CALL && i > 1) { - P(" [label=\"%zu\"];\n", i - 2); - } else if (n->type == TB_PHI && i > 0) { - P(" [label=\"%zu\"];\n", i - 1); - } else { - P("\n"); - } + P("; r%u -> r%u [color=%s]", in->gvn, n->gvn, color); } - } -} -#endif - -static bool print_graph_node(TB_Function* f, NL_HashSet* visited, TB_PrintCallback callback, void* user_data, TB_Node* restrict n) { - if (!nl_hashset_put(visited, n)) { - return false; - } - - bool is_effect = tb_has_effects(n); - if (is_effect) { - return false; - } + P("\n"); - const char* fillcolor = is_effect ? "lightgrey" : "antiquewhite1"; - P(" r%p [style=\"filled\"; ordering=in; shape=box; fillcolor=%s; label=\"", n, fillcolor); - P("%zu: %s", n->gvn, tb_node_get_name(n)); - P("\"];\n"); + // print all the inputs + FOREACH_N(i, 1, n->input_count) if (n->inputs[i]) { + print_graph_node(f, visited, callback, user_data, n->inputs[i]); + } - FOREACH_N(i, 0, n->input_count) if (n->inputs[i]) { - TB_Node* in = n->inputs[i]; - P(" r%p -> r%p\n", in, n); - print_graph_node(f, visited, callback, user_data, in); - } + if (n->input_count == 0) break; + n = n->inputs[0]; + } while (n != NULL); return true; } -static void print_graph_bb(TB_Function* f, NL_HashSet* visited, TB_PrintCallback callback, void* user_data, TB_Node* restrict bb) { - if (!nl_hashset_put(visited, bb)) { - return; - } - - // walk control edges (aka predecessors) - TB_NodeRegion* r = TB_NODE_GET_EXTRA(bb); - if (r->end->type == TB_BRANCH) { - TB_NodeBranch* br = TB_NODE_GET_EXTRA(r->end); - FOREACH_REVERSE_N(i, 0, br->succ_count) { - print_graph_bb(f, visited, callback, user_data, br->succ[i]); - } - } - - P(" subgraph {\n", bb->gvn); - TB_Node* curr = r->end; - do { - nl_hashset_put(visited, curr); - P(" r%p [style=\"filled\"; shape=box; fillcolor=antiquewhite1; label=\"%zu: ", curr, curr->gvn); - if (curr->type == TB_END) { - P("END"); - } else { - P("EFFECT"); - } - P("\"]\n r%p -> r%p\n", curr->inputs[0], curr); - curr = curr->inputs[0]; - } while (curr != bb); - - // basic block header - P(" r%p [style=\"filled\"; shape=box; fillcolor=antiquewhite1; label=\"%zu: %s\"]\n", bb, bb->gvn, bb->type == TB_START ? "START" : "REGION"); - if (bb->type == TB_START) { - P(" { rank=min; r%p }\n", bb); - } else if (r->end->type == TB_END) { - P(" { rank=max; r%p }\n", r->end); - } - P(" }\n"); - - // write predecessor edges - FOREACH_N(i, 0, bb->input_count) { - TB_Node* pred = bb->inputs[i]; - if (pred->type == TB_PROJ) { - P(" r%p -> r%p\n", pred->inputs[0], bb); - } else { - P(" r%p -> r%p\n", bb->inputs[i], bb); - } - } +TB_API void tb_pass_print_dot(TB_Passes* opt, TB_PrintCallback callback, void* user_data) { + TB_Function* f = opt->f; + P("digraph %s {\n", f->super.name ? 
f->super.name : "unnamed"); - // process adjacent nodes - curr = r->end; - do { - FOREACH_N(i, 1, curr->input_count) { - print_graph_node(f, visited, callback, user_data, curr->inputs[i]); - P(" r%p -> r%p\n", curr->inputs[i], curr); - } - curr = curr->inputs[0]; - } while (curr != bb); -} + Worklist tmp_ws = { 0 }; + worklist_alloc(&tmp_ws, f->node_count); -TB_API void tb_function_print(TB_Function* f, TB_PrintCallback callback, void* user_data) { - P("digraph %s {\n rankdir=TB\n", f->super.name ? f->super.name : "unnamed"); + TB_CFG cfg = tb_compute_rpo2(f, &tmp_ws, &opt->stack); NL_HashSet visited = nl_hashset_alloc(f->node_count); - print_graph_bb(f, &visited, callback, user_data, f->start_node); + FOREACH_N(i, 0, cfg.block_count) { + TB_BasicBlock* bb = &nl_map_get_checked(cfg.node_to_block, tmp_ws.items[i]); + print_graph_node(f, &visited, callback, user_data, bb->end); + } nl_hashset_free(visited); + worklist_free(&tmp_ws); + tb_free_cfg(&cfg); - P("}\n\n"); + P("}\n"); } diff --git a/tb/src/jit.c b/tb/src/jit.c index 95073e2d..603c4f3e 100644 --- a/tb/src/jit.c +++ b/tb/src/jit.c @@ -1,6 +1,9 @@ #include "tb_internal.h" #include "host.h" +#define WIN32_LEAN_AND_MEAN +#include + enum { ALLOC_GRANULARITY = 16, diff --git a/tb/src/libtb.c b/tb/src/libtb.c index c3ec043f..40ace594 100644 --- a/tb/src/libtb.c +++ b/tb/src/libtb.c @@ -33,32 +33,7 @@ #include "linker/elf.c" // Platform layer -#if defined(_POSIX_C_SOURCE) && !defined(_WIN32) -void* tb_platform_valloc(size_t size) { - return mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); -} - -void* tb_platform_valloc_guard(size_t size) { - return mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); -} - -void tb_platform_vfree(void* ptr, size_t size) { - munmap(ptr, size); -} - -bool tb_platform_vprotect(void* ptr, size_t size, TB_MemProtect prot) { - uint32_t protect; - switch (prot) { - case TB_PAGE_RO: protect = PROT_READ; break; - case TB_PAGE_RW: protect = PROT_READ | PROT_WRITE; break; - case TB_PAGE_RX: protect = PROT_READ | PROT_EXEC; break; - case TB_PAGE_RXW: protect = PROT_READ | PROT_WRITE | PROT_EXEC; break; - default: return false; - } - - return mprotect(ptr, size, protect) == 0; -} -#elif defined(_WIN32) +#if defined(_WIN32) #pragma comment(lib, "onecore.lib") void* tb_platform_valloc(size_t size) { @@ -135,4 +110,29 @@ void* tb_jit_create_stack(size_t* out_size) { return VirtualAlloc2(GetCurrentProcess(), NULL, size, MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES, PAGE_READWRITE, ¶m, 1); } #endif /* NTDDI_VERSION >= NTDDI_WIN10_RS4 */ +#elif defined(_POSIX_C_SOURCE) +void* tb_platform_valloc(size_t size) { + return mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); +} + +void* tb_platform_valloc_guard(size_t size) { + return mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); +} + +void tb_platform_vfree(void* ptr, size_t size) { + munmap(ptr, size); +} + +bool tb_platform_vprotect(void* ptr, size_t size, TB_MemProtect prot) { + uint32_t protect; + switch (prot) { + case TB_PAGE_RO: protect = PROT_READ; break; + case TB_PAGE_RW: protect = PROT_READ | PROT_WRITE; break; + case TB_PAGE_RX: protect = PROT_READ | PROT_EXEC; break; + case TB_PAGE_RXW: protect = PROT_READ | PROT_WRITE | PROT_EXEC; break; + default: return false; + } + + return mprotect(ptr, size, protect) == 0; +} #endif diff --git a/tb/src/objects/coff.h b/tb/src/objects/coff.h index 4f26e74e..85d52468 100644 --- a/tb/src/objects/coff.h +++ 
b/tb/src/objects/coff.h @@ -1,14 +1,6 @@ // https://github.com/dotnet/runtime/blob/main/docs/design/specs/PE-COFF.md #pragma once #include "../tb_internal.h" -#include - -#if defined(_WIN32) && !defined(_POSIX_C_SOURCE) -#define fileno _fileno -#define fstat _fstat -#define stat _stat -#define strdup _strdup -#endif /*#if TB_HOST_ARCH == TB_HOST_X86_64 #include diff --git a/tb/src/objects/elf64.c b/tb/src/objects/elf64.c index e10ba616..2d638d36 100644 --- a/tb/src/objects/elf64.c +++ b/tb/src/objects/elf64.c @@ -22,7 +22,7 @@ static int put_symbol(TB_Emitter* stab, uint32_t name, uint8_t sym_info, uint16_ .size = size }; tb_outs(stab, sizeof(sym), (uint8_t*)&sym); - return stab->count / sizeof(TB_Elf64_Sym); + return (stab->count / sizeof(TB_Elf64_Sym)) - 1; } static void put_section_symbols(DynArray(TB_ModuleSection) sections, TB_Emitter* strtbl, TB_Emitter* stab, int t) { @@ -39,8 +39,12 @@ static void put_section_symbols(DynArray(TB_ModuleSection) sections, TB_Emitter* out_f->parent->super.symbol_id = put_symbol(stab, name, TB_ELF64_ST_INFO(t, TB_ELF64_STT_FUNC), sec_num, out_f->code_pos, out_f->code_size); } + int acceptable = t == TB_ELF64_STB_GLOBAL ? TB_LINKAGE_PUBLIC : TB_LINKAGE_PRIVATE; dyn_array_for(i, globals) { TB_Global* g = globals[i]; + if (g->linkage != acceptable) { + continue; + } uint32_t name = 0; if (g->super.name) { @@ -143,8 +147,8 @@ TB_ExportBuffer tb_elf64obj_write_output(TB_Module* m, const IDebugFormat* dbg) assert(dbg_section_count == 0); - put_section_symbols(sections, &strtbl, &local_symtab, TB_ELF64_STB_GLOBAL); - put_section_symbols(sections, &strtbl, &global_symtab, TB_ELF64_STB_LOCAL); + put_section_symbols(sections, &strtbl, &local_symtab, TB_ELF64_STB_LOCAL); + put_section_symbols(sections, &strtbl, &global_symtab, TB_ELF64_STB_GLOBAL); FOREACH_N(i, 0, exports.count) { TB_External* ext = exports.data[i]; @@ -207,7 +211,7 @@ TB_ExportBuffer tb_elf64obj_write_output(TB_Module* m, const IDebugFormat* dbg) dyn_array_for(j, funcs) { TB_FunctionOutput* func_out = funcs[j]; - size_t source_offset = func_out->prologue_length + func_out->code_pos; + size_t source_offset = func_out->code_pos; for (TB_SymbolPatch* p = func_out->last_patch; p; p = p->prev) { if (p->internal) continue; diff --git a/tb/src/opt/branches.h b/tb/src/opt/branches.h index e79d68a7..52094a6f 100644 --- a/tb/src/opt/branches.h +++ b/tb/src/opt/branches.h @@ -2,11 +2,13 @@ static TB_Node* ideal_region(TB_Passes* restrict p, TB_Function* f, TB_Node* n) { TB_NodeRegion* r = TB_NODE_GET_EXTRA(n); - // if there's one predecessor and it's to an unconditional branch, merge them. - if (n->input_count == 1 && n->inputs[0]->type == TB_PROJ && - n->inputs[0]->inputs[0]->type == TB_BRANCH && - n->inputs[0]->inputs[0]->input_count == 1) { - // check for any phi nodes + // if a region is dead, start a violent death chain + if (n->input_count == 0) { + n->type = TB_DEAD; + return n; + } else if (n->input_count == 1) { + // single entry regions are useless... 
+ // check for any phi nodes, because we're single entry they're all degens User* use = n->users; while (use != NULL) { User* next = use->next; @@ -17,69 +19,34 @@ static TB_Node* ideal_region(TB_Passes* restrict p, TB_Function* f, TB_Node* n) use = next; } - TB_Node* top_node = unsafe_get_region(n->inputs[0]); - TB_NodeRegion* top_region = TB_NODE_GET_EXTRA(top_node); - - // set new terminator - top_region->end = r->end; - TB_Node* parent = n->inputs[0]->inputs[0]->inputs[0]; - - tb_pass_kill_node(p, n->inputs[0]->inputs[0]); - tb_pass_kill_node(p, n->inputs[0]); + // we might want this as an identity + return n->inputs[0]; + } else { + // remove dead predeccessors + bool changes = false; - return parent; - } - - // if a region is dead, dettach it's succesors - if (n->input_count == 0 && r->end->type == TB_BRANCH) { - TB_NodeBranch* br = TB_NODE_GET_EXTRA(r->end); size_t i = 0; - while (i < br->succ_count) { - TB_Node* succ = br->succ[i]; - if (remove_pred(p, f, n, succ)) { - tb_pass_mark(p, succ); - tb_pass_mark_users(p, succ); - - br->succ_count -= 1; - } else { - i += 1; + while (i < n->input_count) { + if (n->inputs[i]->type == TB_DEAD) { + changes = true; + remove_input(p, f, n, i); + + // update PHIs + for (User* use = n->users; use; use = use->next) { + if (use->n->type == TB_PHI && use->slot == 0) { + remove_input(p, f, use->n, i + 1); + } + } + continue; } - } - - assert(br->succ_count == 0); - } - - return NULL; -} - -static void transmute_goto(TB_Passes* restrict opt, TB_Function* f, TB_Node* br, TB_Node* dst) { - assert(br->type == TB_BRANCH && dst->input_count >= 1); - // convert to unconditional branch - set_input(opt, br, NULL, 1); - br->input_count = 1; - - // remove predecessor from other branches - TB_Node* bb = unsafe_get_region(br); - TB_NodeBranch* br_info = TB_NODE_GET_EXTRA(br); - - size_t i = 0; - while (i < br_info->succ_count) { - if (br_info->succ[i] != dst) { - if (remove_pred(opt, f, bb, br_info->succ[i])) { - br_info->succ_count -= 1; - } - } else { i += 1; } + + return changes ? n : NULL; } - assert(br_info->succ[0] == dst); - // we need to mark the changes to that jump - // threading can clean it up - tb_pass_mark(opt, bb); - tb_pass_mark(opt, dst); - tb_pass_mark_users(opt, bb); + return NULL; } static TB_Node* ideal_phi(TB_Passes* restrict opt, TB_Function* f, TB_Node* n) { @@ -90,6 +57,10 @@ static TB_Node* ideal_phi(TB_Passes* restrict opt, TB_Function* f, TB_Node* n) { } // if branch, both paths are empty => select(cond, t, f) + // + // TODO(NeGate): we can make this diamond trick work for bigger + // branches, we should support a lookup instruction similar to + // "switch" logic for data. TB_DataType dt = n->dt; TB_Node* region = n->inputs[0]; if (region->input_count == 2) { @@ -101,47 +72,51 @@ static TB_Node* ideal_phi(TB_Passes* restrict opt, TB_Function* f, TB_Node* n) { } } - // guarentee paths are effectless - if (!is_empty_bb(opt, region->inputs[0]->inputs[0])) { return NULL; } - if (!is_empty_bb(opt, region->inputs[1]->inputs[0])) { return NULL; } - - // these don't have directions, i just need names - TB_Node* left = region->inputs[0]->inputs[0]->inputs[0]; - TB_Node* right = region->inputs[1]->inputs[0]->inputs[0]; - - // is it a proper if-diamond? 
- if (left->input_count == 1 && right->input_count == 1 && - left->inputs[0]->type == TB_PROJ && - left->inputs[0]->type == TB_PROJ && - left->inputs[0]->inputs[0]->type == TB_BRANCH && - left->inputs[0]->inputs[0] == right->inputs[0]->inputs[0]) { - TB_Node* branch = left->inputs[0]->inputs[0]; + // guarentee paths are effectless (there's only one data phi and no control nodes) + // + // If + // / \ + // CProjT CProjF Region[0][0] == Region[1][0] + // \ / + // Region + // + TB_Node* left = region->inputs[0]; + TB_Node* right = region->inputs[1]; + if (left->inputs[0]->type == TB_BRANCH && left->inputs[0] == right->inputs[0]) { + TB_Node* branch = left->inputs[0]; TB_NodeBranch* header_br = TB_NODE_GET_EXTRA(branch); if (header_br->succ_count == 2) { - assert(left->inputs[0]->inputs[0]->input_count == 2); - TB_Node* cond = branch->inputs[1]; - TB_Node* left_v = n->inputs[1]; - TB_Node* right_v = n->inputs[2]; + assert(branch->input_count == 2); + + TB_Node *values[2]; + for (User* u = branch->users; u; u = u->next) { + TB_Node* proj = u->n; + if (proj->type == TB_PROJ) { + int index = TB_NODE_GET_EXTRA_T(proj, TB_NodeProj)->index; + // the projection needs to exclusively refer to the region, + // if not we can't elide those effects here. + if (proj->users->next != NULL || proj->users->n != region) { + return NULL; + } + + int phi_i = proj->users->slot; + assert(phi_i + 1 < n->input_count); + values[index] = n->inputs[1 + phi_i]; + } + } - bool right_false = header_br->succ[0] == right; uint64_t falsey = TB_NODE_GET_EXTRA_T(branch, TB_NodeBranch)->keys[0]; + TB_Node* cond = branch->inputs[1]; // TODO(NeGate): handle non-zero falseys if (falsey == 0) { - // kill both successors, since they were unique we can properly murder em' - tb_pass_kill_node(opt, left->inputs[0]); - tb_pass_kill_node(opt, left); - tb_pass_kill_node(opt, right->inputs[0]); - tb_pass_kill_node(opt, right); - // header -> merge { TB_Node* parent = branch->inputs[0]; tb_pass_kill_node(opt, branch); - - TB_NodeRegion* header = TB_NODE_GET_EXTRA(unsafe_get_region(parent)); - header->end = TB_NODE_GET_EXTRA_T(region, TB_NodeRegion)->end; + tb_pass_kill_node(opt, left); + tb_pass_kill_node(opt, right); // attach the header and merge to each other tb_pass_mark(opt, parent); @@ -151,8 +126,8 @@ static TB_Node* ideal_phi(TB_Passes* restrict opt, TB_Function* f, TB_Node* n) { TB_Node* selector = tb_alloc_node(f, TB_SELECT, dt, 4, 0); set_input(opt, selector, cond, 1); - set_input(opt, selector, left_v, 2 + right_false); - set_input(opt, selector, right_v, 2 + !right_false); + set_input(opt, selector, values[0], 2); + set_input(opt, selector, values[1], 3); return selector; } } @@ -174,7 +149,7 @@ static TB_Node* ideal_branch(TB_Passes* restrict opt, TB_Function* f, TB_Node* n // if (a && b) A else B => if (a ? 
b : 0) A else B // // TODO(NeGate): implement form which works on an arbitrary falsey - if (n->inputs[0]->type == TB_REGION && n->inputs[0]->input_count == 2 && is_empty_bb(opt, n)) { + /*if (n->inputs[0]->type == TB_REGION && n->inputs[0]->input_count == 2 && is_empty_bb(opt, n)) { TB_Node* bb = n->inputs[0]; uint64_t falsey = br->keys[0]; @@ -227,7 +202,7 @@ static TB_Node* ideal_branch(TB_Passes* restrict opt, TB_Function* f, TB_Node* n return n; } } - } + }*/ // br ((y <= x)) => br (x < y) flipped conditions if (cmp_type == TB_CMP_SLE || cmp_type == TB_CMP_ULE) { @@ -236,7 +211,12 @@ static TB_Node* ideal_branch(TB_Passes* restrict opt, TB_Function* f, TB_Node* n set_input(opt, new_cmp, cmp_node->inputs[1], 2); TB_NODE_SET_EXTRA(new_cmp, TB_NodeCompare, .cmp_dt = TB_NODE_GET_EXTRA_T(cmp_node, TB_NodeCompare)->cmp_dt); - SWAP(TB_Node*, br->succ[0], br->succ[1]); + // flip + for (User* u = n->users; u; u = u->next) { + TB_NodeProj* p = TB_NODE_GET_EXTRA(u->n); + p->index = !p->index; + } + set_input(opt, n, new_cmp, 1); tb_pass_mark(opt, new_cmp); return n; @@ -250,13 +230,17 @@ static TB_Node* ideal_branch(TB_Passes* restrict opt, TB_Function* f, TB_Node* n // flip successors if (cmp_type == TB_CMP_EQ) { - SWAP(TB_Node*, br->succ[0], br->succ[1]); + for (User* u = n->users; u; u = u->next) { + TB_NodeProj* p = TB_NODE_GET_EXTRA(u->n); + p->index = !p->index; + } } + return n; } // check if we're dominated by a branch that already checked it - TB_Node* bb = unsafe_get_region(n->inputs[0]); + /*TB_Node* bb = get_block_begin(n->inputs[0]); for (User* u = find_users(opt, cmp_node); u; u = u->next) { if (u->n != n && u->slot == 1 && u->n->type == TB_BRANCH) { TB_NodeBranch* dom_branch = TB_NODE_GET_EXTRA(u->n); @@ -266,7 +250,7 @@ static TB_Node* ideal_branch(TB_Passes* restrict opt, TB_Function* f, TB_Node* n ptrdiff_t match = -1; FOREACH_N(i, 0, dom_branch->succ_count) { TB_Node* target = dom_branch->succ[i]; - if (tb_is_dominated_by(target, bb)) { + if (tb_is_dominated_by(opt->cfg, target, bb)) { match = i; break; } @@ -278,63 +262,69 @@ static TB_Node* ideal_branch(TB_Passes* restrict opt, TB_Function* f, TB_Node* n } } } - } + }*/ } } // constant fold branch - /*if (n->input_count == 2) { - uint64_t key; - if (get_int_const(n->inputs[1], &key)) { + if (n->input_count == 2) { + Lattice* key = lattice_universe_get(&opt->universe, n->inputs[1]); + + // we can walk the dominator tree to see if the condition is already + // been checked. 
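
Aside: this constant fold asks the lattice for the branch key and, when it is a single integer value (the min == max check just below), scans the case keys to pick the one taken successor, with index 0 reserved for the default edge; every other projection is then subsumed by a DEAD node and the degenerate phis behind it are collapsed. A stand-alone sketch of just the taken-edge selection, with a hypothetical toy_fold_branch standing in for the real node/lattice machinery:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

// A branch with `succ_count` successors stores `succ_count - 1` case keys;
// successor 0 is the default. Given a known-constant key, return the index of
// the single successor that can actually be taken.
static size_t toy_fold_branch(int64_t key_const, const int64_t* keys, size_t succ_count) {
    size_t taken = 0; // default edge unless a case key matches
    for (size_t i = 0; i < succ_count - 1; i++) {
        if (key_const == keys[i]) {
            taken = i + 1;
            break;
        }
    }
    return taken;
}

int main(void) {
    int64_t keys[] = { 10, 20, 30 };                                     // default + 3 cases
    printf("key=20  -> taken succ %zu\n", toy_fold_branch(20, keys, 4)); // 2
    printf("key=999 -> taken succ %zu\n", toy_fold_branch(999, keys, 4)); // 0 (default)
    return 0;
}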
+ + if (key->tag == LATTICE_INT && key->_int.min == key->_int.max) { + int64_t key_const = key->_int.max; + size_t taken = 0; FOREACH_N(i, 0, br->succ_count - 1) { - uint64_t case_key = br->keys[i]; - if (key == case_key) { taken = i + 1; break; } + int64_t case_key = br->keys[i]; + if (key_const == case_key) { + taken = i + 1; + break; + } } - TB_Node* dead = make_dead(f, opt); + TB_Node* dead = make_dead_node(f, opt); // convert dead projections into DEAD and convert live projection into index 0 - for (User* use = find_users(opt, n); use; use = use->next) { - if (use->n->type == TB_PROJ) { - int index = TB_NODE_GET_EXTRA_T(use->n, TB_NodeProj)->index; + for (User* u = n->users; u; u = u->next) { + TB_Node* proj = u->n; + if (proj->type == TB_PROJ) { + int index = TB_NODE_GET_EXTRA_T(proj, TB_NodeProj)->index; if (index != taken) { - subsume_node(opt, f, use->n, dead); + subsume_node(opt, f, proj, dead); } else { - TB_NODE_GET_EXTRA_T(use->n, TB_NodeProj)->index = 0; - - User* proj_use = find_users(opt, use->n); - assert(proj_use->next == NULL && "control projection has conflicts?"); - assert(proj_use->n->type == TB_REGION); + TB_NODE_GET_EXTRA_T(proj, TB_NodeProj)->index = 0; + + // if we folded away from a region, then we should subsume + // the degen phis. + assert(proj->users->next == NULL); + TB_Node* succ = proj->users->n; + if (succ->type == TB_REGION) { + int phi_i = proj->users->slot; + + User* u = succ->users; + while (u != NULL) { + User* next = u->next; + if (u->n->type == TB_PHI) { + tb_pass_mark_users(opt, u->n); + subsume_node(opt, f, u->n, u->n->inputs[phi_i + 1]); + } + u = next; + } + } - br->succ_count = 1; - br->succ[0] = proj_use->n; + tb_pass_kill_node(opt, proj); + set_input(opt, succ, n->inputs[0], 0); } } } - assert(br->succ_count == 1); // remove condition - set_input(opt, n, NULL, 1); - n->input_count = 1; - return n; + return dead; } } - // check if it's a dead region - TB_Node* parent = unsafe_get_region(n); - if (parent->input_count == 0 && br->succ_count != 0) { - // remove predecessor from successors - TB_Node* dead = make_dead(f, opt); - for (User* use = find_users(opt, n); use; use = use->next) { - if (use->n->type == TB_PROJ) { - subsume_node(opt, f, use->n, dead); - } - } - - br->succ_count = 0; - return n; - }*/ - return NULL; } diff --git a/tb/src/opt/cfg.h b/tb/src/opt/cfg.h index 9d58401a..2cd0201a 100644 --- a/tb/src/opt/cfg.h +++ b/tb/src/opt/cfg.h @@ -1,79 +1,159 @@ -typedef struct { - TB_Function* f; - - size_t block_count; - TB_Node** blocks; -} DomContext; - -// we'll be walking backwards from the end node -static void postorder(Worklist* restrict ws, TB_Node* n) { - if (!worklist_test_n_set(ws, n)) { - // walk control edges (aka predecessors) - TB_NodeRegion* r = TB_NODE_GET_EXTRA(n); - if (r->end->type == TB_BRANCH) { - TB_NodeBranch* br = TB_NODE_GET_EXTRA(r->end); - FOREACH_REVERSE_N(i, 0, br->succ_count) { - postorder(ws, br->succ[i]); - } - } +void tb_free_cfg(TB_CFG* cfg) { + nl_map_for(i, cfg->node_to_block) { + nl_hashset_free(cfg->node_to_block[i].v.items); + } + nl_map_free(cfg->node_to_block); +} + +TB_CFG tb_compute_rpo(TB_Function* f, TB_Passes* p) { + return tb_compute_rpo2(f, &p->worklist, &p->stack); +} + +static TB_Node* next_control(Worklist* ws, TB_Node* n) { + // unless it's a branch (aka a terminator), it'll have one successor + TB_Node* next = NULL; + for (User* u = n->users; u; u = u->next) { + TB_Node* succ = u->n; + + // we can't treat regions in the chain + if (succ->type == TB_REGION) break; - 
dyn_array_put(ws->items, n); + // we've found the next step in control flow + if (cfg_is_control(succ) && !worklist_test_n_set(ws, succ)) { + return succ; + } } + + return NULL; } -size_t tb_push_postorder(TB_Function* f, Worklist* restrict ws) { +TB_CFG tb_compute_rpo2(TB_Function* f, Worklist* ws, DynArray(TB_Node*)* tmp_stack) { assert(dyn_array_length(ws->items) == 0); - postorder(ws, f->start_node); - return dyn_array_length(ws->items); + + TB_CFG cfg = { 0 }; + DynArray(TB_Node*) stack = *tmp_stack; + if (stack == NULL) { + stack = dyn_array_create(TB_Node*, 1024); + } + + dyn_array_put(stack, f->start_node); + worklist_test_n_set(ws, f->start_node); + + // depth-first search + int order = 0; + while (dyn_array_length(stack)) { + TB_Node* n = dyn_array_pop(stack); + + // we've spotted a BB entry + if (cfg_is_bb_entry(n)) { + // proj BB's will prefer to be REGION BB's + if (n->inputs[0]->type != TB_START && n->type == TB_PROJ && n->users->n->type == TB_REGION) { + // we've already seen this BB, let's skip it + if (worklist_test_n_set(ws, n->users->n)) { + continue; + } + + n = n->users->n; + } + + // walk until terminator + TB_Node* entry = n; + TB_BasicBlock bb = { .id = cfg.block_count++ }; + while (!cfg_is_terminator(n)) { + TB_Node* next = next_control(ws, n); + if (next == NULL) { + break; + } + n = next; + } + + // the start node always has it's dom depth filled + if (bb.id == 0) { + bb.dom = entry; + bb.dom_depth = 0; + } else { + bb.dom_depth = -1; + } + + bb.end = n; + dyn_array_put(ws->items, entry); + nl_map_put(cfg.node_to_block, entry, bb); + } + + // add successors (could be multi-way like a branch) + if (n->type == TB_BRANCH) { + size_t succ_count = TB_NODE_GET_EXTRA_T(n, TB_NodeBranch)->succ_count; + + dyn_array_put_uninit(stack, succ_count); + TB_Node** top = &stack[dyn_array_length(stack) - 1]; + + for (User* u = n->users; u; u = u->next) { + TB_Node* succ = u->n; + if (cfg_is_control(succ) && !worklist_test_n_set(ws, succ)) { + assert(succ->type == TB_PROJ); + int index = TB_NODE_GET_EXTRA_T(succ, TB_NodeProj)->index; + top[-index] = succ; + } + } + } else { + for (User* u = n->users; u; u = u->next) { + TB_Node* succ = u->n; + if (cfg_is_control(succ) && !worklist_test_n_set(ws, succ)) { + dyn_array_put(stack, succ); + } + } + } + } + + *tmp_stack = stack; + return cfg; } -static int find_traversal_index(TB_Node* n) { - assert(n->type == TB_REGION || n->type == TB_START); - assert(TB_NODE_GET_EXTRA_T(n, TB_NodeRegion)->postorder_id >= 0); - return TB_NODE_GET_EXTRA_T(n, TB_NodeRegion)->postorder_id; +static int find_traversal_index(TB_CFG* cfg, TB_Node* n) { + return nl_map_get_checked(cfg->node_to_block, n).id; } -static int try_find_traversal_index(TB_Node* n) { - assert(n->type == TB_REGION || n->type == TB_START); - return TB_NODE_GET_EXTRA_T(n, TB_NodeRegion)->postorder_id; +static int try_find_traversal_index(TB_CFG* cfg, TB_Node* n) { + ptrdiff_t search = nl_map_get(cfg->node_to_block, n); + return search >= 0 ? 
cfg->node_to_block[search].v.id : -1; } -static int resolve_dom_depth(TB_Node* bb) { - if (dom_depth(bb) >= 0) { - return dom_depth(bb); +static int resolve_dom_depth(TB_CFG* cfg, TB_Node* bb) { + if (dom_depth(cfg, bb) >= 0) { + return dom_depth(cfg, bb); } - int parent = resolve_dom_depth(idom(bb)); + int parent = resolve_dom_depth(cfg, idom(cfg, bb)); // it's one more than it's parent - TB_NODE_GET_EXTRA_T(bb, TB_NodeRegion)->dom_depth = parent + 1; + nl_map_get_checked(cfg->node_to_block, bb).dom_depth = parent + 1; return parent + 1; } -TB_DominanceFrontiers* tb_get_dominance_frontiers(TB_Function* f, size_t count, TB_Node** blocks) { - size_t stride = (count + 63) / 64; - size_t elems = stride * count; +TB_DominanceFrontiers* tb_get_dominance_frontiers(TB_Function* f, TB_Passes* restrict p, TB_CFG cfg, TB_Node** blocks) { + size_t stride = (cfg.block_count + 63) / 64; + size_t elems = stride * cfg.block_count; size_t size = sizeof(TB_DominanceFrontiers) + sizeof(uint64_t)*elems; TB_DominanceFrontiers* df = tb_platform_heap_alloc(size); memset(df, 0, size); df->stride = stride; - FOREACH_REVERSE_N(i, 0, count) { + FOREACH_N(i, 0, cfg.block_count) { TB_Node* bb = blocks[i]; - assert(TB_NODE_GET_EXTRA_T(bb, TB_NodeRegion)->postorder_id == i); + assert(find_traversal_index(&cfg, bb) == i); - if (bb->input_count >= 2) { + if (bb->type == TB_REGION && bb->input_count >= 2) { FOREACH_N(k, 0, bb->input_count) { - TB_Node* runner = unsafe_get_region(bb->inputs[k]); + TB_Node* runner = get_pred(bb, k); - while (runner->input_count > 0 && runner != idom(bb)) { + while (!(runner->type == TB_PROJ && runner->inputs[0]->type == TB_START) && runner != idom(&cfg, bb)) { // add to frontier set - TB_NodeRegion* r = TB_NODE_GET_EXTRA(runner); - tb_dommy_fronts_put(df, r->postorder_id, i); + int id = nl_map_get_checked(cfg.node_to_block, runner).id; + tb_dommy_fronts_put(df, id, i); - runner = idom(runner); + runner = idom(&cfg, runner); } } } @@ -87,68 +167,58 @@ TB_API void tb_free_dominance_frontiers(TB_DominanceFrontiers* df) { } // https://www.cs.rice.edu/~keith/EMBED/dom.pdf -void tb_compute_dominators(TB_Function* f, size_t count, TB_Node** blocks) { - DomContext ctx = { .f = f, .block_count = count, .blocks = blocks }; - - FOREACH_N(i, 0, count) { - TB_NodeRegion* r = TB_NODE_GET_EXTRA(blocks[i]); - r->dom_depth = -1; // unresolved - r->dom = NULL; - r->postorder_id = i; - } - - // entry dominates itself - TB_NodeRegion* r = TB_NODE_GET_EXTRA(f->start_node); - r->dom_depth = 0; - r->dom = f->start_node; - - // identify post order traversal order - int entry_dom = ctx.block_count - 1; +void tb_compute_dominators(TB_Function* f, TB_Passes* restrict p, TB_CFG cfg) { + tb_compute_dominators2(f, &p->worklist, cfg); +} +void tb_compute_dominators2(TB_Function* f, Worklist* ws, TB_CFG cfg) { + TB_Node** blocks = ws->items; bool changed = true; while (changed) { changed = false; // for all nodes, b, in reverse postorder (except start node) - FOREACH_REVERSE_N(i, 0, count - 1) { + FOREACH_REVERSE_N(i, 1, cfg.block_count) { TB_Node* b = blocks[i]; - TB_Node* new_idom = unsafe_get_region(b->inputs[0]); - - // for all other predecessors, p, of b - FOREACH_N(j, 1, b->input_count) { - TB_Node* p = unsafe_get_region(b->inputs[j]); - - // if doms[p] already calculated - TB_Node* idom_p = TB_NODE_GET_EXTRA_T(p, TB_NodeRegion)->dom; - if (idom_p == NULL && p->input_count > 0) { - int a = try_find_traversal_index(p); - if (a >= 0) { - int b = find_traversal_index(new_idom); - while (a != b) { - // while (finger1 < 
finger2) - // finger1 = doms[finger1] - while (a < b) { - TB_Node* d = idom(blocks[a]); - a = d ? find_traversal_index(d) : entry_dom; + TB_Node* new_idom = get_pred(b, 0); + + if (b->type == TB_REGION) { + // for all other predecessors, p, of b + FOREACH_N(j, 1, b->input_count) { + TB_Node* p = get_pred(b, j); + + // if doms[p] already calculated + TB_Node* idom_p = idom(&cfg, p); + if (idom_p == NULL && p->input_count > 0) { + int a = try_find_traversal_index(&cfg, p); + if (a >= 0) { + int b = find_traversal_index(&cfg, new_idom); + while (a != b) { + // while (finger1 > finger2) + // finger1 = doms[finger1] + while (a > b) { + TB_Node* d = idom(&cfg, blocks[a]); + a = d ? find_traversal_index(&cfg, d) : 0; + } + + // while (finger2 > finger1) + // finger2 = doms[finger2] + while (b > a) { + TB_Node* d = idom(&cfg, blocks[b]); + b = d ? find_traversal_index(&cfg, d) : 0; + } } - // while (finger2 < finger1) - // finger2 = doms[finger2] - while (b < a) { - TB_Node* d = idom(blocks[b]); - b = d ? find_traversal_index(d) : entry_dom; - } + new_idom = blocks[a]; } - - new_idom = blocks[a]; } } } assert(new_idom != NULL); - TB_NodeRegion* region_b = TB_NODE_GET_EXTRA_T(b, TB_NodeRegion); - if (region_b->dom != new_idom) { - region_b->dom = new_idom; + TB_Node** dom_ptr = &nl_map_get_checked(cfg.node_to_block, b).dom; + if (*dom_ptr != new_idom) { + *dom_ptr = new_idom; changed = true; } } @@ -156,8 +226,8 @@ void tb_compute_dominators(TB_Function* f, size_t count, TB_Node** blocks) { // generate depth values CUIK_TIMED_BLOCK("generate dom tree") { - FOREACH_N(i, 0, count - 1) { - resolve_dom_depth(blocks[i]); + FOREACH_REVERSE_N(i, 1, cfg.block_count) { + resolve_dom_depth(&cfg, blocks[i]); } } } @@ -171,9 +241,9 @@ TB_Node* tb_get_parent_region(TB_Node* n) { return n; } -bool tb_is_dominated_by(TB_Node* expected_dom, TB_Node* bb) { +bool tb_is_dominated_by(TB_CFG cfg, TB_Node* expected_dom, TB_Node* bb) { while (expected_dom != bb) { - TB_Node* new_bb = idom(bb); + TB_Node* new_bb = idom(&cfg, bb); if (bb == new_bb) { return false; } diff --git a/tb/src/opt/fold.h b/tb/src/opt/fold.h index fa2dd27e..0e645720 100644 --- a/tb/src/opt/fold.h +++ b/tb/src/opt/fold.h @@ -49,26 +49,16 @@ static bool get_int_const(TB_Node* n, uint64_t* imm) { //////////////////////////////// // Integer idealizations //////////////////////////////// -static TB_Node* ideal_truncate(TB_Passes* restrict opt, TB_Function* f, TB_Node* n) { +static TB_Node* ideal_bitcast(TB_Passes* restrict opt, TB_Function* f, TB_Node* n) { TB_Node* src = n->inputs[1]; - if (src->type != TB_INTEGER_CONST || n->dt.type != TB_INT) { - return NULL; - } - TB_NodeInt* src_i = TB_NODE_GET_EXTRA(src); - - uint64_t mask = n->dt.data == 64 ? 
UINT64_MAX : (1ull << n->dt.data) - 1; - return make_int_node(f, opt, n->dt, src_i->value & mask); -} - -static TB_Node* ideal_int2ptr(TB_Passes* restrict opt, TB_Function* f, TB_Node* n) { - TB_Node* src = n->inputs[1]; - if (src->type != TB_INTEGER_CONST) { - return NULL; + // int -> smaller int means truncate + if (src->dt.type == TB_INT && n->dt.type == TB_INT && src->dt.data > n->dt.data) { + n->type = TB_TRUNCATE; + return n; } - TB_NodeInt* src_i = TB_NODE_GET_EXTRA(src); - return make_int_node(f, opt, n->dt, src_i->value); + return NULL; } // cmp.slt(a, 0) => is_sign(a) @@ -100,6 +90,279 @@ static bool inverted_cmp(TB_Node* n, TB_Node* n2) { } } +static Lattice* dataflow_sext(TB_Passes* restrict opt, LatticeUniverse* uni, TB_Node* n) { + Lattice* a = lattice_universe_get(uni, n->inputs[1]); + int old_bits = n->inputs[1]->dt.data; + + int64_t min = tb__sxt(a->_int.min, old_bits, n->dt.data); + int64_t max = tb__sxt(a->_int.max, old_bits, n->dt.data); + uint64_t zeros = a->_int.known_zeros; + uint64_t ones = a->_int.known_ones; + + // if we know the sign bit then we can know what the extended bits look like + uint64_t mask = tb__mask(n->dt.data) & ~tb__mask(old_bits); + if (zeros >> (old_bits - 1)) { + zeros |= mask; + } else if (ones >> (old_bits - 1)) { + ones |= mask; + } + + return lattice_intern(uni, (Lattice){ LATTICE_INT, ._int = { min, max, zeros, ones } }); +} + +static Lattice* dataflow_zext(TB_Passes* restrict opt, LatticeUniverse* uni, TB_Node* n) { + Lattice* a = lattice_universe_get(uni, n->inputs[1]); + uint64_t mask = tb__mask(n->dt.data) & ~tb__mask(n->inputs[1]->dt.data); + + int64_t min = a->_int.min; + int64_t max = a->_int.max; + uint64_t zeros = a->_int.known_zeros | mask; // we know the top bits must be zero + uint64_t ones = a->_int.known_ones; + + return lattice_intern(uni, (Lattice){ LATTICE_INT, ._int = { min, max, zeros, ones } }); +} + +static Lattice* dataflow_trunc(TB_Passes* restrict opt, LatticeUniverse* uni, TB_Node* n) { + Lattice* a = lattice_universe_get(uni, n->inputs[1]); + + int64_t mask = tb__mask(n->dt.data); + int64_t min = a->_int.min & mask; + int64_t max = a->_int.max & mask; + if (min > max) { + min = lattice_int_min(n->dt.data); + max = lattice_int_max(n->dt.data); + } + + uint64_t zeros = a->_int.known_zeros | ~mask; + uint64_t ones = a->_int.known_ones & mask; + return lattice_intern(uni, (Lattice){ LATTICE_INT, ._int = { min, max, zeros, ones } }); +} + +static int64_t wrapped_int_add(int64_t x, int64_t y) { return (uint64_t)x + (uint64_t)y; } +static int64_t wrapped_int_sub(int64_t x, int64_t y) { return (uint64_t)x - (uint64_t)y; } +static int64_t wrapped_int_mul(int64_t x, int64_t y) { return (uint64_t)x * (uint64_t)y; } +static bool wrapped_int_lt(int64_t x, int64_t y, int bits) { return (int64_t)tb__sxt(x, bits, 64) < (int64_t)tb__sxt(y, bits, 64); } + +static bool sub_overflow(uint64_t x, uint64_t y, uint64_t xy, int bits) { + uint64_t v = (x ^ y) & (xy ^ x); + // check the sign bit + return (v >> (bits - 1)) & 1; +} + +static Lattice* dataflow_arith(TB_Passes* restrict opt, LatticeUniverse* uni, TB_Node* n) { + Lattice* a = lattice_universe_get(uni, n->inputs[1]); + Lattice* b = lattice_universe_get(uni, n->inputs[2]); + assert(a->tag == LATTICE_INT && b->tag == LATTICE_INT); + + int64_t mask = tb__mask(n->dt.data); + int64_t min, max; + switch (n->type) { + case TB_ADD: + min = wrapped_int_add(a->_int.min, b->_int.min); + max = wrapped_int_add(a->_int.max, b->_int.max); + break; + + case TB_SUB: + min = 
wrapped_int_sub(a->_int.min, b->_int.min); + max = wrapped_int_sub(a->_int.max, b->_int.max); + break; + + case TB_MUL: + min = wrapped_int_mul(a->_int.min, b->_int.min); + max = wrapped_int_mul(a->_int.max, b->_int.max); + break; + } + + // truncate to the size of the raw DataType + min &= mask, max &= mask; + + if (!lattice_is_const_int(a) || !lattice_is_const_int(b)) { + // if we overflow, default to the full range + if (n->type == TB_SUB) { + // subtraction does overflow check different from add or mul + if (sub_overflow(a->_int.min, b->_int.min, min, n->dt.data) || + sub_overflow(a->_int.max, b->_int.max, max, n->dt.data) + ) { + min = lattice_int_min(n->dt.data); + max = lattice_int_max(n->dt.data); + } + } else { + if (((a->_int.min & b->_int.min) < 0 && min >= 0) || + (~(a->_int.max | b->_int.max) < 0 && max < 0) || + wrapped_int_lt(max, min, n->dt.data) + ) { + min = lattice_int_min(n->dt.data); + max = lattice_int_max(n->dt.data); + } + } + } + + return lattice_intern(uni, (Lattice){ LATTICE_INT, ._int = { min, max } }); +} + +static Lattice* dataflow_int2ptr(TB_Passes* restrict opt, LatticeUniverse* uni, TB_Node* n) { + Lattice* a = lattice_universe_get(uni, n->inputs[1]); + assert(a->tag == LATTICE_INT); + + if (a->_int.min == a->_int.max) { + // int2ptr with a constant leads to fun cool stuff (usually we get constant + // zeros) + LatticeTrifecta t = a->_int.min ? LATTICE_KNOWN_NOT_NULL : LATTICE_KNOWN_NULL; + return lattice_intern(uni, (Lattice){ LATTICE_POINTER, ._ptr = { t } }); + } + + return NULL; +} + +static Lattice* dataflow_unary(TB_Passes* restrict opt, LatticeUniverse* uni, TB_Node* n) { + Lattice* a = lattice_universe_get(uni, n->inputs[1]); + if (a->tag == LATTICE_INT) { + uint64_t mask = tb__mask(n->dt.data); + uint64_t min = ~a->_int.min & mask; + uint64_t max = ~a->_int.max & mask; + + if ((int64_t)min > (int64_t)max) { + SWAP(int64_t, min, max); + } + + uint64_t zeros = 0, ones = 0; + if (n->type == TB_NEG) { + // -x => ~x + 1 + // because of this addition we can technically + // overflow... umm? glhf? 
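+                // e.g. with an 8-bit mask, ~x = [0xFF, 0xFF] (x == 0) increments to 0x00
+                // on both bounds, the wrap check below trips, and we fall back to the
+                // full range for the width.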
+ uint64_t min_inc = (min+1) & mask; + uint64_t max_inc = (max+1) & mask; + + if (min_inc < min || max_inc < min) { + min = lattice_int_min(n->dt.data); + max = lattice_int_max(n->dt.data); + } else { + min = min_inc; + max = max_inc; + } + } else { + zeros = ~a->_int.known_zeros; + ones = ~a->_int.known_ones; + } + + return lattice_intern(uni, (Lattice){ LATTICE_INT, ._int = { min, max, zeros, ones } }); + } else { + return NULL; + } +} + +static Lattice* dataflow_bits(TB_Passes* restrict opt, LatticeUniverse* uni, TB_Node* n) { + Lattice* a = lattice_universe_get(uni, n->inputs[1]); + Lattice* b = lattice_universe_get(uni, n->inputs[2]); + + uint64_t zeros, ones; + switch (n->type) { + case TB_AND: + // 0 if either is zero, 1 if both are 1 + zeros = a->_int.known_zeros | b->_int.known_zeros; + ones = a->_int.known_ones & b->_int.known_ones; + break; + + case TB_OR: + // 0 if both are 0, 1 if either is 1 + zeros = a->_int.known_zeros & b->_int.known_zeros; + ones = a->_int.known_ones | b->_int.known_ones; + break; + + case TB_XOR: + // 0 if both bits are known to be equal + // 1 if both bits are known to differ + zeros = (a->_int.known_zeros & b->_int.known_zeros) | (a->_int.known_ones & b->_int.known_ones); + ones = (a->_int.known_zeros & b->_int.known_ones) | (a->_int.known_ones & b->_int.known_zeros); + break; + + default: tb_todo(); + } + + uint64_t mask = tb__mask(n->dt.data); + zeros &= mask, ones &= mask; + + // we can deduce a min and max by assuming the unknown bits are either zeros or ones + int64_t min = ones, max = ~zeros; + if (wrapped_int_lt(max, min, n->dt.data)) { + min = lattice_int_min(n->dt.data); + max = lattice_int_max(n->dt.data); + } + min &= mask, max &= mask; + + return lattice_intern(uni, (Lattice){ LATTICE_INT, ._int = { min, max, zeros, ones } }); +} + +static Lattice* dataflow_shift(TB_Passes* restrict opt, LatticeUniverse* uni, TB_Node* n) { + Lattice* a = lattice_universe_get(uni, n->inputs[1]); + Lattice* b = lattice_universe_get(uni, n->inputs[2]); + + uint64_t bits = n->dt.data; + uint64_t mask = tb__mask(n->dt.data); + + // shift that's in-bounds can tell us quite a few nice details + if (b->_int.max <= bits) { + uint64_t min, max, zeros, ones = 0; + switch (n->type) { + case TB_SHL: + min = a->_int.min << b->_int.min; + max = a->_int.max << b->_int.max; + min &= mask, max &= mask; + + if (((a->_int.min & b->_int.min) < 0 && min >= 0) || + (~(a->_int.max | b->_int.max) < 0 && max < 0) || + wrapped_int_lt(max, min, n->dt.data) + ) { + min = lattice_int_min(n->dt.data); + max = lattice_int_max(n->dt.data); + } + + // we at least shifted this many bits therefore we + // at least have this many zeros at the bottom + zeros = (1ull << b->_int.min) - 1ull; + // if we know how many bits we shifted then we know where + // our known ones went + if (b->_int.min == b->_int.max) { + ones <<= b->_int.min; + } + break; + + case TB_SHR: + // perform shift logic as unsigned + min = a->_int.min; + max = a->_int.max; + if (min > max) { + min = 0, max = mask; + } + + // the largest value is caused by the lowest shift amount + min >>= b->_int.max; + max >>= b->_int.min; + + // convert range back into signed + if (wrapped_int_lt(max, min, n->dt.data)) { + min = lattice_int_min(n->dt.data); + max = lattice_int_max(n->dt.data); + } + + // TODO(NeGate): we can technically guarantee the top bits are zero + zeros = 0; + // if we know how many bits we shifted then we know where + // our known ones went + if (b->_int.min == b->_int.max) { + ones >>= b->_int.min; + } + break; + + default:
tb_todo(); + } + + return lattice_intern(uni, (Lattice){ LATTICE_INT, ._int = { min, max, zeros, ones } }); + } else { + return NULL; + } +} + static TB_Node* ideal_select(TB_Passes* restrict opt, TB_Function* f, TB_Node* n) { TB_Node* src = n->inputs[1]; @@ -214,21 +477,18 @@ static TB_Node* identity_extension(TB_Passes* restrict opt, TB_Function* f, TB_N } } -static TB_Node* ideal_int_unary(TB_Passes* restrict opt, TB_Function* f, TB_Node* n) { - assert(n->type == TB_NOT || n->type == TB_NEG); - TB_Node* src = n->inputs[1]; - if (src->type == TB_INTEGER_CONST) { - assert(src->dt.type == TB_INT && src->dt.data > 0); - uint64_t src_i = ~TB_NODE_GET_EXTRA_T(src, TB_NodeInt)->value; +static int node_pos(TB_Node* n) { + switch (n->type) { + case TB_PHI: + return 1; - if (n->type == TB_NEG) { - // -x => ~x + 1 - src_i += 1; - } + case TB_INTEGER_CONST: + case TB_FLOAT32_CONST: + case TB_FLOAT64_CONST: + return 2; - return make_int_node(f, opt, n->dt, src_i); - } else { - return NULL; + default: + return 3; } } @@ -236,8 +496,7 @@ static TB_Node* ideal_int_binop(TB_Passes* restrict opt, TB_Function* f, TB_Node TB_NodeTypeEnum type = n->type; if (is_commutative(type)) { // if it's commutative: we wanna have a canonical form. - // lower types to the right (constants are basically the lowest things) - if (n->inputs[1]->type < n->inputs[2]->type) { + if (node_pos(n->inputs[1]) > node_pos(n->inputs[2])) { TB_Node* tmp = n->inputs[1]; set_input(opt, n, n->inputs[2], 1); set_input(opt, n, tmp, 2); @@ -319,41 +578,7 @@ static TB_Node* ideal_int_binop(TB_Passes* restrict opt, TB_Function* f, TB_Node } } - if (a->type != TB_INTEGER_CONST || b->type != TB_INTEGER_CONST) { - return NULL; - } - - // fully fold - uint64_t ai = TB_NODE_GET_EXTRA_T(a, TB_NodeInt)->value; - uint64_t bi = TB_NODE_GET_EXTRA_T(b, TB_NodeInt)->value; - if (type >= TB_CMP_EQ && type <= TB_CMP_ULE) { - bool result = false; - switch (type) { - case TB_CMP_EQ: result = ai == bi; break; - case TB_CMP_NE: result = ai != bi; break; - case TB_CMP_ULT: result = ai < bi; break; - case TB_CMP_ULE: result = ai <= bi; break; - default: tb_unreachable(); - } - - return make_int_node(f, opt, n->dt, result); - } else if (type >= TB_AND && type <= TB_MUL) { - uint64_t dst; - switch (type) { - case TB_AND: dst = ai & bi; break; - case TB_OR: dst = ai | bi; break; - case TB_XOR: dst = ai ^ bi; break; - case TB_ADD: dst = ai + bi; break; - case TB_SUB: dst = ai - bi; break; - case TB_MUL: dst = ai * bi; break; - default: tb_unreachable(); - } - - // truncate - return make_int_node(f, opt, n->dt, dst & tb__mask(n->dt.data)); - } else { - return NULL; - } + return NULL; } static TB_Node* ideal_int_div(TB_Passes* restrict opt, TB_Function* f, TB_Node* n) { @@ -468,7 +693,7 @@ static TB_Node* identity_int_binop(TB_Passes* restrict opt, TB_Function* f, TB_N case TB_UDIV: case TB_SDIV: - return tb_inst_poison(f); + return make_poison(f, opt, n->dt); // (cmp.ne a 0) => a case TB_CMP_NE: { diff --git a/tb/src/opt/gcm.h b/tb/src/opt/gcm.h index 95dbf2f6..45e43ab9 100644 --- a/tb/src/opt/gcm.h +++ b/tb/src/opt/gcm.h @@ -1,45 +1,50 @@ // Scheduling: "Global Code Motion Global Value Numbering", Cliff Click 1995 // https://courses.cs.washington.edu/courses/cse501/06wi/reading/click-pldi95.pdf +static uint32_t node_hash(void* a) { return ((TB_Node*) a)->gvn; } +static bool node_compare(void* a, void* b) { return a == b; } //////////////////////////////// // Early scheduling //////////////////////////////// -static void schedule_early(TB_Passes* passes, TB_Node* n) 
{ +static void schedule_early(TB_Passes* p, TB_Node* n) { // already visited - if (worklist_test_n_set(&passes->worklist, n)) { + if (worklist_test_n_set(&p->worklist, n)) { return; } - // track leaf nodes - if (n->input_count <= 2) { - dyn_array_put(passes->worklist.items, n); - } + // push node, late scheduling will process this list + dyn_array_put(p->worklist.items, n); // schedule inputs first FOREACH_N(i, 0, n->input_count) if (n->inputs[i]) { - schedule_early(passes, n->inputs[i]); + schedule_early(p, n->inputs[i]); } - if (!is_pinned(n)) { - TB_Node* best = passes->f->start_node; + // schedule unpinned nodes + if (!is_pinned(n) || n->input_count == 0) { + // start at the entry point + TB_BasicBlock* best = nl_map_get_checked(p->scheduled, p->worklist.items[0]); int best_depth = 0; // choose deepest block - FOREACH_N(i, 0, n->input_count) if (n->inputs[i] && n->inputs[i]->inputs[0]) { - TB_Node* bb = unsafe_get_region(n->inputs[i]); + FOREACH_N(i, 0, n->input_count) if (n->inputs[i]) { + ptrdiff_t search = nl_map_get(p->scheduled, n->inputs[i]); + if (search < 0) { + // input has no scheduling... weird? + continue; + } - int bb_depth = TB_NODE_GET_EXTRA_T(bb, TB_NodeRegion)->dom_depth; - if (best_depth < bb_depth) { + TB_BasicBlock* bb = p->scheduled[search].v; + if (best_depth < bb->dom_depth) { + best_depth = bb->dom_depth; best = bb; - best_depth = bb_depth; } } - if (passes->f->start_node == best) { - best = passes->f->params[0]; - } + DO_IF(TB_OPTDEBUG_GCM)(printf("%s: v%u into .bb%d\n", p->f->super.name, n->gvn, best->id)); - set_input(passes, n, best, 0); + nl_hashset_put2(&best->items, n, node_hash, node_compare); + nl_map_put(p->scheduled, n, best); } } @@ -48,109 +53,158 @@ static void schedule_early(TB_Passes* passes, TB_Node* n) { //////////////////////////////// // schedule nodes such that they appear the least common // ancestor to all their users -static TB_Node* find_lca(TB_Node* a, TB_Node* b) { +static TB_BasicBlock* find_lca(TB_Passes* p, TB_BasicBlock* a, TB_BasicBlock* b) { if (a == NULL) return b; // line both up - while (dom_depth(a) > dom_depth(b)) a = idom(a); - while (dom_depth(b) > dom_depth(a)) b = idom(b); + while (a->dom_depth > b->dom_depth) a = nl_map_get_checked(p->scheduled, a->dom); + while (b->dom_depth > a->dom_depth) b = nl_map_get_checked(p->scheduled, b->dom); while (a != b) { - b = idom(b); - a = idom(a); + b = idom_bb(p, b); + a = idom_bb(p, a); } return a; } -static void schedule_late(TB_Passes* passes, TB_Node* n) { - // already visited - if (worklist_test_n_set(&passes->worklist, n)) { - return; - } +static void schedule_late(TB_Passes* p, TB_Node* n) { + // pinned nodes can't be rescheduled + if (!is_pinned(n)) { + DO_IF(TB_OPTDEBUG_GCM)(printf("%s: try late v%u\n", p->f->super.name, n->gvn)); - // schedule all users first - for (User* use = find_users(passes, n); use; use = use->next) { - schedule_late(passes, use->n); - } + // we're gonna find the least common ancestor + TB_BasicBlock* lca = NULL; + for (User* use = n->users; use; use = use->next) { + TB_Node* y = use->n; - // pinned nodes can't be rescheduled - if (is_pinned(n)) { - return; - } + ptrdiff_t search = nl_map_get(p->scheduled, y); + if (search < 0) continue; // dead - // we're gonna find the least common ancestor - TB_Node* lca = NULL; - for (User* use = find_users(passes, n); use; use = use->next) { - TB_Node* y = use->n; - if (y->inputs[0] == NULL) continue; // dead + TB_BasicBlock* use_block = p->scheduled[search].v; + if (y->type == TB_PHI) { + TB_Node* use_node = 
y->inputs[0]; + assert(use_node->type == TB_REGION); - TB_Node* use_block = tb_get_parent_region(y->inputs[0]); - if (y->type == TB_PHI) { - if (y->input_count != use_block->input_count + 1) { - tb_panic("phi has parent with mismatched predecessors"); - } + if (y->input_count != use_node->input_count + 1) { + tb_panic("phi has parent with mismatched predecessors"); + } - ptrdiff_t j = 1; - for (; j < y->input_count; j++) { - if (y->inputs[j] == n) { - break; + ptrdiff_t j = 1; + for (; j < y->input_count; j++) { + if (y->inputs[j] == n) { + break; + } } + assert(j >= 0); + + use_block = nl_map_get_checked(p->scheduled, use_node->inputs[j - 1]); } - assert(j >= 0); - use_block = get_block_begin(use_block->inputs[j - 1]); + lca = find_lca(p, lca, use_block); } - lca = find_lca(lca, use_block); - } + // tb_assert(lca, "missing least common ancestor"); + if (lca != NULL) { + TB_OPTDEBUG(GCM)( + printf(" LATE v%u into .bb%d: ", n->gvn, lca->id), + print_node_sexpr(n, 0), + printf("\n") + ); + + ptrdiff_t search = nl_map_get(p->scheduled, n); + if (search >= 0) { + // replace old + TB_BasicBlock* old = p->scheduled[search].v; + p->scheduled[search].v = lca; + nl_hashset_remove2(&old->items, n, node_hash, node_compare); + } else { + nl_map_put(p->scheduled, n, lca); + } - if (passes->f->start_node == lca) { - lca = passes->f->params[0]; + nl_hashset_put2(&lca->items, n, node_hash, node_compare); + } } - - // tb_assert(lca, "missing least common ancestor"); - set_input(passes, n, lca, 0); } -void tb_pass_schedule(TB_Passes* p) { - if (p->scheduled) { - return; +void tb_pass_schedule(TB_Passes* p, TB_CFG cfg) { + if (p->scheduled != NULL) { + nl_map_free(p->scheduled); } CUIK_TIMED_BLOCK("schedule") { Worklist* restrict ws = &p->worklist; - p->scheduled = true; + nl_map_create(p->scheduled, 256); - size_t block_count; CUIK_TIMED_BLOCK("dominators") { - worklist_clear(ws); + // jarvis pull up the dommies + tb_compute_dominators(p->f, p, cfg); + + worklist_clear_visited(ws); + FOREACH_N(i, 0, cfg.block_count) { + TB_BasicBlock* best = &nl_map_get_checked(cfg.node_to_block, ws->items[i]); + if (i == 0) { + worklist_test_n_set(ws, p->f->start_node); + nl_map_put(p->scheduled, p->f->start_node, best); + } - block_count = tb_push_postorder(p->f, ws); - tb_compute_dominators(p->f, block_count, &ws->items[0]); + best->items = nl_hashset_alloc(32); + nl_map_put(p->scheduled, ws->items[i], best); + worklist_test_n_set(ws, ws->items[i]); + } } - CUIK_TIMED_BLOCK("early schedule") { - worklist_clear_visited(ws); - FOREACH_N(i, 0, block_count) { - TB_Node* bb = ws->items[i]; - assert(bb->type == TB_START || bb->type == TB_REGION); + CUIK_TIMED_BLOCK("pinned schedule") { + FOREACH_REVERSE_N(i, 0, cfg.block_count) { + TB_Node* bb_node = ws->items[i]; + TB_BasicBlock* bb = &nl_map_get_checked(cfg.node_to_block, bb_node); + + if (i == 0) { + // schedule START node + TB_Node* start = p->f->start_node; + nl_hashset_put2(&bb->items, start, node_hash, node_compare); + nl_map_put(p->scheduled, start, bb); + } - TB_NodeRegion* r = TB_NODE_GET_EXTRA(bb); - schedule_early(p, r->end); + // schedule top of BB + nl_hashset_put2(&bb->items, bb_node, node_hash, node_compare); + nl_map_put(p->scheduled, bb_node, bb); + + TB_Node* n = bb->end; + while (n != bb_node) { + DO_IF(TB_OPTDEBUG_GCM)(printf("%s: v%u pinned to .bb%d\n", p->f->super.name, n->gvn, bb->id)); + nl_hashset_put2(&bb->items, n, node_hash, node_compare); + nl_map_put(p->scheduled, n, bb); + + // mark projections into the same block + for (User* use = n->users; 
use; use = use->next) { + TB_Node* proj = use->n; + if (proj->type == TB_PROJ) { + nl_hashset_put2(&bb->items, proj, node_hash, node_compare); + nl_map_put(p->scheduled, proj, bb); + } + } + + n = n->inputs[0]; + } + } + } + + CUIK_TIMED_BLOCK("early schedule") { + FOREACH_REVERSE_N(i, 0, cfg.block_count) { + TB_Node* end = nl_map_get_checked(cfg.node_to_block, ws->items[i]).end; + schedule_early(p, end); } } // move nodes closer to their usage site CUIK_TIMED_BLOCK("late schedule") { - worklist_clear_visited(ws); - - // schedule late on leaves - FOREACH_N(i, 0, dyn_array_length(ws->items)) { + FOREACH_REVERSE_N(i, cfg.block_count, dyn_array_length(ws->items)) { schedule_late(p, ws->items[i]); } - schedule_late(p, p->f->start_node); + worklist_clear_visited(ws); + dyn_array_set_length(ws->items, cfg.block_count); } } } diff --git a/tb/src/opt/cse.h b/tb/src/opt/gvn.h similarity index 96% rename from tb/src/opt/cse.h rename to tb/src/opt/gvn.h index 5d1e6b46..f9519ea9 100644 --- a/tb/src/opt/cse.h +++ b/tb/src/opt/gvn.h @@ -46,6 +46,8 @@ static size_t extra_bytes(TB_Node* n) { case TB_TRUNCATE: case TB_INT2PTR: case TB_PTR2INT: + case TB_UINT2FLOAT: + case TB_FLOAT2UINT: case TB_INT2FLOAT: case TB_FLOAT2INT: case TB_FLOAT_EXT: @@ -63,10 +65,13 @@ static size_t extra_bytes(TB_Node* n) { case TB_END: case TB_PROJ: case TB_PHI: + case TB_CLZ: + case TB_CTZ: case TB_VA_START: case TB_POISON: case TB_SELECT: case TB_MERGEMEM: + case TB_DEAD: return 0; case TB_START: @@ -109,7 +114,7 @@ static size_t extra_bytes(TB_Node* n) { } } -uint32_t cse_hash(void* a) { +uint32_t gvn_hash(void* a) { TB_Node* n = a; size_t extra = extra_bytes(n); @@ -130,7 +135,7 @@ uint32_t cse_hash(void* a) { return h; } -bool cse_compare(void* a, void* b) { +bool gvn_compare(void* a, void* b) { TB_Node *x = a, *y = b; // early outs @@ -222,6 +227,8 @@ bool cse_compare(void* a, void* b) { case TB_FMUL: case TB_FDIV: case TB_PHI: + case TB_CLZ: + case TB_CTZ: case TB_MERGEMEM: return true; diff --git a/tb/src/opt/lattice.h b/tb/src/opt/lattice.h index aecf9cbe..99bc6e76 100644 --- a/tb/src/opt/lattice.h +++ b/tb/src/opt/lattice.h @@ -1,58 +1,6 @@ #include -// TODO(NeGate): implement dual? from there i can do join with -// -// dual(dual(x) ^ dual(y)) = join(x, y) -typedef struct { - uint64_t bot, top; - - // for known bit analysis - uint64_t known_zeros; - uint64_t known_ones; -} LatticeInt; - -// a simplification of the set of all pointers (or floats) -typedef enum { - LATTICE_UNKNOWN, // top aka {nan, non-nan} or for pointers {null, non-null} - - LATTICE_KNOWN_NAN = 1, // {nan} - LATTICE_KNOWN_NOT_NAN, // {non-nan} - - LATTICE_KNOWN_NULL = 1, // {null} - LATTICE_KNOWN_NOT_NULL // {non-null} -} LatticeTrifecta; - -typedef struct { - LatticeTrifecta trifecta; -} LatticeFloat; - -// TODO(NeGate): we might wanna store more info like aliasing, ownership and alignment. -typedef struct { - LatticeTrifecta trifecta; -} LatticePointer; - -// Represents the fancier type system within the optimizer, it's -// all backed by my shitty understanding of lattice theory -typedef struct { - enum { - LATTICE_INT, - LATTICE_FLOAT32, - LATTICE_FLOAT64, - LATTICE_POINTER, - } tag; - uint32_t pad; - union { - LatticeInt _int; - LatticeFloat _float; - LatticePointer _ptr; - }; -} Lattice; - -// hash-consing because there's a lot of -// redundant types we might construct. 
-typedef struct { - NL_HashSet pool; -} LatticeUniverse; +static Lattice* lattice_top(LatticeUniverse* uni, TB_DataType dt); static uint32_t lattice_hash(void* a) { return tb__murmur3_32(a, sizeof(Lattice)); @@ -63,24 +11,82 @@ static bool lattice_cmp(void* a, void* b) { return aa->tag == bb->tag ? memcmp(aa, bb, sizeof(Lattice)) == 0 : false; } +static bool lattice_is_const_int(Lattice* l) { return l->_int.min == l->_int.max; } + +static void lattice_universe_map(LatticeUniverse* uni, TB_Node* n, Lattice* l) { + // reserve cap, slow path :p + if (UNLIKELY(n->gvn >= uni->type_cap)) { + size_t new_cap = tb_next_pow2(n->gvn + 16); + uni->types = tb_platform_heap_realloc(uni->types, new_cap * sizeof(Lattice*)); + + // clear new space + FOREACH_N(i, uni->type_cap, new_cap) { + uni->types[i] = NULL; + } + + uni->type_cap = new_cap; + } + + uni->types[n->gvn] = l; +} + +static Lattice* lattice_universe_get(LatticeUniverse* uni, TB_Node* n) { + // reserve cap, slow path :p + if (UNLIKELY(n->gvn >= uni->type_cap)) { + size_t new_cap = tb_next_pow2(n->gvn + 16); + uni->types = tb_platform_heap_realloc(uni->types, new_cap * sizeof(Lattice*)); + + // clear new space + FOREACH_N(i, uni->type_cap, new_cap) { + uni->types[i] = NULL; + } + + uni->type_cap = new_cap; + } + + if (uni->types[n->gvn] == NULL) { + return uni->types[n->gvn] = lattice_top(uni, n->dt); + } else { + return uni->types[n->gvn]; + } +} + +static Lattice* lattice_intern(LatticeUniverse* uni, Lattice l) { + Lattice* k = nl_hashset_get2(&uni->pool, &l, lattice_hash, lattice_cmp); + if (k != NULL) { + return k; + } + + // allocate new node + k = tb_arena_alloc(uni->arena, sizeof(Lattice)); + memcpy(k, &l, sizeof(l)); + nl_hashset_put2(&uni->pool, k, lattice_hash, lattice_cmp); + return k; +} + +static int64_t lattice_int_min(int bits) { return 1ll << (bits - 1); } +static int64_t lattice_int_max(int bits) { return (1ll << (bits - 1)) - 1; } + +// constructs a type for a CONTROL node +static Lattice* lattice_ctrl(LatticeUniverse* uni, TB_Node* dom) { + return lattice_intern(uni, (Lattice){ LATTICE_CONTROL, ._ctrl = { dom } }); +} + // maximal subset -static Lattice lattice_top(TB_DataType dt) { +static Lattice* lattice_top(LatticeUniverse* uni, TB_DataType dt) { switch (dt.type) { case TB_INT: { assert(dt.data <= 64); - uint64_t max_bits = UINT64_MAX >> dt.data; - tb_todo(); - - return (Lattice){ LATTICE_INT, ._int = { 0, max_bits } }; + return lattice_intern(uni, (Lattice){ LATTICE_INT, ._int = { lattice_int_min(dt.data), lattice_int_max(dt.data) } }); } case TB_FLOAT: { assert(dt.data == TB_FLT_32 || dt.data == TB_FLT_64); - return (Lattice){ dt.data == TB_FLT_64 ? LATTICE_FLOAT64 : LATTICE_FLOAT32, ._float = { LATTICE_UNKNOWN } }; + return lattice_intern(uni, (Lattice){ dt.data == TB_FLT_64 ? LATTICE_FLOAT64 : LATTICE_FLOAT32, ._float = { LATTICE_UNKNOWN } }); } case TB_PTR: { - return (Lattice){ LATTICE_POINTER, ._ptr = { LATTICE_UNKNOWN } }; + return lattice_intern(uni, (Lattice){ LATTICE_POINTER, ._ptr = { LATTICE_UNKNOWN } }); } default: @@ -93,7 +99,7 @@ static Lattice lattice_top(TB_DataType dt) { #define TRIFECTA_MEET(a, b) ((a).trifecta == (b).trifecta ? 
(a).trifecta : LATTICE_UNKNOWN) // generates the greatest lower bound between a and b -static Lattice lattice_meet(const Lattice* a, const Lattice* b) { +static Lattice* lattice_meet(LatticeUniverse* uni, Lattice* a, Lattice* b) { assert(a->tag == b->tag); switch (a->tag) { case LATTICE_INT: { @@ -101,24 +107,24 @@ static Lattice lattice_meet(const Lattice* a, const Lattice* b) { LatticeInt aa = a->_int; LatticeInt bb = b->_int; - LatticeInt i = { aa.bot, aa.top }; - if (i.bot > bb.bot) i.bot = bb.bot; - if (i.top < bb.top) i.top = bb.top; + LatticeInt i = { aa.min, aa.max }; + if (i.min > bb.min) i.min = bb.min; + if (i.max < bb.max) i.max = bb.max; i.known_zeros = aa.known_zeros & bb.known_zeros; i.known_ones = aa.known_ones & bb.known_ones; - return (Lattice){ LATTICE_INT, ._int = i }; + return lattice_intern(uni, (Lattice){ LATTICE_INT, ._int = i }); } case LATTICE_FLOAT32: case LATTICE_FLOAT64: { LatticeFloat f = { .trifecta = TRIFECTA_MEET(a->_float, b->_float) }; - return (Lattice){ a->tag, ._float = f }; + return lattice_intern(uni, (Lattice){ a->tag, ._float = f }); } case LATTICE_POINTER: { LatticePointer p = { .trifecta = TRIFECTA_MEET(a->_ptr, b->_ptr) }; - return (Lattice){ LATTICE_POINTER, ._ptr = p }; + return lattice_intern(uni, (Lattice){ LATTICE_POINTER, ._ptr = p }); } default: tb_todo(); diff --git a/tb/src/opt/mem2reg.h b/tb/src/opt/mem2reg.h index 109f19db..068038e0 100644 --- a/tb/src/opt/mem2reg.h +++ b/tb/src/opt/mem2reg.h @@ -23,7 +23,7 @@ typedef struct Mem2Reg_Ctx { TB_Function* f; TB_Passes* p; - size_t block_count; + TB_CFG cfg; TB_Node** blocks; // Stack slots we're going to convert into @@ -61,7 +61,7 @@ static TB_Node* new_phi(Mem2Reg_Ctx* restrict c, TB_Function* f, int var, TB_Nod break; }*/ - DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("%p: insert new PHI node (in %p)", n, block)); + DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("v%u: insert new PHI node (in v%u)", n->gvn, block->gvn)); tb_pass_mark(c->p, n); return n; } @@ -79,7 +79,7 @@ static void add_phi_operand(Mem2Reg_Ctx* restrict c, TB_Function* f, TB_Node* ph assert(phi_node->type == TB_PHI); TB_Node* phi_region = phi_node->inputs[0]; - DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("%p: adding %p to PHI", phi_node, node)); + DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("v%u: adding v%u to PHI", phi_node->gvn, node->gvn)); // the slot to fill is based on the predecessor list of the region FOREACH_N(i, 0, phi_region->input_count) { @@ -145,70 +145,9 @@ static bool is_effect_tuple(TB_Node* n) { n->type == TB_MACHINE_OP; } -static void ssa_rename_node(Mem2Reg_Ctx* c, TB_Node* bb, TB_Node* n, DynArray(TB_Node*)* stack) { - TB_Node* parent = (n->type == TB_PROJ ? n->inputs[0] : n); - if (parent->input_count >= 2 && unsafe_get_region(parent->inputs[1]) == bb) { - assert(parent->inputs[1]->dt.type == TB_MEMORY); - ssa_rename_node(c, bb, parent->inputs[1], stack); - } - - // find promoted stack slots - bool kill = false; - if (n->type == TB_STORE) { - int var = get_variable_id(c, n->inputs[2]); - if (var >= 0) { - // push new store value onto the stack - dyn_array_put(stack[var], n->inputs[3]); - kill = true; - } - } - - // check for any loads and replace them - for (User* u = find_users(c->p, n); u; u = u->next) { - TB_Node* use = u->n; - - if (u->slot == 1 && use->type == TB_LOAD) { - int var = get_variable_id(c, use->inputs[2]); - if (var >= 0) { - TB_Node* val; - if (dyn_array_length(stack[var]) == 0) { - // this is UB since it implies we've read before initializing the - // stack slot. 
- val = make_poison(c->f, c->p, TB_TYPE_VOID); - log_warn("%p: found load-before-init in mem2reg, this is UB", use); - } else { - val = stack[var][dyn_array_length(stack[var]) - 1]; - } - - // make sure it's the right type - if (use->dt.raw != val->dt.raw) { - TB_Node* cast = tb_alloc_node(c->f, TB_BITCAST, use->dt, 2, 0); - tb_pass_mark(c->p, cast); - set_input(c->p, cast, val, 1); - - val = cast; - } - - tb_pass_mark_users(c->p, use); - set_input(c->p, use, NULL, 1); // unlink first - subsume_node(c->p, c->f, use, val); - } - } - } - - // we can remove the effect now - if (kill) { - // log_info("%p: pass to %p", n, n->inputs[0]); - TB_Node* into = n->inputs[1]; - tb_pass_mark(c->p, into); - tb_pass_mark(c->p, n); - set_input(c->p, n, NULL, 1); - subsume_node(c->p, c->f, n, into); - } -} - static void ssa_rename(Mem2Reg_Ctx* c, TB_Function* f, TB_Node* bb, DynArray(TB_Node*)* stack) { assert(bb); + TB_Passes* p = c->p; // push phi nodes size_t* old_len = tb_tls_push(c->tls, sizeof(size_t) * c->to_promote_count); @@ -222,28 +161,97 @@ static void ssa_rename(Mem2Reg_Ctx* c, TB_Function* f, TB_Node* bb, DynArray(TB_ } // rewrite operations - TB_NodeRegion* r = TB_NODE_GET_EXTRA(bb); - TB_Node* end = r->end; + TB_BasicBlock* bb_info = &nl_map_get_checked(c->cfg.node_to_block, bb); + TB_Node* end = bb_info->end; + + tb_pass_mark(p, bb); + tb_pass_mark_users(p, bb); - tb_pass_mark(c->p, bb); - tb_pass_mark_users(c->p, bb); + DO_IF(TB_OPTDEBUG_MEM2REG)( + printf(" FORST %u: ", bb->gvn), + print_node_sexpr(bb, 0), + printf("\n") + ); // go through all uses and replace their accessors - if (r->mem_out) { - ssa_rename_node(c, bb, r->mem_out, stack); + TB_Node* n = bb_info->mem_in; + if (n != NULL) { + do { + DO_IF(TB_OPTDEBUG_MEM2REG)( + printf(" SIGMA %u: ", n->gvn), + print_node_sexpr(n, 0), + printf("\n") + ); + + // if we spot a store, we push to the stack + bool kill = false; + if (n->type == TB_STORE) { + int var = get_variable_id(c, n->inputs[2]); + if (var >= 0) { + // push new store value onto the stack + dyn_array_put(stack[var], n->inputs[3]); + kill = true; + } + } + + // check for any loads and replace them + for (User* u = n->users; u; u = u->next) { + TB_Node* use = u->n; + + if (u->slot == 1 && use->type == TB_LOAD) { + int var = get_variable_id(c, use->inputs[2]); + if (var >= 0) { + TB_Node* val; + if (dyn_array_length(stack[var]) == 0) { + // this is UB since it implies we've read before initializing the + // stack slot. + val = make_poison(f, p, use->dt); + log_warn("v%u: found load-before-init in mem2reg, this is UB", use->gvn); + } else { + val = stack[var][dyn_array_length(stack[var]) - 1]; + } + + // make sure it's the right type + if (use->dt.raw != val->dt.raw) { + TB_Node* cast = tb_alloc_node(c->f, TB_BITCAST, use->dt, 2, 0); + tb_pass_mark(c->p, cast); + set_input(c->p, cast, val, 1); + + val = cast; + } + + tb_pass_mark_users(p, use); + set_input(p, use, NULL, 1); // unlink first + subsume_node(p, f, use, val); + } + } + } + + // next memory has to be decided before we kill the node since + // murder will dettach the users. 
+ TB_Node* next = mem_user(p, n, 1); + + // we can remove the effect now + if (kill) { + TB_Node* into = n->inputs[1]; + tb_pass_mark(c->p, into); + tb_pass_mark(c->p, n); + set_input(p, n, NULL, 1); + subsume_node(p, c->f, n, into); + } + + n = next; + } while (n != NULL && get_block_begin(n) == bb); } // replace phi arguments on successor if (end != NULL) { - if (end->type == TB_NULL || end->type == TB_END || end->type == TB_TRAP || end->type == TB_UNREACHABLE) { - /* RET can't do shit in this context */ - } else if (end->type == TB_BRANCH) { - TB_NodeBranch* br_info = TB_NODE_GET_EXTRA(end); - FOREACH_N(i, 0, br_info->succ_count) { - ssa_replace_phi_arg(c, f, bb, br_info->succ[i], stack); + // fill successors + for (User* u = end->users; u; u = u->next) { + if (cfg_is_control(u->n)) { + TB_Node* succ = cfg_next_region_control(u->n); + ssa_replace_phi_arg(c, f, bb, succ, stack); } - } else { - tb_todo(); } } @@ -252,9 +260,9 @@ static void ssa_rename(Mem2Reg_Ctx* c, TB_Function* f, TB_Node* bb, DynArray(TB_ // // TODO(NeGate): maybe we want a data structure for this because it'll // be "kinda" slow. - FOREACH_N(i, 0, c->block_count) { + FOREACH_N(i, 0, c->cfg.block_count) { TB_Node* k = c->blocks[i]; - TB_Node* v = idom(k); + TB_Node* v = idom(&c->cfg, k); if (v == bb && k != bb) { ssa_rename(c, f, k, stack); @@ -267,57 +275,30 @@ static void ssa_rename(Mem2Reg_Ctx* c, TB_Function* f, TB_Node* bb, DynArray(TB_ tb_tls_restore(c->tls, old_len); } -typedef struct { - TB_Node* old_n; - - int64_t offset; - TB_CharUnits size; - TB_DataType dt; -} AggregateConfig; - -static ptrdiff_t find_config(size_t config_count, AggregateConfig* configs, int64_t offset) { - FOREACH_N(i, 0, config_count) { - if (configs[i].offset == offset) return i; - } - - tb_unreachable(); - return -1; -} - -// -1 is a bad match -// -2 is no match, so we can add a new config -static ptrdiff_t compatible_with_configs(size_t config_count, AggregateConfig* configs, int64_t offset, TB_CharUnits size, TB_DataType dt) { - int64_t max = offset + size; - - FOREACH_N(i, 0, config_count) { - int64_t max2 = configs[i].offset + configs[i].size; - - if (offset >= configs[i].offset && max <= max2) { - // they overlap... but is it a clean overlap? - if (offset == configs[i].offset && max == max2 && TB_DATA_TYPE_EQUALS(dt, configs[i].dt)) { - return i; +static void insert_phis(Mem2Reg_Ctx* restrict ctx, TB_Node* bb, TB_Node* n) { + DO_IF(TB_OPTDEBUG_MEM2REG)( + printf(" FORST %u: ", bb->gvn), + print_node_sexpr(bb, 0), + printf("\n") + ); + + do { + DO_IF(TB_OPTDEBUG_MEM2REG)( + printf(" OMEGA %u: ", n->gvn), + print_node_sexpr(n, 0), + printf("\n") + ); + + if (n->type == TB_STORE) { + int var = get_variable_id(ctx, n->inputs[2]); + if (var >= 0) { + write_variable(ctx, var, bb, n->inputs[3]); } - - return -1; } - } - return -2; -} - -static void insert_phis(Mem2Reg_Ctx* restrict ctx, TB_Node* bb, TB_Node* n) { - TB_Node* parent = (n->type == TB_PROJ ? 
n->inputs[0] : n); - if (parent->input_count >= 2 && unsafe_get_region(parent->inputs[1]) == bb) { - assert(parent->inputs[1]->dt.type == TB_MEMORY); - insert_phis(ctx, bb, parent->inputs[1]); - } - - if (n->type == TB_STORE) { - int var = get_variable_id(ctx, n->inputs[2]); - if (var >= 0) { - write_variable(ctx, var, bb, n->inputs[3]); - } - } + // next memory + n = mem_user(ctx->p, n, 1); + } while (n != NULL && get_block_begin(n) == bb); } bool tb_pass_mem2reg(TB_Passes* p) { @@ -329,7 +310,6 @@ bool tb_pass_mem2reg(TB_Passes* p) { //////////////////////////////// size_t to_promote_count = 0; TB_Node** to_promote = tb_tls_push(tls, sizeof(TB_Node*) * dyn_array_length(p->locals)); - dyn_array_for(i, p->locals) { TB_Node* n = p->locals[i]; @@ -338,28 +318,26 @@ bool tb_pass_mem2reg(TB_Passes* p) { switch (coherence) { case COHERENCY_GOOD: { - tb_tls_push(tls, sizeof(TB_Node*)); to_promote[to_promote_count++] = n; - n->dt = dt; - DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("%s: %p promoting to IR register", f->super.name, n)); + DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("%s: v%u promoting to IR register", f->super.name, n->gvn)); break; } case COHERENCY_UNINITIALIZED: { - DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("%s: %p could not mem2reg (uninitialized)", f->super.name, n)); + DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("%s: v%u could not mem2reg (uninitialized)", f->super.name, n->gvn)); break; } case COHERENCY_VOLATILE: { - DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("%s: %p could not mem2reg (volatile load/store)", f->super.name, n)); + DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("%s: v%u could not mem2reg (volatile load/store)", f->super.name, n->gvn)); break; } case COHERENCY_USES_ADDRESS: { - DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("%s: %p could not mem2reg (uses address directly)", f->super.name)); + DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("%s: v%u could not mem2reg (uses address directly)", f->super.name, n->gvn)); break; } case COHERENCY_BAD_DATA_TYPE: { - DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("%s: %p could not mem2reg (data type is too inconsistent)", f->super.name, n)); + DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("%s: v%u could not mem2reg (data type is too inconsistent)", f->super.name, n->gvn)); break; } default: tb_todo(); @@ -382,50 +360,70 @@ bool tb_pass_mem2reg(TB_Passes* p) { c.defs = tb_tls_push(c.tls, to_promote_count * sizeof(Mem2Reg_Def)); memset(c.defs, 0, to_promote_count * sizeof(Mem2Reg_Def)); - c.block_count = tb_push_postorder(f, &p->worklist); + c.cfg = tb_compute_rpo(f, p); c.blocks = &p->worklist.items[0]; - tb_compute_dominators(f, c.block_count, p->worklist.items); + tb_compute_dominators(f, p, c.cfg); - TB_DominanceFrontiers* df = tb_get_dominance_frontiers(f, c.block_count, c.blocks); + TB_DominanceFrontiers* df = tb_get_dominance_frontiers(f, p, c.cfg, c.blocks); //////////////////////////////// // Phase 1: Insert phi functions //////////////////////////////// // Identify the final value of all the variables in the function per basic block - FOREACH_REVERSE_N(i, 0, c.block_count) { - TB_Node* end = TB_NODE_GET_EXTRA_T(c.blocks[i], TB_NodeRegion)->end; + FOREACH_N(i, 0, c.cfg.block_count) { + TB_Node* bb = c.blocks[i]; + TB_BasicBlock* bb_info = &nl_map_get_checked(c.cfg.node_to_block, bb); + + if (i == 0) { + // start block can use the input memory as the earliest point + insert_phis(&c, bb, f->params[1]); + bb_info->mem_in = f->params[1]; + continue; + } + + TB_Node* end = bb_info->end; - TB_Node* ctrl = end->inputs[0]; - TB_Node* latest_mem = NULL; + // find memory phi + TB_Node* n = bb; + TB_Node* 
mem = NULL; do { - latest_mem = mem_user(p, ctrl, 0); - ctrl = ctrl->inputs[0]; - } while (latest_mem == NULL && ctrl->type != TB_START && ctrl->type != TB_REGION); - - if (latest_mem) { - for (;;) { - TB_Node* next = mem_user(p, latest_mem, 1); - if (next == NULL || next->inputs[0] != latest_mem->inputs[0]) break; - latest_mem = next; + for (User* u = n->users; u; u = u->next) { + if (is_mem_out_op(u->n)) { + mem = u->n; + goto done; + } + } + + n = cfg_next_control(n); + } while (n != NULL && n != end); + + done: + // find earliest memory in the BB: + // note this doesn't account for multiple memory streams + // but that's fine for now... + if (mem) { + while (mem->inputs[1]->inputs[0]->type != TB_START && get_block_begin(mem->inputs[1]->inputs[0]) == bb) { + mem = mem->inputs[1]; } - insert_phis(&c, c.blocks[i], latest_mem); + insert_phis(&c, bb, mem); } - TB_NODE_GET_EXTRA_T(c.blocks[i], TB_NodeRegion)->mem_out = latest_mem; + + bb_info->mem_in = mem; } // for each global name we'll insert phi nodes - TB_Node** phi_p = tb_tls_push(tls, c.block_count * sizeof(TB_Node*)); + TB_Node** phi_p = tb_tls_push(tls, c.cfg.block_count * sizeof(TB_Node*)); - NL_HashSet ever_worked = nl_hashset_alloc(c.block_count); - NL_HashSet has_already = nl_hashset_alloc(c.block_count); + NL_HashSet ever_worked = nl_hashset_alloc(c.cfg.block_count); + NL_HashSet has_already = nl_hashset_alloc(c.cfg.block_count); FOREACH_N(var, 0, c.to_promote_count) { nl_hashset_clear(&ever_worked); nl_hashset_clear(&has_already); size_t p_count = 0; - FOREACH_REVERSE_N(i, 0, c.block_count) { + FOREACH_N(i, 0, c.cfg.block_count) { TB_Node* bb = c.blocks[i]; ptrdiff_t search = nl_map_get(c.defs[var], bb); @@ -444,7 +442,7 @@ bool tb_pass_mem2reg(TB_Passes* p) { TB_DataType dt = value->dt; // for all DFs of BB, insert PHI - int bb_id = TB_NODE_GET_EXTRA_T(bb, TB_NodeRegion)->postorder_id; + int bb_id = nl_map_get_checked(c.cfg.node_to_block, bb).id; uint64_t* frontier = &df->arr[bb_id * df->stride]; FOREACH_N(j, 0, df->stride) FOREACH_BIT(k, j*64, frontier[j]) { TB_Node* l = c.blocks[k]; @@ -488,15 +486,18 @@ bool tb_pass_mem2reg(TB_Passes* p) { stack[var] = dyn_array_create(TB_Node*, 16); } - ssa_rename(&c, f, f->start_node, stack); + ssa_rename(&c, f, c.blocks[0], stack); + // tb_function_print(f, tb_default_print_callback, stdout); // don't need these anymore FOREACH_N(var, 0, c.to_promote_count) { + assert(c.to_promote[var]->users == NULL); tb_pass_kill_node(c.p, c.to_promote[var]); } tb_tls_restore(tls, to_promote); + tb_free_cfg(&c.cfg); cuikperf_region_end(); return true; @@ -505,100 +506,6 @@ bool tb_pass_mem2reg(TB_Passes* p) { return false; } -static bool sane_writer(TB_Node* n) { - return n->type == TB_STORE || n->type == TB_MEMCPY || n->type == TB_MEMSET; -} - -// false means failure to SROA -static bool add_configs(TB_Passes* p, TB_TemporaryStorage* tls, User* use, TB_Node* base_address, size_t base_offset, size_t* config_count, AggregateConfig* configs, int pointer_size) { - for (; use; use = use->next) { - TB_Node* n = use->n; - - if (n->type == TB_MEMBER_ACCESS && use->slot == 1) { - // same rules, different offset - int64_t offset = TB_NODE_GET_EXTRA_T(n, TB_NodeMember)->offset; - if (!add_configs(p, tls, find_users(p, n), base_address, base_offset + offset, config_count, configs, pointer_size)) { - return false; - } - continue; - } - - // we can only SROA if we know we're not using the - // address for anything but direct memory ops or TB_MEMBERs. 
- if (use->slot != 2) { - return false; - } - - // find direct memory op - if (n->type != TB_LOAD && n->type != TB_STORE) { - return false; - } - - TB_DataType dt = n->type == TB_LOAD ? n->dt : n->inputs[3]->dt; - TB_Node* address = n->inputs[2]; - int size = (bits_in_data_type(pointer_size, dt) + 7) / 8; - - // see if it's a compatible configuration - int match = compatible_with_configs(*config_count, configs, base_offset, size, dt); - if (match == -1) { - return false; - } else if (match == -2) { - // add new config - tb_tls_push(tls, sizeof(AggregateConfig)); - configs[(*config_count)++] = (AggregateConfig){ address, base_offset, size, dt }; - } else if (configs[match].old_n != address) { - log_warn("%s: %p SROA config matches but reaches so via a different node, please idealize nodes before mem2reg", p->f->super.name, address); - return false; - } - } - - return true; -} - -void tb_pass_sroa(TB_Passes* p) { - cuikperf_region_start("sroa", NULL); - verify_tmp_arena(p); - - TB_Function* f = p->f; - TB_TemporaryStorage* tls = tb_tls_steal(); - int pointer_size = tb__find_code_generator(f->super.module)->pointer_size; - - for (size_t i = dyn_array_length(p->locals); i--;) retry: { - TB_Node* address = p->locals[i]; - void* mark = tb_tls_push(tls, 0); - - size_t config_count = 0; - AggregateConfig* configs = tb_tls_push(tls, 0); - if (!add_configs(p, tls, find_users(p, address), address, 0, &config_count, configs, pointer_size)) { - TB_NODE_GET_EXTRA_T(address, TB_NodeLocal)->alias_index = 0; - continue; - } - - // split allocation into pieces - if (config_count > 1) { - DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("%s: v%zu was able to SROA into %zu pieces", f->super.name, address->gvn, config_count)); - - uint32_t alignment = TB_NODE_GET_EXTRA_T(address, TB_NodeLocal)->align; - FOREACH_N(i, 0, config_count) { - TB_Node* new_n = tb_alloc_node(f, TB_LOCAL, TB_TYPE_PTR, 1, sizeof(TB_NodeLocal)); - set_input(p, new_n, f->start_node, 0); - TB_NODE_SET_EXTRA(new_n, TB_NodeLocal, .size = configs[i].size, .align = alignment); - - // mark all users, there may be some fun new opts now - tb_pass_mark_users(p, configs[i].old_n); - - // replace old pointer with new fancy - subsume_node(p, f, configs[i].old_n, new_n); - dyn_array_put(p->locals, new_n); - } - tb_tls_restore(tls, mark); - goto retry; // retry but don't go the next int - } - } - - cuikperf_region_end(); -} - // NOTE(NeGate): a stack slot is coherent when all loads and stores share // the same type and alignment along with not needing any address usage. 
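// (i.e. every use of the slot's address is a direct TB_LOAD/TB_STORE of one
// consistent data type; pointer arithmetic or any other escape of the address
// reports COHERENCY_USES_ADDRESS and blocks promotion)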
static Coherency tb_get_stack_slot_coherency(TB_Passes* p, TB_Function* f, TB_Node* address, TB_DataType* out_dt) { @@ -634,7 +541,7 @@ static Coherency tb_get_stack_slot_coherency(TB_Passes* p, TB_Function* f, TB_No dt_bits = bits; } } else { - DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("%p uses pointer arithmatic (%s)", address, tb_node_get_name(n))); + DO_IF(TB_OPTDEBUG_MEM2REG)(log_debug("v%u uses pointer arithmetic (%s)", address->gvn, tb_node_get_name(n))); return COHERENCY_USES_ADDRESS; } } diff --git a/tb/src/opt/mem_opt.h b/tb/src/opt/mem_opt.h index 9acdfc3d..d56d67cf 100644 --- a/tb/src/opt/mem_opt.h +++ b/tb/src/opt/mem_opt.h @@ -6,6 +6,16 @@ typedef struct { int64_t offset; } KnownPointer; +static bool is_local_ptr(TB_Node* n) { + // skip past ptr arith + retry: { + if (n->type == TB_MEMBER_ACCESS) { n = n->inputs[1]; goto retry; } + if (n->type == TB_ARRAY_ACCESS) { n = n->inputs[1]; goto retry; } + } + + return n->type == TB_LOCAL; +} + static KnownPointer known_pointer(TB_Node* n) { if (n->type == TB_MEMBER_ACCESS) { return (KnownPointer){ n->inputs[1], TB_NODE_GET_EXTRA_T(n, TB_NodeMember)->offset }; @@ -71,61 +81,50 @@ static TB_Node* ideal_load(TB_Passes* restrict p, TB_Function* f, TB_Node* n) { TB_Node* mem = n->inputs[1]; TB_Node* addr = n->inputs[2]; if (n->inputs[0] != NULL) { - TB_Node* base = addr; - while (base->type == TB_MEMBER_ACCESS || base->type == TB_ARRAY_ACCESS) { - base = base->inputs[1]; - } - - // loads based on LOCALs don't need control-dependence, it's actually kinda annoying - if (base->type == TB_LOCAL) { + // we're dependent on code which must always be run (START.mem) + if (n->inputs[0]->type == TB_PROJ && n->inputs[0]->inputs[0]->type == TB_START) { set_input(p, n, NULL, 0); return n; + } else { + TB_Node* base = addr; + while (base->type == TB_MEMBER_ACCESS || base->type == TB_ARRAY_ACCESS) { + base = base->inputs[1]; + } + + // loads based on LOCALs don't need control-dependence, it's actually kinda annoying + if (base->type == TB_LOCAL) { + set_input(p, n, NULL, 0); + return n; + } } } // if LOAD has already been safely accessed we can relax our control dependency - if (n->inputs[0] != NULL && n->inputs[0]->type == TB_REGION && n->inputs[0]->input_count == 1) { - TB_Node* parent_bb = get_block_begin(n->inputs[0]->inputs[0]); - - for (User* u = find_users(p, parent_bb); u; u = u->next) { + if (n->inputs[0] != NULL) { + TB_Node* parent_bb = get_block_begin(n->inputs[0]); + for (User* u = addr->users; u; u = u->next) { TB_Node* use = u->n; - if (use != n && use->type == TB_LOAD && use->inputs[2] == addr) { - tb_pass_mark_users(p, get_block_begin(n->inputs[0])); + if (use != n && use->type == TB_LOAD && u->slot == 2) { + // if the other load has no control deps we don't need any + // either... if they're the same type (really it just needs + // to read the same bytes or less) + if (use->dt.raw == n->dt.raw) { + set_input(p, n, NULL, 0); + return n; + } - set_input(p, n, use->inputs[0], 0); - return n; + // if we're dominated by some previous load then we can inherit + // its control dep. + TB_Node* bb = get_block_begin(use->inputs[0]); + if (lattice_dommy(&p->universe, bb, parent_bb)) { + set_input(p, n, use->inputs[0], 0); + return n; + } } } } return NULL; - - // loads based on PHIs may be reduced into data PHIs - /*if (n->inputs[1]->type == TB_PHI) { - return data_phi_from_memory_phi(p, f, n->dt, n->inputs[1], addr, NULL); - }*/ - - // if a load is control dependent on a store and it doesn't alias we can move the - // dependency up a bit.
- /*if (n->inputs[1]->type != TB_STORE) return NULL; - - KnownPointer ld_ptr = known_pointer(n->inputs[2]); - KnownPointer st_ptr = known_pointer(n->inputs[1]->inputs[2]); - if (ld_ptr.base != st_ptr.base) return NULL; - - // it's probably not the fastest way to grab this value ngl... - ICodeGen* cg = tb__find_code_generator(f->super.module); - ld_ptr.offset *= cg->minimum_addressable_size; - st_ptr.offset *= cg->minimum_addressable_size; - - size_t loaded_end = ld_ptr.offset + bits_in_data_type(cg->pointer_size, n->dt); - size_t stored_end = st_ptr.offset + bits_in_data_type(cg->pointer_size, n->inputs[0]->inputs[2]->dt); - - // both bases match so if the effective ranges don't intersect, they don't alias. - if (ld_ptr.offset <= stored_end && st_ptr.offset <= loaded_end) return NULL; - - set_input(p, n, n->inputs[1]->inputs[1], 1); - return n;*/ } static TB_Node* identity_load(TB_Passes* restrict p, TB_Function* f, TB_Node* n) { @@ -160,9 +159,15 @@ static TB_Node* ideal_store(TB_Passes* restrict p, TB_Function* f, TB_Node* n) { } static TB_Node* ideal_end(TB_Passes* restrict p, TB_Function* f, TB_Node* n) { + // remove dead local store + if (n->inputs[1]->type == TB_STORE && is_local_ptr(n->inputs[1]->inputs[2])) { + set_input(p, n, n->inputs[1]->inputs[1], 1); + return n; + } + return NULL; } static TB_Node* ideal_memcpy(TB_Passes* restrict p, TB_Function* f, TB_Node* n) { return NULL; -} +} \ No newline at end of file diff --git a/tb/src/opt/optimizer.c b/tb/src/opt/optimizer.c index 43a8c68f..b67ff5ab 100644 --- a/tb/src/opt/optimizer.c +++ b/tb/src/opt/optimizer.c @@ -8,7 +8,7 @@ // set_input(opt, n, in, slot) // basically `n->inputs[slot] = in` except it correctly updates the user set // -// # Implement peepholes +// # How to implement peepholes // TODO // #include "../passes.h" @@ -31,12 +31,13 @@ static void subsume_node(TB_Passes* restrict p, TB_Function* f, TB_Node* n, TB_N static TB_Node* clone_node(TB_Passes* restrict p, TB_Function* f, TB_Node* region, TB_Node* n, bool* new_node); // node creation helpers -TB_Node* make_dead(TB_Function* f, TB_Passes* restrict p); TB_Node* make_poison(TB_Function* f, TB_Passes* restrict p, TB_DataType dt); TB_Node* make_int_node(TB_Function* f, TB_Passes* restrict p, TB_DataType dt, uint64_t x); +TB_Node* make_dead_node(TB_Function* f, TB_Passes* restrict p); TB_Node* make_proj_node(TB_Function* f, TB_Passes* restrict p, TB_DataType dt, TB_Node* src, int i); static bool remove_pred(TB_Passes* restrict p, TB_Function* f, TB_Node* src, TB_Node* dst); +static bool lattice_dommy(LatticeUniverse* uni, TB_Node* expected_dom, TB_Node* bb); //////////////////////////////// // Worklist @@ -68,6 +69,14 @@ void worklist_clear(Worklist* restrict ws) { } } +void worklist_remove(Worklist* restrict ws, TB_Node* n) { + uint64_t gvn_word = n->gvn / 64; // which word this ID is at + if (gvn_word >= ws->visited_cap) return; + + uint64_t gvn_mask = 1ull << (n->gvn % 64); + ws->visited[gvn_word] &= ~gvn_mask; +} + // checks if node is visited but doesn't push item bool worklist_test(Worklist* restrict ws, TB_Node* n) { uint64_t gvn_word = n->gvn / 64; // which word this ID is at @@ -172,7 +181,10 @@ static char* lil_name(TB_Function* f, const char* fmt, ...) 
{ static TB_Node* mem_user(TB_Passes* restrict p, TB_Node* n, int slot) { for (User* u = find_users(p, n); u; u = u->next) { - if (u->slot == slot && is_mem_out_op(u->n)) return u->n; + if ((u->n->type == TB_PROJ && u->n->dt.type == TB_MEMORY) || + (u->slot == slot && is_mem_out_op(u->n))) { + return u->n; + } } return NULL; @@ -185,7 +197,7 @@ static TB_Node* single_user(TB_Passes* restrict p, TB_Node* n) { } static bool single_use(TB_Passes* restrict p, TB_Node* n) { - return find_users(p, n)->next == NULL; + return n->users->next == NULL; } static bool is_same_align(TB_Node* a, TB_Node* b) { @@ -196,7 +208,7 @@ static bool is_same_align(TB_Node* a, TB_Node* b) { static bool is_empty_bb(TB_Passes* restrict p, TB_Node* end) { assert(end->type == TB_BRANCH || end->type == TB_UNREACHABLE); - if (!is_block_begin(end->inputs[0])) { + if (!cfg_is_bb_entry(end->inputs[0])) { return false; } @@ -221,10 +233,11 @@ static bool is_if_branch(TB_Node* n, uint64_t* falsey) { // unity build with all the passes #include "lattice.h" #include "cfg.h" -#include "cse.h" +#include "gvn.h" #include "dce.h" #include "fold.h" #include "mem_opt.h" +#include "sroa.h" #include "loop.h" #include "branches.h" #include "print.h" @@ -233,9 +246,24 @@ static bool is_if_branch(TB_Node* n, uint64_t* falsey) { #include "libcalls.h" #include "scheduler.h" +static bool lattice_dommy(LatticeUniverse* uni, TB_Node* expected_dom, TB_Node* bb) { + while (bb != NULL && expected_dom != bb) { + Lattice* l = lattice_universe_get(uni, bb); + assert(l->tag == LATTICE_CONTROL); + + TB_Node* new_bb = l->_ctrl.idom; + if (bb == new_bb) { + return false; + } + bb = new_bb; + } + + return true; +} + static TB_Node* gvn(TB_Passes* restrict p, TB_Node* n, size_t extra) { // try CSE, if we succeed, just delete the node and use the old copy - TB_Node* k = nl_hashset_put2(&p->cse_nodes, n, cse_hash, cse_compare); + TB_Node* k = nl_hashset_put2(&p->gvn_nodes, n, gvn_hash, gvn_compare); if (k != NULL) { // try free tb_arena_free(p->f->arena, n->inputs, sizeof(TB_Node*)); @@ -250,10 +278,21 @@ TB_Node* make_poison(TB_Function* f, TB_Passes* restrict p, TB_DataType dt) { return gvn(p, tb_alloc_node(f, TB_POISON, dt, 1, 0), 0); } +TB_Node* make_dead_node(TB_Function* f, TB_Passes* restrict p) { + return gvn(p, tb_alloc_node(f, TB_DEAD, TB_TYPE_CONTROL, 1, 0), 0); +} + TB_Node* make_int_node(TB_Function* f, TB_Passes* restrict p, TB_DataType dt, uint64_t x) { + uint64_t mask = tb__mask(dt.data); + x &= mask; + TB_Node* n = tb_alloc_node(f, TB_INTEGER_CONST, dt, 1, sizeof(TB_NodeInt)); TB_NodeInt* i = TB_NODE_GET_EXTRA(n); i->value = x; + + Lattice* l = lattice_intern(&p->universe, (Lattice){ LATTICE_INT, ._int = { x, x, ~x & mask, x } }); + lattice_universe_map(&p->universe, n, l); + return gvn(p, n, sizeof(TB_NodeInt)); } @@ -304,7 +343,7 @@ static bool remove_pred(TB_Passes* restrict p, TB_Function* f, TB_Node* src, TB_ void tb_pass_kill_node(TB_Passes* restrict p, TB_Node* n) { // remove from CSE if we're murdering it - nl_hashset_remove2(&p->cse_nodes, n, cse_hash, cse_compare); + nl_hashset_remove2(&p->gvn_nodes, n, gvn_hash, gvn_compare); if (n->type == TB_LOCAL) { // remove from local list @@ -319,8 +358,7 @@ void tb_pass_kill_node(TB_Passes* restrict p, TB_Node* n) { n->inputs[i] = NULL; } - n->users = NULL; - + // assert(n->users == NULL && "we can't kill nodes with users, that's fucking rude"); n->input_count = 0; n->type = TB_NULL; } @@ -385,80 +423,54 @@ void tb_pass_mark_users(TB_Passes* restrict p, TB_Node* n) { TB_NodeTypeEnum type = 
use->n->type; // tuples changing means their projections did too. - if (use->n->dt.type == TB_TUPLE || type == TB_PROJ) { + if (type == TB_PROJ || type == TB_DEAD) { tb_pass_mark_users(p, use->n); } - // if the store is changed, the users (potential loads) should be notified. - // (br (cmp ...)) - if (type == TB_CMP_NE || type == TB_CMP_EQ || type == TB_STORE) { + // (br (cmp a b)) => ... + if (type >= TB_CMP_EQ && type <= TB_CMP_FLE) { tb_pass_mark_users_raw(p, use->n); } - - if (type == TB_REGION) { - tb_pass_mark_users_raw(p, use->n); - - TB_NodeRegion* r = TB_NODE_GET_EXTRA(use->n); - TB_Node* end = r->end; - if (end->type == TB_BRANCH) { - tb_pass_mark(p, end); - - // mark direct successors - TB_NodeBranch* br_info = TB_NODE_GET_EXTRA(end); - FOREACH_N(i, 0, br_info->succ_count) { - tb_pass_mark(p, br_info->succ[i]); - } - } - } } } -static void push_all_bb(Worklist* restrict ws, DynArray(TB_Node*)* stack_ptr, TB_Node* root) { - if (worklist_test_n_set(ws, root)) { - return; - } - - // walk control edges (aka predecessors) - assert(root->type == TB_START || root->type == TB_REGION); - TB_NodeRegion* r = TB_NODE_GET_EXTRA(root); - TB_Node* end = r->end; - - if (end->type == TB_BRANCH) { - TB_NodeBranch* br = TB_NODE_GET_EXTRA(end); - FOREACH_REVERSE_N(i, 0, br->succ_count) { - push_all_bb(ws, stack_ptr, br->succ[i]); +static void push_all_nodes(TB_Passes* restrict p, Worklist* restrict ws, TB_Function* f) { + CUIK_TIMED_BLOCK("push_all_nodes") { + DynArray(TB_Node*) stack = p->stack; + if (stack == NULL) { + stack = dyn_array_create(TB_Node*, 1024); } - } - DynArray(TB_Node*) stack = *stack_ptr; + // push all nodes using the terminator list + DynArray(TB_Node*) terminators = f->terminators; + dyn_array_for(i, terminators) { + TB_Node* end = terminators[i]; - // place endpoint, we'll construct the rest from there - worklist_test_n_set(ws, end); - dyn_array_put(stack, end); + // place endpoint, we'll construct the rest from there + if (worklist_test_n_set(ws, end)) { + // already processed + continue; + } - while (dyn_array_length(stack)) { - TB_Node* n = dyn_array_pop(stack); + dyn_array_put(stack, end); - // place self first - dyn_array_put(ws->items, n); + while (dyn_array_length(stack)) { + TB_Node* n = dyn_array_pop(stack); - // push inputs - FOREACH_N(i, 0, n->input_count) { - TB_Node* in = n->inputs[i]; - if (in && !worklist_test_n_set(ws, in)) { - dyn_array_put(stack, in); + // place self first + dyn_array_put(ws->items, n); + + // push inputs + FOREACH_N(i, 0, n->input_count) { + TB_Node* in = n->inputs[i]; + if (in && !worklist_test_n_set(ws, in)) { + dyn_array_put(stack, in); + } + } } } - } - *stack_ptr = stack; -} - -static void push_all_nodes(Worklist* restrict ws, TB_Node* root) { - CUIK_TIMED_BLOCK("push_all_nodes") { - DynArray(TB_Node*) stack = dyn_array_create(TB_Node*, 1024); - push_all_bb(ws, &stack, root); - dyn_array_destroy(stack); + p->stack = stack; } } @@ -491,13 +503,13 @@ void print_node_sexpr(TB_Node* n, int depth) { printf("sym%p", sym); } } else if (depth >= 1) { - printf("(v%zu: %s", n->gvn, tb_node_get_name(n)); + printf("(v%u: %s", n->gvn, tb_node_get_name(n)); cool_print_type(n); printf(" ...)"); } else { depth -= (n->type == TB_PROJ); - printf("(%s", tb_node_get_name(n)); + printf("(v%u: %s", n->gvn, tb_node_get_name(n)); cool_print_type(n); FOREACH_N(i, 0, n->input_count) if (n->inputs[i]) { if (i == 0) printf(" @"); @@ -526,10 +538,6 @@ void print_node_sexpr(TB_Node* n, int depth) { // Returns NULL or a modified node (could be the same node, we can 
stitch it back into place) static TB_Node* idealize(TB_Passes* restrict p, TB_Function* f, TB_Node* n, TB_PeepholeFlags flags) { switch (n->type) { - case TB_NOT: - case TB_NEG: - return ideal_int_unary(p, f, n); - // integer ops case TB_AND: case TB_OR: @@ -574,13 +582,8 @@ static TB_Node* idealize(TB_Passes* restrict p, TB_Function* f, TB_Node* n, TB_P case TB_SIGN_EXT: case TB_ZERO_EXT: return ideal_extension(p, f, n); - - case TB_INT2PTR: - return ideal_int2ptr(p, f, n); - - // truncate - case TB_TRUNCATE: - return ideal_truncate(p, f, n); + case TB_BITCAST: + return ideal_bitcast(p, f, n); case TB_CALL: return ideal_libcall(p, f, n); @@ -664,6 +667,88 @@ static TB_Node* identity(TB_Passes* restrict p, TB_Function* f, TB_Node* n, TB_P } } +// computes the type of a node based on it's inputs +static Lattice* dataflow(TB_Passes* restrict p, LatticeUniverse* uni, TB_Node* n) { + switch (n->type) { + case TB_INTEGER_CONST: { + TB_NodeInt* num = TB_NODE_GET_EXTRA(n); + return lattice_intern(&p->universe, (Lattice){ LATTICE_INT, ._int = { num->value, num->value, ~num->value, num->value } }); + } + + case TB_LOCAL: + case TB_SYMBOL: + return lattice_intern(uni, (Lattice){ LATTICE_POINTER, ._ptr = { LATTICE_KNOWN_NOT_NULL } }); + + case TB_INT2PTR: + return dataflow_int2ptr(p, uni, n); + + case TB_TRUNCATE: + return dataflow_trunc(p, uni, n); + + case TB_ZERO_EXT: + return dataflow_zext(p, uni, n); + + case TB_SIGN_EXT: + return dataflow_sext(p, uni, n); + + case TB_NEG: + case TB_NOT: + return dataflow_unary(p, uni, n); + + case TB_AND: + case TB_OR: + case TB_XOR: + return dataflow_bits(p, uni, n); + + case TB_ADD: + case TB_SUB: + case TB_MUL: + return dataflow_arith(p, uni, n); + + case TB_SHL: + case TB_SHR: + return dataflow_shift(p, uni, n); + + // meet all inputs + case TB_PHI: { + Lattice* l = lattice_universe_get(uni, n->inputs[1]); + FOREACH_N(i, 2, n->input_count) { + l = lattice_meet(uni, l, lattice_universe_get(uni, n->inputs[i])); + } + return l; + } + + default: return NULL; + } +} + +// converts constant Lattice into constant node +static TB_Node* try_as_const(TB_Passes* restrict p, TB_Node* n, Lattice* l) { + // already a constant? 
+ if (n->type == TB_INTEGER_CONST || n->type == TB_FLOAT32_CONST || n->type == TB_FLOAT64_CONST) { + return NULL; + } + + switch (l->tag) { + case LATTICE_INT: { + // degenerate range + if (l->_int.min == l->_int.max) { + return make_int_node(p->f, p, n->dt, l->_int.max); + } + + // all bits are known + uint64_t mask = tb__mask(n->dt.data); + if ((l->_int.known_zeros | l->_int.known_ones) == mask) { + return make_int_node(p->f, p, n->dt, l->_int.known_ones); + } + + return NULL; + } + + default: return NULL; + } +} + static bool is_terminator(TB_Node* n) { return n->type == TB_BRANCH || n->type == TB_END || n->type == TB_TRAP || n->type == TB_UNREACHABLE; } @@ -676,15 +761,52 @@ static TB_Node* unsafe_get_region(TB_Node* n) { return n; } +static void validate_node_users(TB_Node* n) { + if (n != NULL) { + for (User* use = n->users; use; use = use->next) { + tb_assert(use->n->inputs[use->slot] == n, "Mismatch between def-use and use-def data"); + } + } +} + +static void print_lattice(Lattice* l, TB_DataType dt) { + switch (l->tag) { + case LATTICE_INT: + assert(dt.type == TB_INT); + printf("[%"PRId64, tb__sxt(l->_int.min, dt.data, 64)); + // printf("[%#"PRIx64, l->_int.min); + if (l->_int.min != l->_int.max) { + // printf(" - %#"PRIx64, l->_int.max); + printf(" - %"PRId64, tb__sxt(l->_int.max, dt.data, 64)); + } + + uint64_t known = l->_int.known_zeros | l->_int.known_ones; + if (known && known != UINT64_MAX) { + printf("; zeros=%#"PRIx64", ones=%#"PRIx64, l->_int.known_zeros, l->_int.known_ones); + } + printf("]"); + break; + + case LATTICE_POINTER: { + static const char* tri[] = { "unknown", "null", "~null" }; + printf("[%s]", tri[l->_ptr.trifecta]); + break; + } + + default: + break; + } +} + static bool peephole(TB_Passes* restrict p, TB_Function* f, TB_Node* n, TB_PeepholeFlags flags) { // must've dead sometime between getting scheduled and getting // here. - if (n->type != TB_END && find_users(p, n) == NULL) { + if (n->type != TB_END && n->users == NULL) { return false; } DO_IF(TB_OPTDEBUG_STATS)(p->stats.peeps++); - DO_IF(TB_OPTDEBUG_PEEP)(printf("peep v%zu? ", n->gvn), print_node_sexpr(n, 0)); + DO_IF(TB_OPTDEBUG_PEEP)(printf("peep t=%d? ", p->stats.time++), print_node_sexpr(n, 0)); // idealize node (in a loop of course) TB_Node* k = idealize(p, f, n, flags); @@ -698,7 +820,6 @@ static bool peephole(TB_Passes* restrict p, TB_Function* f, TB_Node* n, TB_Peeph // transfer users from n -> k if (n != k) { - tb_assert(!is_terminator(n), "can't peephole a branch into a new branch"); subsume_node(p, f, n, k); n = k; } @@ -708,6 +829,34 @@ static bool peephole(TB_Passes* restrict p, TB_Function* f, TB_Node* n, TB_Peeph DO_IF(TB_OPTDEBUG_PEEP)(if (++loop_count > 10) { log_warn("%p: we looping a lil too much dawg...", n); }); } + // generate fancier type + if (n->dt.type >= TB_INT && n->dt.type <= TB_PTR) { + // no type provided? just make a not-so-form fitting TOP + Lattice* new_type = dataflow(p, &p->universe, n); + if (new_type == NULL) { + new_type = lattice_top(&p->universe, n->dt); + DO_IF(TB_OPTDEBUG_PEEP)(printf(" => \x1b[93mTOP\x1b[0m")); + } else { + // print fancy type + DO_IF(TB_OPTDEBUG_PEEP)(printf(" => \x1b[93m"), print_lattice(new_type, n->dt), printf("\x1b[0m")); + } + + // types that consist of one possible value are made into value constants. 
+ k = try_as_const(p, n, new_type); + if (k != NULL) { + DO_IF(TB_OPTDEBUG_PEEP)(printf(" => \x1b[96m"), print_node_sexpr(k, 0), printf("\x1b[0m")); + + subsume_node(p, f, n, k); + + // because certain optimizations apply when things are merged + // we mark ALL users including the ones who didn't get changed. + tb_pass_mark_users(p, k); + return k; + } else { + lattice_universe_map(&p->universe, n, new_type); + } + } + // convert into matching identity k = identity(p, f, n, flags); if (n != k) { @@ -719,11 +868,11 @@ static bool peephole(TB_Passes* restrict p, TB_Function* f, TB_Node* n, TB_Peeph return k; } - // common subexpression elim - k = nl_hashset_put2(&p->cse_nodes, n, cse_hash, cse_compare); + // global value numbering + k = nl_hashset_put2(&p->gvn_nodes, n, gvn_hash, gvn_compare); if (k && (k != n)) { - DO_IF(TB_OPTDEBUG_STATS)(p->stats.cse_hit++); - DO_IF(TB_OPTDEBUG_PEEP)(printf(" => \x1b[31mCSE\x1b[0m")); + DO_IF(TB_OPTDEBUG_STATS)(p->stats.gvn_hit++); + DO_IF(TB_OPTDEBUG_PEEP)(printf(" => \x1b[31mGVN\x1b[0m")); subsume_node(p, f, n, k); @@ -732,14 +881,14 @@ static bool peephole(TB_Passes* restrict p, TB_Function* f, TB_Node* n, TB_Peeph tb_pass_mark_users(p, k); return k; } else { - DO_IF(TB_OPTDEBUG_STATS)(p->stats.cse_miss++); + DO_IF(TB_OPTDEBUG_STATS)(p->stats.gvn_miss++); } return n; } static void subsume_node(TB_Passes* restrict p, TB_Function* f, TB_Node* n, TB_Node* new_n) { - User* use = find_users(p, n); + User* use = n->users; while (use != NULL) { tb_assert(use->n->inputs[use->slot] == n, "Mismatch between def-use and use-def data"); @@ -763,8 +912,8 @@ static void generate_use_lists(TB_Passes* restrict p, TB_Function* f) { dyn_array_put(p->locals, n); } - FOREACH_N(i, 0, n->input_count) if (n->inputs[i]) { - add_user(p, n, n->inputs[i], i, NULL); + FOREACH_N(j, 0, n->input_count) if (n->inputs[j]) { + add_user(p, n, n->inputs[j], j, NULL); } } } @@ -782,16 +931,10 @@ TB_Passes* tb_pass_enter(TB_Function* f, TB_Arena* arena) { worklist_alloc(&p->worklist, f->node_count); - // generate early doms - CUIK_TIMED_BLOCK("doms") { - size_t block_count = tb_push_postorder(f, &p->worklist); - tb_compute_dominators(f, block_count, p->worklist.items); - worklist_clear(&p->worklist); - } - // generate work list (put everything) CUIK_TIMED_BLOCK("gen worklist") { - push_all_nodes(&p->worklist, f->start_node); + push_all_nodes(p, &p->worklist, f); + DO_IF(TB_OPTDEBUG_STATS)(p->stats.initial = worklist_popcount(&p->worklist)); } @@ -805,6 +948,23 @@ TB_Passes* tb_pass_enter(TB_Function* f, TB_Arena* arena) { return p; } +void tb_pass_sroa(TB_Passes* p) { + cuikperf_region_start("sroa", NULL); + verify_tmp_arena(p); + + TB_Function* f = p->f; + + int pointer_size = tb__find_code_generator(f->super.module)->pointer_size; + TB_Node* start = f->start_node; + + size_t i = 0; + while (i < dyn_array_length(p->locals)) { + i += sroa_rewrite(p, pointer_size, start, p->locals[i]); + } + + cuikperf_region_end(); +} + void tb_pass_optimize(TB_Passes* p) { tb_pass_peephole(p, TB_PEEPHOLE_ALL); tb_pass_sroa(p); @@ -816,8 +976,51 @@ void tb_pass_optimize(TB_Passes* p) { void tb_pass_peephole(TB_Passes* p, TB_PeepholeFlags flags) { verify_tmp_arena(p); - if (p->cse_nodes.data == NULL) { - p->cse_nodes = nl_hashset_alloc(p->f->node_count); + if (p->gvn_nodes.data == NULL) { + p->gvn_nodes = nl_hashset_alloc(p->f->node_count); + } + + // make sure we have space for the lattice universe + if (p->universe.arena == NULL) { + TB_ThreadInfo* info = tb_thread_info(p->f->super.module); + if 
(info->type_arena.chunk_size == 0) { + // make new arena + tb_arena_create(&info->type_arena, TB_ARENA_LARGE_CHUNK_SIZE); + } + + size_t count = p->f->node_count; + p->universe.arena = &info->type_arena; + p->universe.pool = nl_hashset_alloc(64); + p->universe.type_cap = count; + p->universe.types = tb_platform_heap_alloc(count * sizeof(Lattice*)); + memset(p->universe.types, 0, count * sizeof(Lattice*)); + + // generate early doms + CUIK_TIMED_BLOCK("doms") { + TB_Function* f = p->f; + + Worklist tmp_ws = { 0 }; + worklist_alloc(&tmp_ws, (f->node_count / 4) + 4); + + TB_CFG cfg = tb_compute_rpo2(f, &tmp_ws, &p->stack); + tb_compute_dominators2(f, &tmp_ws, cfg); + + // mark IDOM for each "BB" node + FOREACH_N(i, 0, cfg.block_count) { + // entry block should be marked as dominated by NULL, to make it easy + // to end the iteration of a dom chain. + TB_Node* dom = NULL; + if (i != 0) { + dom = nl_map_get_checked(cfg.node_to_block, tmp_ws.items[i]).dom; + } + + Lattice* l = lattice_ctrl(&p->universe, dom); + lattice_universe_map(&p->universe, tmp_ws.items[i], l); + } + + worklist_free(&tmp_ws); + tb_free_cfg(&cfg); + } } TB_Function* f = p->f; @@ -832,25 +1035,39 @@ void tb_pass_peephole(TB_Passes* p, TB_PeepholeFlags flags) { } void tb_pass_exit(TB_Passes* p) { + verify_tmp_arena(p); + TB_Function* f = p->f; + // terminators will be made obselete by the optimizer + dyn_array_destroy(f->terminators); + + // tb_function_print(f, tb_default_print_callback, stdout); + #if TB_OPTDEBUG_STATS - push_all_nodes(&p->worklist, f->start_node); + push_all_nodes(p, &p->worklist, f); int final_count = worklist_popcount(&p->worklist); double factor = ((double) final_count / (double) p->stats.initial) * 100.0; printf("%s: stats:\n", f->super.name); printf(" %4d -> %4d nodes (%.2f%%)\n", p->stats.initial, final_count, factor); - printf(" %4d CSE hit %4d CSE miss\n", p->stats.cse_hit, p->stats.cse_miss); + printf(" %4d GVN hit %4d GVN miss\n", p->stats.gvn_hit, p->stats.gvn_miss); printf(" %4d peepholes %4d rewrites %4d identities\n", p->stats.peeps, p->stats.rewrites, p->stats.identities); #endif - verify_tmp_arena(p); - + nl_map_free(p->scheduled); worklist_free(&p->worklist); - nl_hashset_free(p->cse_nodes); + nl_hashset_free(p->gvn_nodes); + dyn_array_destroy(p->stack); dyn_array_destroy(p->locals); + if (p->universe.arena != NULL) { + tb_arena_clear(p->universe.arena); + nl_hashset_free(p->universe.pool); + tb_platform_heap_free(p->universe.types); + } + tb_arena_clear(tmp_arena); + tb_platform_heap_free(p); } diff --git a/tb/src/opt/print.h b/tb/src/opt/print.h index ef41b6c9..f12e8808 100644 --- a/tb/src/opt/print.h +++ b/tb/src/opt/print.h @@ -2,12 +2,10 @@ typedef struct { TB_Passes* opt; TB_Function* f; - size_t block_count; + TB_CFG cfg; } PrinterCtx; static void print_type(TB_DataType dt) { - assert(dt.width < 8 && "Vector width too big!"); - switch (dt.type) { case TB_INT: { if (dt.data == 0) printf("void"); @@ -53,13 +51,13 @@ static void print_ref_to_node(PrinterCtx* ctx, TB_Node* n, bool def) { if (def) { printf("("); TB_Node** params = ctx->f->params; - FOREACH_N(i, 1, 1 + ctx->f->param_count) { + FOREACH_N(i, 1, 3 + ctx->f->param_count) { if (i > 1) printf(", "); if (params[i] == NULL) { printf("_"); } else { - printf("v%zu: ", params[i]->gvn); + printf("v%u: ", params[i]->gvn); print_type(params[i]->dt); } } @@ -70,7 +68,7 @@ static void print_ref_to_node(PrinterCtx* ctx, TB_Node* n, bool def) { if (r->tag != NULL) { printf(".%s", r->tag); } else { - ptrdiff_t i = 
try_find_traversal_index(n); + ptrdiff_t i = try_find_traversal_index(&ctx->cfg, n); if (i >= 0) { printf(".bb%zu", i); } else { @@ -90,6 +88,17 @@ static void print_ref_to_node(PrinterCtx* ctx, TB_Node* n, bool def) { } else { printf("sym%p", sym); } + } else if (n->type == TB_PROJ && n->dt.type == TB_CONTROL) { + if (n->inputs[0]->type == TB_START) { + print_ref_to_node(ctx, n->inputs[0], def); + } else { + ptrdiff_t i = try_find_traversal_index(&ctx->cfg, n); + if (i >= 0) { + printf(".bb%zu", i); + } else { + printf("*DEAD*"); + } + } } else if (n->type == TB_ZERO_EXT) { printf("(zxt."); print_type(n->dt); @@ -111,7 +120,7 @@ static void print_ref_to_node(PrinterCtx* ctx, TB_Node* n, bool def) { printf("%#0"PRIx64, num->value); } } else { - printf("v%llu", (long long unsigned) n->gvn); + printf("v%u", n->gvn); } } @@ -129,40 +138,39 @@ static void print_location(TB_Function* f, TB_Node* n) { } } -static void print_bb(PrinterCtx* ctx, TB_Node* bb) { - assert(bb->type == TB_START || bb->type == TB_REGION); - print_ref_to_node(ctx, bb, true); +static void print_bb(PrinterCtx* ctx, TB_Node* bb_start) { + print_ref_to_node(ctx, bb_start, true); printf(":"); // print predecessors - if (bb->input_count > 0) { + if (!(bb_start->type == TB_PROJ && bb_start->inputs[0]->type == TB_START) && bb_start->input_count > 0) { printf(" # preds: "); - FOREACH_N(j, 0, bb->input_count) { - print_ref_to_node(ctx, tb_get_parent_region(bb->inputs[j]), false); + FOREACH_N(j, 0, bb_start->input_count) { + print_ref_to_node(ctx, get_pred(bb_start, j), false); printf(" "); } } - if (ctx->opt->error_n == bb) { + if (ctx->opt->error_n == bb_start) { printf("\x1b[31m <-- ERROR\x1b[0m"); } printf("\n"); - TB_Node* end = TB_NODE_GET_EXTRA_T(bb, TB_NodeRegion)->end; + TB_BasicBlock* bb = nl_map_get_checked(ctx->opt->scheduled, bb_start); Worklist* ws = &ctx->opt->worklist; - sched_walk(ctx->opt, ws, NULL, bb, end); - assert(ws->items[ctx->block_count] == bb); + sched_walk(ctx->opt, ws, NULL, bb, bb->end, true); TB_Node* prev_effect = NULL; - FOREACH_N(i, ctx->block_count + 1, dyn_array_length(ws->items)) { + FOREACH_N(i, ctx->cfg.block_count, dyn_array_length(ws->items)) { TB_Node* n = ws->items[i]; // skip these if (n->type == TB_INTEGER_CONST || n->type == TB_FLOAT32_CONST || n->type == TB_FLOAT64_CONST || n->type == TB_SYMBOL || n->type == TB_SIGN_EXT || n->type == TB_ZERO_EXT || - n->type == TB_PROJ) { + n->type == TB_PROJ || n->type == TB_START || + n->type == TB_REGION || n->type == TB_NULL) { continue; } @@ -176,9 +184,20 @@ static void print_bb(PrinterCtx* ctx, TB_Node* bb) { case TB_BRANCH: { TB_NodeBranch* br = TB_NODE_GET_EXTRA(n); + TB_ArenaSavepoint sp = tb_arena_save(tmp_arena); + TB_Node** restrict succ = tb_arena_alloc(tmp_arena, br->succ_count * sizeof(TB_Node**)); + + // fill successors + for (User* u = n->users; u; u = u->next) { + if (u->n->type == TB_PROJ) { + int index = TB_NODE_GET_EXTRA_T(u->n, TB_NodeProj)->index; + succ[index] = cfg_next_bb_after_cproj(u->n); + } + } + if (br->succ_count == 1) { printf(" goto "); - print_ref_to_node(ctx, br->succ[0], false); + print_ref_to_node(ctx, succ[0], false); } else if (br->succ_count == 2) { printf(" if "); FOREACH_N(i, 1, n->input_count) { @@ -190,9 +209,9 @@ static void print_bb(PrinterCtx* ctx, TB_Node* bb) { } else { printf(" != %"PRId64" then ", br->keys[0]); } - print_ref_to_node(ctx, br->succ[0], false); + print_ref_to_node(ctx, succ[0], false); printf(" else "); - print_ref_to_node(ctx, br->succ[1], false); + print_ref_to_node(ctx, succ[1], 
false); } else { printf(" br "); FOREACH_N(i, 1, n->input_count) { @@ -205,11 +224,12 @@ static void print_bb(PrinterCtx* ctx, TB_Node* bb) { if (i != 0) printf(" %"PRId64": ", br->keys[i - 1]); else printf(" default: "); - print_ref_to_node(ctx, br->succ[i], false); + print_ref_to_node(ctx, succ[i], false); printf("\n"); } printf(" }"); } + tb_arena_restore(tmp_arena, sp); break; } @@ -233,7 +253,7 @@ static void print_bb(PrinterCtx* ctx, TB_Node* bb) { TB_Node* projs[4]; for (size_t i = 0; i < 4; i++) projs[i] = NULL; - for (User* use = find_users(ctx->opt, n); use; use = use->next) { + for (User* use = n->users; use; use = use->next) { if (use->n->type == TB_PROJ) { int index = TB_NODE_GET_EXTRA_T(use->n, TB_NodeProj)->index; projs[index] = use->n; @@ -246,7 +266,7 @@ static void print_bb(PrinterCtx* ctx, TB_Node* bb) { FOREACH_N(i, first, 4) { if (projs[i] == NULL) break; if (i > first) printf(", "); - printf("v%zu", projs[i]->gvn); + printf("v%u", projs[i]->gvn); } printf(" = %s.(", tb_node_get_name(n)); FOREACH_N(i, first, 4) { @@ -260,7 +280,7 @@ static void print_bb(PrinterCtx* ctx, TB_Node* bb) { if (n->dt.type == TB_INT && n->dt.data == 0) { printf(" %s.", tb_node_get_name(n)); } else { - printf(" v%zu = %s.", n->gvn, tb_node_get_name(n)); + printf(" v%u = %s.", n->gvn, tb_node_get_name(n)); } TB_DataType dt = n->dt; @@ -394,37 +414,34 @@ static void print_bb(PrinterCtx* ctx, TB_Node* bb) { printf("\n"); } - dyn_array_set_length(ws->items, ctx->block_count); + dyn_array_set_length(ws->items, ctx->cfg.block_count); + + if (bb->end->type != TB_END && + bb->end->type != TB_TRAP && + bb->end->type != TB_BRANCH && + bb->end->type != TB_UNREACHABLE) { + printf(" goto "); + print_ref_to_node(ctx, cfg_next_control(bb->end), false); + printf("\n"); + } } bool tb_pass_print(TB_Passes* opt) { TB_Function* f = opt->f; - - // schedule nodes - tb_pass_schedule(opt); - - PrinterCtx ctx = { opt, f }; worklist_clear(&opt->worklist); - ctx.block_count = tb_push_postorder(f, &opt->worklist); - TB_Node* stop_bb = get_block_begin(f->stop_node); + PrinterCtx ctx = { opt, f }; + ctx.cfg = tb_compute_rpo(f, opt); + // schedule nodes + tb_pass_schedule(opt, ctx.cfg); worklist_clear_visited(&opt->worklist); - bool has_stop = false; - FOREACH_REVERSE_N(i, 0, ctx.block_count) { - TB_Node* bb = opt->worklist.items[i]; - if (bb != stop_bb) { - print_bb(&ctx, bb); - } else { - has_stop = true; - } - } - - if (has_stop) { - print_bb(&ctx, stop_bb); + FOREACH_N(i, 0, ctx.cfg.block_count) { + print_bb(&ctx, opt->worklist.items[i]); } + tb_free_cfg(&ctx.cfg); ctx.opt->error_n = NULL; return false; } diff --git a/tb/src/opt/scheduler.h b/tb/src/opt/scheduler.h index 452f80e7..29195b5b 100644 --- a/tb/src/opt/scheduler.h +++ b/tb/src/opt/scheduler.h @@ -2,19 +2,7 @@ // sort which is anti-dependency aware, a future TB could implement multiple schedulers. // // Once the worklist is filled, you can walk backwards and generate instructions accordingly. 
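As a rough illustration of the comment above, this is a self-contained sketch of the greedy walk (ToyNode and ToySchedule are invented stand-ins, not TB types): a node is appended only after all of its inputs have been appended, so the resulting list is a valid topological order of the block. The real sched_walk below layers PHI scheduling and anti-dependency handling on top of this skeleton.

#include <stdbool.h>
#include <stddef.h>

enum { TOY_MAX_NODES = 64 };

typedef struct ToyNode {
    int id;                      // 0 .. TOY_MAX_NODES-1
    int input_count;
    struct ToyNode* inputs[4];
} ToyNode;

typedef struct {
    ToyNode* items[TOY_MAX_NODES];
    int count;
    bool visited[TOY_MAX_NODES];
} ToySchedule;

static void toy_sched_walk(ToySchedule* s, ToyNode* n) {
    if (n == NULL || s->visited[n->id]) return;
    s->visited[n->id] = true;

    // operands first: everything a node reads must already be in the list
    for (int i = 0; i < n->input_count; i++) {
        toy_sched_walk(s, n->inputs[i]);
    }

    // then the node itself, so items[] ends up in topological order
    s->items[s->count++] = n;
}

Walking the finished items list backwards then yields a legal instruction emission order, which is what the header comment alludes to.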
-static bool is_same_bb(TB_Node* bb, TB_Node* n) { - if (n->type != TB_START && n->inputs[0] == NULL) { - return false; - } - - while (n->type != TB_START && n->type != TB_REGION) { - n = n->inputs[0]; - } - - return n == bb; -} - -static void sched_walk_phi(TB_Passes* passes, Worklist* ws, DynArray(PhiVal)* phi_vals, TB_Node* bb, TB_Node* phi, size_t phi_index) { +static void sched_walk_phi(TB_Passes* passes, Worklist* ws, DynArray(PhiVal)* phi_vals, TB_BasicBlock* bb, TB_Node* phi, size_t phi_index) { TB_Node* val = phi->inputs[1 + phi_index]; // reserve PHI space @@ -27,26 +15,25 @@ static void sched_walk_phi(TB_Passes* passes, Worklist* ws, DynArray(PhiVal)* ph dyn_array_put(*phi_vals, p); } - sched_walk(passes, ws, phi_vals, bb, val); + sched_walk(passes, ws, phi_vals, bb, val, false); } -void sched_walk(TB_Passes* passes, Worklist* ws, DynArray(PhiVal)* phi_vals, TB_Node* bb, TB_Node* n) { - if (!is_same_bb(bb, n) || worklist_test_n_set(ws, n)) { +void sched_walk(TB_Passes* passes, Worklist* ws, DynArray(PhiVal)* phi_vals, TB_BasicBlock* bb, TB_Node* n, bool is_end) { + ptrdiff_t search = nl_map_get(passes->scheduled, n); + if (search < 0 || passes->scheduled[search].v != bb || worklist_test_n_set(ws, n)) { return; } // if we're a branch, push our PHI nodes - if (n->type == TB_BRANCH) { - TB_NodeBranch* br = TB_NODE_GET_EXTRA(n); - TB_Node** succ = br->succ; - - FOREACH_N(i, 0, br->succ_count) { - TB_Node* dst = br->succ[i]; + if (is_end) { + for (User* u = n->users; u; u = u->next) { + if (!cfg_is_control(u->n)) continue; + TB_Node* dst = cfg_next_region_control(u->n); // find predecessor index and do that edge ptrdiff_t phi_index = -1; FOREACH_N(j, 0, dst->input_count) { - TB_Node* pred = unsafe_get_region(dst->inputs[j]); + TB_BasicBlock* pred = nl_map_get_checked(passes->scheduled, dst->inputs[j]); if (pred == bb) { phi_index = j; @@ -56,7 +43,7 @@ void sched_walk(TB_Passes* passes, Worklist* ws, DynArray(PhiVal)* phi_vals, TB_ if (phi_index < 0) continue; // schedule memory PHIs - for (User* use = find_users(passes, dst); use; use = use->next) { + for (User* use = dst->users; use; use = use->next) { TB_Node* phi = use->n; if (phi->type == TB_PHI && phi->dt.type == TB_MEMORY) { sched_walk_phi(passes, ws, phi_vals, bb, phi, phi_index); @@ -64,7 +51,7 @@ void sched_walk(TB_Passes* passes, Worklist* ws, DynArray(PhiVal)* phi_vals, TB_ } // schedule data PHIs, we schedule these afterwards because it's "generally" better - for (User* use = find_users(passes, dst); use; use = use->next) { + for (User* use = dst->users; use; use = use->next) { TB_Node* phi = use->n; if (phi->type == TB_PHI && phi->dt.type != TB_MEMORY) { sched_walk_phi(passes, ws, phi_vals, bb, phi, phi_index); @@ -75,25 +62,24 @@ void sched_walk(TB_Passes* passes, Worklist* ws, DynArray(PhiVal)* phi_vals, TB_ // push inputs FOREACH_REVERSE_N(i, 0, n->input_count) if (n->inputs[i]) { - sched_walk(passes, ws, phi_vals, bb, n->inputs[i]); + sched_walk(passes, ws, phi_vals, bb, n->inputs[i], false); } // before the terminator we should eval leftovers that GCM linked here - if (is_block_end(n)) { - TB_Node* parent = get_block_begin(n); - for (User* use = find_users(passes, parent); use; use = use->next) { - sched_walk(passes, ws, phi_vals, bb, use->n); + if (is_end) { + nl_hashset_for(entry, &bb->items) { + sched_walk(passes, ws, phi_vals, bb, *entry, false); } } dyn_array_put(ws->items, n); - if (is_mem_out_op(n)) { + if (is_mem_out_op(n) && n->type != TB_PHI && n->type != TB_PROJ) { // memory effects have 
anti-dependencies, the previous loads // must finish before the next memory effect is applied. for (User* use = find_users(passes, n->inputs[1]); use; use = use->next) { if (use->slot == 1 && use->n != n) { - sched_walk(passes, ws, phi_vals, bb, use->n); + sched_walk(passes, ws, phi_vals, bb, use->n, false); } } } @@ -103,7 +89,7 @@ void sched_walk(TB_Passes* passes, Worklist* ws, DynArray(PhiVal)* phi_vals, TB_ for (User* use = find_users(passes, n); use; use = use->next) { TB_Node* use_n = use->n; if (use_n->type == TB_PROJ) { - sched_walk(passes, ws, phi_vals, bb, use_n); + sched_walk(passes, ws, phi_vals, bb, use_n, false); } } } diff --git a/tb/src/opt/sroa.h b/tb/src/opt/sroa.h new file mode 100644 index 00000000..56f7f1ce --- /dev/null +++ b/tb/src/opt/sroa.h @@ -0,0 +1,122 @@ + + +typedef struct { + TB_Node* old_n; + + int64_t offset; + TB_CharUnits size; + TB_DataType dt; +} AggregateConfig; + +static ptrdiff_t find_config(size_t config_count, AggregateConfig* configs, int64_t offset) { + FOREACH_N(i, 0, config_count) { + if (configs[i].offset == offset) return i; + } + + tb_unreachable(); + return -1; +} + +// -1 is a bad match +// -2 is no match, so we can add a new config +static ptrdiff_t compatible_with_configs(size_t config_count, AggregateConfig* configs, int64_t offset, TB_CharUnits size, TB_DataType dt) { + int64_t max = offset + size; + + FOREACH_N(i, 0, config_count) { + int64_t max2 = configs[i].offset + configs[i].size; + + if (offset >= configs[i].offset && max <= max2) { + // they overlap... but is it a clean overlap? + if (offset == configs[i].offset && max == max2 && TB_DATA_TYPE_EQUALS(dt, configs[i].dt)) { + return i; + } + + return -1; + } + } + + return -2; +} + +// false means failure to SROA +static bool add_configs(TB_Passes* p, TB_TemporaryStorage* tls, User* use, TB_Node* base_address, size_t base_offset, size_t* config_count, AggregateConfig* configs, int pointer_size) { + for (; use; use = use->next) { + TB_Node* n = use->n; + + if (n->type == TB_MEMBER_ACCESS && use->slot == 1) { + // same rules, different offset + int64_t offset = TB_NODE_GET_EXTRA_T(n, TB_NodeMember)->offset; + if (!add_configs(p, tls, find_users(p, n), base_address, base_offset + offset, config_count, configs, pointer_size)) { + return false; + } + continue; + } + + // we can only SROA if we know we're not using the + // address for anything but direct memory ops or TB_MEMBERs. + if (use->slot != 2) { + return false; + } + + // find direct memory op + if (n->type != TB_LOAD && n->type != TB_STORE) { + return false; + } + + TB_DataType dt = n->type == TB_LOAD ? 
n->dt : n->inputs[3]->dt; + TB_Node* address = n->inputs[2]; + int size = (bits_in_data_type(pointer_size, dt) + 7) / 8; + + // see if it's a compatible configuration + int match = compatible_with_configs(*config_count, configs, base_offset, size, dt); + if (match == -1) { + return false; + } else if (match == -2) { + // add new config + tb_tls_push(tls, sizeof(AggregateConfig)); + configs[(*config_count)++] = (AggregateConfig){ address, base_offset, size, dt }; + } else if (configs[match].old_n != address) { + log_warn("%s: v%u SROA config matches but reaches so via a different node, please idealize nodes before mem2reg", p->f->super.name, address->gvn); + return false; + } + } + + return true; +} + +static size_t sroa_rewrite(TB_Passes* restrict p, int pointer_size, TB_Node* start, TB_Node* n) { + TB_TemporaryStorage* tls = tb_tls_steal(); + void* mark = tb_tls_push(tls, 0); + + size_t config_count = 0; + AggregateConfig* configs = tb_tls_push(tls, 0); + if (!add_configs(p, tls, n->users, n, 0, &config_count, configs, pointer_size)) { + return 1; + } + + // split allocation into pieces + if (config_count > 1) { + DO_IF(TB_OPTDEBUG_SROA)(printf("sroa v%u => SROA to %zu pieces", n->gvn, config_count)); + + uint32_t alignment = TB_NODE_GET_EXTRA_T(n, TB_NodeLocal)->align; + FOREACH_N(i, 0, config_count) { + TB_Node* new_n = tb_alloc_node(p->f, TB_LOCAL, TB_TYPE_PTR, 1, sizeof(TB_NodeLocal)); + set_input(p, new_n, start, 0); + TB_NODE_SET_EXTRA(new_n, TB_NodeLocal, .size = configs[i].size, .align = alignment); + + // mark all users, there may be some fun new opts now + tb_pass_mark_users(p, configs[i].old_n); + + // replace old pointer with new fancy + subsume_node(p, p->f, configs[i].old_n, new_n); + dyn_array_put(p->locals, new_n); + } + + // we marked the changes else where which is cheating the peephole + // but still doing all the progress it needs to. + tb_pass_mark_users(p, n); + } + + tb_tls_restore(tls, mark); + return config_count > 1 ? 1 + config_count : 1; +} diff --git a/tb/src/passes.h b/tb/src/passes.h index 3eda6fd6..c7787181 100644 --- a/tb/src/passes.h +++ b/tb/src/passes.h @@ -1,17 +1,91 @@ #pragma once #include "tb_internal.h" -#define TB_OPTDEBUG_STATS 0 - -#define TB_OPTDEBUG_PEEP 0 -#define TB_OPTDEBUG_LOOP 0 +#define TB_OPTDEBUG_STATS 0 +#define TB_OPTDEBUG_PEEP 0 +#define TB_OPTDEBUG_LOOP 0 +#define TB_OPTDEBUG_SROA 0 +#define TB_OPTDEBUG_GCM 0 #define TB_OPTDEBUG_MEM2REG 0 #define TB_OPTDEBUG_CODEGEN 0 +#define TB_OPTDEBUG(cond) CONCAT(DO_IF_, CONCAT(TB_OPTDEBUG_, cond)) + #define DO_IF(cond) CONCAT(DO_IF_, cond) #define DO_IF_0(...) #define DO_IF_1(...) __VA_ARGS__ +//////////////////////////////// +// SCCP +//////////////////////////////// +// TODO(NeGate): implement dual? from there i can do join with +// dual(dual(x) ^ dual(y)) = join(x, y) +typedef struct { + int64_t min, max; + + // for known bit analysis + uint64_t known_zeros; + uint64_t known_ones; +} LatticeInt; + +// a simplification of the set of all pointers (or floats) +typedef enum { + LATTICE_UNKNOWN, // top aka {nan, non-nan} or for pointers {null, non-null} + + LATTICE_KNOWN_NAN = 1, // {nan} + LATTICE_KNOWN_NOT_NAN, // {non-nan} + + LATTICE_KNOWN_NULL = 1, // {null} + LATTICE_KNOWN_NOT_NULL // {non-null} +} LatticeTrifecta; + +typedef struct { + LatticeTrifecta trifecta; +} LatticeFloat; + +// TODO(NeGate): we might wanna store more info like aliasing, ownership and alignment. 
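To make the LatticeInt shape above concrete, here is a hedged sketch of how such values behave (my reading of the fields, using toy types rather than TB's actual routines): a constant k is the single-point element [k, k] with every bit known, and meeting two elements widens the range while keeping only the bit facts both sides agree on. This is why try_as_const earlier in this diff can rematerialize a constant once min == max or all bits are pinned.

#include <stdint.h>

typedef struct {
    int64_t  min, max;
    uint64_t known_zeros, known_ones;
} ToyLatticeInt;

// a constant k (masked to the type's width) is a single-point lattice element
static ToyLatticeInt toy_int_const(int64_t k, uint64_t mask) {
    uint64_t x = (uint64_t)k & mask;
    return (ToyLatticeInt){ (int64_t)x, (int64_t)x, ~x & mask, x };
}

// meet: the range can only grow, known bits shrink to what both sides share
static ToyLatticeInt toy_int_meet(ToyLatticeInt a, ToyLatticeInt b) {
    ToyLatticeInt r;
    r.min = a.min < b.min ? a.min : b.min;
    r.max = a.max > b.max ? a.max : b.max;
    r.known_zeros = a.known_zeros & b.known_zeros;
    r.known_ones  = a.known_ones  & b.known_ones;
    return r;
}

// folds back to a constant once the range degenerates or every bit is pinned
static int toy_int_is_const(ToyLatticeInt l, uint64_t mask) {
    return l.min == l.max || (l.known_zeros | l.known_ones) == mask;
}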
+typedef struct { + LatticeTrifecta trifecta; +} LatticePointer; + +typedef struct { + TB_Node* idom; +} LatticeControl; + +// Represents the fancier type system within the optimizer, it's +// all backed by my shitty understanding of lattice theory +typedef struct { + enum { + LATTICE_INT, + LATTICE_FLOAT32, + LATTICE_FLOAT64, + LATTICE_POINTER, + LATTICE_CONTROL, + } tag; + uint32_t pad; + union { + LatticeInt _int; + LatticeFloat _float; + LatticePointer _ptr; + LatticeControl _ctrl; + }; +} Lattice; + +// hash-consing because there's a lot of +// redundant types we might construct. +typedef struct { + TB_Arena* arena; + NL_HashSet pool; + + // track a lattice per node (basically all get one + // so a non-sparse array works) + size_t type_cap; + Lattice** types; +} LatticeUniverse; + +//////////////////////////////// +// CFG +//////////////////////////////// typedef struct { size_t stride; uint64_t arr[]; @@ -32,6 +106,25 @@ typedef struct { int dst, src; } PhiVal; +typedef struct TB_BasicBlock { + TB_Node* dom; + TB_Node* end; + int id, dom_depth; + + TB_Node* mem_in; + NL_HashSet items; +} TB_BasicBlock; + +typedef struct TB_CFG { + size_t block_count; + NL_Map(TB_Node*, TB_BasicBlock) node_to_block; +} TB_CFG; + +typedef NL_Map(TB_Node*, TB_BasicBlock*) TB_Scheduled; + +//////////////////////////////// +// Core optimizer +//////////////////////////////// typedef struct { DynArray(TB_Node*) items; @@ -42,7 +135,7 @@ typedef struct { struct TB_Passes { TB_Function* f; - bool scheduled; + TB_Scheduled scheduled; // we use this to verify that we're on the same thread // for the entire duration of the TB_Passes. @@ -50,43 +143,78 @@ struct TB_Passes { Worklist worklist; + // sometimes we be using arrays of nodes, let's just keep one around for a bit + DynArray(TB_Node*) stack; + // we wanna track locals because it's nice and easy DynArray(TB_Node*) locals; - // this is used to do CSE - NL_HashSet cse_nodes; + // tracks the fancier type system + LatticeUniverse universe; + + // this is used to do GVN + NL_HashSet gvn_nodes; // debug shit: TB_Node* error_n; // nice stats struct { + #if TB_OPTDEBUG_PEEP + int time; + #endif + #if TB_OPTDEBUG_STATS int initial; - int cse_hit, cse_miss; + int gvn_hit, gvn_miss; int peeps, identities, rewrites; #endif } stats; }; -// it's either START, REGION or control node with CONTROL PROJ predecessor -static bool is_block_begin(TB_Node* n) { - // regions also have a CONTROL PROJ so we - // don't need to check them explicitly. 
- return n->type == TB_REGION || (n->type == TB_PROJ && n->inputs[0]->type == TB_START); +static bool cfg_is_terminator(TB_Node* n) { + return n->type == TB_BRANCH || n->type == TB_UNREACHABLE || n->type == TB_TRAP || n->type == TB_END; +} + +// includes tuples which have control flow +static bool cfg_is_control(TB_Node* n) { + // easy case + if (n->dt.type == TB_CONTROL) return true; + if (n->dt.type != TB_TUPLE) return false; + + // harder case is figuring out which tuples have control outputs (without manually + // checking which is annoying and slow) + // + // branch, debugbreak, trap, unreachable, dead OR call, syscall, safepoint + return (n->type >= TB_BRANCH && n->type <= TB_DEAD) || (n->type >= TB_CALL && n->type <= TB_SAFEPOINT_POLL); } -static bool is_block_end(TB_Node* n) { - return n->type == TB_BRANCH; +static bool cfg_is_bb_entry(TB_Node* n) { + if (n->type == TB_REGION) { + return true; + } else if (n->type == TB_PROJ && (n->inputs[0]->type == TB_START || n->inputs[0]->type == TB_BRANCH)) { + // Start's control proj or a branch target + return true; + } else { + return false; + } +} + +static TB_Node* cfg_get_fallthru(TB_Node* n) { + if (n->type == TB_PROJ && n->dt.type == TB_CONTROL) { + // if it's single user and that user is the terminator we can skip it in the fallthrough logic + return n->users->next == NULL && n->users->n->type == TB_REGION ? n->users->n : n; + } else { + return n; + } } static bool is_mem_out_op(TB_Node* n) { - return n->type == TB_END || (n->type >= TB_STORE && n->type <= TB_ATOMIC_CAS) || (n->type == TB_PHI && n->dt.type == TB_MEMORY); + return n->dt.type == TB_MEMORY || (n->type >= TB_STORE && n->type <= TB_ATOMIC_CAS) || (n->type >= TB_CALL && n->type <= TB_SAFEPOINT_POLL); } -// schedule nodes below any of their pinned dependencies static bool is_pinned(TB_Node* n) { - return (n->type >= TB_START && n->type <= TB_SAFEPOINT_POLL) || n->type == TB_PROJ || n->type == TB_LOCAL; + return (n->type >= TB_START && n->type <= TB_SAFEPOINT_POLL) || n->type == TB_PROJ; } static bool is_mem_in_op(TB_Node* n) { @@ -96,31 +224,76 @@ static bool is_mem_in_op(TB_Node* n) { //////////////////////////////// // CFG analysis //////////////////////////////// -static TB_Node* get_block_begin(TB_Node* n) { - while (!is_block_begin(n)) { - n = n->inputs[0]; +// if we see a branch projection, it may either be a BB itself +// or if it enters a REGION directly, then that region is the BB. +static TB_Node* cfg_next_bb_after_cproj(TB_Node* n) { + assert(n->type == TB_PROJ); + return n->users->n->type == TB_REGION ? 
n->users->n : n; +} + +static TB_Node* cfg_next_region_control(TB_Node* n) { + if (n->type != TB_REGION) { + for (User* u = n->users; u; u = u->next) { + if (u->n->type == TB_REGION && u->n->input_count == 1) { + return u->n; + } + } } + return n; } -// shorthand because we use it a lot -static TB_Node* idom(TB_Node* n) { - if (n->type == TB_PROJ) n = n->inputs[0]; +static TB_Node* cfg_next_control(TB_Node* n) { + for (User* u = n->users; u; u = u->next) { + if (cfg_is_control(u->n)) { + return u->n; + } + } - assert(n->type == TB_START || n->type == TB_REGION); - return TB_NODE_GET_EXTRA_T(n, TB_NodeRegion)->dom; + return NULL; } -static int dom_depth(TB_Node* n) { - if (n == NULL) { - return 0; +static TB_Node* get_pred(TB_Node* n, int i) { + TB_Node* base = n; + n = n->inputs[i]; + + if (base->type == TB_REGION && n->type == TB_PROJ) { + TB_Node* parent = n->inputs[0]; + + // start or cprojs with multiple users (it's a BB) will just exit + if (parent->type == TB_START || (parent->type == TB_REGION && n->users->next == NULL)) { + return n; + } + n = parent; } - while (n->type != TB_REGION && n->type != TB_START) { + while (!cfg_is_bb_entry(n)) { n = n->inputs[0]; } + return n; +} - return TB_NODE_GET_EXTRA_T(n, TB_NodeRegion)->dom_depth; +static TB_Node* get_block_begin(TB_Node* n) { + while (!cfg_is_bb_entry(n)) { + n = n->inputs[0]; + } + return n; +} + +static TB_BasicBlock* idom_bb(TB_Passes* p, TB_BasicBlock* bb) { + ptrdiff_t search = nl_map_get(p->scheduled, bb->dom); + return search >= 0 ? p->scheduled[search].v : NULL; +} + +// shorthand because we use it a lot +static TB_Node* idom(TB_CFG* cfg, TB_Node* n) { + if (cfg->node_to_block == NULL) return NULL; + ptrdiff_t search = nl_map_get(cfg->node_to_block, n); + return search >= 0 ? cfg->node_to_block[search].v.dom : NULL; +} + +static int dom_depth(TB_CFG* cfg, TB_Node* n) { + return nl_map_get_checked(cfg->node_to_block, n).dom_depth; } extern thread_local TB_Arena* tmp_arena; @@ -134,10 +307,12 @@ static User* find_users(TB_Passes* restrict p, TB_Node* n) { // CFG // pushes postorder walk into worklist items, also modifies the visited set. 
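Several of the new peepholes earlier in this diff (lattice_dommy, the load control-dependency relaxation) reduce to one query: does block A dominate block B? A tiny standalone sketch of that walk follows, with an invented ToyBlock type standing in for the per-block dominator info kept in TB_BasicBlock / LatticeControl.

typedef struct ToyBlock {
    struct ToyBlock* idom;   // immediate dominator; the entry's idom is itself or NULL
    int dom_depth;
} ToyBlock;

// climbs B's idom chain; B is dominated by A iff the walk reaches A
// before it runs off the top of the tree at the entry block
static int toy_dominates(ToyBlock* a, ToyBlock* b) {
    while (b != 0 && b != a) {
        ToyBlock* up = b->idom;
        if (up == b) return 0;   // reached the entry without meeting a
        b = up;
    }
    return b == a;
}

The dom_depth field kept per block lets the real code bound or balance walks like this; the sketch simply climbs until it hits A or the entry.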
-// some entries will not be START or REGION, instead you'll see -size_t tb_push_postorder(TB_Function* f, Worklist* restrict ws); +TB_CFG tb_compute_rpo(TB_Function* f, TB_Passes* restrict p); +TB_CFG tb_compute_rpo2(TB_Function* f, Worklist* ws, DynArray(TB_Node*)* tmp_stack); +void tb_free_cfg(TB_CFG* cfg); // postorder walk -> dominators -void tb_compute_dominators(TB_Function* f, size_t count, TB_Node** blocks); +void tb_compute_dominators(TB_Function* f, TB_Passes* restrict p, TB_CFG cfg); +void tb_compute_dominators2(TB_Function* f, Worklist* ws, TB_CFG cfg); // Worklist API void worklist_alloc(Worklist* restrict ws, size_t initial_cap); @@ -151,6 +326,8 @@ int worklist_popcount(Worklist* ws); TB_Node* worklist_pop(Worklist* ws); // Local scheduler -void sched_walk(TB_Passes* passes, Worklist* ws, DynArray(PhiVal)* phi_vals, TB_Node* bb, TB_Node* n); +void sched_walk(TB_Passes* passes, Worklist* ws, DynArray(PhiVal)* phi_vals, TB_BasicBlock* bb, TB_Node* n, bool is_end); + +static void push_all_nodes(TB_Passes* restrict passes, Worklist* restrict ws, TB_Function* f); -static void push_all_nodes(Worklist* restrict ws, TB_Node* n); +void tb_pass_schedule(TB_Passes* opt, TB_CFG cfg); diff --git a/tb/src/tb.c b/tb/src/tb.c index 5d045e2b..d52201c9 100644 --- a/tb/src/tb.c +++ b/tb/src/tb.c @@ -350,10 +350,9 @@ void tb_function_set_prototype(TB_Function* f, TB_ModuleSectionHandle section, T f->node_count = 0; f->start_node = tb_alloc_node(f, TB_START, TB_TYPE_TUPLE, 0, extra_size); + f->terminators = dyn_array_create(TB_Node*, 4); + TB_NodeRegion* start = TB_NODE_GET_EXTRA(f->start_node); - start->dom_depth = 0; - start->dom = f->start_node; - start->tag = f->super.name; f->param_count = param_count; f->params = tb_arena_alloc(f->arena, (3+param_count) * sizeof(TB_Node*)); diff --git a/tb/src/tb_builder.c b/tb/src/tb_builder.c index df5cfd6f..5316dc6a 100644 --- a/tb/src/tb_builder.c +++ b/tb/src/tb_builder.c @@ -157,7 +157,6 @@ TB_Node* tb_inst_ptr2int(TB_Function* f, TB_Node* src, TB_DataType dt) { TB_Node* tb_inst_int2float(TB_Function* f, TB_Node* src, TB_DataType dt, bool is_signed) { assert(dt.type == TB_FLOAT); assert(src->dt.type == TB_INT); - assert(src->dt.width == dt.width); if (src->type == TB_INTEGER_CONST) { uint64_t y = TB_NODE_GET_EXTRA_T(src, TB_NodeInt)->value; @@ -222,38 +221,23 @@ TB_Node* tb_inst_get_control(TB_Function* f) { } void tb_inst_unreachable(TB_Function* f) { - TB_Node* n = tb_alloc_node(f, TB_UNREACHABLE, TB_TYPE_VOID, 1, 0); + TB_Node* n = tb_alloc_node(f, TB_UNREACHABLE, TB_TYPE_CONTROL, 1, 0); n->inputs[0] = f->active_control_node; - - TB_Node* bb = tb_get_parent_region(f->active_control_node); - TB_NODE_GET_EXTRA_T(bb, TB_NodeRegion)->end = n; - f->active_control_node = n; - - // return afterwards - tb_inst_ret(f, 0, NULL); + f->active_control_node = NULL; + dyn_array_put(f->terminators, n); } void tb_inst_debugbreak(TB_Function* f) { - TB_Node* n = tb_alloc_node(f, TB_DEBUGBREAK, TB_TYPE_VOID, 1, 0); + TB_Node* n = tb_alloc_node(f, TB_DEBUGBREAK, TB_TYPE_CONTROL, 1, 0); n->inputs[0] = f->active_control_node; f->active_control_node = n; } void tb_inst_trap(TB_Function* f) { - TB_Node* n = tb_alloc_node(f, TB_TRAP, TB_TYPE_VOID, 1, 0); + TB_Node* n = tb_alloc_node(f, TB_TRAP, TB_TYPE_CONTROL, 1, 0); n->inputs[0] = f->active_control_node; - - TB_Node* bb = tb_get_parent_region(f->active_control_node); - TB_NODE_GET_EXTRA_T(bb, TB_NodeRegion)->end = n; - f->active_control_node = n; - - // return afterwards - tb_inst_ret(f, 0, NULL); -} - -TB_Node* 
tb_inst_poison(TB_Function* f) { - TB_Node* n = tb_alloc_node(f, TB_POISON, TB_TYPE_VOID, 1, 0); - return n; + f->active_control_node = NULL; + dyn_array_put(f->terminators, n); } TB_Node* tb_inst_local(TB_Function* f, TB_CharUnits size, TB_CharUnits alignment) { @@ -496,27 +480,21 @@ TB_Node* tb_inst_bswap(TB_Function* f, TB_Node* src) { TB_Node* tb_inst_clz(TB_Function* f, TB_Node* src) { assert(TB_IS_INTEGER_TYPE(src->dt)); - uint64_t bits = tb_ffs(src->dt.data) - 1; - - TB_Node* n = tb_alloc_node(f, TB_CLZ, TB_TYPE_INTN(bits), 2, 0); + TB_Node* n = tb_alloc_node(f, TB_CLZ, TB_TYPE_I32, 2, 0); n->inputs[1] = src; return n; } TB_Node* tb_inst_ctz(TB_Function* f, TB_Node* src) { assert(TB_IS_INTEGER_TYPE(src->dt)); - uint64_t bits = tb_ffs(src->dt.data) - 1; - - TB_Node* n = tb_alloc_node(f, TB_CTZ, TB_TYPE_INTN(bits), 2, 0); + TB_Node* n = tb_alloc_node(f, TB_CTZ, TB_TYPE_I32, 2, 0); n->inputs[1] = src; return n; } TB_Node* tb_inst_popcount(TB_Function* f, TB_Node* src) { assert(TB_IS_INTEGER_TYPE(src->dt)); - uint64_t bits = tb_ffs(src->dt.data) - 1; - - TB_Node* n = tb_alloc_node(f, TB_POPCNT, TB_TYPE_INTN(bits), 2, 0); + TB_Node* n = tb_alloc_node(f, TB_POPCNT, TB_TYPE_I32, 2, 0); n->inputs[1] = src; return n; } @@ -823,9 +801,6 @@ TB_Node* tb_inst_phi2(TB_Function* f, TB_Node* region, TB_Node* a, TB_Node* b) { TB_Node* tb_inst_region(TB_Function* f) { TB_Node* n = tb_alloc_node(f, TB_REGION, TB_TYPE_CONTROL, 0, sizeof(TB_NodeRegion)); TB_NodeRegion* r = TB_NODE_GET_EXTRA(n); - r->postorder_id = -1; - r->dom_depth = -1; // unresolved - r->dom = NULL; TB_Node* phi = tb_alloc_node(f, TB_PHI, TB_TYPE_MEMORY, 1, 0); phi->inputs[0] = n; @@ -850,24 +825,15 @@ static void add_input_late(TB_Function* f, TB_Node* n, TB_Node* in) { size_t old_count = n->input_count; TB_Node** new_inputs = alloc_from_node_arena(f, (old_count + 1) * sizeof(TB_Node*)); - if (n->inputs != NULL) + if (n->inputs != NULL) { memcpy(new_inputs, n->inputs, old_count * sizeof(TB_Node*)); + } new_inputs[old_count] = in; n->inputs = new_inputs; n->input_count = old_count + 1; } -static TB_Node** add_successors(TB_Function* f, TB_Node* terminator, size_t count) { - TB_NodeRegion* bb = TB_NODE_GET_EXTRA(tb_get_parent_region(f->active_control_node)); - bb->end = terminator; - - TB_NodeBranch* br = TB_NODE_GET_EXTRA(terminator); - br->succ_count = count; - br->succ = alloc_from_node_arena(f, count * sizeof(TB_Node*)); - return br->succ; -} - static void add_memory_edge(TB_Function* f, TB_Node* n, TB_Node* mem_state, TB_Node* target) { assert(target->type == TB_REGION); TB_NodeRegion* r = TB_NODE_GET_EXTRA(target); @@ -878,18 +844,15 @@ static void add_memory_edge(TB_Function* f, TB_Node* n, TB_Node* mem_state, TB_N void tb_inst_goto(TB_Function* f, TB_Node* target) { TB_Node* mem_state = peek_mem(f, f->active_control_node); - TB_Node* n = tb_alloc_node(f, TB_BRANCH, TB_TYPE_TUPLE, 1, sizeof(TB_NodeBranch)); - n->inputs[0] = f->active_control_node; // control edge - - TB_Node** succ = add_successors(f, n, 1); - succ[0] = target; + // there's no need for a branch if the path isn't diverging. + TB_Node* n = f->active_control_node; + dyn_array_put(f->terminators, n); f->active_control_node = NULL; - { - TB_Node* cproj = tb__make_proj(f, TB_TYPE_CONTROL, n, 0); - add_input_late(f, target, cproj); - add_memory_edge(f, n, mem_state, target); - } + // just add the edge directly. 
+ assert(n->dt.type == TB_CONTROL); + add_input_late(f, target, n); + add_memory_edge(f, n, mem_state, target); } void tb_inst_if(TB_Function* f, TB_Node* cond, TB_Node* if_true, TB_Node* if_false) { @@ -909,11 +872,10 @@ void tb_inst_if(TB_Function* f, TB_Node* cond, TB_Node* if_true, TB_Node* if_fal } TB_NodeBranch* br = TB_NODE_GET_EXTRA(n); + br->succ_count = 2; br->keys[0] = 0; - TB_Node** succ = add_successors(f, n, 2); - succ[0] = if_true; - succ[1] = if_false; + dyn_array_put(f->terminators, n); f->active_control_node = NULL; } @@ -934,16 +896,12 @@ void tb_inst_branch(TB_Function* f, TB_DataType dt, TB_Node* key, TB_Node* defau } TB_NodeBranch* br = TB_NODE_GET_EXTRA(n); + br->succ_count = 1 + entry_count; FOREACH_N(i, 0, entry_count) { br->keys[i] = entries[i].key; } - TB_Node** succ = add_successors(f, n, 1 + entry_count); - succ[0] = default_label; - FOREACH_N(i, 0, entry_count) { - succ[1 + i] = entries[i].value; - } - + dyn_array_put(f->terminators, n); f->active_control_node = NULL; } @@ -978,7 +936,9 @@ void tb_inst_ret(TB_Function* f, size_t count, TB_Node** values) { } f->stop_node = end; - TB_NODE_SET_EXTRA(region, TB_NodeRegion, .mem_in = mem_phi, .mem_out = mem_phi, .end = end, .tag = "ret"); + TB_NODE_SET_EXTRA(region, TB_NodeRegion, .mem_in = mem_phi, .mem_out = mem_phi, .tag = "ret"); + + dyn_array_put(f->terminators, end); } else { // add to PHIs assert(end->input_count >= 3 + count); @@ -1001,13 +961,7 @@ void tb_inst_ret(TB_Function* f, size_t count, TB_Node** values) { } // basically just tb_inst_goto without the memory PHI (we did it earlier) - TB_Node* region = end->inputs[0]; - TB_Node* n = tb_alloc_node(f, TB_BRANCH, TB_TYPE_TUPLE, 1, sizeof(TB_NodeBranch)); - n->inputs[0] = f->active_control_node; // control edge - - TB_Node** succ = add_successors(f, n, 1); - succ[0] = region; + TB_Node* n = f->active_control_node; f->active_control_node = NULL; - - add_input_late(f, region, tb__make_proj(f, TB_TYPE_CONTROL, n, 0)); + add_input_late(f, end->inputs[0], n); } diff --git a/tb/src/tb_internal.h b/tb/src/tb_internal.h index 1e6e8773..b2582982 100644 --- a/tb/src/tb_internal.h +++ b/tb/src/tb_internal.h @@ -295,6 +295,9 @@ struct TB_Function { // IR allocation TB_Arena* arena; + // used for CFG walk in TB_Passes + DynArray(TB_Node*) terminators; + // IR building TB_Node* active_control_node; TB_Attrib exit_attrib; @@ -357,6 +360,7 @@ struct TB_ThreadInfo { TB_Arena perm_arena; TB_Arena tmp_arena; + TB_Arena type_arena; // live symbols (globals, functions and externals) // we'll be iterating these during object/executable @@ -370,6 +374,11 @@ struct TB_ThreadInfo { TB_CodeRegion* code; // compiled output }; +typedef struct { + size_t count; + TB_External** data; +} ExportList; + struct TB_Module { bool is_jit; @@ -390,6 +399,7 @@ struct TB_Module { TB_Arch target_arch; TB_System target_system; TB_FeatureSet features; + ExportList exports; // This is a hack for windows since they've got this idea // of a _tls_index @@ -567,11 +577,6 @@ inline static bool tb_is_power_of_two(uint64_t x) { TB_Node* tb_alloc_node(TB_Function* f, int type, TB_DataType dt, int input_count, size_t extra); TB_Node* tb__make_proj(TB_Function* f, TB_DataType dt, TB_Node* src, int index); -typedef struct { - size_t count; - TB_External** data; -} ExportList; - ExportList tb_module_layout_sections(TB_Module* m); //////////////////////////////// @@ -609,6 +614,10 @@ static TB_Arena* get_temporary_arena(TB_Module* m) { return &tb_thread_info(m)->tmp_arena; } +static TB_Arena* 
get_type_arena(TB_Module* m) { + return &tb_thread_info(m)->type_arena; +} + static TB_Arena* get_permanent_arena(TB_Module* m) { return &tb_thread_info(m)->perm_arena; } diff --git a/tb/src/tb_platform.h b/tb/src/tb_platform.h index 2401b9e9..3d633df2 100644 --- a/tb/src/tb_platform.h +++ b/tb/src/tb_platform.h @@ -1,13 +1,18 @@ // If you're trying to port TB on to a new platform you'll need to fill in these // functions with their correct behavior. #pragma once - #include -#include "../bdwgc/private/gc/gc.h" -#define tb_platform_heap_alloc(size) GC_malloc(size) -#define tb_platform_heap_realloc(ptr, size) GC_realloc(ptr, size) -#define tb_platform_heap_free(ptr) GC_free(ptr) +#if defined(TB_USE_MIMALLOC) +#include +#define tb_platform_heap_alloc(size) mi_malloc(size) +#define tb_platform_heap_realloc(ptr, size) mi_realloc(ptr, size) +#define tb_platform_heap_free(ptr) mi_free(ptr) +#else +#define tb_platform_heap_alloc(size) malloc(size) +#define tb_platform_heap_free(ptr) free(ptr) +#define tb_platform_heap_realloc(ptr, size) realloc(ptr, size) +#endif //////////////////////////////// // Virtual memory management diff --git a/tb/src/x64/x64.c b/tb/src/x64/x64.c index 961e1a7d..2d72b406 100644 --- a/tb/src/x64/x64.c +++ b/tb/src/x64/x64.c @@ -47,12 +47,19 @@ static size_t emit_epilogue(Ctx* restrict ctx, TB_Node* stop); // initialize register allocator state static void init_regalloc(Ctx* restrict ctx) { // Generate intervals for physical registers - FOREACH_N(i, 0, 16) { - dyn_array_put(ctx->intervals, (LiveInterval){ .reg_class = REG_CLASS_GPR, .dt = TB_X86_TYPE_QWORD, .reg = i, .assigned = i, .hint = -1, .start = INT_MAX, .split_kid = -1 }); - } - - FOREACH_N(i, 0, 16) { - dyn_array_put(ctx->intervals, (LiveInterval){ .reg_class = REG_CLASS_XMM, .dt = TB_X86_TYPE_XMMWORD, .reg = i, .assigned = i, .hint = -1, .start = INT_MAX, .split_kid = -1 }); + FOREACH_N(i, 0, 32) { + DynArray(LiveRange) ranges = dyn_array_create(LiveRange, 8); + dyn_array_put(ranges, (LiveRange){ INT_MAX, INT_MAX }); + + bool is_gpr = i < 16; + int reg = i % 16; + + dyn_array_put(ctx->intervals, (LiveInterval){ + .reg_class = is_gpr ? REG_CLASS_GPR : REG_CLASS_XMM, + .dt = is_gpr ? TB_X86_TYPE_QWORD : TB_X86_TYPE_XMMWORD, + .reg = reg, .assigned = reg, .hint = -1, .split_kid = -1, + .ranges = ranges + }); } } @@ -100,17 +107,7 @@ static TB_X86_DataType legalize_int2(TB_DataType dt) { static TB_X86_DataType legalize_float(TB_DataType dt) { assert(dt.type == TB_FLOAT); - TB_X86_DataType t = (dt.data == TB_FLT_64 ? TB_X86_TYPE_SSE_SD : TB_X86_TYPE_SSE_SS); - - if (dt.data == TB_FLT_64) { - assert(dt.width == 0 || dt.width == 1); - } else if (dt.data == TB_FLT_32) { - assert(dt.width == 0 || dt.width == 2); - } else { - tb_unreachable(); - } - - return t + (dt.width ? 2 : 0); + return (dt.data == TB_FLT_64 ? 
TB_X86_TYPE_SSE_SD : TB_X86_TYPE_SSE_SS); } static TB_X86_DataType legalize(TB_DataType dt) { @@ -137,7 +134,7 @@ static bool is_terminator(int t) { static bool try_for_imm32(Ctx* restrict ctx, TB_Node* n, int32_t* out_x) { if (n->type == TB_INTEGER_CONST) { TB_NodeInt* i = TB_NODE_GET_EXTRA(n); - if (fits_into_int32(i->value)) { + if (i->value == (int32_t)i->value) { *out_x = i->value; return true; } @@ -362,7 +359,6 @@ static Cond isel_cmp(Ctx* restrict ctx, TB_Node* n) { if (n->type >= TB_CMP_EQ && n->type <= TB_CMP_FLE) { TB_DataType cmp_dt = TB_NODE_GET_EXTRA_T(n, TB_NodeCompare)->cmp_dt; - assert(cmp_dt.width == 0 && "TODO: Implement vector compares"); Cond cc = -1; use(ctx, n); @@ -424,7 +420,7 @@ static Cond isel_cmp(Ctx* restrict ctx, TB_Node* n) { } static bool should_rematerialize(TB_Node* n) { - if (n->type == TB_INT2PTR && n->inputs[0]->type == TB_INTEGER_CONST) { + if ((n->type == TB_INT2FLOAT || n->type == TB_INT2PTR) && n->inputs[1]->type == TB_INTEGER_CONST) { return true; } @@ -440,6 +436,13 @@ static void isel(Ctx* restrict ctx, TB_Node* n, const int dst) { case TB_PHI: break; case TB_REGION: break; + case TB_POISON: { + Inst* inst = alloc_inst(INST_INLINE, TB_TYPE_VOID, 1, 0, 0); + inst->operands[0] = dst; + append_inst(ctx, inst); + break; + } + case TB_START: { TB_NodeRegion* start = TB_NODE_GET_EXTRA(n); const TB_FunctionPrototype* restrict proto = ctx->f->prototype; @@ -570,13 +573,16 @@ static void isel(Ctx* restrict ctx, TB_Node* n, const int dst) { x &= (1ull << bits_in_type) - 1; } - if (!fits_into_int32(x)) { - // movabs reg, imm64 - SUBMIT(inst_op_abs(MOVABS, n->dt, dst, x)); - } else if (x == 0) { + if (x == 0) { SUBMIT(inst_op_zero(n->dt, dst)); - } else { + } else if (x == (int32_t) x) { SUBMIT(inst_op_imm(MOV, n->dt, dst, x)); + } else if ((x >> 32ull) == UINT32_MAX) { + // mov but zero ext + SUBMIT(inst_op_imm(MOV, TB_TYPE_I32, dst, x)); + } else { + // movabs reg, imm64 + SUBMIT(inst_op_abs(MOVABS, n->dt, dst, x)); } break; } @@ -604,7 +610,16 @@ static void isel(Ctx* restrict ctx, TB_Node* n, const int dst) { hint_reg(ctx, dst, lhs); int32_t x; - if (try_for_imm32(ctx, n->inputs[2], &x)) { + if (n->inputs[2]->type == TB_LOAD && on_last_use(ctx, n->inputs[2])) { + use(ctx, n->inputs[2]); + + SUBMIT(inst_move(n->dt, dst, lhs)); + + Inst* inst = isel_addr2(ctx, n->inputs[2]->inputs[2], dst, -1, dst); + inst->type = op; + inst->dt = legalize(n->dt); + SUBMIT(inst); + } else if (try_for_imm32(ctx, n->inputs[2], &x)) { use(ctx, n->inputs[2]); SUBMIT(inst_move(n->dt, dst, lhs)); @@ -666,6 +681,38 @@ static void isel(Ctx* restrict ctx, TB_Node* n, const int dst) { break; } + // bit magic + case TB_CTZ: + case TB_CLZ: { + int op = type == TB_CLZ ? 
BSR : BSF; + int lhs = input_reg(ctx, n->inputs[1]); + hint_reg(ctx, dst, lhs); + + // we only wanna deal with 32 or 64 ops for + // this (16 is annoying and 8 is unavailable) + TB_DataType dt = n->dt; + if (dt.data < 64) { + // make sure the bits are zero'd above + if (dt.data < 32) { + assert(type == TB_CLZ && "clz is different, and im stupid"); + SUBMIT(inst_op_zero(TB_TYPE_I32, dst)); + } + + dt.data = 32; + } + + Inst* inst = inst_op_rr(op, dt, dst, lhs); + if (type == TB_CLZ) { + // the difference between bsf and tzcnt + inst->flags |= INST_REP; + } + SUBMIT(inst); + + // flip bits to make CLZ instead of bitscanreverse + SUBMIT(inst_op_rri(XOR, dt, dst, dst, 63)); + break; + } + // bit shifts case TB_SHL: case TB_SHR: @@ -753,7 +800,7 @@ static void isel(Ctx* restrict ctx, TB_Node* n, const int dst) { } case TB_FLOAT32_CONST: { - assert(n->dt.type == TB_FLOAT && n->dt.width == 0); + assert(n->dt.type == TB_FLOAT); uint32_t imm = (Cvt_F32U32) { .f = TB_NODE_GET_EXTRA_T(n, TB_NodeFloat32)->value }.i; if (imm == 0) { @@ -765,7 +812,7 @@ static void isel(Ctx* restrict ctx, TB_Node* n, const int dst) { break; } case TB_FLOAT64_CONST: { - assert(n->dt.type == TB_FLOAT && n->dt.width == 0); + assert(n->dt.type == TB_FLOAT); uint64_t imm = (Cvt_F64U64){ .f = TB_NODE_GET_EXTRA_T(n, TB_NodeFloat64)->value }.i; if (imm == 0) { @@ -819,7 +866,7 @@ static void isel(Ctx* restrict ctx, TB_Node* n, const int dst) { hint_reg(ctx, dst, lhs); SUBMIT(inst_move(n->dt, dst, lhs)); - if (n->inputs[2]->type == TB_LOAD) { + if (n->inputs[2]->type == TB_LOAD && on_last_use(ctx, n->inputs[2])) { use(ctx, n->inputs[2]); Inst* inst = isel_addr2(ctx, n->inputs[2]->inputs[2], dst, -1, dst); @@ -989,7 +1036,7 @@ static void isel(Ctx* restrict ctx, TB_Node* n, const int dst) { TB_Node* param = n->inputs[i]; TB_DataType param_dt = param->dt; - bool use_xmm = TB_IS_FLOAT_TYPE(param_dt) || param_dt.width; + bool use_xmm = TB_IS_FLOAT_TYPE(param_dt); int reg = use_xmm ? 
xmms_used : gprs_used; if (is_sysv) { if (use_xmm) { @@ -1031,7 +1078,7 @@ static void isel(Ctx* restrict ctx, TB_Node* n, const int dst) { FOREACH_N(i, 0, in_count) { TB_DataType dt = n->inputs[3 + i]->dt; - bool use_xmm = TB_IS_FLOAT_TYPE(dt) || dt.width; + bool use_xmm = TB_IS_FLOAT_TYPE(dt); SUBMIT(inst_move(dt, ins[i], param_srcs[i])); // in win64, float params past the vararg cutoff are @@ -1066,7 +1113,7 @@ static void isel(Ctx* restrict ctx, TB_Node* n, const int dst) { } } - bool use_xmm_ret = TB_IS_FLOAT_TYPE(ret_dt) || ret_dt.width; + bool use_xmm_ret = TB_IS_FLOAT_TYPE(ret_dt); if (ret_node != NULL) { if (use_xmm_ret) { caller_saved_xmms &= ~(1ull << XMM0); @@ -1140,7 +1187,20 @@ static void isel(Ctx* restrict ctx, TB_Node* n, const int dst) { case TB_BRANCH: { TB_Node* bb = tb_get_parent_region(n); TB_NodeBranch* br = TB_NODE_GET_EXTRA(n); - TB_Node** succ = br->succ; + + // the arena on the function should also be available at this time, we're + // in the TB_Passes + TB_Arena* arena = ctx->f->arena; + TB_ArenaSavepoint sp = tb_arena_save(arena); + TB_Node** restrict succ = tb_arena_alloc(arena, br->succ_count * sizeof(TB_Node**)); + + // fill successors + for (User* u = n->users; u; u = u->next) { + if (u->n->type == TB_PROJ) { + int index = TB_NODE_GET_EXTRA_T(u->n, TB_NodeProj)->index; + succ[index] = cfg_get_fallthru(u->n); + } + } SUBMIT(alloc_inst(INST_TERMINATOR, TB_TYPE_VOID, 0, 0, 0)); if (br->succ_count == 1) { @@ -1212,6 +1272,7 @@ static void isel(Ctx* restrict ctx, TB_Node* n, const int dst) { } SUBMIT(inst_jmp(succ[0])); } + tb_arena_restore(arena, sp); break; } @@ -1233,7 +1294,7 @@ static void isel(Ctx* restrict ctx, TB_Node* n, const int dst) { } case TB_LOAD: case TB_ATOMIC_LOAD: { - int mov_op = (TB_IS_FLOAT_TYPE(n->dt) || n->dt.width) ? FP_MOV : MOV; + int mov_op = TB_IS_FLOAT_TYPE(n->dt) ? FP_MOV : MOV; TB_Node* addr = n->inputs[2]; Inst* ld_inst = isel_addr2(ctx, addr, dst, -1, -1); @@ -1284,7 +1345,7 @@ static void isel(Ctx* restrict ctx, TB_Node* n, const int dst) { src = src->inputs[2]; } else { - store_op = (TB_IS_FLOAT_TYPE(store_dt) || store_dt.width) ? FP_MOV : MOV; + store_op = TB_IS_FLOAT_TYPE(store_dt) ? FP_MOV : MOV; } int32_t imm; @@ -1512,8 +1573,9 @@ static void print_operand(TB_CGEmitter* restrict e, Val* v, TB_X86_DataType dt) EMITA(e, ".ret"); } else { TB_Node* n = v->target; - assert(n->type == TB_START || n->type == TB_REGION); - EMITA(e, "L%d", TB_NODE_GET_EXTRA_T(n, TB_NodeRegion)->postorder_id); + + int id = nl_map_get_checked(muh_______cfg->node_to_block, n).id; + EMITA(e, ".bb%d", id); } break; } @@ -1605,7 +1667,7 @@ static int resolve_interval(Ctx* restrict ctx, Inst* inst, int i, Val* val) { return 1; } -static void emit_code(Ctx* restrict ctx, TB_FunctionOutput* restrict func_out) { +static void emit_code(Ctx* restrict ctx, TB_FunctionOutput* restrict func_out, int end) { TB_CGEmitter* e = &ctx->emit; // resolve stack usage @@ -1627,10 +1689,11 @@ static void emit_code(Ctx* restrict ctx, TB_FunctionOutput* restrict func_out) { Inst* prev_line = NULL; for (Inst* restrict inst = ctx->first; inst; inst = inst->next) { size_t in_base = inst->out_count; - InstCategory cat = inst->type >= (sizeof inst_table / sizeof *inst_table) ? INST_BINOP : inst_table[inst->type].cat; + size_t inst_table_size = sizeof(inst_table) / sizeof(*inst_table); + InstCategory cat = inst->type >= inst_table_size ? INST_BINOP : inst_table[inst->type].cat; if (0) { - EMITA(e, " \x1b[32m# %s t=%d { outs:", inst->type < sizeof inst_table / sizeof *inst_table ? 
inst_table[inst->type].mnemonic : "???", inst->time); + EMITA(e, " \x1b[32m# %s t=%d { outs:", inst->type < inst_table_size ? inst_table[inst->type].mnemonic : "???", inst->time); FOREACH_N(i, 0, inst->out_count) { EMITA(e, " v%d", inst->operands[i]); } @@ -1648,22 +1711,22 @@ static void emit_code(Ctx* restrict ctx, TB_FunctionOutput* restrict func_out) { uint32_t pos = GET_CODE_POS(&ctx->emit); tb_resolve_rel32(&ctx->emit, &nl_map_get_checked(ctx->emit.labels, bb), pos); - if (bb != ctx->f->start_node) { - assert(bb->type == TB_REGION); - EMITA(e, "L%d:\n", TB_NODE_GET_EXTRA_T(bb, TB_NodeRegion)->postorder_id); + int id = nl_map_get_checked(ctx->cfg.node_to_block, bb).id; + if (id > 0) { + EMITA(e, ".bb%d:\n", id); } } else if (inst->type == INST_INLINE) { - TB_NodeMachineOp* mach = TB_NODE_GET_EXTRA(inst->n); + if (inst->n) { + TB_NodeMachineOp* mach = TB_NODE_GET_EXTRA(inst->n); - EMITA(&ctx->emit, " INLINE MACHINE CODE:"); - FOREACH_N(i, 0, mach->length) { - EMITA(&ctx->emit, " %#02x", mach->data[i]); + EMITA(&ctx->emit, " INLINE MACHINE CODE:"); + FOREACH_N(i, 0, mach->length) { + EMITA(&ctx->emit, " %#02x", mach->data[i]); + } + EMITA(&ctx->emit, "\n"); } - EMITA(&ctx->emit, "\n"); } else if (inst->type == INST_EPILOGUE) { - // return label goes here - EMITA(&ctx->emit, ".ret:\n"); - tb_resolve_rel32(&ctx->emit, &ctx->emit.return_label, GET_CODE_POS(&ctx->emit)); + // just a marker for regalloc } else if (inst->type == INST_LINE) { TB_Function* f = ctx->f; TB_Attrib* loc = inst->a; @@ -1734,6 +1797,11 @@ static void emit_code(Ctx* restrict ctx, TB_FunctionOutput* restrict func_out) { EMIT1(e, 0xF0); } + if (inst->flags & INST_REP) { + EMITA(e, " REP"); + EMIT1(e, 0xF3); + } + // resolve output Val out; int i = 0; @@ -1809,7 +1877,9 @@ static void emit_code(Ctx* restrict ctx, TB_FunctionOutput* restrict func_out) { } } - emit_epilogue(ctx, ctx->f->stop_node); + if (end >= 0) { + emit_epilogue(ctx, ctx->f->stop_node); + } // pad to 16bytes static const uint8_t nops[8][8] = { diff --git a/tb/src/x64/x64_disasm.c b/tb/src/x64/x64_disasm.c index e6d835d7..c17e7479 100644 --- a/tb/src/x64/x64_disasm.c +++ b/tb/src/x64/x64_disasm.c @@ -167,7 +167,7 @@ bool tb_x86_disasm(TB_X86_Inst* restrict inst, size_t length, const uint8_t* dat // immediates might use RX for an extended opcode if (uses_imm) { - __debugbreak(); + tb_todo(); } else { int8_t real_rx = ((rex & 4 ? 
8 : 0) | rx); if (rex == 0 && inst->data_type == TB_X86_TYPE_BYTE && real_rx >= 4) { diff --git a/tb/src/x64/x64_emitter.h b/tb/src/x64/x64_emitter.h index 3dafbbde..f0e60951 100644 --- a/tb/src/x64/x64_emitter.h +++ b/tb/src/x64/x64_emitter.h @@ -124,11 +124,8 @@ static void inst1(TB_CGEmitter* restrict e, InstType type, const Val* r, TB_X86_ EMIT1(e, inst->op); EMIT4(e, 0); - if (r->target != NULL) { - tb_emit_rel32(e, &nl_map_get_checked(e->labels, r->target), GET_CODE_POS(e) - 4); - } else { - tb_emit_rel32(e, &e->return_label, GET_CODE_POS(e) - 4); - } + assert(r->target != NULL); + tb_emit_rel32(e, &nl_map_get_checked(e->labels, r->target), GET_CODE_POS(e) - 4); } else { tb_unreachable(); } @@ -142,7 +139,7 @@ static void inst2(TB_CGEmitter* restrict e, InstType type, const Val* a, const V if (type == MOVABS) { assert(a->type == VAL_GPR && b->type == VAL_ABS); - EMIT1(e, rex(true, a->reg, 0, 0)); + EMIT1(e, rex(true, 0, a->reg, 0)); EMIT1(e, inst->op + (a->reg & 0b111)); EMIT8(e, b->abs); return; diff --git a/tb/src/x64/x64_insts.inc b/tb/src/x64/x64_insts.inc index 0497a271..b39ffb23 100644 --- a/tb/src/x64/x64_insts.inc +++ b/tb/src/x64/x64_insts.inc @@ -72,6 +72,10 @@ X(CMOVGE, "cmovge", BINOP_EXT, 0x4D) X(CMOVLE, "cmovle", BINOP_EXT, 0x4E) X(CMOVG, "cmovg", BINOP_EXT, 0x4F) +// bitmagic +X(BSF, "bsf", BINOP_EXT, 0xBC) +X(BSR, "bsr", BINOP_EXT, 0xBD) + // binary ops but they have an implicit CL on the righthand side X(SHL, "shl", BINOP_CL, 0xD2, 0xC0, 0x04) X(SHR, "shr", BINOP_CL, 0xD2, 0xC0, 0x05) diff --git a/tb/unittests/tb_test_exit_status.inc b/tb/unittests/tb_test_exit_status.inc new file mode 100644 index 00000000..eac737bb --- /dev/null +++ b/tb/unittests/tb_test_exit_status.inc @@ -0,0 +1,11 @@ +#include "util.inc" + +static int test_exit_status(void) { + TB_TEST_MODULE_BEGIN_; + + TB_Node *exit_value = tb_inst_sint(f_main, TB_TYPE_I32, 42); + EXIT_WITH_(exit_value); + + TB_TEST_MODULE_END_(test_exit_status, 42, 1); + return status; +} diff --git a/tb/unittests/tb_test_int_arith.inc b/tb/unittests/tb_test_int_arith.inc new file mode 100644 index 00000000..e44c6307 --- /dev/null +++ b/tb/unittests/tb_test_int_arith.inc @@ -0,0 +1,67 @@ +#include "util.inc" + +#define TEST_INT_ARITH_(prefix_, type_, inst_type_, inst_op_, arg0_, \ + arg1_, res_) \ + static int test_##prefix_##_##inst_op_(void) { \ + TB_TEST_MODULE_BEGIN_; \ + \ + TB_Node *foo = tb_inst_##inst_type_(f_main, TB_TYPE_##type_, \ + (arg0_)); \ + TB_Node *bar = tb_inst_##inst_type_(f_main, TB_TYPE_##type_, \ + (arg1_)); \ + TB_Node *sum = tb_inst_##inst_op_(f_main, foo, bar, \ + TB_ARITHMATIC_NONE); \ + \ + EXIT_WITH_(sum); \ + \ + TB_TEST_MODULE_END_(test_##prefix_##_##inst_op_, (res_), 0); \ + return status; \ + } + +TEST_INT_ARITH_(i8, I8, sint, add, 50, -8, 42) +TEST_INT_ARITH_(i8, I8, sint, sub, 20, -10, 30) +TEST_INT_ARITH_(i8, I8, sint, mul, 7, 9, 63) +TEST_INT_ARITH_(i8, I8, sint, div, 100, 11, 9) +TEST_INT_ARITH_(i8, I8, sint, mod, 100, 11, 1) + +TEST_INT_ARITH_(i16, I16, sint, add, 300, -240, 60) +TEST_INT_ARITH_(i16, I16, sint, sub, 1000, 934, 66) +TEST_INT_ARITH_(i16, I16, sint, mul, 9, 8, 72) +TEST_INT_ARITH_(i16, I16, sint, div, 999, 112, 8) +TEST_INT_ARITH_(i16, I16, sint, mod, 999, 112, 103) + +TEST_INT_ARITH_(i32, I32, sint, add, 300, -240, 60) +TEST_INT_ARITH_(i32, I32, sint, sub, 1000, 934, 66) +TEST_INT_ARITH_(i32, I32, sint, mul, 9, 8, 72) +TEST_INT_ARITH_(i32, I32, sint, div, 999, 112, 8) +TEST_INT_ARITH_(i32, I32, sint, mod, 999, 112, 103) + +TEST_INT_ARITH_(i64, I64, sint, add, 300, -240, 
60) +TEST_INT_ARITH_(i64, I64, sint, sub, 1000, 934, 66) +TEST_INT_ARITH_(i64, I64, sint, mul, 9, 8, 72) +TEST_INT_ARITH_(i64, I64, sint, div, 999, 112, 8) +TEST_INT_ARITH_(i64, I64, sint, mod, 999, 112, 103) + +TEST_INT_ARITH_(u8, I8, uint, add, 50, 8, 58) +TEST_INT_ARITH_(u8, I8, uint, sub, 30, 10, 20) +TEST_INT_ARITH_(u8, I8, uint, mul, 7, 9, 63) +TEST_INT_ARITH_(u8, I8, uint, div, 100, 11, 9) +TEST_INT_ARITH_(u8, I8, uint, mod, 100, 11, 1) + +TEST_INT_ARITH_(u16, I16, uint, add, 30, 50, 80) +TEST_INT_ARITH_(u16, I16, uint, sub, 1000, 934, 66) +TEST_INT_ARITH_(u16, I16, uint, mul, 9, 8, 72) +TEST_INT_ARITH_(u16, I16, uint, div, 999, 112, 8) +TEST_INT_ARITH_(u16, I16, uint, mod, 999, 112, 103) + +TEST_INT_ARITH_(u32, I32, uint, add, 50, 40, 90) +TEST_INT_ARITH_(u32, I32, uint, sub, 1000, 934, 66) +TEST_INT_ARITH_(u32, I32, uint, mul, 9, 8, 72) +TEST_INT_ARITH_(u32, I32, uint, div, 999, 112, 8) +TEST_INT_ARITH_(u32, I32, uint, mod, 999, 112, 103) + +TEST_INT_ARITH_(u64, I64, uint, add, 20, 25, 45) +TEST_INT_ARITH_(u64, I64, uint, sub, 1000, 934, 66) +TEST_INT_ARITH_(u64, I64, uint, mul, 9, 8, 72) +TEST_INT_ARITH_(u64, I64, uint, div, 999, 112, 8) +TEST_INT_ARITH_(u64, I64, uint, mod, 999, 112, 103) diff --git a/tb/unittests/tb_test_regressions.inc b/tb/unittests/tb_test_regressions.inc new file mode 100644 index 00000000..f21bbcf9 --- /dev/null +++ b/tb/unittests/tb_test_regressions.inc @@ -0,0 +1,26 @@ +#include "util.inc" + +static int test_regression_module_arena(void) { + tb_module_destroy(tb_module_create(tb_test_arch, tb_test_system, + &tb_test_feature_set, 0)); + tb_module_destroy(tb_module_create(tb_test_arch, tb_test_system, + &tb_test_feature_set, 0)); + + // We're testing for segfault. + return 1; +} + +static int test_regression_link_global(void) { + TB_Module *module = tb_module_create(tb_test_arch, tb_test_system, + &tb_test_feature_set, 0); + TB_Global *global = tb_global_create(module, -1, "global", NULL, + TB_LINKAGE_PRIVATE); + tb_global_set_storage(module, tb_module_get_rdata(module), global, + 8, 8, 1); + TB_Linker *linker = tb_linker_create(tb_test_exe_type, + tb_test_arch); + tb_linker_append_module(linker, module); + tb_module_destroy(module); + tb_linker_destroy(linker); + return 1; +} diff --git a/tb/unittests/tb_unittests.c b/tb/unittests/tb_unittests.c new file mode 100644 index 00000000..bed9d4e6 --- /dev/null +++ b/tb/unittests/tb_unittests.c @@ -0,0 +1,88 @@ +#include +#include "tb_test_regressions.inc" +#include "tb_test_exit_status.inc" +#include "tb_test_int_arith.inc" + +#define TEST(proc_) \ +do { \ + fflush(stdout); \ + printf("%s\r", #proc_); \ + fflush(stdout); \ + int status_ = test_##proc_(); \ + fflush(stdout); \ + printf("%s%.*s ", #proc_, (int) (41 - sizeof(#proc_)), \ + " ........................................"); \ + if (status_) \ + printf("OK\n"); \ + else { \ + printf("FAILED\n"); \ + failed++; \ + } \ + total++; \ + fflush(stdout); \ +} while (0) + +int main(int argc, char **argv) { + int failed = 0, total = 0; + + TEST(regression_module_arena); + TEST(regression_link_global); + TEST(exit_status); + + TEST(i8_add); + TEST(i8_sub); + TEST(i8_mul); + TEST(i8_div); + TEST(i8_mod); + + TEST(i16_add); + TEST(i16_sub); + TEST(i16_mul); + TEST(i16_div); + TEST(i16_mod); + + TEST(i32_add); + TEST(i32_sub); + TEST(i32_mul); + TEST(i32_div); + TEST(i32_mod); + + TEST(i64_add); + TEST(i64_sub); + TEST(i64_mul); + TEST(i64_div); + TEST(i64_mod); + + TEST(u8_add); + TEST(u8_sub); + TEST(u8_mul); + TEST(u8_div); + TEST(u8_mod); + + TEST(u16_add); + 
TEST(u16_sub); + TEST(u16_mul); + TEST(u16_div); + TEST(u16_mod); + + TEST(u32_add); + TEST(u32_sub); + TEST(u32_mul); + TEST(u32_div); + TEST(u32_mod); + + TEST(u64_add); + TEST(u64_sub); + TEST(u64_mul); + TEST(u64_div); + TEST(u64_mod); + + fflush(stdout); + if (failed > 0) + printf("\n%d of %d tests failed.\n", failed, total); + else + printf("\nAll %d tests succeeded.\n", total); + fflush(stdout); + + return failed; +} diff --git a/tb/unittests/util.inc b/tb/unittests/util.inc new file mode 100644 index 00000000..5b3fdb7c --- /dev/null +++ b/tb/unittests/util.inc @@ -0,0 +1,209 @@ +#ifndef TB_TEST_UTIL_INC +#define TB_TEST_UTIL_INC + +#if defined(__GCC__) || defined(__clang__) +# pragma GCC diagnostic ignored "-Wunknown-pragmas" +# pragma GCC diagnostic ignored "-Wunused-function" +#endif + +#include "../include/tb.h" + +#include +#include + +#if defined(_WIN32) && !defined(__CYGWIN__) +# define TB_TEST_IS_WINDOWS_ 1 +# define WEXITSTATUS(x) x + +static TB_System tb_test_system = TB_SYSTEM_WINDOWS; +static TB_ExecutableType tb_test_exe_type = TB_EXECUTABLE_PE; +#else +# define TB_TEST_IS_WINDOWS_ 0 + +# include + +static TB_System tb_test_system = TB_SYSTEM_LINUX; +static TB_ExecutableType tb_test_exe_type = TB_EXECUTABLE_ELF; +#endif + +static TB_Arch tb_test_arch = TB_ARCH_X86_64; +static TB_FeatureSet tb_test_feature_set = { .x64 = 0 }; + +static void tb_test_link_library(TB_Linker *linker, + char const *name) { +#if TB_TEST_IS_WINDOWS_ + // FIXME + // Find and link library on Windows. + // + + char full_path[200] = "C:\\Program Files (x86)\\Windows " + "Kits\\10\\Lib\\10.0.14393.0\\um\\x64\\"; + + ptrdiff_t folder_len = strlen(full_path); + ptrdiff_t name_len = strlen(name); + + assert(folder_len + name_len + 1 < sizeof full_path); + memcpy(full_path + folder_len, name, name_len); + full_path[folder_len + name_len] = '\0'; + + FILE *f = fopen(full_path, "rb"); + + assert(f != NULL); + if (f == NULL) + return; + + ptrdiff_t chunk_size = 100000; + ptrdiff_t data_size = 0; + + uint8_t *data = NULL; + uint8_t *p = data; + + while (!feof(f)) { + data = realloc(data, data_size + chunk_size); + assert(data != NULL); + if (data == NULL) + return; + + ptrdiff_t n = fread(data + data_size, 1, chunk_size, f); + if (n <= 0) + break; + data_size += n; + } + + fclose(f); + + TB_Slice sl_name = { .length = name_len, + .data = (uint8_t const *) name }; + TB_Slice sl_data = { .length = data_size, .data = data }; + tb_linker_append_library(linker, sl_name, sl_data); + + free(data); +#endif + + // NOTE + // We don't need to link libraries for unit-testing on Linux yet. 
+ // +} + +#define ERROR(x) \ + do { \ + printf("Error in %s (\"%s\" line %d): " #x "\n", __FUNCTION__, \ + __FILE__, (int) __LINE__); \ + status = 0; \ + goto _final; \ + } while (0) + +#if TB_TEST_IS_WINDOWS_ + +# define EXIT_WITH_(node_status_) \ + do { \ + TB_PrototypeParam param0 = { .dt = TB_TYPE_I32, \ + .name = "uExitCode" }; \ + TB_FunctionPrototype *fp_ExitProcess = tb_prototype_create( \ + module, TB_STDCALL, 1, ¶m0, 0, NULL, 0); \ + \ + TB_Node *addr_ExitProcess = tb_inst_get_symbol_address( \ + f_main, \ + (TB_Symbol *) tb_extern_create(module, -1, "ExitProcess", \ + TB_EXTERNAL_SO_LOCAL)); \ + \ + tb_inst_call(f_main, fp_ExitProcess, addr_ExitProcess, 1, \ + &(node_status_)); \ + tb_inst_ret(f_main, 0, NULL); \ + } while (0) + +#else + +# define EXIT_WITH_(node_status_) \ + do { \ + TB_Node *num_ = tb_inst_sint(f_main, TB_TYPE_I32, 60); \ + tb_inst_syscall(f_main, TB_TYPE_I64, num_, 1, \ + &(node_status_)); \ + tb_inst_ret(f_main, 0, NULL); \ + } while (0) + +#endif + +#define TB_TEST_MODULE_BEGIN_ \ + int status = 1; \ + int ret = 0; \ + \ + TB_Module *module = NULL; \ + TB_Linker *linker = NULL; \ + \ + module = tb_module_create(tb_test_arch, tb_test_system, \ + &tb_test_feature_set, 0); \ + \ + if (module == NULL) \ + ERROR("tb_module_create failed."); \ + \ + TB_FunctionPrototype *fp_main = tb_prototype_create( \ + module, TB_CDECL, 0, NULL, 0, NULL, false); \ + \ + TB_Function *f_main = tb_function_create( \ + module, -1, "main", TB_LINKAGE_PUBLIC); \ + \ + TB_ModuleSectionHandle text = tb_module_get_text(module); \ + tb_function_set_prototype(f_main, text, fp_main, NULL); \ + \ + if (f_main == NULL) \ + ERROR("tb_function_create failed."); + +#define TB_TEST_MODULE_END_(name_, result_, print_asm_) \ + { \ + TB_SymbolIter it = tb_symbol_iter(module); \ + TB_Symbol* sym; \ + while (sym = tb_symbol_iter_next(&it), sym) { \ + if (sym->tag == TB_SYMBOL_FUNCTION) { \ + TB_Function *f = (TB_Function*) sym; \ + TB_Passes *passes = tb_pass_enter(f, NULL); \ + \ + if (passes == NULL) \ + ERROR("tb_pass_enter failed."); \ + \ + TB_FunctionOutput *asm_out = tb_pass_codegen(passes, 1); \ + \ + if ((print_asm_) && asm_out != NULL) { \ + printf("\n"); \ + tb_output_print_asm(asm_out, stdout); \ + } \ + \ + tb_pass_exit(passes); \ + } \ + } \ + } \ + \ + linker = tb_linker_create(tb_test_exe_type, tb_test_arch); \ + \ + if (linker == NULL) \ + ERROR("tb_linker_create failed."); \ + \ + tb_linker_append_module(linker, module); \ + \ + tb_linker_set_entrypoint(linker, "main"); \ + \ + if (TB_TEST_IS_WINDOWS_) \ + tb_test_link_library(linker, "kernel32.lib"); \ + \ + TB_ExportBuffer buf = tb_linker_export(linker); \ + tb_export_buffer_to_file(buf, "bin/" #name_); \ + tb_export_buffer_free(buf); \ + \ + if (!TB_TEST_IS_WINDOWS_) { \ + (void) system("chmod a+x bin/" #name_); \ + ret = WEXITSTATUS(system("./bin/" #name_)); \ + } else \ + ret = WEXITSTATUS(system("start bin\\" #name_)); \ + \ + if (ret != (result_)) { \ + printf("Got %d, expected %d\n", (int) ret, (int) (result_)); \ + status = 0; \ + } \ + \ +_final: \ + if (module != NULL) \ + tb_module_destroy(module); \ + if (linker != NULL) \ + tb_linker_destroy(linker); + +#endif diff --git a/test/fib40.lua b/test/fib40.lua new file mode 100644 index 00000000..faae15f0 --- /dev/null +++ b/test/fib40.lua @@ -0,0 +1,10 @@ + +local function fib(n) + if n < 2 then + return n + else + return fib(n-1) + fib(n-2) + end +end + +print(fib(40)) diff --git a/test/fib5.paka b/test/fib5.paka new file mode 100644 index 00000000..c0105f34 --- 
/dev/null +++ b/test/fib5.paka @@ -0,0 +1,10 @@ + +def fib(n) { + if n < 2 { + return n + } else { + return fib(n-1) + fib(n-2) + } +} + +env.io.debug(fib(5)) diff --git a/vm/jit/tb.c b/vm/jit/tb.c index 6a3c8a94..604f015b 100644 --- a/vm/jit/tb.c +++ b/vm/jit/tb.c @@ -296,7 +296,7 @@ TB_Node *vm_tb_func_body(vm_tb_state_t *state, TB_Function *fun, TB_Node **args, func, rblock->block->nargs + 1, call_args); - + tb_inst_ret(fun, 0, NULL); tb_inst_set_control(fun, ctrl); @@ -305,19 +305,6 @@ TB_Node *vm_tb_func_body(vm_tb_state_t *state, TB_Function *fun, TB_Node **args, } TB_Node *vm_tb_func_body_call(vm_tb_state_t *state, TB_Function *fun, TB_Node **args, vm_rblock_t *rblock) { - TB_Module *module = state->module; - - TB_PrototypeParam comp_args[2] = { - {TB_TYPE_PTR}, - {TB_TYPE_PTR}, - }; - - TB_PrototypeParam comp_ret[1] = { - {TB_TYPE_PTR}, - }; - - TB_FunctionPrototype *comp_proto = tb_prototype_create(state->module, VM_TB_CC, 2, comp_args, 1, comp_ret, false); - TB_Node *comp_params[2]; comp_params[0] = tb_inst_uint(fun, TB_TYPE_PTR, (uint64_t)state); @@ -681,7 +668,6 @@ TB_Node *vm_tb_func_body_once(vm_tb_state_t *state, TB_Function *fun, TB_Node ** TB_SwitchEntry keys[VM_TAG_MAX - 1]; for (size_t i = 1; i < VM_TAG_MAX; i++) { - keys[i - 1].key = i; // vm_block_t *next_block = vm_tb_rblock_version(branch.rtargets[i]); TB_Node **next_args = vm_malloc(sizeof(TB_Node *) * branch.targets[0]->nargs); for (size_t j = 0; j < branch.targets[0]->nargs; j++) { @@ -700,6 +686,7 @@ TB_Node *vm_tb_func_body_once(vm_tb_state_t *state, TB_Function *fun, TB_Node ** next_args[j] = vm_tb_func_read_arg(fun, regs, next_arg); } } + keys[i - 1].key = i; keys[i - 1].value = vm_tb_func_body(state, fun, next_args, branch.rtargets[i]); } @@ -862,7 +849,7 @@ void *vm_tb_rfunc_comp(vm_tb_state_t *state, vm_rblock_t *rblock) { fprintf(stdout, "\n--- tb ---\n"); tb_pass_print(passes); #endif - tb_pass_mem2reg(passes); + // tb_pass_mem2reg(passes); tb_pass_optimize(passes); #if defined(VM_DUMP_TB_OPT) fprintf(stdout, "\n--- opt tb ---\n");
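
For reference, the TEST_INT_ARITH_ macro added in tb/unittests/tb_test_int_arith.inc is purely mechanical: each instantiation defines one self-contained test function that builds a tiny module, applies a single arithmetic op to two constants, and exits the generated program with the result. A rough expansion of TEST_INT_ARITH_(i8, I8, sint, add, 50, -8, 42) should look like the sketch below; this is only an illustrative expansion, and names such as f_main, status, EXIT_WITH_ and the TB_TEST_MODULE_* helpers come from util.inc above.

static int test_i8_add(void) {
    TB_TEST_MODULE_BEGIN_;

    // build the two constant operands as signed 8-bit integers
    TB_Node *foo = tb_inst_sint(f_main, TB_TYPE_I8, (50));
    TB_Node *bar = tb_inst_sint(f_main, TB_TYPE_I8, (-8));

    // 50 + (-8) == 42; the harness compares this against the exit status
    TB_Node *sum = tb_inst_add(f_main, foo, bar, TB_ARITHMATIC_NONE);

    EXIT_WITH_(sum);

    TB_TEST_MODULE_END_(test_i8_add, (42), 0);
    return status;
}

The driver in tb/unittests/tb_unittests.c then runs it through TEST(i8_add), which prints the test name and counts the returned status toward the pass/fail totals.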