diff --git a/software/apps/baremetal/mimo_mmse_q16/main.c b/software/apps/baremetal/mimo_mmse_q16/main.c index 8e2b557a4..10ae6327e 100644 --- a/software/apps/baremetal/mimo_mmse_q16/main.c +++ b/software/apps/baremetal/mimo_mmse_q16/main.c @@ -22,18 +22,29 @@ Parameters and defines PARALLEL: When defined benchmark parallel MIMO-MMSE. SINGLE: When defined benchmark single-core MIMO-MMSE. +FOLD: When defined 1 fold matrices in memory. */ -int16_t l1_H[2 * N_TX * N_RX * N_ITR] - __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)), - section(".l1_prio"))); +#define FOLD (1) +#define PARALLEL + +#if FOLD +#define NUM_ROW (1 + ((N_ITR * N_TX - 1) / NUM_BANKS)) +#define NUM_COL (NUM_BANKS / N_TX) + +int16_t l1_G[2 * N_TX * NUM_BANKS * NUM_ROW] + __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); +int16_t l1_L[2 * N_TX * NUM_BANKS * NUM_ROW] + __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); +#else int16_t l1_G[2 * N_TX * N_TX * N_ITR] - __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)), - section(".l1_prio"))); + __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); int16_t l1_L[2 * N_TX * N_TX * N_ITR] - __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)), - section(".l1_prio"))); + __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); +#endif +int16_t l1_H[2 * N_TX * N_RX * N_ITR] + __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); int16_t l1_S[2 * N_TX * N_ITR] __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); int16_t l1_y[2 * N_RX * N_ITR] @@ -51,12 +62,14 @@ int main() { uint32_t core_id = mempool_get_core_id(); uint32_t num_cores = mempool_get_core_count(); mempool_barrier_init(core_id); // Initialize barrier and synchronize + uint32_t time_init, time_end; /* Initialize matrices */ if (core_id == 0) { dma_memcpy_blocking(l1_H, l2_H, N_TX * N_RX * N_ITR * sizeof(int32_t)); dma_memcpy_blocking(l1_y, l2_y, N_RX * N_ITR * sizeof(int32_t)); dma_memcpy_blocking(l1_S, l2_S, N_TX * N_ITR * sizeof(int32_t)); + printf("Data transferred\n"); } mempool_barrier(num_cores); @@ -65,13 +78,18 @@ int main() { if (core_id == 0) { mempool_start_benchmark(); - mempool_hermitian_q16vecs((v2s *)l1_H, (v2s *)l1_G, (v2s *)l1_Sigma, N_RX, - N_TX); - mempool_MVP_conjtransp_q16vecs((v2s *)l1_H, (v2s *)l1_y, (v2s *)y2, N_RX, - N_TX, 0); - mempool_cholesky_q16vecs(l1_G, l1_L, N_TX); - mempool_Ltrisol_q16vecs(l1_L, y2, y3, N_TX, 0); - mempool_Ltrisol_q16vecs(l1_L, y3, l1_x, N_TX, 1); + time_init = mempool_get_timer(); + v2s *PtrH = (v2s *)l1_H; + v2s *PtrG = (v2s *)l1_G; + v2s *PtrS = (v2s *)l1_Sigma; + v2s *Ptry = (v2s *)l1_y; + v2s *Ptry2 = (v2s *)y2; + mempool_hermitian_q16vecs(PtrH, PtrG, PtrS, N_RX, N_TX); + mempool_MVP_conjtransp_q16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX, FOLD); + mempool_cholesky_q16vecs(l1_G, l1_L, N_TX, FOLD); + mempool_Ltrisol_q16vecs(l1_L, y2, y3, N_TX, 0, FOLD); + mempool_Ltrisol_q16vecs(l1_L, y3, l1_x, N_TX, 1, FOLD); + time_end = mempool_get_timer(); mempool_stop_benchmark(); } mempool_barrier(num_cores); @@ -81,30 +99,49 @@ int main() { #ifdef PARALLEL mempool_start_benchmark(); + time_init = mempool_get_timer(); for (uint32_t itr = core_id; itr < N_ITR; itr += num_cores) { int16_t *PtrH = l1_H + itr * (2 * N_TX * N_RX); int16_t *Ptry = l1_y + itr * (2 * N_RX); - int16_t *PtrSigma = l1_S + itr * (2 * N_TX); - + int16_t *PtrS = l1_S + itr * (2 * N_TX); + +#if FOLD + int16_t *PtrG = l1_G + (itr / NUM_COL) * (2 * N_TX * NUM_BANKS) + + (itr % NUM_COL) * (2 * N_TX); + int16_t *PtrL = l1_L + (itr / NUM_COL) * (2 * N_TX * NUM_BANKS) + + (itr % NUM_COL) * (2 * N_TX); + int16_t *Ptry2 = + y2 + (itr / NUM_COL) * (2 * NUM_BANKS) + (itr % NUM_COL) * (2 * N_TX); + int16_t *Ptry3 = + y3 + (itr / NUM_COL) * (2 * NUM_BANKS) + (itr % NUM_COL) * (2 * N_TX); + int16_t *Ptrx = l1_x + itr * (2 * N_TX); +#else int16_t *PtrG = l1_G + itr * (2 * N_TX * N_TX); int16_t *PtrL = l1_L + itr * (2 * N_TX * N_TX); int16_t *Ptry2 = y2 + itr * (2 * N_TX); int16_t *Ptry3 = y3 + itr * (2 * N_TX); int16_t *Ptrx = l1_x + itr * (2 * N_TX); +#endif - mempool_hermitian_q16vecs((v2s *)PtrH, (v2s *)PtrG, (v2s *)PtrSigma, N_RX, + mempool_hermitian_q16vecs((v2s *)PtrH, (v2s *)PtrG, (v2s *)PtrS, N_RX, N_TX); mempool_MVP_conjtransp_q16vecs((v2s *)PtrH, (v2s *)Ptry, (v2s *)Ptry2, N_RX, - N_TX, 0); - mempool_cholesky_q16vecs(PtrG, PtrL, N_TX); - mempool_Ltrisol_q16vecs(PtrL, Ptry2, Ptry3, N_TX, 0); - mempool_Ltrisol_q16vecs(PtrL, Ptry3, Ptrx, N_TX, 1); + N_TX, FOLD); + mempool_cholesky_q16vecs(PtrG, PtrL, N_TX, FOLD); + mempool_Ltrisol_q16vecs(PtrL, Ptry2, Ptry3, N_TX, 0, FOLD); + mempool_Ltrisol_q16vecs(PtrL, Ptry3, Ptrx, N_TX, 1, FOLD); } - mempool_log_barrier(2, core_id); + mempool_barrier(num_cores); + time_end = mempool_get_timer(); mempool_stop_benchmark(); #endif + if (core_id == 0) { + printf("Runtime: %d\n", time_end - time_init); + } + mempool_barrier(num_cores); + return 0; } diff --git a/software/kernels/baremetal/mempool_cholesky_q16s.h b/software/kernels/baremetal/mempool_cholesky_q16s.h index fe7c2bd8a..bd5ae83f7 100644 --- a/software/kernels/baremetal/mempool_cholesky_q16s.h +++ b/software/kernels/baremetal/mempool_cholesky_q16s.h @@ -16,13 +16,15 @@ @param[in] n dimension of the input data @return none */ -void mempool_cholesky_q16vecs(int16_t *pSrc, int16_t *pL, const uint32_t n) { +void mempool_cholesky_q16vecs(int16_t *pSrc, int16_t *pL, const uint32_t n, + const uint32_t folded) { - uint32_t i, j, k; int32_t sum; // Sum for elements on diagonal (real) int32_t diag; // Diagonal element (real) int32_t as, bs; // Sum for elements on rows (complex) int32_t ap, bp; // Pivot elements (complex) + uint32_t i, j, k; + const uint32_t offset = folded ? NUM_BANKS : n; v2s ab = (v2s){0, 0}; v2s cd = (v2s){0, 0}; @@ -33,9 +35,9 @@ void mempool_cholesky_q16vecs(int16_t *pSrc, int16_t *pL, const uint32_t n) { // Elements on diagonal (input matrix is positive-definite) sum = 0; - diag = (int32_t)pSrc[2 * (j * n + j)]; + diag = (int32_t)pSrc[2 * (j * offset + j)]; for (k = 0; k < j; k++) { - ab = *(v2s *)&pL[2 * (j * n + k)]; + ab = *(v2s *)&pL[2 * (j * offset + k)]; asm volatile("pv.dotsp.h %[sum], %[ab], %[ab];" "srai %[sum], %[sum], 0x8;" "p.clip %[sum], %[sum], 0x16;" @@ -43,20 +45,20 @@ void mempool_cholesky_q16vecs(int16_t *pSrc, int16_t *pL, const uint32_t n) { : [ab] "r"(ab) :); } - pL[2U * (j * n + j)] = (int16_t)mempool_sqrt_q32s(diag - sum, 16); + pL[2U * (j * offset + j)] = (int16_t)mempool_sqrt_q32s(diag - sum, 16); // Elements on rows for (i = j + 1; i < n; i++) { - ap = (int32_t)pSrc[2 * (i * n + j)]; // Pivot - bp = (int32_t)pSrc[2 * (i * n + j) + 1]; // Pivot - diag = (int32_t)pL[2 * (j * n + j)]; // Diag + ap = (int32_t)pSrc[2 * (i * offset + j)]; // Pivot + bp = (int32_t)pSrc[2 * (i * offset + j) + 1]; // Pivot + diag = (int32_t)pL[2 * (j * offset + j)]; // Diag as = 0; bs = 0; // Sum -> s = s + (ac + bd) + j*(bc - ad) for (k = 0; k < j; k++) { - ab = *(v2s *)&pL[2U * (i * n + k)]; - cd = *(v2s *)&pL[2U * (j * n + k)]; + ab = *(v2s *)&pL[2U * (i * offset + k)]; + cd = *(v2s *)&pL[2U * (j * offset + k)]; const uint32_t shuffle_mask = 0x00020003; asm volatile( // s = s + (ac + bd) + j(bc - ad) @@ -81,7 +83,7 @@ void mempool_cholesky_q16vecs(int16_t *pSrc, int16_t *pL, const uint32_t n) { : [ap] "+&r"(ap), [bp] "+&r"(bp), [res] "+&r"(res) : [as] "r"(as), [bs] "r"(bs), [diag] "r"(diag) :); - (*(v2s *)&pL[2 * (i * n + j)]) = res; + (*(v2s *)&pL[2 * (i * offset + j)]) = res; } } return; diff --git a/software/kernels/baremetal/mempool_linearsolver_q16s.h b/software/kernels/baremetal/mempool_linearsolver_q16s.h index 39bf46394..c325e1fbb 100644 --- a/software/kernels/baremetal/mempool_linearsolver_q16s.h +++ b/software/kernels/baremetal/mempool_linearsolver_q16s.h @@ -17,26 +17,29 @@ */ void mempool_Ltrisol_q16vecs(int16_t *pL, int16_t *y, int16_t *x, - const uint32_t n, const uint32_t transposed) { + const uint32_t n, const uint32_t transposed, + const uint32_t folded) { uint32_t i, j; int32_t as, bs, diag; v2s ab, cd; v2s res = (v2s){0, 0}; v2s ndc = (v2s){0, 0}; + const uint32_t offset = folded ? NUM_BANKS : n; // Solve for each variable x[i] in loop for (i = 0; i < n; i++) { uint32_t ridx = transposed ? (n - i - 1) : i; - diag = pL[2U * (ridx + ridx)]; + diag = pL[2U * (ridx * offset + ridx)]; // Initialize the sums as = 0; bs = 0; // Use the previously solved variables to compute the sum for (j = 0; j < i; j++) { + uint32_t cidx = transposed ? (n - j - 1) : j; if (!transposed) { - ab = *(v2s *)&pL[2U * (ridx * n + cidx)]; + ab = *(v2s *)&pL[2U * (ridx * offset + cidx)]; } else { ab = *(v2s *)&pL[2U * (cidx * n + ridx)]; }