Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Correct AVX2 instructions in load_kernel_x86_avx.S file #7

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/code/lec04-roofline/asm/cpufp_kernel_x86_sse.S
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@ cpufp_kernel_x86_sse_fp64:
addpd %xmm5, %xmm5
mulpd %xmm6, %xmm6
addpd %xmm7, %xmm7
sub $0x1, %rdi
mulpd %xmm8, %xmm8
addpd %xmm9, %xmm9
mulpd %xmm10, %xmm10
Expand All @@ -74,6 +73,7 @@ cpufp_kernel_x86_sse_fp64:
addpd %xmm13, %xmm13
mulpd %xmm14, %xmm14
addpd %xmm15, %xmm15
sub $0x1, %rdi
jne .cpufp.x86.sse.fp64.L1
ret

14 changes: 7 additions & 7 deletions src/code/lec04-roofline/asm/load_kernel_x86_avx.S
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@ load_kernel_x86_avx:
#first parameter, len is in rdi
#second parameter, src is in rsi
.READ_LOOP:
addq $128, %rsi
vmovdqa64 -128(%rsi), %ymm0
vmovdqa64 -96(%rsi), %ymm1
vmovdqa64 -64(%rsi), %ymm2
vmovdqa64 -32(%rsi), %ymm3
addq $128, %rax
cmpq %rax, %rdi
addq $128, %rsi
vmovdqa -128(%rsi), %ymm0
vmovdqa -96(%rsi), %ymm1
vmovdqa -64(%rsi), %ymm2
vmovdqa -32(%rsi), %ymm3
addq $128, %rax
cmpq %rax, %rdi
jne .READ_LOOP

pop %rbp
Expand Down
23 changes: 17 additions & 6 deletions src/code/lec04-roofline/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,34 @@ g++ -g -O3 -c smtl.cpp
ins_set=`lscpu | grep Flags`
compile_commands=""

link_sources="g++ -pthread -O3 -o pe_bench table.o smtl.o cpubm_x86.o pe_bench.o"
link_sources="g++ -pthread -O0 -o pe_bench table.o smtl.o cpubm_x86.o pe_bench.o"
inst_flags=""

if [[ $ins_set =~ "sse" ]];
then
echo "sse supported"
inst_flags=${inst_flags}" -DSSE"
compile_commands=${compile_commands}"g++ -c asm/cpufp_kernel_x86_sse.S;"
compile_commands=${compile_commands}"g++ -c -O0 asm/cpufp_kernel_x86_sse.S;"
link_sources=${link_sources}" cpufp_kernel_x86_sse.o"
#mem
compile_commands=${compile_commands}"g++ -c -O0 asm/load_kernel_x86_sse.S;"
link_sources=${link_sources}" load_kernel_x86_sse.o"
fi

if [[ $ins_set =~ "fma" ]];
then
echo "fma supported"
inst_flags=${inst_flags}" -DFMA"
compile_commands=${compile_commands}"g++ -c -O0 asm/cpufp_kernel_x86_fma.S;"
link_sources=${link_sources}" cpufp_kernel_x86_fma.o"
fi

#avx instruction set check
if [[ $ins_set =~ "avx2" ]];
then
echo "avx2 supported"
inst_flags=${inst_flags}" -DAVX"
compile_commands=${compile_commands}"g++ -c asm/cpufp_kernel_x86_avx.S;"
compile_commands=${compile_commands}"g++ -c -O0 asm/cpufp_kernel_x86_avx.S;"
link_sources=${link_sources}" cpufp_kernel_x86_avx.o"
#mem
compile_commands=${compile_commands}"g++ -c -O0 asm/load_kernel_x86_avx.S;"
Expand All @@ -35,7 +44,7 @@ if [[ $ins_set =~ "avx512" ]];
then
echo "avx512 supported"
inst_flags=${inst_flags}" -DAVX512"
compile_commands=${compile_commands}"g++ -c asm/cpufp_kernel_x86_avx512f.S;"
compile_commands=${compile_commands}"g++ -c -O0 asm/cpufp_kernel_x86_avx512f.S;"
link_sources=${link_sources}" cpufp_kernel_x86_avx512f.o"
#mem
compile_commands=${compile_commands}"g++ -c -O0 asm/load_kernel_x86_avx512.S;"
Expand All @@ -47,21 +56,23 @@ if [[ $ins_set =~ "avx512_vnni" ]];
then
echo "avx512vnni supported"
inst_flags=${inst_flags}" -DAVX512_VNNI"
compile_commands=${compile_commands}"g++ -c asm/cpufp_kernel_x86_avx512_vnni.S;"
compile_commands=${compile_commands}"g++ -c -O0 asm/cpufp_kernel_x86_avx512_vnni.S;"
link_sources=${link_sources}" cpufp_kernel_x86_avx512_vnni.o"
fi

#avx_vnni instruction set check
if [[ $ins_set =~ "avx_vnni" ]];
then
echo "avx_vnni supported"
compile_commands=${compile_commands}"echo avx_vnni;"
inst_flags=${inst_flags}" -DAVX_VNNI"
compile_commands=${compile_commands}"g++ -c -O0 asm/cpufp_kernel_x86_avx_vnni.S;"
link_sources=${link_sources}" cpufp_kernel_x86_avx_vnni.o"
fi

echo ${inst_flags}
g++ -g -O2 -c cpubm_x86.cpp ${inst_flags}
g++ -g -O2 -c pe_bench.cpp ${inst_flags}
echo ${compile_commands}
eval ${compile_commands}
echo ${link_sources}
eval ${link_sources}
Expand Down
43 changes: 31 additions & 12 deletions src/code/lec04-roofline/pe_bench.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,38 +113,57 @@ static void register_isa()
0x20000000LL, 640LL,
cpufp_kernel_x86_avx512_vnni_int16);
#endif

#ifdef AVX512
reg_new_fp_bench("AVX512F", "FP32", "GFLOPS",
0x20000000LL, 320LL,
cpufp_kernel_x86_avx512f_fp32);
reg_new_fp_bench("AVX512F", "FP64", "GFLOPS",
0x20000000LL, 160LL,
cpufp_kernel_x86_avx512f_fp64);
reg_new_mem_bench("AVX512", "load A[i]", "GB/s",
50, 1024*1024*32, load_kernel_x86_avx512);
#endif

#ifdef AVX_VNNI
reg_new_fp_bench("AVX_VNNI", "INT8", "GFLOPS",
0x40000000LL, 640LL,
cpufp_kernel_x86_avx_vnni_int8);
reg_new_fp_bench("AVX_VNNI", "INT16", "GFLOPS",
0x40000000LL, 320LL,
cpufp_kernel_x86_avx_vnni_int16);
#endif

#ifdef AVX
reg_new_fp_bench("AVX", "FP32", "GFLOPS",
0x40000000LL, 96LL,
cpufp_kernel_x86_avx_fp32);
reg_new_fp_bench("AVX", "FP64", "GFLOPS",
0x40000000LL, 48LL,
cpufp_kernel_x86_avx_fp64);
#endif
#ifdef SSE
reg_new_mem_bench("SSE", "load A[i]", "GB/s",
50, 1024*1024*32, load_kernel_x86_sse);
#endif

#ifdef AVX
reg_new_mem_bench("AVX", "load A[i]", "GB/s",
50, 1024*1024*32, load_kernel_x86_avx);
50, 1024*1024*32, load_kernel_x86_avx);
#endif

#ifdef AVX512
reg_new_mem_bench("AVX512", "load A[i]", "GB/s",
50, 1024*1024*32, load_kernel_x86_avx512);
#ifdef FMA
reg_new_fp_bench("FMA", "FP32", "GFLOPS",
0x80000000LL, 160LL,
cpufp_kernel_x86_fma_fp32);
reg_new_fp_bench("FMA", "FP64", "GFLOPS",
0x80000000LL, 80LL,
cpufp_kernel_x86_fma_fp64);
#endif


#ifdef SSE
reg_new_fp_bench("SSE", "FP32", "GFLOPS",
0x80000000LL, 64LL,
cpufp_kernel_x86_sse_fp32);
reg_new_fp_bench("SSE", "FP64", "GFLOPS",
0x80000000LL, 32LL,
cpufp_kernel_x86_sse_fp64);
reg_new_mem_bench("SSE", "load A[i]", "GB/s",
50, 1024*1024*32, load_kernel_x86_sse);
#endif
}

int main(int argc, char *argv[])
Expand Down