Skip to content

Commit

Permalink
Merge pull request #208 from opencompl/sasha/matmul-4x4
Browse files Browse the repository at this point in the history
add 4x4 matmul
  • Loading branch information
compor authored Apr 8, 2024
2 parents 88f979f + 6e0734d commit 85befed
Show file tree
Hide file tree
Showing 13 changed files with 211 additions and 1 deletion.
11 changes: 10 additions & 1 deletion kernels/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,15 @@ dsum/params.csv: $(DSUM_8_16)/tests.csv
python3 generate_params.py "params" $@ $^


# 4x4xf64 matmul: per-implementation result CSVs that feed tests.csv.
MATMUL_4 = matmul/4x4xf64

# One "+=" line per implementation, so adding one is a one-line change.
# NOTE(review): only snitch_stream.csv is listed for this size — confirm
# whether any other result CSV in $(MATMUL_4) is meant to be included.
MATMUL_4_TESTS =
MATMUL_4_TESTS += $(MATMUL_4)/snitch_stream.csv

# generate_tests.py is given the output path ($@) followed by every
# prerequisite CSV ($^).
# The recipe line must start with a hard TAB; without it make stops with
# "missing separator".
$(MATMUL_4)/tests.csv: $(MATMUL_4_TESTS)
	python3 generate_tests.py $@ $^


MATMUL_8 = matmul/8x8xf64

MATMUL_8_TESTS =
Expand All @@ -119,7 +128,7 @@ MATMUL_8_TESTS += $(MATMUL_8)/snrt.csv
# generate_tests.py is given the output path ($@) followed by every
# prerequisite CSV ($^).
# The recipe line must start with a hard TAB; without it make stops with
# "missing separator".
$(MATMUL_8)/tests.csv: $(MATMUL_8_TESTS)
	python3 generate_tests.py $@ $^

matmul/params.csv: $(MATMUL_8)/tests.csv
# Build matmul/params.csv from the tests.csv of every matmul size.
# generate_params.py is given the literal "params", the output path ($@) and
# every prerequisite ($^).
# The recipe line must start with a hard TAB; without it make stops with
# "missing separator".
matmul/params.csv: $(MATMUL_8)/tests.csv $(MATMUL_4)/tests.csv
	python3 generate_params.py "params" $@ $^


Expand Down
1 change: 1 addition & 0 deletions kernels/kernels.csv
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ matmul,8x8xf64,baseline,2517,3262,3259,2.9941520467836256,1.0,513,1536,0.2038140
matmul,8x8xf64,linalg,2829,3579,3576,2.9941520467836256,1.072992700729927,513,1536,0.1813361611876988,0.4347457627118644,1180,588,548,0.4171085189112761,0,119.0,1.0,1.0,3.0,1180,0.8160442600276625,266,15,5,0.0940261576528808,751,0.0,0.511134676564157,0.0,133.0
matmul,8x8xf64,snitch_stream,648,1402,1398,2.9941520467836256,0.0,513,1536,0.7916666666666666,0.996116504854369,515,0,0,0.7947530864197531,0,,2.4407582938388623,2.440758293838863,0.0,211,0.717687074829932,83,0,0,0.1280864197530864,755,0.0,0.9228395061728396,0.0,
matmul,8x8xf64,snrt,2325,3075,3072,2.593457943925233,0.0,642,1665,0.2761290322580645,0.9968944099378882,644,0,0,0.2769892473118279,0,,2.476923076923077,2.476923076923077,0.0,260,0.8904109589041096,32,0,0,0.013763440860215,751,0.0,0.290752688172043,0.0,
matmul,4x4xf64,snitch_stream,207,948,945,2.953846153846154,0.0,65,192,0.3140096618357488,0.9701492537313432,67,0,0,0.323671497584541,0,,1.2181818181818185,1.2181818181818185,0.0,55,0.4867256637168141,58,0,0,0.2801932367149758,742,0.0,0.6038647342995169,0.0,
pooling_nchw_max_d1_s2_3x3,1x1x16x16xf64,baseline,1447,2188,2185,0.997737556561086,1.0,442,441,0.3054595715272978,0.4505606523955148,981,490,490,0.677954388389772,0,49.0,1.0,1.0,0.0,981,0.7898550724637681,261,0,0,0.1803731859018659,742,0.0,0.8583275742916379,0.0,
pooling_nchw_max_d1_s2_3x3,1x1x16x16xf64,linalg,1194,1927,1924,0.997737556561086,1.0,442,441,0.3701842546063651,0.5169590643274854,855,364,364,0.7160804020100503,0,49.0,1.0,1.0,0.0,855,0.9574468085106383,38,0,0,0.0318257956448911,734,0.0,0.7479061976549414,0.0,
pooling_nchw_max_d1_s2_3x3,1x1x16x16xf64,snitch_stream,1114,1839,1836,0.997737556561086,1.0,442,441,0.3967684021543985,0.8154981549815498,542,49,49,0.4865350089766607,0,49.0,2.7236180904522618,2.7236180904522613,0.0,199,0.5320855614973262,175,0,0,0.1570915619389587,726,0.0,0.6436265709156194,0.0,
Expand Down
8 changes: 8 additions & 0 deletions kernels/matmul/4x4xf64/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Makefile for the 4x4xf64 matmul kernel directory.
# A plain `make` builds the `all` goal.
.DEFAULT_GOAL := all

# Shared rules from the snitch/ directory three levels up.
include ../../../snitch/Makefile.rules

# Test targets for this kernel, one "+=" line per implementation.
# NOTE(review): presumably consumed by Makefile.kernels below — confirm there.
TESTS =
TESTS += snitch_stream.x

# Kernel build rules shared by the kernel directories two levels up.
include ../../Makefile.kernels
83 changes: 83 additions & 0 deletions kernels/matmul/4x4xf64/data.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
/* Matrix dimensions: X is M x K, Y is K x N, G_IN and G_OUT are M x N. */
#define M 4
#define K 4
#define N 4

/* Left operand, M x K, one matrix row per source line. */
const double X[M * K] = {
    97.62700785, 430.37873274, 205.52675214, 89.76636599,
    -152.69040132, 291.78822613, -124.82557747, 783.54600156,
    927.325521, -233.11696235, 583.45007617, 57.78983951,
    136.08912219, 851.19327659, -857.9278836, -825.7414006
};

/* Right operand, K x N, one matrix row per source line. */
const double Y[K * N] = {
    -959.56320512, 665.2396911, 556.3135019, 740.02429649,
    957.23668447, 598.31712843, -77.04127549, 561.05835257,
    -763.45114826, 279.84204266, -713.29342518, 889.3378341,
    43.6966435, -170.67612002, -470.88877579, 548.46737887
};

/* NOTE(review): presumably the initial contents of the M x N output buffer;
 * the G_OUT values below do not include it as an addend — confirm intent. */
const double G_IN[M * N] = {
    -87.69933557, 136.86789774, -962.42039913, 235.27099415,
    224.19144544, 233.86799375, 887.49615703, 363.64059821,
    -280.98419885, -125.9360924, 395.26239185, -879.54905674,
    333.53343089, 341.27573924, -579.23487785, -742.14740469
};

/* Expected M x N result; spot-checked entries are consistent with the
 * row-major product X * Y. */
const double G_OUT[M * N] = {
    165307.88060468, 364642.37914657, -167716.5590894, 545730.58065036,
    555363.0457854, -95657.85781753, -387349.2227266, 369452.92762403,
    -1555885.96584004, 630826.38715326, 90459.64607962, 1106031.26628897,
    1303111.21590797, 500665.44958852, 1010917.87631656, -637601.5930368
};

10 changes: 10 additions & 0 deletions kernels/matmul/4x4xf64/data.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#pragma once

/* Matrix dimensions: X is M x K, Y is K x N, G_IN and G_OUT are M x N. */
#define M 4
#define K 4
#define N 4

/* Operand matrices, stored as flat arrays of doubles. */
extern const double X[M * K], Y[K * N];

/* M x N arrays associated with the output matrix G. */
extern const double G_IN[M * N], G_OUT[M * N];
2 changes: 2 additions & 0 deletions kernels/matmul/4x4xf64/snitch_stream.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
cycles,end,end_fpss,fpss_avg_fpu_latency,fpss_avg_load_latency,fpss_fpu_issues,fpss_fpu_latency,fpss_fpu_occupancy,fpss_fpu_rel_occupancy,fpss_issues,fpss_load_latency,fpss_loads,fpss_occupancy,fpss_section_latency,fseq_fpu_yield,fseq_yield,snitch_avg_load_latency,snitch_fseq_offloads,snitch_fseq_rel_offloads,snitch_issues,snitch_load_latency,snitch_loads,snitch_occupancy,start,tend,total_ipc,tstart
207,948,945,2.953846153846154,0,65,192,0.3140096618357488,0.9701492537313433,67,0,0,0.32367149758454106,0,1.2181818181818183,1.2181818181818183,0,55,0.48672566371681414,58,0,0,0.28019323671497587,742,0.0,0.6038647342995169,0.0
89 changes: 89 additions & 0 deletions kernels/matmul/4x4xf64/snitch_stream.xdsl.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
// Hand-written 4x4xf64 matmul kernel: X and Y are read through two readable
// streams and G is written through one writable stream.
riscv.assembly_section ".text" {
riscv.directive ".globl" "matmul"
riscv.directive ".p2align" "2"

// x[ M x K ]
// y[ K x N ]
// g[ M x N ]
// Arguments arrive in a0 (X), a1 (Y) and a2 (G).
riscv_func.func @matmul(
%X : !riscv.reg<a0>,
%Y : !riscv.reg<a1>,
%G : !riscv.reg<a2>
) {
// Copy the arguments into unallocated registers before handing them to the
// streaming region.
%X_moved = riscv.mv %X : (!riscv.reg<a0>) -> !riscv.reg<>
%Y_moved = riscv.mv %Y : (!riscv.reg<a1>) -> !riscv.reg<>
%G_moved = riscv.mv %G : (!riscv.reg<a2>) -> !riscv.reg<>

// %c0/%c16/%c4 are the loop bounds and step below; %c1 is the frep_outer operand.
// NOTE(review): %c5, %c6, %c7 and %c8 are not referenced anywhere in this
// function — confirm whether they can be dropped.
%c0 = riscv.get_register : () -> !riscv.reg<zero>
%c1 = riscv.li 1 : () -> !riscv.reg<>
%c4 = riscv.li 4 : () -> !riscv.reg<>
%c5 = riscv.li 5 : () -> !riscv.reg<>
%c6 = riscv.li 6 : () -> !riscv.reg<>
%c7 = riscv.li 7 : () -> !riscv.reg<>
%c8 = riscv.li 8 : () -> !riscv.reg<>
%c16 = riscv.li 16 : () -> !riscv.reg<>

// One stride pattern per streamed operand, in operand order (X, Y, G);
// operandSegmentSizes marks two inputs and one output.
// NOTE(review): strides look like byte offsets (8 = one f64, 32 = one row of
// four f64) with the outermost dimension listed first — confirm against the
// snitch_stream.stride_pattern definition.
"snitch_stream.streaming_region"(%X_moved, %Y_moved, %G_moved) <{
"stride_patterns" = [
#snitch_stream.stride_pattern<ub = [4, 1, 4, 4], strides = [32, 0, 8, 0]>,
#snitch_stream.stride_pattern<ub = [4, 1, 4, 4], strides = [0, 32, 32, 8]>,
#snitch_stream.stride_pattern<ub = [16], strides = [8]>
],
"operandSegmentSizes" = array<i32: 2, 1>
}> ({
^bb0(%X_stream : !stream.readable<!riscv.freg<ft0>>, %Y_stream : !stream.readable<!riscv.freg<ft1>>, %G_stream : !stream.writable<!riscv.freg<ft2>>):
// Four iterations (0, 4, 8, 12); each iteration writes four values to G_stream.
riscv_scf.for %g_i : !riscv.reg<> = %c0 to %c16 step %c4 {
// First product for each of the four accumulators (fmul, no addend).
%x00 = riscv_snitch.read from %X_stream : !riscv.freg<ft0>
%y00 = riscv_snitch.read from %Y_stream : !riscv.freg<ft1>
%init0 = riscv.fmul.d %x00, %y00 : (!riscv.freg<ft0>, !riscv.freg<ft1>) -> !riscv.freg<>
%x01 = riscv_snitch.read from %X_stream : !riscv.freg<ft0>
%y01 = riscv_snitch.read from %Y_stream : !riscv.freg<ft1>
%init1 = riscv.fmul.d %x01, %y01 : (!riscv.freg<ft0>, !riscv.freg<ft1>) -> !riscv.freg<>
%x02 = riscv_snitch.read from %X_stream : !riscv.freg<ft0>
%y02 = riscv_snitch.read from %Y_stream : !riscv.freg<ft1>
%init2 = riscv.fmul.d %x02, %y02 : (!riscv.freg<ft0>, !riscv.freg<ft1>) -> !riscv.freg<>
%x03 = riscv_snitch.read from %X_stream : !riscv.freg<ft0>
%y03 = riscv_snitch.read from %Y_stream : !riscv.freg<ft1>
%init3 = riscv.fmul.d %x03, %y03 : (!riscv.freg<ft0>, !riscv.freg<ft1>) -> !riscv.freg<>

// Repeated fmadd step carrying the four accumulators as iter_args.
// NOTE(review): presumably the body runs %c1 + 1 = 2 times, giving K = 4
// products per output together with the fmul above and the final fmadd
// below — confirm against the riscv_snitch.frep_outer semantics.
%g00, %g01, %g02, %g03 = riscv_snitch.frep_outer %c1 iter_args(%acc0 = %init0, %acc1 = %init1, %acc2 = %init2, %acc3 = %init3) -> (!riscv.freg<>, !riscv.freg<>, !riscv.freg<>, !riscv.freg<>) {
%x10 = riscv_snitch.read from %X_stream : !riscv.freg<ft0>
%y10 = riscv_snitch.read from %Y_stream : !riscv.freg<ft1>
%res0 = riscv.fmadd.d %x10, %y10, %acc0 : (!riscv.freg<ft0>, !riscv.freg<ft1>, !riscv.freg<>) -> !riscv.freg<>
%x11 = riscv_snitch.read from %X_stream : !riscv.freg<ft0>
%y11 = riscv_snitch.read from %Y_stream : !riscv.freg<ft1>
%res1 = riscv.fmadd.d %x11, %y11, %acc1 : (!riscv.freg<ft0>, !riscv.freg<ft1>, !riscv.freg<>) -> !riscv.freg<>
%x12 = riscv_snitch.read from %X_stream : !riscv.freg<ft0>
%y12 = riscv_snitch.read from %Y_stream : !riscv.freg<ft1>
%res2 = riscv.fmadd.d %x12, %y12, %acc2 : (!riscv.freg<ft0>, !riscv.freg<ft1>, !riscv.freg<>) -> !riscv.freg<>
%x13 = riscv_snitch.read from %X_stream : !riscv.freg<ft0>
%y13 = riscv_snitch.read from %Y_stream : !riscv.freg<ft1>
%res3 = riscv.fmadd.d %x13, %y13, %acc3 : (!riscv.freg<ft0>, !riscv.freg<ft1>, !riscv.freg<>) -> !riscv.freg<>

riscv_snitch.frep_yield %res0, %res1, %res2, %res3 : !riscv.freg<>, !riscv.freg<>, !riscv.freg<>, !riscv.freg<>
}

// Final fmadd per accumulator; the result lands in ft2 and is written to G_stream.
%x20 = riscv_snitch.read from %X_stream : !riscv.freg<ft0>
%y20 = riscv_snitch.read from %Y_stream : !riscv.freg<ft1>
%g10 = riscv.fmadd.d %x20, %y20, %g00 : (!riscv.freg<ft0>, !riscv.freg<ft1>, !riscv.freg<>) -> !riscv.freg<ft2>
riscv_snitch.write %g10 to %G_stream : !riscv.freg<ft2>
%x21 = riscv_snitch.read from %X_stream : !riscv.freg<ft0>
%y21 = riscv_snitch.read from %Y_stream : !riscv.freg<ft1>
%g11 = riscv.fmadd.d %x21, %y21, %g01 : (!riscv.freg<ft0>, !riscv.freg<ft1>, !riscv.freg<>) -> !riscv.freg<ft2>
riscv_snitch.write %g11 to %G_stream : !riscv.freg<ft2>
%x22 = riscv_snitch.read from %X_stream : !riscv.freg<ft0>
%y22 = riscv_snitch.read from %Y_stream : !riscv.freg<ft1>
%g12 = riscv.fmadd.d %x22, %y22, %g02 : (!riscv.freg<ft0>, !riscv.freg<ft1>, !riscv.freg<>) -> !riscv.freg<ft2>
riscv_snitch.write %g12 to %G_stream : !riscv.freg<ft2>
%x23 = riscv_snitch.read from %X_stream : !riscv.freg<ft0>
%y23 = riscv_snitch.read from %Y_stream : !riscv.freg<ft1>
%g13 = riscv.fmadd.d %x23, %y23, %g03 : (!riscv.freg<ft0>, !riscv.freg<ft1>, !riscv.freg<>) -> !riscv.freg<ft2>
riscv_snitch.write %g13 to %G_stream : !riscv.freg<ft2>

riscv_scf.yield
}
}) : (!riscv.reg<>, !riscv.reg<>, !riscv.reg<>) -> ()

riscv_func.return
}
}
2 changes: 2 additions & 0 deletions kernels/matmul/4x4xf64/snrt.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
cycles,end,end_fpss,fpss_avg_fpu_latency,fpss_avg_load_latency,fpss_fpu_issues,fpss_fpu_latency,fpss_fpu_occupancy,fpss_fpu_rel_occupancy,fpss_issues,fpss_load_latency,fpss_loads,fpss_occupancy,fpss_section_latency,fseq_fpu_yield,fseq_yield,snitch_avg_load_latency,snitch_fseq_offloads,snitch_fseq_rel_offloads,snitch_issues,snitch_load_latency,snitch_loads,snitch_occupancy,start,tend,total_ipc,tstart
2325,3075,3072,2.5934579439252334,0,642,1665,0.2761290322580645,0.9968944099378882,644,0,0,0.27698924731182795,0,2.476923076923077,2.476923076923077,0,260,0.8904109589041096,32,0,0,0.013763440860215054,751,0.0,0.29075268817204303,0.0
2 changes: 2 additions & 0 deletions kernels/matmul/4x4xf64/tests.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
impl,cycles,end,end_fpss,fpss_avg_fpu_latency,fpss_avg_load_latency,fpss_fpu_issues,fpss_fpu_latency,fpss_fpu_occupancy,fpss_fpu_rel_occupancy,fpss_issues,fpss_load_latency,fpss_loads,fpss_occupancy,fpss_section_latency,fseq_fpu_yield,fseq_yield,snitch_avg_load_latency,snitch_fseq_offloads,snitch_fseq_rel_offloads,snitch_issues,snitch_load_latency,snitch_loads,snitch_occupancy,start,tend,total_ipc,tstart
snitch_stream,207,948,945,2.953846153846154,0,65,192,0.3140096618357488,0.9701492537313432,67,0,0,0.323671497584541,0,1.2181818181818185,1.2181818181818185,0,55,0.4867256637168141,58,0,0,0.2801932367149758,742,0.0,0.6038647342995169,0.0
1 change: 1 addition & 0 deletions kernels/matmul/params.csv
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ params,impl,cycles,end,end_fpss,fpss_avg_fpu_latency,fpss_avg_load_latency,fpss_
8x8xf64,linalg,2829,3579,3576,2.9941520467836256,1.072992700729927,513,1536,0.1813361611876988,0.4347457627118644,1180,588,548,0.4171085189112761,0,119.0,1.0,1.0,3.0,1180,0.8160442600276625,266,15,5,0.0940261576528808,751,0.0,0.511134676564157,0.0,133.0
8x8xf64,snitch_stream,648,1402,1398,2.9941520467836256,0.0,513,1536,0.7916666666666666,0.996116504854369,515,0,0,0.7947530864197531,0,,2.4407582938388623,2.440758293838863,0.0,211,0.717687074829932,83,0,0,0.1280864197530864,755,0.0,0.9228395061728396,0.0,
8x8xf64,snrt,2325,3075,3072,2.593457943925233,0.0,642,1665,0.2761290322580645,0.9968944099378882,644,0,0,0.2769892473118279,0,,2.476923076923077,2.476923076923077,0.0,260,0.8904109589041096,32,0,0,0.013763440860215,751,0.0,0.290752688172043,0.0,
4x4xf64,snitch_stream,207,948,945,2.953846153846154,0.0,65,192,0.3140096618357488,0.9701492537313432,67,0,0,0.323671497584541,0,,1.2181818181818185,1.2181818181818185,0.0,55,0.4867256637168141,58,0,0,0.2801932367149758,742,0.0,0.6038647342995169,0.0,
1 change: 1 addition & 0 deletions kernels/pivoted.csv
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ ddot 128xf64,961,957,2245,205,569,957
dense 8x8xf64,3240,3257,,2729,2737,3240
dsum 8x16xf32,1224,809,5480,177,198,809
fill 16x16xf64,349,349,2647,291,289,349
matmul 4x4xf64,,,,207,,
matmul 8x8xf64,2517,2829,,648,2325,2517
pooling_nchw_max_d1_s2_3x3 1x1x16x16xf64,1447,1194,,1114,1099,1194
pooling_nchw_sum_d1_s2_3x3 1x1x16x16xf64,1940,1940,,1996,1981,1940
Expand Down
1 change: 1 addition & 0 deletions kernels/pivoted_fpu.csv
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ ddot 128xf64,0.13,0.13,0.06,0.66,0.23
dense 8x8xf64,0.20,0.20,,0.24,0.23
dsum 8x16xf32,0.11,0.16,0.02,0.73,0.65
fill 16x16xf64,0.00,0.00,0.00,0.89,0.89
matmul 4x4xf64,,,,0.31,
matmul 8x8xf64,0.20,0.18,,0.79,0.28
pooling_nchw_max_d1_s2_3x3 1x1x16x16xf64,0.31,0.37,,0.40,0.40
pooling_nchw_sum_d1_s2_3x3 1x1x16x16xf64,0.23,0.23,,0.22,0.22
Expand Down
1 change: 1 addition & 0 deletions kernels/pivoted_ipc.csv
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ ddot 128xf64,0.94,0.95,0.63,0.76,0.26
dense 8x8xf64,0.53,0.53,,0.39,0.32
dsum 8x16xf32,0.95,0.70,0.57,0.85,0.74
fill 16x16xf64,0.91,0.91,0.70,0.95,0.94
matmul 4x4xf64,,,,0.60,
matmul 8x8xf64,0.46,0.51,,0.92,0.29
pooling_nchw_max_d1_s2_3x3 1x1x16x16xf64,0.86,0.75,,0.64,0.56
pooling_nchw_sum_d1_s2_3x3 1x1x16x16xf64,0.46,0.46,,0.36,0.31
Expand Down

0 comments on commit 85befed

Please sign in to comment.