diff --git a/examples/MLIRLinalg/linalg-batch-matmul.mlir b/examples/MLIRLinalg/linalg-batch-matmul-f32.mlir
similarity index 100%
rename from examples/MLIRLinalg/linalg-batch-matmul.mlir
rename to examples/MLIRLinalg/linalg-batch-matmul-f32.mlir
diff --git a/examples/MLIRLinalg/linalg-batch-matmul-i8.mlir b/examples/MLIRLinalg/linalg-batch-matmul-i8.mlir
new file mode 100644
index 0000000000..7b39258fcf
--- /dev/null
+++ b/examples/MLIRLinalg/linalg-batch-matmul-i8.mlir
@@ -0,0 +1,46 @@
+// RUN: buddy-opt -batchmatmul-optimize -verify-diagnostics -expand-strided-metadata -lower-affine -convert-vector-to-llvm -finalize-memref-to-llvm -convert-scf-to-cf -convert-linalg-to-llvm -llvm-request-c-wrappers -convert-func-to-llvm -reconcile-unrealized-casts %s \
+// RUN: | mlir-cpu-runner -O0 -e buddy_batchmatmul_i8 -entry-point-result=f32 \
+// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext,%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+memref.global "private" @A : memref<2x2x3xi8> = dense<[[[9, 4, 6],[2, 4, 0]],[[6, 3, 3],[0, 4, 7]]]>
+memref.global "private" @B : memref<2x3x4xi8> = dense<[[[1, 3, 8, 0],[1, 8, 8, 7],[6, 9, 7, 9]],[[3, 8, 6, 8],[2, 7, 0, 6],[0, 4, 0, 4]]]>
+memref.global "private" @C : memref<2x2x4xi8> = dense<[[[49, 12, 14, 82],[6, 38, 48, 28]],[[24, 81, 36, 78],[8, 56, 0, 52]]]>
+
+func.func private @printMemrefF32(memref<*xf32>) attributes { llvm.emit_c_interface }
+
+func.func @buddy_batchmatmul_i8() -> f32 {
+  %a = memref.get_global @A : memref<2x2x3xi8>
+  %b = memref.get_global @B : memref<2x3x4xi8>
+  %c = memref.get_global @C : memref<2x2x4xi8>
+
+  linalg.batch_matmul
+    ins(%a, %b : memref<2x2x3xi8>, memref<2x3x4xi8>)
+    outs(%c : memref<2x2x4xi8>)
+
+  %cst_0 = arith.constant 0 : index
+  %cst_1 = arith.constant 1 : index
+  %cst_2 = arith.constant 2 : index
+  %cst_4 = arith.constant 4 : index
+
+  %c_f32 = memref.alloca() : memref<2x2x4xf32>
+  scf.for %i = %cst_0 to %cst_2 step %cst_1 {
+    scf.for %j = %cst_0 to %cst_2 step %cst_1 {
+      scf.for %k = %cst_0 to %cst_4 step %cst_1 {
+        %val_i8 = memref.load %c[%i, %j, %k] : memref<2x2x4xi8>
+        %val_f32 = arith.sitofp %val_i8 : i8 to f32
+        memref.store %val_f32, %c_f32[%i, %j, %k] : memref<2x2x4xf32>
+      }
+    }
+  }
+
+  %printed_c = memref.cast %c_f32 : memref<2x2x4xf32> to memref<*xf32>
+  call @printMemrefF32(%printed_c) : (memref<*xf32>) -> ()
+  // CHECK: {{Unranked Memref base@ = 0x[0-9A-Fa-f]{1,} rank = 3 offset = 0 sizes = \[2, 2, 4\] strides = \[8, 4, 1\] data =}}
+  // CHECK{LITERAL}: [[[98, 125, -96, -92],
+  // CHECK{LITERAL}: [12, 76, 96, 56]],
+  // CHECK{LITERAL}: [[48, -94, 72, -100],
+  // CHECK{LITERAL}: [16, 112, 0, 104]]]
+  %zero = arith.constant 0.0 : f32
+  return %zero : f32
+}
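Note on the CHECK values: linalg.batch_matmul accumulates into its `outs` operand, so the test computes C += A * B entirely in i8, and sums above 127 wrap around. A minimal standalone C++ sanity check (illustrative only, not part of the patch; the file name is hypothetical) that reproduces the expected data:

```cpp
// check_i8_batchmatmul.cpp -- hypothetical helper, not part of this patch.
// Recomputes C += A * B with 8-bit wrap-around; contents mirror @A, @B, @C.
#include <cstdint>
#include <cstdio>

int main() {
  int8_t A[2][2][3] = {{{9, 4, 6}, {2, 4, 0}}, {{6, 3, 3}, {0, 4, 7}}};
  int8_t B[2][3][4] = {{{1, 3, 8, 0}, {1, 8, 8, 7}, {6, 9, 7, 9}},
                       {{3, 8, 6, 8}, {2, 7, 0, 6}, {0, 4, 0, 4}}};
  int8_t C[2][2][4] = {{{49, 12, 14, 82}, {6, 38, 48, 28}},
                       {{24, 81, 36, 78}, {8, 56, 0, 52}}};
  for (int b = 0; b < 2; ++b) {
    for (int i = 0; i < 2; ++i) {
      for (int j = 0; j < 4; ++j) {
        int32_t acc = C[b][i][j]; // batch_matmul accumulates into outs
        for (int k = 0; k < 3; ++k)
          acc += A[b][i][k] * B[b][k][j];
        // i8 arithmetic is modulo 256, so truncating the wide sum matches
        // what the all-i8 computation produces.
        C[b][i][j] = static_cast<int8_t>(acc);
      }
      printf("[%d, %d, %d, %d]\n", C[b][i][0], C[b][i][1], C[b][i][2],
             C[b][i][3]);
    }
  }
  return 0;
}
```

Batch 0, row 0 is [49+49, 12+113, 14+146, 82+82] = [98, 125, 160, 164] in exact arithmetic; 160 and 164 wrap to -96 and -92, matching the first CHECK{LITERAL} line.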
diff --git a/examples/MLIRLinalg/makefile b/examples/MLIRLinalg/makefile
index 6f408de3b3..c606b92112 100644
--- a/examples/MLIRLinalg/makefile
+++ b/examples/MLIRLinalg/makefile
@@ -137,8 +137,8 @@ linalg-matmul-optimize-run:
 	-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
 
 linalg-batch-matmul-optimize-run:
-	@${BUDDY_OPT} linalg-matmul.mlir ${MLIR_OPT_OPTIONS} \
-	-batchmatmul-optimize="step=64" \
+	@${BUDDY_OPT} linalg-batch-matmul-f32.mlir ${MLIR_OPT_OPTIONS} \
+	-batchmatmul-optimize="vector-size=64" \
 	-convert-linalg-to-loops \
 	-expand-strided-metadata \
 	-lower-affine \
@@ -152,34 +152,89 @@ linalg-batch-matmul-optimize-run:
 	-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
 
 linalg-batch-matmul-lower:
-	@${MLIR_OPT} linalg-batch-matmul.mlir ${MLIR_OPT_OPTIONS} \
+	@${MLIR_OPT} linalg-batch-matmul-f32.mlir ${MLIR_OPT_OPTIONS} \
 	-convert-linalg-to-loops -lower-affine -convert-scf-to-cf \
 	-convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm \
 	-convert-func-to-llvm -reconcile-unrealized-casts \
 	-o ./log.mlir
 
 linalg-batch-matmul-translate:
-	@${MLIR_OPT} linalg-batch-matmul.mlir ${MLIR_OPT_OPTIONS} \
+	@${MLIR_OPT} linalg-batch-matmul-f32.mlir ${MLIR_OPT_OPTIONS} \
 	-convert-linalg-to-loops -lower-affine -convert-scf-to-cf \
 	-convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm \
 	-convert-func-to-llvm -reconcile-unrealized-casts | \
 	${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll
 
 linalg-batch-matmul-run:
-	@${MLIR_OPT} linalg-batch-matmul.mlir ${MLIR_OPT_OPTIONS} \
+	@${MLIR_OPT} linalg-batch-matmul-f32.mlir ${MLIR_OPT_OPTIONS} \
 	-convert-linalg-to-loops -lower-affine -convert-scf-to-cf \
 	-convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm \
 	-convert-func-to-llvm -reconcile-unrealized-casts | \
 	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
 
 linalg-batch-matmul-optimize-lower:
-	@${BUDDY_OPT} linalg-batch-matmul.mlir ${MLIR_OPT_OPTIONS} \
-	-batchmatmul-optimize="step=64" \
+	@${BUDDY_OPT} linalg-batch-matmul-f32.mlir ${MLIR_OPT_OPTIONS} \
+	-batchmatmul-optimize="vector-size=64" \
 	-o ./log.mlir
 
 linalg-batch-matmul-optimize-translate:
-	@${BUDDY_OPT} linalg-batch-matmul.mlir ${MLIR_OPT_OPTIONS} \
-	-batchmatmul-optimize="step=64" \
+	@${BUDDY_OPT} linalg-batch-matmul-f32.mlir ${MLIR_OPT_OPTIONS} \
+	-batchmatmul-optimize="vector-size=64" \
+	-convert-linalg-to-loops \
+	-expand-strided-metadata \
+	-lower-affine \
+	-convert-scf-to-cf \
+	-convert-vector-to-llvm \
+	-finalize-memref-to-llvm \
+	-convert-arith-to-llvm \
+	-convert-func-to-llvm \
+	-reconcile-unrealized-casts | \
+	${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll
+
+linalg-batch-matmul-i8-optimize-run:
+	@${BUDDY_OPT} linalg-batch-matmul-i8.mlir ${MLIR_OPT_OPTIONS} \
+	-batchmatmul-optimize="vector-size=64" \
+	-convert-linalg-to-loops \
+	-expand-strided-metadata \
+	-lower-affine \
+	-convert-scf-to-cf \
+	-convert-vector-to-llvm \
+	-finalize-memref-to-llvm \
+	-convert-arith-to-llvm \
+	-convert-func-to-llvm \
+	-reconcile-unrealized-casts | \
+	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e buddy_batchmatmul_i8 -entry-point-result=f32 \
+	-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
+linalg-batch-matmul-i8-lower:
+	@${MLIR_OPT} linalg-batch-matmul-i8.mlir ${MLIR_OPT_OPTIONS} \
+	-convert-linalg-to-loops -lower-affine -convert-scf-to-cf \
+	-convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm \
+	-convert-func-to-llvm -reconcile-unrealized-casts \
+	-o ./log.mlir
+
+linalg-batch-matmul-i8-translate:
+	@${MLIR_OPT} linalg-batch-matmul-i8.mlir ${MLIR_OPT_OPTIONS} \
+	-convert-linalg-to-loops -lower-affine -convert-scf-to-cf \
+	-convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm \
+	-convert-func-to-llvm -reconcile-unrealized-casts | \
+	${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll
+
+linalg-batch-matmul-i8-run:
+	@${MLIR_OPT} linalg-batch-matmul-i8.mlir ${MLIR_OPT_OPTIONS} \
+	-convert-linalg-to-loops -lower-affine -convert-scf-to-cf \
+	-convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm \
+	-convert-func-to-llvm -reconcile-unrealized-casts | \
+	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e buddy_batchmatmul_i8 -entry-point-result=f32 -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
+linalg-batch-matmul-i8-optimize-lower:
+	@${BUDDY_OPT} linalg-batch-matmul-i8.mlir ${MLIR_OPT_OPTIONS} \
+	-batchmatmul-optimize="vector-size=64" \
+	-o ./log.mlir
+
+linalg-batch-matmul-i8-optimize-translate:
+	@${BUDDY_OPT} linalg-batch-matmul-i8.mlir ${MLIR_OPT_OPTIONS} \
+	-batchmatmul-optimize="vector-size=64" \
 	-convert-linalg-to-loops \
 	-expand-strided-metadata \
 	-lower-affine \
diff --git a/midend/lib/Conversion/MatMulOptimization/BatchMatMulOptimize.cpp b/midend/lib/Conversion/MatMulOptimization/BatchMatMulOptimize.cpp
index 9ce7acbcb2..1d907da91e 100644
--- a/midend/lib/Conversion/MatMulOptimization/BatchMatMulOptimize.cpp
+++ b/midend/lib/Conversion/MatMulOptimization/BatchMatMulOptimize.cpp
@@ -122,7 +122,7 @@ class BatchMatMulOptimizePattern : public ConversionPattern {
           rewriter.create<affine::AffinePrefetchOp>(
               loc, A,
              AffineMap::get(3, 0, {d0, d1, d2}, rewriter.getContext()),
-              ArrayRef<Value>{ivBatch, c0, c0}, false, 3, true);
+              ArrayRef<Value>{ivBatch, M, K}, false, 3, true);
       affine::buildAffineLoopNest(
           rewriter, loc, {c0}, {K}, 1,
           [&](OpBuilder &builder, Location loc, ValueRange ivRange) {
@@ -174,7 +174,7 @@ class BatchMatMulOptimizePattern : public ConversionPattern {
                           rewriter.getContext()),
                       ValueRange{ivBatch, ivA_row, ivB_col});
                   Value result_vec;
-                  if (A_elementType.isIntOrFloat() && 0) { // bug
+                  if (A_elementType.isa<IntegerType>()) {
                     Value add_vec = builder.create<arith::MulIOp>(
                         loc, a_vec, b_vec);
                     result_vec = builder.create<arith::AddIOp>(
@@ -220,7 +220,7 @@ class BatchMatMulOptimizePattern : public ConversionPattern {
                           b_col_idx_tail},
                       mask_vec, c0_dynamicType_vec);
                   Value result_vec_tail;
-                  if (A_elementType.isIntOrFloat() && 0) { // bug
+                  if (A_elementType.isa<IntegerType>()) {
                     Value add_vec = builder.create<arith::MulIOp>(
                         loc, a_vec, b_vec_tail);
                     result_vec_tail = builder.create<arith::AddIOp>(
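Context for the C++ hunks: the old guard `A_elementType.isIntOrFloat() && 0` is always false, so every element type fell through to the floating-point branch, and `vector.fma` is defined only for float vectors; that is what broke the i8 path this patch enables. The new `isa<IntegerType>()` guard routes integers to an explicit multiply-accumulate. A sketch of the dispatch as it stands after the patch; `c_vec` (the accumulator vector read from C) and the `vector::FMAOp` float branch are assumptions here, since the hunks end before them:

```cpp
// Sketch of the post-patch branch in BatchMatMulOptimizePattern; assumes
// a_vec, b_vec, and c_vec are 1-D vectors read from the A, B, and C memrefs.
Value result_vec;
if (A_elementType.isa<IntegerType>()) {
  // Integer path: vector.fma has no integer form, so emit an explicit
  // elementwise multiply followed by an accumulate.
  Value mul_vec = builder.create<arith::MulIOp>(loc, a_vec, b_vec);
  result_vec = builder.create<arith::AddIOp>(loc, mul_vec, c_vec);
} else {
  // Float path: a single fused multiply-add over the whole vector.
  result_vec = builder.create<vector::FMAOp>(loc, a_vec, b_vec, c_vec);
}
```

The same guard change is applied in the tail hunk, where the masked `b_vec_tail` read stands in for `b_vec`.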