[Mosaic TPU] Be much more aggressive in inferring large 2nd minor layouts for 16-bit types on v6 #25605

Open · wants to merge 1 commit into main
1 change: 1 addition & 0 deletions jaxlib/mosaic/dialect/tpu/tpu.td
@@ -847,6 +847,7 @@ def InferVectorLayoutPass : Pass<"tpu-infer-vector-layout", "::mlir::func::FuncOp"> {
];
let constructor = "::mlir::tpu::createInferVectorLayoutPass()";
let options = [
Option<"hardware_generation", "hardware-generation", "int", /*default=*/"-1", "">,
Option<"lane_count", "lane-count", "int", /*default=*/"128", "">,
Option<"sublane_count", "sublane-count", "int", /*default=*/"8", "">,
];
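The new option is declared the same way as the existing lane/sublane options, so with the standard option plumbing that the pass TableGen backend generates, it should be settable from a textual pipeline spec. A hypothetical fragment (the mlir-opt-style syntax is assumed, not shown in this PR; the other two options carry their declared defaults):

    func.func(tpu-infer-vector-layout{hardware-generation=6 sublane-count=8 lane-count=128})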
1 change: 1 addition & 0 deletions jaxlib/mosaic/dialect/tpu/tpu_dialect.h
@@ -79,6 +79,7 @@ std::unique_ptr<OperationPass<func::FuncOp>> createCanonicalizeMosaicPass(
int hardware_generation = -1);

std::unique_ptr<OperationPass<func::FuncOp>> createInferVectorLayoutPass(
int hardware_generation = -1,
std::array<int64_t, 2> target_shape = {8, 128});

std::unique_ptr<OperationPass<func::FuncOp>> createApplyVectorLayoutPass(
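The factory now takes the generation first, defaulting to -1; as the guard added to infer_vector_layout.cc below shows, leaving it at -1 makes the pass fail, so C++ callers must thread a real value through. A minimal sketch of a caller, assuming a pipeline-building helper that already knows the target generation (the helper name and surrounding setup are illustrative, not from this PR):

    #include "jaxlib/mosaic/dialect/tpu/tpu_dialect.h"
    #include "mlir/Dialect/Func/IR/FuncOps.h"
    #include "mlir/Pass/PassManager.h"

    // Illustrative helper: schedule layout inference for a known TPU generation.
    void addInferLayoutPass(mlir::PassManager &pm, int hardware_generation) {
      pm.addNestedPass<mlir::func::FuncOp>(mlir::tpu::createInferVectorLayoutPass(
          hardware_generation, /*target_shape=*/{8, 128}));
    }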
5 changes: 4 additions & 1 deletion jaxlib/mosaic/dialect/tpu/transforms/infer_memref_layout.cc
@@ -61,7 +61,10 @@ int getTilingFactor(const int num_lanes, const int hardware_generation,
if (bitwidth == 8 && tpu_tiling_flags.use_x8_large_second_minor) {
return sublane_count * 4;
}
if (bitwidth == 16 && tpu_tiling_flags.use_x16_large_second_minor) {
// 16-bit values can generally be relaid out on the fly on v6,
// so we allow large 2nd minor tiling whenever possible.
if (bitwidth == 16 && (tpu_tiling_flags.use_x16_large_second_minor ||
hardware_generation >= 6)) {
return sublane_count * 2;
}
return sublane_count;
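Concretely, with the default sublane count of 8, a 16-bit buffer on v6 now gets a tiling factor of 16 (sublane_count * 2) even when the use_x16_large_second_minor flag is off, i.e. a (16, 128) leading tile instead of (8, 128). A standalone restatement of just this branch of the decision (TpuTilingFlags is mocked with only the field this hunk touches):

    // Sketch of the 16-bit branch of getTilingFactor after this change.
    struct TpuTilingFlagsSketch {
      bool use_x16_large_second_minor = false;
    };

    int tilingFactorFor16Bit(int sublane_count, int hardware_generation,
                             const TpuTilingFlagsSketch &flags) {
      // On v6 the flag no longer gates the large 2nd minor tiling.
      if (flags.use_x16_large_second_minor || hardware_generation >= 6) {
        return sublane_count * 2;  // e.g. 8 sublanes -> (16, 128) tiles
      }
      return sublane_count;        // e.g. (8, 128) tiles
    }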
33 changes: 24 additions & 9 deletions jaxlib/mosaic/dialect/tpu/transforms/infer_vector_layout.cc
@@ -108,8 +108,10 @@ LogicalResult verifyDivisibleIndex(Value tiled_index, int64_t tiling, int dim,
// have corresponding native instructions.
class VectorLayoutInferer {
public:
explicit VectorLayoutInferer(std::array<int64_t, 2> target_shape)
: target_shape_({target_shape[0], target_shape[1]}),
explicit VectorLayoutInferer(int hardware_generation,
std::array<int64_t, 2> target_shape)
: hardware_generation_(hardware_generation),
target_shape_({target_shape[0], target_shape[1]}),
default_tiling_(target_shape) {}

#define TPU_CHECK_OP(cond, msg) \
@@ -1709,7 +1711,12 @@
"Only 32-bit truncation supported");
}
auto &layout = *some_layout;
bool select_native = allUsersRequireNativeTiling(op->getResult(0));
// TPUv6 has good support for compute in 16-bit and cheap retiling between
// large 2nd minor and the default tiling, so we bias towards large tiles.
bool select_native =
(hardware_generation_ >= 6 && dst_ty.getElementTypeBitWidth() == 16)
? true
: allUsersRequireNativeTiling(op->getResult(0));
auto src_layout = VectorLayout(32, layout.offsets(), default_tiling_,
layout.implicit_dim());
auto dst_layout = VectorLayout(
@@ -2064,29 +2071,36 @@
default_tiling_[1]};
}

int hardware_generation_;
std::array<int64_t, 2> target_shape_;
std::array<int64_t, 2> default_tiling_;

// TODO(b/342235360): Deprecate force_first_tile_offsets_ once we fully
// remove the restriction that offsets must fall within the first tile.
bool force_first_tile_offsets_ = false;

// Address alignment requirement, counted in 32-bit increments.
static constexpr int64_t kVmemAlignment32 = 128;
// TODO(apaszke): This is not really native on newer generations of TPUs.
// Get rid of this temporary stopgap.
static constexpr int8_t kNativeBitwidth = 32;
};

struct InferVectorLayoutPass
: public impl::InferVectorLayoutPassBase<InferVectorLayoutPass> {
InferVectorLayoutPass(std::array<int64_t, 2> target_shape) {
InferVectorLayoutPass(int hardware_generation,
std::array<int64_t, 2> target_shape) {
this->sublane_count = target_shape[0];
this->lane_count = target_shape[1];
this->hardware_generation = hardware_generation;
}
void runOnOperation() override {
// Fail if hardware_generation has not been set from the default value.
if (hardware_generation < 0) {
getOperation().emitOpError("hardware_generation must be set");
signalPassFailure();
return;
}
func::FuncOp func = getOperation();
VectorLayoutInferer run({sublane_count, lane_count});
VectorLayoutInferer run(hardware_generation, {sublane_count, lane_count});
if (run.infer(func).failed()) {
signalPassFailure();
}
@@ -2096,8 +2110,9 @@ struct InferVectorLayoutPass
} // namespace

std::unique_ptr<OperationPass<func::FuncOp>> createInferVectorLayoutPass(
std::array<int64_t, 2> target_shape) {
return std::make_unique<InferVectorLayoutPass>(target_shape);
int hardware_generation, std::array<int64_t, 2> target_shape) {
return std::make_unique<InferVectorLayoutPass>(hardware_generation,
target_shape);
}

} // namespace mlir::tpu
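Two things are worth pulling out of this file's hunks. First, the truncation hunk replaces the purely use-driven tiling choice with a v6 bias: 16-bit destinations on v6 always get native tiling, and everything else still defers to allUsersRequireNativeTiling. A minimal restatement of that decision (standalone sketch; the boolean parameter stands in for the member-function call of the same name, which is assumed here, not defined):

    // Sketch of the select_native choice from the truncation hunk.
    bool selectNativeTiling(int hardware_generation, int dst_bitwidth,
                            bool all_users_require_native_tiling) {
      // v6 handles 16-bit compute well and retiles cheaply between large
      // 2nd minor and default tiling, so bias towards native (large) tiles.
      if (hardware_generation >= 6 && dst_bitwidth == 16) {
        return true;
      }
      return all_users_require_native_tiling;
    }

Second, the new runOnOperation guard makes the dependency explicit: since the default of -1 would silently disable every hardware_generation >= 6 check, the pass now fails fast unless the caller sets a real generation.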