diff --git a/ci/docker/python-wheel-manylinux.dockerfile b/ci/docker/python-wheel-manylinux.dockerfile index c6fa3cc0dce97..0b5645285b6e1 100644 --- a/ci/docker/python-wheel-manylinux.dockerfile +++ b/ci/docker/python-wheel-manylinux.dockerfile @@ -107,6 +107,7 @@ RUN --mount=type=secret,id=github_repository_owner \ --x-feature=flight \ --x-feature=gcs \ --x-feature=json \ + --x-feature=orc \ --x-feature=parquet \ --x-feature=s3 && \ rm -rf ~/.config/NuGet/ diff --git a/ci/docker/ubuntu-24.04-verify-rc.dockerfile b/ci/docker/ubuntu-24.04-verify-rc.dockerfile new file mode 100644 index 0000000000000..42d71afcb0999 --- /dev/null +++ b/ci/docker/ubuntu-24.04-verify-rc.dockerfile @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +ARG arch=amd64 +FROM ${arch}/ubuntu:24.04 + +ENV DEBIAN_FRONTEND=noninteractive +COPY dev/release/setup-ubuntu.sh / +RUN /setup-ubuntu.sh && \ + rm /setup-ubuntu.sh && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh index 91925e7abe8b0..1eaecd6bea07d 100755 --- a/ci/scripts/python_wheel_macos_build.sh +++ b/ci/scripts/python_wheel_macos_build.sh @@ -144,7 +144,6 @@ cmake \ -DCMAKE_INSTALL_PREFIX=${build_dir}/install \ -DCMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES} \ -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD} \ - -DORC_SOURCE=BUNDLED \ -DPARQUET_REQUIRE_ENCRYPTION=${PARQUET_REQUIRE_ENCRYPTION} \ -DVCPKG_MANIFEST_MODE=OFF \ -DVCPKG_TARGET_TRIPLET=${VCPKG_TARGET_TRIPLET} \ diff --git a/ci/scripts/python_wheel_manylinux_build.sh b/ci/scripts/python_wheel_manylinux_build.sh index 6365fcfacfc38..b9f4406a2d452 100755 --- a/ci/scripts/python_wheel_manylinux_build.sh +++ b/ci/scripts/python_wheel_manylinux_build.sh @@ -125,7 +125,6 @@ cmake \ -DCMAKE_INSTALL_LIBDIR=lib \ -DCMAKE_INSTALL_PREFIX=/tmp/arrow-dist \ -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD} \ - -DORC_SOURCE=BUNDLED \ -DPARQUET_REQUIRE_ENCRYPTION=${PARQUET_REQUIRE_ENCRYPTION} \ -DVCPKG_MANIFEST_MODE=OFF \ -DVCPKG_TARGET_TRIPLET=${VCPKG_TARGET_TRIPLET} \ diff --git a/cpp/examples/arrow/parquet_read_write.cc b/cpp/examples/arrow/parquet_read_write.cc index a07c10fda5af8..7a2fe6f070a56 100644 --- a/cpp/examples/arrow/parquet_read_write.cc +++ b/cpp/examples/arrow/parquet_read_write.cc @@ -26,7 +26,7 @@ arrow::Status ReadFullFile(std::string path_to_file) { // #include "arrow/io/api.h" - // #include "arrow/parquet/arrow/reader.h" + // #include "parquet/arrow/reader.h" arrow::MemoryPool* pool = arrow::default_memory_pool(); std::shared_ptr<arrow::io::ReadableFile> input; @@ -44,7 +44,7 @@ arrow::Status ReadFullFile(std::string path_to_file) { arrow::Status ReadInBatches(std::string path_to_file) { // #include "arrow/io/api.h" - // #include "arrow/parquet/arrow/reader.h" + // #include "parquet/arrow/reader.h"
"parquet/arrow/reader.h" arrow::MemoryPool* pool = arrow::default_memory_pool(); diff --git a/cpp/src/arrow/acero/ArrowAceroConfig.cmake.in b/cpp/src/arrow/acero/ArrowAceroConfig.cmake.in index 124cbcbf3d42e..66aa2b4078c7f 100644 --- a/cpp/src/arrow/acero/ArrowAceroConfig.cmake.in +++ b/cpp/src/arrow/acero/ArrowAceroConfig.cmake.in @@ -28,7 +28,6 @@ include(CMakeFindDependencyMacro) find_dependency(Arrow) -find_dependency(Parquet) include("${CMAKE_CURRENT_LIST_DIR}/ArrowAceroTargets.cmake") diff --git a/cpp/src/arrow/acero/hash_join_benchmark.cc b/cpp/src/arrow/acero/hash_join_benchmark.cc index e3e37e249e6a3..0a56194f2a3c8 100644 --- a/cpp/src/arrow/acero/hash_join_benchmark.cc +++ b/cpp/src/arrow/acero/hash_join_benchmark.cc @@ -20,6 +20,7 @@ #include "arrow/acero/hash_join.h" #include "arrow/acero/hash_join_node.h" #include "arrow/acero/options.h" +#include "arrow/acero/swiss_join_internal.h" #include "arrow/acero/test_util_internal.h" #include "arrow/acero/util.h" #include "arrow/api.h" @@ -365,6 +366,21 @@ static void BM_HashJoinBasic_ComplexResidualFilter(benchmark::State& st, HashJoinBasicBenchmarkImpl(st, settings); } + +static void BM_HashJoinBasic_HeavyBuildPayload(benchmark::State& st) { + BenchmarkSettings settings; + settings.build_payload_types = {boolean(), fixed_size_binary(64), utf8(), + boolean(), fixed_size_binary(64), utf8()}; + settings.probe_payload_types = {int32()}; + settings.null_percentage = 0.5; + settings.cardinality = 1.0 / 16.0; + settings.num_build_batches = static_cast(st.range(0)); + settings.num_probe_batches = settings.num_build_batches; + settings.var_length_min = 64; + settings.var_length_max = 128; + + HashJoinBasicBenchmarkImpl(st, settings); +} #endif std::vector hashtable_krows = benchmark::CreateRange(1, 4096, 8); @@ -622,6 +638,10 @@ BENCHMARK_CAPTURE(BM_HashJoinBasic_ComplexResidualFilter, "Full Outer", JoinType::FULL_OUTER) ->ArgNames(complex_residual_filter_argnames) ->ArgsProduct(complex_residual_filter_args); + +BENCHMARK(BM_HashJoinBasic_HeavyBuildPayload) + ->ArgNames({"HashTable krows"}) + ->ArgsProduct({benchmark::CreateRange(1, 512, 8)}); #else BENCHMARK_CAPTURE(BM_HashJoinBasic_KeyTypes, "{int32}", {int32()}) @@ -640,5 +660,106 @@ BENCHMARK(BM_HashJoinBasic_ProbeParallelism) #endif // ARROW_BUILD_DETAILED_BENCHMARKS +void RowArrayDecodeBenchmark(benchmark::State& st, const std::shared_ptr& schema, + int column_to_decode) { + auto batches = MakeRandomBatches(schema, 1, std::numeric_limits::max()); + const auto& batch = batches.batches[0]; + RowArray rows; + std::vector row_ids_encode(batch.length); + std::iota(row_ids_encode.begin(), row_ids_encode.end(), 0); + std::vector temp_column_arrays; + DCHECK_OK(rows.AppendBatchSelection( + default_memory_pool(), internal::CpuInfo::GetInstance()->hardware_flags(), batch, 0, + static_cast(batch.length), static_cast(batch.length), + row_ids_encode.data(), temp_column_arrays)); + std::vector row_ids_decode(batch.length); + // Create a random access pattern to simulate hash join. + std::default_random_engine gen(42); + std::uniform_int_distribution dist(0, + static_cast(batch.length - 1)); + std::transform(row_ids_decode.begin(), row_ids_decode.end(), row_ids_decode.begin(), + [&](uint32_t) { return dist(gen); }); + + for (auto _ : st) { + ResizableArrayData column; + // Allocate at least 8 rows for the convenience of SIMD decoding. 
+ int log_num_rows_min = std::max(3, bit_util::Log2(batch.length)); + DCHECK_OK(column.Init(batch[column_to_decode].type(), default_memory_pool(), + log_num_rows_min)); + DCHECK_OK(rows.DecodeSelected(&column, column_to_decode, + static_cast(batch.length), row_ids_decode.data(), + default_memory_pool())); + } + st.SetItemsProcessed(st.iterations() * batch.length); +} + +static void BM_RowArray_Decode(benchmark::State& st, + const std::shared_ptr& type) { + SchemaBuilder schema_builder; + DCHECK_OK(schema_builder.AddField(field("", type))); + auto schema = *schema_builder.Finish(); + RowArrayDecodeBenchmark(st, schema, 0); +} + +BENCHMARK_CAPTURE(BM_RowArray_Decode, "boolean", boolean()); +BENCHMARK_CAPTURE(BM_RowArray_Decode, "int8", int8()); +BENCHMARK_CAPTURE(BM_RowArray_Decode, "int16", int16()); +BENCHMARK_CAPTURE(BM_RowArray_Decode, "int32", int32()); +BENCHMARK_CAPTURE(BM_RowArray_Decode, "int64", int64()); + +static void BM_RowArray_DecodeFixedSizeBinary(benchmark::State& st) { + int fixed_size = static_cast(st.range(0)); + SchemaBuilder schema_builder; + DCHECK_OK(schema_builder.AddField(field("", fixed_size_binary(fixed_size)))); + auto schema = *schema_builder.Finish(); + RowArrayDecodeBenchmark(st, schema, 0); +} + +BENCHMARK(BM_RowArray_DecodeFixedSizeBinary) + ->ArgNames({"fixed_size"}) + ->ArgsProduct({{3, 5, 6, 7, 9, 16, 42}}); + +static void BM_RowArray_DecodeBinary(benchmark::State& st) { + int max_length = static_cast(st.range(0)); + std::unordered_map metadata; + metadata["max_length"] = std::to_string(max_length); + SchemaBuilder schema_builder; + DCHECK_OK(schema_builder.AddField(field("", utf8(), key_value_metadata(metadata)))); + auto schema = *schema_builder.Finish(); + RowArrayDecodeBenchmark(st, schema, 0); +} + +BENCHMARK(BM_RowArray_DecodeBinary) + ->ArgNames({"max_length"}) + ->ArgsProduct({{32, 64, 128}}); + +static void BM_RowArray_DecodeOneOfColumns(benchmark::State& st, + std::vector> types) { + SchemaBuilder schema_builder; + for (const auto& type : types) { + DCHECK_OK(schema_builder.AddField(field("", type))); + } + auto schema = *schema_builder.Finish(); + int column_to_decode = static_cast(st.range(0)); + RowArrayDecodeBenchmark(st, schema, column_to_decode); +} + +const std::vector> fixed_length_row_column_types{ + boolean(), int32(), fixed_size_binary(64)}; +BENCHMARK_CAPTURE(BM_RowArray_DecodeOneOfColumns, + "fixed_length_row:{boolean,int32,fixed_size_binary(64)}", + fixed_length_row_column_types) + ->ArgNames({"column"}) + ->ArgsProduct( + {benchmark::CreateDenseRange(0, fixed_length_row_column_types.size() - 1, 1)}); + +const std::vector> var_length_row_column_types{ + boolean(), int32(), utf8(), utf8()}; +BENCHMARK_CAPTURE(BM_RowArray_DecodeOneOfColumns, + "var_length_row:{boolean,int32,utf8,utf8}", var_length_row_column_types) + ->ArgNames({"column"}) + ->ArgsProduct({benchmark::CreateDenseRange(0, var_length_row_column_types.size() - 1, + 1)}); + } // namespace acero } // namespace arrow diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index 6c783110af571..53092c898eac9 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -57,150 +57,12 @@ int RowArrayAccessor::VarbinaryColumnId(const RowTableMetadata& row_metadata, return varbinary_column_id; } -int RowArrayAccessor::NumRowsToSkip(const RowTableImpl& rows, int column_id, int num_rows, - const uint32_t* row_ids, int num_tail_bytes_to_skip) { - uint32_t num_bytes_skipped = 0; - int num_rows_left = num_rows; - - bool 
- bool is_fixed_length_column = - rows.metadata().column_metadatas[column_id].is_fixed_length; - - if (!is_fixed_length_column) { - // Varying length column - // - int varbinary_column_id = VarbinaryColumnId(rows.metadata(), column_id); - - while (num_rows_left > 0 && - num_bytes_skipped < static_cast<uint32_t>(num_tail_bytes_to_skip)) { - // Find the pointer to the last requested row - // - uint32_t last_row_id = row_ids[num_rows_left - 1]; - const uint8_t* row_ptr = rows.data(2) + rows.offsets()[last_row_id]; - - // Find the length of the requested varying length field in that row - // - uint32_t field_offset_within_row, field_length; - if (varbinary_column_id == 0) { - rows.metadata().first_varbinary_offset_and_length( - row_ptr, &field_offset_within_row, &field_length); - } else { - rows.metadata().nth_varbinary_offset_and_length( - row_ptr, varbinary_column_id, &field_offset_within_row, &field_length); - } - - num_bytes_skipped += field_length; - --num_rows_left; - } - } else { - // Fixed length column - // - uint32_t field_length = rows.metadata().column_metadatas[column_id].fixed_length; - uint32_t num_bytes_skipped = 0; - while (num_rows_left > 0 && - num_bytes_skipped < static_cast<uint32_t>(num_tail_bytes_to_skip)) { - num_bytes_skipped += field_length; - --num_rows_left; - } - } - - return num_rows - num_rows_left; -} - -template <class PROCESS_VALUE_FN> -void RowArrayAccessor::Visit(const RowTableImpl& rows, int column_id, int num_rows, - const uint32_t* row_ids, PROCESS_VALUE_FN process_value_fn) { - bool is_fixed_length_column = - rows.metadata().column_metadatas[column_id].is_fixed_length; - - // There are 4 cases, each requiring different steps: - // 1. Varying length column that is the first varying length column in a row - // 2. Varying length column that is not the first varying length column in a - // row - // 3. Fixed length column in a fixed length row - // 4. Fixed length column in a varying length row - - if (!is_fixed_length_column) { - int varbinary_column_id = VarbinaryColumnId(rows.metadata(), column_id); - const uint8_t* row_ptr_base = rows.data(2); - const RowTableImpl::offset_type* row_offsets = rows.offsets(); - uint32_t field_offset_within_row, field_length; - - if (varbinary_column_id == 0) { - // Case 1: This is the first varbinary column - // - for (int i = 0; i < num_rows; ++i) { - uint32_t row_id = row_ids[i]; - const uint8_t* row_ptr = row_ptr_base + row_offsets[row_id]; - rows.metadata().first_varbinary_offset_and_length( - row_ptr, &field_offset_within_row, &field_length); - process_value_fn(i, row_ptr + field_offset_within_row, field_length); - } - } else { - // Case 2: This is second or later varbinary column - // - for (int i = 0; i < num_rows; ++i) { - uint32_t row_id = row_ids[i]; - const uint8_t* row_ptr = row_ptr_base + row_offsets[row_id]; - rows.metadata().nth_varbinary_offset_and_length( - row_ptr, varbinary_column_id, &field_offset_within_row, &field_length); - process_value_fn(i, row_ptr + field_offset_within_row, field_length); - } - } - } - - if (is_fixed_length_column) { - uint32_t field_offset_within_row = rows.metadata().encoded_field_offset( - rows.metadata().pos_after_encoding(column_id)); - uint32_t field_length = rows.metadata().column_metadatas[column_id].fixed_length; - // Bit column is encoded as a single byte - // - if (field_length == 0) { - field_length = 1; - } - uint32_t row_length = rows.metadata().fixed_length; - - bool is_fixed_length_row = rows.metadata().is_fixed_length; - if (is_fixed_length_row) { - // Case 3: This is a fixed length column in a fixed length row - // - const uint8_t* row_ptr_base = rows.data(1) + field_offset_within_row; - for (int i = 0; i < num_rows; ++i) { - uint32_t row_id = row_ids[i]; - const uint8_t* row_ptr = row_ptr_base + row_length * row_id; - process_value_fn(i, row_ptr, field_length); - } - } else { - // Case 4: This is a fixed length column in a varying length row - // - const uint8_t* row_ptr_base = rows.data(2) + field_offset_within_row; - const RowTableImpl::offset_type* row_offsets = rows.offsets(); - for (int i = 0; i < num_rows; ++i) { - uint32_t row_id = row_ids[i]; - const uint8_t* row_ptr = row_ptr_base + row_offsets[row_id]; - process_value_fn(i, row_ptr, field_length); - } - } - } -} - -template <class PROCESS_VALUE_FN> -void RowArrayAccessor::VisitNulls(const RowTableImpl& rows, int column_id, int num_rows, - const uint32_t* row_ids, - PROCESS_VALUE_FN process_value_fn) { - const uint8_t* null_masks = rows.null_masks(); - uint32_t null_mask_num_bytes = rows.metadata().null_masks_bytes_per_row; - uint32_t pos_after_encoding = rows.metadata().pos_after_encoding(column_id); - for (int i = 0; i < num_rows; ++i) { - uint32_t row_id = row_ids[i]; - int64_t bit_id = row_id * null_mask_num_bytes * 8 + pos_after_encoding; - process_value_fn(i, bit_util::GetBit(null_masks, bit_id) ? 0xff : 0); - } -} -
-Status RowArray::InitIfNeeded(MemoryPool* pool, const RowTableMetadata& row_metadata) { +Status RowArray::InitIfNeeded(MemoryPool* pool, int64_t hardware_flags, + const RowTableMetadata& row_metadata) { if (is_initialized_) { return Status::OK(); } + hardware_flags_ = hardware_flags; encoder_.Init(row_metadata.column_metadatas, sizeof(uint64_t), sizeof(uint64_t)); RETURN_NOT_OK(rows_temp_.Init(pool, row_metadata)); RETURN_NOT_OK(rows_.Init(pool, row_metadata)); @@ -208,7 +70,8 @@ Status RowArray::InitIfNeeded(MemoryPool* pool, const RowTableMetadata& row_meta return Status::OK(); } -Status RowArray::InitIfNeeded(MemoryPool* pool, const ExecBatch& batch) { +Status RowArray::InitIfNeeded(MemoryPool* pool, int64_t hardware_flags, + const ExecBatch& batch) { if (is_initialized_) { return Status::OK(); } @@ -218,14 +81,15 @@ Status RowArray::InitIfNeeded(MemoryPool* pool, const ExecBatch& batch) { row_metadata.FromColumnMetadataVector(column_metadatas, sizeof(uint64_t), sizeof(uint64_t)); - return InitIfNeeded(pool, row_metadata); + return InitIfNeeded(pool, hardware_flags, row_metadata); } -Status RowArray::AppendBatchSelection(MemoryPool* pool, const ExecBatch& batch, - int begin_row_id, int end_row_id, int num_row_ids, +Status RowArray::AppendBatchSelection(MemoryPool* pool, int64_t hardware_flags, + const ExecBatch& batch, int begin_row_id, + int end_row_id, int num_row_ids, const uint16_t* row_ids, std::vector<KeyColumnArray>& temp_column_arrays) { - RETURN_NOT_OK(InitIfNeeded(pool, batch)); + RETURN_NOT_OK(InitIfNeeded(pool, hardware_flags, batch)); RETURN_NOT_OK(ColumnArraysFromExecBatch(batch, begin_row_id, end_row_id - begin_row_id, &temp_column_arrays)); encoder_.PrepareEncodeSelected( @@ -238,7 +102,7 @@ Status RowArray::AppendBatchSelection(MemoryPool* pool, const ExecBatch& batch, void RowArray::Compare(const ExecBatch& batch, int begin_row_id, int end_row_id, int num_selected, const uint16_t* batch_selection_maybe_null, const uint32_t* array_row_ids, uint32_t* out_num_not_equal, - uint16_t* out_not_equal_selection, int64_t hardware_flags, + uint16_t* out_not_equal_selection, arrow::util::TempVectorStack* temp_stack, std::vector<KeyColumnArray>& temp_column_arrays, uint8_t* out_match_bitvector_maybe_null) { @@ -247,7 +111,7 @@ void RowArray::Compare(const ExecBatch& batch, int begin_row_id, int end_row_id, ARROW_DCHECK(status.ok()); LightContext ctx; - ctx.hardware_flags = hardware_flags; + ctx.hardware_flags = hardware_flags_; ctx.stack = temp_stack; KeyCompare::CompareColumnsToRows( num_selected, batch_selection_maybe_null, array_row_ids, &ctx, out_num_not_equal, @@ -259,6 +123,25 @@ Status RowArray::DecodeSelected(ResizableArrayData* output, int column_id, int num_rows_to_append, const uint32_t* row_ids, MemoryPool* pool) const { int num_rows_before = output->num_rows(); +#ifdef ARROW_HAVE_RUNTIME_AVX2 + // Preprocess some rows if necessary to ensure that the AVX2 version sees an + // 8-row-aligned output address. + if ((hardware_flags_ & arrow::internal::CpuInfo::AVX2) && (num_rows_before % 8 != 0) && + (num_rows_to_append >= 8)) { + int num_rows_to_preprocess = 8 - num_rows_before % 8; + // The output must have enough rows allocated to store these few preprocessed + // rows without costly resizing of the internal buffers. + DCHECK_GE(output->num_rows_allocated(), num_rows_before + num_rows_to_preprocess); + RETURN_NOT_OK( + DecodeSelected(output, column_id, num_rows_to_preprocess, row_ids, pool)); + return DecodeSelected(output, column_id, num_rows_to_append - num_rows_to_preprocess, + row_ids + num_rows_to_preprocess, pool); + } + + bool use_avx2 = + (hardware_flags_ & arrow::internal::CpuInfo::AVX2) && (num_rows_before % 8 == 0); +#endif + RETURN_NOT_OK(output->ResizeFixedLengthBuffers(num_rows_before + num_rows_to_append)); // Both input (KeyRowArray) and output (ResizableArrayData) have buffers with // ARROW_ASSIGN_OR_RAISE(KeyColumnMetadata column_metadata, output->column_metadata()); + int num_rows_processed = 0; if (column_metadata.is_fixed_length) { uint32_t fixed_length = column_metadata.fixed_length; - switch (fixed_length) { - case 0: - RowArrayAccessor::Visit(rows_, column_id, num_rows_to_append, row_ids, - [&](int i, const uint8_t* ptr, uint32_t num_bytes) { - bit_util::SetBitTo(output->mutable_data(1), - num_rows_before + i, *ptr != 0); - }); - break; - case 1: - RowArrayAccessor::Visit(rows_, column_id, num_rows_to_append, row_ids, - [&](int i, const uint8_t* ptr, uint32_t num_bytes) { - output->mutable_data(1)[num_rows_before + i] = *ptr; - }); - break; - case 2: - RowArrayAccessor::Visit( - rows_, column_id, num_rows_to_append, row_ids, - [&](int i, const uint8_t* ptr, uint32_t num_bytes) { - reinterpret_cast<uint16_t*>(output->mutable_data(1))[num_rows_before + i] = - *reinterpret_cast<const uint16_t*>(ptr); - }); - break; - case 4: - RowArrayAccessor::Visit( - rows_, column_id, num_rows_to_append, row_ids, - [&](int i, const uint8_t* ptr, uint32_t num_bytes) { - reinterpret_cast<uint32_t*>(output->mutable_data(1))[num_rows_before + i] = - *reinterpret_cast<const uint32_t*>(ptr); - }); - break; - case 8: - RowArrayAccessor::Visit( - rows_, column_id, num_rows_to_append, row_ids, - [&](int i, const uint8_t* ptr, uint32_t num_bytes) { - reinterpret_cast<uint64_t*>(output->mutable_data(1))[num_rows_before + i] = - *reinterpret_cast<const uint64_t*>(ptr); - }); - break; - default: - RowArrayAccessor::Visit( - rows_, column_id, num_rows_to_append, row_ids, - [&](int i, const uint8_t* ptr, uint32_t num_bytes) { - uint64_t* dst = reinterpret_cast<uint64_t*>( - output->mutable_data(1) + num_bytes * (num_rows_before + i)); - const uint64_t* src = reinterpret_cast<const uint64_t*>(ptr); - for (uint32_t word_id = 0; - word_id < bit_util::CeilDiv(num_bytes, sizeof(uint64_t)); ++word_id) { - arrow::util::SafeStore(dst + word_id, - arrow::util::SafeLoad(src + word_id)); - } - }); - break; + + // Process fixed length columns + // +#ifdef ARROW_HAVE_RUNTIME_AVX2 + if (use_avx2) { + num_rows_processed = DecodeFixedLength_avx2( + output, num_rows_before, column_id, fixed_length, num_rows_to_append, row_ids); } +#endif + DecodeFixedLength(output, num_rows_before + num_rows_processed, column_id, + fixed_length, num_rows_to_append - num_rows_processed, + row_ids + num_rows_processed); } else { - uint32_t* offsets = - reinterpret_cast<uint32_t*>(output->mutable_data(1)) + num_rows_before; - uint32_t sum = num_rows_before == 0 ? 0 : offsets[0];
- RowArrayAccessor::Visit( - rows_, column_id, num_rows_to_append, row_ids, - [&](int i, const uint8_t* ptr, uint32_t num_bytes) { offsets[i] = num_bytes; }); - for (int i = 0; i < num_rows_to_append; ++i) { - uint32_t length = offsets[i]; - offsets[i] = sum; - sum += length; - } - offsets[num_rows_to_append] = sum; + // Process offsets for varying length columns + // +#ifdef ARROW_HAVE_RUNTIME_AVX2 + if (use_avx2) { + num_rows_processed = DecodeOffsets_avx2(output, num_rows_before, column_id, + num_rows_to_append, row_ids); + } +#endif + DecodeOffsets(output, num_rows_before + num_rows_processed, column_id, + num_rows_to_append - num_rows_processed, row_ids + num_rows_processed); RETURN_NOT_OK(output->ResizeVaryingLengthBuffer()); - RowArrayAccessor::Visit( - rows_, column_id, num_rows_to_append, row_ids, - [&](int i, const uint8_t* ptr, uint32_t num_bytes) { - uint64_t* dst = reinterpret_cast<uint64_t*>( - output->mutable_data(2) + - reinterpret_cast<const uint32_t*>( - output->mutable_data(1))[num_rows_before + i]); - const uint64_t* src = reinterpret_cast<const uint64_t*>(ptr); - for (uint32_t word_id = 0; - word_id < bit_util::CeilDiv(num_bytes, sizeof(uint64_t)); ++word_id) { - arrow::util::SafeStore(dst + word_id, - arrow::util::SafeLoad(src + word_id)); - } - }); + + // Process data for varying length columns + // +#ifdef ARROW_HAVE_RUNTIME_AVX2 + if (use_avx2) { + num_rows_processed = DecodeVarLength_avx2(output, num_rows_before, column_id, + num_rows_to_append, row_ids); + } +#endif + DecodeVarLength(output, num_rows_before + num_rows_processed, column_id, + num_rows_to_append - num_rows_processed, + row_ids + num_rows_processed); } // Process nulls // - RowArrayAccessor::VisitNulls( - rows_, column_id, num_rows_to_append, row_ids, [&](int i, uint8_t value) { - bit_util::SetBitTo(output->mutable_data(0), num_rows_before + i, value == 0); - }); +#ifdef ARROW_HAVE_RUNTIME_AVX2 + if (use_avx2) { + num_rows_processed = + DecodeNulls_avx2(output, num_rows_before, column_id, num_rows_to_append, row_ids); + } +#endif + DecodeNulls(output, num_rows_before + num_rows_processed, column_id, + num_rows_to_append - num_rows_processed, row_ids + num_rows_processed); return Status::OK(); } @@ -437,16 +281,125 @@ void RowArray::DebugPrintToFile(const char* filename, bool print_sorted) const { } } +void RowArray::DecodeFixedLength(ResizableArrayData* output, int output_start_row, + int column_id, uint32_t fixed_length, + int num_rows_to_append, const uint32_t* row_ids) const { + switch (fixed_length) { + case 0: + RowArrayAccessor::Visit(rows_, column_id, num_rows_to_append, row_ids, + [&](int i, const uint8_t* ptr, uint32_t num_bytes) { + bit_util::SetBitTo(output->mutable_data(1), + output_start_row + i, *ptr != 0); + }); + break; + case 1: + RowArrayAccessor::Visit(rows_, column_id, num_rows_to_append, row_ids, + [&](int i, const uint8_t* ptr, uint32_t num_bytes) { + output->mutable_data(1)[output_start_row + i] = *ptr; + }); + break; + case 2: + RowArrayAccessor::Visit( + rows_, column_id, num_rows_to_append, row_ids, + [&](int i, const uint8_t* ptr, uint32_t num_bytes) { + output->mutable_data_as<uint16_t>(1)[output_start_row + i] = + *reinterpret_cast<const uint16_t*>(ptr); + }); + break; + case 4: + RowArrayAccessor::Visit( + rows_, column_id, num_rows_to_append, row_ids, + [&](int i, const uint8_t* ptr, uint32_t num_bytes) { + output->mutable_data_as<uint32_t>(1)[output_start_row + i] = + *reinterpret_cast<const uint32_t*>(ptr); + }); + break; + case 8: + RowArrayAccessor::Visit( + rows_, column_id, num_rows_to_append, row_ids, + [&](int i, const uint8_t* ptr, uint32_t num_bytes) { + output->mutable_data_as<uint64_t>(1)[output_start_row + i] = + *reinterpret_cast<const uint64_t*>(ptr); + }); + break; + default: + RowArrayAccessor::Visit( + rows_, column_id, num_rows_to_append, row_ids, + [&](int i, const uint8_t* ptr, uint32_t num_bytes) { + uint64_t* dst = reinterpret_cast<uint64_t*>( + output->mutable_data(1) + num_bytes * (output_start_row + i)); + const uint64_t* src = reinterpret_cast<const uint64_t*>(ptr); + // Note that both `output` and `ptr` have been allocated with enough padding + // to accommodate the memory overshoot. See the allocations for + // `ResizableArrayData` in `JoinResultMaterialize` and `JoinResidualFilter` + // for `output`, and `RowTableImpl::kPaddingForVectors` for `ptr`. + for (uint32_t word_id = 0; + word_id < bit_util::CeilDiv(num_bytes, sizeof(uint64_t)); ++word_id) { + arrow::util::SafeStore(dst + word_id, + arrow::util::SafeLoad(src + word_id)); + } + }); + break; + } +} + +void RowArray::DecodeOffsets(ResizableArrayData* output, int output_start_row, + int column_id, int num_rows_to_append, + const uint32_t* row_ids) const { + uint32_t* offsets = output->mutable_data_as<uint32_t>(1) + output_start_row; + uint32_t sum = (output_start_row == 0) ? 0 : offsets[0]; + RowArrayAccessor::Visit( + rows_, column_id, num_rows_to_append, row_ids, + [&](int i, const uint8_t* ptr, uint32_t num_bytes) { offsets[i] = num_bytes; }); + for (int i = 0; i < num_rows_to_append; ++i) { + uint32_t length = offsets[i]; + offsets[i] = sum; + sum += length; + } + offsets[num_rows_to_append] = sum; +} + +void RowArray::DecodeVarLength(ResizableArrayData* output, int output_start_row, + int column_id, int num_rows_to_append, + const uint32_t* row_ids) const { + RowArrayAccessor::Visit( + rows_, column_id, num_rows_to_append, row_ids, + [&](int i, const uint8_t* ptr, uint32_t num_bytes) { + uint64_t* dst = reinterpret_cast<uint64_t*>( + output->mutable_data(2) + + output->mutable_data_as<uint32_t>(1)[output_start_row + i]); + const uint64_t* src = reinterpret_cast<const uint64_t*>(ptr); + // Note that both `output` and `ptr` have been allocated with enough padding to + // accommodate the memory overshoot. See the allocations for `ResizableArrayData` + // in `JoinResultMaterialize` and `JoinResidualFilter` for `output`, and + // `RowTableImpl::kPaddingForVectors` for `ptr`. + for (uint32_t word_id = 0; + word_id < bit_util::CeilDiv(num_bytes, sizeof(uint64_t)); ++word_id) { + arrow::util::SafeStore(dst + word_id, + arrow::util::SafeLoad(src + word_id)); + } + }); +} + +void RowArray::DecodeNulls(ResizableArrayData* output, int output_start_row, + int column_id, int num_rows_to_append, + const uint32_t* row_ids) const { + RowArrayAccessor::VisitNulls( + rows_, column_id, num_rows_to_append, row_ids, [&](int i, uint8_t value) { + bit_util::SetBitTo(output->mutable_data(0), output_start_row + i, value == 0); + }); +} + Status RowArrayMerge::PrepareForMerge(RowArray* target, const std::vector<RowArray*>& sources, std::vector<int64_t>* first_target_row_id, - MemoryPool* pool) { + MemoryPool* pool, int64_t hardware_flags) { ARROW_DCHECK(!sources.empty()); ARROW_DCHECK(sources[0]->is_initialized_); const RowTableMetadata& metadata = sources[0]->rows_.metadata(); ARROW_DCHECK(!target->is_initialized_); - RETURN_NOT_OK(target->InitIfNeeded(pool, metadata)); + RETURN_NOT_OK(target->InitIfNeeded(pool, hardware_flags, metadata)); // Sum the number of rows from all input sources and calculate their total // size.
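A note on the dispatch shape used by `DecodeSelected` above: when AVX2 is available, the `Decode*_avx2` kernel consumes as many rows as it can in whole groups of 8 and reports how many rows it handled, and the scalar `Decode*` helper then finishes the tail. Below is a minimal, self-contained sketch of that pattern with hypothetical stand-in functions; it is not the Arrow API, just the control flow under those assumptions:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Stand-in for a SIMD decode kernel: handles only whole groups of 8 rows and
// returns how many rows it actually processed (always a multiple of 8).
static int DecodeSimd(uint32_t* out, const uint32_t* row_ids, int num_rows) {
  int processed = num_rows - (num_rows % 8);
  for (int i = 0; i < processed; ++i) out[i] = row_ids[i] * 2;  // pretend gather
  return processed;
}

// Scalar fallback: handles any row count, used for the tail (and for CPUs
// without AVX2).
static void DecodeScalar(uint32_t* out, const uint32_t* row_ids, int num_rows) {
  for (int i = 0; i < num_rows; ++i) out[i] = row_ids[i] * 2;
}

static void DecodeSelected(uint32_t* out, const uint32_t* row_ids, int num_rows,
                           bool has_avx2) {
  int processed = has_avx2 ? DecodeSimd(out, row_ids, num_rows) : 0;
  DecodeScalar(out + processed, row_ids + processed, num_rows - processed);
}

int main() {
  std::vector<uint32_t> ids(13), out(13);
  for (int i = 0; i < 13; ++i) ids[i] = i;
  DecodeSelected(out.data(), ids.data(), 13, /*has_avx2=*/true);
  for (uint32_t v : out) std::printf("%u ", v);  // 0 2 4 ... 24
  std::printf("\n");
}
```

This is also why `RowArray::DecodeSelected` first peels off `8 - num_rows_before % 8` rows with a recursive scalar call: the vector kernels assume the output write position is 8-row aligned.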
@@ -895,8 +848,8 @@ void SwissTableWithKeys::EqualCallback(int num_keys, const uint16_t* selection_m uint8_t* match_bitvector = match_bitvector_buf.mutable_data(); keys_.Compare(*in->batch, batch_start_to_use, batch_end_to_use, num_keys, - selection_to_use, group_ids_to_use, nullptr, nullptr, hardware_flags, - in->temp_stack, *in->temp_column_arrays, match_bitvector); + selection_to_use, group_ids_to_use, nullptr, nullptr, in->temp_stack, + *in->temp_column_arrays, match_bitvector); if (selection_maybe_null) { int num_keys_mismatch = 0; @@ -918,8 +871,7 @@ void SwissTableWithKeys::EqualCallback(int num_keys, const uint16_t* selection_m group_ids_to_use = group_ids; keys_.Compare(*in->batch, batch_start_to_use, batch_end_to_use, num_keys, selection_to_use, group_ids_to_use, out_num_keys_mismatch, - out_selection_mismatch, hardware_flags, in->temp_stack, - *in->temp_column_arrays); + out_selection_mismatch, in->temp_stack, *in->temp_column_arrays); } } @@ -944,16 +896,18 @@ Status SwissTableWithKeys::AppendCallback(int num_keys, const uint16_t* selectio batch_end_to_use = static_cast<int>(in->batch->length); selection_to_use = selection_to_use_buf.mutable_data(); - return keys_.AppendBatchSelection(swiss_table_.pool(), *in->batch, batch_start_to_use, - batch_end_to_use, num_keys, selection_to_use, + return keys_.AppendBatchSelection(swiss_table_.pool(), swiss_table_.hardware_flags(), + *in->batch, batch_start_to_use, batch_end_to_use, + num_keys, selection_to_use, *in->temp_column_arrays); } else { batch_start_to_use = in->batch_start_row; batch_end_to_use = in->batch_end_row; selection_to_use = selection; - return keys_.AppendBatchSelection(swiss_table_.pool(), *in->batch, batch_start_to_use, - batch_end_to_use, num_keys, selection_to_use, + return keys_.AppendBatchSelection(swiss_table_.pool(), swiss_table_.hardware_flags(), + *in->batch, batch_start_to_use, batch_end_to_use, + num_keys, selection_to_use, *in->temp_column_arrays); } } @@ -1177,8 +1131,10 @@ Status SwissTableForJoinBuild::Init(SwissTableForJoin* target, int dop, int64_t for (int i = 0; i < num_prtns_; ++i) { PartitionState& prtn_state = prtn_states_[i]; RETURN_NOT_OK(prtn_state.keys.Init(hardware_flags_, pool_)); - RETURN_NOT_OK(prtn_state.keys.keys()->InitIfNeeded(pool, key_row_metadata)); - RETURN_NOT_OK(prtn_state.payloads.InitIfNeeded(pool, payload_row_metadata)); + RETURN_NOT_OK( + prtn_state.keys.keys()->InitIfNeeded(pool, hardware_flags, key_row_metadata)); + RETURN_NOT_OK( + prtn_state.payloads.InitIfNeeded(pool, hardware_flags, payload_row_metadata)); } target_->dop_ = dop_; @@ -1294,7 +1250,7 @@ Status SwissTableForJoinBuild::ProcessPartition(int64_t thread_id, if (!no_payload_) { ARROW_DCHECK(payload_batch_maybe_null); RETURN_NOT_OK(prtn_state.payloads.AppendBatchSelection( - pool_, *payload_batch_maybe_null, 0, + pool_, hardware_flags_, *payload_batch_maybe_null, 0, static_cast<int>(payload_batch_maybe_null->length), num_rows_new, row_ids, locals.temp_column_arrays)); } @@ -1324,7 +1280,8 @@ Status SwissTableForJoinBuild::PreparePrtnMerge() { partition_keys[i] = prtn_states_[i].keys.keys(); } RETURN_NOT_OK(RowArrayMerge::PrepareForMerge(target_->map_.keys(), partition_keys, - &partition_keys_first_row_id_, pool_)); + &partition_keys_first_row_id_, pool_, + hardware_flags_)); // 2. SwissTable: // @@ -1346,8 +1303,8 @@ Status SwissTableForJoinBuild::PreparePrtnMerge() { partition_payloads[i] = &prtn_states_[i].payloads; } RETURN_NOT_OK(RowArrayMerge::PrepareForMerge(&target_->payloads_, partition_payloads, - &partition_payloads_first_row_id_, - pool_)); + &partition_payloads_first_row_id_, pool_, + hardware_flags_)); } // Check if we have duplicate keys @@ -1499,7 +1456,7 @@ void SwissTableForJoinBuild::FinishPrtnMerge(arrow::util::TempVectorStack* temp_ LightContext ctx; ctx.hardware_flags = hardware_flags_; ctx.stack = temp_stack; - std::ignore = target_->map_.keys()->rows_.has_any_nulls(&ctx); + target_->map_.keys()->EnsureHasAnyNullsComputed(ctx); } void JoinResultMaterialize::Init(MemoryPool* pool, @@ -1667,7 +1624,9 @@ Result<std::shared_ptr<ArrayData>> JoinResultMaterialize::FlushBuildColumn( const std::shared_ptr<DataType>& data_type, const RowArray* row_array, int column_id, uint32_t* row_ids) { ResizableArrayData output; - RETURN_NOT_OK(output.Init(data_type, pool_, bit_util::Log2(num_rows_))); + // Allocate at least 8 rows for the convenience of SIMD decoding. + int log_num_rows_min = std::max(3, bit_util::Log2(num_rows_)); + RETURN_NOT_OK(output.Init(data_type, pool_, log_num_rows_min)); for (size_t i = 0; i <= null_ranges_.size(); ++i) { int row_id_begin = @@ -2247,9 +2206,11 @@ Result<ExecBatch> JoinResidualFilter::MaterializeFilterInput( build_schemas_->map(HashJoinProjection::FILTER, HashJoinProjection::PAYLOAD); for (int i = 0; i < num_build_cols; ++i) { ResizableArrayData column_data; + // Allocate at least 8 rows for the convenience of SIMD decoding. + int log_num_rows_min = std::max(3, bit_util::Log2(num_batch_rows)); RETURN_NOT_OK( column_data.Init(build_schemas_->data_type(HashJoinProjection::FILTER, i), - pool_, bit_util::Log2(num_batch_rows))); + pool_, log_num_rows_min)); if (auto idx = to_key.get(i); idx != SchemaProjectionMap::kMissingField) { RETURN_NOT_OK(build_keys_->DecodeSelected(&column_data, idx, num_batch_rows, key_ids_maybe_null, pool_)); diff --git a/cpp/src/arrow/acero/swiss_join_avx2.cc b/cpp/src/arrow/acero/swiss_join_avx2.cc index 1076073523448..20886cad539c3 100644 --- a/cpp/src/arrow/acero/swiss_join_avx2.cc +++ b/cpp/src/arrow/acero/swiss_join_avx2.cc @@ -32,7 +32,7 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu // Number of rows processed together in a single iteration of the loop (single // call to the provided processing lambda). // - constexpr int unroll = 8; + constexpr int kUnroll = 8; bool is_fixed_length_column = rows.metadata().column_metadatas[column_id].is_fixed_length; @@ -48,6 +48,8 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu int varbinary_column_id = VarbinaryColumnId(rows.metadata(), column_id); const uint8_t* row_ptr_base = rows.data(2); const RowTableImpl::offset_type* row_offsets = rows.offsets(); + auto row_offsets_i64 = + reinterpret_cast<arrow::util::int64_for_gather_t*>(row_offsets); static_assert( sizeof(RowTableImpl::offset_type) == sizeof(int64_t), "RowArrayAccessor::Visit_avx2 only supports 64-bit RowTableImpl::offset_type"); @@ -58,17 +60,17 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu __m256i field_offset_within_row = _mm256_set1_epi32(rows.metadata().fixed_length); __m256i varbinary_end_array_offset = _mm256_set1_epi64x(rows.metadata().varbinary_end_array_offset); - for (int i = 0; i < num_rows / unroll; ++i) { + for (int i = 0; i < num_rows / kUnroll; ++i) { // Load 8 32-bit row ids. __m256i row_id = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row_ids) + i); // Gather the lower/higher 4 64-bit row offsets based on the lower/higher 4 32-bit // row ids. __m256i row_offset_lo = - _mm256_i32gather_epi64(row_offsets, _mm256_castsi256_si128(row_id), + _mm256_i32gather_epi64(row_offsets_i64, _mm256_castsi256_si128(row_id), sizeof(RowTableImpl::offset_type)); __m256i row_offset_hi = - _mm256_i32gather_epi64(row_offsets, _mm256_extracti128_si256(row_id, 1), + _mm256_i32gather_epi64(row_offsets_i64, _mm256_extracti128_si256(row_id, 1), sizeof(RowTableImpl::offset_type)); // Gather the lower/higher 4 32-bit field lengths based on the lower/higher 4 // 64-bit row offsets. @@ -81,7 +83,7 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu // The final 8 32-bit field lengths, subtracting the field offset within row. __m256i field_length = _mm256_sub_epi32( _mm256_set_m128i(field_length_hi, field_length_lo), field_offset_within_row); - process_8_values_fn(i * unroll, row_ptr_base, + process_8_values_fn(i * kUnroll, row_ptr_base, _mm256_add_epi64(row_offset_lo, field_offset_within_row), _mm256_add_epi64(row_offset_hi, field_offset_within_row), field_length); @@ -94,19 +96,19 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu sizeof(uint32_t) * (varbinary_column_id - 1)); auto row_ptr_base_i64 = reinterpret_cast<arrow::util::int64_for_gather_t*>(row_ptr_base); - for (int i = 0; i < num_rows / unroll; ++i) { + for (int i = 0; i < num_rows / kUnroll; ++i) { // Load 8 32-bit row ids. __m256i row_id = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row_ids) + i); // Gather the lower/higher 4 64-bit row offsets based on the lower/higher 4 32-bit // row ids. __m256i row_offset_lo = - _mm256_i32gather_epi64(row_offsets, _mm256_castsi256_si128(row_id), + _mm256_i32gather_epi64(row_offsets_i64, _mm256_castsi256_si128(row_id), sizeof(RowTableImpl::offset_type)); // Gather the lower/higher 4 32-bit field lengths based on the lower/higher 4 // 64-bit row offsets. __m256i row_offset_hi = - _mm256_i32gather_epi64(row_offsets, _mm256_extracti128_si256(row_id, 1), + _mm256_i32gather_epi64(row_offsets_i64, _mm256_extracti128_si256(row_id, 1), sizeof(RowTableImpl::offset_type)); // Prepare the lower/higher 4 64-bit end array offsets based on the lower/higher 4 // 64-bit row offsets.
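The hunks around here all lean on the same gather idiom: `_mm256_i32gather_epi64` consumes only four 32-bit indices per call, so the 8 loaded row ids are split into the low and high 128-bit halves to gather 4 + 4 64-bit row offsets. A standalone illustration of that split, with made-up data (not Arrow code; compile with `-mavx2`):

```cpp
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  // 8 32-bit row ids selecting into a table of 64-bit row offsets.
  uint32_t row_ids[8] = {7, 0, 3, 1, 6, 2, 5, 4};
  int64_t offsets[8] = {0, 10, 20, 30, 40, 50, 60, 70};

  __m256i row_id = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row_ids));
  // Lower 4 indices -> 4 gathered 64-bit offsets; the scale is the element size.
  __m256i lo = _mm256_i32gather_epi64(reinterpret_cast<const long long*>(offsets),
                                      _mm256_castsi256_si128(row_id), sizeof(int64_t));
  // Upper 4 indices -> the remaining 4 offsets.
  __m256i hi = _mm256_i32gather_epi64(reinterpret_cast<const long long*>(offsets),
                                      _mm256_extracti128_si256(row_id, 1),
                                      sizeof(int64_t));

  int64_t out[8];
  _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), lo);
  _mm256_storeu_si256(reinterpret_cast<__m256i*>(out + 4), hi);
  for (int64_t v : out) std::printf("%lld ", static_cast<long long>(v));
  // Prints: 70 0 30 10 60 20 50 40
}
```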
@@ -127,8 +129,8 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu __m256i field_offset_within_row = _mm256_blend_epi32( field_offset_within_row_A, field_offset_within_row_B, 0xf0); - __m256i alignment_padding = - _mm256_andnot_si256(field_offset_within_row, _mm256_set1_epi8(0xff)); + __m256i alignment_padding = _mm256_andnot_si256( + field_offset_within_row, _mm256_set1_epi8(static_cast<char>(0xff))); alignment_padding = _mm256_add_epi32(alignment_padding, _mm256_set1_epi32(1)); alignment_padding = _mm256_and_si256( alignment_padding, _mm256_set1_epi32(rows.metadata().string_alignment - 1)); @@ -147,7 +149,7 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu field_offset_within_row_B = _mm256_add_epi32(field_offset_within_row_B, alignment_padding); - process_8_values_fn(i * unroll, row_ptr_base, + process_8_values_fn(i * kUnroll, row_ptr_base, _mm256_add_epi64(row_offset_lo, field_offset_within_row_A), _mm256_add_epi64(row_offset_hi, field_offset_within_row_B), field_length); @@ -159,15 +161,21 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu __m256i field_offset_within_row = _mm256_set1_epi64x(rows.metadata().encoded_field_offset( rows.metadata().pos_after_encoding(column_id))); - __m256i field_length = - _mm256_set1_epi32(rows.metadata().column_metadatas[column_id].fixed_length); + uint32_t actual_field_length = + rows.metadata().column_metadatas[column_id].fixed_length; + // Bit column is encoded as a single byte + if (actual_field_length == 0) { + actual_field_length = 1; + } + __m256i field_length = _mm256_set1_epi32(actual_field_length); + __m256i row_length = _mm256_set1_epi64x(rows.metadata().fixed_length); bool is_fixed_length_row = rows.metadata().is_fixed_length; if (is_fixed_length_row) { // Case 3: This is a fixed length column in fixed length row // const uint8_t* row_ptr_base = rows.data(1); - for (int i = 0; i < num_rows / unroll; ++i) { + for (int i = 0; i < num_rows / kUnroll; ++i) { // Load 8 32-bit row ids. __m256i row_id = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row_ids) + i); @@ -177,15 +185,15 @@ int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int nu __m256i row_id_hi = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(row_id, 1)); // Calculate the lower/higher 4 64-bit row offsets based on the lower/higher 4 // 64-bit row ids and the fixed field length. - __m256i row_offset_lo = _mm256_mul_epi32(row_id_lo, field_length); - __m256i row_offset_hi = _mm256_mul_epi32(row_id_hi, field_length); + __m256i row_offset_lo = _mm256_mul_epi32(row_id_lo, row_length); + __m256i row_offset_hi = _mm256_mul_epi32(row_id_hi, row_length); // Calculate the lower/higher 4 64-bit field offsets based on the lower/higher 4 // 64-bit row offsets and field offset within row. __m256i field_offset_lo = _mm256_add_epi64(row_offset_lo, field_offset_within_row); __m256i field_offset_hi = _mm256_add_epi64(row_offset_hi, field_offset_within_row); - process_8_values_fn(i * unroll, row_ptr_base, field_offset_lo, field_offset_hi, + process_8_values_fn(i * kUnroll, row_ptr_base, field_offset_lo, field_offset_hi, field_length); } } else { // Case 4: This is a fixed length column in a varying length row // const uint8_t* row_ptr_base = rows.data(2); const RowTableImpl::offset_type* row_offsets = rows.offsets(); - for (int i = 0; i < num_rows / unroll; ++i) { + auto row_offsets_i64 = + reinterpret_cast<arrow::util::int64_for_gather_t*>(row_offsets); + for (int i = 0; i < num_rows / kUnroll; ++i) { // Load 8 32-bit row ids. __m256i row_id = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row_ids) + i); // Gather the lower/higher 4 64-bit row offsets based on the lower/higher 4 32-bit // row ids. __m256i row_offset_lo = - _mm256_i32gather_epi64(row_offsets, _mm256_castsi256_si128(row_id), + _mm256_i32gather_epi64(row_offsets_i64, _mm256_castsi256_si128(row_id), sizeof(RowTableImpl::offset_type)); __m256i row_offset_hi = - _mm256_i32gather_epi64(row_offsets, _mm256_extracti128_si256(row_id, 1), + _mm256_i32gather_epi64(row_offsets_i64, _mm256_extracti128_si256(row_id, 1), sizeof(RowTableImpl::offset_type)); // Calculate the lower/higher 4 64-bit field offsets based on the lower/higher 4 // 64-bit row offsets and field offset within row. __m256i field_offset_lo = _mm256_add_epi64(row_offset_lo, field_offset_within_row); __m256i field_offset_hi = _mm256_add_epi64(row_offset_hi, field_offset_within_row); - process_8_values_fn(i * unroll, row_ptr_base, field_offset_lo, field_offset_hi, + process_8_values_fn(i * kUnroll, row_ptr_base, field_offset_lo, field_offset_hi, field_length); } } } - return num_rows - (num_rows % unroll); + return num_rows - (num_rows % kUnroll); } template <class PROCESS_VALUE_FN> @@ -227,31 +237,296 @@ int RowArrayAccessor::VisitNulls_avx2(const RowTableImpl& rows, int column_id, // Number of rows processed together in a single iteration of the loop (single // call to the provided processing lambda). // - constexpr int unroll = 8; + constexpr int kUnroll = 8; const uint8_t* null_masks = rows.null_masks(); __m256i null_bits_per_row = _mm256_set1_epi32(8 * rows.metadata().null_masks_bytes_per_row); - for (int i = 0; i < num_rows / unroll; ++i) { + __m256i pos_after_encoding = + _mm256_set1_epi32(rows.metadata().pos_after_encoding(column_id)); + for (int i = 0; i < num_rows / kUnroll; ++i) { __m256i row_id = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row_ids) + i); __m256i bit_id = _mm256_mullo_epi32(row_id, null_bits_per_row); - bit_id = _mm256_add_epi32(bit_id, _mm256_set1_epi32(column_id)); + bit_id = _mm256_add_epi32(bit_id, pos_after_encoding); __m256i bytes = _mm256_i32gather_epi32(reinterpret_cast<const int*>(null_masks), _mm256_srli_epi32(bit_id, 3), 1); __m256i bit_in_word = _mm256_sllv_epi32( _mm256_set1_epi32(1), _mm256_and_si256(bit_id, _mm256_set1_epi32(7))); + // `result` will contain one 32-bit word per tested null bit, either 0xffffffff if the + // null bit was set or 0 if it was unset. __m256i result = _mm256_cmpeq_epi32(_mm256_and_si256(bytes, bit_in_word), bit_in_word); - uint64_t null_bytes = static_cast<uint64_t>( + // NB: Be careful about sign-extension when casting the return value of + // _mm256_movemask_epi8 (signed 32-bit) to unsigned 64-bit, which will pollute the + // higher bits of the following OR.
+ uint32_t null_bytes_lo = static_cast<uint32_t>( _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(result)))); - null_bytes |= static_cast<uint64_t>(_mm256_movemask_epi8( - _mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1)))) - << 32; + uint64_t null_bytes_hi = + _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1))); + uint64_t null_bytes = null_bytes_lo | (null_bytes_hi << 32); - process_8_values_fn(i * unroll, null_bytes); + process_8_values_fn(i * kUnroll, null_bytes); } - return num_rows - (num_rows % unroll); + return num_rows - (num_rows % kUnroll); +} + +namespace { + +inline void Decode8FixedLength0_avx2(uint8_t* output, const uint8_t* row_ptr_base, + __m256i offset_lo, __m256i offset_hi) { + // Gather the lower/higher 4 32-bit (only lower 1 bit interesting) values based on the + // lower/higher 4 64-bit row offsets. + __m128i row_lo = + _mm256_i64gather_epi32(reinterpret_cast<const int*>(row_ptr_base), offset_lo, 1); + __m128i row_hi = + _mm256_i64gather_epi32(reinterpret_cast<const int*>(row_ptr_base), offset_hi, 1); + // Extend to 64-bit. + __m256i row_lo_64 = _mm256_cvtepi32_epi64(row_lo); + __m256i row_hi_64 = _mm256_cvtepi32_epi64(row_hi); + // Keep the first 8 bits in each 64-bit value, as the other bits belong to other + // columns. + row_lo_64 = _mm256_and_si256(row_lo_64, _mm256_set1_epi64x(0xFF)); + row_hi_64 = _mm256_and_si256(row_hi_64, _mm256_set1_epi64x(0xFF)); + // If a 64-bit value is zero, then we get 64 set bits. + __m256i is_zero_lo_64 = _mm256_cmpeq_epi64(row_lo_64, _mm256_setzero_si256()); + __m256i is_zero_hi_64 = _mm256_cmpeq_epi64(row_hi_64, _mm256_setzero_si256()); + // 64 set bits per value to 8 set bits (one byte) per value. + int is_zero_lo_8 = _mm256_movemask_epi8(is_zero_lo_64); + int is_zero_hi_8 = _mm256_movemask_epi8(is_zero_hi_64); + // 8 set bits to 1 set bit. + uint8_t is_zero = static_cast<uint8_t>( + _mm_movemask_epi8(_mm_set_epi32(0, 0, is_zero_hi_8, is_zero_lo_8))); + *output = static_cast<uint8_t>(~is_zero); +} + +inline void Decode8FixedLength1_avx2(uint8_t* output, const uint8_t* row_ptr_base, + __m256i offset_lo, __m256i offset_hi) { + // Gather the lower/higher 4 32-bit (only lower 8 bits interesting) values based on the + // lower/higher 4 64-bit row offsets. + __m128i row_lo = + _mm256_i64gather_epi32(reinterpret_cast<const int*>(row_ptr_base), offset_lo, 1); + __m128i row_hi = + _mm256_i64gather_epi32(reinterpret_cast<const int*>(row_ptr_base), offset_hi, 1); + __m256i row = _mm256_set_m128i(row_hi, row_lo); + // Shuffle the lower 8 bits of each 32-bit value to the lower 32 bits of each 128-bit + // lane. + constexpr uint64_t kByteSequence_0_4_8_12 = 0x0c080400ULL; + const __m256i shuffle_const = + _mm256_setr_epi64x(kByteSequence_0_4_8_12, -1, kByteSequence_0_4_8_12, -1); + row = _mm256_shuffle_epi8(row, shuffle_const); + // Get the lower 32 bits (4 8-bit values) from each 128-bit lane. + // NB: Be careful about sign-extension when casting the return value of + // _mm256_extract_epi32 (signed 32-bit) to unsigned 64-bit, which will pollute the + // higher bits of the following OR. + uint32_t compact_row_lo = static_cast<uint32_t>(_mm256_extract_epi32(row, 0)); + uint64_t compact_row_hi = static_cast<uint64_t>(_mm256_extract_epi32(row, 4)) << 32; + *reinterpret_cast<uint64_t*>(output) = compact_row_lo | compact_row_hi; +} + +inline void Decode8FixedLength2_avx2(uint16_t* output, const uint8_t* row_ptr_base, + __m256i offset_lo, __m256i offset_hi) { + // Gather the lower/higher 4 32-bit (only lower 16 bits interesting) values based on the + // lower/higher 4 64-bit row offsets. + __m128i row_lo = + _mm256_i64gather_epi32(reinterpret_cast<const int*>(row_ptr_base), offset_lo, 1); + __m128i row_hi = + _mm256_i64gather_epi32(reinterpret_cast<const int*>(row_ptr_base), offset_hi, 1); + __m256i row = _mm256_set_m128i(row_hi, row_lo); + // Shuffle the lower 16 bits of each 32-bit value to the lower 64 bits of each 128-bit + // lane. + constexpr uint64_t kByteSequence_0_1_4_5_8_9_12_13 = 0x0d0c090805040100ULL; + const __m256i shuffle_const = _mm256_setr_epi64x(kByteSequence_0_1_4_5_8_9_12_13, -1, + kByteSequence_0_1_4_5_8_9_12_13, -1); + row = _mm256_shuffle_epi8(row, shuffle_const); + // Swap the second and the third 64-bit lane, so that all 16-bit values end up in the + // lower half of `row`. + // (0xd8 = 0b 11 01 10 00) + row = _mm256_permute4x64_epi64(row, 0xd8); + _mm_storeu_si128(reinterpret_cast<__m128i*>(output), _mm256_castsi256_si128(row)); +} + +inline void Decode8FixedLength4_avx2(uint32_t* output, const uint8_t* row_ptr_base, + __m256i offset_lo, __m256i offset_hi) { + // Gather the lower/higher 4 32-bit values based on the lower/higher 4 64-bit row + // offsets. + __m128i row_lo = + _mm256_i64gather_epi32(reinterpret_cast<const int*>(row_ptr_base), offset_lo, 1); + __m128i row_hi = + _mm256_i64gather_epi32(reinterpret_cast<const int*>(row_ptr_base), offset_hi, 1); + __m256i row = _mm256_set_m128i(row_hi, row_lo); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output), row); +} + +inline void Decode8FixedLength8_avx2(uint64_t* output, const uint8_t* row_ptr_base, + __m256i offset_lo, __m256i offset_hi) { + auto row_ptr_base_i64 = + reinterpret_cast<arrow::util::int64_for_gather_t*>(row_ptr_base); + // Gather the lower/higher 4 64-bit values based on the lower/higher 4 64-bit row + // offsets. + __m256i row_lo = _mm256_i64gather_epi64(row_ptr_base_i64, offset_lo, 1); + __m256i row_hi = _mm256_i64gather_epi64(row_ptr_base_i64, offset_hi, 1); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output), row_lo); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + 4), row_hi); +} + +inline void Decode1_avx2(uint8_t* output, const uint8_t* row_ptr, uint32_t num_bytes) { + // Copy 32 bytes at a time. + // Note that both `output` and `row_ptr` have been allocated with enough padding to + // accommodate the memory overshoot. See the allocations for `ResizableArrayData` in + // `JoinResultMaterialize` and `JoinResidualFilter` for `output`, and + // `RowTableImpl::kPaddingForVectors` for `row_ptr`. + __m256i* output_i256 = reinterpret_cast<__m256i*>(output); + const __m256i* row_ptr_i256 = reinterpret_cast<const __m256i*>(row_ptr); + for (int istripe = 0; istripe < bit_util::CeilDiv(num_bytes, 32); ++istripe) { + _mm256_storeu_si256(output_i256 + istripe, + _mm256_loadu_si256(row_ptr_i256 + istripe)); + } +} + +inline uint32_t Decode8Offset_avx2(uint32_t* output, uint32_t current_length, + __m256i num_bytes) { + uint32_t num_bytes_last = static_cast<uint32_t>(_mm256_extract_epi32(num_bytes, 7)); + // Init every offset with the current length. + __m256i offsets = _mm256_set1_epi32(current_length); + // We keep left-shifting the length and accumulate the offset by adding the length. + __m256i length = + _mm256_permutevar8x32_epi32(num_bytes, _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6)); + length = _mm256_insert_epi32(length, 0, 0); + // `length` is now a sequence of 32-bit words such as: + // - length[0] = 0 + // - length[1] = num_bytes[0] + // ... + // - length[7] = num_bytes[6] + // (note that num_bytes[7] is kept in `num_bytes_last`) + for (int i = 0; i < 7; ++i) { + offsets = _mm256_add_epi32(offsets, length); + length = + _mm256_permutevar8x32_epi32(length, _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6)); + length = _mm256_insert_epi32(length, 0, 0); + } + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output), offsets); + return _mm256_extract_epi32(offsets, 7) + num_bytes_last; +} + +inline void Decode8Null_avx2(uint8_t* output, uint64_t null_bytes) { + uint8_t null_bits = + static_cast<uint8_t>(_mm256_movemask_epi8(_mm256_set1_epi64x(null_bytes))); + *output = ~null_bits; +} + +} // namespace + +int RowArray::DecodeFixedLength_avx2(ResizableArrayData* output, int output_start_row, + int column_id, uint32_t fixed_length, + int num_rows_to_append, + const uint32_t* row_ids) const { + DCHECK_EQ(output_start_row % 8, 0); + + int num_rows_processed = 0; + switch (fixed_length) { + case 0: + num_rows_processed = RowArrayAccessor::Visit_avx2( + rows_, column_id, num_rows_to_append, row_ids, + [&](int i, const uint8_t* row_ptr_base, __m256i offset_lo, __m256i offset_hi, + __m256i num_bytes) { + DCHECK_EQ(i % 8, 0); + Decode8FixedLength0_avx2(output->mutable_data(1) + (output_start_row + i) / 8, + row_ptr_base, offset_lo, offset_hi); + }); + break; + case 1: + num_rows_processed = RowArrayAccessor::Visit_avx2( + rows_, column_id, num_rows_to_append, row_ids, + [&](int i, const uint8_t* row_ptr_base, __m256i offset_lo, __m256i offset_hi, + __m256i num_bytes) { + Decode8FixedLength1_avx2(output->mutable_data(1) + output_start_row + i, + row_ptr_base, offset_lo, offset_hi); + }); + break; + case 2: + num_rows_processed = RowArrayAccessor::Visit_avx2( + rows_, column_id, num_rows_to_append, row_ids, + [&](int i, const uint8_t* row_ptr_base, __m256i offset_lo, __m256i offset_hi, + __m256i num_bytes) { + Decode8FixedLength2_avx2( + output->mutable_data_as<uint16_t>(1) + output_start_row + i, row_ptr_base, + offset_lo, offset_hi); + }); + break; + case 4: + num_rows_processed = RowArrayAccessor::Visit_avx2( + rows_, column_id, num_rows_to_append, row_ids, + [&](int i, const uint8_t* row_ptr_base, __m256i offset_lo, __m256i offset_hi, + __m256i num_bytes) { + Decode8FixedLength4_avx2( + output->mutable_data_as<uint32_t>(1) + output_start_row + i, row_ptr_base, + offset_lo, offset_hi); + }); + break; + case 8: + num_rows_processed = RowArrayAccessor::Visit_avx2( + rows_, column_id, num_rows_to_append, row_ids, + [&](int i, const uint8_t* row_ptr_base, __m256i offset_lo, __m256i offset_hi, + __m256i num_bytes) { + Decode8FixedLength8_avx2( + output->mutable_data_as<uint64_t>(1) + output_start_row + i, row_ptr_base, + offset_lo, offset_hi); + }); + break; + default: + RowArrayAccessor::Visit( + rows_, column_id, num_rows_to_append, row_ids, + [&](int i, const uint8_t* row_ptr, uint32_t num_bytes) { + Decode1_avx2(output->mutable_data(1) + num_bytes * (output_start_row + i), + row_ptr, num_bytes); + }); + num_rows_processed = num_rows_to_append; + break; + } + + return num_rows_processed; +} + +int RowArray::DecodeOffsets_avx2(ResizableArrayData* output, int output_start_row, + int column_id, int num_rows_to_append, + const uint32_t* row_ids) const { + uint32_t* offsets = output->mutable_data_as<uint32_t>(1) + output_start_row; + uint32_t current_length = (output_start_row == 0) ? 0 : offsets[0];
0 : offsets[0]; + int num_rows_processed = RowArrayAccessor::Visit_avx2( + rows_, column_id, num_rows_to_append, row_ids, + [&](int i, const uint8_t* row_ptr_base, __m256i offset_lo, __m256i offset_hi, + __m256i num_bytes) { + current_length = Decode8Offset_avx2(offsets + i, current_length, num_bytes); + }); + offsets[num_rows_processed] = current_length; + return num_rows_processed; +} + +int RowArray::DecodeVarLength_avx2(ResizableArrayData* output, int output_start_row, + int column_id, int num_rows_to_append, + const uint32_t* row_ids) const { + RowArrayAccessor::Visit( + rows_, column_id, num_rows_to_append, row_ids, + [&](int i, const uint8_t* row_ptr, uint32_t num_bytes) { + uint8_t* dst = output->mutable_data(2) + + output->mutable_data_as(1)[output_start_row + i]; + Decode1_avx2(dst, row_ptr, num_bytes); + }); + return num_rows_to_append; +} + +int RowArray::DecodeNulls_avx2(ResizableArrayData* output, int output_start_row, + int column_id, int num_rows_to_append, + const uint32_t* row_ids) const { + DCHECK_EQ(output_start_row % 8, 0); + + return RowArrayAccessor::VisitNulls_avx2( + rows_, column_id, num_rows_to_append, row_ids, [&](int i, uint64_t null_bytes) { + DCHECK_EQ(i % 8, 0); + Decode8Null_avx2(output->mutable_data(0) + (output_start_row + i) / 8, + null_bytes); + }); } } // namespace acero diff --git a/cpp/src/arrow/acero/swiss_join_internal.h b/cpp/src/arrow/acero/swiss_join_internal.h index 4d749c1c529ae..f2f3ac5b1bf93 100644 --- a/cpp/src/arrow/acero/swiss_join_internal.h +++ b/cpp/src/arrow/acero/swiss_join_internal.h @@ -32,6 +32,7 @@ namespace arrow { using compute::ExecBatchBuilder; using compute::KeyColumnArray; using compute::KeyColumnMetadata; +using compute::LightContext; using compute::ResizableArrayData; using compute::RowTableEncoder; using compute::RowTableImpl; @@ -47,16 +48,6 @@ class RowArrayAccessor { // static int VarbinaryColumnId(const RowTableMetadata& row_metadata, int column_id); - // Calculate how many rows to skip from the tail of the - // sequence of selected rows, such that the total size of skipped rows is at - // least equal to the size specified by the caller. Skipping of the tail rows - // is used to allow for faster processing by the caller of remaining rows - // without checking buffer bounds (useful with SIMD or fixed size memory loads - // and stores). - // - static int NumRowsToSkip(const RowTableImpl& rows, int column_id, int num_rows, - const uint32_t* row_ids, int num_tail_bytes_to_skip); - // The supplied lambda will be called for each row in the given list of rows. // The arguments given to it will be: // - index of a row (within the set of selected rows), @@ -68,7 +59,80 @@ class RowArrayAccessor { // template static void Visit(const RowTableImpl& rows, int column_id, int num_rows, - const uint32_t* row_ids, PROCESS_VALUE_FN process_value_fn); + const uint32_t* row_ids, PROCESS_VALUE_FN process_value_fn) { + bool is_fixed_length_column = + rows.metadata().column_metadatas[column_id].is_fixed_length; + + // There are 4 cases, each requiring different steps: + // 1. Varying length column that is the first varying length column in a row + // 2. Varying length column that is not the first varying length column in a + // row + // 3. Fixed length column in a fixed length row + // 4. 
Fixed length column in a varying length row + + if (!is_fixed_length_column) { + int varbinary_column_id = VarbinaryColumnId(rows.metadata(), column_id); + const uint8_t* row_ptr_base = rows.data(2); + const RowTableImpl::offset_type* row_offsets = rows.offsets(); + uint32_t field_offset_within_row, field_length; + + if (varbinary_column_id == 0) { + // Case 1: This is the first varbinary column + // + for (int i = 0; i < num_rows; ++i) { + uint32_t row_id = row_ids[i]; + const uint8_t* row_ptr = row_ptr_base + row_offsets[row_id]; + rows.metadata().first_varbinary_offset_and_length( + row_ptr, &field_offset_within_row, &field_length); + process_value_fn(i, row_ptr + field_offset_within_row, field_length); + } + } else { + // Case 2: This is second or later varbinary column + // + for (int i = 0; i < num_rows; ++i) { + uint32_t row_id = row_ids[i]; + const uint8_t* row_ptr = row_ptr_base + row_offsets[row_id]; + rows.metadata().nth_varbinary_offset_and_length( + row_ptr, varbinary_column_id, &field_offset_within_row, &field_length); + process_value_fn(i, row_ptr + field_offset_within_row, field_length); + } + } + } + + if (is_fixed_length_column) { + uint32_t field_offset_within_row = rows.metadata().encoded_field_offset( + rows.metadata().pos_after_encoding(column_id)); + uint32_t field_length = rows.metadata().column_metadatas[column_id].fixed_length; + // Bit column is encoded as a single byte + // + if (field_length == 0) { + field_length = 1; + } + uint32_t row_length = rows.metadata().fixed_length; + + bool is_fixed_length_row = rows.metadata().is_fixed_length; + if (is_fixed_length_row) { + // Case 3: This is a fixed length column in a fixed length row + // + const uint8_t* row_ptr_base = rows.data(1) + field_offset_within_row; + for (int i = 0; i < num_rows; ++i) { + uint32_t row_id = row_ids[i]; + const uint8_t* row_ptr = row_ptr_base + row_length * row_id; + process_value_fn(i, row_ptr, field_length); + } + } else { + // Case 4: This is a fixed length column in a varying length row + // + const uint8_t* row_ptr_base = rows.data(2) + field_offset_within_row; + const RowTableImpl::offset_type* row_offsets = rows.offsets(); + for (int i = 0; i < num_rows; ++i) { + uint32_t row_id = row_ids[i]; + const uint8_t* row_ptr = row_ptr_base + row_offsets[row_id]; + process_value_fn(i, row_ptr, field_length); + } + } + } + } // The supplied lambda will be called for each row in the given list of rows. // The arguments given to it will be: @@ -77,9 +141,17 @@ class RowArrayAccessor { // template static void VisitNulls(const RowTableImpl& rows, int column_id, int num_rows, - const uint32_t* row_ids, PROCESS_VALUE_FN process_value_fn); + const uint32_t* row_ids, PROCESS_VALUE_FN process_value_fn) { + const uint8_t* null_masks = rows.null_masks(); + uint32_t null_mask_num_bytes = rows.metadata().null_masks_bytes_per_row; + uint32_t pos_after_encoding = rows.metadata().pos_after_encoding(column_id); + for (int i = 0; i < num_rows; ++i) { + uint32_t row_id = row_ids[i]; + int64_t bit_id = row_id * null_mask_num_bytes * 8 + pos_after_encoding; + process_value_fn(i, bit_util::GetBit(null_masks, bit_id) ? 0xff : 0); + } + } - private: #if defined(ARROW_HAVE_RUNTIME_AVX2) // This is equivalent to Visit method, but processing 8 rows at a time in a // loop. @@ -108,13 +180,15 @@ class RowArrayAccessor { // can be called by multiple threads concurrently. 
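For intuition, all four cases above reduce to a base pointer plus at most two offsets. A standalone sketch of the two fixed-length cases, with hypothetical argument names standing in for the values the real code reads from RowTableMetadata and RowTableImpl:

    #include <cstdint>

    // Case 3 sketch: fixed-length column in fixed-length rows. Every row spans
    // `row_length` bytes, so the value sits at a constant offset in each row.
    inline const uint8_t* FixedColumnInFixedRow(const uint8_t* row_data,
                                                uint32_t row_length,
                                                uint32_t field_offset,
                                                uint32_t row_id) {
      return row_data + static_cast<uint64_t>(row_length) * row_id + field_offset;
    }

    // Case 4 sketch: fixed-length column in varying-length rows. One extra
    // indirection through the row-offsets buffer locates the row first.
    inline const uint8_t* FixedColumnInVarRow(const uint8_t* row_data,
                                              const int64_t* row_offsets,
                                              uint32_t field_offset,
                                              uint32_t row_id) {
      return row_data + row_offsets[row_id] + field_offset;
    }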
// struct RowArray { - RowArray() : is_initialized_(false) {} + RowArray() : is_initialized_(false), hardware_flags_(0) {} - Status InitIfNeeded(MemoryPool* pool, const ExecBatch& batch); - Status InitIfNeeded(MemoryPool* pool, const RowTableMetadata& row_metadata); + Status InitIfNeeded(MemoryPool* pool, int64_t hardware_flags, const ExecBatch& batch); + Status InitIfNeeded(MemoryPool* pool, int64_t hardware_flags, + const RowTableMetadata& row_metadata); - Status AppendBatchSelection(MemoryPool* pool, const ExecBatch& batch, int begin_row_id, - int end_row_id, int num_row_ids, const uint16_t* row_ids, + Status AppendBatchSelection(MemoryPool* pool, int64_t hardware_flags, + const ExecBatch& batch, int begin_row_id, int end_row_id, + int num_row_ids, const uint16_t* row_ids, std::vector& temp_column_arrays); // This can only be called for a minibatch. @@ -122,12 +196,10 @@ struct RowArray { void Compare(const ExecBatch& batch, int begin_row_id, int end_row_id, int num_selected, const uint16_t* batch_selection_maybe_null, const uint32_t* array_row_ids, uint32_t* out_num_not_equal, uint16_t* out_not_equal_selection, - int64_t hardware_flags, arrow::util::TempVectorStack* temp_stack, + arrow::util::TempVectorStack* temp_stack, std::vector& temp_column_arrays, uint8_t* out_match_bitvector_maybe_null = NULLPTR); - // TODO: add AVX2 version - // Status DecodeSelected(ResizableArrayData* target, int column_id, int num_rows_to_append, const uint32_t* row_ids, MemoryPool* pool) const; @@ -135,10 +207,43 @@ struct RowArray { int64_t num_rows() const { return is_initialized_ ? rows_.length() : 0; } + void EnsureHasAnyNullsComputed(const LightContext& ctx) { + std::ignore = rows_.has_any_nulls(&ctx); + } + + private: bool is_initialized_; + + int64_t hardware_flags_; RowTableEncoder encoder_; RowTableImpl rows_; RowTableImpl rows_temp_; + + private: + void DecodeFixedLength(ResizableArrayData* output, int output_start_row, int column_id, + uint32_t fixed_length, int num_rows_to_append, + const uint32_t* row_ids) const; + void DecodeOffsets(ResizableArrayData* output, int output_start_row, int column_id, + int num_rows_to_append, const uint32_t* row_ids) const; + void DecodeVarLength(ResizableArrayData* output, int output_start_row, int column_id, + int num_rows_to_append, const uint32_t* row_ids) const; + void DecodeNulls(ResizableArrayData* output, int output_start_row, int column_id, + int num_rows_to_append, const uint32_t* row_ids) const; + +#if defined(ARROW_HAVE_RUNTIME_AVX2) + int DecodeFixedLength_avx2(ResizableArrayData* output, int output_start_row, + int column_id, uint32_t fixed_length, int num_rows_to_append, + const uint32_t* row_ids) const; + int DecodeOffsets_avx2(ResizableArrayData* output, int output_start_row, int column_id, + int num_rows_to_append, const uint32_t* row_ids) const; + int DecodeVarLength_avx2(ResizableArrayData* output, int output_start_row, + int column_id, int num_rows_to_append, + const uint32_t* row_ids) const; + int DecodeNulls_avx2(ResizableArrayData* output, int output_start_row, int column_id, + int num_rows_to_append, const uint32_t* row_ids) const; +#endif + + friend class RowArrayMerge; }; // Implements concatenating multiple row arrays into a single one, using @@ -161,7 +266,7 @@ class RowArrayMerge { // static Status PrepareForMerge(RowArray* target, const std::vector& sources, std::vector* first_target_row_id, - MemoryPool* pool); + MemoryPool* pool, int64_t hardware_flags); // Copy rows from source array to target array. 
// Both arrays must have the same row metadata. diff --git a/cpp/src/arrow/compute/light_array_internal.h b/cpp/src/arrow/compute/light_array_internal.h index 5adb06e540009..60f1a6a21e264 100644 --- a/cpp/src/arrow/compute/light_array_internal.h +++ b/cpp/src/arrow/compute/light_array_internal.h @@ -319,6 +319,9 @@ class ARROW_EXPORT ResizableArrayData { /// \brief The current length (in rows) of the array int num_rows() const { return num_rows_; } + /// \brief The current allocated length (in rows) of the array + int num_rows_allocated() const { return num_rows_allocated_; } + /// \brief A non-owning view into this array KeyColumnArray column_array() const; @@ -347,6 +350,11 @@ class ARROW_EXPORT ResizableArrayData { /// length binary data uint8_t* mutable_data(int i) { return buffers_[i]->mutable_data(); } + template + T* mutable_data_as(int i) { + return reinterpret_cast(mutable_data(i)); + } + private: static constexpr int64_t kNumPaddingBytes = 64; int log_num_rows_min_; diff --git a/cpp/src/arrow/dataset/ArrowDatasetConfig.cmake.in b/cpp/src/arrow/dataset/ArrowDatasetConfig.cmake.in index 66b0302cbca80..4573ac3718557 100644 --- a/cpp/src/arrow/dataset/ArrowDatasetConfig.cmake.in +++ b/cpp/src/arrow/dataset/ArrowDatasetConfig.cmake.in @@ -26,10 +26,12 @@ @PACKAGE_INIT@ +set(ARROW_DATASET_REQUIRED_DEPENDENCIES "@ARROW_DATASET_REQUIRED_DEPENDENCIES@") + include(CMakeFindDependencyMacro) -find_dependency(Arrow) -find_dependency(ArrowAcero) -find_dependency(Parquet) +foreach(dependency ${ARROW_DATASET_REQUIRED_DEPENDENCIES}) + find_dependency(${dependency}) +endforeach() include("${CMAKE_CURRENT_LIST_DIR}/ArrowDatasetTargets.cmake") diff --git a/cpp/src/arrow/dataset/CMakeLists.txt b/cpp/src/arrow/dataset/CMakeLists.txt index e48bcfaf65bcb..bdb89ee8914f8 100644 --- a/cpp/src/arrow/dataset/CMakeLists.txt +++ b/cpp/src/arrow/dataset/CMakeLists.txt @@ -32,8 +32,10 @@ set(ARROW_DATASET_SRCS scan_node.cc) set(ARROW_DATASET_PKG_CONFIG_REQUIRES "arrow-acero") +set(ARROW_DATASET_REQUIRED_DEPENDENCIES Arrow ArrowAcero) if(ARROW_PARQUET) string(APPEND ARROW_DATASET_PKG_CONFIG_REQUIRES " parquet") + list(APPEND ARROW_DATASET_REQUIRED_DEPENDENCIES Parquet) endif() set(ARROW_DATASET_STATIC_LINK_LIBS) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 78f4ad1edd9a9..4638bb12c783c 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -106,6 +106,18 @@ Status AzureOptions::ExtractFromUriQuery(const Uri& uri) { std::string tenant_id; std::string client_id; std::string client_secret; + + // These query parameters are the union of the following docs: + // https://learn.microsoft.com/en-us/rest/api/storageservices/create-account-sas#specify-the-account-sas-parameters + // https://learn.microsoft.com/en-us/rest/api/storageservices/create-service-sas#construct-a-service-sas + // (excluding parameters for table storage only) + // https://learn.microsoft.com/en-us/rest/api/storageservices/create-user-delegation-sas#construct-a-user-delegation-sas + static const std::set sas_token_query_parameters = { + "sv", "ss", "sr", "st", "se", "sp", "si", "sip", "spr", + "skoid", "sktid", "srt", "skt", "ske", "skv", "sks", "saoid", "suoid", + "scid", "sdd", "ses", "sig", "rscc", "rscd", "rsce", "rscl", "rsct", + }; + ARROW_ASSIGN_OR_RAISE(const auto options_items, uri.query_items()); for (const auto& kv : options_items) { if (kv.first == "blob_storage_authority") { @@ -147,6 +159,9 @@ Status AzureOptions::ExtractFromUriQuery(const Uri& uri) { 
} else if (kv.first == "background_writes") { ARROW_ASSIGN_OR_RAISE(background_writes, ::arrow::internal::ParseBoolean(kv.second)); + } else if (sas_token_query_parameters.find(kv.first) != + sas_token_query_parameters.end()) { + credential_kind = CredentialKind::kSASToken; } else { return Status::Invalid( "Unexpected query parameter in Azure Blob File System URI: '", kv.first, "'"); @@ -180,6 +195,13 @@ Status AzureOptions::ExtractFromUriQuery(const Uri& uri) { case CredentialKind::kEnvironment: RETURN_NOT_OK(ConfigureEnvironmentCredential()); break; + case CredentialKind::kSASToken: + // Reconstructing the SAS token without the other URI query parameters is awkward + // because some parts are URI escaped and some parts are not. Instead we just + // pass through the entire query string and Azure ignores the extra query + // parameters. + RETURN_NOT_OK(ConfigureSASCredential("?" + uri.query_string())); + break; default: // Default credential break; @@ -225,7 +247,6 @@ Result AzureOptions::FromUri(const std::string& uri_string, } bool AzureOptions::Equals(const AzureOptions& other) const { - // TODO(GH-38598): update here when more auth methods are added. const bool equals = blob_storage_authority == other.blob_storage_authority && dfs_storage_authority == other.dfs_storage_authority && blob_storage_scheme == other.blob_storage_scheme && @@ -243,6 +264,8 @@ bool AzureOptions::Equals(const AzureOptions& other) const { case CredentialKind::kStorageSharedKey: return storage_shared_key_credential_->AccountName == other.storage_shared_key_credential_->AccountName; + case CredentialKind::kSASToken: + return sas_token_ == other.sas_token_; case CredentialKind::kClientSecret: case CredentialKind::kCLI: case CredentialKind::kManagedIdentity: @@ -311,6 +334,15 @@ Status AzureOptions::ConfigureAccountKeyCredential(const std::string& account_ke return Status::OK(); } +Status AzureOptions::ConfigureSASCredential(const std::string& sas_token) { + credential_kind_ = CredentialKind::kSASToken; + if (account_name.empty()) { + return Status::Invalid("AzureOptions doesn't contain a valid account name"); + } + sas_token_ = sas_token; + return Status::OK(); +} + Status AzureOptions::ConfigureClientSecretCredential(const std::string& tenant_id, const std::string& client_id, const std::string& client_secret) { @@ -372,6 +404,9 @@ Result> AzureOptions::MakeBlobServiceC case CredentialKind::kStorageSharedKey: return std::make_unique(AccountBlobUrl(account_name), storage_shared_key_credential_); + case CredentialKind::kSASToken: + return std::make_unique(AccountBlobUrl(account_name) + + sas_token_); } return Status::Invalid("AzureOptions doesn't contain a valid auth configuration"); } @@ -404,29 +439,13 @@ AzureOptions::MakeDataLakeServiceClient() const { case CredentialKind::kStorageSharedKey: return std::make_unique( AccountDfsUrl(account_name), storage_shared_key_credential_); + case CredentialKind::kSASToken: + return std::make_unique( + AccountBlobUrl(account_name) + sas_token_); } return Status::Invalid("AzureOptions doesn't contain a valid auth configuration"); } -Result AzureOptions::GenerateSASToken( - Storage::Sas::BlobSasBuilder* builder, Blobs::BlobServiceClient* client) const { - using SasProtocol = Storage::Sas::SasProtocol; - builder->Protocol = - blob_storage_scheme == "http" ? SasProtocol::HttpsAndHttp : SasProtocol::HttpsOnly; - if (storage_shared_key_credential_) { - return builder->GenerateSasToken(*storage_shared_key_credential_); - } else { - // GH-39344: This part isn't tested. 
This may not work. - try { - auto delegation_key_response = client->GetUserDelegationKey(builder->ExpiresOn); - return builder->GenerateSasToken(delegation_key_response.Value, account_name); - } catch (const Storage::StorageException& exception) { - return ExceptionToStatus(exception, "GetUserDelegationKey failed for '", - client->GetUrl(), "'."); - } - } -} - namespace { // An AzureFileSystem represents an Azure storage account. An AzureLocation describes a @@ -3161,19 +3180,7 @@ class AzureFileSystem::Impl { if (src == dest) { return Status::OK(); } - std::string sas_token; - { - Storage::Sas::BlobSasBuilder builder; - std::chrono::seconds available_period(60); - builder.ExpiresOn = std::chrono::system_clock::now() + available_period; - builder.BlobContainerName = src.container; - builder.BlobName = src.path; - builder.Resource = Storage::Sas::BlobSasResource::Blob; - builder.SetPermissions(Storage::Sas::BlobSasPermissions::Read); - ARROW_ASSIGN_OR_RAISE( - sas_token, options_.GenerateSASToken(&builder, blob_service_client_.get())); - } - auto src_url = GetBlobClient(src.container, src.path).GetUrl() + sas_token; + auto src_url = GetBlobClient(src.container, src.path).GetUrl(); auto dest_blob_client = GetBlobClient(dest.container, dest.path); if (!dest.path.empty()) { auto dest_parent = dest.parent(); @@ -3186,9 +3193,21 @@ class AzureFileSystem::Impl { } } try { - dest_blob_client.CopyFromUri(src_url); + // We use StartCopyFromUri instead of CopyFromUri because it supports blobs larger + // than 256 MiB and it doesn't require generating a SAS token to authenticate + // reading a source blob in the same storage account. + auto copy_operation = dest_blob_client.StartCopyFromUri(src_url); + // For large blobs, the copy operation may be slow so we need to poll until it + // completes. We use a polling interval of 1 second. + copy_operation.PollUntilDone(std::chrono::milliseconds(1000)); } catch (const Storage::StorageException& exception) { - return ExceptionToStatus(exception, "Failed to copy a blob. (", src_url, " -> ", + // StartCopyFromUri failed or a GetProperties call inside PollUntilDone failed. + return ExceptionToStatus( + exception, "Failed to start blob copy or poll status of ongoing copy. (", + src_url, " -> ", dest_blob_client.GetUrl(), ")"); + } catch (const Azure::Core::RequestFailedException& exception) { + // A GetProperties call inside PollUntilDone returned a failed CopyStatus. + return ExceptionToStatus(exception, "Failed to copy blob. (", src_url, " -> ", dest_blob_client.GetUrl(), ")"); } return Status::OK(); diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index c5e5091256959..ee0956afdd7a9 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -37,10 +37,6 @@ namespace Azure::Storage::Blobs { class BlobServiceClient; } -namespace Azure::Storage::Sas { -struct BlobSasBuilder; -} - namespace Azure::Storage::Files::DataLake { class DataLakeFileSystemClient; class DataLakeServiceClient; @@ -120,6 +116,7 @@ struct ARROW_EXPORT AzureOptions { kDefault, kAnonymous, kStorageSharedKey, + kSASToken, kClientSecret, kManagedIdentity, kCLI, @@ -129,6 +126,7 @@ struct ARROW_EXPORT AzureOptions { std::shared_ptr storage_shared_key_credential_; + std::string sas_token_; mutable std::shared_ptr token_credential_; public: @@ -180,6 +178,9 @@ struct ARROW_EXPORT AzureOptions { /// AzureOptions::ConfigureClientSecretCredential() is called. /// * client_secret: You must specify "tenant_id" and "client_id" /// too. 
AzureOptions::ConfigureClientSecretCredential() is called. + /// * A SAS token is made up of several query parameters. Appending a SAS + /// token to the URI configures SAS token auth by calling + /// AzureOptions::ConfigureSASCredential(). /// /// [1]: /// https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction-abfs-uri @@ -189,6 +190,7 @@ struct ARROW_EXPORT AzureOptions { Status ConfigureDefaultCredential(); Status ConfigureAnonymousCredential(); Status ConfigureAccountKeyCredential(const std::string& account_key); + Status ConfigureSASCredential(const std::string& sas_token); Status ConfigureClientSecretCredential(const std::string& tenant_id, const std::string& client_id, const std::string& client_secret); @@ -207,10 +209,6 @@ struct ARROW_EXPORT AzureOptions { Result> MakeDataLakeServiceClient() const; - - Result GenerateSASToken( - Azure::Storage::Sas::BlobSasBuilder* builder, - Azure::Storage::Blobs::BlobServiceClient* client) const; }; /// \brief FileSystem implementation backed by Azure Blob Storage (ABS) [1] and diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index a04977bdee076..7c1d450051901 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -387,6 +387,30 @@ class TestGeneric : public ::testing::Test, public GenericFileSystemTest { // builddir/main/../../threads.c:580:10 #2 0x7fa914b1cd1e in xmlGetGlobalState // builddir/main/../../threads.c:666:31 bool have_false_positive_memory_leak_with_generator() const override { return true; } + // This false positive leak is similar to the one pinpointed in the + // have_false_positive_memory_leak_with_generator() comments above, + // though the stack trace is different. It happens when a block list + // is committed from a background thread. 
+ // + // clang-format off + // Direct leak of 968 byte(s) in 1 object(s) allocated from: + // #0 calloc + // #1 (/lib/x86_64-linux-gnu/libxml2.so.2+0xe25a4) + // #2 __xmlDefaultBufferSize + // #3 xmlBufferCreate + // #4 Azure::Storage::_internal::XmlWriter::XmlWriter() + // #5 Azure::Storage::Blobs::_detail::BlockBlobClient::CommitBlockList + // #6 Azure::Storage::Blobs::BlockBlobClient::CommitBlockList + // #7 arrow::fs::(anonymous namespace)::CommitBlockList + // #8 arrow::fs::(anonymous namespace)::ObjectAppendStream::FlushAsync()::'lambda' + // clang-format on + // + // TODO perhaps remove this skip once we can rely on + // https://github.com/Azure/azure-sdk-for-cpp/pull/5767 + // + // Also note that ClickHouse has a workaround for a similar issue: + // https://github.com/ClickHouse/ClickHouse/pull/45796 + bool have_false_positive_memory_leak_with_async_close() const override { return true; } BaseAzureEnv* env_; std::shared_ptr azure_fs_; @@ -690,6 +714,36 @@ class TestAzureOptions : public ::testing::Test { ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kEnvironment); } + void TestFromUriCredentialSASToken() { + const std::string sas_token = + "?se=2024-12-12T18:57:47Z&sig=pAs7qEBdI6sjUhqX1nrhNAKsTY%2B1SqLxPK%" + "2BbAxLiopw%3D&sp=racwdxylti&spr=https,http&sr=c&sv=2024-08-04"; + ASSERT_OK_AND_ASSIGN( + auto options, + AzureOptions::FromUri( + "abfs://file_system@account.dfs.core.windows.net/" + sas_token, nullptr)); + ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kSASToken); + ASSERT_EQ(options.sas_token_, sas_token); + } + + void TestFromUriCredentialSASTokenWithOtherParameters() { + const std::string uri_query_string = + "?enable_tls=false&se=2024-12-12T18:57:47Z&sig=pAs7qEBdI6sjUhqX1nrhNAKsTY%" + "2B1SqLxPK%" + "2BbAxLiopw%3D&sp=racwdxylti&spr=https,http&sr=c&sv=2024-08-04"; + ASSERT_OK_AND_ASSIGN( + auto options, + AzureOptions::FromUri( + "abfs://account@127.0.0.1:10000/container/dir/blob" + uri_query_string, + nullptr)); + ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kSASToken); + ASSERT_EQ(options.sas_token_, uri_query_string); + ASSERT_EQ(options.blob_storage_authority, "127.0.0.1:10000"); + ASSERT_EQ(options.dfs_storage_authority, "127.0.0.1:10000"); + ASSERT_EQ(options.blob_storage_scheme, "http"); + ASSERT_EQ(options.dfs_storage_scheme, "http"); + } + void TestFromUriCredentialInvalid() { ASSERT_RAISES(Invalid, AzureOptions::FromUri( "abfs://file_system@account.dfs.core.windows.net/dir/file?" 
@@ -777,6 +831,10 @@ TEST_F(TestAzureOptions, FromUriCredentialWorkloadIdentity) { TEST_F(TestAzureOptions, FromUriCredentialEnvironment) { TestFromUriCredentialEnvironment(); } +TEST_F(TestAzureOptions, FromUriCredentialSASToken) { TestFromUriCredentialSASToken(); } +TEST_F(TestAzureOptions, FromUriCredentialSASTokenWithOtherParameters) { + TestFromUriCredentialSASTokenWithOtherParameters(); +} TEST_F(TestAzureOptions, FromUriCredentialInvalid) { TestFromUriCredentialInvalid(); } TEST_F(TestAzureOptions, FromUriBlobStorageAuthority) { TestFromUriBlobStorageAuthority(); @@ -912,6 +970,20 @@ class TestAzureFileSystem : public ::testing::Test { .Value; } + Result GetContainerSASToken( + const std::string& container_name, + Azure::Storage::StorageSharedKeyCredential storage_shared_key_credential) { + std::string sas_token; + Azure::Storage::Sas::BlobSasBuilder builder; + std::chrono::seconds available_period(60); + builder.ExpiresOn = std::chrono::system_clock::now() + available_period; + builder.BlobContainerName = container_name; + builder.Resource = Azure::Storage::Sas::BlobSasResource::BlobContainer; + builder.SetPermissions(Azure::Storage::Sas::BlobContainerSasPermissions::All); + builder.Protocol = Azure::Storage::Sas::SasProtocol::HttpsAndHttp; + return builder.GenerateSasToken(storage_shared_key_credential); + } + void UploadLines(const std::vector& lines, const std::string& path, int total_size) { ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, {})); @@ -1536,29 +1608,7 @@ class TestAzureFileSystem : public ::testing::Test { void TestOpenOutputStreamCloseAsync() { #if defined(ADDRESS_SANITIZER) || defined(ARROW_VALGRIND) - // This false positive leak is similar to the one pinpointed in the - // have_false_positive_memory_leak_with_generator() comments above, - // though the stack trace is different. It happens when a block list - // is committed from a background thread. - // - // clang-format off - // Direct leak of 968 byte(s) in 1 object(s) allocated from: - // #0 calloc - // #1 (/lib/x86_64-linux-gnu/libxml2.so.2+0xe25a4) - // #2 __xmlDefaultBufferSize - // #3 xmlBufferCreate - // #4 Azure::Storage::_internal::XmlWriter::XmlWriter() - // #5 Azure::Storage::Blobs::_detail::BlockBlobClient::CommitBlockList - // #6 Azure::Storage::Blobs::BlockBlobClient::CommitBlockList - // #7 arrow::fs::(anonymous namespace)::CommitBlockList - // #8 arrow::fs::(anonymous namespace)::ObjectAppendStream::FlushAsync()::'lambda' - // clang-format on - // - // TODO perhaps remove this skip once we can rely on - // https://github.com/Azure/azure-sdk-for-cpp/pull/5767 - // - // Also note that ClickHouse has a workaround for a similar issue: - // https://github.com/ClickHouse/ClickHouse/pull/45796 + // See comment about have_false_positive_memory_leak_with_generator above. if (options_.background_writes) { GTEST_SKIP() << "False positive memory leak in libxml2 with CloseAsync"; } @@ -1617,6 +1667,31 @@ class TestAzureFileSystem : public ::testing::Test { AssertObjectContents(fs.get(), path, payload); } + void TestSASCredential() { + auto data = SetUpPreexistingData(); + + ASSERT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + ASSERT_OK_AND_ASSIGN(auto options, MakeOptions(env)); + ASSERT_OK_AND_ASSIGN( + auto sas_token, + GetContainerSASToken(data.container_name, + Azure::Storage::StorageSharedKeyCredential( + env->account_name(), env->account_key()))); + // AzureOptions::FromUri will not cut off extra query parameters that it consumes, so + // make sure these don't cause problems. 
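Stripped of the deliberately noisy query parameters, the configuration this test exercises reduces to roughly the following sketch (the account name and token values are placeholders):

    #include "arrow/filesystem/azurefs.h"

    arrow::Result<std::shared_ptr<arrow::fs::AzureFileSystem>> MakeSasFileSystem() {
      arrow::fs::AzureOptions options;
      options.account_name = "myaccount";  // placeholder
      // A SAS token is itself a URI query string; the service ignores query
      // parameters it does not recognize, so stray extras are harmless.
      ARROW_RETURN_NOT_OK(
          options.ConfigureSASCredential("?sv=2024-08-04&sp=racwdxylti&sig=REDACTED"));
      return arrow::fs::AzureFileSystem::Make(options);
    }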
+    ARROW_EXPECT_OK(options.ConfigureSASCredential(
+        "?blob_storage_authority=dummy_value0&" + sas_token.substr(1) +
+        "&credential_kind=dummy-value1"));
+    EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options));
+
+    AssertFileInfo(fs.get(), data.ObjectPath(), FileType::File);
+
+    // Test CopyFile because the most obvious implementation requires generating a SAS
+    // token at runtime, which doesn't work when the original auth is a SAS token.
+    ASSERT_OK(fs->CopyFile(data.ObjectPath(), data.ObjectPath() + "_copy"));
+    AssertFileInfo(fs.get(), data.ObjectPath() + "_copy", FileType::File);
+  }
+
 private:
  using StringMatcher =
      ::testing::PolymorphicMatcher<::testing::internal::HasSubstrMatcher<std::string>>;
@@ -2328,6 +2403,10 @@ TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateContainerFromPath) {
 
 TYPED_TEST(TestAzureFileSystemOnAllScenarios, MovePath) { this->TestMovePath(); }
 
+TYPED_TEST(TestAzureFileSystemOnAllScenarios, SASCredential) {
+  this->TestSASCredential();
+}
+
 // Tests using Azurite (the local Azure emulator)
 
 TEST_F(TestAzuriteFileSystem, CheckIfHierarchicalNamespaceIsEnabledRuntimeError) {
@@ -2634,6 +2713,17 @@ TEST_F(TestAzuriteFileSystem, CopyFileSuccessDestinationNonexistent) {
   EXPECT_EQ(PreexistingData::kLoremIpsum, buffer->ToString());
 }
 
+TEST_F(TestAzuriteFileSystem, CopyFileSuccessDestinationDifferentContainer) {
+  auto data = SetUpPreexistingData();
+  auto data2 = SetUpPreexistingData();
+  const auto destination_path = data2.ContainerPath("copy-destination");
+  ASSERT_OK(fs()->CopyFile(data.ObjectPath(), destination_path));
+  ASSERT_OK_AND_ASSIGN(auto info, fs()->GetFileInfo(destination_path));
+  ASSERT_OK_AND_ASSIGN(auto stream, fs()->OpenInputStream(info));
+  ASSERT_OK_AND_ASSIGN(auto buffer, stream->Read(1024));
+  EXPECT_EQ(PreexistingData::kLoremIpsum, buffer->ToString());
+}
+
 TEST_F(TestAzuriteFileSystem, CopyFileSuccessDestinationSame) {
   auto data = SetUpPreexistingData();
   ASSERT_OK(fs()->CopyFile(data.ObjectPath(), data.ObjectPath()));
diff --git a/cpp/src/arrow/filesystem/filesystem.cc b/cpp/src/arrow/filesystem/filesystem.cc
index b5765010ec7e9..37619df90fc34 100644
--- a/cpp/src/arrow/filesystem/filesystem.cc
+++ b/cpp/src/arrow/filesystem/filesystem.cc
@@ -630,9 +630,12 @@ Status CopyFiles(const std::vector<FileLocator>& sources,
                          destinations.size(), " paths.");
   }
 
-  auto copy_one_file = [&](int i) {
-    if (sources[i].filesystem->Equals(destinations[i].filesystem)) {
-      return sources[i].filesystem->CopyFile(sources[i].path, destinations[i].path);
+  auto copy_one_file = [&](size_t i,
+                           const FileLocator& source_file_locator) -> Result<Future<>> {
+    if (source_file_locator.filesystem->Equals(destinations[i].filesystem)) {
+      RETURN_NOT_OK(source_file_locator.filesystem->CopyFile(source_file_locator.path,
+                                                             destinations[i].path));
+      return Future<>::MakeFinished();
     }
 
     ARROW_ASSIGN_OR_RAISE(auto source,
@@ -642,12 +645,31 @@ Status CopyFiles(const std::vector<FileLocator>& sources,
     ARROW_ASSIGN_OR_RAISE(auto destination, destinations[i].filesystem->OpenOutputStream(
                                                 destinations[i].path, metadata));
     RETURN_NOT_OK(internal::CopyStream(source, destination, chunk_size, io_context));
-    return destination->Close();
+    // Using the blocking Close() here can cause reduced performance and deadlocks
+    // because FileSystem implementations that implement background_writes need to queue
+    // and wait for other IO thread(s). There is a risk that most or all of the threads
+    // in the IO thread pool are blocking on a call to Close(), leaving no IO threads
+    // left to actually fulfil the background writes.
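The non-blocking alternative used below can be reduced to generic C++: each worker queues its close instead of waiting on it, and a single final loop synchronizes. A minimal illustration with std::async standing in for Arrow futures and the IO thread pool:

    #include <future>
    #include <vector>

    int main() {
      std::vector<std::future<void>> pending_closes;
      for (int i = 0; i < 3; ++i) {
        // Hand the close work to another thread rather than blocking here.
        pending_closes.push_back(std::async(std::launch::async, [] {
          // flush buffered writes, then close the stream
        }));
      }
      // One synchronization point, after every close has been started.
      for (auto& f : pending_closes) f.get();
      return 0;
    }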
+    return destination->CloseAsync();
   };
 
-  return ::arrow::internal::OptionalParallelFor(
-      use_threads, static_cast<int>(sources.size()), std::move(copy_one_file),
-      io_context.executor());
+  // Spawn copy_one_file less urgently than the default, so that background writes are
+  // done with higher priority. Otherwise copy_one_file would keep buffering more data
+  // in memory without giving the background writes any chance to upload the data and
+  // drop it from memory; without this, large copies would cause OOMs.
+  TaskHints hints{10};
+  auto future = ::arrow::internal::OptionalParallelForAsync(
+      use_threads, sources, std::move(copy_one_file), io_context.executor(), hints);
+
+  // Wait for all the copy_one_file instances to complete.
+  ARROW_ASSIGN_OR_RAISE(auto copy_close_async_future, future.result());
+
+  // Wait for all the futures returned by copy_one_file to complete. When the
+  // destination filesystem uses background_writes this is when most of the upload
+  // happens.
+  for (const auto& result : copy_close_async_future) {
+    result.Wait();
+  }
+  return Status::OK();
 }
 
 Status CopyFiles(const std::shared_ptr<FileSystem>& source_fs,
diff --git a/cpp/src/arrow/filesystem/test_util.cc b/cpp/src/arrow/filesystem/test_util.cc
index a6c897636000e..efe7cff4958ab 100644
--- a/cpp/src/arrow/filesystem/test_util.cc
+++ b/cpp/src/arrow/filesystem/test_util.cc
@@ -578,6 +578,67 @@ void GenericFileSystemTest::TestCopyFile(FileSystem* fs) {
   AssertAllFiles(fs, {"AB/abc", "EF/ghi", "def"});
 }
 
+void GenericFileSystemTest::TestCopyFiles(FileSystem* fs) {
+#if defined(ADDRESS_SANITIZER) || defined(ARROW_VALGRIND)
+  if (have_false_positive_memory_leak_with_async_close()) {
+    GTEST_SKIP() << "Filesystem has a false positive memory leak with async close";
+  }
+#endif
+  auto io_thread_pool =
+      static_cast<::arrow::internal::ThreadPool*>(fs->io_context().executor());
+  auto original_threads = io_thread_pool->GetCapacity();
+  // Needs to be smaller than the number of files we test with to catch GH-15233
+  ASSERT_OK(io_thread_pool->SetCapacity(2));
+  // Ensure the thread pool capacity is set back to the original value after the test
+  auto reset_thread_pool = [io_thread_pool, original_threads](void*) {
+    ASSERT_OK(io_thread_pool->SetCapacity(original_threads));
+  };
+  std::unique_ptr<void, decltype(reset_thread_pool)> reset_thread_pool_guard(
+      nullptr, reset_thread_pool);
+
+  auto mock_fs = std::make_shared<arrow::fs::internal::MockFileSystem>(
+      std::chrono::system_clock::now());
+  std::vector<std::string> dirs0{"0", "0/AB", "0/AB/CD"};
+  std::map<std::string, std::string> files0{
+      {"0/123", "123 data"}, {"0/AB/abc", "abc data"}, {"0/AB/CD/def", "def data"}};
+
+  std::vector<std::string> dirs0and1{"0", "0/AB", "0/AB/CD", "1", "1/AB", "1/AB/CD"};
+  std::map<std::string, std::string> files0and1{
+      {"0/123", "123 data"}, {"0/AB/abc", "abc data"}, {"0/AB/CD/def", "def data"},
+      {"1/123", "123 data"}, {"1/AB/abc", "abc data"}, {"1/AB/CD/def", "def data"}};
+
+  ASSERT_OK(mock_fs->CreateDir("0/AB/CD"));
+  for (const auto& kv : files0) {
+    CreateFile(mock_fs.get(), kv.first, kv.second);
+  }
+
+  auto selector0 = arrow::fs::FileSelector{};
+  selector0.base_dir = "0";
+  selector0.recursive = true;
+
+  ASSERT_OK(CopyFiles(mock_fs, selector0, fs->shared_from_this(), "0"));
+  AssertAllDirs(fs, dirs0);
+  for (const auto& kv : files0) {
+    AssertFileContents(fs, kv.first, kv.second);
+  }
+
+  ASSERT_OK(CopyFiles(fs->shared_from_this(), selector0, fs->shared_from_this(), "1"));
+  AssertAllDirs(fs, dirs0and1);
+  for (const auto& kv : files0and1) {
+    AssertFileContents(fs, kv.first, kv.second);
+  }
+
+  auto selector1 = arrow::fs::FileSelector{};
+  selector1.base_dir = "1";
+  selector1.recursive = true;
+
ASSERT_OK(CopyFiles(fs->shared_from_this(), selector1, mock_fs, "1")); + AssertAllDirs(mock_fs.get(), dirs0and1); + for (const auto& kv : files0and1) { + AssertFileContents(mock_fs.get(), kv.first, kv.second); + } +} + void GenericFileSystemTest::TestGetFileInfo(FileSystem* fs) { ASSERT_OK(fs->CreateDir("AB/CD/EF")); CreateFile(fs, "AB/CD/ghi", "some data"); @@ -1212,6 +1273,7 @@ GENERIC_FS_TEST_DEFINE(TestDeleteFiles) GENERIC_FS_TEST_DEFINE(TestMoveFile) GENERIC_FS_TEST_DEFINE(TestMoveDir) GENERIC_FS_TEST_DEFINE(TestCopyFile) +GENERIC_FS_TEST_DEFINE(TestCopyFiles) GENERIC_FS_TEST_DEFINE(TestGetFileInfo) GENERIC_FS_TEST_DEFINE(TestGetFileInfoVector) GENERIC_FS_TEST_DEFINE(TestGetFileInfoSelector) diff --git a/cpp/src/arrow/filesystem/test_util.h b/cpp/src/arrow/filesystem/test_util.h index 04000c14e9c2a..3a643b7e9f08b 100644 --- a/cpp/src/arrow/filesystem/test_util.h +++ b/cpp/src/arrow/filesystem/test_util.h @@ -140,6 +140,7 @@ class ARROW_TESTING_EXPORT GenericFileSystemTest { void TestMoveFile(); void TestMoveDir(); void TestCopyFile(); + void TestCopyFiles(); void TestGetFileInfo(); void TestGetFileInfoVector(); void TestGetFileInfoSelector(); @@ -189,6 +190,8 @@ class ARROW_TESTING_EXPORT GenericFileSystemTest { virtual bool have_file_metadata() const { return false; } // - Whether the filesystem has a false positive memory leak with generator virtual bool have_false_positive_memory_leak_with_generator() const { return false; } + // - Whether the filesystem has a false positive memory leak in async close + virtual bool have_false_positive_memory_leak_with_async_close() const { return false; } void TestEmpty(FileSystem* fs); void TestNormalizePath(FileSystem* fs); @@ -201,6 +204,7 @@ class ARROW_TESTING_EXPORT GenericFileSystemTest { void TestMoveFile(FileSystem* fs); void TestMoveDir(FileSystem* fs); void TestCopyFile(FileSystem* fs); + void TestCopyFiles(FileSystem* fs); void TestGetFileInfo(FileSystem* fs); void TestGetFileInfoVector(FileSystem* fs); void TestGetFileInfoSelector(FileSystem* fs); @@ -233,6 +237,7 @@ class ARROW_TESTING_EXPORT GenericFileSystemTest { GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, MoveFile) \ GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, MoveDir) \ GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, CopyFile) \ + GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, CopyFiles) \ GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfo) \ GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfoVector) \ GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfoSelector) \ diff --git a/cpp/src/arrow/io/interfaces.cc b/cpp/src/arrow/io/interfaces.cc index 1d35549cc4345..f6be60509c45e 100644 --- a/cpp/src/arrow/io/interfaces.cc +++ b/cpp/src/arrow/io/interfaces.cc @@ -68,8 +68,8 @@ Status SetIOThreadPoolCapacity(int threads) { FileInterface::~FileInterface() = default; Future<> FileInterface::CloseAsync() { - return DeferNotOk( - default_io_context().executor()->Submit([this]() { return Close(); })); + return DeferNotOk(default_io_context().executor()->Submit( + [self = shared_from_this()]() { return self->Close(); })); } Status FileInterface::Abort() { return Close(); } diff --git a/cpp/src/arrow/util/parallel.h b/cpp/src/arrow/util/parallel.h index 80f60fbdb3676..ae48a606e366f 100644 --- a/cpp/src/arrow/util/parallel.h +++ b/cpp/src/arrow/util/parallel.h @@ -48,12 +48,13 @@ Status ParallelFor(int num_tasks, FUNCTION&& func, template ::ValueType> -Future> ParallelForAsync( - std::vector inputs, FUNCTION&& func, - Executor* executor = 
internal::GetCpuThreadPool()) {
+Future<std::vector<R>> ParallelForAsync(std::vector<T> inputs, FUNCTION&& func,
+                                        Executor* executor = internal::GetCpuThreadPool(),
+                                        TaskHints hints = TaskHints{}) {
   std::vector<Future<R>> futures(inputs.size());
   for (size_t i = 0; i < inputs.size(); ++i) {
-    ARROW_ASSIGN_OR_RAISE(futures[i], executor->Submit(func, i, std::move(inputs[i])));
+    ARROW_ASSIGN_OR_RAISE(futures[i],
+                          executor->Submit(hints, func, i, std::move(inputs[i])));
   }
   return All(std::move(futures))
       .Then([](const std::vector<Result<R>>& results) -> Result<std::vector<R>> {
@@ -86,9 +87,10 @@ template <class FUNCTION, class T, typename R = typename internal::call_traits::return_type<FUNCTION>::ValueType>
 Future<std::vector<R>> OptionalParallelForAsync(
     bool use_threads, std::vector<T> inputs, FUNCTION&& func,
-    Executor* executor = internal::GetCpuThreadPool()) {
+    Executor* executor = internal::GetCpuThreadPool(), TaskHints hints = TaskHints{}) {
   if (use_threads) {
-    return ParallelForAsync(std::move(inputs), std::forward<FUNCTION>(func), executor);
+    return ParallelForAsync(std::move(inputs), std::forward<FUNCTION>(func), executor,
+                            hints);
   } else {
     std::vector<R> result(inputs.size());
     for (size_t i = 0; i < inputs.size(); ++i) {
diff --git a/cpp/src/arrow/util/thread_pool.cc b/cpp/src/arrow/util/thread_pool.cc
index 8aa6d548893de..faef51307e5d2 100644
--- a/cpp/src/arrow/util/thread_pool.cc
+++ b/cpp/src/arrow/util/thread_pool.cc
@@ -52,10 +52,28 @@ struct Task {
   Executor::StopCallback stop_callback;
 };
 
+struct QueuedTask {
+  Task task;
+  int32_t priority;
+  uint64_t spawn_index;
+
+  // Implement comparison so that std::priority_queue will pop the low priorities more
+  // urgently.
+  bool operator<(const QueuedTask& other) const {
+    if (priority == other.priority) {
+      // Maintain execution order for tasks with the same priority. It's preferable to
+      // keep the execution order of tasks deterministic.
+      return spawn_index > other.spawn_index;
+    }
+    return priority > other.priority;
+  }
+};
+
 }  // namespace
 
 struct SerialExecutor::State {
-  std::deque<Task> task_queue;
+  std::priority_queue<QueuedTask> task_queue;
+  uint64_t spawned_tasks_count_ = 0;
   std::mutex mutex;
   std::condition_variable wait_for_tasks;
   std::thread::id current_thread;
@@ -153,8 +171,9 @@ Status SerialExecutor::SpawnReal(TaskHints hints, FnOnce<void()> task,
           "Attempt to schedule a task on a serial executor that has already finished or "
           "been abandoned");
     }
-    state->task_queue.push_back(
-        Task{std::move(task), std::move(stop_token), std::move(stop_callback)});
+    state->task_queue.push(QueuedTask{std::move(task), std::move(stop_token),
+                                      std::move(stop_callback), hints.priority,
+                                      state_->spawned_tasks_count_++});
   }
   state->wait_for_tasks.notify_one();
   return Status::OK();
@@ -189,8 +208,9 @@ Status SerialExecutor::SpawnReal(TaskHints hints, FnOnce<void()> task,
         "been abandoned");
   }
 
-  state_->task_queue.push_back(
-      Task{std::move(task), std::move(stop_token), std::move(stop_callback)});
+  state_->task_queue.push(QueuedTask{std::move(task), std::move(stop_token),
+                                     std::move(stop_callback), hints.priority,
+                                     state_->spawned_tasks_count_++});
   return Status::OK();
 }
 
@@ -245,8 +265,8 @@ void SerialExecutor::RunLoop() {
       // because sometimes we will pause even with work leftover when processing
      // an async generator
       while (!state_->paused && !state_->task_queue.empty()) {
-        Task task = std::move(state_->task_queue.front());
-        state_->task_queue.pop_front();
+        Task task = std::move(const_cast<Task&>(state_->task_queue.top().task));
+        state_->task_queue.pop();
         lk.unlock();
         if (!task.stop_token.IsStopRequested()) {
           std::move(task.callable)();
@@ -309,8 +329,8 @@ bool SerialExecutor::RunTasksOnAllExecutors() {
       if (exe->state_->paused == false &&
exe->state_->task_queue.empty() == false) { SerialExecutor* old_exe = globalState->current_executor; globalState->current_executor = exe; - Task task = std::move(exe->state_->task_queue.front()); - exe->state_->task_queue.pop_front(); + Task task = std::move(const_cast(exe->state_->task_queue.top().task)); + exe->state_->task_queue.pop(); run_task = true; exe->state_->tasks_running += 1; if (!task.stop_token.IsStopRequested()) { @@ -344,8 +364,8 @@ void SerialExecutor::RunLoop() { // we can't run any more until something else drops off the queue if (state_->tasks_running <= state_->max_tasks_running) { while (!state_->paused && !state_->task_queue.empty()) { - Task task = std::move(state_->task_queue.front()); - state_->task_queue.pop_front(); + Task task = std::move(const_cast(state_->task_queue.top().task)); + state_->task_queue.pop(); auto last_executor = globalState->current_executor; globalState->current_executor = this; state_->tasks_running += 1; @@ -386,7 +406,8 @@ struct ThreadPool::State { std::list workers_; // Trashcan for finished threads std::vector finished_workers_; - std::deque pending_tasks_; + std::priority_queue pending_tasks_; + uint64_t spawned_tasks_count_ = 0; // Desired number of threads int desired_capacity_ = 0; @@ -449,8 +470,8 @@ static void WorkerLoop(std::shared_ptr state, DCHECK_GE(state->tasks_queued_or_running_, 0); { - Task task = std::move(state->pending_tasks_.front()); - state->pending_tasks_.pop_front(); + Task task = std::move(const_cast(state->pending_tasks_.top().task)); + state->pending_tasks_.pop(); StopToken* stop_token = &task.stop_token; lock.unlock(); if (!stop_token->IsStopRequested()) { @@ -592,7 +613,8 @@ Status ThreadPool::Shutdown(bool wait) { if (!state_->quick_shutdown_) { DCHECK_EQ(state_->pending_tasks_.size(), 0); } else { - state_->pending_tasks_.clear(); + std::priority_queue empty; + std::swap(state_->pending_tasks_, empty); } CollectFinishedWorkersUnlocked(); return Status::OK(); @@ -653,8 +675,10 @@ Status ThreadPool::SpawnReal(TaskHints hints, FnOnce task, StopToken sto // We can still spin up more workers so spin up a new worker LaunchWorkersUnlocked(/*threads=*/1); } - state_->pending_tasks_.push_back( - {std::move(task), std::move(stop_token), std::move(stop_callback)}); + state_->pending_tasks_.push( + QueuedTask{{std::move(task), std::move(stop_token), std::move(stop_callback)}, + hints.priority, + state_->spawned_tasks_count_++}); } state_->cv_.notify_one(); return Status::OK(); @@ -737,7 +761,8 @@ Status ThreadPool::Shutdown(bool wait) { } else { // clear any pending tasks so that we behave // the same as threadpool on fast shutdown - state_->task_queue.clear(); + std::priority_queue empty; + std::swap(state_->task_queue, empty); } return Status::OK(); } @@ -777,7 +802,8 @@ Result> ThreadPool::MakeEternal(int threads) { ThreadPool::~ThreadPool() { // clear threadpool, otherwise ~SerialExecutor will // run any tasks left (which isn't threadpool behaviour) - state_->task_queue.clear(); + std::priority_queue empty; + std::swap(state_->task_queue, empty); } #endif // ARROW_ENABLE_THREADING diff --git a/cpp/src/arrow/util/thread_pool_test.cc b/cpp/src/arrow/util/thread_pool_test.cc index 7cf8826e8a173..2c83146030243 100644 --- a/cpp/src/arrow/util/thread_pool_test.cc +++ b/cpp/src/arrow/util/thread_pool_test.cc @@ -21,6 +21,7 @@ #endif #include +#include #include #include #include @@ -578,6 +579,62 @@ TEST_F(TestThreadPool, Spawn) { SpawnAdds(pool.get(), 7, task_add); } +TEST_F(TestThreadPool, TasksRunInPriorityOrder) { + 
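+  // Priority semantics under test (see QueuedTask::operator< above):
+  // std::priority_queue pops the element that compares greatest, and the
+  // comparator is inverted on `priority`, so the task with the smallest
+  // TaskHints::priority value runs first; ties run in spawn order. The tasks
+  // below are spawned with priority kNumTasks - i, so later-spawned tasks are
+  // more urgent and are expected to run first.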
auto pool = this->MakeThreadPool(1); + constexpr int kNumTasks = 10; + auto recorded_times = std::vector(kNumTasks); + auto futures = std::vector>(kNumTasks); + std::mutex mutex; + + auto wait_task = [&mutex] { std::unique_lock lock(mutex); }; + { + std::unique_lock lock(mutex); + // Spawn wait_task to block the pool while we add the other tasks. This + // ensures all the tasks are queued before any of them start running, so that + // their running order is fully determined by their priority. + ASSERT_OK(pool->Spawn(wait_task)); + + for (int i = 0; i < kNumTasks; ++i) { + auto record_time = [&recorded_times, i]() { + recorded_times[i] = std::chrono::steady_clock::now(); + return i; + }; + // Spawn tasks in opposite order to urgency. + ASSERT_OK_AND_ASSIGN(futures[i], + pool->Submit(TaskHints{kNumTasks - i}, record_time)); + } + } + + ASSERT_OK(pool->Shutdown()); + + for (size_t i = 1; i < kNumTasks; ++i) { + ASSERT_GE(recorded_times[i - 1], recorded_times[i]); + ASSERT_LT(futures[i - 1].result().ValueOrDie(), futures[i].result().ValueOrDie()); + } +} + +TEST_F(TestThreadPool, TasksOfEqualPriorityRunInSpawnOrder) { + auto pool = this->MakeThreadPool(1); + constexpr int kNumTasks = 10; + auto recorded_times = std::vector(kNumTasks); + auto futures = std::vector>(kNumTasks); + + for (int i = 0; i < kNumTasks; ++i) { + auto record_time = [&recorded_times, i]() { + recorded_times[i] = std::chrono::steady_clock::now(); + return i; + }; + ASSERT_OK_AND_ASSIGN(futures[i], pool->Submit(record_time)); + } + + ASSERT_OK(pool->Shutdown()); + + for (size_t i = 1; i < kNumTasks; ++i) { + ASSERT_LE(recorded_times[i - 1], recorded_times[i]); + ASSERT_LT(futures[i - 1].result().ValueOrDie(), futures[i].result().ValueOrDie()); + } +} + TEST_F(TestThreadPool, StressSpawn) { auto pool = this->MakeThreadPool(30); SpawnAdds(pool.get(), 1000, task_add); diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 3cc42ae370217..1c9b2323de500 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -83,8 +83,6 @@ bool IsColumnChunkFullyDictionaryEncoded(const ColumnChunkMetaData& col) { } } // namespace -// PARQUET-978: Minimize footer reads by reading 64 KB from the end of the file -static constexpr int64_t kDefaultFooterReadSize = 64 * 1024; static constexpr uint32_t kFooterSize = 8; // For PARQUET-816 @@ -482,7 +480,8 @@ class SerializedFile : public ParquetFileReader::Contents { "Parquet file size is ", source_size_, " bytes, smaller than the minimum file footer (", kFooterSize, " bytes)"); } - return std::min(source_size_, kDefaultFooterReadSize); + + return std::min(static_cast(source_size_), properties_.footer_read_size()); } // Validate the magic bytes and get the length of the full footer. diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 7f2e371df66d7..a8e4430a03d82 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -56,6 +56,9 @@ constexpr int32_t kDefaultThriftStringSizeLimit = 100 * 1000 * 1000; // kDefaultStringSizeLimit. 
constexpr int32_t kDefaultThriftContainerSizeLimit = 1000 * 1000; +// PARQUET-978: Minimize footer reads by reading 64 KB from the end of the file +constexpr int64_t kDefaultFooterReadSize = 64 * 1024; + class PARQUET_EXPORT ReaderProperties { public: explicit ReaderProperties(MemoryPool* pool = ::arrow::default_memory_pool()) @@ -120,6 +123,12 @@ class PARQUET_EXPORT ReaderProperties { page_checksum_verification_ = check_crc; } + // Set the default read size to read the footer from a file. For high latency + // file systems and files with large metadata (>64KB) this can increase performance + // by reducing the number of round-trips to retrieve the entire file metadata. + void set_footer_read_size(size_t size) { footer_read_size_ = size; } + size_t footer_read_size() const { return footer_read_size_; } + private: MemoryPool* pool_; int64_t buffer_size_ = kDefaultBufferSize; @@ -129,6 +138,7 @@ class PARQUET_EXPORT ReaderProperties { bool page_checksum_verification_ = false; // Used with a RecordReader. bool read_dense_for_nullable_ = false; + size_t footer_read_size_ = kDefaultFooterReadSize; std::shared_ptr file_decryption_properties_; }; diff --git a/cpp/src/parquet/properties_test.cc b/cpp/src/parquet/properties_test.cc index b2c574413abf7..35fc11565914e 100644 --- a/cpp/src/parquet/properties_test.cc +++ b/cpp/src/parquet/properties_test.cc @@ -35,6 +35,7 @@ TEST(TestReaderProperties, Basics) { ReaderProperties props; ASSERT_EQ(props.buffer_size(), kDefaultBufferSize); + ASSERT_EQ(props.footer_read_size(), kDefaultFooterReadSize); ASSERT_FALSE(props.is_buffered_stream_enabled()); ASSERT_FALSE(props.page_checksum_verification()); } diff --git a/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj b/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj index 6a5666d8f06b2..e8c387a1f3946 100644 --- a/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj +++ b/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj @@ -13,7 +13,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index 9c51f979aeadd..2c08e109dbfe5 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -23,7 +23,7 @@ runtime; build; native; contentfiles; analyzers - + diff --git a/dev/release/setup-ubuntu.sh b/dev/release/setup-ubuntu.sh index b877c1225ab6a..686507d6257a3 100755 --- a/dev/release/setup-ubuntu.sh +++ b/dev/release/setup-ubuntu.sh @@ -22,27 +22,20 @@ set -exu -codename=$(. /etc/os-release && echo ${UBUNTU_CODENAME}) +version=$(. 
/etc/os-release && echo ${VERSION_ID}) -case ${codename} in - *) - nlohmann_json=3 - python=3 - apt-get update -y -q - apt-get install -y -q --no-install-recommends \ - llvm-dev - ;; -esac +apt-get update -y -q -case ${codename} in - focal) - ;; - *) - apt-get update -y -q - apt-get install -y -q --no-install-recommends \ - libxsimd-dev - ;; -esac +if [ ${version} \> "20.04" ]; then + apt-get install -y -q --no-install-recommends \ + libxsimd-dev +fi + +if [ ${version} \> "22.04" ]; then + # Some tests rely on legacy timezone aliases such as "US/Pacific" + apt-get install -y -q --no-install-recommends \ + tzdata-legacy +fi apt-get install -y -q --no-install-recommends \ build-essential \ @@ -58,10 +51,10 @@ apt-get install -y -q --no-install-recommends \ libsqlite3-dev \ libssl-dev \ ninja-build \ - nlohmann-json${nlohmann_json}-dev \ + nlohmann-json3-dev \ pkg-config \ - python${python}-dev \ - python${python}-venv \ + python3-dev \ + python3-venv \ python3-pip \ ruby-dev \ tzdata \ diff --git a/dev/tasks/python-wheels/github.linux.yml b/dev/tasks/python-wheels/github.linux.yml index f083b7c0c8f61..8ddd0a23099df 100644 --- a/dev/tasks/python-wheels/github.linux.yml +++ b/dev/tasks/python-wheels/github.linux.yml @@ -19,6 +19,16 @@ {{ macros.github_header() }} +# Testing free-threaded wheels uses a different Docker setup +{% set test_imports_image = ( + 'python-free-threaded-wheel-manylinux-test-imports' if python_abi_tag == 'cp313t' + else 'python-wheel-manylinux-test-imports') +%} +{% set test_unittests_image = ( + 'python-free-threaded-wheel-manylinux-test-unittests' if python_abi_tag == 'cp313t' + else 'python-wheel-manylinux-test-unittests') +%} + permissions: packages: write @@ -72,23 +82,11 @@ jobs: # TODO(kszucs): auditwheel show - name: Test wheel - if: | - '{{ python_abi_tag }}' != 'cp313t' - shell: bash - run: | - source arrow/ci/scripts/util_enable_core_dumps.sh - archery docker run python-wheel-manylinux-test-imports - archery docker run python-wheel-manylinux-test-unittests - - # Free-threaded wheels need to be tested using a different Docker Compose service - - name: Test free-threaded wheel - if: | - '{{ python_abi_tag }}' == 'cp313t' shell: bash run: | source arrow/ci/scripts/util_enable_core_dumps.sh - archery docker run python-free-threaded-wheel-manylinux-test-imports - archery docker run python-free-threaded-wheel-manylinux-test-unittests + archery docker run {{ test_imports_image }} + archery docker run {{ test_unittests_image }} - name: Test wheel on AlmaLinux 8 shell: bash @@ -136,14 +134,29 @@ jobs: -e TEST_WHEELS=1 \ ubuntu-verify-rc + - name: Test wheel on Ubuntu 24.04 + shell: bash + if: | + '{{ python_version }}' == '3.12' + env: + UBUNTU: "24.04" + run: | + archery docker run \ + -e TEST_DEFAULT=0 \ + -e TEST_PYARROW_VERSION={{ arrow.no_rc_version }} \ + -e TEST_PYTHON_VERSIONS={{ python_version }} \ + -e TEST_WHEEL_PLATFORM_TAGS={{ wheel_platform_tag }} \ + -e TEST_WHEELS=1 \ + ubuntu-verify-rc + {{ macros.github_upload_releases("arrow/python/repaired_wheels/*.whl")|indent }} {{ macros.github_upload_gemfury("arrow/python/repaired_wheels/*.whl")|indent }} {{ macros.github_upload_wheel_scientific_python("arrow/python/repaired_wheels/*.whl")|indent }} {% if arrow.is_default_branch() %} - - name: Push Docker Image + - name: Push Docker images shell: bash run: | archery docker push python-wheel-manylinux-{{ manylinux_version }} - archery docker push python-wheel-manylinux-test-unittests + archery docker push {{ test_unittests_image }} {% endif %} diff --git 
a/dev/tasks/python-wheels/github.osx.yml b/dev/tasks/python-wheels/github.osx.yml index 1799bd6ad6b6f..031bad94227e8 100644 --- a/dev/tasks/python-wheels/github.osx.yml +++ b/dev/tasks/python-wheels/github.osx.yml @@ -89,6 +89,7 @@ jobs: --x-feature=flight \ --x-feature=gcs \ --x-feature=json \ + --x-feature=orc \ --x-feature=parquet \ --x-feature=s3 diff --git a/docs/source/java/memory.rst b/docs/source/java/memory.rst index 8014a27444ac9..28ff01fb9447f 100644 --- a/docs/source/java/memory.rst +++ b/docs/source/java/memory.rst @@ -107,7 +107,7 @@ Child allocators can also be named, which makes it easier to tell where an Arrow Reference counting ------------------ -Because direct memory is expensive to allocate and deallocate, allocators may share direct buffers. To managed shared buffers +Because direct memory is expensive to allocate and deallocate, allocators may share direct buffers. To manage shared buffers deterministically, we use manual reference counting instead of the garbage collector. This simply means that each buffer has a counter keeping track of the number of references to the buffer, and the user is responsible for properly incrementing/decrementing the counter as the buffer is used. diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst index 5219902362375..dc24be8bd06d8 100644 --- a/docs/source/python/api/arrays.rst +++ b/docs/source/python/api/arrays.rst @@ -72,6 +72,8 @@ may expose data type-specific methods or properties. TimestampArray DurationArray MonthDayNanoIntervalArray + Decimal32Array + Decimal64Array Decimal128Array Decimal256Array DictionaryArray diff --git a/docs/source/python/api/datatypes.rst b/docs/source/python/api/datatypes.rst index 65f6da56a553c..5e151a1f93af5 100644 --- a/docs/source/python/api/datatypes.rst +++ b/docs/source/python/api/datatypes.rst @@ -116,6 +116,8 @@ functions above. Time64Type DurationType FixedSizeBinaryType + Decimal32Type + Decimal64Type Decimal128Type Decimal256Type Field diff --git a/format/Schema.fbs b/format/Schema.fbs index 72e2a8e85387c..835b1af608b99 100644 --- a/format/Schema.fbs +++ b/format/Schema.fbs @@ -61,7 +61,7 @@ enum MetadataVersion:short { /// forward compatibility guarantees). /// 2. A means of negotiating between a client and server /// what features a stream is allowed to use. The enums -/// values here are intented to represent higher level +/// values here are intended to represent higher level /// features, additional details may be negotiated /// with key-value pairs specific to the protocol. 
 ///
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 8c8c09265d0bf..d00a731324c92 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -166,7 +166,7 @@ def print_entry(label, value):
                     float16, float32, float64,
                     binary, string, utf8, binary_view, string_view,
                     large_binary, large_string, large_utf8,
-                    decimal128, decimal256,
+                    decimal32, decimal64, decimal128, decimal256,
                     list_, large_list, list_view, large_list_view,
                     map_, struct,
                     union, sparse_union, dense_union,
@@ -180,7 +180,8 @@ def print_entry(label, value):
                     ListViewType, LargeListViewType, MapType,
                     UnionType, SparseUnionType, DenseUnionType,
                     TimestampType, Time32Type, Time64Type, DurationType,
-                    FixedSizeBinaryType, Decimal128Type, Decimal256Type,
+                    FixedSizeBinaryType,
+                    Decimal32Type, Decimal64Type, Decimal128Type, Decimal256Type,
                     BaseExtensionType, ExtensionType, RunEndEncodedType,
                     Bool8Type, FixedShapeTensorType, JsonType, OpaqueType,
                     UuidType,
@@ -216,7 +217,8 @@ def print_entry(label, value):
                     Date32Array, Date64Array, TimestampArray,
                     Time32Array, Time64Array, DurationArray,
                     MonthDayNanoIntervalArray,
-                    Decimal128Array, Decimal256Array, StructArray, ExtensionArray,
+                    Decimal32Array, Decimal64Array, Decimal128Array, Decimal256Array,
+                    StructArray, ExtensionArray,
                     RunEndEncodedArray, Bool8Array, FixedShapeTensorArray,
                     JsonArray, OpaqueArray, UuidArray,
                     scalar, NA, _NULL as NULL, Scalar,
@@ -224,7 +226,7 @@ def print_entry(label, value):
                     Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar,
                     UInt8Scalar, UInt16Scalar, UInt32Scalar, UInt64Scalar,
                     HalfFloatScalar, FloatScalar, DoubleScalar,
-                    Decimal128Scalar, Decimal256Scalar,
+                    Decimal32Scalar, Decimal64Scalar, Decimal128Scalar, Decimal256Scalar,
                     ListScalar, LargeListScalar, FixedSizeListScalar,
                     ListViewScalar, LargeListViewScalar,
                     Date32Scalar, Date64Scalar,
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 8bddc34e1000b..f86caf1433d4e 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -2327,6 +2327,15 @@ cdef class FixedSizeBinaryArray(Array):
     Concrete class for Arrow arrays of a fixed-size binary data type.
     """
 
+cdef class Decimal32Array(FixedSizeBinaryArray):
+    """
+    Concrete class for Arrow arrays of decimal32 data type.
+    """
+
+cdef class Decimal64Array(FixedSizeBinaryArray):
+    """
+    Concrete class for Arrow arrays of decimal64 data type.
+    """
 
 cdef class Decimal128Array(FixedSizeBinaryArray):
     """
@@ -4043,7 +4052,7 @@ cdef class StructArray(Array):
     memory_pool : MemoryPool (optional)
         For memory allocations, if required, otherwise uses default pool.
     type : pyarrow.StructType (optional)
-        Struct type for name and type of each child. 
+        Struct type for name and type of each child.
Returns ------- @@ -4705,6 +4714,8 @@ cdef dict _array_classes = { _Type_STRING_VIEW: StringViewArray, _Type_DICTIONARY: DictionaryArray, _Type_FIXED_SIZE_BINARY: FixedSizeBinaryArray, + _Type_DECIMAL32: Decimal32Array, + _Type_DECIMAL64: Decimal64Array, _Type_DECIMAL128: Decimal128Array, _Type_DECIMAL256: Decimal256Array, _Type_STRUCT: StructArray, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 8bf61b73cc211..b2edeb0b4192f 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -45,6 +45,16 @@ cdef extern from "arrow/util/key_value_metadata.h" namespace "arrow" nogil: c_bool Contains(const c_string& key) const +cdef extern from "arrow/util/decimal.h" namespace "arrow" nogil: + cdef cppclass CDecimal32" arrow::Decimal32": + c_string ToString(int32_t scale) const + + +cdef extern from "arrow/util/decimal.h" namespace "arrow" nogil: + cdef cppclass CDecimal64" arrow::Decimal64": + c_string ToString(int32_t scale) const + + cdef extern from "arrow/util/decimal.h" namespace "arrow" nogil: cdef cppclass CDecimal128" arrow::Decimal128": c_string ToString(int32_t scale) const @@ -110,6 +120,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: _Type_FLOAT" arrow::Type::FLOAT" _Type_DOUBLE" arrow::Type::DOUBLE" + _Type_DECIMAL32" arrow::Type::DECIMAL32" + _Type_DECIMAL64" arrow::Type::DECIMAL64" _Type_DECIMAL128" arrow::Type::DECIMAL128" _Type_DECIMAL256" arrow::Type::DECIMAL256" @@ -453,6 +465,18 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: int byte_width() int bit_width() + cdef cppclass CDecimal32Type \ + " arrow::Decimal32Type"(CFixedSizeBinaryType): + CDecimal32Type(int precision, int scale) + int precision() + int scale() + + cdef cppclass CDecimal64Type \ + " arrow::Decimal64Type"(CFixedSizeBinaryType): + CDecimal64Type(int precision, int scale) + int precision() + int scale() + cdef cppclass CDecimal128Type \ " arrow::Decimal128Type"(CFixedSizeBinaryType): CDecimal128Type(int precision, int scale) @@ -680,6 +704,16 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CFixedSizeBinaryArray" arrow::FixedSizeBinaryArray"(CArray): const uint8_t* GetValue(int i) + cdef cppclass CDecimal32Array" arrow::Decimal32Array"( + CFixedSizeBinaryArray + ): + c_string FormatValue(int i) + + cdef cppclass CDecimal64Array" arrow::Decimal64Array"( + CFixedSizeBinaryArray + ): + c_string FormatValue(int i) + cdef cppclass CDecimal128Array" arrow::Decimal128Array"( CFixedSizeBinaryArray ): @@ -1263,6 +1297,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CDoubleScalar" arrow::DoubleScalar"(CScalar): double value + cdef cppclass CDecimal32Scalar" arrow::Decimal32Scalar"(CScalar): + CDecimal32 value + + cdef cppclass CDecimal64Scalar" arrow::Decimal64Scalar"(CScalar): + CDecimal64 value + cdef cppclass CDecimal128Scalar" arrow::Decimal128Scalar"(CScalar): CDecimal128 value diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index f3d4e1eec0899..bc9811b92b007 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -185,6 +185,16 @@ cdef class FixedSizeBinaryType(DataType): const CFixedSizeBinaryType* fixed_size_binary_type +cdef class Decimal32Type(FixedSizeBinaryType): + cdef: + const CDecimal32Type* decimal32_type + + +cdef class Decimal64Type(FixedSizeBinaryType): + cdef: + const CDecimal64Type* decimal64_type + + cdef class Decimal128Type(FixedSizeBinaryType): cdef: const CDecimal128Type* decimal128_type @@ -430,6 +440,14 @@ cdef class 
FixedSizeBinaryArray(Array): pass +cdef class Decimal32Array(FixedSizeBinaryArray): + pass + + +cdef class Decimal64Array(FixedSizeBinaryArray): + pass + + cdef class Decimal128Array(FixedSizeBinaryArray): pass diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index 6b82eb6566896..2c92ecbfa7344 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -87,9 +87,9 @@ def set_cpu_count(int count): def is_threading_enabled() -> bool: """ - Returns True if threading is enabled in libarrow. + Returns True if threading is enabled in libarrow. - If it isn't enabled, then python shouldn't create any + If it isn't enabled, then python shouldn't create any threads either, because we're probably on a system where threading doesn't work (e.g. Emscripten). """ @@ -109,6 +109,8 @@ Type_INT64 = _Type_INT64 Type_HALF_FLOAT = _Type_HALF_FLOAT Type_FLOAT = _Type_FLOAT Type_DOUBLE = _Type_DOUBLE +Type_DECIMAL32 = _Type_DECIMAL32 +Type_DECIMAL64 = _Type_DECIMAL64 Type_DECIMAL128 = _Type_DECIMAL128 Type_DECIMAL256 = _Type_DECIMAL256 Type_DATE32 = _Type_DATE32 diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 913e25e308254..d1fa1192debc3 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -111,6 +111,10 @@ cdef api object pyarrow_wrap_data_type( out = DurationType.__new__(DurationType) elif type.get().id() == _Type_FIXED_SIZE_BINARY: out = FixedSizeBinaryType.__new__(FixedSizeBinaryType) + elif type.get().id() == _Type_DECIMAL32: + out = Decimal32Type.__new__(Decimal32Type) + elif type.get().id() == _Type_DECIMAL64: + out = Decimal64Type.__new__(Decimal64Type) elif type.get().id() == _Type_DECIMAL128: out = Decimal128Type.__new__(Decimal128Type) elif type.get().id() == _Type_DECIMAL256: diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 2bfdcddf30736..2235cd0b981a6 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -336,6 +336,46 @@ cdef class DoubleScalar(Scalar): return sp.value if sp.is_valid else None +cdef class Decimal32Scalar(Scalar): + """ + Concrete class for decimal32 scalars. + """ + + def as_py(self): + """ + Return this value as a Python Decimal. + """ + cdef: + CDecimal32Scalar* sp = <CDecimal32Scalar*> self.wrapped.get() + CDecimal32Type* dtype = <CDecimal32Type*> sp.type.get() + if sp.is_valid: + return _pydecimal.Decimal( + frombytes(sp.value.ToString(dtype.scale())) + ) + else: + return None + + +cdef class Decimal64Scalar(Scalar): + """ + Concrete class for decimal64 scalars. + """ + + def as_py(self): + """ + Return this value as a Python Decimal. + """ + cdef: + CDecimal64Scalar* sp = <CDecimal64Scalar*> self.wrapped.get() + CDecimal64Type* dtype = <CDecimal64Type*> sp.type.get() + if sp.is_valid: + return _pydecimal.Decimal( + frombytes(sp.value.ToString(dtype.scale())) + ) + else: + return None + + cdef class Decimal128Scalar(Scalar): """ Concrete class for decimal128 scalars.
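For context, the new scalar classes follow the existing decimal128/decimal256 path: as_py() renders the stored scaled integer through ToString(scale) and parses the result with decimal.Decimal. A minimal usage sketch, not part of the patch, assuming a pyarrow build that includes it:

    import decimal
    import pyarrow as pa

    # decimal32(5, 2) stores each value as a scaled 32-bit integer (12345 here).
    arr = pa.array([decimal.Decimal("123.45"), None], type=pa.decimal32(5, 2))

    assert type(arr[0]).__name__ == "Decimal32Scalar"
    assert arr[0].as_py() == decimal.Decimal("123.45")  # exact round-trip
    assert arr[1].as_py() is None                       # nulls surface as None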
@@ -1132,6 +1172,8 @@ cdef dict _scalar_classes = { _Type_HALF_FLOAT: HalfFloatScalar, _Type_FLOAT: FloatScalar, _Type_DOUBLE: DoubleScalar, + _Type_DECIMAL32: Decimal32Scalar, + _Type_DECIMAL64: Decimal64Scalar, _Type_DECIMAL128: Decimal128Scalar, _Type_DECIMAL256: Decimal256Scalar, _Type_DATE32: Date32Scalar, diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index 110dab7d35538..10c4d0e16000b 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -1317,15 +1317,8 @@ struct ObjectWriterVisitor { out_values); } - Status Visit(const Decimal32Type& type) { - return Status::NotImplemented("Decimal32 type not yet implemented"); - } - - Status Visit(const Decimal64Type& type) { - return Status::NotImplemented("Decimal64 type not yet implemented"); - } - - Status Visit(const Decimal128Type& type) { + template <typename DecimalT> + Status VisitDecimal(const DecimalT& type) { OwnedRef decimal; OwnedRef Decimal; RETURN_NOT_OK(internal::ImportModule("decimal", &decimal)); @@ -1333,7 +1326,7 @@ struct ObjectWriterVisitor { PyObject* decimal_constructor = Decimal.obj(); for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = checked_cast<const arrow::Decimal128Array&>(*data.chunk(c)); + const auto& arr = checked_cast<const typename arrow::TypeTraits<DecimalT>::ArrayType&>(*data.chunk(c)); for (int64_t i = 0; i < arr.length(); ++i) { if (arr.IsNull(i)) { @@ -1350,29 +1343,20 @@ return Status::OK(); } - Status Visit(const Decimal256Type& type) { - OwnedRef decimal; - OwnedRef Decimal; - RETURN_NOT_OK(internal::ImportModule("decimal", &decimal)); - RETURN_NOT_OK(internal::ImportFromModule(decimal.obj(), "Decimal", &Decimal)); - PyObject* decimal_constructor = Decimal.obj(); + Status Visit(const Decimal32Type& type) { + return VisitDecimal(type); + } - for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = checked_cast<const arrow::Decimal256Array&>(*data.chunk(c)); + Status Visit(const Decimal64Type& type) { + return VisitDecimal(type); + } - for (int64_t i = 0; i < arr.length(); ++i) { - if (arr.IsNull(i)) { - Py_INCREF(Py_None); - *out_values++ = Py_None; - } else { - *out_values++ = - internal::DecimalFromString(decimal_constructor, arr.FormatValue(i)); - RETURN_IF_PYERROR(); - } - } - } + Status Visit(const Decimal128Type& type) { + return VisitDecimal(type); + } - return Status::OK(); + Status Visit(const Decimal256Type& type) { + return VisitDecimal(type); } template diff --git a/python/pyarrow/src/arrow/python/decimal.cc b/python/pyarrow/src/arrow/python/decimal.cc index 0c00fcfaa8e59..e6caff2201ddc 100644 --- a/python/pyarrow/src/arrow/python/decimal.cc +++ b/python/pyarrow/src/arrow/python/decimal.cc @@ -164,6 +164,24 @@ Status InternalDecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, } // namespace +Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type, + Decimal32* out) { + return InternalDecimalFromPythonDecimal(python_decimal, arrow_type, out); +} + +Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, Decimal32* out) { + return InternalDecimalFromPyObject(obj, arrow_type, out); +} + +Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type, + Decimal64* out) { + return InternalDecimalFromPythonDecimal(python_decimal, arrow_type, out); +} + +Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, Decimal64* out) { + return InternalDecimalFromPyObject(obj, arrow_type, out); +} + Status DecimalFromPythonDecimal(PyObject*
python_decimal, const DecimalType& arrow_type, Decimal128* out) { return InternalDecimalFromPythonDecimal(python_decimal, arrow_type, out); diff --git a/python/pyarrow/src/arrow/python/decimal.h b/python/pyarrow/src/arrow/python/decimal.h index 1187037aed29e..83ded0b82b922 100644 --- a/python/pyarrow/src/arrow/python/decimal.h +++ b/python/pyarrow/src/arrow/python/decimal.h @@ -56,6 +56,40 @@ ARROW_PYTHON_EXPORT PyObject* DecimalFromString(PyObject* decimal_constructor, const std::string& decimal_string); +// \brief Convert a Python decimal to an Arrow Decimal32 object +// \param[in] python_decimal A Python decimal.Decimal instance +// \param[in] arrow_type An instance of arrow::DecimalType +// \param[out] out A pointer to a Decimal32 +// \return The status of the operation +ARROW_PYTHON_EXPORT +Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type, + Decimal32* out); + +// \brief Convert a Python object to an Arrow Decimal32 object +// \param[in] obj A Python int or decimal.Decimal instance +// \param[in] arrow_type An instance of arrow::DecimalType +// \param[out] out A pointer to a Decimal32 +// \return The status of the operation +ARROW_PYTHON_EXPORT +Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, Decimal32* out); + +// \brief Convert a Python decimal to an Arrow Decimal64 object +// \param[in] python_decimal A Python decimal.Decimal instance +// \param[in] arrow_type An instance of arrow::DecimalType +// \param[out] out A pointer to a Decimal64 +// \return The status of the operation +ARROW_PYTHON_EXPORT +Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type, + Decimal64* out); + +// \brief Convert a Python object to an Arrow Decimal64 object +// \param[in] obj A Python int or decimal.Decimal instance +// \param[in] arrow_type An instance of arrow::DecimalType +// \param[out] out A pointer to a Decimal64 +// \return The status of the operation +ARROW_PYTHON_EXPORT +Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, Decimal64* out); + // \brief Convert a Python decimal to an Arrow Decimal128 object // \param[in] python_decimal A Python decimal.Decimal instance // \param[in] arrow_type An instance of arrow::DecimalType diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index e7195e99072b0..709338b4e7756 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -260,6 +260,18 @@ class PyValue { return value; } + static Result<Decimal32> Convert(const Decimal32Type* type, const O&, I obj) { + Decimal32 value; + RETURN_NOT_OK(internal::DecimalFromPyObject(obj, *type, &value)); + return value; + } + + static Result<Decimal64> Convert(const Decimal64Type* type, const O&, I obj) { + Decimal64 value; + RETURN_NOT_OK(internal::DecimalFromPyObject(obj, *type, &value)); + return value; + } + static Result<Decimal128> Convert(const Decimal128Type* type, const O&, I obj) { Decimal128 value; RETURN_NOT_OK(internal::DecimalFromPyObject(obj, *type, &value)); diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py index 7a1b31a4d9d77..450cce74f1d43 100644 --- a/python/pyarrow/tests/strategies.py +++ b/python/pyarrow/tests/strategies.py @@ -92,6 +92,16 @@ pa.float32(), pa.float64() ]) +decimal32_type = st.builds( + pa.decimal32, + precision=st.integers(min_value=1, max_value=9), + scale=st.integers(min_value=1, max_value=9) +)
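The conversion plumbing above means pa.array() accepts either a Python int or a decimal.Decimal for the new types, since DecimalFromPyObject handles both inputs. A hedged sketch, assuming a build with this patch applied:

    import decimal
    import pyarrow as pa

    # Both an int and a decimal.Decimal go through DecimalFromPyObject and
    # are rescaled to the target type's scale.
    arr = pa.array([7, decimal.Decimal("-3.5")], type=pa.decimal64(18, 1))
    assert arr.to_pylist() == [decimal.Decimal("7.0"), decimal.Decimal("-3.5")]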
+decimal64_type = st.builds( + pa.decimal64, + precision=st.integers(min_value=1, max_value=18), + scale=st.integers(min_value=1, max_value=18) +) decimal128_type = st.builds( pa.decimal128, precision=st.integers(min_value=1, max_value=38), diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index e388851bea17b..e6fcd6149ee04 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -1900,7 +1900,9 @@ def test_fsl_to_fsl_cast(value_type): FloatToDecimalCase = namedtuple('FloatToDecimalCase', ('precision', 'scale', 'float_val')) -decimal_type_traits = [DecimalTypeTraits('decimal128', pa.decimal128, 38), +decimal_type_traits = [DecimalTypeTraits('decimal32', pa.decimal32, 9), + DecimalTypeTraits('decimal64', pa.decimal64, 18), + DecimalTypeTraits('decimal128', pa.decimal128, 38), DecimalTypeTraits('decimal256', pa.decimal256, 76)] @@ -1991,7 +1993,7 @@ def check_cast_float_to_decimal(float_ty, float_val, decimal_ty, decimal_ctx, # very high precisions as rounding errors can accumulate in # the iterative algorithm (GH-35576). diff_digits = abs(actual - expected) * 10**decimal_ty.scale - limit = 2 if decimal_ty.precision < max_precision - 1 else 4 + limit = 2 if decimal_ty.precision < max_precision - 2 else 4 assert diff_digits <= limit, ( f"float_val = {float_val!r}, precision={decimal_ty.precision}, " f"expected = {expected!r}, actual = {actual!r}, " @@ -2041,6 +2043,11 @@ def test_cast_float_to_decimal_random(float_ty, decimal_traits): mantissa_digits = math.floor(math.log10(2**mantissa_bits)) max_precision = decimal_traits.max_precision + # The decimal type may hold fewer digits than the float mantissa (for + # example, decimal32 <-> float64), so clamp to the decimal's precision. + if max_precision < mantissa_digits: + mantissa_bits = math.floor(math.log2(10**max_precision)) + mantissa_digits = math.floor(math.log10(2**mantissa_bits)) + with decimal.localcontext() as ctx: precision = mantissa_digits ctx.prec = precision diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index c3589877e6423..07286125c4cf6 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -1592,7 +1592,7 @@ def test_sequence_mixed_types_with_specified_type_fails(): def test_sequence_decimal(): data = [decimal.Decimal('1234.183'), decimal.Decimal('8094.234')] - for type in [pa.decimal128, pa.decimal256]: + for type in [pa.decimal32, pa.decimal64, pa.decimal128, pa.decimal256]: arr = pa.array(data, type=type(precision=7, scale=3)) assert arr.to_pylist() == data @@ -1601,28 +1601,28 @@ def test_sequence_decimal_different_precisions(): data = [ decimal.Decimal('1234234983.183'), decimal.Decimal('80943244.234') ] - for type in [pa.decimal128, pa.decimal256]: + for type in [pa.decimal64, pa.decimal128, pa.decimal256]: arr = pa.array(data, type=type(precision=13, scale=3)) assert arr.to_pylist() == data def test_sequence_decimal_no_scale(): data = [decimal.Decimal('1234234983'), decimal.Decimal('8094324')] - for type in [pa.decimal128, pa.decimal256]: + for type in [pa.decimal64, pa.decimal128, pa.decimal256]: arr = pa.array(data, type=type(precision=10)) assert arr.to_pylist() == data def test_sequence_decimal_negative(): data = [decimal.Decimal('-1234.234983'), decimal.Decimal('-8.094324')] - for type in [pa.decimal128, pa.decimal256]: + for type in [pa.decimal64, pa.decimal128, pa.decimal256]: arr = pa.array(data, type=type(precision=10, scale=6)) assert arr.to_pylist() == data def test_sequence_decimal_no_whole_part(): data =
[decimal.Decimal('-.4234983'), decimal.Decimal('.0103943')] - for type in [pa.decimal128, pa.decimal256]: + for type in [pa.decimal32, pa.decimal64, pa.decimal128, pa.decimal256]: arr = pa.array(data, type=type(precision=7, scale=7)) assert arr.to_pylist() == data diff --git a/python/pyarrow/tests/test_json.py b/python/pyarrow/tests/test_json.py index 3bb4440e89750..978c92307a69e 100644 --- a/python/pyarrow/tests/test_json.py +++ b/python/pyarrow/tests/test_json.py @@ -256,7 +256,9 @@ def test_explicit_schema_decimal(self): expected = { 'a': [Decimal("1"), Decimal("1.45"), Decimal("-23.456"), None], } - for type_factory in (pa.decimal128, pa.decimal256): + + decimal_types = (pa.decimal32, pa.decimal64, pa.decimal128, pa.decimal256) + for type_factory in decimal_types: schema = pa.schema([('a', type_factory(9, 4))]) opts = ParseOptions(explicit_schema=schema) table = self.read_bytes(rows, parse_options=opts) diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py index 0b2055018f695..dbba7852190f4 100644 --- a/python/pyarrow/tests/test_misc.py +++ b/python/pyarrow/tests/test_misc.py @@ -165,6 +165,8 @@ def test_set_timezone_db_path_non_windows(): pa.Time32Type, pa.Time64Type, pa.TimestampType, + pa.Decimal32Type, + pa.Decimal64Type, pa.Decimal128Type, pa.Decimal256Type, pa.DictionaryType, diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py index bdcb6c2b42d78..b6d36787fbd37 100644 --- a/python/pyarrow/tests/test_schema.py +++ b/python/pyarrow/tests/test_schema.py @@ -615,6 +615,8 @@ def test_type_schema_pickling(pickle_module): pa.date64(), pa.timestamp('ms'), pa.timestamp('ns'), + pa.decimal32(9, 3), + pa.decimal64(11, 4), pa.decimal128(12, 2), pa.decimal256(76, 38), pa.field('a', 'string', metadata={b'foo': b'bar'}), diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index de439b6bb8cd7..926de46318036 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -57,6 +57,8 @@ def get_many_types(): pa.float16(), pa.float32(), pa.float64(), + pa.decimal32(9, 4), + pa.decimal64(18, 4), pa.decimal128(19, 4), pa.decimal256(76, 38), pa.string(), @@ -139,18 +141,38 @@ def test_null_field_may_not_be_non_nullable(): def test_is_decimal(): + decimal32 = pa.decimal32(9, 4) + decimal64 = pa.decimal64(18, 4) decimal128 = pa.decimal128(19, 4) decimal256 = pa.decimal256(76, 38) int32 = pa.int32() + assert types.is_decimal(decimal32) + assert types.is_decimal(decimal64) assert types.is_decimal(decimal128) assert types.is_decimal(decimal256) assert not types.is_decimal(int32) + assert types.is_decimal32(decimal32) + assert not types.is_decimal32(decimal64) + assert not types.is_decimal32(decimal128) + assert not types.is_decimal32(decimal256) + assert not types.is_decimal32(int32) + + assert not types.is_decimal64(decimal32) + assert types.is_decimal64(decimal64) + assert not types.is_decimal64(decimal128) + assert not types.is_decimal64(decimal256) + assert not types.is_decimal64(int32) + + assert not types.is_decimal128(decimal32) + assert not types.is_decimal128(decimal64) assert types.is_decimal128(decimal128) assert not types.is_decimal128(decimal256) assert not types.is_decimal128(int32) + assert not types.is_decimal256(decimal32) + assert not types.is_decimal256(decimal64) assert not types.is_decimal256(decimal128) assert types.is_decimal256(decimal256) assert not types.is_decimal256(int32) @@ -970,6 +992,8 @@ def test_bit_and_byte_width(): (pa.float16(), 16, 2), 
(pa.timestamp('s'), 64, 8), (pa.date32(), 32, 4), + (pa.decimal32(9, 4), 32, 4), + (pa.decimal64(18, 4), 64, 8), (pa.decimal128(19, 4), 128, 16), (pa.decimal256(76, 38), 256, 32), (pa.binary(42), 42 * 8, 42), @@ -1002,6 +1026,14 @@ def test_fixed_size_binary_byte_width(): def test_decimal_properties(): + ty = pa.decimal32(9, 4) + assert ty.byte_width == 4 + assert ty.precision == 9 + assert ty.scale == 4 + ty = pa.decimal64(18, 4) + assert ty.byte_width == 8 + assert ty.precision == 18 + assert ty.scale == 4 ty = pa.decimal128(19, 4) assert ty.byte_width == 16 assert ty.precision == 19 @@ -1013,6 +1045,18 @@ def test_decimal_overflow(): + pa.decimal32(1, 0) + pa.decimal32(9, 0) + for i in (0, -1, 10): + with pytest.raises(ValueError): + pa.decimal32(i, 0) + + pa.decimal64(1, 0) + pa.decimal64(18, 0) + for i in (0, -1, 19): + with pytest.raises(ValueError): + pa.decimal64(i, 0) + pa.decimal128(1, 0) pa.decimal128(38, 0) for i in (0, -1, 39): diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 827243ce00e16..3caf068a4c9b1 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -73,7 +73,10 @@ def _get_pandas_type_map(): _Type_STRING: np.object_, _Type_LIST: np.object_, _Type_MAP: np.object_, + _Type_DECIMAL32: np.object_, + _Type_DECIMAL64: np.object_, _Type_DECIMAL128: np.object_, + _Type_DECIMAL256: np.object_, }) return _pandas_type_map @@ -1417,6 +1420,104 @@ cdef class FixedSizeBinaryType(DataType): return binary, (self.byte_width,) +cdef class Decimal32Type(FixedSizeBinaryType): + """ + Concrete class for decimal32 data types. + + Examples + -------- + Create an instance of decimal32 type: + + >>> import pyarrow as pa + >>> pa.decimal32(5, 2) + Decimal32Type(decimal32(5, 2)) + """ + + cdef void init(self, const shared_ptr[CDataType]& type) except *: + FixedSizeBinaryType.init(self, type) + self.decimal32_type = <const CDecimal32Type*> type.get() + + def __reduce__(self): + return decimal32, (self.precision, self.scale) + + @property + def precision(self): + """ + The decimal precision, in number of decimal digits (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal32(5, 2) + >>> t.precision + 5 + """ + return self.decimal32_type.precision() + + @property + def scale(self): + """ + The decimal scale (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal32(5, 2) + >>> t.scale + 2 + """ + return self.decimal32_type.scale() + + +cdef class Decimal64Type(FixedSizeBinaryType): + """ + Concrete class for decimal64 data types. + + Examples + -------- + Create an instance of decimal64 type: + + >>> import pyarrow as pa + >>> pa.decimal64(5, 2) + Decimal64Type(decimal64(5, 2)) + """ + + cdef void init(self, const shared_ptr[CDataType]& type) except *: + FixedSizeBinaryType.init(self, type) + self.decimal64_type = <const CDecimal64Type*> type.get() + + def __reduce__(self): + return decimal64, (self.precision, self.scale) + + @property + def precision(self): + """ + The decimal precision, in number of decimal digits (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal64(5, 2) + >>> t.precision + 5 + """ + return self.decimal64_type.precision() + + @property + def scale(self): + """ + The decimal scale (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal64(5, 2) + >>> t.scale + 2 + """ + return self.decimal64_type.scale() + + cdef class Decimal128Type(FixedSizeBinaryType): """ Concrete class for decimal128 data types.
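A short sketch of the new type classes in use, assuming this patch: precision, scale, and byte_width report the narrow widths, and the __reduce__ implementations make the types picklable:

    import pickle
    import pyarrow as pa

    t = pa.decimal64(18, 4)
    assert (t.precision, t.scale, t.byte_width) == (18, 4, 8)

    # __reduce__ returns (decimal64, (18, 4)), so pickling round-trips.
    assert pickle.loads(pickle.dumps(t)) == t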
@@ -4500,6 +4601,116 @@ def float64(): return primitive_type(_Type_DOUBLE) +cpdef DataType decimal32(int precision, int scale=0): + """ + Create decimal type with precision and scale and 32-bit width. + + Arrow decimals are fixed-point decimal numbers encoded as a scaled + integer. The precision is the number of significant digits that the + decimal type can represent; the scale is the number of digits after + the decimal point (note the scale can be negative). + + As an example, ``decimal32(7, 3)`` can exactly represent the numbers + 1234.567 and -1234.567 (encoded internally as the 32-bit integers + 1234567 and -1234567, respectively), but neither 12345.67 nor 123.4567. + + ``decimal32(5, -3)`` can exactly represent the number 12345000 + (encoded internally as the 32-bit integer 12345), but neither + 123450000 nor 1234500. + + If you need a precision higher than 9 significant digits, consider + using ``decimal64``, ``decimal128``, or ``decimal256``. + + Parameters + ---------- + precision : int + Must be between 1 and 9 + scale : int + + Returns + ------- + decimal_type : Decimal32Type + + Examples + -------- + Create an instance of decimal type: + + >>> import pyarrow as pa + >>> pa.decimal32(5, 2) + Decimal32Type(decimal32(5, 2)) + + Create an array with decimal type: + + >>> import decimal + >>> a = decimal.Decimal('123.45') + >>> pa.array([a], pa.decimal32(5, 2)) + <pyarrow.lib.Decimal32Array object at ...> + [ + 123.45 + ] + """ + cdef shared_ptr[CDataType] decimal_type + if precision < 1 or precision > 9: + raise ValueError("precision should be between 1 and 9") + decimal_type.reset(new CDecimal32Type(precision, scale)) + return pyarrow_wrap_data_type(decimal_type) + + +cpdef DataType decimal64(int precision, int scale=0): + """ + Create decimal type with precision and scale and 64-bit width. + + Arrow decimals are fixed-point decimal numbers encoded as a scaled + integer. The precision is the number of significant digits that the + decimal type can represent; the scale is the number of digits after + the decimal point (note the scale can be negative). + + As an example, ``decimal64(7, 3)`` can exactly represent the numbers + 1234.567 and -1234.567 (encoded internally as the 64-bit integers + 1234567 and -1234567, respectively), but neither 12345.67 nor 123.4567. + + ``decimal64(5, -3)`` can exactly represent the number 12345000 + (encoded internally as the 64-bit integer 12345), but neither + 123450000 nor 1234500. + + If you need a precision higher than 18 significant digits, consider + using ``decimal128`` or ``decimal256``. + + Parameters + ---------- + precision : int + Must be between 1 and 18 + scale : int + + Returns + ------- + decimal_type : Decimal64Type + + Examples + -------- + Create an instance of decimal type: + + >>> import pyarrow as pa + >>> pa.decimal64(5, 2) + Decimal64Type(decimal64(5, 2)) + + Create an array with decimal type: + + >>> import decimal + >>> a = decimal.Decimal('123.45') + >>> pa.array([a], pa.decimal64(5, 2)) + <pyarrow.lib.Decimal64Array object at ...> + [ + 123.45 + ] + """ + cdef shared_ptr[CDataType] decimal_type + if precision < 1 or precision > 18: + raise ValueError("precision should be between 1 and 18") + decimal_type.reset(new CDecimal64Type(precision, scale)) + return pyarrow_wrap_data_type(decimal_type) + + cpdef DataType decimal128(int precision, int scale=0): """ Create decimal type with precision and scale and 128-bit width.
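To make the new bounds checks and the negative-scale semantics concrete, a hedged sketch assuming this patch is applied:

    import pyarrow as pa

    pa.decimal32(9, 0)           # widest decimal32 precision: accepted
    try:
        pa.decimal32(10, 0)      # 10 significant digits exceed decimal32
    except ValueError as exc:
        assert "between 1 and 9" in str(exc)

    # A negative scale shifts the decimal point right: decimal64(5, -3)
    # represents 12345000 while storing only the 64-bit integer 12345.
    t = pa.decimal64(5, -3)
    assert (t.precision, t.scale) == (5, -3)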
diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py index 66b1ec33953a9..2bb5cfcf8b739 100644 --- a/python/pyarrow/types.py +++ b/python/pyarrow/types.py @@ -32,7 +32,8 @@ lib.Type_UINT64} _INTEGER_TYPES = _SIGNED_INTEGER_TYPES | _UNSIGNED_INTEGER_TYPES _FLOATING_TYPES = {lib.Type_HALF_FLOAT, lib.Type_FLOAT, lib.Type_DOUBLE} -_DECIMAL_TYPES = {lib.Type_DECIMAL128, lib.Type_DECIMAL256} +_DECIMAL_TYPES = {lib.Type_DECIMAL32, lib.Type_DECIMAL64, lib.Type_DECIMAL128, + lib.Type_DECIMAL256} _DATE_TYPES = {lib.Type_DATE32, lib.Type_DATE64} _TIME_TYPES = {lib.Type_TIME32, lib.Type_TIME64} _INTERVAL_TYPES = {lib.Type_INTERVAL_MONTH_DAY_NANO} @@ -289,6 +290,16 @@ def is_decimal(t): return t.id in _DECIMAL_TYPES +@doc(is_null, datatype="decimal32") +def is_decimal32(t): + return t.id == lib.Type_DECIMAL32 + + +@doc(is_null, datatype="decimal64") +def is_decimal64(t): + return t.id == lib.Type_DECIMAL64 + + @doc(is_null, datatype="decimal128") def is_decimal128(t): return t.id == lib.Type_DECIMAL128
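Finally, a sketch of the predicate helpers, assuming this patch: is_decimal now matches all four widths, while each width-specific helper matches only its own type id:

    import pyarrow as pa
    import pyarrow.types as types

    for ty in (pa.decimal32(9, 2), pa.decimal64(18, 2),
               pa.decimal128(38, 2), pa.decimal256(76, 2)):
        assert types.is_decimal(ty)

    assert types.is_decimal32(pa.decimal32(9, 2))
    assert not types.is_decimal32(pa.decimal64(18, 2))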