From 6a4a041f76229adb658bffd784fee1a52d3c0a6c Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov@amd.com>
Date: Sun, 19 Oct 2025 07:42:29 -0500
Subject: [PATCH 1/6] [AMDGPU] Add hot block register renaming pass

This patch introduces a post-allocation register renaming optimization
pass that reduces value density in hot basic blocks. The pass helps
the post-RA scheduler avoid false WAW dependencies by moving local
values to unused physical registers.

The pass operates after greedy register allocation but before
VirtRegRewriter. It visits basic blocks from hottest to coldest (no
frequency cutoff is applied in this patch), calculates value density per
physical register, and selectively moves local live ranges to free
registers. Only 32-bit VGPR values that live entirely within a single
basic block are moved, ensuring conservative behavior.

Key features:
- Respects tied operands and register allocation constraints
- Honors occupancy-based VGPR limits to avoid spilling
- Disabled by default (enable with -amdgpu-enable-hot-block-reg-renaming)
- Includes comprehensive lit tests

Performance results show up to 2% improvement on register-intensive
kernels such as rocRAND MTGP32.
---
 llvm/lib/Target/AMDGPU/AMDGPU.h               |   3 +
 .../AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp | 505 ++++++++++++++++++
 .../AMDGPU/AMDGPUHotBlockRegisterRenaming.h   |  34 ++
 llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def |   1 +
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 227 ++++----
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |   1 +
 .../AMDGPU/hot-block-register-renaming.mir    | 146 +++++
 7 files changed, 806 insertions(+), 111 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.h
 create mode 100644 llvm/test/CodeGen/AMDGPU/hot-block-register-renaming.mir

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 8fe324728405f..0784098f3326b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -545,6 +545,9 @@ extern char &AMDGPUExportKernelRuntimeHandlesLegacyID;
 void initializeGCNNSAReassignLegacyPass(PassRegistry &);
 extern char &GCNNSAReassignID;
 
+void initializeAMDGPUHotBlockRegisterRenamingLegacyPass(PassRegistry &);
+extern char &AMDGPUHotBlockRegisterRenamingID;
+
 void initializeGCNPreRALongBranchRegLegacyPass(PassRegistry &);
 extern char &GCNPreRALongBranchRegID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp
new file mode 100644
index 0000000000000..bc95ee375d008
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp
@@ -0,0 +1,505 @@
+//===-- AMDGPUHotBlockRegisterRenaming.cpp -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Reduces value density in hot basic blocks by remapping local values
+/// from overused physical registers to free physical registers.
+///
+/// This gives the Post-RA scheduler more flexibility to reorder instructions
+/// by reducing false dependencies created by register reuse.
+///
+/// Algorithm:
+/// 1. Sort basic blocks by frequency (hottest first)
+/// 2. For each BB:
+///    a. Calculate value density (count of local values per PhysReg)
+///    b. Identify free PhysRegs (completely unused in this BB)
+///    c. Iteratively move local values from dense to free registers
+/// 3. VirtRegRewriter applies the updated VirtRegMap
+///
+/// Constraints (conservative):
+/// - Only move 32-bit VGPRs
+/// - Only move local values (single segment, entirely within BB)
+/// - Only move to completely free registers
+/// - Skip values with allocation hints
+/// - Skip reserved registers
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUHotBlockRegisterRenaming.h"
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include <queue>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-hot-block-reg-renaming"
+
+STATISTIC(NumBlocksProcessed, "Number of hot blocks processed");
+STATISTIC(NumValuesRemapped, "Number of values remapped to reduce density");
+STATISTIC(NumBlocksSkipped,
+          "Number of blocks skipped (no dense regs or no free regs)");
+
+namespace {
+
+/// Implementation shared by the legacy and new pass manager wrappers.
+///
+/// Holds the analyses for one MachineFunction and rewrites VirtRegMap /
+/// LiveRegMatrix assignments in place; no machine instruction is modified.
+class AMDGPUHotBlockRegisterRenamingImpl {
+public:
+  AMDGPUHotBlockRegisterRenamingImpl(VirtRegMap *VRM, LiveRegMatrix *LRM,
+                                     LiveIntervals *LIS,
+                                     MachineBlockFrequencyInfo *MBFI,
+                                     const GCNSubtarget *ST,
+                                     const SIMachineFunctionInfo &MFI)
+      : VRM(VRM), LRM(LRM), LIS(LIS), MBFI(MBFI), ST(ST), MFI(MFI) {}
+
+  /// Runs the renaming over every block of \p MF, hottest block first.
+  /// \returns true if at least one virtual register was reassigned.
+  bool run(MachineFunction &MF);
+
+private:
+  VirtRegMap *VRM;
+  LiveRegMatrix *LRM;
+  LiveIntervals *LIS;
+  MachineBlockFrequencyInfo *MBFI;
+  const GCNSubtarget *ST;
+  const SIMachineFunctionInfo &MFI;
+  const SIRegisterInfo *TRI = nullptr; // Set by run().
+  MachineRegisterInfo *MRI = nullptr;  // Set by run().
+  unsigned VGPRLimit = 0; // Register limit based on occupancy
+
+  /// Cache of VirtRegs that cannot be moved (e.g. tied operands)
+  /// Cleared at the start of every processBasicBlock() call.
+  DenseSet<Register> UnmovableVRegs;
+
+  /// Process a single basic block
+  bool processBasicBlock(MachineBasicBlock *MBB);
+
+  /// Calculate value density map for a basic block
+  void calculateValueDensity(MachineBasicBlock *MBB,
+                             DenseMap<MCRegister, unsigned> &ValueDensity);
+
+  /// Find free physical registers in a basic block
+  void findFreeRegisters(MachineBasicBlock *MBB,
+                         SmallVectorImpl<MCRegister> &FreeRegs);
+
+  /// Check if a virtual register can be safely moved
+  bool canMoveValue(Register VirtReg, MCRegister CurrentPhysReg,
+                    MCRegister TargetPhysReg, SlotIndex BBStart,
+                    SlotIndex BBEnd);
+
+  /// Try to move a value from DenseReg to FreeReg
+  bool tryMoveValue(MCRegister DenseReg, MCRegister FreeReg,
+                    MachineBasicBlock *MBB, SlotIndex BBStart, SlotIndex BBEnd);
+};
+
+class AMDGPUHotBlockRegisterRenamingLegacy : public MachineFunctionPass {
+public:
+  static char ID;
+  // Self-registers with the global PassRegistry on construction.
+  AMDGPUHotBlockRegisterRenamingLegacy() : MachineFunctionPass(ID) {
+    initializeAMDGPUHotBlockRegisterRenamingLegacyPass(
+        *PassRegistry::getPassRegistry());
+  }
+  // Collects the required analyses and forwards to the shared Impl class.
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  // Human-readable name of the pass.
+  StringRef getPassName() const override {
+    return "AMDGPU Hot Block Register Renaming";
+  }
+  // Requires the four analyses below and declares every analysis preserved.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LiveIntervalsWrapperPass>();
+    AU.addRequired<VirtRegMapWrapperLegacy>();
+    AU.addRequired<LiveRegMatrixWrapperLegacy>();
+    AU.addRequired<MachineBlockFrequencyInfoWrapperPass>();
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // end anonymous namespace
+
+char AMDGPUHotBlockRegisterRenamingLegacy::ID = 0;
+// Handle declared in AMDGPU.h so other files can refer to the legacy pass.
+char &llvm::AMDGPUHotBlockRegisterRenamingID =
+    AMDGPUHotBlockRegisterRenamingLegacy::ID;
+// Legacy registration; the dependencies mirror getAnalysisUsage().
+INITIALIZE_PASS_BEGIN(AMDGPUHotBlockRegisterRenamingLegacy, DEBUG_TYPE,
+                      "AMDGPU Hot Block Register Renaming", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy)
+INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy)
+INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_END(AMDGPUHotBlockRegisterRenamingLegacy, DEBUG_TYPE,
+                    "AMDGPU Hot Block Register Renaming", false, false)
+
+bool AMDGPUHotBlockRegisterRenamingLegacy::runOnMachineFunction(
+    MachineFunction &MF) {
+  // Fetch the analyses requested in getAnalysisUsage() for the shared Impl.
+  VirtRegMap &Map = getAnalysis<VirtRegMapWrapperLegacy>().getVRM();
+  LiveRegMatrix &Matrix = getAnalysis<LiveRegMatrixWrapperLegacy>().getLRM();
+  LiveIntervals &Intervals = getAnalysis<LiveIntervalsWrapperPass>().getLIS();
+  MachineBlockFrequencyInfo &Freq =
+      getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI();
+  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
+  const SIMachineFunctionInfo &FuncInfo = *MF.getInfo<SIMachineFunctionInfo>();
+  return AMDGPUHotBlockRegisterRenamingImpl(&Map, &Matrix, &Intervals, &Freq,
+                                            &Subtarget, FuncInfo)
+      .run(MF);
+}
+
+bool AMDGPUHotBlockRegisterRenamingImpl::run(MachineFunction &MF) {
+  LLVM_DEBUG(dbgs() << "AMDGPUHotBlockRegisterRenaming: Processing "
+                    << MF.getName() << "\n");
+
+  // Per-function state consulted by the helpers below.
+  TRI = ST->getRegisterInfo();
+  MRI = &MF.getRegInfo();
+
+  // Destination registers are capped at the VGPR budget of the function's
+  // occupancy (see findFreeRegisters).
+  const unsigned Occupancy = MFI.getOccupancy();
+  VGPRLimit = ST->getMaxNumVGPRs(Occupancy, MFI.getDynamicVGPRBlockSize());
+
+  LLVM_DEBUG(dbgs() << "  Occupancy: " << Occupancy
+                    << ", VGPR Limit: " << VGPRLimit << "\n");
+
+  // Order the blocks from hottest to coldest.
+  SmallVector<MachineBasicBlock *, 16> Blocks;
+  for (MachineBasicBlock &Block : MF)
+    Blocks.push_back(&Block);
+
+  auto IsHotter = [this](MachineBasicBlock *L, MachineBasicBlock *R) {
+    return MBFI->getBlockFreq(L) > MBFI->getBlockFreq(R);
+  };
+  llvm::sort(Blocks, IsHotter);
+
+  bool AnyRenamed = false;
+  for (MachineBasicBlock *Block : Blocks)
+    AnyRenamed |= processBasicBlock(Block);
+  return AnyRenamed;
+}
+
+/// Renames local values in \p MBB away from its most densely used VGPRs.
+/// Builds a max-heap of registers holding more than one local value and
+/// repeatedly tries to move one value from the densest register to the next
+/// free register. A free register is only consumed by a successful move.
+/// \returns true if at least one value was reassigned.
+bool AMDGPUHotBlockRegisterRenamingImpl::processBasicBlock(
+    MachineBasicBlock *MBB) {
+  LLVM_DEBUG(dbgs() << "  Processing BB#" << MBB->getNumber() << " (freq="
+                    << MBFI->getBlockFreq(MBB).getFrequency() << ")\n");
+
+  // Clear the unmovable cache for each BB (tied operands are BB-specific)
+  UnmovableVRegs.clear();
+
+  SlotIndex BBStart = LIS->getMBBStartIdx(MBB);
+  SlotIndex BBEnd = LIS->getMBBEndIdx(MBB);
+
+  // Step 1: Calculate value density
+  DenseMap<MCRegister, unsigned> ValueDensity;
+  calculateValueDensity(MBB, ValueDensity);
+  if (ValueDensity.empty()) {
+    LLVM_DEBUG(dbgs() << "    No values found, skipping\n");
+    ++NumBlocksSkipped;
+    return false;
+  }
+
+  // Step 2: Find free registers
+  SmallVector<MCRegister, 64> FreeRegs;
+  findFreeRegisters(MBB, FreeRegs);
+  if (FreeRegs.empty()) {
+    LLVM_DEBUG(dbgs() << "    No free registers, skipping\n");
+    ++NumBlocksSkipped;
+    return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "    Found " << ValueDensity.size()
+                    << " registers with values, " << FreeRegs.size()
+                    << " free registers\n");
+
+  // Step 3: Create max heap of dense registers. lookup() keeps the comparator
+  // read-only; operator[] could insert into the map while the heap is in use.
+  auto Comparator = [&ValueDensity](MCRegister A, MCRegister B) {
+    return ValueDensity.lookup(A) < ValueDensity.lookup(B); // max heap
+  };
+  std::priority_queue<MCRegister, std::vector<MCRegister>, decltype(Comparator)>
+      DenseRegsHeap(Comparator);
+
+  for (const auto &Entry : ValueDensity)
+    if (Entry.second > 1) // Only interested in registers with density > 1
+      DenseRegsHeap.push(Entry.first);
+
+  if (DenseRegsHeap.empty()) {
+    LLVM_DEBUG(
+        dbgs() << "    No dense registers (all density <= 1), skipping\n");
+    ++NumBlocksSkipped;
+    return false;
+  }
+
+  // Step 4: Iteratively move values
+  bool Changed = false;
+  size_t FreeRegIdx = 0;
+
+  while (!DenseRegsHeap.empty() && FreeRegIdx < FreeRegs.size()) {
+    MCRegister DenseReg = DenseRegsHeap.top();
+    DenseRegsHeap.pop();
+
+    // A failed attempt drops DenseReg from the heap but leaves the free
+    // register available for the next candidate instead of wasting it.
+    if (!tryMoveValue(DenseReg, FreeRegs[FreeRegIdx], MBB, BBStart, BBEnd))
+      continue;
+
+    ++FreeRegIdx; // The register now holds a value and is no longer free.
+    Changed = true;
+    ++NumValuesRemapped;
+
+    // Update density; if still dense, put back in heap
+    unsigned &Density = ValueDensity[DenseReg];
+    if (--Density > 1)
+      DenseRegsHeap.push(DenseReg);
+  }
+
+  if (Changed)
+    ++NumBlocksProcessed;
+  else
+    ++NumBlocksSkipped;
+
+  return Changed;
+}
+
+/// Fills \p ValueDensity with, for each non-reserved VGPR_32, the number of
+/// live segments that lie entirely inside \p MBB.
+void AMDGPUHotBlockRegisterRenamingImpl::calculateValueDensity(
+    MachineBasicBlock *MBB, DenseMap<MCRegister, unsigned> &ValueDensity) {
+  SlotIndex BBStart = LIS->getMBBStartIdx(MBB);
+  SlotIndex BBEnd = LIS->getMBBEndIdx(MBB);
+
+  // Iterate over VGPR_32 register class
+  const TargetRegisterClass *VGPR_32_RC =
+      TRI->getRegClass(AMDGPU::VGPR_32RegClassID);
+
+  for (MCRegister PhysReg : *VGPR_32_RC) {
+    if (MRI->isReserved(PhysReg))
+      continue;
+
+    // NOTE(review): segments are summed over every register unit, so a value
+    // is counted once per unit if PhysReg has several units - confirm intent.
+    unsigned LocalValueCount = 0;
+    for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
+      LiveIntervalUnion &LIU = LRM->getLiveUnions()[Unit];
+
+      // Segments are ordered, so start at the first one that can overlap the
+      // block and stop at the first one that begins at or after its end.
+      for (LiveIntervalUnion::SegmentIter SI = LIU.find(BBStart);
+           SI.valid() && SI.start() < BBEnd; ++SI) {
+        // Check if segment is entirely within this BB
+        if (SI.start() >= BBStart && SI.stop() < BBEnd)
+          ++LocalValueCount;
+      }
+    }
+
+    if (LocalValueCount > 0)
+      ValueDensity[PhysReg] = LocalValueCount;
+  }
+}
+
+/// Appends to \p FreeRegs every non-reserved VGPR_32 that has no live segment
+/// overlapping \p MBB in the live register matrix. Only the first VGPRLimit
+/// registers of the class are considered; results are in class order.
+void AMDGPUHotBlockRegisterRenamingImpl::findFreeRegisters(
+    MachineBasicBlock *MBB, SmallVectorImpl<MCRegister> &FreeRegs) {
+  const SlotIndex BlockBegin = LIS->getMBBStartIdx(MBB);
+  const SlotIndex BlockEnd = LIS->getMBBEndIdx(MBB);
+
+  // True if some register unit of Reg carries a segment that intersects
+  // [BlockBegin, BlockEnd) in the live register matrix.
+  auto IsOccupiedInBlock = [&](MCRegister Reg) {
+    for (MCRegUnit Unit : TRI->regunits(Reg)) {
+      LiveIntervalUnion &Union = LRM->getLiveUnions()[Unit];
+      LiveIntervalUnion::SegmentIter Seg = Union.find(BlockBegin);
+      if (Seg.valid() && Seg.start() < BlockEnd)
+        return true;
+    }
+    return false;
+  };
+
+  const TargetRegisterClass *VGPR32Class =
+      TRI->getRegClass(AMDGPU::VGPR_32RegClassID);
+
+  // Position of the current register within the class, counting reserved
+  // registers too; candidates stop at the occupancy-derived VGPRLimit.
+  unsigned Position = 0;
+  for (MCRegister Candidate : *VGPR32Class) {
+    if (Position >= VGPRLimit)
+      break;
+    ++Position;
+
+    if (MRI->isReserved(Candidate) || IsOccupiedInBlock(Candidate))
+      continue;
+
+    FreeRegs.push_back(Candidate);
+  }
+}
+
+/// Decides whether \p VirtReg may be reassigned. The only condition checked
+/// is that no segment of its live interval lying within [BBStart, BBEnd]
+/// begins at an instruction that defines \p VirtReg through a tied operand.
+/// \p CurrentPhysReg and \p TargetPhysReg are currently not consulted.
+bool AMDGPUHotBlockRegisterRenamingImpl::canMoveValue(Register VirtReg,
+                                                      MCRegister CurrentPhysReg,
+                                                      MCRegister TargetPhysReg,
+                                                      SlotIndex BBStart,
+                                                      SlotIndex BBEnd) {
+  // An instruction with a tied operand needs source and destination in the
+  // same physical register, so such definitions are left where they are.
+  auto IsTiedDefOfVirtReg = [VirtReg](const MachineOperand &Op) {
+    return Op.isReg() && Op.getReg() == VirtReg && Op.isDef() && Op.isTied();
+  };
+
+  const LiveInterval &Interval = LIS->getInterval(VirtReg);
+  for (const LiveRange::Segment &Seg : Interval) {
+    // Segments reaching outside this block are not inspected.
+    const bool InsideBlock = Seg.start >= BBStart && Seg.end <= BBEnd;
+    if (!InsideBlock)
+      continue;
+
+    // The instruction at the segment start, if any, is its defining point.
+    MachineInstr *DefiningMI = LIS->getInstructionFromIndex(Seg.start);
+    if (!DefiningMI)
+      continue;
+
+    for (const MachineOperand &Op : DefiningMI->operands()) {
+      if (!IsTiedDefOfVirtReg(Op))
+        continue;
+      LLVM_DEBUG(dbgs() << "        Cannot move " << printReg(VirtReg, TRI)
+                        << ": has tied def at " << Seg.start << " in "
+                        << *DefiningMI);
+      return false;
+    }
+  }
+
+  // No other constraint (register class, special registers, architecture
+  // specifics) is examined here.
+  return true;
+}
+
+/// Tries to reassign one value currently held in \p DenseReg to \p FreeReg.
+///
+/// Candidates are virtual registers mapped exactly to \p DenseReg that pass
+/// every check in the loop below. \returns true after the first successful
+/// reassignment, false if no candidate qualified.
+bool AMDGPUHotBlockRegisterRenamingImpl::tryMoveValue(MCRegister DenseReg,
+                                                      MCRegister FreeReg,
+                                                      MachineBasicBlock *MBB,
+                                                      SlotIndex BBStart,
+                                                      SlotIndex BBEnd) {
+  // Find a movable local value in DenseReg
+  for (MCRegUnit Unit : TRI->regunits(DenseReg)) {
+    LiveIntervalUnion &LIU = LRM->getLiveUnions()[Unit];
+
+    // Segments are ordered; only those starting inside the block can be local.
+    for (LiveIntervalUnion::SegmentIter SI = LIU.find(BBStart);
+         SI.valid() && SI.start() < BBEnd; ++SI) {
+      Register VirtReg = SI.value()->reg();
+
+      // Check if this VirtReg is mapped to DenseReg. This also rejects
+      // unassigned registers, for which getPhys() returns no register.
+      if (VRM->getPhys(VirtReg) != DenseReg)
+        continue;
+
+      LiveInterval &VirtRegLI = LIS->getInterval(VirtReg);
+
+      // Check: segment is local (entirely within BB)
+      if (SI.start() < BBStart || SI.stop() >= BBEnd)
+        continue;
+
+      // Check: single segment and no subranges (conservative)
+      if (VirtRegLI.size() != 1 || VirtRegLI.hasSubRanges())
+        continue;
+
+      // Check: No allocation hints
+      if (VRM->hasKnownPreference(VirtReg))
+        continue;
+
+      // Check: Cached unmovable VirtRegs
+      if (UnmovableVRegs.contains(VirtReg)) {
+        LLVM_DEBUG(dbgs() << "        Skipping " << printReg(VirtReg, TRI)
+                          << " (cached as unmovable)\n");
+        continue;
+      }
+
+      // Check: Can this value be safely moved?
+      if (!canMoveValue(VirtReg, DenseReg, FreeReg, BBStart, BBEnd)) {
+        // Cache the result to avoid checking again
+        UnmovableVRegs.insert(VirtReg);
+        continue;
+      }
+
+      // Check: FreeReg must belong to the register class of VirtReg, and the
+      // matrix must report no virtual, fixed or regmask interference there.
+      if (!MRI->getRegClass(VirtReg)->contains(FreeReg) ||
+          LRM->checkInterference(VirtRegLI, FreeReg) != LiveRegMatrix::IK_Free)
+        continue;
+
+      // This VirtReg is movable! Perform the remap
+      LLVM_DEBUG(dbgs() << "      Moving " << printReg(VirtReg, TRI) << " from "
+                        << printReg(DenseReg, TRI) << " to "
+                        << printReg(FreeReg, TRI) << "\n");
+
+      LRM->unassign(VirtRegLI); // Remove from LiveRegMatrix
+      LRM->assign(VirtRegLI,
+                  FreeReg); // Assign to new physreg (updates VirtRegMap too)
+
+      // Sanity check: verify VirtReg is now mapped to FreeReg
+      assert(VRM->getPhys(VirtReg) == FreeReg &&
+             "VirtRegMap not updated correctly");
+
+      return true; // Successfully moved one value
+    }
+  }
+
+  return false; // No movable value found
+}
+
+/// New pass manager entry point.
+/// Every analysis is reported as preserved, matching setPreservesAll() in the
+/// legacy wrapper: registers are only reassigned through LiveRegMatrix, which
+/// also updates VirtRegMap, and those in-place results must stay cached for
+/// VirtRegRewriter to apply them afterwards.
+PreservedAnalyses
+AMDGPUHotBlockRegisterRenamingPass::run(MachineFunction &MF,
+                                        MachineFunctionAnalysisManager &MFAM) {
+  VirtRegMap *VRM = &MFAM.getResult<VirtRegMapAnalysis>(MF);
+  LiveRegMatrix *LRM = &MFAM.getResult<LiveRegMatrixAnalysis>(MF);
+  LiveIntervals *LIS = &MFAM.getResult<LiveIntervalsAnalysis>(MF);
+  MachineBlockFrequencyInfo *MBFI =
+      &MFAM.getResult<MachineBlockFrequencyAnalysis>(MF);
+
+  const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
+  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+
+  AMDGPUHotBlockRegisterRenamingImpl(VRM, LRM, LIS, MBFI, ST, MFI).run(MF);
+  return PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.h b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.h
new file mode 100644
index 0000000000000..6dfdd1bec72ef
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.h
@@ -0,0 +1,34 @@
+//===-- AMDGPUHotBlockRegisterRenaming.h -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Reduces value density in hot basic blocks by remapping local values
+/// from overused physical registers to free physical registers.
+///
+/// This gives the Post-RA scheduler more flexibility to reorder instructions
+/// by reducing false dependencies created by register reuse.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUHOTBLOCKREGISTERRENAMING_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUHOTBLOCKREGISTERRENAMING_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+/// New pass manager interface of the hot block register renaming pass.
+class AMDGPUHotBlockRegisterRenamingPass
+    : public PassInfoMixin<AMDGPUHotBlockRegisterRenamingPass> {
+public:
+  PreservedAnalyses run(MachineFunction &MF,
+                        MachineFunctionAnalysisManager &MFAM);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUHOTBLOCKREGISTERRENAMING_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 5cd9bcf4a96fd..a299b239cd020 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -122,6 +122,7 @@ MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass()
 MACHINE_FUNCTION_PASS("amdgpu-preload-kern-arg-prolog", AMDGPUPreloadKernArgPrologPass())
 MACHINE_FUNCTION_PASS("amdgpu-prepare-agpr-alloc", AMDGPUPrepareAGPRAllocPass())
 MACHINE_FUNCTION_PASS("amdgpu-nsa-reassign", GCNNSAReassignPass())
+MACHINE_FUNCTION_PASS("amdgpu-hot-block-reg-renaming", AMDGPUHotBlockRegisterRenamingPass())
 MACHINE_FUNCTION_PASS("amdgpu-wait-sgpr-hazards", AMDGPUWaitSGPRHazardsPass())
 MACHINE_FUNCTION_PASS("gcn-create-vopd", GCNCreateVOPDPass())
 MACHINE_FUNCTION_PASS("gcn-dpp-combine", GCNDPPCombinePass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index a9a0a4a0cc3e4..af0f4e5eca7be 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -21,6 +21,7 @@
 #include "AMDGPUCtorDtorLowering.h"
 #include "AMDGPUExportClustering.h"
 #include "AMDGPUExportKernelRuntimeHandles.h"
+#include "AMDGPUHotBlockRegisterRenaming.h"
 #include "AMDGPUIGroupLP.h"
 #include "AMDGPUISelDAGToDAG.h"
 #include "AMDGPULowerVGPREncoding.h"
@@ -169,13 +170,13 @@ class AMDGPUCodeGenPassBuilder
 class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
 public:
   SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
-    : RegisterRegAllocBase(N, D, C) {}
+      : RegisterRegAllocBase(N, D, C) {}
 };
 
 class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
 public:
   VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
-    : RegisterRegAllocBase(N, D, C) {}
+      : RegisterRegAllocBase(N, D, C) {}
 };
 
 class WWMRegisterRegAlloc : public RegisterRegAllocBase<WWMRegisterRegAlloc> {
@@ -218,19 +219,21 @@ static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
 static llvm::once_flag InitializeDefaultWWMRegisterAllocatorFlag;
 
 static SGPRRegisterRegAlloc
-defaultSGPRRegAlloc("default",
-                    "pick SGPR register allocator based on -O option",
-                    useDefaultRegisterAllocator);
+    defaultSGPRRegAlloc("default",
+                        "pick SGPR register allocator based on -O option",
+                        useDefaultRegisterAllocator);
 
 static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
                RegisterPassParser<SGPRRegisterRegAlloc>>
-SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
-             cl::desc("Register allocator to use for SGPRs"));
+    SGPRRegAlloc("sgpr-regalloc", cl::Hidden,
+                 cl::init(&useDefaultRegisterAllocator),
+                 cl::desc("Register allocator to use for SGPRs"));
 
 static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
                RegisterPassParser<VGPRRegisterRegAlloc>>
-VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
-             cl::desc("Register allocator to use for VGPRs"));
+    VGPRRegAlloc("vgpr-regalloc", cl::Hidden,
+                 cl::init(&useDefaultRegisterAllocator),
+                 cl::desc("Register allocator to use for VGPRs"));
 
 static cl::opt<WWMRegisterRegAlloc::FunctionPassCtor, false,
                RegisterPassParser<WWMRegisterRegAlloc>>
@@ -301,22 +304,25 @@ static FunctionPass *createFastWWMRegisterAllocator() {
   return createFastRegisterAllocator(onlyAllocateWWMRegs, false);
 }
 
-static SGPRRegisterRegAlloc basicRegAllocSGPR(
-  "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
-static SGPRRegisterRegAlloc greedyRegAllocSGPR(
-  "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);
-
-static SGPRRegisterRegAlloc fastRegAllocSGPR(
-  "fast", "fast register allocator", createFastSGPRRegisterAllocator);
+static SGPRRegisterRegAlloc basicRegAllocSGPR("basic",
+                                              "basic register allocator",
+                                              createBasicSGPRRegisterAllocator);
+static SGPRRegisterRegAlloc
+    greedyRegAllocSGPR("greedy", "greedy register allocator",
+                       createGreedySGPRRegisterAllocator);
 
+static SGPRRegisterRegAlloc fastRegAllocSGPR("fast", "fast register allocator",
+                                             createFastSGPRRegisterAllocator);
 
-static VGPRRegisterRegAlloc basicRegAllocVGPR(
-  "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
-static VGPRRegisterRegAlloc greedyRegAllocVGPR(
-  "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);
+static VGPRRegisterRegAlloc basicRegAllocVGPR("basic",
+                                              "basic register allocator",
+                                              createBasicVGPRRegisterAllocator);
+static VGPRRegisterRegAlloc
+    greedyRegAllocVGPR("greedy", "greedy register allocator",
+                       createGreedyVGPRRegisterAllocator);
 
-static VGPRRegisterRegAlloc fastRegAllocVGPR(
-  "fast", "fast register allocator", createFastVGPRRegisterAllocator);
+static VGPRRegisterRegAlloc fastRegAllocVGPR("fast", "fast register allocator",
+                                             createFastVGPRRegisterAllocator);
 
 static WWMRegisterRegAlloc basicRegAllocWWMReg("basic",
                                                "basic register allocator",
@@ -334,14 +340,14 @@ static bool isLTOPreLink(ThinOrFullLTOPhase Phase) {
 } // anonymous namespace
 
 static cl::opt<bool>
-EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
-                        cl::desc("Run early if-conversion"),
-                        cl::init(false));
+    EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
+                            cl::desc("Run early if-conversion"),
+                            cl::init(false));
 
 static cl::opt<bool>
-OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
-            cl::desc("Run pre-RA exec mask optimizations"),
-            cl::init(true));
+    OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
+                     cl::desc("Run pre-RA exec mask optimizations"),
+                     cl::init(true));
 
 static cl::opt<bool>
     LowerCtorDtor("amdgpu-lower-global-ctor-dtor",
@@ -349,32 +355,27 @@ static cl::opt<bool>
                   cl::init(true), cl::Hidden);
 
 // Option to disable vectorizer for tests.
-static cl::opt<bool> EnableLoadStoreVectorizer(
-  "amdgpu-load-store-vectorizer",
-  cl::desc("Enable load store vectorizer"),
-  cl::init(true),
-  cl::Hidden);
+static cl::opt<bool>
+    EnableLoadStoreVectorizer("amdgpu-load-store-vectorizer",
+                              cl::desc("Enable load store vectorizer"),
+                              cl::init(true), cl::Hidden);
 
 // Option to control global loads scalarization
-static cl::opt<bool> ScalarizeGlobal(
-  "amdgpu-scalarize-global-loads",
-  cl::desc("Enable global load scalarization"),
-  cl::init(true),
-  cl::Hidden);
+static cl::opt<bool>
+    ScalarizeGlobal("amdgpu-scalarize-global-loads",
+                    cl::desc("Enable global load scalarization"),
+                    cl::init(true), cl::Hidden);
 
 // Option to run internalize pass.
 static cl::opt<bool> InternalizeSymbols(
-  "amdgpu-internalize-symbols",
-  cl::desc("Enable elimination of non-kernel functions and unused globals"),
-  cl::init(false),
-  cl::Hidden);
+    "amdgpu-internalize-symbols",
+    cl::desc("Enable elimination of non-kernel functions and unused globals"),
+    cl::init(false), cl::Hidden);
 
 // Option to inline all early.
-static cl::opt<bool> EarlyInlineAll(
-  "amdgpu-early-inline-all",
-  cl::desc("Inline all functions early"),
-  cl::init(false),
-  cl::Hidden);
+static cl::opt<bool> EarlyInlineAll("amdgpu-early-inline-all",
+                                    cl::desc("Inline all functions early"),
+                                    cl::init(false), cl::Hidden);
 
 static cl::opt<bool> RemoveIncompatibleFunctions(
     "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
@@ -382,39 +383,40 @@ static cl::opt<bool> RemoveIncompatibleFunctions(
              "use features not supported by the target GPU"),
     cl::init(true));
 
-static cl::opt<bool> EnableSDWAPeephole(
-  "amdgpu-sdwa-peephole",
-  cl::desc("Enable SDWA peepholer"),
-  cl::init(true));
+static cl::opt<bool> EnableSDWAPeephole("amdgpu-sdwa-peephole",
+                                        cl::desc("Enable SDWA peepholer"),
+                                        cl::init(true));
 
-static cl::opt<bool> EnableDPPCombine(
-  "amdgpu-dpp-combine",
-  cl::desc("Enable DPP combiner"),
-  cl::init(true));
+static cl::opt<bool> EnableDPPCombine("amdgpu-dpp-combine",
+                                      cl::desc("Enable DPP combiner"),
+                                      cl::init(true));
 
 // Enable address space based alias analysis
-static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
-  cl::desc("Enable AMDGPU Alias Analysis"),
-  cl::init(true));
+static cl::opt<bool>
+    EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
+                              cl::desc("Enable AMDGPU Alias Analysis"),
+                              cl::init(true));
 
 // Enable lib calls simplifications
-static cl::opt<bool> EnableLibCallSimplify(
-  "amdgpu-simplify-libcall",
-  cl::desc("Enable amdgpu library simplifications"),
-  cl::init(true),
-  cl::Hidden);
+static cl::opt<bool>
+    EnableLibCallSimplify("amdgpu-simplify-libcall",
+                          cl::desc("Enable amdgpu library simplifications"),
+                          cl::init(true), cl::Hidden);
 
 static cl::opt<bool> EnableLowerKernelArguments(
-  "amdgpu-ir-lower-kernel-arguments",
-  cl::desc("Lower kernel argument loads in IR pass"),
-  cl::init(true),
-  cl::Hidden);
+    "amdgpu-ir-lower-kernel-arguments",
+    cl::desc("Lower kernel argument loads in IR pass"), cl::init(true),
+    cl::Hidden);
 
 static cl::opt<bool> EnableRegReassign(
-  "amdgpu-reassign-regs",
-  cl::desc("Enable register reassign optimizations on gfx10+"),
-  cl::init(true),
-  cl::Hidden);
+    "amdgpu-reassign-regs",
+    cl::desc("Enable register reassign optimizations on gfx10+"),
+    cl::init(true), cl::Hidden);
+
+static cl::opt<bool> EnableHotBlockRegRenaming(
+    "amdgpu-enable-hot-block-reg-renaming",
+    cl::desc("Enable hot block register renaming to reduce value density"),
+    cl::init(false), cl::Hidden);
 
 static cl::opt<bool> OptVGPRLiveRange(
     "amdgpu-opt-vgpr-liverange",
@@ -432,11 +434,10 @@ static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
         clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));
 
 // Enable Mode register optimization
-static cl::opt<bool> EnableSIModeRegisterPass(
-  "amdgpu-mode-register",
-  cl::desc("Enable mode register pass"),
-  cl::init(true),
-  cl::Hidden);
+static cl::opt<bool>
+    EnableSIModeRegisterPass("amdgpu-mode-register",
+                             cl::desc("Enable mode register pass"),
+                             cl::init(true), cl::Hidden);
 
 // Enable GFX11+ s_delay_alu insertion
 static cl::opt<bool>
@@ -452,19 +453,16 @@ static cl::opt<bool>
 
 // Option is used in lit tests to prevent deadcoding of patterns inspected.
 static cl::opt<bool>
-EnableDCEInRA("amdgpu-dce-in-ra",
-    cl::init(true), cl::Hidden,
-    cl::desc("Enable machine DCE inside regalloc"));
+    EnableDCEInRA("amdgpu-dce-in-ra", cl::init(true), cl::Hidden,
+                  cl::desc("Enable machine DCE inside regalloc"));
 
 static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
                                            cl::desc("Adjust wave priority"),
                                            cl::init(false), cl::Hidden);
 
-static cl::opt<bool> EnableScalarIRPasses(
-  "amdgpu-scalar-ir-passes",
-  cl::desc("Enable scalar IR passes"),
-  cl::init(true),
-  cl::Hidden);
+static cl::opt<bool> EnableScalarIRPasses("amdgpu-scalar-ir-passes",
+                                          cl::desc("Enable scalar IR passes"),
+                                          cl::init(true), cl::Hidden);
 
 static cl::opt<bool>
     EnableSwLowerLDS("amdgpu-enable-sw-lower-lds",
@@ -477,10 +475,10 @@ static cl::opt<bool, true> EnableLowerModuleLDS(
     cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
     cl::Hidden);
 
-static cl::opt<bool> EnablePreRAOptimizations(
-    "amdgpu-enable-pre-ra-optimizations",
-    cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
-    cl::Hidden);
+static cl::opt<bool>
+    EnablePreRAOptimizations("amdgpu-enable-pre-ra-optimizations",
+                             cl::desc("Enable Pre-RA optimizations pass"),
+                             cl::init(true), cl::Hidden);
 
 static cl::opt<bool> EnablePromoteKernelArguments(
     "amdgpu-enable-promote-kernel-arguments",
@@ -507,10 +505,10 @@ static cl::opt<bool> EnableRewritePartialRegUses(
     cl::desc("Enable rewrite partial reg uses pass"), cl::init(true),
     cl::Hidden);
 
-static cl::opt<bool> EnableHipStdPar(
-  "amdgpu-enable-hipstdpar",
-  cl::desc("Enable HIP Standard Parallelism Offload support"), cl::init(false),
-  cl::Hidden);
+static cl::opt<bool>
+    EnableHipStdPar("amdgpu-enable-hipstdpar",
+                    cl::desc("Enable HIP Standard Parallelism Offload support"),
+                    cl::init(false), cl::Hidden);
 
 static cl::opt<bool>
     EnableAMDGPUAttributor("amdgpu-attributor-enable",
@@ -613,6 +611,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUPrintfRuntimeBindingPass(*PR);
   initializeAMDGPUResourceUsageAnalysisWrapperPassPass(*PR);
   initializeGCNNSAReassignLegacyPass(*PR);
+  initializeAMDGPUHotBlockRegisterRenamingLegacyPass(*PR);
   initializeGCNPreRAOptimizationsLegacyPass(*PR);
   initializeGCNPreRALongBranchRegLegacyPass(*PR);
   initializeGCNRewritePartialRegUsesLegacyPass(*PR);
@@ -634,8 +633,8 @@ static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
 static ScheduleDAGInstrs *
 createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
-  ScheduleDAGMILive *DAG =
-    new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
+  ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
+      C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
   if (ST.shouldClusterStores())
     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
@@ -698,14 +697,13 @@ createIterativeILPMachineScheduler(MachineSchedContext *C) {
   return DAG;
 }
 
-static MachineSchedRegistry
-SISchedRegistry("si", "Run SI's custom scheduler",
-                createSIMachineScheduler);
+static MachineSchedRegistry SISchedRegistry("si", "Run SI's custom scheduler",
+                                            createSIMachineScheduler);
 
 static MachineSchedRegistry
-GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
-                             "Run GCN scheduler to maximize occupancy",
-                             createGCNMaxOccupancyMachineScheduler);
+    GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
+                                 "Run GCN scheduler to maximize occupancy",
+                                 createGCNMaxOccupancyMachineScheduler);
 
 static MachineSchedRegistry
     GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
@@ -962,7 +960,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
 
   PB.registerFullLinkTimeOptimizationLastEPCallback(
       [this](ModulePassManager &PM, OptimizationLevel Level) {
-
         // Promote kernel arguments to global address space for LLVM IR
         // generated by flang compiler
         FunctionPassManager FPM;
@@ -1395,7 +1392,7 @@ void AMDGPUPassConfig::addIRPasses() {
                                              AAResults &AAR) {
         if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
           AAR.addAAResult(WrapperPass->getResult());
-        }));
+      }));
     }
 
     if (TM.getTargetTriple().isAMDGCN()) {
@@ -1655,6 +1652,10 @@ void GCNPassConfig::addOptimizedRegAlloc() {
 }
 
 bool GCNPassConfig::addPreRewrite() {
+  // Hot block register renaming to reduce value density
+  if (TM->getOptLevel() > CodeGenOptLevel::None && EnableHotBlockRegRenaming)
+    addPass(&AMDGPUHotBlockRegisterRenamingID);
+
   if (EnableRegReassign)
     addPass(&GCNNSAReassignID);
 
@@ -2013,8 +2014,8 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
                              AMDGPU::SGPR_32RegClass,
                              MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
        parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
-                             AMDGPU::SGPR_32RegClass,
-                             MFI->ArgInfo.LDSKernelId, 0, 1) ||
+                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.LDSKernelId,
+                             0, 1) ||
        parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                              AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
                              0, 1) ||
@@ -2037,14 +2038,14 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
                              AMDGPU::SReg_64RegClass,
                              MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
        parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
-                             AMDGPU::VGPR_32RegClass,
-                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
+                             AMDGPU::VGPR_32RegClass, MFI->ArgInfo.WorkItemIDX,
+                             0, 0) ||
        parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
-                             AMDGPU::VGPR_32RegClass,
-                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
+                             AMDGPU::VGPR_32RegClass, MFI->ArgInfo.WorkItemIDY,
+                             0, 0) ||
        parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
-                             AMDGPU::VGPR_32RegClass,
-                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
+                             AMDGPU::VGPR_32RegClass, MFI->ArgInfo.WorkItemIDZ,
+                             0, 0)))
     return true;
 
   if (ST.hasIEEEMode())
@@ -2251,6 +2252,11 @@ Error AMDGPUCodeGenPassBuilder::addInstSelector(AddMachinePass &addPass) const {
 }
 
 void AMDGPUCodeGenPassBuilder::addPreRewrite(AddMachinePass &addPass) const {
+  // Hot block register renaming to reduce value density
+  if (TM.getOptLevel() > CodeGenOptLevel::None && EnableHotBlockRegRenaming) {
+    addPass(AMDGPUHotBlockRegisterRenamingPass());
+  }
+
   if (EnableRegReassign) {
     addPass(GCNNSAReassignPass());
   }
@@ -2353,7 +2359,6 @@ Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized(
   // For allocating per-thread VGPRs.
   addPass(RAGreedyPass({onlyAllocateVGPRs, "vgpr"}));
 
-
   addPreRewrite(addPass);
   addPass(VirtRegRewriterPass(true));
 
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 8c0f9d01a7d30..b2d45438021ce 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -63,6 +63,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUFrameLowering.cpp
   AMDGPUGlobalISelDivergenceLowering.cpp
   AMDGPUGlobalISelUtils.cpp
+  AMDGPUHotBlockRegisterRenaming.cpp
   AMDGPUHSAMetadataStreamer.cpp
   AMDGPUInsertDelayAlu.cpp
   AMDGPUInstCombineIntrinsic.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/hot-block-register-renaming.mir b/llvm/test/CodeGen/AMDGPU/hot-block-register-renaming.mir
new file mode 100644
index 0000000000000..28c9c16f248d1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hot-block-register-renaming.mir
@@ -0,0 +1,146 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-enable-hot-block-reg-renaming -verify-machineinstrs -run-pass=greedy,amdgpu-hot-block-reg-renaming,virtregrewriter -o - %s | FileCheck %s
+
+--- |
+  define amdgpu_kernel void @test_basic_move() { ret void }
+  define amdgpu_kernel void @test_tied_operand() { ret void }
+  define amdgpu_kernel void @test_no_free_registers() #0 { ret void }
+  
+  attributes #0 = { "amdgpu-num-vgpr"="8" }
+...
+
+---
+# Test 1: Verify that we correctly move a value when it's safe to do so
+# Multiple values allocated to vgpr3, one should be moved to a free register
+# CHECK-LABEL: name: test_basic_move
+# CHECK: bb.1:
+# CHECK-NOT: renamable $vgpr3 = V_ADD_F32_e64 0, renamable $vgpr0, 0, renamable $vgpr1
+# CHECK-NOT: renamable $vgpr3 = V_MUL_F32_e64 0, killed renamable $vgpr3, 0, renamable $vgpr2
+# CHECK: renamable $vgpr{{[4-9]|[1-5][0-9]}} = V_MUL_F32_e64
+name:            test_basic_move
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' }
+  - { id: 1, class: vgpr_32, preferred-register: '$vgpr1' }
+  - { id: 2, class: vgpr_32, preferred-register: '$vgpr2' }
+  - { id: 3, class: vgpr_32, preferred-register: '$vgpr3' }
+  - { id: 4, class: vgpr_32, preferred-register: '$vgpr3' }
+  - { id: 5, class: vgpr_32, preferred-register: '$vgpr3' }
+  - { id: 6, class: vgpr_32, preferred-register: '$vgpr3' }
+  - { id: 7, class: vgpr_32, preferred-register: '$vgpr3' }
+body: |
+  bb.0:
+    successors: %bb.1
+    liveins: $vgpr0, $vgpr1, $vgpr2
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    %2:vgpr_32 = COPY $vgpr2
+
+  bb.1:
+    successors: %bb.1, %bb.2
+    ; High value density: %3, %4, %5, %6, %7 all prefer vgpr3
+    ; Some should be moved to free registers
+    %3:vgpr_32 = V_ADD_F32_e64 0, %0, 0, %1, 0, 0, implicit $mode, implicit $exec
+    %4:vgpr_32 = V_ADD_F32_e64 0, %3, 0, %2, 0, 0, implicit $mode, implicit $exec
+    %5:vgpr_32 = V_MUL_F32_e64 0, %3, 0, %2, 0, 0, implicit $mode, implicit $exec
+    %6:vgpr_32 = V_MUL_F32_e64 0, %3, 0, %1, 0, 0, implicit $mode, implicit $exec
+    %7:vgpr_32 = V_FMA_F32_e64 0, %3, 0, %2, 0, %4, 0, 0, implicit $mode, implicit $exec
+    S_CBRANCH_EXECNZ %bb.1, implicit $exec
+
+  bb.2:
+    S_ENDPGM 0, implicit %7
+
+...
+---
+# Test 2: Verify that we do NOT move values with tied operands
+# V_MAC_F32 has tied def-use, should not be moved
+# CHECK-LABEL: name: test_tied_operand
+# CHECK: bb.1:
+# CHECK: renamable $vgpr3 = V_ADD_F32_e32
+# CHECK-NEXT: renamable $vgpr3 = V_MAC_F32_e32
+# CHECK-NEXT: renamable $vgpr3 = V_MUL_F32_e64
+name:            test_tied_operand
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' }
+  - { id: 1, class: vgpr_32, preferred-register: '$vgpr1' }
+  - { id: 2, class: vgpr_32, preferred-register: '$vgpr3' }
+  - { id: 3, class: vgpr_32, preferred-register: '$vgpr3' }
+  - { id: 4, class: vgpr_32, preferred-register: '$vgpr3' }
+body: |
+  bb.0:
+    successors: %bb.1
+    liveins: $vgpr0, $vgpr1
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+    ; %2, %3 and %4 all prefer vgpr3, but %3 has a tied def operand
+    %2:vgpr_32 = V_ADD_F32_e32 %1, %0, implicit $mode, implicit $exec
+    %3:vgpr_32 = V_MAC_F32_e32 %0, %1, %2, implicit $mode, implicit $exec
+    ; The pass should NOT move %3 because it has a tied def operand
+    %4:vgpr_32 = V_MUL_F32_e64 0, %3, 0, %0, 0, 0, implicit $mode, implicit $exec
+    S_CBRANCH_EXECNZ %bb.1, implicit $exec
+
+  bb.2:
+    S_ENDPGM 0, implicit %4
+
+...
+---
+# Test 3: Verify that we do NOT move when there are no free registers
+# With only 8 VGPRs available and all in use, no moves should happen
+# CHECK-LABEL: name: test_no_free_registers
+# CHECK: bb.1:
+# CHECK: renamable $vgpr3 = V_ADD_F32_e64
+# CHECK: renamable $vgpr3 = V_MUL_F32_e64
+name:            test_no_free_registers
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' }
+  - { id: 1, class: vgpr_32, preferred-register: '$vgpr1' }
+  - { id: 2, class: vgpr_32, preferred-register: '$vgpr2' }
+  - { id: 3, class: vgpr_32, preferred-register: '$vgpr3' }
+  - { id: 4, class: vgpr_32, preferred-register: '$vgpr4' }
+  - { id: 5, class: vgpr_32, preferred-register: '$vgpr5' }
+  - { id: 6, class: vgpr_32, preferred-register: '$vgpr6' }
+  - { id: 7, class: vgpr_32, preferred-register: '$vgpr7' }
+  - { id: 8, class: vgpr_32, preferred-register: '$vgpr3' }
+  - { id: 9, class: vgpr_32, preferred-register: '$vgpr3' }
+body: |
+  bb.0:
+    successors: %bb.1
+    liveins: $vgpr0, $vgpr1
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+    ; Fill up vgpr2-7 to leave no free registers
+    %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %3:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+    %4:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
+    %5:vgpr_32 = V_MOV_B32_e32 3, implicit $exec
+    %6:vgpr_32 = V_MOV_B32_e32 4, implicit $exec
+    %7:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
+
+  bb.1:
+    successors: %bb.1, %bb.2
+    ; With all registers occupied, pass should not move values
+    %8:vgpr_32 = V_ADD_F32_e64 0, %0, 0, %1, 0, 0, implicit $mode, implicit $exec
+    %9:vgpr_32 = V_MUL_F32_e64 0, %8, 0, %1, 0, 0, implicit $mode, implicit $exec
+    S_CBRANCH_EXECNZ %bb.1, implicit $exec
+
+  bb.2:
+    S_ENDPGM 0, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7, implicit %9
+
+...
+

From 8bc7d7aeafe618efaeb93dae5968ec1e06404602 Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov@amd.com>
Date: Mon, 20 Oct 2025 06:52:42 -0500
Subject: [PATCH 2/6] [AMDGPU] Hot block register renaming: address PR review
 feedback

- Rename canMoveValue to isVirtRegMovable for clarity
- Add assertions to verify single-value precondition
- Restore the VRM->getPhys check: it is NOT redundant, because register units
  are shared between aliased registers (e.g. VGPR0 and VGPR0_VGPR1)
- Improve tied operand check to verify tied source register compatibility
---
 .../AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp | 61 +++++++++++++++----
 1 file changed, 49 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp
index bc95ee375d008..a6a7f1362626f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp
@@ -104,9 +104,9 @@ class AMDGPUHotBlockRegisterRenamingImpl {
   bool isSuitableRegister(MCRegister PhysReg) const;
 
   /// Check if a virtual register can be safely moved
-  bool canMoveValue(Register VirtReg, MCRegister CurrentPhysReg,
-                    MCRegister TargetPhysReg, SlotIndex BBStart,
-                    SlotIndex BBEnd);
+  bool isVirtRegMovable(Register VirtReg, MCRegister CurrentPhysReg,
+                        MCRegister TargetPhysReg, SlotIndex BBStart,
+                        SlotIndex BBEnd);
 
   /// Try to move a value from DenseReg to FreeReg
   bool tryMoveValue(MCRegister DenseReg, MCRegister FreeReg,
@@ -363,19 +363,30 @@ void AMDGPUHotBlockRegisterRenamingImpl::findFreeRegisters(
   }
 }
 
-bool AMDGPUHotBlockRegisterRenamingImpl::canMoveValue(Register VirtReg,
-                                                      MCRegister CurrentPhysReg,
-                                                      MCRegister TargetPhysReg,
-                                                      SlotIndex BBStart,
-                                                      SlotIndex BBEnd) {
+bool AMDGPUHotBlockRegisterRenamingImpl::isVirtRegMovable(Register VirtReg,
+                                                          MCRegister CurrentPhysReg,
+                                                          MCRegister TargetPhysReg,
+                                                          SlotIndex BBStart,
+                                                          SlotIndex BBEnd) {
+
+  LiveInterval &VirtRegLI = LIS->getInterval(VirtReg);
+
+  // Verify precondition: single value with single segment in BB
+  unsigned SegmentCount = 0;
+  for (const LiveRange::Segment &S : VirtRegLI) {
+    if (S.start >= BBStart && S.end <= BBEnd)
+      SegmentCount++;
+  }
+  assert(SegmentCount == 1 &&
+         "isVirtRegMovable expects VirtReg with single segment in BB");
+  assert(VirtRegLI.getNumValNums() == 1 &&
+         "isVirtRegMovable expects VirtReg with single value");
 
   // Check for tied operands
   // A tied operand means the instruction requires source and destination to be
   // the same physical register. Moving such a value would break this
   // constraint.
 
-  LiveInterval &VirtRegLI = LIS->getInterval(VirtReg);
-
   for (const LiveRange::Segment &S : VirtRegLI) {
     // Only check segments within this BB
     if (S.start < BBStart || S.end > BBEnd)
@@ -387,8 +398,31 @@ bool AMDGPUHotBlockRegisterRenamingImpl::canMoveValue(Register VirtReg,
     if (!DefMI)
       continue;
 
-    for (const MachineOperand &MO : DefMI->operands()) {
+    for (unsigned OpIdx = 0, E = DefMI->getNumOperands(); OpIdx < E; ++OpIdx) {
+      const MachineOperand &MO = DefMI->getOperand(OpIdx);
       if (MO.isReg() && MO.getReg() == VirtReg && MO.isDef() && MO.isTied()) {
+        // Found a tied def - need to check the source operand it's tied to
+        unsigned TiedIdx = DefMI->findTiedOperandIdx(OpIdx);
+        const MachineOperand &TiedMO = DefMI->getOperand(TiedIdx);
+        
+        // If the tied source is a register, verify it won't conflict
+        if (TiedMO.isReg()) {
+          Register TiedReg = TiedMO.getReg();
+          if (TiedReg.isVirtual()) {
+            MCRegister TiedPhysReg = VRM->getPhys(TiedReg);
+            // Cannot move if it would violate the tied constraint
+            // (source and dest must be in same physical register)
+            if (TiedPhysReg != CurrentPhysReg) {
+              LLVM_DEBUG(dbgs() << "        Cannot move " << printReg(VirtReg, TRI)
+                                << ": tied to " << printReg(TiedReg, TRI)
+                                << " which is in different PhysReg "
+                                << printReg(TiedPhysReg, TRI) << " at " << S.start
+                                << " in " << *DefMI);
+              return false;
+            }
+          }
+        }
+        
         LLVM_DEBUG(dbgs() << "        Cannot move " << printReg(VirtReg, TRI)
                           << ": has tied def at " << S.start << " in "
                           << *DefMI);
@@ -418,6 +452,9 @@ bool AMDGPUHotBlockRegisterRenamingImpl::tryMoveValue(MCRegister DenseReg,
       Register VirtReg = SI.value()->reg();
 
       // Check if this VirtReg is mapped to DenseReg
+      // NOTE: This is NOT redundant! We iterate per register unit, and units
+      // can be shared between aliased registers (e.g., VGPR0 and VGPR0_VGPR1).
+      // This check filters out VirtRegs mapped to aliased registers.
       if (VRM->getPhys(VirtReg) != DenseReg)
         continue;
 
@@ -450,7 +487,7 @@ bool AMDGPUHotBlockRegisterRenamingImpl::tryMoveValue(MCRegister DenseReg,
       }
 
       // Check: Can this value be safely moved?
-      if (!canMoveValue(VirtReg, DenseReg, FreeReg, BBStart, BBEnd)) {
+      if (!isVirtRegMovable(VirtReg, DenseReg, FreeReg, BBStart, BBEnd)) {
         // Cache the result to avoid checking again
         UnmovableVRegs.insert(VirtReg);
         continue;

From 2942c0e82d03d1cf1f69fc1de13435a192b703a3 Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov@amd.com>
Date: Fri, 24 Oct 2025 06:05:38 -0500
Subject: [PATCH 3/6] [AMDGPU] enable Hot Block Register Renaming by default
 for CI/CT run

This flips the default of -amdgpu-enable-hot-block-reg-renaming to true
to exercise the pass across large CI/CT builds. This is a temporary
enablement to flush out issues; users can still disable with
-mllvm -amdgpu-enable-hot-block-reg-renaming=false.
---
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index af0f4e5eca7be..d3c2e0642cf9c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -416,7 +416,7 @@ static cl::opt<bool> EnableRegReassign(
 static cl::opt<bool> EnableHotBlockRegRenaming(
     "amdgpu-enable-hot-block-reg-renaming",
     cl::desc("Enable hot block register renaming to reduce value density"),
-    cl::init(false), cl::Hidden);
+    cl::init(true), cl::Hidden);
 
 static cl::opt<bool> OptVGPRLiveRange(
     "amdgpu-opt-vgpr-liverange",

From 8cc658b32633a7468b6d89007f4ee734d555a720 Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov@amd.com>
Date: Mon, 27 Oct 2025 10:03:39 -0500
Subject: [PATCH 4/6] [AMDGPU] Fix Hot Block Register Renaming assertions on
 complex IR

Fix two assertions discovered during CI/CT testing with rocBLAS kernels:

1. isVirtRegMovable() asserted on virtual registers with several segments in
   the block or several value definitions (e.g. values merged at a PHI).
   Converted the assertions to early-return checks, so the pass skips such
   registers instead of crashing on legitimate IR patterns.

2. tryMoveValue() assumed LiveIntervalUnion contains only virtual registers,
   but it can contain physical registers after allocation. Added an
   isVirtual() check, plus a hasPhys() check for virtual registers without an
   assignment, before calling VirtRegMap::getPhys() to prevent assertion
   failures.

Both fixes improve robustness without affecting correctness or performance.
---
 .../AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp | 27 ++++++++++++++++---
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp
index a6a7f1362626f..820132df8e23c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp
@@ -377,10 +377,21 @@ bool AMDGPUHotBlockRegisterRenamingImpl::isVirtRegMovable(Register VirtReg,
     if (S.start >= BBStart && S.end <= BBEnd)
       SegmentCount++;
   }
-  assert(SegmentCount == 1 &&
-         "isVirtRegMovable expects VirtReg with single segment in BB");
-  assert(VirtRegLI.getNumValNums() == 1 &&
-         "isVirtRegMovable expects VirtReg with single value");
+  
+  // Only registers with exactly one segment fully inside the BB can be moved
+  if (SegmentCount != 1) {
+    LLVM_DEBUG(dbgs() << "        Cannot move " << printReg(VirtReg, TRI)
+                      << ": has " << SegmentCount << " segments in BB\n");
+    return false;
+  }
+  
+  // Cannot move registers with multiple definitions (e.g., from PHI merge)
+  if (VirtRegLI.getNumValNums() != 1) {
+    LLVM_DEBUG(dbgs() << "        Cannot move " << printReg(VirtReg, TRI)
+                      << ": has " << VirtRegLI.getNumValNums() 
+                      << " value definitions\n");
+    return false;
+  }
 
   // Check for tied operands
   // A tied operand means the instruction requires source and destination to be
@@ -451,6 +462,14 @@ bool AMDGPUHotBlockRegisterRenamingImpl::tryMoveValue(MCRegister DenseReg,
     for (LiveIntervalUnion::SegmentIter SI = LIU.begin(); SI.valid(); ++SI) {
       Register VirtReg = SI.value()->reg();
 
+      // Skip physical registers (LiveIntervalUnion can contain both)
+      if (!VirtReg.isVirtual())
+        continue;
+
+      // Skip virtual registers that haven't been allocated yet
+      if (!VRM->hasPhys(VirtReg))
+        continue;
+
       // Check if this VirtReg is mapped to DenseReg
       // NOTE: This is NOT redundant! We iterate per register unit, and units
       // can be shared between aliased registers (e.g., VGPR0 and VGPR0_VGPR1).

From c1606b33afc1317216abed4cefc6f50463efe959 Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov@amd.com>
Date: Thu, 30 Oct 2025 08:48:01 -0500
Subject: [PATCH 5/6] AMDGPU: Fix correctness issues in Hot Block Register
 Renaming pass

Three critical correctness fixes for the Hot Block Register Renaming pass:

Fix #0 (Kernel-Only): Restrict the pass to kernel functions. Post-RA passes
cannot safely modify non-kernel functions because they have no mechanism to
update the RegMask operands of the call instructions in their callers, which
would lead to inter-procedural register corruption.

Fix #1a (Redefinitions): Check that target free register is not redefined by
any instruction within the virtual register's live range. Without this check,
moving a value to a register that gets overwritten mid-range causes segfaults.

Fix #1b (Call Clobbers): Use LiveIntervals::checkRegMaskInterference() to
verify that target register is not clobbered by any call instruction within
the live range. Prevents incorrect register assignments across function calls.

All fixes verified on aomp-complex test case (segfault fixed) and rocRAND
MTGP32 kernel (117 values remapped, original optimization preserved).
---
 .../AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp | 85 +++++++++++++++++--
 1 file changed, 78 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp
index 820132df8e23c..c4c16c56f17c8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp
@@ -44,6 +44,7 @@
 #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/CallingConv.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -57,6 +58,8 @@ STATISTIC(NumBlocksProcessed, "Number of hot blocks processed");
 STATISTIC(NumValuesRemapped, "Number of values remapped to reduce density");
 STATISTIC(NumBlocksSkipped,
           "Number of blocks skipped (no dense regs or no free regs)");
+STATISTIC(NumNonKernelsSkipped,
+          "Number of non-kernel functions skipped for safety");
 
 namespace {
 
@@ -110,7 +113,8 @@ class AMDGPUHotBlockRegisterRenamingImpl {
 
   /// Try to move a value from DenseReg to FreeReg
   bool tryMoveValue(MCRegister DenseReg, MCRegister FreeReg,
-                    MachineBasicBlock *MBB, SlotIndex BBStart, SlotIndex BBEnd);
+                    MachineBasicBlock *MBB, SlotIndex BBStart, SlotIndex BBEnd,
+                    const DenseMap<MCRegister, SmallVector<SlotIndex, 4>> &PhysRegDefs);
 };
 
 class AMDGPUHotBlockRegisterRenamingLegacy : public MachineFunctionPass {
@@ -173,6 +177,17 @@ bool AMDGPUHotBlockRegisterRenamingImpl::run(MachineFunction &MF) {
   LLVM_DEBUG(dbgs() << "AMDGPUHotBlockRegisterRenaming: Processing "
                     << MF.getName() << "\n");
 
+  // Skip non-kernel functions to avoid RegMask corruption issues. A post-RA
+  // pass cannot update the RegMask operands of call instructions in callers,
+  // which would lead to incorrect assumptions about clobbered registers.
+  CallingConv::ID CC = MF.getFunction().getCallingConv();
+  if (CC != CallingConv::AMDGPU_KERNEL) {
+    LLVM_DEBUG(dbgs() << "  Skipping non-kernel function (CC=" << CC
+                      << "): Post-RA pass cannot safely modify callees\n");
+    ++NumNonKernelsSkipped;
+    return false;
+  }
+
   TRI = ST->getRegisterInfo();
   MRI = &MF.getRegInfo();
 
@@ -236,6 +251,33 @@ bool AMDGPUHotBlockRegisterRenamingImpl::processBasicBlock(
                     << " registers with values, " << FreeRegs.size()
                     << " free registers\n");
 
+  // Step 2a: Build the PhysReg definitions cache.
+  // For each VGPR_32 def, record its SlotIndex for the register and superregs.
+  const TargetRegisterClass *VGPR_32_RC =
+      TRI->getRegClass(AMDGPU::VGPR_32RegClassID);
+  DenseMap<MCRegister, SmallVector<SlotIndex, 4>> PhysRegDefs;
+  
+  for (MachineInstr &MI : *MBB) {
+    SlotIndex Idx = LIS->getInstructionIndex(MI);
+    for (const MachineOperand &MO : MI.operands()) {
+      if (MO.isReg() && MO.isDef() && MO.getReg().isPhysical()) {
+        MCRegister PhysReg = MO.getReg();
+        if (VGPR_32_RC->contains(PhysReg)) {
+          PhysRegDefs[PhysReg].push_back(Idx);
+          // Also track superregs for aliasing
+          for (MCRegister Super : TRI->superregs(PhysReg)) {
+            PhysRegDefs[Super].push_back(Idx);
+          }
+        }
+      }
+    }
+  }
+
+  LLVM_DEBUG({
+    dbgs() << "    Built PhysRegDefs cache: " << PhysRegDefs.size() 
+           << " registers have definitions in this BB\n";
+  });
+
   // Step 3: Create max heap of dense registers
   auto Comparator = [&ValueDensity](MCRegister A, MCRegister B) {
     return ValueDensity[A] < ValueDensity[B]; // max heap
@@ -266,7 +308,7 @@ bool AMDGPUHotBlockRegisterRenamingImpl::processBasicBlock(
 
     MCRegister FreeReg = FreeRegs[FreeRegIdx++];
 
-    if (tryMoveValue(DenseReg, FreeReg, MBB, BBStart, BBEnd)) {
+    if (tryMoveValue(DenseReg, FreeReg, MBB, BBStart, BBEnd, PhysRegDefs)) {
       Changed = true;
       ++NumValuesRemapped;
 
@@ -450,11 +492,10 @@ bool AMDGPUHotBlockRegisterRenamingImpl::isVirtRegMovable(Register VirtReg,
   return true;
 }
 
-bool AMDGPUHotBlockRegisterRenamingImpl::tryMoveValue(MCRegister DenseReg,
-                                                      MCRegister FreeReg,
-                                                      MachineBasicBlock *MBB,
-                                                      SlotIndex BBStart,
-                                                      SlotIndex BBEnd) {
+bool AMDGPUHotBlockRegisterRenamingImpl::tryMoveValue(
+    MCRegister DenseReg, MCRegister FreeReg, MachineBasicBlock *MBB,
+    SlotIndex BBStart, SlotIndex BBEnd,
+    const DenseMap<MCRegister, SmallVector<SlotIndex, 4>> &PhysRegDefs) {
   // Find a movable local value in DenseReg
   for (MCRegUnit Unit : TRI->regunits(DenseReg)) {
     LiveIntervalUnion &LIU = LRM->getLiveUnions()[Unit];
@@ -512,6 +553,36 @@ bool AMDGPUHotBlockRegisterRenamingImpl::tryMoveValue(MCRegister DenseReg,
         continue;
       }
 
+      // Check that FreeReg is not redefined inside VirtReg's live range.
+      auto DefIt = PhysRegDefs.find(FreeReg);
+      if (DefIt != PhysRegDefs.end()) {
+        bool HasConflict = false;
+        for (SlotIndex DefIdx : DefIt->second) {
+          // Check if definition is strictly inside the live range (not at endpoints)
+          if (DefIdx > SegStart && DefIdx < SegEnd) {
+            LLVM_DEBUG(dbgs() << "        Cannot move to " << printReg(FreeReg, TRI)
+                              << ": redefined at " << DefIdx << " inside live range ["
+                              << SegStart << ", " << SegEnd << ")\n");
+            HasConflict = true;
+            break;
+          }
+        }
+        if (HasConflict)
+          continue;  // Try next VirtReg
+      }
+
+      // Check that FreeReg is not clobbered by any call within the live range.
+      BitVector UsableRegs;
+      if (LIS->checkRegMaskInterference(VirtRegLI, UsableRegs)) {
+        // checkRegMaskInterference returns true if LI crosses RegMask instructions
+        // UsableRegs now contains registers NOT clobbered by any RegMask
+        if (!UsableRegs.test(FreeReg)) {
+          LLVM_DEBUG(dbgs() << "        Cannot move to " << printReg(FreeReg, TRI)
+                            << ": clobbered by call RegMask in live range\n");
+          continue;  // Try next VirtReg
+        }
+      }
+
       // This VirtReg is movable! Perform the remap
       LLVM_DEBUG(dbgs() << "      Moving " << printReg(VirtReg, TRI) << " from "
                         << printReg(DenseReg, TRI) << " to "

From ba382b7b2f299e112d06c4e735a10f460ddae426 Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov@amd.com>
Date: Mon, 3 Nov 2025 14:40:35 -0600
Subject: [PATCH 6/6] [TEMP] Fix LiveRegMatrix dangling pointers in
 postOptimization

---
 llvm/include/llvm/CodeGen/LiveRangeEdit.h | 17 +++++++++++++++++
 llvm/lib/CodeGen/InlineSpiller.cpp        | 20 +++++++++++++++++---
 llvm/lib/CodeGen/LiveRangeEdit.cpp        |  6 ++++++
 3 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/LiveRangeEdit.h b/llvm/include/llvm/CodeGen/LiveRangeEdit.h
index d0ed3ff660d9b..658231c31481c 100644
--- a/llvm/include/llvm/CodeGen/LiveRangeEdit.h
+++ b/llvm/include/llvm/CodeGen/LiveRangeEdit.h
@@ -20,6 +20,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/LiveInterval.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
@@ -58,6 +59,11 @@ class LiveRangeEdit : private MachineRegisterInfo::Delegate {
     /// Called before shrinking the live range of a virtual register.
     virtual void LRE_WillShrinkVirtReg(Register) {}
 
+    /// Called before a def is removed from a virtual register whose
+    /// LiveInterval may become empty as a result. Implementations should
+    /// unassign it from LiveRegMatrix while the interval is still populated.
+    virtual void LRE_WillClearVirtReg(Register, LiveInterval &) {}
+
     /// Called after cloning a virtual register.
     /// This is used for new registers representing connected components of Old.
     virtual void LRE_DidCloneVirtReg(Register New, Register Old) {}
@@ -75,6 +81,11 @@ class LiveRangeEdit : private MachineRegisterInfo::Delegate {
   /// FirstNew - Index of the first register added to NewRegs.
   const unsigned FirstNew;
 
+  /// Virtual registers created during this edit; populated by
+  /// MRI_NoteNewVirtualRegister. Used to avoid calling Matrix->unassign on
+  /// registers that were never added to LiveRegMatrix.
+  SmallSet<Register, 8> NewVirtRegs;
+
   /// DeadRemats - The saved instructions which have already been dead after
   /// rematerialization but not deleted yet -- to be done in postOptimization.
   SmallPtrSet<MachineInstr *, 32> *DeadRemats;
@@ -142,6 +153,12 @@ class LiveRangeEdit : private MachineRegisterInfo::Delegate {
   bool empty() const { return size() == 0; }
   Register get(unsigned idx) const { return NewRegs[idx + FirstNew]; }
 
+  /// Return true if \p VReg was recorded in NewVirtRegs, i.e. created during
+  /// this edit. Callers use it to skip registers never added to LiveRegMatrix.
+  bool isNewVirtualRegister(Register VReg) const {
+    return NewVirtRegs.contains(VReg);
+  }
+
   /// pop_back - It allows LiveRangeEdit users to drop new registers.
   /// The context is when an original def instruction of a register is
   /// dead after rematerialization, we still want to keep it for following
diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp
index c3e0964594bd5..006a2a4bff0f3 100644
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -86,6 +86,7 @@ class HoistSpillHelper : private LiveRangeEdit::Delegate {
   const TargetInstrInfo &TII;
   const TargetRegisterInfo &TRI;
   const MachineBlockFrequencyInfo &MBFI;
+  LiveRegMatrix *Matrix;
 
   InsertPointAnalysis IPA;
 
@@ -129,16 +130,18 @@ class HoistSpillHelper : private LiveRangeEdit::Delegate {
 
 public:
   HoistSpillHelper(const Spiller::RequiredAnalyses &Analyses,
-                   MachineFunction &mf, VirtRegMap &vrm)
+                   MachineFunction &mf, VirtRegMap &vrm,
+                   LiveRegMatrix *matrix = nullptr)
       : MF(mf), LIS(Analyses.LIS), LSS(Analyses.LSS), MDT(Analyses.MDT),
         VRM(vrm), MRI(mf.getRegInfo()), TII(*mf.getSubtarget().getInstrInfo()),
         TRI(*mf.getSubtarget().getRegisterInfo()), MBFI(Analyses.MBFI),
-        IPA(LIS, mf.getNumBlockIDs()) {}
+        Matrix(matrix), IPA(LIS, mf.getNumBlockIDs()) {}
 
   void addToMergeableSpills(MachineInstr &Spill, int StackSlot,
                             Register Original);
   bool rmFromMergeableSpills(MachineInstr &Spill, int StackSlot);
   void hoistAllSpills();
+  void LRE_WillClearVirtReg(Register, LiveInterval &) override;
   void LRE_DidCloneVirtReg(Register, Register) override;
 };
 
@@ -191,7 +194,7 @@ class InlineSpiller : public Spiller {
       : MF(MF), LIS(Analyses.LIS), LSS(Analyses.LSS), VRM(VRM),
         MRI(MF.getRegInfo()), TII(*MF.getSubtarget().getInstrInfo()),
         TRI(*MF.getSubtarget().getRegisterInfo()), Matrix(Matrix),
-        HSpiller(Analyses, MF, VRM), VRAI(VRAI) {}
+        HSpiller(Analyses, MF, VRM, Matrix), VRAI(VRAI) {}
 
   void spill(LiveRangeEdit &, AllocationOrder *Order = nullptr) override;
   ArrayRef<Register> getSpilledRegs() override { return RegsToSpill; }
@@ -1750,6 +1753,17 @@ void HoistSpillHelper::hoistAllSpills() {
   }
 }
 
+/// Called when a LiveInterval is about to be cleared by removeVRegDefAt.
+/// Unassigns it from LiveRegMatrix (LLVM bug #48911); no-op without a Matrix.
+void HoistSpillHelper::LRE_WillClearVirtReg(Register VirtReg,
+                                             LiveInterval &LI) {
+  // If this virtual register is assigned to a physical register, unassign it
+  // from LiveRegMatrix before the interval is cleared. Otherwise the
+  // LiveIntervalUnion would be left holding dangling pointers.
+  if (Matrix && VRM.hasPhys(VirtReg))
+    Matrix->unassign(LI);
+}
+
 /// For VirtReg clone, the \p New register should have the same physreg or
 /// stackslot as the \p old register.
 void HoistSpillHelper::LRE_DidCloneVirtReg(Register New, Register Old) {
diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp
index 5b0365da4e8c6..fbf5da3c58366 100644
--- a/llvm/lib/CodeGen/LiveRangeEdit.cpp
+++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp
@@ -268,6 +268,11 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) {
     if (MO.isDef()) {
       if (TheDelegate && LI.getVNInfoAt(Idx) != nullptr)
         TheDelegate->LRE_WillShrinkVirtReg(LI.reg());
+      // Notify delegate BEFORE clearing if this will make the interval empty.
+      // This allows Matrix->unassign to work with non-empty LiveRange.
+      // Skip new registers (clones) - they were never added to LiveRegMatrix.
+      if (TheDelegate && LI.size() == 1 && !isNewVirtualRegister(Reg))
+        TheDelegate->LRE_WillClearVirtReg(Reg, LI);
       LIS.removeVRegDefAt(LI, Idx);
       if (LI.empty())
         RegsToErase.push_back(Reg);
@@ -398,6 +403,7 @@ LiveRangeEdit::MRI_NoteNewVirtualRegister(Register VReg) {
     VRM->grow();
 
   NewRegs.push_back(VReg);
+  NewVirtRegs.insert(VReg);
 }
 
 void LiveRangeEdit::calculateRegClassAndHint(MachineFunction &MF,