
Commit 730920c

Refactored canEvaluateShifted to identify candidates for
simplification.
1 parent 8ea6668 commit 730920c

2 files changed: +207 -86 lines changed


llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp

Lines changed: 114 additions & 72 deletions
@@ -530,112 +530,159 @@ Instruction *InstCombinerImpl::commonShiftTransforms(BinaryOperator &I) {
   return nullptr;
 }

-/// Return true if we can simplify two logical (either left or right) shifts
-/// that have constant shift amounts: OuterShift (InnerShift X, C1), C2.
-static bool canEvaluateShiftedShift(unsigned OuterShAmt, bool IsOuterShl,
-                                    Instruction *InnerShift,
-                                    InstCombinerImpl &IC, Instruction *CxtI) {
+/// Return a bitmask of all constant outer shift amounts that can be simplified
+/// by foldShiftedShift().
+static APInt getEvaluableShiftedShiftMask(bool IsOuterShl,
+                                          Instruction *InnerShift,
+                                          InstCombinerImpl &IC,
+                                          Instruction *CxtI) {
   assert(InnerShift->isLogicalShift() && "Unexpected instruction type");

+  const unsigned TypeWidth = InnerShift->getType()->getScalarSizeInBits();
+
   // We need constant scalar or constant splat shifts.
   const APInt *InnerShiftConst;
   if (!match(InnerShift->getOperand(1), m_APInt(InnerShiftConst)))
-    return false;
+    return APInt::getZero(TypeWidth);

-  // Two logical shifts in the same direction:
+  if (InnerShiftConst->uge(TypeWidth))
+    return APInt::getZero(TypeWidth);
+
+  const unsigned InnerShAmt = InnerShiftConst->getZExtValue();
+
+  // Two logical shifts in the same direction can always be simplified, so long
+  // as the total shift amount is legal.
   // shl (shl X, C1), C2 --> shl X, C1 + C2
   // lshr (lshr X, C1), C2 --> lshr X, C1 + C2
   bool IsInnerShl = InnerShift->getOpcode() == Instruction::Shl;
   if (IsInnerShl == IsOuterShl)
-    return true;
+    return APInt::getLowBitsSet(TypeWidth, TypeWidth - InnerShAmt);

+  APInt ShMask = APInt::getZero(TypeWidth);
   // Equal shift amounts in opposite directions become bitwise 'and':
   // lshr (shl X, C), C --> and X, C'
   // shl (lshr X, C), C --> and X, C'
-  if (*InnerShiftConst == OuterShAmt)
-    return true;
+  ShMask.setBit(InnerShAmt);

-  // If the 2nd shift is bigger than the 1st, we can fold:
+  // If the inner shift is bigger than the outer, we can fold:
   // lshr (shl X, C1), C2 --> and (shl X, C1 - C2), C3
   // shl (lshr X, C1), C2 --> and (lshr X, C1 - C2), C3
-  // but it isn't profitable unless we know the and'd out bits are already zero.
-  // Also, check that the inner shift is valid (less than the type width) or
-  // we'll crash trying to produce the bit mask for the 'and'.
-  unsigned TypeWidth = InnerShift->getType()->getScalarSizeInBits();
-  if (InnerShiftConst->ugt(OuterShAmt) && InnerShiftConst->ult(TypeWidth)) {
-    unsigned InnerShAmt = InnerShiftConst->getZExtValue();
-    unsigned MaskShift =
-        IsInnerShl ? TypeWidth - InnerShAmt : InnerShAmt - OuterShAmt;
-    APInt Mask = APInt::getLowBitsSet(TypeWidth, OuterShAmt) << MaskShift;
-    if (IC.MaskedValueIsZero(InnerShift->getOperand(0), Mask, CxtI))
-      return true;
-  }
-
-  return false;
+  // but it isn't profitable unless we know the masked out bits are already
+  // zero.
+  KnownBits Known = IC.computeKnownBits(InnerShift->getOperand(0), CxtI);
+  // Isolate the bits that are annihilated by the inner shift.
+  APInt InnerShMask = IsInnerShl ? Known.Zero.lshr(TypeWidth - InnerShAmt)
+                                 : Known.Zero.trunc(InnerShAmt);
+  // Isolate the upper (resp. lower) InnerShAmt bits of the base operand of the
+  // inner shl (resp. lshr).
+  // Then:
+  // - lshr (shl X, C1), C2 == (shl X, C1 - C2) if the bottom C2 of the isolated
+  //   bits are zero
+  // - shl (lshr X, C1), C2 == (lshr X, C1 - C2) if the top C2 of the isolated
+  //   bits are zero
+  const unsigned MaxOuterShAmt =
+      IsInnerShl ? Known.Zero.lshr(TypeWidth - InnerShAmt).countr_one()
+                 : Known.Zero.trunc(InnerShAmt).countl_one();
+  ShMask.setLowBits(MaxOuterShAmt);
+  return ShMask;
 }

-/// See if we can compute the specified value, but shifted logically to the left
-/// or right by some number of bits. This should return true if the expression
-/// can be computed for the same cost as the current expression tree. This is
-/// used to eliminate extraneous shifting from things like:
-///   %C = shl i128 %A, 64
-///   %D = shl i128 %B, 96
-///   %E = or i128 %C, %D
-///   %F = lshr i128 %E, 64
-/// where the client will ask if E can be computed shifted right by 64-bits. If
-/// this succeeds, getShiftedValue() will be called to produce the value.
-static bool canEvaluateShifted(Value *V, unsigned NumBits, bool IsLeftShift,
-                               InstCombinerImpl &IC, Instruction *CxtI) {
+/// Given a bitmask \p ShiftMask of desired shift amounts, determine the submask
+/// of bits corresponding to shift amounts X for which the given expression \p V
+/// can be computed for at worst the same cost as the current expression tree
+/// when shifted by X. For each set bit in the \p ShiftMask afterward,
+/// getShiftedValue() can produce the corresponding value.
+///
+/// \returns true if and only if at least one bit of the \p ShiftMask is set
+/// after refinement.
+static bool refineEvaluableShiftMask(Value *V, APInt &ShiftMask,
+                                     bool IsLeftShift, InstCombinerImpl &IC,
+                                     Instruction *CxtI) {
   // We can always evaluate immediate constants.
   if (match(V, m_ImmConstant()))
     return true;

   Instruction *I = dyn_cast<Instruction>(V);
-  if (!I) return false;
+  if (!I) {
+    ShiftMask.clearAllBits();
+    return false;
+  }

   // We can't mutate something that has multiple uses: doing so would
   // require duplicating the instruction in general, which isn't profitable.
-  if (!I->hasOneUse()) return false;
+  if (!I->hasOneUse()) {
+    ShiftMask.clearAllBits();
+    return false;
+  }

   switch (I->getOpcode()) {
-  default: return false;
+  default: {
+    ShiftMask.clearAllBits();
+    return false;
+  }
   case Instruction::And:
   case Instruction::Or:
   case Instruction::Xor:
-    // Bitwise operators can all arbitrarily be arbitrarily evaluated shifted.
-    return canEvaluateShifted(I->getOperand(0), NumBits, IsLeftShift, IC, I) &&
-           canEvaluateShifted(I->getOperand(1), NumBits, IsLeftShift, IC, I);
+    return refineEvaluableShiftMask(I->getOperand(0), ShiftMask, IsLeftShift,
+                                    IC, I) &&
+           refineEvaluableShiftMask(I->getOperand(1), ShiftMask, IsLeftShift,
+                                    IC, I);

   case Instruction::Shl:
-  case Instruction::LShr:
-    return canEvaluateShiftedShift(NumBits, IsLeftShift, I, IC, CxtI);
+  case Instruction::LShr: {
+    ShiftMask &= getEvaluableShiftedShiftMask(IsLeftShift, I, IC, CxtI);
+    return !ShiftMask.isZero();
+  }

   case Instruction::Select: {
     SelectInst *SI = cast<SelectInst>(I);
     Value *TrueVal = SI->getTrueValue();
     Value *FalseVal = SI->getFalseValue();
-    return canEvaluateShifted(TrueVal, NumBits, IsLeftShift, IC, SI) &&
-           canEvaluateShifted(FalseVal, NumBits, IsLeftShift, IC, SI);
+    return refineEvaluableShiftMask(TrueVal, ShiftMask, IsLeftShift, IC, SI) &&
+           refineEvaluableShiftMask(FalseVal, ShiftMask, IsLeftShift, IC, SI);
   }
   case Instruction::PHI: {
     // We can change a phi if we can change all operands. Note that we never
     // get into trouble with cyclic PHIs here because we only consider
     // instructions with a single use.
     PHINode *PN = cast<PHINode>(I);
     for (Value *IncValue : PN->incoming_values())
-      if (!canEvaluateShifted(IncValue, NumBits, IsLeftShift, IC, PN))
+      if (!refineEvaluableShiftMask(IncValue, ShiftMask, IsLeftShift, IC, PN))
         return false;
     return true;
   }
   case Instruction::Mul: {
     const APInt *MulConst;
     // We can fold (shr (mul X, -(1 << C)), C) -> (and (neg X), C`)
-    return !IsLeftShift && match(I->getOperand(1), m_APInt(MulConst)) &&
-           MulConst->isNegatedPowerOf2() && MulConst->countr_zero() == NumBits;
+    if (IsLeftShift || !match(I->getOperand(1), m_APInt(MulConst)) ||
+        !MulConst->isNegatedPowerOf2()) {
+      ShiftMask.clearAllBits();
+      return false;
+    }
+    ShiftMask &=
+        APInt::getOneBitSet(ShiftMask.getBitWidth(), MulConst->countr_zero());
+    return !ShiftMask.isZero();
   }
   }
 }

+/// See if we can compute the specified value, but shifted logically to the left
+/// or right by some number of bits. This should return true if the expression
+/// can be computed for the same cost as the current expression tree. This is
+/// used to eliminate extraneous shifting from things like:
+///   %C = shl i128 %A, 64
+///   %D = shl i128 %B, 96
+///   %E = or i128 %C, %D
+///   %F = lshr i128 %E, 64
+/// where the client will ask if E can be computed shifted right by 64-bits. If
+/// this succeeds, getShiftedValue() will be called to produce the value.
+static bool canEvaluateShifted(Value *V, unsigned ShAmt, bool IsLeftShift,
+                               InstCombinerImpl &IC, Instruction *CxtI) {
+  APInt ShiftMask =
+      APInt::getOneBitSet(V->getType()->getScalarSizeInBits(), ShAmt);
+  return refineEvaluableShiftMask(V, ShiftMask, IsLeftShift, IC, CxtI);
+}
+
 /// Fold OuterShift (InnerShift X, C1), C2.
 /// See canEvaluateShiftedShift() for the constraints on these instructions.
 static Value *foldShiftedShift(BinaryOperator *InnerShift, unsigned OuterShAmt,
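The hunk above turns the old yes/no query ("can this value be recomputed shifted by exactly N bits?") into a bitmask of candidate shift amounts that is narrowed while recursing through the operand tree, so a caller can pick any surviving amount. Below is a standalone sketch of that idea, with a toy Expr type and uint64_t standing in for APInt; all names are invented for illustration and this is not the LLVM implementation.

// Minimal model: bit i of the mask set  <=>  the expression can be recomputed
// shifted left by i at no extra cost, cf. refineEvaluableShiftMask().
#include <cstdint>
#include <vector>

struct Expr {
  enum Kind { Const, BitwiseOr, ShlOverLShr } kind;
  unsigned lshrAmt = 0;        // inner lshr amount, used by ShlOverLShr
  std::vector<Expr> ops;       // operands, used by BitwiseOr
};

uint64_t refineMask(const Expr &e, uint64_t mask) {
  switch (e.kind) {
  case Expr::Const:
    return mask;               // constants can be shifted for free
  case Expr::BitwiseOr:        // bitwise ops: every operand must agree,
    for (const Expr &op : e.ops)
      mask = refineMask(op, mask); // so the masks are intersected
    return mask;
  case Expr::ShlOverLShr:
    // shl (lshr X, C), C --> and X, C': only the amount C survives here
    // (the real getEvaluableShiftedShiftMask widens this using known bits).
    return mask & (uint64_t{1} << e.lshrAmt);
  }
  return 0;
}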
@@ -985,37 +1032,32 @@ static Instruction *foldShrThroughZExtedShl(BinaryOperator &I, Value *Op,
                                             InstCombinerImpl &IC,
                                             const DataLayout &DL) {
   Type *DestTy = I.getType();
+  const unsigned InnerBitWidth = Op->getType()->getScalarSizeInBits();

-  auto *Inner = dyn_cast<Instruction>(Op);
-  if (!Inner)
+  // Determine if the operand is effectively right-shifted by counting the
+  // known leading zero bits.
+  KnownBits Known = IC.computeKnownBits(Op, nullptr);
+  const unsigned MaxInnerShrAmt = Known.countMinLeadingZeros();
+  if (MaxInnerShrAmt == 0)
     return nullptr;
+  APInt ShrMask =
+      APInt::getLowBitsSet(InnerBitWidth, std::min(MaxInnerShrAmt, ShlAmt) + 1);

-  // Dig through operations until the first shift.
-  while (!Inner->isShift())
-    if (!match(Inner, m_BinOp(m_OneUse(m_Instruction(Inner)), m_Constant())))
-      return nullptr;
-
-  // Fold only if the inner shift is a logical right-shift.
-  const APInt *InnerShrConst;
-  if (!match(Inner, m_LShr(m_Value(), m_APInt(InnerShrConst))))
+  // Undo the maximal inner right shift amount that simplifies the overall
+  // computation.
+  if (!refineEvaluableShiftMask(Op, ShrMask, /*IsLeftShift=*/true, IC, nullptr))
     return nullptr;

-  const uint64_t InnerShrAmt = InnerShrConst->getZExtValue();
-  if (InnerShrAmt >= ShlAmt) {
-    const uint64_t ReducedShrAmt = InnerShrAmt - ShlAmt;
-    if (!canEvaluateShifted(Op, ReducedShrAmt, /*IsLeftShift=*/false, IC,
-                            nullptr))
-      return nullptr;
-    Value *NewOp =
-        getShiftedValue(Op, ReducedShrAmt, /*isLeftShift=*/false, IC, DL);
-    return new ZExtInst(NewOp, DestTy);
-  }
-
-  if (!canEvaluateShifted(Op, InnerShrAmt, /*IsLeftShift=*/true, IC, nullptr))
+  const unsigned InnerShrAmt = ShrMask.getActiveBits() - 1;
+  if (InnerShrAmt == 0)
     return nullptr;
+  assert(InnerShrAmt <= ShlAmt);

   const uint64_t ReducedShlAmt = ShlAmt - InnerShrAmt;
   Value *NewOp = getShiftedValue(Op, InnerShrAmt, /*isLeftShift=*/true, IC, DL);
+  if (ReducedShlAmt == 0)
+    return new ZExtInst(NewOp, DestTy);
+
   Value *NewZExt = IC.Builder.CreateZExt(NewOp, DestTy);
   NewZExt->takeName(I.getOperand(0));
   auto *NewShl = BinaryOperator::CreateShl(
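The rewritten foldShrThroughZExtedShl above drives the same machinery from known bits: the operand's known leading zeros bound the inner lshr amount that can be undone, a low-bits mask of candidate amounts is refined by refineEvaluableShiftMask, and the highest surviving bit decides how much of the outer shl remains. The following plain-integer sketch walks that arithmetic with the numbers from the masked tests in the updated test file (lshr by 4, and with 0xff, zext, shl by 48); it is a simplified model, not the LLVM code.

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  // Operand: and (lshr i32 %x, 4), 0xff  --  at least 24 known leading zeros.
  const unsigned ShlAmt = 48;          // outer shl applied after the zext
  const unsigned MaxInnerShrAmt = 24;  // stand-in for Known.countMinLeadingZeros()

  // Candidate "undo" amounts: the low min(MaxInnerShrAmt, ShlAmt) + 1 bits.
  uint32_t ShrMask = (1u << (std::min(MaxInnerShrAmt, ShlAmt) + 1)) - 1;

  // Refinement keeps only amounts every operand can absorb; for the inner
  // lshr by 4 (with unknown low bits of %x) that is just the bit at index 4.
  ShrMask &= 1u << 4;

  // Index of the highest surviving bit, cf. ShrMask.getActiveBits() - 1.
  unsigned InnerShrAmt = 0;
  for (unsigned i = 0; i < 32; ++i)
    if (ShrMask & (1u << i))
      InnerShrAmt = i;

  const unsigned ReducedShlAmt = ShlAmt - InnerShrAmt;
  // Prints "undo lshr by 4, remaining shl by 44", matching the
  // "and ..., 4080" and "shl ..., 44" CHECK lines in the tests below.
  std::printf("undo lshr by %u, remaining shl by %u\n", InnerShrAmt, ReducedShlAmt);
}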

llvm/test/Transforms/InstCombine/shifts-around-zext.ll

Lines changed: 93 additions & 14 deletions
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -passes=instcombine %s | FileCheck %s

+declare void @clobber.i32(i32)
+
 define i64 @simple(i32 %x) {
 ; CHECK-LABEL: define i64 @simple(
 ; CHECK-SAME: i32 [[X:%.*]]) {
@@ -15,6 +17,20 @@ define i64 @simple(i32 %x) {
   ret i64 %shl
 }

+define <2 x i64> @simple.vec(<2 x i32> %v) {
+; CHECK-LABEL: define <2 x i64> @simple.vec(
+; CHECK-SAME: <2 x i32> [[V:%.*]]) {
+; CHECK-NEXT:    [[LSHR:%.*]] = and <2 x i32> [[V]], splat (i32 -256)
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext <2 x i32> [[LSHR]] to <2 x i64>
+; CHECK-NEXT:    [[SHL:%.*]] = shl nuw nsw <2 x i64> [[ZEXT]], splat (i64 24)
+; CHECK-NEXT:    ret <2 x i64> [[SHL]]
+;
+  %lshr = lshr <2 x i32> %v, splat(i32 8)
+  %zext = zext <2 x i32> %lshr to <2 x i64>
+  %shl = shl <2 x i64> %zext, splat(i64 32)
+  ret <2 x i64> %shl
+}
+
 ;; u0xff0 = 4080
 define i64 @masked(i32 %x) {
 ; CHECK-LABEL: define i64 @masked(
@@ -31,6 +47,83 @@ define i64 @masked(i32 %x) {
   ret i64 %shl
 }

+define i64 @masked.multi_use.0(i32 %x) {
+; CHECK-LABEL: define i64 @masked.multi_use.0(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[LSHR:%.*]] = lshr i32 [[X]], 4
+; CHECK-NEXT:    call void @clobber.i32(i32 [[LSHR]])
+; CHECK-NEXT:    [[MASK:%.*]] = and i32 [[LSHR]], 255
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext nneg i32 [[MASK]] to i64
+; CHECK-NEXT:    [[SHL:%.*]] = shl nuw nsw i64 [[ZEXT]], 48
+; CHECK-NEXT:    ret i64 [[SHL]]
+;
+  %lshr = lshr i32 %x, 4
+  call void @clobber.i32(i32 %lshr)
+  %mask = and i32 %lshr, u0xff
+  %zext = zext i32 %mask to i64
+  %shl = shl i64 %zext, 48
+  ret i64 %shl
+}
+
+define i64 @masked.multi_use.1(i32 %x) {
+; CHECK-LABEL: define i64 @masked.multi_use.1(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[LSHR:%.*]] = lshr i32 [[X]], 4
+; CHECK-NEXT:    [[MASK:%.*]] = and i32 [[LSHR]], 255
+; CHECK-NEXT:    call void @clobber.i32(i32 [[MASK]])
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext nneg i32 [[MASK]] to i64
+; CHECK-NEXT:    [[SHL:%.*]] = shl nuw nsw i64 [[ZEXT]], 48
+; CHECK-NEXT:    ret i64 [[SHL]]
+;
+  %lshr = lshr i32 %x, 4
+  %mask = and i32 %lshr, u0xff
+  call void @clobber.i32(i32 %mask)
+  %zext = zext i32 %mask to i64
+  %shl = shl i64 %zext, 48
+  ret i64 %shl
+}
+
+define <2 x i64> @masked.multi_use.2(i32 %x) {
+; CHECK-LABEL: define <2 x i64> @masked.multi_use.2(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[LSHR:%.*]] = lshr i32 [[X]], 4
+; CHECK-NEXT:    [[MASK:%.*]] = and i32 [[LSHR]], 255
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext nneg i32 [[MASK]] to i64
+; CHECK-NEXT:    [[SHL:%.*]] = shl nuw nsw i64 [[ZEXT]], 48
+; CHECK-NEXT:    [[CLOBBER:%.*]] = xor i32 [[MASK]], 255
+; CHECK-NEXT:    [[CLOBBER_Z:%.*]] = zext nneg i32 [[CLOBBER]] to i64
+; CHECK-NEXT:    [[V_0:%.*]] = insertelement <2 x i64> poison, i64 [[SHL]], i64 0
+; CHECK-NEXT:    [[V_1:%.*]] = insertelement <2 x i64> [[V_0]], i64 [[CLOBBER_Z]], i64 1
+; CHECK-NEXT:    ret <2 x i64> [[V_1]]
+;
+  %lshr = lshr i32 %x, 4
+  %mask = and i32 %lshr, u0xff
+  %zext = zext i32 %mask to i64
+  %shl = shl i64 %zext, 48
+
+  %clobber = xor i32 %mask, u0xff
+  %clobber.z = zext i32 %clobber to i64
+  %v.0 = insertelement <2 x i64> poison, i64 %shl, i32 0
+  %v.1 = insertelement <2 x i64> %v.0, i64 %clobber.z, i32 1
+  ret <2 x i64> %v.1
+}
+
+;; u0xff0 = 4080
+define <2 x i64> @masked.vec(<2 x i32> %v) {
+; CHECK-LABEL: define <2 x i64> @masked.vec(
+; CHECK-SAME: <2 x i32> [[V:%.*]]) {
+; CHECK-NEXT:    [[MASK:%.*]] = and <2 x i32> [[V]], splat (i32 4080)
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext nneg <2 x i32> [[MASK]] to <2 x i64>
+; CHECK-NEXT:    [[SHL:%.*]] = shl nuw nsw <2 x i64> [[ZEXT]], splat (i64 44)
+; CHECK-NEXT:    ret <2 x i64> [[SHL]]
+;
+  %lshr = lshr <2 x i32> %v, splat(i32 4)
+  %mask = and <2 x i32> %lshr, splat(i32 u0xff)
+  %zext = zext <2 x i32> %mask to <2 x i64>
+  %shl = shl <2 x i64> %zext, splat(i64 48)
+  ret <2 x i64> %shl
+}
+
 define i64 @combine(i32 %lower, i32 %upper) {
 ; CHECK-LABEL: define i64 @combine(
 ; CHECK-SAME: i32 [[LOWER:%.*]], i32 [[UPPER:%.*]]) {
@@ -67,17 +160,3 @@ define i64 @combine(i32 %lower, i32 %upper) {

   ret i64 %o.3
 }
-
-define <2 x i64> @simple.vec(<2 x i32> %v) {
-; CHECK-LABEL: define <2 x i64> @simple.vec(
-; CHECK-SAME: <2 x i32> [[V:%.*]]) {
-; CHECK-NEXT:    [[LSHR:%.*]] = and <2 x i32> [[V]], splat (i32 -256)
-; CHECK-NEXT:    [[ZEXT:%.*]] = zext <2 x i32> [[LSHR]] to <2 x i64>
-; CHECK-NEXT:    [[SHL:%.*]] = shl nuw nsw <2 x i64> [[ZEXT]], splat (i64 24)
-; CHECK-NEXT:    ret <2 x i64> [[SHL]]
-;
-  %lshr = lshr <2 x i32> %v, splat(i32 8)
-  %zext = zext <2 x i32> %lshr to <2 x i64>
-  %shl = shl <2 x i64> %zext, splat(i64 32)
-  ret <2 x i64> %shl
-}
