Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 132 additions & 6 deletions llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsBPF.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
Expand Down Expand Up @@ -478,9 +479,95 @@ static void aspaceWrapOperand(DenseMap<Value *, Value *> &Cache, Instruction *I,
}
}

// Return \p P unchanged when it is a pointer already in address space zero;
// otherwise return its addrspace(0) wrapper, created on demand by
// aspaceWrapValue() and memoized in \p Cache.
static Value *wrapPtrIfASNotZero(DenseMap<Value *, Value *> &Cache,
                                 CallInst *CI, Value *P) {
  auto *PTy = dyn_cast<PointerType>(P->getType());
  if (PTy && PTy->getAddressSpace() == 0)
    return P;
  return aspaceWrapValue(Cache, CI->getFunction(), P);
}

// Rewrite a memset/memset.inline call whose destination pointer lives in a
// non-zero address space into an equivalent call on an addrspace(0) pointer.
// Returns the replacement instruction, or nullptr when the destination is
// already in address space zero and no rewrite is needed.
static Instruction *aspaceMemSet(Intrinsic::ID ID,
                                 DenseMap<Value *, Value *> &Cache,
                                 CallInst *CI) {
  Value *Dst = CI->getArgOperand(0);
  Value *CastDst = wrapPtrIfASNotZero(Cache, CI, Dst);
  if (CastDst == Dst)
    return nullptr; // Nothing to do: destination already addrspace(0).

  // memset(dst, val, len, isvolatile) plus alignment/AA metadata.
  Value *Val = CI->getArgOperand(1);
  Value *Len = CI->getArgOperand(2);

  auto *MS = cast<MemSetInst>(CI);
  MaybeAlign DstAlign = MS->getDestAlign();
  bool Volatile = MS->isVolatile();

  IRBuilder<> B(CI);
  // Note: CreateMemSet and CreateMemSetInline take the value and alignment
  // arguments in a different order.
  if (ID == Intrinsic::memset)
    return B.CreateMemSet(CastDst, Val, Len, DstAlign, Volatile,
                          MS->getAAMetadata());
  return B.CreateMemSetInline(CastDst, DstAlign, Val, Len, Volatile,
                              MS->getAAMetadata());
}

// Rewrite a memcpy/memcpy.inline call so that any source or destination
// pointer in a non-zero address space is replaced by its addrspace(0)
// wrapper. Returns the replacement instruction, or nullptr when both
// operands are already in address space zero.
static Instruction *aspaceMemCpy(Intrinsic::ID ID,
                                 DenseMap<Value *, Value *> &Cache,
                                 CallInst *CI) {
  Value *Dst = CI->getArgOperand(0);
  Value *Src = CI->getArgOperand(1);
  Value *CastDst = wrapPtrIfASNotZero(Cache, CI, Dst);
  Value *CastSrc = wrapPtrIfASNotZero(Cache, CI, Src);
  if (CastDst == Dst && CastSrc == Src)
    return nullptr; // Nothing to do: both operands already addrspace(0).

  // memcpy(dst, src, len, isvolatile) plus alignment/AA metadata.
  auto *MT = cast<MemTransferInst>(CI);
  Value *Len = CI->getArgOperand(2);

  IRBuilder<> B(CI);
  // CreateMemTransferInst handles both the plain and the .inline variants,
  // selected through the intrinsic ID.
  return B.CreateMemTransferInst(ID, CastDst, MT->getDestAlign(), CastSrc,
                                 MT->getSourceAlign(), Len, MT->isVolatile(),
                                 MT->getAAMetadata());
}

// Rewrite a memmove call so that any source or destination pointer in a
// non-zero address space is replaced by its addrspace(0) wrapper. Returns
// the replacement instruction, or nullptr when both operands are already
// in address space zero.
static Instruction *aspaceMemMove(DenseMap<Value *, Value *> &Cache,
                                  CallInst *CI) {
  Value *Dst = CI->getArgOperand(0);
  Value *Src = CI->getArgOperand(1);
  Value *CastDst = wrapPtrIfASNotZero(Cache, CI, Dst);
  Value *CastSrc = wrapPtrIfASNotZero(Cache, CI, Src);
  if (CastDst == Dst && CastSrc == Src)
    return nullptr; // Nothing to do: both operands already addrspace(0).

  // memmove(dst, src, len, isvolatile) plus alignment/AA metadata.
  auto *MT = cast<MemTransferInst>(CI);
  Value *Len = CI->getArgOperand(2);

  IRBuilder<> B(CI);
  return B.CreateMemMove(CastDst, MT->getDestAlign(), CastSrc,
                         MT->getSourceAlign(), Len, MT->isVolatile(),
                         MT->getAAMetadata());
}

// Support for BPF address spaces:
// - for each function in the module M, update pointer operand of
// each memory access instruction (load/store/cmpxchg/atomicrmw)
// or intrinsic call insns (memset/memcpy/memmove)
// by casting it from non-zero address space to zero address space, e.g:
//
// (load (ptr addrspace (N) %p) ...)
Expand All @@ -493,21 +580,60 @@ bool BPFCheckAndAdjustIR::insertASpaceCasts(Module &M) {
for (Function &F : M) {
DenseMap<Value *, Value *> CastsCache;
for (BasicBlock &BB : F) {
for (Instruction &I : BB) {
for (Instruction &I : llvm::make_early_inc_range(BB)) {
unsigned PtrOpNum;

if (auto *LD = dyn_cast<LoadInst>(&I))
if (auto *LD = dyn_cast<LoadInst>(&I)) {
PtrOpNum = LD->getPointerOperandIndex();
else if (auto *ST = dyn_cast<StoreInst>(&I))
aspaceWrapOperand(CastsCache, &I, PtrOpNum);
continue;
}
if (auto *ST = dyn_cast<StoreInst>(&I)) {
PtrOpNum = ST->getPointerOperandIndex();
else if (auto *CmpXchg = dyn_cast<AtomicCmpXchgInst>(&I))
aspaceWrapOperand(CastsCache, &I, PtrOpNum);
continue;
}
if (auto *CmpXchg = dyn_cast<AtomicCmpXchgInst>(&I)) {
PtrOpNum = CmpXchg->getPointerOperandIndex();
else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I))
aspaceWrapOperand(CastsCache, &I, PtrOpNum);
continue;
}
if (auto *RMW = dyn_cast<AtomicRMWInst>(&I)) {
PtrOpNum = RMW->getPointerOperandIndex();
aspaceWrapOperand(CastsCache, &I, PtrOpNum);
continue;
}

auto *CI = dyn_cast<CallInst>(&I);
if (!CI)
continue;

Function *Callee = CI->getCalledFunction();
if (!Callee || !Callee->isIntrinsic())
continue;

// Check memset/memcpy/memmove
Intrinsic::ID ID = Callee->getIntrinsicID();
bool IsSet = ID == Intrinsic::memset || ID == Intrinsic::memset_inline;
bool IsCpy = ID == Intrinsic::memcpy || ID == Intrinsic::memcpy_inline;
bool IsMove = ID == Intrinsic::memmove;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've checked if there are some other intrinsics we need to care about and found these:

  • Intrinsic::memcpy_inline, available as a builtin function (link)
  • Intrinsic::memset_inline, same
  • Intrinsic::memcpy_element_unordered_atomic, Intrinsic::memmove_element_unordered_atomic, Intrinsic::memset_element_unordered_atomic -- see the code to handle these, but don't see any code that introduces them.
  • Intrinsic::experimental_memset_pattern -- LoopIdiomRecognize::processLoopStridedStore can introduce these.
  • there are also a some vector related intrinsics, but I assume these are irrelevant.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for listing the intrinsics above. I missed __builtin_memcpy_inline and __builtin_memset_inline, which are very similar to __builtin_mem{cpy,set}, but the _inline versions require the 'size' argument to be constant. In current bpf progs, we always use __builtin_mem{set,cpy}() with constant size, so it is essentially equivalent to __builtin_mem{set,cpy}_inline(). It will be trivial to add both to the pull request.

I think we can ignore mem{cpy,move,set}_element_unordered_atomic. I am aware of this set of intrinsics. The operands of these memory operations need to be atomic, so for our addr-space arguments we can ignore them.

For Intrinsic:experimental_memset_pattern, it tries to convert a loop like

    for (unsigned i = 0; i < 2 * n; i += 2) {
      f[i] = 2;
      f[i+1] = 2;
    }

to the following intrinsic

// Memset variant that writes a given pattern.
def int_experimental_memset_pattern
    : Intrinsic<[],            
      [llvm_anyptr_ty, // Destination.
       llvm_any_ty,    // Pattern value.
       llvm_anyint_ty, // Count (number of times to fill value).
       llvm_i1_ty],    // IsVolatile.
      [IntrWriteMem, IntrArgMemOnly, IntrWillReturn, IntrNoFree, IntrNoCallback,
       NoCapture<ArgIndex<0>>, WriteOnly<ArgIndex<0>>,
       ImmArg<ArgIndex<3>>]>;

This should be rare. But for completeness, I think I can add this as well.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I asked ChatGPT the following question:

In LLVM, what kind of C code can generate the intrinsic Intrinsic::experimental_memset_pattern?

The following is the answer:

====================
Short version: call Apple’s memset_pattern{4,8,16} from C on a Darwin target (macOS/iOS). Clang recognizes these and lowers them to the LLVM IR intrinsic llvm.experimental.memset.pattern (which can then be expanded efficiently).

Minimal example (macOS / iOS targets)

// clang -O2 -target x86_64-apple-macos14 -S -emit-llvm ex.c -o ex.ll
#include <string.h>

void fill16(void *dst, size_t n) {
unsigned char pat[16] =
{0,1,2,3,4,5,6,7, 8,9,10,11,12,13,14,15};
memset_pattern16(dst, pat, n);
}

void fill8(void *dst, size_t n) {
unsigned char pat[8] = {1,2,3,4,5,6,7,8};
memset_pattern8(dst, pat, n);
}

void fill4(void *dst, size_t n) {
unsigned char pat[4] = {0xAA,0xBB,0xCC,0xDD};
memset_pattern4(dst, pat, n);
}

In the emitted IR you’ll see calls like:

call void @llvm.experimental.memset.pattern.p0.i64(
ptr %dst, ptr %pat, i64 %n, i1 false)

Notes

These memset_pattern{4,8,16} functions are Apple libc extensions. On non-Darwin targets, Clang won’t lower them to the intrinsic—either you’ll get a normal library call or a loop.

There isn’t a portable C standard function that maps to llvm.experimental.memset.pattern.

The intrinsic allows repeating multi-byte patterns (4/8/16). Plain llvm.memset only repeats a single byte.

If you’re not on macOS/iOS but still want the intrinsic (for experimentation), compile with a Darwin target triple as shown above.

====================

I actually tried to compile with the above example. It compiled successfully with the following compiler:

Apple clang version 17.0.0 (clang-1700.0.13.5)
Target: arm64-apple-darwin24.6.0
Thread model: posix
InstalledDir: /Library/Developer/CommandLineTools/usr/bin

But it will fail to compile with linux and x86 target.

$ clang -O2 -S -emit-llvm ex.c -o ex.ll
ex.c:7:5: error: call to undeclared function 'memset_pattern16'; ISO C99 and later do not support
      implicit function declarations [-Wimplicit-function-declaration]
    7 |     memset_pattern16(dst, pat, n);
      |     ^
ex.c:12:5: error: call to undeclared function 'memset_pattern8'; ISO C99 and later do not support
      implicit function declarations [-Wimplicit-function-declaration]
   12 |     memset_pattern8(dst, pat, n);
      |     ^
ex.c:17:5: error: call to undeclared function 'memset_pattern4'; ISO C99 and later do not support
      implicit function declarations [-Wimplicit-function-declaration]
   17 |     memset_pattern4(dst, pat, n);
      |     ^
3 errors generated.

Unfortunately, the compiler of Apple on my Mac is too old to generate llvm.experimental.memset.pattern. I suspect the latest clang (with Apple target) should generate llvm.experimental.memset.pattern. The following is the related code in LoopIDiomRecognize.cpp:

  if (SplatValue) {
    NewCall = Builder.CreateMemSet(BasePtr, SplatValue, MemsetArg,
                                   MaybeAlign(StoreAlignment),
                                   /*isVolatile=*/false, AATags);
  } else if (ForceMemsetPatternIntrinsic ||
             isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)) {
    assert(isa<SCEVConstant>(StoreSizeSCEV) && "Expected constant store size");

    NewCall = Builder.CreateIntrinsic(
        Intrinsic::experimental_memset_pattern,
        {DestInt8PtrTy, PatternValue->getType(), IntIdxTy},
        {BasePtr, PatternValue, MemsetArg,
         ConstantInt::getFalse(M->getContext())});
    if (StoreAlignment)
      cast<MemSetPatternInst>(NewCall)->setDestAlignment(*StoreAlignment);
    NewCall->setAAMetadata(AATags);
  } else {
    // Neither a memset, nor memset_pattern16
    return Changed;
  }

ForceMemsetPatternIntrinsic is an internal flag.

static cl::opt<bool> ForceMemsetPatternIntrinsic(
    "loop-idiom-force-memset-pattern-intrinsic",
    cl::desc("Use memset.pattern intrinsic whenever possible"), cl::init(false),
    cl::Hidden);

So memset_pattern16 function is needed to generate Intrinsic::experimental_memset_pattern() and memset_pattern16 is only available for Apple target.

So I will skip experimental_memset_pattern for now.

if (!IsSet && !IsCpy && !IsMove)
continue;

Instruction *New;
if (IsSet)
New = aspaceMemSet(ID, CastsCache, CI);
else if (IsCpy)
New = aspaceMemCpy(ID, CastsCache, CI);
else
New = aspaceMemMove(CastsCache, CI);

if (!New)
continue;

aspaceWrapOperand(CastsCache, &I, PtrOpNum);
I.replaceAllUsesWith(New);
New->takeName(&I);
I.eraseFromParent();
}
}
Changed |= !CastsCache.empty();
Expand Down
60 changes: 60 additions & 0 deletions llvm/test/CodeGen/BPF/addr-space-memintrinsic-gep.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt --bpf-check-and-opt-ir -S -mtriple=bpf-pc-linux < %s | FileCheck %s

@page1 = dso_local local_unnamed_addr addrspace(1) global [10 x ptr] zeroinitializer, align 8
@page2 = dso_local local_unnamed_addr addrspace(1) global [10 x ptr] zeroinitializer, align 8

define dso_local void @test_memset() local_unnamed_addr {
; CHECK-LABEL: define dso_local void @test_memset() local_unnamed_addr {
; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 addrspacecast (ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) @page1, i64 16) to ptr), i8 0, i64 16, i1 false)
; CHECK-NEXT: ret void
;
tail call void @llvm.memset.p1.i64(ptr addrspace(1) noundef nonnull align 8 dereferenceable(16) getelementptr inbounds nuw (i8, ptr addrspace(1) @page1, i64 16), i8 0, i64 16, i1 false)
ret void
}

declare void @llvm.memset.p1.i64(ptr addrspace(1) writeonly captures(none), i8, i64, i1 immarg)

define dso_local void @test_memcpy() local_unnamed_addr {
; CHECK-LABEL: define dso_local void @test_memcpy() local_unnamed_addr {
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 addrspacecast (ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) @page2, i64 8) to ptr), ptr align 8 addrspacecast (ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) @page1, i64 8) to ptr), i64 16, i1 false)
; CHECK-NEXT: ret void
;
tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef nonnull align 8 dereferenceable(16) getelementptr inbounds nuw (i8, ptr addrspace(1) @page2, i64 8), ptr addrspace(1) noundef nonnull align 8 dereferenceable(16) getelementptr inbounds nuw (i8, ptr addrspace(1) @page1, i64 8), i64 16, i1 false)
ret void
}

declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noalias writeonly captures(none), ptr addrspace(1) noalias readonly captures(none), i64, i1 immarg)

define dso_local void @test_memmove() local_unnamed_addr {
; CHECK-LABEL: define dso_local void @test_memmove() local_unnamed_addr {
; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 8 addrspacecast (ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) @page2, i64 16) to ptr), ptr align 8 addrspacecast (ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) @page2, i64 8) to ptr), i64 16, i1 false)
; CHECK-NEXT: ret void
;
tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 8 dereferenceable(16) getelementptr inbounds nuw (i8, ptr addrspace(1) @page2, i64 16), ptr addrspace(1) noundef nonnull align 8 dereferenceable(16) getelementptr inbounds nuw (i8, ptr addrspace(1) @page2, i64 8), i64 16, i1 false)
ret void
}

declare void @llvm.memmove.p1.p1.i64(ptr addrspace(1) writeonly captures(none), ptr addrspace(1) readonly captures(none), i64, i1 immarg)

define dso_local void @test_memset_inline() local_unnamed_addr {
; CHECK-LABEL: define dso_local void @test_memset_inline() local_unnamed_addr {
; CHECK-NEXT: call void @llvm.memset.inline.p0.i64(ptr align 8 addrspacecast (ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) @page1, i64 16) to ptr), i8 0, i64 16, i1 false)
; CHECK-NEXT: ret void
;
tail call void @llvm.memset.inline.p1.i64(ptr addrspace(1) nonnull align 8 getelementptr inbounds nuw (i8, ptr addrspace(1) @page1, i64 16), i8 0, i64 16, i1 false)
ret void
}

declare void @llvm.memset.inline.p1.i64(ptr addrspace(1) writeonly captures(none), i8, i64, i1 immarg)

define dso_local void @test_memcpy_inline() local_unnamed_addr {
; CHECK-LABEL: define dso_local void @test_memcpy_inline() local_unnamed_addr {
; CHECK-NEXT: call void @llvm.memcpy.inline.p0.p0.i64(ptr align 8 addrspacecast (ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) @page2, i64 8) to ptr), ptr align 8 addrspacecast (ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) @page1, i64 8) to ptr), i64 16, i1 false)
; CHECK-NEXT: ret void
;
tail call void @llvm.memcpy.inline.p1.p1.i64(ptr addrspace(1) nonnull align 8 getelementptr inbounds nuw (i8, ptr addrspace(1) @page2, i64 8), ptr addrspace(1) nonnull align 8 getelementptr inbounds nuw (i8, ptr addrspace(1) @page1, i64 8), i64 16, i1 false)
ret void
}

declare void @llvm.memcpy.inline.p1.p1.i64(ptr addrspace(1) noalias writeonly captures(none), ptr addrspace(1) noalias readonly captures(none), i64, i1 immarg)
49 changes: 49 additions & 0 deletions llvm/test/CodeGen/BPF/addr-space-memintrinsic-no-gep.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt --bpf-check-and-opt-ir -S -mtriple=bpf-pc-linux < %s | FileCheck %s

@page1 = dso_local local_unnamed_addr addrspace(1) global [10 x ptr] zeroinitializer, align 8
@page2 = dso_local local_unnamed_addr addrspace(1) global [10 x ptr] zeroinitializer, align 8

define dso_local void @test_memset() local_unnamed_addr {
; CHECK-LABEL: define dso_local void @test_memset() local_unnamed_addr {
; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 addrspacecast (ptr addrspace(1) @page1 to ptr), i8 0, i64 16, i1 false)
; CHECK-NEXT: ret void
;
tail call void @llvm.memset.p1.i64(ptr addrspace(1) noundef align 8 dereferenceable(16) @page1, i8 0, i64 16, i1 false)
ret void
}

declare void @llvm.memset.p1.i64(ptr addrspace(1) writeonly captures(none), i8, i64, i1 immarg)

define dso_local void @test_memcpy() local_unnamed_addr {
; CHECK-LABEL: define dso_local void @test_memcpy() local_unnamed_addr {
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 addrspacecast (ptr addrspace(1) @page2 to ptr), ptr align 8 addrspacecast (ptr addrspace(1) @page1 to ptr), i64 16, i1 false)
; CHECK-NEXT: ret void
;
tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef align 8 dereferenceable(16) @page2, ptr addrspace(1) noundef align 8 dereferenceable(16) @page1, i64 16, i1 false)
ret void
}

declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noalias writeonly captures(none), ptr addrspace(1) noalias readonly captures(none), i64, i1 immarg)

define dso_local void @test_memset_inline() local_unnamed_addr {
; CHECK-LABEL: define dso_local void @test_memset_inline() local_unnamed_addr {
; CHECK-NEXT: call void @llvm.memset.inline.p0.i64(ptr align 8 addrspacecast (ptr addrspace(1) @page1 to ptr), i8 0, i64 16, i1 false)
; CHECK-NEXT: ret void
;
tail call void @llvm.memset.inline.p1.i64(ptr addrspace(1) align 8 @page1, i8 0, i64 16, i1 false)
ret void
}

declare void @llvm.memset.inline.p1.i64(ptr addrspace(1) writeonly captures(none), i8, i64, i1 immarg)

define dso_local void @test_memcpy_inline() local_unnamed_addr {
; CHECK-LABEL: define dso_local void @test_memcpy_inline() local_unnamed_addr {
; CHECK-NEXT: call void @llvm.memcpy.inline.p0.p0.i64(ptr align 8 addrspacecast (ptr addrspace(1) @page2 to ptr), ptr align 8 addrspacecast (ptr addrspace(1) @page1 to ptr), i64 16, i1 false)
; CHECK-NEXT: ret void
;
tail call void @llvm.memcpy.inline.p1.p1.i64(ptr addrspace(1) align 8 @page2, ptr addrspace(1) align 8 @page1, i64 16, i1 false)
ret void
}

declare void @llvm.memcpy.inline.p1.p1.i64(ptr addrspace(1) noalias writeonly captures(none), ptr addrspace(1) noalias readonly captures(none), i64, i1 immarg)