From 0dd0d39ecd32367375708455da1e22bd9e30fa38 Mon Sep 17 00:00:00 2001 From: Zlatko Buljan Date: Mon, 30 Nov 2015 08:37:38 +0000 Subject: [PATCH 001/186] [mips][microMIPS] Implement PRECR.QB.PH, PRECR_SRA[_R].PH.W, PRECRQ.PH.W, PRECRQ.QB.PH, PRECRQU_S.QB.PH and PRECRQ_RS.PH.W instructions Differential Revision: http://reviews.llvm.org/D14605 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254291 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/Mips/MicroMipsDSPInstrFormats.td | 12 +++++++++++ lib/Target/Mips/MicroMipsDSPInstrInfo.td | 20 +++++++++++++++++++ lib/Target/Mips/MipsDSPInstrInfo.td | 16 ++++++++------- .../Disassembler/Mips/micromips-dsp/valid.txt | 4 ++++ .../Mips/micromips-dspr2/valid.txt | 7 +++++++ test/MC/Mips/micromips-dsp/valid.s | 4 ++++ test/MC/Mips/micromips-dspr2/valid.s | 7 +++++++ 7 files changed, 63 insertions(+), 7 deletions(-) diff --git a/lib/Target/Mips/MicroMipsDSPInstrFormats.td b/lib/Target/Mips/MicroMipsDSPInstrFormats.td index 65c8303f25fe..d3f9fe31afb7 100644 --- a/lib/Target/Mips/MicroMipsDSPInstrFormats.td +++ b/lib/Target/Mips/MicroMipsDSPInstrFormats.td @@ -141,3 +141,15 @@ class POOL32A_1RIMM5AC_FMT funct> : MMDSPInst { let Inst{13-6} = funct; let Inst{5-0} = 0b111100; } + +class POOL32A_2RSA5_FMT op> : MMDSPInst { + bits<5> rt; + bits<5> rs; + bits<5> sa; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-11} = sa; + let Inst{10-0} = op; +} diff --git a/lib/Target/Mips/MicroMipsDSPInstrInfo.td b/lib/Target/Mips/MicroMipsDSPInstrInfo.td index b2e5ec61c8b4..f515f380f0db 100644 --- a/lib/Target/Mips/MicroMipsDSPInstrInfo.td +++ b/lib/Target/Mips/MicroMipsDSPInstrInfo.td @@ -120,6 +120,16 @@ class MULQ_RS_PH_MM_ENC : POOL32A_3RB0_FMT<"mulq_rs.ph", 0b0100010101>; class MULQ_RS_W_MMR2_ENC : POOL32A_3RB0_FMT<"mulq_rs.w", 0b0110010101>; class MULQ_S_PH_MMR2_ENC : POOL32A_3RB0_FMT<"mulq_s.ph", 0b0101010101>; class MULQ_S_W_MMR2_ENC : POOL32A_3RB0_FMT<"mulq_s.w", 0b0111010101>; 
+class PRECR_QB_PH_MMR2_ENC : POOL32A_3RB0_FMT<"precr.qb.ph", 0b0001101101>; +class PRECR_SRA_PH_W_MMR2_ENC + : POOL32A_2RSA5_FMT<"precr_sra.ph.w", 0b01111001101>; +class PRECR_SRA_R_PH_W_MMR2_ENC + : POOL32A_2RSA5_FMT<"precr_sra_r.ph.w", 0b11111001101>; +class PRECRQ_PH_W_MM_ENC : POOL32A_3RB0_FMT<"precrq.ph.w", 0b0011101101>; +class PRECRQ_QB_PH_MM_ENC : POOL32A_3RB0_FMT<"precrq.qb.ph", 0b0010101101>; +class PRECRQU_S_QB_PH_MM_ENC + : POOL32A_3RB0_FMT<"precrqu_s.qb.ph", 0b0101101101>; +class PRECRQ_RS_PH_W_MM_ENC : POOL32A_3RB0_FMT<"precrq_rs.ph.w", 0b0100101101>; // Instruction desc. class ABSQ_S_PH_MM_R2_DESC_BASE Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class PRECR_SRA_PH_W_DESC_BASE Pattern = [(set ROT:$rt, (OpNode ROS:$src, ROS:$rs, immZExt5:$sa))]; InstrItinClass Itinerary = itin; string Constraints = "$src = $rt"; + string BaseOpcode = instr_asm; } class ABSQ_S_PH_R2_DESC_BASE Date: Mon, 30 Nov 2015 09:52:00 +0000 Subject: [PATCH 002/186] [mips][ias] Removed MSA instructions from base architecture valid-xfail.s's. valid-xfail.s is for instructions that should be valid in the given ISA but incorrectly fail. MSA instructions are correct to fail since MSA is not enabled. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254293 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/MC/Mips/mips32r2/invalid-msa.s | 62 +++++++++++++++++++++++++++++ test/MC/Mips/mips32r2/valid-xfail.s | 55 ------------------------- test/MC/Mips/mips32r3/valid-xfail.s | 55 ------------------------- test/MC/Mips/mips32r5/valid-xfail.s | 55 ------------------------- test/MC/Mips/mips64r2/valid-xfail.s | 55 ------------------------- test/MC/Mips/mips64r3/valid-xfail.s | 55 ------------------------- test/MC/Mips/mips64r5/valid-xfail.s | 55 ------------------------- 7 files changed, 62 insertions(+), 330 deletions(-) create mode 100644 test/MC/Mips/mips32r2/invalid-msa.s diff --git a/test/MC/Mips/mips32r2/invalid-msa.s b/test/MC/Mips/mips32r2/invalid-msa.s new file mode 100644 index 000000000000..2ad99b147209 --- /dev/null +++ b/test/MC/Mips/mips32r2/invalid-msa.s @@ -0,0 +1,62 @@ +# Instructions that are invalid +# +# RUN: not llvm-mc %s -triple=mips64-unknown-linux -show-encoding \ +# RUN: -mcpu=mips32r2 2>%t1 +# RUN: FileCheck %s < %t1 + + .set noat + and.v $w10,$w25,$w29 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + bmnz.v $w15,$w2,$w28 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + bmz.v $w13,$w11,$w21 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + bsel.v $w28,$w7,$w0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + fclass.d $w14,$w27 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + fclass.w $w19,$w28 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + fexupl.d $w10,$w29 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + fexupl.w $w12,$w27 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not 
currently enabled + fexupr.d $w31,$w15 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + fexupr.w $w29,$w12 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + ffint_s.d $w1,$w30 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + ffint_s.w $w16,$w14 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + ffint_u.d $w23,$w18 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + ffint_u.w $w19,$w12 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + ffql.d $w2,$w3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + ffql.w $w9,$w0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + ffqr.d $w25,$w24 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + ffqr.w $w10,$w6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + fill.b $w9,$v1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + fill.h $w9,$8 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + fill.w $w31,$15 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + flog2.d $w12,$w16 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + flog2.w $w19,$w23 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + frcp.d $w12,$w4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + frcp.w $w30,$w8 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + frint.d $w20,$w8 # 
CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + frint.w $w11,$w29 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + frsqrt.d $w29,$w2 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + frsqrt.w $w9,$w8 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + fsqrt.d $w3,$w1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + fsqrt.w $w5,$w15 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + ftint_s.d $w31,$w26 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + ftint_s.w $w27,$w14 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + ftint_u.d $w5,$w31 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + ftint_u.w $w12,$w29 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + ftrunc_s.d $w4,$w22 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + ftrunc_s.w $w24,$w7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + ftrunc_u.d $w20,$w25 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + ftrunc_u.w $w7,$w26 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + move.v $w8,$w17 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + nloc.b $w12,$w30 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + nloc.d $w16,$w7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + nloc.h $w21,$w17 # CHECK: 
:[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + nloc.w $w17,$w16 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + nlzc.b $w12,$w7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + nlzc.d $w14,$w14 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + nlzc.h $w24,$w24 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + nlzc.w $w10,$w4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + nor.v $w20,$w20,$w15 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + or.v $w13,$w23,$w12 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + pcnt.b $w30,$w15 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + pcnt.d $w5,$w16 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + pcnt.h $w20,$w24 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + pcnt.w $w22,$w20 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + xor.v $w20,$w21,$w30 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled diff --git a/test/MC/Mips/mips32r2/valid-xfail.s b/test/MC/Mips/mips32r2/valid-xfail.s index 13385d06ce81..658f172aec3d 100644 --- a/test/MC/Mips/mips32r2/valid-xfail.s +++ b/test/MC/Mips/mips32r2/valid-xfail.s @@ -28,11 +28,7 @@ adduh_r.qb $a0,$9,$12 addwc $k0,$s6,$s7 alnv.ps $f12,$f18,$f30,$12 - and.v $w10,$w25,$w29 bitrev $14,$at - bmnz.v $w15,$w2,$w28 - bmz.v $w13,$w11,$w21 - bsel.v $w28,$w7,$w0 c.eq.d $fcc1,$f15,$f15 c.eq.ps $fcc5,$f0,$f9 c.eq.s $fcc5,$f24,$f17 @@ -124,44 +120,9 @@ extrv_r.w $8,$ac1,$s6 extrv_rs.w 
$gp,$ac1,$s6 extrv_s.h $s2,$ac1,$14 - fclass.d $w14,$w27 - fclass.w $w19,$w28 - fexupl.d $w10,$w29 - fexupl.w $w12,$w27 - fexupr.d $w31,$w15 - fexupr.w $w29,$w12 - ffint_s.d $w1,$w30 - ffint_s.w $w16,$w14 - ffint_u.d $w23,$w18 - ffint_u.w $w19,$w12 - ffql.d $w2,$w3 - ffql.w $w9,$w0 - ffqr.d $w25,$w24 - ffqr.w $w10,$w6 - fill.b $w9,$v1 - fill.h $w9,$8 - fill.w $w31,$15 - flog2.d $w12,$w16 - flog2.w $w19,$w23 floor.l.d $f26,$f7 floor.l.s $f12,$f5 fork $s2,$8,$a0 - frcp.d $w12,$w4 - frcp.w $w30,$w8 - frint.d $w20,$w8 - frint.w $w11,$w29 - frsqrt.d $w29,$w2 - frsqrt.w $w9,$w8 - fsqrt.d $w3,$w1 - fsqrt.w $w5,$w15 - ftint_s.d $w31,$w26 - ftint_s.w $w27,$w14 - ftint_u.d $w5,$w31 - ftint_u.w $w12,$w29 - ftrunc_s.d $w4,$w22 - ftrunc_s.w $w24,$w7 - ftrunc_u.d $w20,$w25 - ftrunc_u.w $w7,$w26 insv $s2,$at iret lbe $14,122($9) @@ -184,7 +145,6 @@ mflo $9,$ac2 modsub $a3,$12,$a3 mov.ps $f22,$f17 - move.v $w8,$w17 movf.ps $f10,$f28,$fcc6 movn.ps $f31,$f31,$s3 movt.ps $f20,$f25,$fcc2 @@ -210,23 +170,9 @@ mulsa.w.ph $ac1,$s4,$s6 mulsaq_s.w.ph $ac0,$ra,$s2 neg.ps $f19,$f13 - nloc.b $w12,$w30 - nloc.d $w16,$w7 - nloc.h $w21,$w17 - nloc.w $w17,$w16 - nlzc.b $w12,$w7 - nlzc.d $w14,$w14 - nlzc.h $w24,$w24 - nlzc.w $w10,$w4 nmadd.ps $f27,$f4,$f9,$f25 nmsub.ps $f6,$f12,$f14,$f17 - nor.v $w20,$w20,$w15 - or.v $w13,$w23,$w12 packrl.ph $ra,$24,$14 - pcnt.b $w30,$w15 - pcnt.d $w5,$w16 - pcnt.h $w20,$w24 - pcnt.w $w22,$w20 pick.ph $ra,$a2,$gp pick.qb $11,$a0,$gp pll.ps $f25,$f9,$f30 @@ -304,5 +250,4 @@ trunc.l.d $f23,$f23 trunc.l.s $f28,$f31 wrpgpr $zero,$13 - xor.v $w20,$w21,$w30 yield $v1,$s0 diff --git a/test/MC/Mips/mips32r3/valid-xfail.s b/test/MC/Mips/mips32r3/valid-xfail.s index b0fc3a1d23f7..09e19e8bb3b6 100644 --- a/test/MC/Mips/mips32r3/valid-xfail.s +++ b/test/MC/Mips/mips32r3/valid-xfail.s @@ -28,11 +28,7 @@ adduh_r.qb $a0,$9,$12 addwc $k0,$s6,$s7 alnv.ps $f12,$f18,$f30,$12 - and.v $w10,$w25,$w29 bitrev $14,$at - bmnz.v $w15,$w2,$w28 - bmz.v $w13,$w11,$w21 - bsel.v $w28,$w7,$w0 
c.eq.d $fcc1,$f15,$f15 c.eq.ps $fcc5,$f0,$f9 c.eq.s $fcc5,$f24,$f17 @@ -124,44 +120,9 @@ extrv_r.w $8,$ac1,$s6 extrv_rs.w $gp,$ac1,$s6 extrv_s.h $s2,$ac1,$14 - fclass.d $w14,$w27 - fclass.w $w19,$w28 - fexupl.d $w10,$w29 - fexupl.w $w12,$w27 - fexupr.d $w31,$w15 - fexupr.w $w29,$w12 - ffint_s.d $w1,$w30 - ffint_s.w $w16,$w14 - ffint_u.d $w23,$w18 - ffint_u.w $w19,$w12 - ffql.d $w2,$w3 - ffql.w $w9,$w0 - ffqr.d $w25,$w24 - ffqr.w $w10,$w6 - fill.b $w9,$v1 - fill.h $w9,$8 - fill.w $w31,$15 - flog2.d $w12,$w16 - flog2.w $w19,$w23 floor.l.d $f26,$f7 floor.l.s $f12,$f5 fork $s2,$8,$a0 - frcp.d $w12,$w4 - frcp.w $w30,$w8 - frint.d $w20,$w8 - frint.w $w11,$w29 - frsqrt.d $w29,$w2 - frsqrt.w $w9,$w8 - fsqrt.d $w3,$w1 - fsqrt.w $w5,$w15 - ftint_s.d $w31,$w26 - ftint_s.w $w27,$w14 - ftint_u.d $w5,$w31 - ftint_u.w $w12,$w29 - ftrunc_s.d $w4,$w22 - ftrunc_s.w $w24,$w7 - ftrunc_u.d $w20,$w25 - ftrunc_u.w $w7,$w26 insv $s2,$at iret lbe $14,122($9) @@ -184,7 +145,6 @@ mflo $9,$ac2 modsub $a3,$12,$a3 mov.ps $f22,$f17 - move.v $w8,$w17 movf.ps $f10,$f28,$fcc6 movn.ps $f31,$f31,$s3 movt.ps $f20,$f25,$fcc2 @@ -210,23 +170,9 @@ mulsa.w.ph $ac1,$s4,$s6 mulsaq_s.w.ph $ac0,$ra,$s2 neg.ps $f19,$f13 - nloc.b $w12,$w30 - nloc.d $w16,$w7 - nloc.h $w21,$w17 - nloc.w $w17,$w16 - nlzc.b $w12,$w7 - nlzc.d $w14,$w14 - nlzc.h $w24,$w24 - nlzc.w $w10,$w4 nmadd.ps $f27,$f4,$f9,$f25 nmsub.ps $f6,$f12,$f14,$f17 - nor.v $w20,$w20,$w15 - or.v $w13,$w23,$w12 packrl.ph $ra,$24,$14 - pcnt.b $w30,$w15 - pcnt.d $w5,$w16 - pcnt.h $w20,$w24 - pcnt.w $w22,$w20 pick.ph $ra,$a2,$gp pick.qb $11,$a0,$gp pll.ps $f25,$f9,$f30 @@ -304,5 +250,4 @@ trunc.l.d $f23,$f23 trunc.l.s $f28,$f31 wrpgpr $zero,$13 - xor.v $w20,$w21,$w30 yield $v1,$s0 diff --git a/test/MC/Mips/mips32r5/valid-xfail.s b/test/MC/Mips/mips32r5/valid-xfail.s index a821dddb85ca..30fc4b98e056 100644 --- a/test/MC/Mips/mips32r5/valid-xfail.s +++ b/test/MC/Mips/mips32r5/valid-xfail.s @@ -28,11 +28,7 @@ adduh_r.qb $a0,$9,$12 addwc $k0,$s6,$s7 alnv.ps 
$f12,$f18,$f30,$12 - and.v $w10,$w25,$w29 bitrev $14,$at - bmnz.v $w15,$w2,$w28 - bmz.v $w13,$w11,$w21 - bsel.v $w28,$w7,$w0 c.eq.d $fcc1,$f15,$f15 c.eq.ps $fcc5,$f0,$f9 c.eq.s $fcc5,$f24,$f17 @@ -124,44 +120,9 @@ extrv_r.w $8,$ac1,$s6 extrv_rs.w $gp,$ac1,$s6 extrv_s.h $s2,$ac1,$14 - fclass.d $w14,$w27 - fclass.w $w19,$w28 - fexupl.d $w10,$w29 - fexupl.w $w12,$w27 - fexupr.d $w31,$w15 - fexupr.w $w29,$w12 - ffint_s.d $w1,$w30 - ffint_s.w $w16,$w14 - ffint_u.d $w23,$w18 - ffint_u.w $w19,$w12 - ffql.d $w2,$w3 - ffql.w $w9,$w0 - ffqr.d $w25,$w24 - ffqr.w $w10,$w6 - fill.b $w9,$v1 - fill.h $w9,$8 - fill.w $w31,$15 - flog2.d $w12,$w16 - flog2.w $w19,$w23 floor.l.d $f26,$f7 floor.l.s $f12,$f5 fork $s2,$8,$a0 - frcp.d $w12,$w4 - frcp.w $w30,$w8 - frint.d $w20,$w8 - frint.w $w11,$w29 - frsqrt.d $w29,$w2 - frsqrt.w $w9,$w8 - fsqrt.d $w3,$w1 - fsqrt.w $w5,$w15 - ftint_s.d $w31,$w26 - ftint_s.w $w27,$w14 - ftint_u.d $w5,$w31 - ftint_u.w $w12,$w29 - ftrunc_s.d $w4,$w22 - ftrunc_s.w $w24,$w7 - ftrunc_u.d $w20,$w25 - ftrunc_u.w $w7,$w26 insv $s2,$at iret lbe $14,122($9) @@ -184,7 +145,6 @@ mflo $9,$ac2 modsub $a3,$12,$a3 mov.ps $f22,$f17 - move.v $w8,$w17 movf.ps $f10,$f28,$fcc6 movn.ps $f31,$f31,$s3 movt.ps $f20,$f25,$fcc2 @@ -210,23 +170,9 @@ mulsa.w.ph $ac1,$s4,$s6 mulsaq_s.w.ph $ac0,$ra,$s2 neg.ps $f19,$f13 - nloc.b $w12,$w30 - nloc.d $w16,$w7 - nloc.h $w21,$w17 - nloc.w $w17,$w16 - nlzc.b $w12,$w7 - nlzc.d $w14,$w14 - nlzc.h $w24,$w24 - nlzc.w $w10,$w4 nmadd.ps $f27,$f4,$f9,$f25 nmsub.ps $f6,$f12,$f14,$f17 - nor.v $w20,$w20,$w15 - or.v $w13,$w23,$w12 packrl.ph $ra,$24,$14 - pcnt.b $w30,$w15 - pcnt.d $w5,$w16 - pcnt.h $w20,$w24 - pcnt.w $w22,$w20 pick.ph $ra,$a2,$gp pick.qb $11,$a0,$gp pll.ps $f25,$f9,$f30 @@ -304,5 +250,4 @@ trunc.l.d $f23,$f23 trunc.l.s $f28,$f31 wrpgpr $zero,$13 - xor.v $w20,$w21,$w30 yield $v1,$s0 diff --git a/test/MC/Mips/mips64r2/valid-xfail.s b/test/MC/Mips/mips64r2/valid-xfail.s index 148758cd3263..5faa29d6468e 100644 --- 
a/test/MC/Mips/mips64r2/valid-xfail.s +++ b/test/MC/Mips/mips64r2/valid-xfail.s @@ -31,11 +31,7 @@ alnv.ob $v31,$v23,$v30,$at alnv.ob $v8,$v17,$v30,$a1 alnv.ps $f12,$f18,$f30,$12 - and.v $w10,$w25,$w29 bitrev $14,$at - bmnz.v $w15,$w2,$w28 - bmz.v $w13,$w11,$w21 - bsel.v $w28,$w7,$w0 c.eq.d $fcc1,$f15,$f15 c.eq.ps $fcc5,$f0,$f9 c.eq.s $fcc5,$f24,$f17 @@ -126,43 +122,7 @@ extrv_r.w $8,$ac1,$s6 extrv_rs.w $gp,$ac1,$s6 extrv_s.h $s2,$ac1,$14 - fclass.d $w14,$w27 - fclass.w $w19,$w28 - fexupl.d $w10,$w29 - fexupl.w $w12,$w27 - fexupr.d $w31,$w15 - fexupr.w $w29,$w12 - ffint_s.d $w1,$w30 - ffint_s.w $w16,$w14 - ffint_u.d $w23,$w18 - ffint_u.w $w19,$w12 - ffql.d $w2,$w3 - ffql.w $w9,$w0 - ffqr.d $w25,$w24 - ffqr.w $w10,$w6 - fill.b $w9,$v1 - fill.d $w28,$8 - fill.h $w9,$8 - fill.w $w31,$15 - flog2.d $w12,$w16 - flog2.w $w19,$w23 fork $s2,$8,$a0 - frcp.d $w12,$w4 - frcp.w $w30,$w8 - frint.d $w20,$w8 - frint.w $w11,$w29 - frsqrt.d $w29,$w2 - frsqrt.w $w9,$w8 - fsqrt.d $w3,$w1 - fsqrt.w $w5,$w15 - ftint_s.d $w31,$w26 - ftint_s.w $w27,$w14 - ftint_u.d $w5,$w31 - ftint_u.w $w12,$w29 - ftrunc_s.d $w4,$w22 - ftrunc_s.w $w24,$w7 - ftrunc_u.d $w20,$w25 - ftrunc_u.w $w7,$w26 insv $s2,$at iret lbe $14,122($9) @@ -212,23 +172,9 @@ mulsa.w.ph $ac1,$s4,$s6 mulsaq_s.w.ph $ac0,$ra,$s2 neg.ps $f19,$f13 - nloc.b $w12,$w30 - nloc.d $w16,$w7 - nloc.h $w21,$w17 - nloc.w $w17,$w16 - nlzc.b $w12,$w7 - nlzc.d $w14,$w14 - nlzc.h $w24,$w24 - nlzc.w $w10,$w4 nmadd.ps $f27,$f4,$f9,$f25 nmsub.ps $f6,$f12,$f14,$f17 - nor.v $w20,$w20,$w15 - or.v $w13,$w23,$w12 packrl.ph $ra,$24,$14 - pcnt.b $w30,$w15 - pcnt.d $w5,$w16 - pcnt.h $w20,$w24 - pcnt.w $w22,$w20 pick.ph $ra,$a2,$gp pick.qb $11,$a0,$gp pll.ps $f25,$f9,$f30 @@ -302,5 +248,4 @@ tlbinv tlbinvf wrpgpr $zero,$13 - xor.v $w20,$w21,$w30 yield $v1,$s0 diff --git a/test/MC/Mips/mips64r3/valid-xfail.s b/test/MC/Mips/mips64r3/valid-xfail.s index f2949c4f2dda..dcf66bf97d68 100644 --- a/test/MC/Mips/mips64r3/valid-xfail.s +++ 
b/test/MC/Mips/mips64r3/valid-xfail.s @@ -31,11 +31,7 @@ alnv.ob $v31,$v23,$v30,$at alnv.ob $v8,$v17,$v30,$a1 alnv.ps $f12,$f18,$f30,$12 - and.v $w10,$w25,$w29 bitrev $14,$at - bmnz.v $w15,$w2,$w28 - bmz.v $w13,$w11,$w21 - bsel.v $w28,$w7,$w0 c.eq.d $fcc1,$f15,$f15 c.eq.ps $fcc5,$f0,$f9 c.eq.s $fcc5,$f24,$f17 @@ -126,43 +122,7 @@ extrv_r.w $8,$ac1,$s6 extrv_rs.w $gp,$ac1,$s6 extrv_s.h $s2,$ac1,$14 - fclass.d $w14,$w27 - fclass.w $w19,$w28 - fexupl.d $w10,$w29 - fexupl.w $w12,$w27 - fexupr.d $w31,$w15 - fexupr.w $w29,$w12 - ffint_s.d $w1,$w30 - ffint_s.w $w16,$w14 - ffint_u.d $w23,$w18 - ffint_u.w $w19,$w12 - ffql.d $w2,$w3 - ffql.w $w9,$w0 - ffqr.d $w25,$w24 - ffqr.w $w10,$w6 - fill.b $w9,$v1 - fill.d $w28,$8 - fill.h $w9,$8 - fill.w $w31,$15 - flog2.d $w12,$w16 - flog2.w $w19,$w23 fork $s2,$8,$a0 - frcp.d $w12,$w4 - frcp.w $w30,$w8 - frint.d $w20,$w8 - frint.w $w11,$w29 - frsqrt.d $w29,$w2 - frsqrt.w $w9,$w8 - fsqrt.d $w3,$w1 - fsqrt.w $w5,$w15 - ftint_s.d $w31,$w26 - ftint_s.w $w27,$w14 - ftint_u.d $w5,$w31 - ftint_u.w $w12,$w29 - ftrunc_s.d $w4,$w22 - ftrunc_s.w $w24,$w7 - ftrunc_u.d $w20,$w25 - ftrunc_u.w $w7,$w26 insv $s2,$at iret lbe $14,122($9) @@ -212,23 +172,9 @@ mulsa.w.ph $ac1,$s4,$s6 mulsaq_s.w.ph $ac0,$ra,$s2 neg.ps $f19,$f13 - nloc.b $w12,$w30 - nloc.d $w16,$w7 - nloc.h $w21,$w17 - nloc.w $w17,$w16 - nlzc.b $w12,$w7 - nlzc.d $w14,$w14 - nlzc.h $w24,$w24 - nlzc.w $w10,$w4 nmadd.ps $f27,$f4,$f9,$f25 nmsub.ps $f6,$f12,$f14,$f17 - nor.v $w20,$w20,$w15 - or.v $w13,$w23,$w12 packrl.ph $ra,$24,$14 - pcnt.b $w30,$w15 - pcnt.d $w5,$w16 - pcnt.h $w20,$w24 - pcnt.w $w22,$w20 pick.ph $ra,$a2,$gp pick.qb $11,$a0,$gp pll.ps $f25,$f9,$f30 @@ -302,5 +248,4 @@ tlbinv tlbinvf wrpgpr $zero,$13 - xor.v $w20,$w21,$w30 yield $v1,$s0 diff --git a/test/MC/Mips/mips64r5/valid-xfail.s b/test/MC/Mips/mips64r5/valid-xfail.s index 04221ddb8630..0f7788359cf2 100644 --- a/test/MC/Mips/mips64r5/valid-xfail.s +++ b/test/MC/Mips/mips64r5/valid-xfail.s @@ -31,11 +31,7 @@ alnv.ob 
$v31,$v23,$v30,$at alnv.ob $v8,$v17,$v30,$a1 alnv.ps $f12,$f18,$f30,$12 - and.v $w10,$w25,$w29 bitrev $14,$at - bmnz.v $w15,$w2,$w28 - bmz.v $w13,$w11,$w21 - bsel.v $w28,$w7,$w0 c.eq.d $fcc1,$f15,$f15 c.eq.ps $fcc5,$f0,$f9 c.eq.s $fcc5,$f24,$f17 @@ -126,43 +122,7 @@ extrv_r.w $8,$ac1,$s6 extrv_rs.w $gp,$ac1,$s6 extrv_s.h $s2,$ac1,$14 - fclass.d $w14,$w27 - fclass.w $w19,$w28 - fexupl.d $w10,$w29 - fexupl.w $w12,$w27 - fexupr.d $w31,$w15 - fexupr.w $w29,$w12 - ffint_s.d $w1,$w30 - ffint_s.w $w16,$w14 - ffint_u.d $w23,$w18 - ffint_u.w $w19,$w12 - ffql.d $w2,$w3 - ffql.w $w9,$w0 - ffqr.d $w25,$w24 - ffqr.w $w10,$w6 - fill.b $w9,$v1 - fill.d $w28,$8 - fill.h $w9,$8 - fill.w $w31,$15 - flog2.d $w12,$w16 - flog2.w $w19,$w23 fork $s2,$8,$a0 - frcp.d $w12,$w4 - frcp.w $w30,$w8 - frint.d $w20,$w8 - frint.w $w11,$w29 - frsqrt.d $w29,$w2 - frsqrt.w $w9,$w8 - fsqrt.d $w3,$w1 - fsqrt.w $w5,$w15 - ftint_s.d $w31,$w26 - ftint_s.w $w27,$w14 - ftint_u.d $w5,$w31 - ftint_u.w $w12,$w29 - ftrunc_s.d $w4,$w22 - ftrunc_s.w $w24,$w7 - ftrunc_u.d $w20,$w25 - ftrunc_u.w $w7,$w26 insv $s2,$at iret lbe $14,122($9) @@ -212,23 +172,9 @@ mulsa.w.ph $ac1,$s4,$s6 mulsaq_s.w.ph $ac0,$ra,$s2 neg.ps $f19,$f13 - nloc.b $w12,$w30 - nloc.d $w16,$w7 - nloc.h $w21,$w17 - nloc.w $w17,$w16 - nlzc.b $w12,$w7 - nlzc.d $w14,$w14 - nlzc.h $w24,$w24 - nlzc.w $w10,$w4 nmadd.ps $f27,$f4,$f9,$f25 nmsub.ps $f6,$f12,$f14,$f17 - nor.v $w20,$w20,$w15 - or.v $w13,$w23,$w12 packrl.ph $ra,$24,$14 - pcnt.b $w30,$w15 - pcnt.d $w5,$w16 - pcnt.h $w20,$w24 - pcnt.w $w22,$w20 pick.ph $ra,$a2,$gp pick.qb $11,$a0,$gp pll.ps $f25,$f9,$f30 @@ -302,5 +248,4 @@ tlbinv tlbinvf wrpgpr $zero,$13 - xor.v $w20,$w21,$w30 yield $v1,$s0 From bffca30038b9b302ecaa601025e67f1b578d0103 Mon Sep 17 00:00:00 2001 From: Igor Breger Date: Mon, 30 Nov 2015 10:40:52 +0000 Subject: [PATCH 003/186] AVX512: regenerate avx512bw intrincics tests results. 
Differential Revision: http://reviews.llvm.org/D15069 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254295 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/avx512bw-intrinsics.ll | 1349 +++++++++++++++-------- 1 file changed, 879 insertions(+), 470 deletions(-) diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll index 7b9768c95944..6b032e0e6d78 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -1,15 +1,24 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw --show-mc-encoding| FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512BW + define i64 @test_pcmpeq_b(<64 x i8> %a, <64 x i8> %b) { -; CHECK-LABEL: test_pcmpeq_b -; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 ## +; AVX512BW-LABEL: test_pcmpeq_b: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: retq %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1) ret i64 %res } define i64 @test_mask_pcmpeq_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) { -; CHECK-LABEL: test_mask_pcmpeq_b -; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} ## +; AVX512BW-LABEL: test_mask_pcmpeq_b: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovq %rdi, %k1 +; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: retq %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask) ret i64 %res } @@ -17,15 +26,22 @@ define i64 @test_mask_pcmpeq_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) { declare i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8>, <64 x i8>, i64) define i32 @test_pcmpeq_w(<32 x i16> %a, <32 x i16> %b) { -; CHECK-LABEL: test_pcmpeq_w -; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 ## +; AVX512BW-LABEL: 
test_pcmpeq_w: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: retq %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1) ret i32 %res } define i32 @test_mask_pcmpeq_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) { -; CHECK-LABEL: test_mask_pcmpeq_w -; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} ## +; AVX512BW-LABEL: test_mask_pcmpeq_w: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: retq %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask) ret i32 %res } @@ -33,15 +49,22 @@ define i32 @test_mask_pcmpeq_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) { declare i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16>, <32 x i16>, i32) define i64 @test_pcmpgt_b(<64 x i8> %a, <64 x i8> %b) { -; CHECK-LABEL: test_pcmpgt_b -; CHECK: vpcmpgtb %zmm1, %zmm0, %k0 ## +; AVX512BW-LABEL: test_pcmpgt_b: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: retq %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1) ret i64 %res } define i64 @test_mask_pcmpgt_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) { -; CHECK-LABEL: test_mask_pcmpgt_b -; CHECK: vpcmpgtb %zmm1, %zmm0, %k0 {%k1} ## +; AVX512BW-LABEL: test_mask_pcmpgt_b: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovq %rdi, %k1 +; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: retq %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask) ret i64 %res } @@ -49,326 +72,491 @@ define i64 @test_mask_pcmpgt_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) { declare i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8>, <64 x i8>, i64) define i32 @test_pcmpgt_w(<32 x i16> %a, <32 x i16> %b) { -; CHECK-LABEL: test_pcmpgt_w -; CHECK: vpcmpgtw 
%zmm1, %zmm0, %k0 ## +; AVX512BW-LABEL: test_pcmpgt_w: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: retq %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1) ret i32 %res } define i32 @test_mask_pcmpgt_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) { -; CHECK-LABEL: test_mask_pcmpgt_w -; CHECK: vpcmpgtw %zmm1, %zmm0, %k0 {%k1} ## +; AVX512BW-LABEL: test_mask_pcmpgt_w: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: retq %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask) ret i32 %res } declare i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16>, <32 x i16>, i32) -define <8 x i64> @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) { -; CHECK_LABEL: test_cmp_b_512 -; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 ## +define i64 @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) { +; AVX512BW-LABEL: test_cmp_b_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: vpcmpltb %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovq %k0, %rcx +; AVX512BW-NEXT: addq %rax, %rcx +; AVX512BW-NEXT: vpcmpleb %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: addq %rcx, %rax +; AVX512BW-NEXT: vpcmpunordb %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovq %k0, %rcx +; AVX512BW-NEXT: addq %rax, %rcx +; AVX512BW-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: addq %rcx, %rax +; AVX512BW-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovq %k0, %rcx +; AVX512BW-NEXT: addq %rax, %rcx +; AVX512BW-NEXT: vpcmpnleb %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovq %k0, %rdx +; AVX512BW-NEXT: addq %rcx, %rdx +; AVX512BW-NEXT: vpcmpordb %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: addq %rdx, %rax +; AVX512BW-NEXT: retq %res0 = call i64 
@llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1) - %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0 -; CHECK: vpcmpltb %zmm1, %zmm0, %k0 ## %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1) - %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1 -; CHECK: vpcmpleb %zmm1, %zmm0, %k0 ## + %ret1 = add i64 %res0, %res1 %res2 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 -1) - %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2 -; CHECK: vpcmpunordb %zmm1, %zmm0, %k0 ## + %ret2 = add i64 %ret1, %res2 %res3 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 -1) - %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3 -; CHECK: vpcmpneqb %zmm1, %zmm0, %k0 ## + %ret3 = add i64 %ret2, %res3 %res4 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 -1) - %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4 -; CHECK: vpcmpnltb %zmm1, %zmm0, %k0 ## + %ret4 = add i64 %ret3, %res4 %res5 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 -1) - %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5 -; CHECK: vpcmpnleb %zmm1, %zmm0, %k0 ## + %ret5 = add i64 %ret4, %res5 %res6 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 -1) - %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6 -; CHECK: vpcmpordb %zmm1, %zmm0, %k0 ## + %ret6 = add i64 %ret5, %res6 %res7 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 -1) - %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7 - ret <8 x i64> %vec7 -} - -define <8 x i64> @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { -; CHECK_LABEL: test_mask_cmp_b_512 -; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} ## + %ret7 = add i64 %ret6, %res7 + ret i64 %ret7 +} + +define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { +; 
AVX512BW-LABEL: test_mask_cmp_b_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovq %rdi, %k1 +; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: vpcmpltb %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovq %k0, %rcx +; AVX512BW-NEXT: addq %rax, %rcx +; AVX512BW-NEXT: vpcmpleb %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: addq %rcx, %rax +; AVX512BW-NEXT: vpcmpunordb %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovq %k0, %rcx +; AVX512BW-NEXT: addq %rax, %rcx +; AVX512BW-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: addq %rcx, %rax +; AVX512BW-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovq %k0, %rcx +; AVX512BW-NEXT: addq %rax, %rcx +; AVX512BW-NEXT: vpcmpnleb %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovq %k0, %rdx +; AVX512BW-NEXT: addq %rcx, %rdx +; AVX512BW-NEXT: vpcmpordb %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: addq %rdx, %rax +; AVX512BW-NEXT: retq %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask) - %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0 -; CHECK: vpcmpltb %zmm1, %zmm0, %k0 {%k1} ## %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask) - %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1 -; CHECK: vpcmpleb %zmm1, %zmm0, %k0 {%k1} ## + %ret1 = add i64 %res0, %res1 %res2 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 %mask) - %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2 -; CHECK: vpcmpunordb %zmm1, %zmm0, %k0 {%k1} ## + %ret2 = add i64 %ret1, %res2 %res3 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 %mask) - %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3 -; CHECK: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} ## + %ret3 = add i64 %ret2, %res3 %res4 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x 
i8> %a0, <64 x i8> %a1, i32 4, i64 %mask) - %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4 -; CHECK: vpcmpnltb %zmm1, %zmm0, %k0 {%k1} ## + %ret4 = add i64 %ret3, %res4 %res5 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 %mask) - %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5 -; CHECK: vpcmpnleb %zmm1, %zmm0, %k0 {%k1} ## + %ret5 = add i64 %ret4, %res5 %res6 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 %mask) - %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6 -; CHECK: vpcmpordb %zmm1, %zmm0, %k0 {%k1} ## + %ret6 = add i64 %ret5, %res6 %res7 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 %mask) - %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7 - ret <8 x i64> %vec7 + %ret7 = add i64 %ret6, %res7 + ret i64 %ret7 } declare i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8>, <64 x i8>, i32, i64) nounwind readnone -define <8 x i64> @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) { -; CHECK_LABEL: test_ucmp_b_512 -; CHECK: vpcmpequb %zmm1, %zmm0, %k0 ## +define i64 @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) { +; AVX512BW-LABEL: test_ucmp_b_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpcmpequb %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: vpcmpltub %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovq %k0, %rcx +; AVX512BW-NEXT: addq %rax, %rcx +; AVX512BW-NEXT: vpcmpleub %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: addq %rcx, %rax +; AVX512BW-NEXT: vpcmpunordub %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovq %k0, %rcx +; AVX512BW-NEXT: addq %rax, %rcx +; AVX512BW-NEXT: vpcmpnequb %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: addq %rcx, %rax +; AVX512BW-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovq %k0, %rcx +; AVX512BW-NEXT: addq %rax, %rcx +; AVX512BW-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovq %k0, %rdx +; AVX512BW-NEXT: addq %rcx, %rdx +; 
AVX512BW-NEXT: vpcmpordub %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: addq %rdx, %rax +; AVX512BW-NEXT: retq %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1) - %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0 -; CHECK: vpcmpltub %zmm1, %zmm0, %k0 ## %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1) - %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1 -; CHECK: vpcmpleub %zmm1, %zmm0, %k0 ## + %ret1 = add i64 %res0, %res1 %res2 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 -1) - %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2 -; CHECK: vpcmpunordub %zmm1, %zmm0, %k0 ## + %ret2 = add i64 %ret1, %res2 %res3 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 -1) - %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3 -; CHECK: vpcmpnequb %zmm1, %zmm0, %k0 ## + %ret3 = add i64 %ret2, %res3 %res4 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 -1) - %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4 -; CHECK: vpcmpnltub %zmm1, %zmm0, %k0 ## + %ret4 = add i64 %ret3, %res4 %res5 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 -1) - %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5 -; CHECK: vpcmpnleub %zmm1, %zmm0, %k0 ## + %ret5 = add i64 %ret4, %res5 %res6 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 -1) - %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6 -; CHECK: vpcmpordub %zmm1, %zmm0, %k0 ## + %ret6 = add i64 %ret5, %res6 %res7 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 -1) - %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7 - ret <8 x i64> %vec7 -} - -define <8 x i64> @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { -; CHECK_LABEL: test_mask_ucmp_b_512 -; 
CHECK: vpcmpequb %zmm1, %zmm0, %k0 {%k1} ## + %ret7 = add i64 %ret6, %res7 + ret i64 %ret7 +} + +define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { +; AVX512BW-LABEL: test_mask_x86_avx512_ucmp_b_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovq %rdi, %k1 +; AVX512BW-NEXT: vpcmpequb %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: vpcmpltub %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovq %k0, %rcx +; AVX512BW-NEXT: addq %rax, %rcx +; AVX512BW-NEXT: vpcmpleub %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: addq %rcx, %rax +; AVX512BW-NEXT: vpcmpunordub %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovq %k0, %rcx +; AVX512BW-NEXT: addq %rax, %rcx +; AVX512BW-NEXT: vpcmpnequb %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: addq %rcx, %rax +; AVX512BW-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovq %k0, %rcx +; AVX512BW-NEXT: addq %rax, %rcx +; AVX512BW-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovq %k0, %rdx +; AVX512BW-NEXT: addq %rcx, %rdx +; AVX512BW-NEXT: vpcmpordub %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: addq %rdx, %rax +; AVX512BW-NEXT: retq %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask) - %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0 -; CHECK: vpcmpltub %zmm1, %zmm0, %k0 {%k1} ## %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask) - %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1 -; CHECK: vpcmpleub %zmm1, %zmm0, %k0 {%k1} ## + %ret1 = add i64 %res0, %res1 %res2 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 %mask) - %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2 -; CHECK: vpcmpunordub %zmm1, %zmm0, %k0 {%k1} ## + %ret2 = add i64 %ret1, %res2 %res3 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> 
%a1, i32 3, i64 %mask) - %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3 -; CHECK: vpcmpnequb %zmm1, %zmm0, %k0 {%k1} ## + %ret3 = add i64 %ret2, %res3 %res4 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 %mask) - %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4 -; CHECK: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} ## + %ret4 = add i64 %ret3, %res4 %res5 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 %mask) - %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5 -; CHECK: vpcmpnleub %zmm1, %zmm0, %k0 {%k1} ## + %ret5 = add i64 %ret4, %res5 %res6 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 %mask) - %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6 -; CHECK: vpcmpordub %zmm1, %zmm0, %k0 {%k1} ## + %ret6 = add i64 %ret5, %res6 %res7 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 %mask) - %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7 - ret <8 x i64> %vec7 + %ret7 = add i64 %ret6, %res7 + ret i64 %ret7 } declare i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8>, <64 x i8>, i32, i64) nounwind readnone -define <8 x i32> @test_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1) { -; CHECK_LABEL: test_cmp_w_512 -; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 ## +define i32 @test_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1) { +; AVX512BW-LABEL: test_cmp_w_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: vpcmpltw %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %ecx +; AVX512BW-NEXT: addl %eax, %ecx +; AVX512BW-NEXT: vpcmplew %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: addl %ecx, %eax +; AVX512BW-NEXT: vpcmpunordw %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %ecx +; AVX512BW-NEXT: addl %eax, %ecx +; AVX512BW-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: addl %ecx, %eax +; AVX512BW-NEXT: 
vpcmpnltw %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %ecx +; AVX512BW-NEXT: addl %eax, %ecx +; AVX512BW-NEXT: vpcmpnlew %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %edx +; AVX512BW-NEXT: addl %ecx, %edx +; AVX512BW-NEXT: vpcmpordw %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: addl %edx, %eax +; AVX512BW-NEXT: retq %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1) - %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0 -; CHECK: vpcmpltw %zmm1, %zmm0, %k0 ## %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1) - %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1 -; CHECK: vpcmplew %zmm1, %zmm0, %k0 ## + %ret1 = add i32 %res0, %res1 %res2 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 -1) - %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2 -; CHECK: vpcmpunordw %zmm1, %zmm0, %k0 ## + %ret2 = add i32 %ret1, %res2 %res3 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 -1) - %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3 -; CHECK: vpcmpneqw %zmm1, %zmm0, %k0 ## + %ret3 = add i32 %ret2, %res3 %res4 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 -1) - %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4 -; CHECK: vpcmpnltw %zmm1, %zmm0, %k0 ## + %ret4 = add i32 %ret3, %res4 %res5 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 -1) - %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5 -; CHECK: vpcmpnlew %zmm1, %zmm0, %k0 ## + %ret5 = add i32 %ret4, %res5 %res6 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 -1) - %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6 -; CHECK: vpcmpordw %zmm1, %zmm0, %k0 ## + %ret6 = add i32 %ret5, %res6 %res7 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 -1) - 
%vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7 - ret <8 x i32> %vec7 -} - -define <8 x i32> @test_mask_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { -; CHECK_LABEL: test_mask_cmp_w_512 -; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} ## + %ret7 = add i32 %ret6, %res7 + ret i32 %ret7 +} + +define i32 @test_mask_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { +; AVX512BW-LABEL: test_mask_cmp_w_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: vpcmpltw %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovd %k0, %ecx +; AVX512BW-NEXT: addl %eax, %ecx +; AVX512BW-NEXT: vpcmplew %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: addl %ecx, %eax +; AVX512BW-NEXT: vpcmpunordw %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovd %k0, %ecx +; AVX512BW-NEXT: addl %eax, %ecx +; AVX512BW-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: addl %ecx, %eax +; AVX512BW-NEXT: vpcmpnltw %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovd %k0, %ecx +; AVX512BW-NEXT: addl %eax, %ecx +; AVX512BW-NEXT: vpcmpnlew %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovd %k0, %edx +; AVX512BW-NEXT: addl %ecx, %edx +; AVX512BW-NEXT: vpcmpordw %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: addl %edx, %eax +; AVX512BW-NEXT: retq %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask) - %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0 -; CHECK: vpcmpltw %zmm1, %zmm0, %k0 {%k1} ## %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask) - %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1 -; CHECK: vpcmplew %zmm1, %zmm0, %k0 {%k1} ## + %ret1 = add i32 %res0, %res1 %res2 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 %mask) - %vec2 = insertelement <8 x i32> %vec1, 
i32 %res2, i32 2 -; CHECK: vpcmpunordw %zmm1, %zmm0, %k0 {%k1} ## + %ret2 = add i32 %ret1, %res2 %res3 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 %mask) - %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3 -; CHECK: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} ## + %ret3 = add i32 %ret2, %res3 %res4 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 %mask) - %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4 -; CHECK: vpcmpnltw %zmm1, %zmm0, %k0 {%k1} ## + %ret4 = add i32 %ret3, %res4 %res5 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 %mask) - %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5 -; CHECK: vpcmpnlew %zmm1, %zmm0, %k0 {%k1} ## + %ret5 = add i32 %ret4, %res5 %res6 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 %mask) - %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6 -; CHECK: vpcmpordw %zmm1, %zmm0, %k0 {%k1} ## + %ret6 = add i32 %ret5, %res6 %res7 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 %mask) - %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7 - ret <8 x i32> %vec7 + %ret7 = add i32 %ret6, %res7 + ret i32 %ret7 } declare i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16>, <32 x i16>, i32, i32) nounwind readnone -define <8 x i32> @test_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1) { -; CHECK_LABEL: test_ucmp_w_512 -; CHECK: vpcmpequw %zmm1, %zmm0, %k0 ## +define i32 @test_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1) { +; AVX512BW-LABEL: test_ucmp_w_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpcmpequw %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %ecx +; AVX512BW-NEXT: addl %eax, %ecx +; AVX512BW-NEXT: vpcmpleuw %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: addl %ecx, %eax +; AVX512BW-NEXT: vpcmpunorduw %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: 
kmovd %k0, %ecx +; AVX512BW-NEXT: addl %eax, %ecx +; AVX512BW-NEXT: vpcmpnequw %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: addl %ecx, %eax +; AVX512BW-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %ecx +; AVX512BW-NEXT: addl %eax, %ecx +; AVX512BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %edx +; AVX512BW-NEXT: addl %ecx, %edx +; AVX512BW-NEXT: vpcmporduw %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: addl %edx, %eax +; AVX512BW-NEXT: retq %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1) - %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0 -; CHECK: vpcmpltuw %zmm1, %zmm0, %k0 ## %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1) - %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1 -; CHECK: vpcmpleuw %zmm1, %zmm0, %k0 ## + %ret1 = add i32 %res0, %res1 %res2 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 -1) - %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2 -; CHECK: vpcmpunorduw %zmm1, %zmm0, %k0 ## + %ret2 = add i32 %ret1, %res2 %res3 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 -1) - %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3 -; CHECK: vpcmpnequw %zmm1, %zmm0, %k0 ## + %ret3 = add i32 %ret2, %res3 %res4 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 -1) - %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4 -; CHECK: vpcmpnltuw %zmm1, %zmm0, %k0 ## + %ret4 = add i32 %ret3, %res4 %res5 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 -1) - %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5 -; CHECK: vpcmpnleuw %zmm1, %zmm0, %k0 ## + %ret5 = add i32 %ret4, %res5 %res6 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 -1) - %vec6 = insertelement <8 x 
i32> %vec5, i32 %res6, i32 6 -; CHECK: vpcmporduw %zmm1, %zmm0, %k0 ## + %ret6 = add i32 %ret5, %res6 %res7 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 -1) - %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7 - ret <8 x i32> %vec7 -} - -define <8 x i32> @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { -; CHECK_LABEL: test_mask_ucmp_w_512 -; CHECK: vpcmpequw %zmm1, %zmm0, %k0 {%k1} ## + %ret7 = add i32 %ret6, %res7 + ret i32 %ret7 +} + +define i32 @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { +; AVX512BW-LABEL: test_mask_ucmp_w_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpcmpequw %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovd %k0, %ecx +; AVX512BW-NEXT: addl %eax, %ecx +; AVX512BW-NEXT: vpcmpleuw %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: addl %ecx, %eax +; AVX512BW-NEXT: vpcmpunorduw %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovd %k0, %ecx +; AVX512BW-NEXT: addl %eax, %ecx +; AVX512BW-NEXT: vpcmpnequw %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: addl %ecx, %eax +; AVX512BW-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovd %k0, %ecx +; AVX512BW-NEXT: addl %eax, %ecx +; AVX512BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovd %k0, %edx +; AVX512BW-NEXT: addl %ecx, %edx +; AVX512BW-NEXT: vpcmporduw %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: addl %edx, %eax +; AVX512BW-NEXT: retq %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask) - %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0 -; CHECK: vpcmpltuw %zmm1, %zmm0, %k0 {%k1} ## %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask) - %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1 
-; CHECK: vpcmpleuw %zmm1, %zmm0, %k0 {%k1} ## + %ret1 = add i32 %res0, %res1 %res2 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 %mask) - %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2 -; CHECK: vpcmpunorduw %zmm1, %zmm0, %k0 {%k1} ## + %ret2 = add i32 %ret1, %res2 %res3 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 %mask) - %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3 -; CHECK: vpcmpnequw %zmm1, %zmm0, %k0 {%k1} ## + %ret3 = add i32 %ret2, %res3 %res4 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 %mask) - %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4 -; CHECK: vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} ## + %ret4 = add i32 %ret3, %res4 %res5 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 %mask) - %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5 -; CHECK: vpcmpnleuw %zmm1, %zmm0, %k0 {%k1} ## + %ret5 = add i32 %ret4, %res5 %res6 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 %mask) - %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6 -; CHECK: vpcmporduw %zmm1, %zmm0, %k0 {%k1} ## + %ret6 = add i32 %ret5, %res6 %res7 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 %mask) - %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7 - ret <8 x i32> %vec7 + %ret7 = add i32 %ret6, %res7 + ret i32 %ret7 } declare i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16>, <32 x i16>, i32, i32) nounwind readnone declare <64 x i8> @llvm.x86.avx512.mask.blend.b.512(<64 x i8>, <64 x i8>, i64) nounwind readonly -; CHECK-LABEL: test_x86_mask_blend_w_512 define <32 x i16> @test_x86_mask_blend_w_512(i32 %mask, <32 x i16> %a1, <32 x i16> %a2) { - ; CHECK: vpblendmw - %res = call <32 x i16> @llvm.x86.avx512.mask.blend.w.512(<32 x i16> %a1, <32 x i16> %a2, i32 %mask) ; <<32 x i16>> [#uses=1] +; AVX512BW-LABEL: 
test_x86_mask_blend_w_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpblendmw %zmm1, %zmm0, %zmm0 {%k1} +; AVX512BW-NEXT: retq + %res = call <32 x i16> @llvm.x86.avx512.mask.blend.w.512(<32 x i16> %a1, <32 x i16> %a2, i32 %mask) ; <<32 x i16>> [#uses=1] ret <32 x i16> %res } declare <32 x i16> @llvm.x86.avx512.mask.blend.w.512(<32 x i16>, <32 x i16>, i32) nounwind readonly -; CHECK-LABEL: test_x86_mask_blend_b_512 -; CHECK: vpblendmb define <64 x i8> @test_x86_mask_blend_b_512(i64 %a0, <64 x i8> %a1, <64 x i8> %a2) { +; AVX512BW-LABEL: test_x86_mask_blend_b_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovq %rdi, %k1 +; AVX512BW-NEXT: vpblendmb %zmm1, %zmm0, %zmm0 {%k1} +; AVX512BW-NEXT: retq %res = call <64 x i8> @llvm.x86.avx512.mask.blend.b.512(<64 x i8> %a1, <64 x i8> %a2, i64 %a0) ; <<64 x i8>> [#uses=1] ret <64 x i8> %res } define <32 x i16> @test_mask_packs_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) { - ;CHECK-LABEL: test_mask_packs_epi32_rr_512 - ;CHECK: vpackssdw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0x6b,0xc1] +; AVX512BW-LABEL: test_mask_packs_epi32_rr_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %res } define <32 x i16> @test_mask_packs_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) { - ;CHECK-LABEL: test_mask_packs_epi32_rrk_512 - ;CHECK: vpackssdw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x6b,0xd1] +; AVX512BW-LABEL: test_mask_packs_epi32_rrk_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovaps %zmm2, %zmm0 +; AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res } define <32 
x i16> @test_mask_packs_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) { - ;CHECK-LABEL: test_mask_packs_epi32_rrkz_512 - ;CHECK: vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x6b,0xc1] +; AVX512BW-LABEL: test_mask_packs_epi32_rrkz_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask) ret <32 x i16> %res } define <32 x i16> @test_mask_packs_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) { - ;CHECK-LABEL: test_mask_packs_epi32_rm_512 - ;CHECK: vpackssdw (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0x6b,0x07] +; AVX512BW-LABEL: test_mask_packs_epi32_rm_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpackssdw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: retq %b = load <16 x i32>, <16 x i32>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %res } define <32 x i16> @test_mask_packs_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <32 x i16> %passThru, i32 %mask) { - ;CHECK-LABEL: test_mask_packs_epi32_rmk_512 - ;CHECK: vpackssdw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x6b,0x0f] +; AVX512BW-LABEL: test_mask_packs_epi32_rmk_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpackssdw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: retq %b = load <16 x i32>, <16 x i32>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res } define <32 x i16> @test_mask_packs_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i32 %mask) { - ;CHECK-LABEL: test_mask_packs_epi32_rmkz_512 - ;CHECK: vpackssdw (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: 
[0x62,0xf1,0x7d,0xc9,0x6b,0x07] +; AVX512BW-LABEL: test_mask_packs_epi32_rmkz_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpackssdw (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq %b = load <16 x i32>, <16 x i32>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask) ret <32 x i16> %res } define <32 x i16> @test_mask_packs_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) { - ;CHECK-LABEL: test_mask_packs_epi32_rmb_512 - ;CHECK: vpackssdw (%rdi){1to16}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x58,0x6b,0x07] +; AVX512BW-LABEL: test_mask_packs_epi32_rmb_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpackssdw (%rdi){1to16}, %zmm0, %zmm0 +; AVX512BW-NEXT: retq %q = load i32, i32* %ptr_b %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer @@ -377,8 +565,12 @@ define <32 x i16> @test_mask_packs_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) { } define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) { - ;CHECK-LABEL: test_mask_packs_epi32_rmbk_512 - ;CHECK: vpackssdw (%rdi){1to16}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x59,0x6b,0x0f] +; AVX512BW-LABEL: test_mask_packs_epi32_rmbk_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpackssdw (%rdi){1to16}, %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: retq %q = load i32, i32* %ptr_b %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer @@ -387,8 +579,11 @@ define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <3 } define <32 x i16> @test_mask_packs_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) { - ;CHECK-LABEL: test_mask_packs_epi32_rmbkz_512 - ;CHECK: vpackssdw (%rdi){1to16}, 
%zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xd9,0x6b,0x07] +; AVX512BW-LABEL: test_mask_packs_epi32_rmbkz_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpackssdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq %q = load i32, i32* %ptr_b %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer @@ -399,45 +594,63 @@ define <32 x i16> @test_mask_packs_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i declare <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32>, <16 x i32>, <32 x i16>, i32) define <64 x i8> @test_mask_packs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { - ;CHECK-LABEL: test_mask_packs_epi16_rr_512 - ;CHECK: vpacksswb %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x63,0xc1] +; AVX512BW-LABEL: test_mask_packs_epi16_rr_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpacksswb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 -1) ret <64 x i8> %res } define <64 x i8> @test_mask_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) { - ;CHECK-LABEL: test_mask_packs_epi16_rrk_512 - ;CHECK: vpacksswb %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x63,0xd1] +; AVX512BW-LABEL: test_mask_packs_epi16_rrk_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovq %rdi, %k1 +; AVX512BW-NEXT: vpacksswb %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovaps %zmm2, %zmm0 +; AVX512BW-NEXT: retq %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) ret <64 x i8> %res } define <64 x i8> @test_mask_packs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) { - ;CHECK-LABEL: test_mask_packs_epi16_rrkz_512 - ;CHECK: vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x63,0xc1] +; AVX512BW-LABEL: 
test_mask_packs_epi16_rrkz_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovq %rdi, %k1 +; AVX512BW-NEXT: vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 %mask) ret <64 x i8> %res } define <64 x i8> @test_mask_packs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { - ;CHECK-LABEL: test_mask_packs_epi16_rm_512 - ;CHECK: vpacksswb (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x63,0x07] +; AVX512BW-LABEL: test_mask_packs_epi16_rm_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpacksswb (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: retq %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 -1) ret <64 x i8> %res } define <64 x i8> @test_mask_packs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <64 x i8> %passThru, i64 %mask) { - ;CHECK-LABEL: test_mask_packs_epi16_rmk_512 - ;CHECK: vpacksswb (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x63,0x0f] +; AVX512BW-LABEL: test_mask_packs_epi16_rmk_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovq %rsi, %k1 +; AVX512BW-NEXT: vpacksswb (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: retq %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) ret <64 x i8> %res } define <64 x i8> @test_mask_packs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i64 %mask) { - ;CHECK-LABEL: test_mask_packs_epi16_rmkz_512 - ;CHECK: vpacksswb (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x63,0x07] +; AVX512BW-LABEL: test_mask_packs_epi16_rmkz_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovq %rsi, %k1 +; AVX512BW-NEXT: vpacksswb (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call 
<64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 %mask) ret <64 x i8> %res @@ -447,53 +660,73 @@ declare <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16>, <32 x i16>, <64 define <32 x i16> @test_mask_packus_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) { - ;CHECK-LABEL: test_mask_packus_epi32_rr_512 - ;CHECK: vpackusdw %zmm1, %zmm0, %zmm0 +; AVX512BW-LABEL: test_mask_packus_epi32_rr_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %res } define <32 x i16> @test_mask_packus_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) { - ;CHECK-LABEL: test_mask_packus_epi32_rrk_512 - ;CHECK: vpackusdw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-LABEL: test_mask_packus_epi32_rrk_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovaps %zmm2, %zmm0 +; AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res } define <32 x i16> @test_mask_packus_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) { - ;CHECK-LABEL: test_mask_packus_epi32_rrkz_512 - ;CHECK: vpackusdw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-LABEL: test_mask_packus_epi32_rrkz_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask) ret <32 x i16> %res } define <32 x i16> @test_mask_packus_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) { - ;CHECK-LABEL: test_mask_packus_epi32_rm_512 - ;CHECK: vpackusdw (%rdi), %zmm0, %zmm0 +; AVX512BW-LABEL: 
test_mask_packus_epi32_rm_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpackusdw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: retq %b = load <16 x i32>, <16 x i32>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %res } define <32 x i16> @test_mask_packus_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <32 x i16> %passThru, i32 %mask) { - ;CHECK-LABEL: test_mask_packus_epi32_rmk_512 - ;CHECK: vpackusdw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-LABEL: test_mask_packus_epi32_rmk_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpackusdw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: retq %b = load <16 x i32>, <16 x i32>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res } define <32 x i16> @test_mask_packus_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i32 %mask) { - ;CHECK-LABEL: test_mask_packus_epi32_rmkz_512 - ;CHECK: vpackusdw (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-LABEL: test_mask_packus_epi32_rmkz_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpackusdw (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq %b = load <16 x i32>, <16 x i32>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask) ret <32 x i16> %res } define <32 x i16> @test_mask_packus_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) { - ;CHECK-LABEL: test_mask_packus_epi32_rmb_512 - ;CHECK: vpackusdw (%rdi){1to16}, %zmm0, %zmm0 +; AVX512BW-LABEL: test_mask_packus_epi32_rmb_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpackusdw (%rdi){1to16}, %zmm0, %zmm0 +; AVX512BW-NEXT: retq %q = load i32, i32* %ptr_b %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> 
zeroinitializer @@ -502,8 +735,12 @@ define <32 x i16> @test_mask_packus_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) { } define <32 x i16> @test_mask_packus_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) { - ;CHECK-LABEL: test_mask_packus_epi32_rmbk_512 - ;CHECK: vpackusdw (%rdi){1to16}, %zmm0, %zmm1 {%k1} +; AVX512BW-LABEL: test_mask_packus_epi32_rmbk_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpackusdw (%rdi){1to16}, %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: retq %q = load i32, i32* %ptr_b %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer @@ -512,8 +749,11 @@ define <32 x i16> @test_mask_packus_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, < } define <32 x i16> @test_mask_packus_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) { - ;CHECK-LABEL: test_mask_packus_epi32_rmbkz_512 - ;CHECK: vpackusdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-LABEL: test_mask_packus_epi32_rmbkz_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpackusdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq %q = load i32, i32* %ptr_b %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer @@ -524,45 +764,63 @@ define <32 x i16> @test_mask_packus_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, declare <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32>, <16 x i32>, <32 x i16>, i32) define <64 x i8> @test_mask_packus_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { - ;CHECK-LABEL: test_mask_packus_epi16_rr_512 - ;CHECK: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512BW-LABEL: test_mask_packus_epi16_rr_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x 
i16> %b, <64 x i8> zeroinitializer, i64 -1) ret <64 x i8> %res } define <64 x i8> @test_mask_packus_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) { - ;CHECK-LABEL: test_mask_packus_epi16_rrk_512 - ;CHECK: vpackuswb %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-LABEL: test_mask_packus_epi16_rrk_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovq %rdi, %k1 +; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovaps %zmm2, %zmm0 +; AVX512BW-NEXT: retq %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) ret <64 x i8> %res } define <64 x i8> @test_mask_packus_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) { - ;CHECK-LABEL: test_mask_packus_epi16_rrkz_512 - ;CHECK: vpackuswb %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-LABEL: test_mask_packus_epi16_rrkz_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovq %rdi, %k1 +; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 %mask) ret <64 x i8> %res } define <64 x i8> @test_mask_packus_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { - ;CHECK-LABEL: test_mask_packus_epi16_rm_512 - ;CHECK: vpackuswb (%rdi), %zmm0, %zmm0 +; AVX512BW-LABEL: test_mask_packus_epi16_rm_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpackuswb (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: retq %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 -1) ret <64 x i8> %res } define <64 x i8> @test_mask_packus_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <64 x i8> %passThru, i64 %mask) { - ;CHECK-LABEL: test_mask_packus_epi16_rmk_512 - ;CHECK: vpackuswb (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-LABEL: test_mask_packus_epi16_rmk_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovq %rsi, %k1 +; AVX512BW-NEXT: 
vpackuswb (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: retq %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) ret <64 x i8> %res } define <64 x i8> @test_mask_packus_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i64 %mask) { - ;CHECK-LABEL: test_mask_packus_epi16_rmkz_512 - ;CHECK: vpackuswb (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-LABEL: test_mask_packus_epi16_rmkz_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovq %rsi, %k1 +; AVX512BW-NEXT: vpackuswb (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 %mask) ret <64 x i8> %res @@ -571,45 +829,63 @@ define <64 x i8> @test_mask_packus_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %pt declare <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16>, <32 x i16>, <64 x i8>, i64) define <32 x i16> @test_mask_adds_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { - ;CHECK-LABEL: test_mask_adds_epi16_rr_512 - ;CHECK: vpaddsw %zmm1, %zmm0, %zmm0 +; AVX512BW-LABEL: test_mask_adds_epi16_rr_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %res } define <32 x i16> @test_mask_adds_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { - ;CHECK-LABEL: test_mask_adds_epi16_rrk_512 - ;CHECK: vpaddsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-LABEL: test_mask_adds_epi16_rrk_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovaps %zmm2, %zmm0 +; AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> 
%b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res } define <32 x i16> @test_mask_adds_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { - ;CHECK-LABEL: test_mask_adds_epi16_rrkz_512 - ;CHECK: vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-LABEL: test_mask_adds_epi16_rrkz_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) ret <32 x i16> %res } define <32 x i16> @test_mask_adds_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { - ;CHECK-LABEL: test_mask_adds_epi16_rm_512 - ;CHECK: vpaddsw (%rdi), %zmm0, %zmm0 +; AVX512BW-LABEL: test_mask_adds_epi16_rm_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: retq %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %res } define <32 x i16> @test_mask_adds_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { - ;CHECK-LABEL: test_mask_adds_epi16_rmk_512 - ;CHECK: vpaddsw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-LABEL: test_mask_adds_epi16_rmk_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: retq %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res } define <32 x i16> @test_mask_adds_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { - ;CHECK-LABEL: test_mask_adds_epi16_rmkz_512 - ;CHECK: vpaddsw (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-LABEL: test_mask_adds_epi16_rmkz_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: 
vpaddsw (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) ret <32 x i16> %res @@ -618,45 +894,63 @@ define <32 x i16> @test_mask_adds_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr declare <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) define <32 x i16> @test_mask_subs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { - ;CHECK-LABEL: test_mask_subs_epi16_rr_512 - ;CHECK: vpsubsw %zmm1, %zmm0, %zmm0 +; AVX512BW-LABEL: test_mask_subs_epi16_rr_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %res } define <32 x i16> @test_mask_subs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { - ;CHECK-LABEL: test_mask_subs_epi16_rrk_512 - ;CHECK: vpsubsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-LABEL: test_mask_subs_epi16_rrk_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovaps %zmm2, %zmm0 +; AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res } define <32 x i16> @test_mask_subs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { - ;CHECK-LABEL: test_mask_subs_epi16_rrkz_512 - ;CHECK: vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-LABEL: test_mask_subs_epi16_rrkz_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) ret <32 x i16> %res } define <32 x i16> 
@test_mask_subs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { - ;CHECK-LABEL: test_mask_subs_epi16_rm_512 - ;CHECK: vpsubsw (%rdi), %zmm0, %zmm0 +; AVX512BW-LABEL: test_mask_subs_epi16_rm_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: retq %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %res } define <32 x i16> @test_mask_subs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { - ;CHECK-LABEL: test_mask_subs_epi16_rmk_512 - ;CHECK: vpsubsw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-LABEL: test_mask_subs_epi16_rmk_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: retq %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res } define <32 x i16> @test_mask_subs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { - ;CHECK-LABEL: test_mask_subs_epi16_rmkz_512 - ;CHECK: vpsubsw (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-LABEL: test_mask_subs_epi16_rmkz_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) ret <32 x i16> %res @@ -665,45 +959,63 @@ define <32 x i16> @test_mask_subs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr declare <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) define <32 x i16> @test_mask_adds_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) { - ;CHECK-LABEL: test_mask_adds_epu16_rr_512 - ;CHECK: vpaddusw %zmm1, 
%zmm0, %zmm0 +; AVX512BW-LABEL: test_mask_adds_epu16_rr_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %res } define <32 x i16> @test_mask_adds_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { - ;CHECK-LABEL: test_mask_adds_epu16_rrk_512 - ;CHECK: vpaddusw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-LABEL: test_mask_adds_epu16_rrk_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovaps %zmm2, %zmm0 +; AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res } define <32 x i16> @test_mask_adds_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { - ;CHECK-LABEL: test_mask_adds_epu16_rrkz_512 - ;CHECK: vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-LABEL: test_mask_adds_epu16_rrkz_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) ret <32 x i16> %res } define <32 x i16> @test_mask_adds_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { - ;CHECK-LABEL: test_mask_adds_epu16_rm_512 - ;CHECK: vpaddusw (%rdi), %zmm0, %zmm0 +; AVX512BW-LABEL: test_mask_adds_epu16_rm_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: retq %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %res } define <32 x i16> @test_mask_adds_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) 
{ - ;CHECK-LABEL: test_mask_adds_epu16_rmk_512 - ;CHECK: vpaddusw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-LABEL: test_mask_adds_epu16_rmk_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: retq %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res } define <32 x i16> @test_mask_adds_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { - ;CHECK-LABEL: test_mask_adds_epu16_rmkz_512 - ;CHECK: vpaddusw (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-LABEL: test_mask_adds_epu16_rmkz_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) ret <32 x i16> %res @@ -712,45 +1024,63 @@ define <32 x i16> @test_mask_adds_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr declare <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) define <32 x i16> @test_mask_subs_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) { - ;CHECK-LABEL: test_mask_subs_epu16_rr_512 - ;CHECK: vpsubusw %zmm1, %zmm0, %zmm0 +; AVX512BW-LABEL: test_mask_subs_epu16_rr_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %res } define <32 x i16> @test_mask_subs_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { - ;CHECK-LABEL: test_mask_subs_epu16_rrk_512 - ;CHECK: vpsubusw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-LABEL: test_mask_subs_epu16_rrk_512: +; AVX512BW: ## BB#0: +; 
AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovaps %zmm2, %zmm0 +; AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res } define <32 x i16> @test_mask_subs_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { - ;CHECK-LABEL: test_mask_subs_epu16_rrkz_512 - ;CHECK: vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-LABEL: test_mask_subs_epu16_rrkz_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) ret <32 x i16> %res } define <32 x i16> @test_mask_subs_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { - ;CHECK-LABEL: test_mask_subs_epu16_rm_512 - ;CHECK: vpsubusw (%rdi), %zmm0, %zmm0 +; AVX512BW-LABEL: test_mask_subs_epu16_rm_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: retq %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %res } define <32 x i16> @test_mask_subs_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { - ;CHECK-LABEL: test_mask_subs_epu16_rmk_512 - ;CHECK: vpsubusw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-LABEL: test_mask_subs_epu16_rmk_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: retq %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res } define <32 x i16> @test_mask_subs_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* 
%ptr_b, i32 %mask) { - ;CHECK-LABEL: test_mask_subs_epu16_rmkz_512 - ;CHECK: vpsubusw (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-LABEL: test_mask_subs_epu16_rmkz_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) ret <32 x i16> %res @@ -760,11 +1090,14 @@ declare <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16>, <32 x i16>, <3 declare <64 x i8> @llvm.x86.avx512.mask.pmaxs.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) -; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_b_512 -; CHECK-NOT: call -; CHECK: vpmaxsb %zmm -; CHECK: {%k1} define <64 x i8>@test_int_x86_avx512_mask_pmaxs_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { +; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaxs_b_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovq %rdi, %k1 +; AVX512BW-NEXT: vpmaxsb %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: retq %res = call <64 x i8> @llvm.x86.avx512.mask.pmaxs.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) %res1 = call <64 x i8> @llvm.x86.avx512.mask.pmaxs.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) %res2 = add <64 x i8> %res, %res1 @@ -773,11 +1106,14 @@ define <64 x i8>@test_int_x86_avx512_mask_pmaxs_b_512(<64 x i8> %x0, <64 x i8> % declare <32 x i16> @llvm.x86.avx512.mask.pmaxs.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) -; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_w_512 -; CHECK-NOT: call -; CHECK: vpmaxsw %zmm -; CHECK: {%k1} define <32 x i16>@test_int_x86_avx512_mask_pmaxs_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { +; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaxs_w_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: 
vpmaxsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.mask.pmaxs.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmaxs.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) %res2 = add <32 x i16> %res, %res1 @@ -786,11 +1122,14 @@ define <32 x i16>@test_int_x86_avx512_mask_pmaxs_w_512(<32 x i16> %x0, <32 x i16 declare <64 x i8> @llvm.x86.avx512.mask.pmaxu.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) -; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_b_512 -; CHECK-NOT: call -; CHECK: vpmaxub %zmm -; CHECK: {%k1} define <64 x i8>@test_int_x86_avx512_mask_pmaxu_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { +; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaxu_b_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovq %rdi, %k1 +; AVX512BW-NEXT: vpmaxub %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: retq %res = call <64 x i8> @llvm.x86.avx512.mask.pmaxu.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) %res1 = call <64 x i8> @llvm.x86.avx512.mask.pmaxu.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) %res2 = add <64 x i8> %res, %res1 @@ -799,11 +1138,14 @@ define <64 x i8>@test_int_x86_avx512_mask_pmaxu_b_512(<64 x i8> %x0, <64 x i8> % declare <32 x i16> @llvm.x86.avx512.mask.pmaxu.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) -; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_w_512 -; CHECK-NOT: call -; CHECK: vpmaxuw %zmm -; CHECK: {%k1} define <32 x i16>@test_int_x86_avx512_mask_pmaxu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { +; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaxu_w_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpmaxuw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; 
AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.mask.pmaxu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmaxu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) %res2 = add <32 x i16> %res, %res1 @@ -812,11 +1154,14 @@ define <32 x i16>@test_int_x86_avx512_mask_pmaxu_w_512(<32 x i16> %x0, <32 x i16 declare <64 x i8> @llvm.x86.avx512.mask.pmins.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) -; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_b_512 -; CHECK-NOT: call -; CHECK: vpminsb %zmm -; CHECK: {%k1} define <64 x i8>@test_int_x86_avx512_mask_pmins_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { +; AVX512BW-LABEL: test_int_x86_avx512_mask_pmins_b_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovq %rdi, %k1 +; AVX512BW-NEXT: vpminsb %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: retq %res = call <64 x i8> @llvm.x86.avx512.mask.pmins.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) %res1 = call <64 x i8> @llvm.x86.avx512.mask.pmins.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) %res2 = add <64 x i8> %res, %res1 @@ -825,11 +1170,14 @@ define <64 x i8>@test_int_x86_avx512_mask_pmins_b_512(<64 x i8> %x0, <64 x i8> % declare <32 x i16> @llvm.x86.avx512.mask.pmins.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) -; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_w_512 -; CHECK-NOT: call -; CHECK: vpminsw %zmm -; CHECK: {%k1} define <32 x i16>@test_int_x86_avx512_mask_pmins_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { +; AVX512BW-LABEL: test_int_x86_avx512_mask_pmins_w_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpminsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: retq %res = call <32 x 
i16> @llvm.x86.avx512.mask.pmins.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmins.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) %res2 = add <32 x i16> %res, %res1 @@ -838,11 +1186,14 @@ define <32 x i16>@test_int_x86_avx512_mask_pmins_w_512(<32 x i16> %x0, <32 x i16 declare <64 x i8> @llvm.x86.avx512.mask.pminu.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) -; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_b_512 -; CHECK-NOT: call -; CHECK: vpminub %zmm -; CHECK: {%k1} define <64 x i8>@test_int_x86_avx512_mask_pminu_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { +; AVX512BW-LABEL: test_int_x86_avx512_mask_pminu_b_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovq %rdi, %k1 +; AVX512BW-NEXT: vpminub %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: retq %res = call <64 x i8> @llvm.x86.avx512.mask.pminu.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) %res1 = call <64 x i8> @llvm.x86.avx512.mask.pminu.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) %res2 = add <64 x i8> %res, %res1 @@ -851,11 +1202,14 @@ define <64 x i8>@test_int_x86_avx512_mask_pminu_b_512(<64 x i8> %x0, <64 x i8> % declare <32 x i16> @llvm.x86.avx512.mask.pminu.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) -; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_w_512 -; CHECK-NOT: call -; CHECK: vpminuw %zmm -; CHECK: {%k1} define <32 x i16>@test_int_x86_avx512_mask_pminu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { +; AVX512BW-LABEL: test_int_x86_avx512_mask_pminu_w_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpminuw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.mask.pminu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> 
%x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.pminu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) %res2 = add <32 x i16> %res, %res1 @@ -864,11 +1218,15 @@ define <32 x i16>@test_int_x86_avx512_mask_pminu_w_512(<32 x i16> %x0, <32 x i16 declare <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) -; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_hi_512 -; CHECK-NOT: call -; CHECK: kmov -; CHECK: vpermt2w %zmm{{.*}}{%k1} define <32 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { +; AVX512BW-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vmovaps %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm3, %zmm0 +; AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) %res2 = add <32 x i16> %res, %res1 @@ -877,11 +1235,15 @@ define <32 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32 declare <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) -; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_hi_512 -; CHECK-NOT: call -; CHECK: kmov -; CHECK: vpermt2w %zmm{{.*}}{%k1} {z} define <32 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { +; AVX512BW-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vmovaps %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 {%k1} {z} +; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm3, %zmm0 +; 
AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) %res2 = add <32 x i16> %res, %res1 @@ -890,11 +1252,15 @@ define <32 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_512(<32 x i16> %x0, <3 declare <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) -; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_hi_512 -; CHECK-NOT: call -; CHECK: kmov -; CHECK: vpermi2w %zmm{{.*}}{%k1} define <32 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { +; AVX512BW-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vmovaps %zmm1, %zmm3 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm3, %zmm0 +; AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) %res2 = add <32 x i16> %res, %res1 @@ -903,11 +1269,14 @@ define <32 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <32 declare <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) -; CHECK-LABEL: @test_int_x86_avx512_mask_pavg_b_512 -; CHECK-NOT: call -; CHECK: vpavgb %zmm -; CHECK: {%k1} define <64 x i8>@test_int_x86_avx512_mask_pavg_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { +; AVX512BW-LABEL: test_int_x86_avx512_mask_pavg_b_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovq %rdi, %k1 +; AVX512BW-NEXT: vpavgb %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpavgb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb 
%zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: retq %res = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) %res1 = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) %res2 = add <64 x i8> %res, %res1 @@ -916,11 +1285,14 @@ define <64 x i8>@test_int_x86_avx512_mask_pavg_b_512(<64 x i8> %x0, <64 x i8> %x declare <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) -; CHECK-LABEL: @test_int_x86_avx512_mask_pavg_w_512 -; CHECK-NOT: call -; CHECK: vpavgw %zmm -; CHECK: {%k1} define <32 x i16>@test_int_x86_avx512_mask_pavg_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { +; AVX512BW-LABEL: test_int_x86_avx512_mask_pavg_w_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpavgw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpavgw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) %res2 = add <32 x i16> %res, %res1 @@ -929,11 +1301,14 @@ define <32 x i16>@test_int_x86_avx512_mask_pavg_w_512(<32 x i16> %x0, <32 x i16> declare <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) -; CHECK-LABEL: @test_int_x86_avx512_mask_pshuf_b_512 -; CHECK-NOT: call -; CHECK: kmov -; CHECK: vpshufb %zmm{{.*}}{%k1} define <64 x i8>@test_int_x86_avx512_mask_pshuf_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { +; AVX512BW-LABEL: test_int_x86_avx512_mask_pshuf_b_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovq %rdi, %k1 +; AVX512BW-NEXT: vpshufb %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpshufb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: retq %res = call <64 x i8> 
@llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) %res1 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) %res2 = add <64 x i8> %res, %res1 @@ -942,11 +1317,14 @@ define <64 x i8>@test_int_x86_avx512_mask_pshuf_b_512(<64 x i8> %x0, <64 x i8> % declare <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16>, <32 x i16>, i32) -; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_w_512 -; CHECK-NOT: call -; CHECK: kmov -; CHECK: vpabsw{{.*}}{%k1} define <32 x i16>@test_int_x86_avx512_mask_pabs_w_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) { +; AVX512BW-LABEL: test_int_x86_avx512_mask_pabs_w_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpabsw %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vpabsw %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) %res1 = call <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 -1) %res2 = add <32 x i16> %res, %res1 @@ -955,11 +1333,14 @@ define <32 x i16>@test_int_x86_avx512_mask_pabs_w_512(<32 x i16> %x0, <32 x i16> declare <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8>, <64 x i8>, i64) -; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_b_512 -; CHECK-NOT: call -; CHECK: kmov -; CHECK: vpabsb{{.*}}{%k1} define <64 x i8>@test_int_x86_avx512_mask_pabs_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) { +; AVX512BW-LABEL: test_int_x86_avx512_mask_pabs_b_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovq %rdi, %k1 +; AVX512BW-NEXT: vpabsb %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vpabsb %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq %res = call <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) %res1 = call <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 -1) %res2 = add <64 x i8> %res, %res1 @@ 
-968,12 +1349,14 @@ define <64 x i8>@test_int_x86_avx512_mask_pabs_b_512(<64 x i8> %x0, <64 x i8> %x declare <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) -; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhu_w_512 -; CHECK-NOT: call -; CHECK: kmov -; CHECK: {%k1} -; CHECK: vpmulhuw {{.*}}encoding: [0x62 define <32 x i16>@test_int_x86_avx512_mask_pmulhu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { +; AVX512BW-LABEL: test_int_x86_avx512_mask_pmulhu_w_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpmulhuw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpmulhuw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) %res2 = add <32 x i16> %res, %res1 @@ -982,12 +1365,14 @@ define <32 x i16>@test_int_x86_avx512_mask_pmulhu_w_512(<32 x i16> %x0, <32 x i1 declare <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) -; CHECK-LABEL: @test_int_x86_avx512_mask_pmulh_w_512 -; CHECK-NOT: call -; CHECK: kmov -; CHECK: {%k1} -; CHECK: vpmulhw {{.*}}encoding: [0x62 define <32 x i16>@test_int_x86_avx512_mask_pmulh_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { +; AVX512BW-LABEL: test_int_x86_avx512_mask_pmulh_w_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpmulhw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpmulhw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) %res2 = add <32 x i16> %res, 
%res1 @@ -996,12 +1381,14 @@ define <32 x i16>@test_int_x86_avx512_mask_pmulh_w_512(<32 x i16> %x0, <32 x i16 declare <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) -; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhr_sw_512 -; CHECK-NOT: call -; CHECK: kmov -; CHECK: {%k1} -; CHECK: vpmulhrsw {{.*}}encoding: [0x62 define <32 x i16>@test_int_x86_avx512_mask_pmulhr_sw_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { +; AVX512BW-LABEL: test_int_x86_avx512_mask_pmulhr_sw_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpmulhrsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpmulhrsw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) %res2 = add <32 x i16> %res, %res1 @@ -1011,10 +1398,15 @@ define <32 x i16>@test_int_x86_avx512_mask_pmulhr_sw_512(<32 x i16> %x0, <32 x i declare <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16>, <32 x i8>, i32) define <32 x i8>@test_int_x86_avx512_mask_pmov_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) { -; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_512: -; CHECK: vpmovwb %zmm0, %ymm1 {%k1} -; CHECK-NEXT: vpmovwb %zmm0, %ymm2 {%k1} {z} -; CHECK-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-LABEL: test_int_x86_avx512_mask_pmov_wb_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1 {%k1} +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm2 {%k1} {z} +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512BW-NEXT: retq %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1) %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x 
i16> %x0, <32 x i8> %x1, i32 %x2) %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2) @@ -1026,9 +1418,12 @@ define <32 x i8>@test_int_x86_avx512_mask_pmov_wb_512(<32 x i16> %x0, <32 x i8> declare void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16>, i32) define void @test_int_x86_avx512_mask_pmov_wb_mem_512(i8* %ptr, <32 x i16> %x1, i32 %x2) { -; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_512: -; CHECK: vpmovwb %zmm0, (%rdi) -; CHECK: vpmovwb %zmm0, (%rdi) {%k1} +; AVX512BW-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpmovwb %zmm0, (%rdi) +; AVX512BW-NEXT: vpmovwb %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: retq call void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 -1) call void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 %x2) ret void @@ -1037,10 +1432,15 @@ define void @test_int_x86_avx512_mask_pmov_wb_mem_512(i8* %ptr, <32 x i16> %x1, declare <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16>, <32 x i8>, i32) define <32 x i8>@test_int_x86_avx512_mask_pmovs_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) { -; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_512: -; CHECK: vpmovswb %zmm0, %ymm1 {%k1} -; CHECK-NEXT: vpmovswb %zmm0, %ymm2 {%k1} {z} -; CHECK-NEXT: vpmovswb %zmm0, %ymm0 +; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovs_wb_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpmovswb %zmm0, %ymm1 {%k1} +; AVX512BW-NEXT: vpmovswb %zmm0, %ymm2 {%k1} {z} +; AVX512BW-NEXT: vpmovswb %zmm0, %ymm0 +; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512BW-NEXT: retq %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1) %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) %res2 = call <32 x i8> 
@llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2) @@ -1052,9 +1452,12 @@ define <32 x i8>@test_int_x86_avx512_mask_pmovs_wb_512(<32 x i16> %x0, <32 x i8> declare void @llvm.x86.avx512.mask.pmovs.wb.mem.512(i8* %ptr, <32 x i16>, i32) define void @test_int_x86_avx512_mask_pmovs_wb_mem_512(i8* %ptr, <32 x i16> %x1, i32 %x2) { -; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_512: -; CHECK: vpmovswb %zmm0, (%rdi) -; CHECK: vpmovswb %zmm0, (%rdi) {%k1} +; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpmovswb %zmm0, (%rdi) +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpmovswb %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: retq call void @llvm.x86.avx512.mask.pmovs.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 -1) call void @llvm.x86.avx512.mask.pmovs.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 %x2) ret void @@ -1063,10 +1466,15 @@ define void @test_int_x86_avx512_mask_pmovs_wb_mem_512(i8* %ptr, <32 x i16> %x1, declare <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16>, <32 x i8>, i32) define <32 x i8>@test_int_x86_avx512_mask_pmovus_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) { -; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_512: -; CHECK: vpmovuswb %zmm0, %ymm1 {%k1} -; CHECK-NEXT: vpmovuswb %zmm0, %ymm2 {%k1} {z} -; CHECK-NEXT: vpmovuswb %zmm0, %ymm0 +; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovus_wb_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpmovuswb %zmm0, %ymm1 {%k1} +; AVX512BW-NEXT: vpmovuswb %zmm0, %ymm2 {%k1} {z} +; AVX512BW-NEXT: vpmovuswb %zmm0, %ymm0 +; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512BW-NEXT: retq %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1) %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> 
%x0, <32 x i8> zeroinitializer, i32 %x2) @@ -1078,9 +1486,12 @@ define <32 x i8>@test_int_x86_avx512_mask_pmovus_wb_512(<32 x i16> %x0, <32 x i8 declare void @llvm.x86.avx512.mask.pmovus.wb.mem.512(i8* %ptr, <32 x i16>, i32) define void @test_int_x86_avx512_mask_pmovus_wb_mem_512(i8* %ptr, <32 x i16> %x1, i32 %x2) { -; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_512: -; CHECK: vpmovuswb %zmm0, (%rdi) -; CHECK: vpmovuswb %zmm0, (%rdi) {%k1} +; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpmovuswb %zmm0, (%rdi) +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpmovuswb %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: retq call void @llvm.x86.avx512.mask.pmovus.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 -1) call void @llvm.x86.avx512.mask.pmovus.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 %x2) ret void @@ -1089,13 +1500,13 @@ define void @test_int_x86_avx512_mask_pmovus_wb_mem_512(i8* %ptr, <32 x i16> %x1 declare <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8>, <64 x i8>, <32 x i16>, i32) define <32 x i16>@test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 %x3) { -; CHECK-LABEL: test_int_x86_avx512_mask_pmaddubs_w_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] -; CHECK-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vpaddw %zmm0, %zmm2, %zmm0 -; CHECK-NEXT: retq +; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaddubs_w_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 -1) 
%res2 = add <32 x i16> %res, %res1 @@ -1105,13 +1516,13 @@ define <32 x i16>@test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x i declare <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16>, <32 x i16>, <16 x i32>, i16) define <16 x i32>@test_int_x86_avx512_mask_pmaddw_d_512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 %x3) { -; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vpmaddwd %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vpmaddwd %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 -; CHECK-NEXT: retq +; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaddw_d_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovw %edi, %k1 +; AVX512BW-NEXT: vpmaddwd %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpmaddwd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 %x3) %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 -1) %res2 = add <16 x i32> %res, %res1 @@ -1121,15 +1532,13 @@ define <16 x i32>@test_int_x86_avx512_mask_pmaddw_d_512(<32 x i16> %x0, <32 x i1 declare <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) define <64 x i8>@test_int_x86_avx512_mask_punpckhb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { -; CHECK-LABEL: test_int_x86_avx512_mask_punpckhb_w_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovq %rdi, %k1 -; CHECK-NEXT: vpunpckhbw %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: ## zmm2 = 
zmm2[8],k1[8],zmm2[9],k1[9],zmm2[10],k1[10],zmm2[11],k1[11],zmm2[12],k1[12],zmm2[13],k1[13],zmm2[14],k1[14],zmm2[15],k1[15],zmm2[24],k1[24],zmm2[25],k1[25],zmm2[26],k1[26],zmm2[27],k1[27],zmm2[28],k1[28],zmm2[29],k1[29],zmm2[30],k1[30],zmm2[31],k1[31],zmm2[40],k1[40],zmm2[41],k1[41],zmm2[42],k1[42],zmm2[43],k1[43],zmm2[44],k1[44],zmm2[45],k1[45],zmm2[46],k1[46],zmm2[47],k1[47],zmm2[56],k1[56],zmm2[57],k1[57],zmm2[58],k1[58],zmm2[59],k1[59],zmm2[60],k1[60],zmm2[61],k1[61],zmm2[62],k1[62],zmm2[63],k1[63] -; CHECK-NEXT: vpunpckhbw %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: ## zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] -; CHECK-NEXT: vpaddb %zmm0, %zmm2, %zmm0 -; CHECK-NEXT: retq +; AVX512BW-LABEL: test_int_x86_avx512_mask_punpckhb_w_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovq %rdi, %k1 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm2[8],k1[8],zmm2[9],k1[9],zmm2[10],k1[10],zmm2[11],k1[11],zmm2[12],k1[12],zmm2[13],k1[13],zmm2[14],k1[14],zmm2[15],k1[15],zmm2[24],k1[24],zmm2[25],k1[25],zmm2[26],k1[26],zmm2[27],k1[27],zmm2[28],k1[28],zmm2[29],k1[29],zmm2[30],k1[30],zmm2[31],k1[31],zmm2[40],k1[40],zmm2[41],k1[41],zmm2[42],k1[42],zmm2[43],k1[43],zmm2[44],k1[44],zmm2[45],k1[45],zmm2[46],k1[46],zmm2[47],k1[47],zmm2[56],k1[56],zmm2[57],k1[57],zmm2[58],k1[58],zmm2[59],k1[59],zmm2[60],k1[60],zmm2[61],k1[61],zmm2[62],k1[62],zmm2[63],k1[63] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm0 = 
zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] +; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: retq %res = call <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) %res1 = call <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) %res2 = add <64 x i8> %res, %res1 @@ -1139,15 +1548,13 @@ define <64 x i8>@test_int_x86_avx512_mask_punpckhb_w_512(<64 x i8> %x0, <64 x i8 declare <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) define <64 x i8>@test_int_x86_avx512_mask_punpcklb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { -; CHECK-LABEL: test_int_x86_avx512_mask_punpcklb_w_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovq %rdi, %k1 -; CHECK-NEXT: vpunpcklbw %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: ## zmm2 = zmm2[0],k1[0],zmm2[1],k1[1],zmm2[2],k1[2],zmm2[3],k1[3],zmm2[4],k1[4],zmm2[5],k1[5],zmm2[6],k1[6],zmm2[7],k1[7],zmm2[16],k1[16],zmm2[17],k1[17],zmm2[18],k1[18],zmm2[19],k1[19],zmm2[20],k1[20],zmm2[21],k1[21],zmm2[22],k1[22],zmm2[23],k1[23],zmm2[32],k1[32],zmm2[33],k1[33],zmm2[34],k1[34],zmm2[35],k1[35],zmm2[36],k1[36],zmm2[37],k1[37],zmm2[38],k1[38],zmm2[39],k1[39],zmm2[48],k1[48],zmm2[49],k1[49],zmm2[50],k1[50],zmm2[51],k1[51],zmm2[52],k1[52],zmm2[53],k1[53],zmm2[54],k1[54],zmm2[55],k1[55] -; CHECK-NEXT: vpunpcklbw %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: ## zmm0 = 
zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] -; CHECK-NEXT: vpaddb %zmm0, %zmm2, %zmm0 -; CHECK-NEXT: retq +; AVX512BW-LABEL: test_int_x86_avx512_mask_punpcklb_w_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovq %rdi, %k1 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[1],k1[1],zmm2[2],k1[2],zmm2[3],k1[3],zmm2[4],k1[4],zmm2[5],k1[5],zmm2[6],k1[6],zmm2[7],k1[7],zmm2[16],k1[16],zmm2[17],k1[17],zmm2[18],k1[18],zmm2[19],k1[19],zmm2[20],k1[20],zmm2[21],k1[21],zmm2[22],k1[22],zmm2[23],k1[23],zmm2[32],k1[32],zmm2[33],k1[33],zmm2[34],k1[34],zmm2[35],k1[35],zmm2[36],k1[36],zmm2[37],k1[37],zmm2[38],k1[38],zmm2[39],k1[39],zmm2[48],k1[48],zmm2[49],k1[49],zmm2[50],k1[50],zmm2[51],k1[51],zmm2[52],k1[52],zmm2[53],k1[53],zmm2[54],k1[54],zmm2[55],k1[55] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] +; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: retq %res = call <64 x i8> 
@llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) %res1 = call <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) %res2 = add <64 x i8> %res, %res1 @@ -1157,15 +1564,13 @@ define <64 x i8>@test_int_x86_avx512_mask_punpcklb_w_512(<64 x i8> %x0, <64 x i8 declare <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) define <32 x i16>@test_int_x86_avx512_mask_punpckhw_d_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { -; CHECK-LABEL: test_int_x86_avx512_mask_punpckhw_d_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpunpckhwd %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: ## zmm2 = zmm2[4],k1[4],zmm2[5],k1[5],zmm2[6],k1[6],zmm2[7],k1[7],zmm2[12],k1[12],zmm2[13],k1[13],zmm2[14],k1[14],zmm2[15],k1[15],zmm2[20],k1[20],zmm2[21],k1[21],zmm2[22],k1[22],zmm2[23],k1[23],zmm2[28],k1[28],zmm2[29],k1[29],zmm2[30],k1[30],zmm2[31],k1[31] -; CHECK-NEXT: vpunpckhwd %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: ## zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31] -; CHECK-NEXT: vpaddw %zmm0, %zmm2, %zmm0 -; CHECK-NEXT: retq +; AVX512BW-LABEL: test_int_x86_avx512_mask_punpckhw_d_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpunpckhwd {{.*#+}} zmm2 = zmm2[4],k1[4],zmm2[5],k1[5],zmm2[6],k1[6],zmm2[7],k1[7],zmm2[12],k1[12],zmm2[13],k1[13],zmm2[14],k1[14],zmm2[15],k1[15],zmm2[20],k1[20],zmm2[21],k1[21],zmm2[22],k1[22],zmm2[23],k1[23],zmm2[28],k1[28],zmm2[29],k1[29],zmm2[30],k1[30],zmm2[31],k1[31] +; AVX512BW-NEXT: vpunpckhwd {{.*#+}} zmm0 = 
zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31] +; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) %res2 = add <32 x i16> %res, %res1 @@ -1175,15 +1580,13 @@ define <32 x i16>@test_int_x86_avx512_mask_punpckhw_d_512(<32 x i16> %x0, <32 x declare <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) define <32 x i16>@test_int_x86_avx512_mask_punpcklw_d_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { -; CHECK-LABEL: test_int_x86_avx512_mask_punpcklw_d_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpunpcklwd %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: ## zmm2 = zmm2[0],k1[0],zmm2[1],k1[1],zmm2[2],k1[2],zmm2[3],k1[3],zmm2[8],k1[8],zmm2[9],k1[9],zmm2[10],k1[10],zmm2[11],k1[11],zmm2[16],k1[16],zmm2[17],k1[17],zmm2[18],k1[18],zmm2[19],k1[19],zmm2[24],k1[24],zmm2[25],k1[25],zmm2[26],k1[26],zmm2[27],k1[27] -; CHECK-NEXT: vpunpcklwd %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: ## zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27] -; CHECK-NEXT: vpaddw %zmm0, %zmm2, %zmm0 -; CHECK-NEXT: retq +; AVX512BW-LABEL: test_int_x86_avx512_mask_punpcklw_d_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} zmm2 = 
zmm2[0],k1[0],zmm2[1],k1[1],zmm2[2],k1[2],zmm2[3],k1[3],zmm2[8],k1[8],zmm2[9],k1[9],zmm2[10],k1[10],zmm2[11],k1[11],zmm2[16],k1[16],zmm2[17],k1[17],zmm2[18],k1[18],zmm2[19],k1[19],zmm2[24],k1[24],zmm2[25],k1[25],zmm2[26],k1[26],zmm2[27],k1[27] +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27] +; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) %res2 = add <32 x i16> %res, %res1 @@ -1193,15 +1596,15 @@ define <32 x i16>@test_int_x86_avx512_mask_punpcklw_d_512(<32 x i16> %x0, <32 x declare <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8>, <64 x i8>, i32, <64 x i8>, i64) define <64 x i8>@test_int_x86_avx512_mask_palignr_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x3, i64 %x4) { -; CHECK-LABEL: test_int_x86_avx512_mask_palignr_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovq %rdi, %k1 -; CHECK-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm3 {%k1} {z} -; CHECK-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vpaddb %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vpaddb %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; AVX512BW-LABEL: test_int_x86_avx512_mask_palignr_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovq %rdi, %k1 +; AVX512BW-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm3 {%k1} {z} +; AVX512BW-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb %zmm3, %zmm2, %zmm1 +; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq %res = call <64 x i8> 
@llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> %x3, i64 %x4) %res1 = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> zeroinitializer, i64 %x4) %res2 = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> %x3, i64 -1) @@ -1213,15 +1616,15 @@ define <64 x i8>@test_int_x86_avx512_mask_palignr_512(<64 x i8> %x0, <64 x i8> % declare <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8>, <64 x i8>, i32, <32 x i16>, i32) define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x3, i32 %x4) { -; CHECK-LABEL: test_int_x86_avx512_mask_dbpsadbw_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z} -; CHECK-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vpaddw %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vpaddw %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; AVX512BW-LABEL: test_int_x86_avx512_mask_dbpsadbw_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z} +; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq %res = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> %x3, i32 %x4) %res1 = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> zeroinitializer, i32 %x4) %res2 = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> %x3, i32 -1) @@ -1232,11 +1635,13 @@ define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8> declare <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64>, i32) -; CHECK-LABEL: 
@test_int_x86_avx512_mask_psll_dq_512 -; CHECK-NOT: call -; CHECK: vpslldq -; CHECK: vpslldq define <8 x i64>@test_int_x86_avx512_mask_psll_dq_512(<8 x i64> %x0) { +; AVX512BW-LABEL: test_int_x86_avx512_mask_psll_dq_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpslldq $8, %zmm0, %zmm1 +; AVX512BW-NEXT: vpslldq $4, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64> %x0, i32 8) %res1 = call <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64> %x0, i32 4) %res2 = add <8 x i64> %res, %res1 @@ -1245,11 +1650,13 @@ define <8 x i64>@test_int_x86_avx512_mask_psll_dq_512(<8 x i64> %x0) { declare <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64>, i32) -; CHECK-LABEL: @test_int_x86_avx512_mask_psrl_dq_512 -; CHECK-NOT: call -; CHECK: vpsrldq -; CHECK: vpsrldq define <8 x i64>@test_int_x86_avx512_mask_psrl_dq_512(<8 x i64> %x0) { +; AVX512BW-LABEL: test_int_x86_avx512_mask_psrl_dq_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpsrldq $8, %zmm0, %zmm1 +; AVX512BW-NEXT: vpsrldq $4, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64> %x0, i32 8) %res1 = call <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64> %x0, i32 4) %res2 = add <8 x i64> %res, %res1 @@ -1257,11 +1664,13 @@ define <8 x i64>@test_int_x86_avx512_mask_psrl_dq_512(<8 x i64> %x0) { } declare <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8>, <64 x i8>) -; CHECK-LABEL: @test_int_x86_avx512_mask_psadb_w_512 -; CHECK-NOT: call -; CHECK: vpsadbw %zmm1 -; CHECK: vpsadbw %zmm2 define <8 x i64>@test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2){ +; AVX512BW-LABEL: test_int_x86_avx512_mask_psadb_w_512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vpsadbw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq %res = call <8 x i64> 
@llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x1) %res1 = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x2) %res2 = add <8 x i64> %res, %res1 From b294a7b51a8ce78941701fec5e1fdcf806764bc9 Mon Sep 17 00:00:00 2001 From: Zoran Jovanovic Date: Mon, 30 Nov 2015 12:56:18 +0000 Subject: [PATCH 004/186] [mips][microMIPS] Fix issue with offset operand of BALC and BC instructions Value of offset operand for microMIPS BALC and BC instructions is currently shifted 2 bits, but it should be 1 bit. Differential Revision: http://reviews.llvm.org/D14770 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254296 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../Mips/Disassembler/MipsDisassembler.cpp | 17 +++++++++++++++++ .../Mips/MCTargetDesc/MipsMCCodeEmitter.cpp | 17 +++++++++++++++++ .../Mips/MCTargetDesc/MipsMCCodeEmitter.h | 7 +++++++ lib/Target/Mips/MicroMips32r6InstrInfo.td | 11 +++++++++-- .../Disassembler/Mips/micromips32r6/valid.txt | 4 ++-- test/MC/Mips/micromips32r6/valid.s | 4 ++-- 6 files changed, 54 insertions(+), 6 deletions(-) diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp index f9601839b44c..716a96e2c46b 100644 --- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp +++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp @@ -229,6 +229,13 @@ static DecodeStatus DecodeBranchTargetMM(MCInst &Inst, uint64_t Address, const void *Decoder); +// DecodeBranchTarget26MM - Decode microMIPS branch offset, which is +// shifted left by 1 bit. +static DecodeStatus DecodeBranchTarget26MM(MCInst &Inst, + unsigned Offset, + uint64_t Address, + const void *Decoder); + // DecodeJumpTargetMM - Decode microMIPS jump target, which is // shifted left by 1 bit. 
static DecodeStatus DecodeJumpTargetMM(MCInst &Inst, @@ -1863,6 +1870,16 @@ static DecodeStatus DecodeBranchTargetMM(MCInst &Inst, return MCDisassembler::Success; } +static DecodeStatus DecodeBranchTarget26MM(MCInst &Inst, + unsigned Offset, + uint64_t Address, + const void *Decoder) { + int32_t BranchOffset = SignExtend32<26>(Offset) << 1; + + Inst.addOperand(MCOperand::createImm(BranchOffset)); + return MCDisassembler::Success; +} + static DecodeStatus DecodeJumpTargetMM(MCInst &Inst, unsigned Insn, uint64_t Address, diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp index 86a5d5882184..ed917a4daba3 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp @@ -350,6 +350,23 @@ getBranchTarget26OpValue(const MCInst &MI, unsigned OpNo, return 0; } +/// getBranchTarget26OpValueMM - Return binary encoding of the branch +/// target operand. If the machine operand requires relocation, +/// record the relocation and return zero. +unsigned MipsMCCodeEmitter::getBranchTarget26OpValueMM( + const MCInst &MI, unsigned OpNo, SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + + const MCOperand &MO = MI.getOperand(OpNo); + + // If the destination is an immediate, divide by 2. + if (MO.isImm()) + return MO.getImm() >> 1; + + // TODO: Push 26 PC fixup. + return 0; +} + /// getJumpOffset16OpValue - Return binary encoding of the jump /// target operand. If the machine operand requires relocation, /// record the relocation and return zero. 
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h index c2f4b6a72bbf..eb48914b0649 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h +++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h @@ -137,6 +137,13 @@ class MipsMCCodeEmitter : public MCCodeEmitter { SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; + // getBranchTarget26OpValueMM - Return binary encoding of the branch + // offset operand. If the machine operand requires relocation, + // record the relocation and return zero. + unsigned getBranchTarget26OpValueMM(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + // getJumpOffset16OpValue - Return binary encoding of the jump // offset operand. If the machine operand requires relocation, // record the relocation and return zero. diff --git a/lib/Target/Mips/MicroMips32r6InstrInfo.td b/lib/Target/Mips/MicroMips32r6InstrInfo.td index cabaa53b2b1b..2dbd20cfad99 100644 --- a/lib/Target/Mips/MicroMips32r6InstrInfo.td +++ b/lib/Target/Mips/MicroMips32r6InstrInfo.td @@ -11,6 +11,13 @@ // //===----------------------------------------------------------------------===// +def brtarget26_mm : Operand { + let EncoderMethod = "getBranchTarget26OpValueMM"; + let OperandType = "OPERAND_PCREL"; + let DecoderMethod = "DecodeBranchTarget26MM"; + let ParserMatchClass = MipsJumpTargetAsmOperand; +} + //===----------------------------------------------------------------------===// // // Instruction Encodings @@ -238,11 +245,11 @@ class BC_MMR6_DESC_BASE bit isBarrier = 1; } -class BALC_MMR6_DESC : BC_MMR6_DESC_BASE<"balc", brtarget26> { +class BALC_MMR6_DESC : BC_MMR6_DESC_BASE<"balc", brtarget26_mm> { bit isCall = 1; list Defs = [RA]; } -class BC_MMR6_DESC : BC_MMR6_DESC_BASE<"bc", brtarget26>; +class BC_MMR6_DESC : BC_MMR6_DESC_BASE<"bc", brtarget26_mm>; class BC16_MMR6_DESC : MicroMipsInst16<(outs), (ins brtarget10_mm:$offset), !strconcat("bc16", 
"\t$offset"), [], diff --git a/test/MC/Disassembler/Mips/micromips32r6/valid.txt b/test/MC/Disassembler/Mips/micromips32r6/valid.txt index 619ce3faeda6..82c5d50df92d 100644 --- a/test/MC/Disassembler/Mips/micromips32r6/valid.txt +++ b/test/MC/Disassembler/Mips/micromips32r6/valid.txt @@ -44,8 +44,8 @@ 0xe0 0x40 0x02 0x9a # CHECK: bgtzalc $2, 1332 0xe0 0x42 0x02 0x9a # CHECK: bltzalc $2, 1332 0xc0 0x40 0x02 0x9a # CHECK: blezalc $2, 1332 -0xb4 0x37 0x96 0xb8 # CHECK: balc 14572256 -0x94 0x37 0x96 0xb8 # CHECK: bc 14572256 +0xb4 0x37 0x96 0xb8 # CHECK: balc 7286128 +0x94 0x37 0x96 0xb8 # CHECK: bc 7286128 0x00 0x44 0x0b 0x3c # CHECK: bitswap $4, $2 0x00 0x00 0x00 0x07 # CHECK: break 0x00 0x07 0x00 0x07 # CHECK: break 7 diff --git a/test/MC/Mips/micromips32r6/valid.s b/test/MC/Mips/micromips32r6/valid.s index b2bce8407656..81d4b6c6d456 100644 --- a/test/MC/Mips/micromips32r6/valid.s +++ b/test/MC/Mips/micromips32r6/valid.s @@ -26,9 +26,9 @@ bgtzalc $2, 1332 # CHECK: bgtzalc $2, 1332 # encoding: [0xe0,0x40,0x02,0x9a] bltzalc $2, 1332 # CHECK: bltzalc $2, 1332 # encoding: [0xe0,0x42,0x02,0x9a] blezalc $2, 1332 # CHECK: blezalc $2, 1332 # encoding: [0xc0,0x40,0x02,0x9a] - balc 14572256 # CHECK: balc 14572256 # encoding: [0xb4,0x37,0x96,0xb8] + balc 7286128 # CHECK: balc 7286128 # encoding: [0xb4,0x37,0x96,0xb8] b 132 # CHECK: bc16 132 # encoding: [0xcc,0x42] - bc 14572256 # CHECK: bc 14572256 # encoding: [0x94,0x37,0x96,0xb8] + bc 7286128 # CHECK: bc 7286128 # encoding: [0x94,0x37,0x96,0xb8] bc16 132 # CHECK: bc16 132 # encoding: [0xcc,0x42] beqzc16 $6, 20 # CHECK: beqzc16 $6, 20 # encoding: [0x8f,0x0a] bnezc16 $6, 20 # CHECK: bnezc16 $6, 20 # encoding: [0xaf,0x0a] From 30709900f394ba33d546b6830eadc7307add3219 Mon Sep 17 00:00:00 2001 From: Hrvoje Varga Date: Mon, 30 Nov 2015 12:58:39 +0000 Subject: [PATCH 005/186] [mips][microMIPS] Implement LBUX, LHX, LWX, MAQ_S[A].W.PHL, MAQ_S[A].W.PHR, MFHI, MFLO, MTHI and MTLO instructions Differential Revision: 
http://reviews.llvm.org/D14436 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254297 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/Mips/MicroMipsDSPInstrFormats.td | 25 +++++++++++++ lib/Target/Mips/MicroMipsDSPInstrInfo.td | 36 +++++++++++++++++++ lib/Target/Mips/MipsDSPInstrInfo.td | 25 +++++++------ .../Disassembler/Mips/micromips-dsp/valid.txt | 11 ++++++ test/MC/Mips/micromips-dsp/valid.s | 11 ++++++ 5 files changed, 97 insertions(+), 11 deletions(-) diff --git a/lib/Target/Mips/MicroMipsDSPInstrFormats.td b/lib/Target/Mips/MicroMipsDSPInstrFormats.td index d3f9fe31afb7..f231d3a5294d 100644 --- a/lib/Target/Mips/MicroMipsDSPInstrFormats.td +++ b/lib/Target/Mips/MicroMipsDSPInstrFormats.td @@ -153,3 +153,28 @@ class POOL32A_2RSA5_FMT op> : MMDSPInst { let Inst{15-11} = sa; let Inst{10-0} = op; } + +class POOL32A_1RMEMB0_FMT funct> : MMDSPInst { + bits<5> index; + bits<5> base; + bits<5> rd; + + let Inst{31-26} = 0; + let Inst{25-21} = index; + let Inst{20-16} = base; + let Inst{15-11} = rd; + let Inst{10} = 0b0; + let Inst{9-0} = funct; +} + +class POOL32A_1RAC_FMT funct> : MMDSPInst { + bits<5> rs; + bits<2> ac; + + let Inst{31-26} = 0; + let Inst{25-21} = 0; + let Inst{20-16} = rs; + let Inst{15-14} = ac; + let Inst{13-6} = funct; + let Inst{5-0} = 0b111100; +} diff --git a/lib/Target/Mips/MicroMipsDSPInstrInfo.td b/lib/Target/Mips/MicroMipsDSPInstrInfo.td index f515f380f0db..204a4ec60c5a 100644 --- a/lib/Target/Mips/MicroMipsDSPInstrInfo.td +++ b/lib/Target/Mips/MicroMipsDSPInstrInfo.td @@ -130,6 +130,17 @@ class PRECRQ_QB_PH_MM_ENC : POOL32A_3RB0_FMT<"precrq.qb.ph", 0b0010101101>; class PRECRQU_S_QB_PH_MM_ENC : POOL32A_3RB0_FMT<"precrqu_s.qb.ph", 0b0101101101>; class PRECRQ_RS_PH_W_MM_ENC : POOL32A_3RB0_FMT<"precrq_rs.ph.w", 0b0100101101>; +class LBUX_MM_ENC : POOL32A_1RMEMB0_FMT<"lbux", 0b1000100101>; +class LHX_MM_ENC : POOL32A_1RMEMB0_FMT<"lhx", 0b0101100101>; +class LWX_MM_ENC : POOL32A_1RMEMB0_FMT<"lwx", 0b0110100101>; +class 
MAQ_S_W_PHL_MM_ENC : POOL32A_2RAC_FMT<"maq_s.w.phl", 0b01101001>; +class MAQ_SA_W_PHL_MM_ENC : POOL32A_2RAC_FMT<"maq_sa.w.phl", 0b11101001>; +class MAQ_S_W_PHR_MM_ENC : POOL32A_2RAC_FMT<"maq_s.w.phr", 0b00101001>; +class MAQ_SA_W_PHR_MM_ENC : POOL32A_2RAC_FMT<"maq_sa.w.phr", 0b10101001>; +class MFHI_MM_ENC : POOL32A_1RAC_FMT<"mfhi", 0b00000001>; +class MFLO_MM_ENC : POOL32A_1RAC_FMT<"mflo", 0b01000001>; +class MTHI_MM_ENC : POOL32A_1RAC_FMT<"mthi", 0b10000001>; +class MTLO_MM_ENC : POOL32A_1RAC_FMT<"mthi", 0b11000001>; // Instruction desc. class ABSQ_S_PH_MM_R2_DESC_BASE, Defs<[DSPOutFlag23]>; +class MFHI_MM_DESC_BASE { + dag OutOperandList = (outs GPR32Opnd:$rs); + dag InOperandList = (ins RO:$ac); + string AsmString = !strconcat(instr_asm, "\t$rs, $ac"); + list Pattern = [(set GPR32Opnd:$rs, (OpNode RO:$ac))]; + InstrItinClass Itinerary = itin; +} + +class MFHI_MM_DESC : MFHI_MM_DESC_BASE<"mfhi", ACC64DSPOpnd, MipsMFHI, + NoItinerary>; +class MFLO_MM_DESC : MFHI_MM_DESC_BASE<"mflo", ACC64DSPOpnd, MipsMFLO, + NoItinerary>; + // Instruction defs. 
// microMIPS DSP Rev 1 def ADDQ_PH_MM : DspMMRel, ADDQ_PH_MM_ENC, ADDQ_PH_DESC; @@ -368,6 +393,17 @@ def PRECRQ_PH_W_MM : DspMMRel, PRECRQ_PH_W_MM_ENC, PRECRQ_PH_W_DESC; def PRECRQ_QB_PH_MM : DspMMRel, PRECRQ_QB_PH_MM_ENC, PRECRQ_QB_PH_DESC; def PRECRQU_S_QB_PH_MM : DspMMRel, PRECRQU_S_QB_PH_MM_ENC, PRECRQU_S_QB_PH_DESC; def PRECRQ_RS_PH_W_MM : DspMMRel, PRECRQ_RS_PH_W_MM_ENC, PRECRQ_RS_PH_W_DESC; +def LBUX_MM : DspMMRel, LBUX_MM_ENC, LBUX_DESC; +def LHX_MM : DspMMRel, LHX_MM_ENC, LHX_DESC; +def LWX_MM : DspMMRel, LWX_MM_ENC, LWX_DESC; +def MAQ_S_W_PHL_MM : DspMMRel, MAQ_S_W_PHL_MM_ENC, MAQ_S_W_PHL_DESC; +def MAQ_SA_W_PHL_MM : DspMMRel, MAQ_SA_W_PHL_MM_ENC, MAQ_SA_W_PHL_DESC; +def MAQ_S_W_PHR_MM : DspMMRel, MAQ_S_W_PHR_MM_ENC, MAQ_S_W_PHR_DESC; +def MAQ_SA_W_PHR_MM : DspMMRel, MAQ_SA_W_PHR_MM_ENC, MAQ_SA_W_PHR_DESC; +def MFHI_DSP_MM : DspMMRel, MFHI_MM_ENC, MFHI_MM_DESC; +def MFLO_DSP_MM : DspMMRel, MFLO_MM_ENC, MFLO_MM_DESC; +def MTHI_DSP_MM : DspMMRel, MTHI_MM_ENC, MTHI_DESC; +def MTLO_DSP_MM : DspMMRel, MTLO_MM_ENC, MTLO_DESC; // microMIPS DSP Rev 2 def ABSQ_S_QB_MMR2 : DspMMRel, ABSQ_S_QB_MMR2_ENC, ABSQ_S_QB_MMR2_DESC, ISA_DSPR2; diff --git a/lib/Target/Mips/MipsDSPInstrInfo.td b/lib/Target/Mips/MipsDSPInstrInfo.td index 91513f3f2c63..e8cfbcf572d7 100644 --- a/lib/Target/Mips/MipsDSPInstrInfo.td +++ b/lib/Target/Mips/MipsDSPInstrInfo.td @@ -360,6 +360,7 @@ class LX_DESC_BASE Pattern = [(set GPR32Opnd:$rd, (OpNode iPTR:$base, iPTR:$index))]; InstrItinClass Itinerary = itin; bit mayLoad = 1; + string BaseOpcode = instr_asm; } class ADDUH_QB_DESC_BASE Pattern = [(set GPR32Opnd:$rd, (OpNode RO:$ac))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class MTHI_DESC_BASE { @@ -494,6 +496,7 @@ class MTHI_DESC_BASE dag InOperandList = (ins GPR32Opnd:$rs); string AsmString = !strconcat(instr_asm, "\t$rs, $ac"); InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class BPOSGE32_PSEUDO_DESC_BASE : @@ -1143,14 +1146,14 @@ def 
MULEQ_S_W_PHL : DspMMRel, MULEQ_S_W_PHL_ENC, MULEQ_S_W_PHL_DESC; def MULEQ_S_W_PHR : DspMMRel, MULEQ_S_W_PHR_ENC, MULEQ_S_W_PHR_DESC; def MULQ_RS_PH : DspMMRel, MULQ_RS_PH_ENC, MULQ_RS_PH_DESC; def MULSAQ_S_W_PH : MULSAQ_S_W_PH_ENC, MULSAQ_S_W_PH_DESC; -def MAQ_S_W_PHL : MAQ_S_W_PHL_ENC, MAQ_S_W_PHL_DESC; -def MAQ_S_W_PHR : MAQ_S_W_PHR_ENC, MAQ_S_W_PHR_DESC; -def MAQ_SA_W_PHL : MAQ_SA_W_PHL_ENC, MAQ_SA_W_PHL_DESC; -def MAQ_SA_W_PHR : MAQ_SA_W_PHR_ENC, MAQ_SA_W_PHR_DESC; -def MFHI_DSP : MFHI_ENC, MFHI_DESC; -def MFLO_DSP : MFLO_ENC, MFLO_DESC; -def MTHI_DSP : MTHI_ENC, MTHI_DESC; -def MTLO_DSP : MTLO_ENC, MTLO_DESC; +def MAQ_S_W_PHL : DspMMRel, MAQ_S_W_PHL_ENC, MAQ_S_W_PHL_DESC; +def MAQ_S_W_PHR : DspMMRel, MAQ_S_W_PHR_ENC, MAQ_S_W_PHR_DESC; +def MAQ_SA_W_PHL : DspMMRel, MAQ_SA_W_PHL_ENC, MAQ_SA_W_PHL_DESC; +def MAQ_SA_W_PHR : DspMMRel, MAQ_SA_W_PHR_ENC, MAQ_SA_W_PHR_DESC; +def MFHI_DSP : DspMMRel, MFHI_ENC, MFHI_DESC; +def MFLO_DSP : DspMMRel, MFLO_ENC, MFLO_DESC; +def MTHI_DSP : DspMMRel, MTHI_ENC, MTHI_DESC; +def MTLO_DSP : DspMMRel, MTLO_ENC, MTLO_DESC; def DPAU_H_QBL : DspMMRel, DPAU_H_QBL_ENC, DPAU_H_QBL_DESC; def DPAU_H_QBR : DspMMRel, DPAU_H_QBR_ENC, DPAU_H_QBR_DESC; def DPSU_H_QBL : DspMMRel, DPSU_H_QBL_ENC, DPSU_H_QBL_DESC; @@ -1182,9 +1185,9 @@ def REPLV_QB : REPLV_QB_ENC, REPLV_QB_DESC; def REPLV_PH : REPLV_PH_ENC, REPLV_PH_DESC; def PICK_QB : PICK_QB_ENC, PICK_QB_DESC; def PICK_PH : PICK_PH_ENC, PICK_PH_DESC; -def LWX : LWX_ENC, LWX_DESC; -def LHX : LHX_ENC, LHX_DESC; -def LBUX : LBUX_ENC, LBUX_DESC; +def LWX : DspMMRel, LWX_ENC, LWX_DESC; +def LHX : DspMMRel, LHX_ENC, LHX_DESC; +def LBUX : DspMMRel, LBUX_ENC, LBUX_DESC; def BPOSGE32 : BPOSGE32_ENC, BPOSGE32_DESC; def INSV : DspMMRel, INSV_ENC, INSV_DESC; def EXTP : DspMMRel, EXTP_ENC, EXTP_DESC; diff --git a/test/MC/Disassembler/Mips/micromips-dsp/valid.txt b/test/MC/Disassembler/Mips/micromips-dsp/valid.txt index 433022c1e62c..ceba6a9823b8 100644 --- a/test/MC/Disassembler/Mips/micromips-dsp/valid.txt 
+++ b/test/MC/Disassembler/Mips/micromips-dsp/valid.txt @@ -76,3 +76,14 @@ 0x00 0x62 0x08 0x95 # CHECK: muleu_s.ph.qbl $1, $2, $3 0x00 0x62 0x08 0xd5 # CHECK: muleu_s.ph.qbr $1, $2, $3 0x00,0x62,0x09,0x15 # CHECK: mulq_rs.ph $1, $2, $3 +0x00 0x43 0x0a 0x25 # CHECK: lbux $1, $2($3) +0x00 0x43 0x09 0x65 # CHECK: lhx $1, $2($3) +0x00 0x43 0x09 0xa5 # CHECK: lwx $1, $2($3) +0x00 0x62 0x5a 0x7c # CHECK: maq_s.w.phl $ac1, $2, $3 +0x00 0x62 0x7a 0x7c # CHECK: maq_sa.w.phl $ac1, $2, $3 +0x00 0x62 0x4a 0x7c # CHECK: maq_s.w.phr $ac1, $2, $3 +0x00 0x62 0x6a 0x7c # CHECK: maq_sa.w.phr $ac1, $2, $3 +0x00 0x02 0x40 0x7c # CHECK: mfhi $2, $ac1 +0x00 0x01 0x50 0x7c # CHECK: mflo $1, $ac1 +0x00 0x01 0x60 0x7c # CHECK: mthi $1, $ac1 +0x00 0x01 0x70 0x7c # CHECK: mtlo $1, $ac1 diff --git a/test/MC/Mips/micromips-dsp/valid.s b/test/MC/Mips/micromips-dsp/valid.s index 24e4a07365ab..f2eae8f6cae4 100644 --- a/test/MC/Mips/micromips-dsp/valid.s +++ b/test/MC/Mips/micromips-dsp/valid.s @@ -77,3 +77,14 @@ muleu_s.ph.qbl $1, $2, $3 # CHECK: muleu_s.ph.qbl $1, $2, $3 # encoding: [0x00,0x62,0x08,0x95] muleu_s.ph.qbr $1, $2, $3 # CHECK: muleu_s.ph.qbr $1, $2, $3 # encoding: [0x00,0x62,0x08,0xd5] mulq_rs.ph $1, $2, $3 # CHECK: mulq_rs.ph $1, $2, $3 # encoding: [0x00,0x62,0x09,0x15] + lbux $1, $2($3) # CHECK: lbux $1, $2($3) # encoding: [0x00,0x43,0x0a,0x25] + lhx $1, $2($3) # CHECK: lhx $1, $2($3) # encoding: [0x00,0x43,0x09,0x65] + lwx $1, $2($3) # CHECK: lwx $1, $2($3) # encoding: [0x00,0x43,0x09,0xa5] + maq_s.w.phl $ac1, $2, $3 # CHECK: maq_s.w.phl $ac1, $2, $3 # encoding: [0x00,0x62,0x5a,0x7c] + maq_sa.w.phl $ac1, $2, $3 # CHECK: maq_sa.w.phl $ac1, $2, $3 # encoding: [0x00,0x62,0x7a,0x7c] + maq_s.w.phr $ac1, $2, $3 # CHECK: maq_s.w.phr $ac1, $2, $3 # encoding: [0x00,0x62,0x4a,0x7c] + maq_sa.w.phr $ac1, $2, $3 # CHECK: maq_sa.w.phr $ac1, $2, $3 # encoding: [0x00,0x62,0x6a,0x7c] + mfhi $2, $ac1 # CHECK: mfhi $2, $ac1 # encoding: [0x00,0x02,0x40,0x7c] + mflo $1, $ac1 # CHECK: mflo $1, $ac1 # 
encoding: [0x00,0x01,0x50,0x7c] + mthi $1, $ac1 # CHECK: mthi $1, $ac1 # encoding: [0x00,0x01,0x60,0x7c] + mtlo $1, $ac1 # CHECK: mtlo $1, $ac1 # encoding: [0x00,0x01,0x70,0x7c] From 9ec9305f0679d052d7311691291e184cde92353c Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Mon, 30 Nov 2015 14:52:33 +0000 Subject: [PATCH 006/186] Silencing a 32-bit to 64-bit implicit conversion warning; NFC. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254302 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 0174779223ca..9d8895fa7bd1 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -6431,7 +6431,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SmallVector V(NumElems); if (NumElems == 4 && NumZero > 0) { for (unsigned i = 0; i < 4; ++i) { - bool isZero = !(NonZeros & (1 << i)); + bool isZero = !(NonZeros & (1ULL << i)); if (isZero) V[i] = getZeroVector(VT, Subtarget, DAG, dl); else From 9d75bc99041c25f82d0cd6cbce0a43913b540c6d Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 30 Nov 2015 15:46:47 +0000 Subject: [PATCH 007/186] AMDGPU: Don't reserve SCRATCH_PTR input register This hasn't been doing anything since using relocations was added. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254304 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/SIISelLowering.cpp | 16 ++++------------ .../si-instr-info-correct-implicit-operands.ll | 2 +- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 5c67bf80c175..e2c644451b43 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -633,21 +633,13 @@ SDValue SITargetLowering::LowerFormalArguments( unsigned InputPtrRegHi = TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1); - unsigned ScratchPtrReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); - unsigned ScratchPtrRegLo = - TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 0); - unsigned ScratchPtrRegHi = - TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 1); - CCInfo.AllocateReg(InputPtrRegLo); CCInfo.AllocateReg(InputPtrRegHi); - CCInfo.AllocateReg(ScratchPtrRegLo); - CCInfo.AllocateReg(ScratchPtrRegHi); MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); - MF.addLiveIn(ScratchPtrReg, &AMDGPU::SReg_64RegClass); - SIMachineFunctionInfo *MFI = MF.getInfo(); - if (Subtarget->isAmdHsaOS() && MFI->hasDispatchPtr()) { + + const SIMachineFunctionInfo *MFI = MF.getInfo(); + + if (MFI->hasDispatchPtr()) { unsigned DispatchPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR); unsigned DispatchPtrRegLo = diff --git a/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll b/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll index 0e15bc878650..27a8e70aae13 100644 --- a/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll +++ b/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll @@ -3,7 +3,7 @@ ; register operands in the correct order when modifying the opcode of an ; instruction to V_ADD_I32_e32. 
-; CHECK: %19 = V_ADD_I32_e32 %13, %12, implicit-def %vcc, implicit %exec +; CHECK: %{{[0-9]+}} = V_ADD_I32_e32 %{{[0-9]+}}, %{{[0-9]+}}, implicit-def %vcc, implicit %exec define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { entry: From d27440bc805cb90ef7f0fe970ee08609a29e4a04 Mon Sep 17 00:00:00 2001 From: Colin LeMahieu Date: Mon, 30 Nov 2015 17:32:34 +0000 Subject: [PATCH 008/186] [Hexagon] NFC Reordering headers. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254307 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp index 2f3521bfd717..b73af8249cb5 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp @@ -13,12 +13,12 @@ #include "MCTargetDesc/HexagonBaseInfo.h" #include "MCTargetDesc/HexagonMCInstrInfo.h" #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCAsmLayout.h" #include "llvm/Support/Debug.h" #include "llvm/Support/TargetRegistry.h" From 351aaa5d3e5d537b9bfa5ca8d6b82250ca85c3b5 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Mon, 30 Nov 2015 17:52:02 +0000 Subject: [PATCH 009/186] fix formatting; NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254310 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 9d8895fa7bd1..4ec4ec280675 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -25594,8 +25594,8 @@ 
static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; - for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) - ShuffleVec[i] = NumElems*SizeRatio; + for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i) + ShuffleVec[i] = NumElems * SizeRatio; NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, DAG.getConstant(0, dl, WideVecVT), &ShuffleVec[0]); @@ -25676,8 +25676,8 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, "WideVecVT should be legal"); SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec, - DAG.getUNDEF(WideVecVT), - &ShuffleVec[0]); + DAG.getUNDEF(WideVecVT), + &ShuffleVec[0]); SDValue NewMask; SDValue Mask = Mst->getMask(); @@ -25709,8 +25709,9 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); } - return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(), - NewMask, StVT, Mst->getMemOperand(), false); + return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, + Mst->getBasePtr(), NewMask, StVT, + Mst->getMemOperand(), false); } /// PerformSTORECombine - Do target-specific dag combines on STORE nodes. static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, From 8dc6e6c3bb8901096bf31a5aaca5223b5bbf2a4f Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Mon, 30 Nov 2015 18:42:08 +0000 Subject: [PATCH 010/186] [WebAssembly] Fix a few minor compiler warnings. NFC. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254311 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp index 728910422166..dbd00bc10b1c 100644 --- a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp +++ b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp @@ -162,9 +162,9 @@ static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI) { SmallPtrSet Visited; SmallVector Stack; - MachineBasicBlock *Entry = &*MF.begin(); - Visited.insert(Entry); - Stack.push_back(POStackEntry(Entry, MF, MLI)); + MachineBasicBlock *EntryBlock = &*MF.begin(); + Visited.insert(EntryBlock); + Stack.push_back(POStackEntry(EntryBlock, MF, MLI)); for (;;) { POStackEntry &Entry = Stack.back(); @@ -220,7 +220,7 @@ static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI) { #endif } -static int GetLoopDepth(const MachineLoop *Loop) { +static unsigned GetLoopDepth(const MachineLoop *Loop) { return Loop ? Loop->getLoopDepth() : 0; } @@ -249,12 +249,12 @@ static void PlaceBlockMarkers(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPos; MachineLoop *HeaderLoop = MLI.getLoopFor(Header); - int MBBLoopDepth = GetLoopDepth(MLI.getLoopFor(&MBB)); - int HeaderLoopDepth = GetLoopDepth(HeaderLoop); + unsigned MBBLoopDepth = GetLoopDepth(MLI.getLoopFor(&MBB)); + unsigned HeaderLoopDepth = GetLoopDepth(HeaderLoop); if (HeaderLoopDepth > MBBLoopDepth) { // The nearest common dominating point is more deeply nested. Insert the // BLOCK just above the LOOP. 
- for (int i = 0; i < HeaderLoopDepth - 1 - MBBLoopDepth; ++i) + for (unsigned i = 0; i < HeaderLoopDepth - 1 - MBBLoopDepth; ++i) HeaderLoop = HeaderLoop->getParentLoop(); Header = HeaderLoop->getHeader(); InsertPos = Header->begin(); From e87f0932d94a1bb29c7d4792adf8742ca165c4ef Mon Sep 17 00:00:00 2001 From: Rafael Espindola Date: Mon, 30 Nov 2015 18:54:24 +0000 Subject: [PATCH 011/186] Fix another llvm.ctors merging bug. We were not looking past casts to see if an element should be included or not. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254313 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Linker/LinkModules.cpp | 5 +++-- test/Linker/Inputs/ctors3.ll | 7 +++++++ test/Linker/ctors3.ll | 8 ++++++++ 3 files changed, 18 insertions(+), 2 deletions(-) create mode 100644 test/Linker/Inputs/ctors3.ll create mode 100644 test/Linker/ctors3.ll diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp index 6b60379803e1..cdf1decc8131 100644 --- a/lib/Linker/LinkModules.cpp +++ b/lib/Linker/LinkModules.cpp @@ -410,7 +410,7 @@ class ModuleLinker { std::vector AppendingVars; // Set of items not to link in from source. 
- SmallPtrSet DoNotLinkFromSource; + SmallPtrSet DoNotLinkFromSource; DiagnosticHandlerFunction DiagnosticHandler; @@ -1512,7 +1512,8 @@ void ModuleLinker::linkAppendingVarInit(AppendingVarInfo &AVI) { for (auto *V : SrcElements) { if (IsNewStructor) { - Constant *Key = V->getAggregateElement(2); + auto *Key = + dyn_cast(V->getAggregateElement(2)->stripPointerCasts()); if (DoNotLinkFromSource.count(Key)) continue; } diff --git a/test/Linker/Inputs/ctors3.ll b/test/Linker/Inputs/ctors3.ll new file mode 100644 index 000000000000..449ccbd90faf --- /dev/null +++ b/test/Linker/Inputs/ctors3.ll @@ -0,0 +1,7 @@ +$foo = comdat any +%t = type { i8 } +@foo = global %t zeroinitializer, comdat +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @bar, i8* getelementptr (%t, %t* @foo, i32 0, i32 0) }] +define internal void @bar() comdat($foo) { + ret void +} diff --git a/test/Linker/ctors3.ll b/test/Linker/ctors3.ll new file mode 100644 index 000000000000..e62b92dca0b4 --- /dev/null +++ b/test/Linker/ctors3.ll @@ -0,0 +1,8 @@ +; RUN: llvm-link -S %s %p/Inputs/ctors3.ll -o - | FileCheck %s + +$foo = comdat any +%t = type { i8 } +@foo = global %t zeroinitializer, comdat + +; CHECK: @foo = global %t zeroinitializer, comdat +; CHECK: @llvm.global_ctors = appending global [0 x { i32, void ()*, i8* }] zeroinitializer From 56faeb049c0b274ed12267ef74aed60d290be60f Mon Sep 17 00:00:00 2001 From: Kit Barton Date: Mon, 30 Nov 2015 18:59:41 +0000 Subject: [PATCH 012/186] Enable shrink wrapping for PPC64 Re-enable shrink wrapping for PPC64 Little Endian. One minor modification to PPCFrameLowering::findScratchRegister was necessary to handle fall-thru blocks (blocks with no terminator) correctly. Tested with all LLVM test, clang tests, and the self-hosting build, with no problems found. 
PHabricator: http://reviews.llvm.org/D14778 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254314 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/PowerPC/PPCFrameLowering.cpp | 20 ++++++++++++++------ test/CodeGen/PowerPC/ppc-shrink-wrapping.ll | 1 - 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index c7a3bbd3762a..5a151eb6ab25 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -573,10 +573,18 @@ bool PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB, RS.initRegState(); RS.enterBasicBlock(MBB); - // The scratch register will be used at the end of the block, so must consider - // all registers used within the block - if (UseAtEnd && MBB->begin() != MBB->getFirstTerminator()) - RS.forward(MBB->getFirstTerminator()); + if (UseAtEnd && !MBB->empty()) { + // The scratch register will be used at the end of the block, so must consider + // all registers used within the block + + MachineBasicBlock::iterator MBBI = MBB->getFirstTerminator(); + // If no terminator, back iterator up to previous instruction. + if (MBBI == MBB->end()) + MBBI = std::prev(MBBI); + + if (MBBI != MBB->begin()) + RS.forward(MBBI); + } if (!RS.isRegUsed(R0)) return true; @@ -1768,6 +1776,6 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, } bool PPCFrameLowering::enableShrinkWrapping(const MachineFunction &MF) const { - // FIXME: Enable this for non-Darwin PPC64 once it is confirmed working. 
- return false; + return (MF.getSubtarget().isSVR4ABI() && + MF.getSubtarget().isPPC64()); } diff --git a/test/CodeGen/PowerPC/ppc-shrink-wrapping.ll b/test/CodeGen/PowerPC/ppc-shrink-wrapping.ll index a23888425bf5..2da8e8d0984c 100644 --- a/test/CodeGen/PowerPC/ppc-shrink-wrapping.ll +++ b/test/CodeGen/PowerPC/ppc-shrink-wrapping.ll @@ -1,6 +1,5 @@ ; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE ; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu %s -o - -enable-shrink-wrap=false | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE -; XFAIL: * ; ; Note: Lots of tests use inline asm instead of regular calls. ; This allows to have a better control on what the allocation will do. From fbb7433e076fcaa297905347a6c516935d2b90bc Mon Sep 17 00:00:00 2001 From: David Majnemer Date: Mon, 30 Nov 2015 19:04:19 +0000 Subject: [PATCH 013/186] [X86] Add RIP to GR64_TCW64 The MachineVerifier wants to check that the register operands of an instruction belong to the instruction's register class. RIP-relative control flow instructions violated this by referencing RIP. While this was fixed for SysV, it was never fixed for Win64. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254315 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86RegisterInfo.td | 2 +- test/CodeGen/X86/coalescer-win64.ll | 16 ++++++++++++++++ test/CodeGen/X86/x86-shrink-wrapping.ll | 2 +- 3 files changed, 18 insertions(+), 2 deletions(-) create mode 100644 test/CodeGen/X86/coalescer-win64.ll diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td index 12f38c7946a8..ceeb57d0cc4c 100644 --- a/lib/Target/X86/X86RegisterInfo.td +++ b/lib/Target/X86/X86RegisterInfo.td @@ -375,7 +375,7 @@ def GR32_TC : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)>; def GR64_TC : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI, R8, R9, R11, RIP)>; def GR64_TCW64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, - R8, R9, R10, R11)>; + R8, R9, R10, R11, RIP)>; // GR8_NOREX - GR8 registers which do not require a REX prefix. def GR8_NOREX : RegisterClass<"X86", [i8], 8, diff --git a/test/CodeGen/X86/coalescer-win64.ll b/test/CodeGen/X86/coalescer-win64.ll new file mode 100644 index 000000000000..ff084ae5b9e0 --- /dev/null +++ b/test/CodeGen/X86/coalescer-win64.ll @@ -0,0 +1,16 @@ +; RUN: llc < %s -verify-coalescing | FileCheck %s +target triple = "x86_64-pc-win32" + +@fnptr = external global void ()* + +define void @test1() { +entry: + %p = load void ()*, void ()** @fnptr + tail call void %p() + ret void +} + +; CHECK-LABEL: test1{{$}} +; CHECK: .seh_proc test1{{$}} +; CHECK: rex64 jmpq *fnptr(%rip) +; CHECK: .seh_endproc diff --git a/test/CodeGen/X86/x86-shrink-wrapping.ll b/test/CodeGen/X86/x86-shrink-wrapping.ll index 52e094b54174..0cab17f9de89 100644 --- a/test/CodeGen/X86/x86-shrink-wrapping.ll +++ b/test/CodeGen/X86/x86-shrink-wrapping.ll @@ -445,9 +445,9 @@ if.end: ; preds = %for.body, %if.else ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: %esi, %edi ; CHECK-NEXT: %esi, %edx +; CHECK-NEXT: %esi, %ecx ; CHECK-NEXT: %esi, %r8d ; CHECK-NEXT: %esi, %r9d -; CHECK-NEXT: %esi, 
%ecx ; CHECK-NEXT: callq _someVariadicFunc ; CHECK-NEXT: movl %eax, %esi ; CHECK-NEXT: shll $3, %esi From 24c91af18b7b2da200d91cf5ae6a27f3be47cdc0 Mon Sep 17 00:00:00 2001 From: Davide Italiano Date: Mon, 30 Nov 2015 19:36:35 +0000 Subject: [PATCH 014/186] [SimplifyLibCalls] Transform log(exp2(y)) to y*log(2) under fast-math. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254317 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Utils/SimplifyLibCalls.cpp | 10 +++++++++- test/Transforms/InstCombine/log-pow-nofastmath.ll | 13 +++++++++++++ test/Transforms/InstCombine/log-pow.ll | 13 +++++++++++++ 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index 47e587fab7b6..83afb1a65ac0 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -1322,6 +1322,15 @@ Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) { return B.CreateFMul(OpC->getArgOperand(1), EmitUnaryFloatFnCall(OpC->getOperand(0), Callee->getName(), B, Callee->getAttributes()), "mul"); + + // log(exp2(y)) -> y*log(2) + if (F && Name == "log" && TLI->getLibFunc(F->getName(), Func) && + TLI->has(Func) && Func == LibFunc::exp2) + return B.CreateFMul( + OpC->getArgOperand(0), + EmitUnaryFloatFnCall(ConstantFP::get(CI->getType(), 2.0), + Callee->getName(), B, Callee->getAttributes()), + "logmul"); return Ret; } @@ -2301,7 +2310,6 @@ void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) { // log, logf, logl: // * log(exp(x)) -> x // * log(exp(y)) -> y*log(e) -// * log(exp2(y)) -> y*log(2) // * log(exp10(y)) -> y*log(10) // * log(sqrt(x)) -> 0.5*log(x) // diff --git a/test/Transforms/InstCombine/log-pow-nofastmath.ll b/test/Transforms/InstCombine/log-pow-nofastmath.ll index 0811e63cc74a..6c37c5466cea 100644 --- a/test/Transforms/InstCombine/log-pow-nofastmath.ll +++ b/test/Transforms/InstCombine/log-pow-nofastmath.ll @@ 
-13,5 +13,18 @@ entry: ; CHECK: ret double %call ; CHECK: } +define double @test3(double %x) #0 { + %call2 = call double @exp2(double %x) #0 + %call3 = call double @log(double %call2) #0 + ret double %call3 +} + +; CHECK-LABEL: @test3 +; CHECK: %call2 = call double @exp2(double %x) +; CHECK: %call3 = call double @log(double %call2) +; CHECK: ret double %call3 +; CHECK: } + declare double @log(double) #0 +declare double @exp2(double) declare double @llvm.pow.f64(double, double) diff --git a/test/Transforms/InstCombine/log-pow.ll b/test/Transforms/InstCombine/log-pow.ll index c98a1a5bc628..1acd0354431e 100644 --- a/test/Transforms/InstCombine/log-pow.ll +++ b/test/Transforms/InstCombine/log-pow.ll @@ -22,7 +22,20 @@ define double @test2(double ()* %fptr, double %p1) #0 { ; CHECK-LABEL: @test2 ; CHECK: log +define double @test3(double %x) #0 { + %call2 = call double @exp2(double %x) #0 + %call3 = call double @log(double %call2) #0 + ret double %call3 +} + +; CHECK-LABEL: @test3 +; CHECK: %call2 = call double @exp2(double %x) #0 +; CHECK: %logmul = fmul fast double %x, 0x3FE62E42FEFA39EF +; CHECK: ret double %logmul +; CHECK: } + declare double @log(double) #0 +declare double @exp2(double) #0 declare double @llvm.pow.f64(double, double) attributes #0 = { "unsafe-fp-math"="true" } From 76755680f6325f0cad7502a2a832a3376b1e3747 Mon Sep 17 00:00:00 2001 From: Davide Italiano Date: Mon, 30 Nov 2015 19:38:35 +0000 Subject: [PATCH 015/186] [SimplifyLibCalls] Remove useless bits of this tests. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254318 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/Transforms/InstCombine/log-pow-nofastmath.ll | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/Transforms/InstCombine/log-pow-nofastmath.ll b/test/Transforms/InstCombine/log-pow-nofastmath.ll index 6c37c5466cea..faaef97311ec 100644 --- a/test/Transforms/InstCombine/log-pow-nofastmath.ll +++ b/test/Transforms/InstCombine/log-pow-nofastmath.ll @@ -1,9 +1,9 @@ ; RUN: opt < %s -instcombine -S | FileCheck %s -define double @mylog(double %x, double %y) #0 { +define double @mylog(double %x, double %y) { entry: %pow = call double @llvm.pow.f64(double %x, double %y) - %call = call double @log(double %pow) #0 + %call = call double @log(double %pow) ret double %call } @@ -13,9 +13,9 @@ entry: ; CHECK: ret double %call ; CHECK: } -define double @test3(double %x) #0 { - %call2 = call double @exp2(double %x) #0 - %call3 = call double @log(double %call2) #0 +define double @test3(double %x) { + %call2 = call double @exp2(double %x) + %call3 = call double @log(double %call2) ret double %call3 } @@ -25,6 +25,6 @@ define double @test3(double %x) #0 { ; CHECK: ret double %call3 ; CHECK: } -declare double @log(double) #0 +declare double @log(double) declare double @exp2(double) declare double @llvm.pow.f64(double, double) From 3be8e6a768d24304a43bc915d974c7551ec86248 Mon Sep 17 00:00:00 2001 From: Reid Kleckner Date: Mon, 30 Nov 2015 20:36:23 +0000 Subject: [PATCH 016/186] Avoid writing to source directory of tests git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254324 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/Linker/ctors.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Linker/ctors.ll b/test/Linker/ctors.ll index 208fe86402ce..37dba23d4c91 100644 --- a/test/Linker/ctors.ll +++ b/test/Linker/ctors.ll @@ -4,7 +4,7 @@ ; RUN: FileCheck --check-prefix=ALL --check-prefix=CHECK2 %s ; Test the bitcode writer too. 
It used to crash. -; RUN: llvm-link %s %p/Inputs/ctors.ll -o t.bc +; RUN: llvm-link %s %p/Inputs/ctors.ll -o %t.bc @v = weak global i8 0 ; CHECK1: @v = weak global i8 0 From ddaf09c1921d4306b865fae11bf9cfdca6b62731 Mon Sep 17 00:00:00 2001 From: Quentin Colombet Date: Mon, 30 Nov 2015 20:37:58 +0000 Subject: [PATCH 017/186] [ARM] For old thumb ISA like v4t, we cannot use PC directly in pop. Fix the epilogue emission to account for that. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254325 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/Thumb1FrameLowering.cpp | 23 ++++-------------- test/CodeGen/ARM/thumb1_return_sequence.ll | 27 ++++++++++++++++++---- 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp index f5d4cb8a3ca1..064cff6f5704 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -422,25 +422,12 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, const ThumbRegisterInfo *RegInfo = static_cast(STI.getRegisterInfo()); - // If MBBI is a return instruction, we may be able to directly restore - // LR in the PC. - // This is possible if we do not need to emit any SP update. - // Otherwise, we need a temporary register to pop the value - // and copy that value into LR. + // When we need a special fix up for POP, this means that + // we either cannot use PC in POP or we have to update + // SP after popping the return address. + // In other words, we cannot use a pop {pc} like construction + // here, no matter what.
auto MBBI = MBB.getFirstTerminator(); - if (!ArgRegsSaveSize && MBBI != MBB.end() && - MBBI->getOpcode() == ARM::tBX_RET) { - if (!DoIt) - return true; - MachineInstrBuilder MIB = - AddDefaultPred( - BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP_RET))) - .addReg(ARM::PC, RegState::Define); - MIB.copyImplicitOps(&*MBBI); - // erase the old tBX_RET instruction - MBB.erase(MBBI); - return true; - } // Look for a temporary register to use. // First, compute the liveness information. diff --git a/test/CodeGen/ARM/thumb1_return_sequence.ll b/test/CodeGen/ARM/thumb1_return_sequence.ll index 5b9c19ab5eb2..67d1cad2cf68 100644 --- a/test/CodeGen/ARM/thumb1_return_sequence.ll +++ b/test/CodeGen/ARM/thumb1_return_sequence.ll @@ -23,9 +23,22 @@ entry: ; -------- ; CHECK-V4T: add sp, ; CHECK-V4T-NEXT: pop {[[SAVED]]} -; We do not have any SP update to insert so we can just optimize -; the pop sequence. -; CHECK-V4T-NEXT: pop {pc} +; The ISA for v4 does not support pop pc, so make sure we do not emit +; one even when we do not need to update SP. +; CHECK-V4T-NOT: pop {pc} +; We may only use lo register to pop, but in that case, all the scratch +; ones are used. +; r12 is the only register we are allowed to clobber for AAPCS. +; Use it to save a lo register. +; CHECK-V4T-NEXT: mov [[TEMP_REG:r12]], [[POP_REG:r[0-7]]] +; Pop the value of LR. +; CHECK-V4T-NEXT: pop {[[POP_REG]]} +; Copy the value of LR in the right register. +; CHECK-V4T-NEXT: mov lr, [[POP_REG]] +; Restore the value that was in the register we used to pop the value of LR. +; CHECK-V4T-NEXT: mov [[POP_REG]], [[TEMP_REG]] +; Return. +; CHECK-V4T-NEXT: bx lr ; CHECK-V5T: pop {[[SAVED]], pc} } @@ -93,7 +106,13 @@ entry: ; Epilogue ; -------- ; CHECK-V4T: pop {[[SAVED]]} -; CHECK-V4T: pop {pc} +; The ISA for v4 does not support pop pc, so make sure we do not emit +; one even when we do not need to update SP. 
+; CHECK-V4T-NOT: pop {pc} +; Pop the value of LR into a scratch lo register other than r0 (it is +; used for the return value). +; CHECK-V4T-NEXT: pop {[[POP_REG:r[1-3]]]} +; CHECK-V4T-NEXT: bx [[POP_REG]] ; CHECK-V5T: pop {[[SAVED]], pc} } From 32851c191d611f67e22dea11f0c1a0adee584806 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 30 Nov 2015 21:15:45 +0000 Subject: [PATCH 018/186] AMDGPU: Use assert zext for workgroup sizes git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254328 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/SIISelLowering.cpp | 31 +++++++---- lib/Target/AMDGPU/SIISelLowering.h | 3 ++ test/CodeGen/AMDGPU/work-item-intrinsics.ll | 60 +++++++++++++++++++++ 3 files changed, 84 insertions(+), 10 deletions(-) diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index e2c644451b43..94fad32c0070 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1043,6 +1043,18 @@ SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, // a glue result. } +SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, + SDValue Op, + MVT VT, + unsigned Offset) const { + SDLoc SL(Op); + SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL, + DAG.getEntryNode(), Offset, false); + // The local size values will have the hi 16-bits as zero. 
+ return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param, + DAG.getValueType(VT)); +} + SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -1080,19 +1092,18 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); case Intrinsic::r600_read_local_size_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_X, false); + return lowerImplicitZextParam(DAG, Op, MVT::i16, + SI::KernelInputOffsets::LOCAL_SIZE_X); case Intrinsic::r600_read_local_size_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_Y, false); + return lowerImplicitZextParam(DAG, Op, MVT::i16, + SI::KernelInputOffsets::LOCAL_SIZE_Y); case Intrinsic::r600_read_local_size_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_Z, false); - + return lowerImplicitZextParam(DAG, Op, MVT::i16, + SI::KernelInputOffsets::LOCAL_SIZE_Z); case Intrinsic::AMDGPU_read_workdim: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - getImplicitParameterOffset(MFI, GRID_DIM), false); - + // Really only 2 bits. 
+ return lowerImplicitZextParam(DAG, Op, MVT::i8, + getImplicitParameterOffset(MFI, GRID_DIM)); case Intrinsic::r600_read_tgid_x: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT); diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index a8b8ad34ed9d..a358e3fc3c0d 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -28,6 +28,9 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const override; + SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, + MVT VT, unsigned Offset) const; + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; diff --git a/test/CodeGen/AMDGPU/work-item-intrinsics.ll b/test/CodeGen/AMDGPU/work-item-intrinsics.ll index 4328e964c1bf..ebe73429da9c 100644 --- a/test/CodeGen/AMDGPU/work-item-intrinsics.ll +++ b/test/CodeGen/AMDGPU/work-item-intrinsics.ll @@ -213,6 +213,66 @@ entry: ret void } +; FUNC-LABEL: {{^}}local_size_x_known_bits: +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 +; GCN-NOT: 0xffff +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NEXT: buffer_store_dword [[VVAL]] +define void @local_size_x_known_bits(i32 addrspace(1)* %out) { +entry: + %size = call i32 @llvm.r600.read.local.size.x() #0 + %shl = shl i32 %size, 16 + %shr = lshr i32 %shl, 16 + store i32 %shr, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_y_known_bits: +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c +; GCN-NOT: 0xffff +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NEXT: buffer_store_dword [[VVAL]] +define void @local_size_y_known_bits(i32 
addrspace(1)* %out) { +entry: + %size = call i32 @llvm.r600.read.local.size.y() #0 + %shl = shl i32 %size, 16 + %shr = lshr i32 %shl, 16 + store i32 %shr, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_z_known_bits: +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 +; GCN-NOT: 0xffff +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NEXT: buffer_store_dword [[VVAL]] +define void @local_size_z_known_bits(i32 addrspace(1)* %out) { +entry: + %size = call i32 @llvm.r600.read.local.size.z() #0 + %shl = shl i32 %size, 16 + %shr = lshr i32 %shl, 16 + store i32 %shr, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}get_work_dim_known_bits: +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c +; GCN-NOT: 0xff +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @get_work_dim_known_bits(i32 addrspace(1)* %out) { +entry: + %dim = call i32 @llvm.AMDGPU.read.workdim() #0 + %shl = shl i32 %dim, 24 + %shr = lshr i32 %shl, 24 + store i32 %shr, i32 addrspace(1)* %out + ret void +} + declare i32 @llvm.r600.read.ngroups.x() #0 declare i32 @llvm.r600.read.ngroups.y() #0 declare i32 @llvm.r600.read.ngroups.z() #0 From 956f59ab56fcbaf4256b6dc92dc6d8a9ac592365 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 30 Nov 2015 21:15:53 +0000 Subject: [PATCH 019/186] AMDGPU: Remove SIPrepareScratchRegs It does not work because of emergency stack slots. This pass was supposed to eliminate dummy registers for the spill instructions, but the register scavenger can introduce more during PrologEpilogInserter, so some would end up left behind if they were needed. The potential for spilling the scratch resource descriptor and offset register makes doing something like this overly complicated. Reserve registers to use for the resource descriptor and use them directly in eliminateFrameIndex. 
Also removes creating another scratch resource descriptor when directly selecting scratch MUBUF instructions. The choice of which registers are reserved is temporary. For now it attempts to pick the next available registers after the user and system SGPRs. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254329 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/AMDGPU.h | 1 - lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 32 +- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 1 - lib/Target/AMDGPU/CMakeLists.txt | 1 - lib/Target/AMDGPU/SIFrameLowering.cpp | 72 +++ lib/Target/AMDGPU/SIFrameLowering.h | 3 + lib/Target/AMDGPU/SIISelLowering.cpp | 17 +- lib/Target/AMDGPU/SIISelLowering.h | 4 - lib/Target/AMDGPU/SIInstrInfo.cpp | 18 +- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 8 + lib/Target/AMDGPU/SIPrepareScratchRegs.cpp | 196 ------ lib/Target/AMDGPU/SIRegisterInfo.cpp | 19 + test/CodeGen/AMDGPU/kernel-args.ll | 14 +- test/CodeGen/AMDGPU/large-alloca-compute.ll | 42 ++ test/CodeGen/AMDGPU/large-alloca-graphics.ll | 47 ++ test/CodeGen/AMDGPU/large-alloca.ll | 18 - test/CodeGen/AMDGPU/si-sgpr-spill.ll | 10 + ...vgpr-spill-emergency-stack-slot-compute.ll | 583 ++++++++++++++++++ .../AMDGPU/vgpr-spill-emergency-stack-slot.ll | 22 +- 19 files changed, 827 insertions(+), 281 deletions(-) delete mode 100644 lib/Target/AMDGPU/SIPrepareScratchRegs.cpp create mode 100644 test/CodeGen/AMDGPU/large-alloca-compute.ll create mode 100644 test/CodeGen/AMDGPU/large-alloca-graphics.ll delete mode 100644 test/CodeGen/AMDGPU/large-alloca.ll create mode 100644 test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 80766086e15c..a620e85101e6 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -48,7 +48,6 @@ FunctionPass *createSIFixSGPRCopiesPass(); FunctionPass *createSIFixSGPRLiveRangesPass(); FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); FunctionPass 
*createSIInsertWaits(TargetMachine &tm); -FunctionPass *createSIPrepareScratchRegs(); ModulePass *createAMDGPUAnnotateKernelFeaturesPass(); void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 04a0c1d06aff..85a06882ffe3 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1064,34 +1064,12 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, MachineFunction &MF = CurDAG->getMachineFunction(); const SIRegisterInfo *TRI = static_cast(Subtarget->getRegisterInfo()); - MachineRegisterInfo &MRI = MF.getRegInfo(); - const SITargetLowering& Lowering = - *static_cast(getTargetLowering()); - - unsigned ScratchOffsetReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); - Lowering.CreateLiveInRegister(*CurDAG, &AMDGPU::SReg_32RegClass, - ScratchOffsetReg, MVT::i32); - SDValue Sym0 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD0", MVT::i32); - SDValue ScratchRsrcDword0 = - SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym0), 0); - - SDValue Sym1 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD1", MVT::i32); - SDValue ScratchRsrcDword1 = - SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym1), 0); + const SIMachineFunctionInfo *Info = MF.getInfo(); - const SDValue RsrcOps[] = { - CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), - ScratchRsrcDword0, - CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), - ScratchRsrcDword1, - CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32), - }; - SDValue ScratchPtr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, - MVT::v2i32, RsrcOps), 0); - Rsrc = SDValue(Lowering.buildScratchRSRC(*CurDAG, DL, ScratchPtr), 0); - SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, - MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32); + unsigned ScratchOffsetReg + = 
TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); + Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); + SOffset = CurDAG->getRegister(ScratchOffsetReg, MVT::i32); // (add n0, c1) if (CurDAG->isBaseWithConstantOffset(Addr)) { diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 4e31c7ab4d4c..7b0445db4df2 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -327,7 +327,6 @@ void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { } void GCNPassConfig::addPostRegAlloc() { - addPass(createSIPrepareScratchRegs(), false); addPass(createSIShrinkInstructionsPass(), false); } diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt index 7a4b5bb6d359..64c9e1882e4f 100644 --- a/lib/Target/AMDGPU/CMakeLists.txt +++ b/lib/Target/AMDGPU/CMakeLists.txt @@ -57,7 +57,6 @@ add_llvm_target(AMDGPUCodeGen SILowerControlFlow.cpp SILowerI1Copies.cpp SIMachineFunctionInfo.cpp - SIPrepareScratchRegs.cpp SIRegisterInfo.cpp SIShrinkInstructions.cpp SITypeRewriter.cpp diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index a2d8fa1b0a10..6aff4b5700d4 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -8,17 +8,89 @@ //==-----------------------------------------------------------------------===// #include "SIFrameLowering.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/RegisterScavenging.h" using namespace llvm; + +static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo, + const MachineFrameInfo *FrameInfo) { + if (!FuncInfo->hasSpilledSGPRs()) + return false; + + if (FuncInfo->hasSpilledVGPRs()) + return false; + + for (int I = 
FrameInfo->getObjectIndexBegin(), + E = FrameInfo->getObjectIndexEnd(); I != E; ++I) { + if (!FrameInfo->isSpillSlotObjectIndex(I)) + return false; + } + + return true; +} + +void SIFrameLowering::emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + if (!MF.getFrameInfo()->hasStackObjects()) + return; + + assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); + + const SIMachineFunctionInfo *MFI = MF.getInfo(); + + // If we only have SGPR spills, we won't actually be using scratch memory + // since these spill to VGPRs. + // + // FIXME: We should be cleaning up these unused SGPR spill frame indices + // somewhere. + if (hasOnlySGPRSpills(MFI, MF.getFrameInfo())) + return; + + const SIInstrInfo *TII = + static_cast(MF.getSubtarget().getInstrInfo()); + const SIRegisterInfo *TRI = &TII->getRegisterInfo(); + + // We need to insert initialization of the scratch resource descriptor. + unsigned ScratchRsrcReg = MFI->getScratchRSrcReg(); + assert(ScratchRsrcReg != AMDGPU::NoRegister); + + uint64_t Rsrc23 = TII->getScratchRsrcWords23(); + MachineBasicBlock::iterator I = MBB.begin(); + DebugLoc DL; + + unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); + unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); + unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); + unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); + + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0) + .addExternalSymbol("SCRATCH_RSRC_DWORD0"); + + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1) + .addExternalSymbol("SCRATCH_RSRC_DWORD1"); + + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2) + .addImm(Rsrc23 & 0xffffffff); + + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3) + .addImm(Rsrc23 >> 32); +} + void SIFrameLowering::processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS) const { MachineFrameInfo *MFI = MF.getFrameInfo(); + + if (!MFI->hasStackObjects()) + return; + bool 
MayNeedScavengingEmergencySlot = MFI->hasStackObjects(); assert((RS || !MayNeedScavengingEmergencySlot) && diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h index 677128d6ce0a..a9152fd8b2aa 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.h +++ b/lib/Target/AMDGPU/SIFrameLowering.h @@ -21,6 +21,9 @@ class SIFrameLowering final : public AMDGPUFrameLowering { AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {} ~SIFrameLowering() override {} + void emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const override; + void processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS = nullptr) const override; diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 94fad32c0070..51cbc95bc07c 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -552,6 +552,7 @@ SDValue SITargetLowering::LowerFormalArguments( MachineFunction &MF = DAG.getMachineFunction(); FunctionType *FType = MF.getFunction()->getFunctionType(); SIMachineFunctionInfo *Info = MF.getInfo(); + const AMDGPUSubtarget &ST = MF.getSubtarget(); if (Subtarget->isAmdHsaOS() && Info->getShaderType() != ShaderType::COMPUTE) { const Function *Fn = MF.getFunction(); @@ -622,9 +623,9 @@ SDValue SITargetLowering::LowerFormalArguments( // The pointer to the scratch buffer is stored in SGPR2, SGPR3 if (Info->getShaderType() == ShaderType::COMPUTE) { if (Subtarget->isAmdHsaOS()) - Info->NumUserSGPRs = 2; // FIXME: Need to support scratch buffers. + Info->NumUserSGPRs += 4; // FIXME: Need to support scratch buffers. 
else - Info->NumUserSGPRs = 4; + Info->NumUserSGPRs += 4; unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); @@ -750,6 +751,9 @@ SDValue SITargetLowering::LowerFormalArguments( Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx); } + if (MF.getFrameInfo()->hasStackObjects() || ST.isVGPRSpillingEnabled(Info)) + Info->setScratchRSrcReg(TRI); + if (Chains.empty()) return Chain; @@ -2335,15 +2339,6 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); } -MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr) const { - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); - - return buildRSRC(DAG, DL, Ptr, 0, TII->getScratchRsrcWords23()); -} - SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, unsigned Reg, EVT VT) const { diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index a358e3fc3c0d..0659dd7d5d05 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -116,10 +116,6 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const; - MachineSDNode *buildScratchRSRC(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr) const; - std::pair getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 9a85a1d515fe..b7d2a4712759 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -551,15 +551,16 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); + unsigned ScratchOffsetPreloadReg + = RI.getPreloadedValue(*MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); + unsigned Opcode = 
getVGPRSpillSaveOpcode(RC->getSize()); MFI->setHasSpilledVGPRs(); BuildMI(MBB, MI, DL, get(Opcode)) .addReg(SrcReg) // src .addFrameIndex(FrameIndex) // frame_idx - // Place-holder registers, these will be filled in by - // SIPrepareScratchRegs. - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef) + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(ScratchOffsetPreloadReg) // scratch_offset .addMemOperand(MMO); } @@ -637,13 +638,14 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); + unsigned ScratchOffsetPreloadReg + = RI.getPreloadedValue(*MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); + unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize()); BuildMI(MBB, MI, DL, get(Opcode), DestReg) .addFrameIndex(FrameIndex) // frame_idx - // Place-holder registers, these will be filled in by - // SIPrepareScratchRegs. - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef) + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(ScratchOffsetPreloadReg) // scratch_offset .addMemOperand(MMO); } diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 6269dce553f6..d042844aa138 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -68,6 +68,14 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) WorkItemIDZ = true; } +void SIMachineFunctionInfo::setScratchRSrcReg(const SIRegisterInfo *TRI) { + // We need to round up to next multiple of 4. 
+ unsigned NextSReg128 = RoundUpToAlignment(NumUserSGPRs + 5, 4); + unsigned RegSub0 = AMDGPU::SReg_32RegClass.getRegister(NextSReg128); + ScratchRSrcReg = TRI->getMatchingSuperReg(RegSub0, AMDGPU::sub0, + &AMDGPU::SReg_128RegClass); +} + SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( MachineFunction *MF, unsigned FrameIndex, diff --git a/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp b/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp deleted file mode 100644 index a6c22775e098..000000000000 --- a/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp +++ /dev/null @@ -1,196 +0,0 @@ -//===-- SIPrepareScratchRegs.cpp - Use predicates for control flow --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// -/// This pass loads scratch pointer and scratch offset into a register or a -/// frame index which can be used anywhere in the program. These values will -/// be used for spilling VGPRs. 
-/// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "SIDefines.h" -#include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" - -using namespace llvm; - -namespace { - -class SIPrepareScratchRegs : public MachineFunctionPass { - -private: - static char ID; - -public: - SIPrepareScratchRegs() : MachineFunctionPass(ID) { } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI prepare scratch registers"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; - -} // End anonymous namespace - -char SIPrepareScratchRegs::ID = 0; - -FunctionPass *llvm::createSIPrepareScratchRegs() { - return new SIPrepareScratchRegs(); -} - -bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) { - SIMachineFunctionInfo *MFI = MF.getInfo(); - const SIInstrInfo *TII = - static_cast(MF.getSubtarget().getInstrInfo()); - const SIRegisterInfo *TRI = &TII->getRegisterInfo(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - MachineFrameInfo *FrameInfo = MF.getFrameInfo(); - MachineBasicBlock *Entry = &MF.front(); - MachineBasicBlock::iterator I = Entry->begin(); - DebugLoc DL = I->getDebugLoc(); - - // FIXME: If we don't have enough VGPRs for SGPR spilling we will need to - // run this pass. 
- if (!MFI->hasSpilledVGPRs()) - return false; - - unsigned ScratchPtrPreloadReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); - unsigned ScratchOffsetPreloadReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); - - if (!Entry->isLiveIn(ScratchPtrPreloadReg)) - Entry->addLiveIn(ScratchPtrPreloadReg); - - if (!Entry->isLiveIn(ScratchOffsetPreloadReg)) - Entry->addLiveIn(ScratchOffsetPreloadReg); - - // Load the scratch offset. - unsigned ScratchOffsetReg = - TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_32RegClass); - int ScratchOffsetFI = -1; - - if (ScratchOffsetReg != AMDGPU::NoRegister) { - // Found an SGPR to use - BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg) - .addReg(ScratchOffsetPreloadReg); - } else { - // No SGPR is available, we must spill. - ScratchOffsetFI = FrameInfo->CreateSpillStackObject(4,4); - BuildMI(*Entry, I, DL, TII->get(AMDGPU::SI_SPILL_S32_SAVE)) - .addReg(ScratchOffsetPreloadReg) - .addFrameIndex(ScratchOffsetFI) - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - } - - - // Now that we have the scratch pointer and offset values, we need to - // add them to all the SI_SPILL_V* instructions. - - RegScavenger RS; - unsigned ScratchRsrcFI = FrameInfo->CreateSpillStackObject(16, 4); - RS.addScavengingFrameIndex(ScratchRsrcFI); - - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - - MachineBasicBlock &MBB = *BI; - // Add the scratch offset reg as a live-in so that the register scavenger - // doesn't re-use it. 
- if (!MBB.isLiveIn(ScratchOffsetReg) && - ScratchOffsetReg != AMDGPU::NoRegister) - MBB.addLiveIn(ScratchOffsetReg); - RS.enterBasicBlock(&MBB); - - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - MachineInstr &MI = *I; - RS.forward(I); - DebugLoc DL = MI.getDebugLoc(); - if (!TII->isVGPRSpill(MI)) - continue; - - // Scratch resource - unsigned ScratchRsrcReg = - RS.scavengeRegister(&AMDGPU::SReg_128RegClass, 0); - - uint64_t Rsrc23 = TII->getScratchRsrcWords23(); - - unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); - unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); - unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); - unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0) - .addExternalSymbol("SCRATCH_RSRC_DWORD0") - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1) - .addExternalSymbol("SCRATCH_RSRC_DWORD1") - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2) - .addImm(Rsrc23 & 0xffffffff) - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3) - .addImm(Rsrc23 >> 32) - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - // Scratch Offset - if (ScratchOffsetReg == AMDGPU::NoRegister) { - ScratchOffsetReg = RS.scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); - BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_SPILL_S32_RESTORE), - ScratchOffsetReg) - .addFrameIndex(ScratchOffsetFI) - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - } else if (!MBB.isLiveIn(ScratchOffsetReg)) { - MBB.addLiveIn(ScratchOffsetReg); - } - - if (ScratchRsrcReg == AMDGPU::NoRegister || - ScratchOffsetReg == AMDGPU::NoRegister) { - LLVMContext &Ctx = MF.getFunction()->getContext(); - Ctx.emitError("ran out of SGPRs for 
spilling VGPRs"); - ScratchRsrcReg = AMDGPU::SGPR0; - ScratchOffsetReg = AMDGPU::SGPR0; - } - MI.getOperand(2).setReg(ScratchRsrcReg); - MI.getOperand(2).setIsKill(true); - MI.getOperand(2).setIsUndef(false); - MI.getOperand(3).setReg(ScratchOffsetReg); - MI.getOperand(3).setIsUndef(false); - MI.getOperand(3).setIsKill(false); - MI.addOperand(MachineOperand::CreateReg(Rsrc0, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc1, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc2, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc3, false, true, true)); - } - } - return true; -} diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index ab7539b6fb3a..b392c86fa2e1 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -68,6 +68,22 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { } } + const SIMachineFunctionInfo *MFI = MF.getInfo(); + unsigned ScratchRSrcReg = MFI->getScratchRSrcReg(); + if (ScratchRSrcReg != AMDGPU::NoRegister) { + unsigned ScratchOffsetPreloadReg + = getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); + // We will need to use this user SGPR argument for spilling, and thus never + // want it to be spilled. + reserveRegisterTuples(Reserved, ScratchOffsetPreloadReg); + + // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need + // to spill. + // TODO: May need to reserve a VGPR if doing LDS spilling. + reserveRegisterTuples(Reserved, ScratchRSrcReg); + assert(!isSubRegister(ScratchRSrcReg, ScratchOffsetPreloadReg)); + } + return Reserved; } @@ -243,6 +259,9 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, .addReg(SubReg) .addImm(Spill.Lane); + // FIXME: Since this spills to another register instead of an actual + // frame index, we should delete the frame index when all references to + // it are fixed. 
} MI->eraseFromParent(); break; diff --git a/test/CodeGen/AMDGPU/kernel-args.ll b/test/CodeGen/AMDGPU/kernel-args.ll index 803b2ecced01..e9d98ac89e72 100644 --- a/test/CodeGen/AMDGPU/kernel-args.ll +++ b/test/CodeGen/AMDGPU/kernel-args.ll @@ -294,8 +294,8 @@ entry: ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X -; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11 -; VI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x44 +; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11 +; VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44 define void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind { entry: store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4 @@ -311,7 +311,7 @@ entry: ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X -; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11 +; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11 define void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind { entry: store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4 @@ -413,8 +413,8 @@ entry: ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X -; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19 -; VI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64 +; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19 +; VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64 define void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind { entry: store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4 @@ -438,8 +438,8 @@ entry: ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X -; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19 -; VI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 
0x64 +; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19 +; VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64 define void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind { entry: store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4 diff --git a/test/CodeGen/AMDGPU/large-alloca-compute.ll b/test/CodeGen/AMDGPU/large-alloca-compute.ll new file mode 100644 index 000000000000..5e8cf5bb3d25 --- /dev/null +++ b/test/CodeGen/AMDGPU/large-alloca-compute.ll @@ -0,0 +1,42 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s +; XUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s +; XUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s + +; FIXME: align on alloca seems to be ignored for private_segment_alignment + +; ALL-LABEL: {{^}}large_alloca_compute_shader: + +; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN: s_mov_b32 s14, -1 +; CI: s_mov_b32 s15, 0x80f000 +; VI: s_mov_b32 s15, 0x800000 + + +; GCNHSA: .amd_kernel_code_t +; GCNHSA: private_segment_alignment = 4 +; GCNHSA: .end_amd_kernel_code_t + +; GCNHSA: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCNHSA: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCNHSA: s_mov_b32 s10, -1 +; CIHSA: s_mov_b32 s11, 0x180f000 +; VIHSA: s_mov_b32 s11, 0x11800000 + +; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s6 offen +; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s6 offen + +; Scratch size = alloca size + emergency stack slot +; ALL: ; ScratchSize: 32772 +define void @large_alloca_compute_shader(i32 %x, i32 %y) #0 { + %large = alloca [8192 x i32], align 4 + 
%gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 + store volatile i32 %x, i32* %gep + %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y + %val = load volatile i32, i32* %gep1 + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/large-alloca-graphics.ll b/test/CodeGen/AMDGPU/large-alloca-graphics.ll new file mode 100644 index 000000000000..208b9a10050c --- /dev/null +++ b/test/CodeGen/AMDGPU/large-alloca-graphics.ll @@ -0,0 +1,47 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s + +; ALL-LABEL: {{^}}large_alloca_pixel_shader: +; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN: s_mov_b32 s10, -1 +; CI: s_mov_b32 s11, 0x80f000 +; VI: s_mov_b32 s11, 0x800000 + +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen + +; ALL: ; ScratchSize: 32772 +define void @large_alloca_pixel_shader(i32 %x, i32 %y) #1 { + %large = alloca [8192 x i32], align 4 + %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 + store volatile i32 %x, i32* %gep + %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y + %val = load volatile i32, i32* %gep1 + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; ALL-LABEL: {{^}}large_alloca_pixel_shader_inreg: +; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN: s_mov_b32 s10, -1 +; CI: s_mov_b32 s11, 0x80f000 +; VI: s_mov_b32 s11, 0x800000 + +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen + +; ALL: ; ScratchSize: 32772 +define void 
@large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #1 { + %large = alloca [8192 x i32], align 4 + %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 + store volatile i32 %x, i32* %gep + %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y + %val = load volatile i32, i32* %gep1 + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind "ShaderType"="0" } diff --git a/test/CodeGen/AMDGPU/large-alloca.ll b/test/CodeGen/AMDGPU/large-alloca.ll deleted file mode 100644 index e1122da78ef5..000000000000 --- a/test/CodeGen/AMDGPU/large-alloca.ll +++ /dev/null @@ -1,18 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s - -; GCN-LABEL: {{^}}large_alloca: -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen -; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen -; GCN: ScratchSize: 32776 -define void @large_alloca(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { - %large = alloca [8192 x i32], align 4 - %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 - store i32 %x, i32* %gep - %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y - %load = load i32, i32* %gep1 - store i32 %load, i32 addrspace(1)* %out - ret void -} - -attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/si-sgpr-spill.ll b/test/CodeGen/AMDGPU/si-sgpr-spill.ll index 84652701f773..d7b35fc631eb 100644 --- a/test/CodeGen/AMDGPU/si-sgpr-spill.ll +++ b/test/CodeGen/AMDGPU/si-sgpr-spill.ll @@ -6,6 +6,16 @@ ; CHECK-LABEL: {{^}}main: ; CHECK: s_wqm + +; Make sure not emitting unused scratch resource descriptor setup +; CHECK-NOT: s_mov_b32 +; CHECK-NOT: s_mov_b32 +; CHECK-NOT: s_mov_b32 +; 
CHECK-NOT: s_mov_b32 + +; CHECK: s_mov_b32 m0 + + ; Writing to M0 from an SMRD instruction will hang the GPU. ; CHECK-NOT: s_buffer_load_dword m0 ; CHECK: s_endpgm diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll new file mode 100644 index 000000000000..2cbb67d85fba --- /dev/null +++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll @@ -0,0 +1,583 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; XUN: llc -march=amdgcn -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA %s +; XUN: llc -march=amdgcn -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA %s + +; This ends up using all 256 registers and requires register +; scavenging which will fail to find an unused register. + +; Check the ScratchSize to avoid regressions from spilling +; intermediate register class copies. + +; FIXME: The same register is initialized to 0 for every spill. 
+ +declare i32 @llvm.r600.read.tgid.x() #1 +declare i32 @llvm.r600.read.tgid.y() #1 +declare i32 @llvm.r600.read.tgid.z() #1 + +; GCN-LABEL: {{^}}spill_vgpr_compute: + +; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_mov_b32 s15, 0x80f000 +; VI-NEXT: s_mov_b32 s15, 0x800000 + +; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s8 offset:{{[0-9]+}} ; 4-byte Folded Spill + +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s8 offen offset:{{[0-9]+}} +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s8 offen offset:{{[0-9]+}} + +; GCN: NumVgprs: 256 +; GCN: ScratchSize: 1024 + +; s[0:3] input user SGPRs. s4,s5,s6 = workgroup IDs. s8 scratch offset. +define void @spill_vgpr_compute(<4 x float> %arg6, float addrspace(1)* %arg, i32 %arg1, i32 %arg2, float %arg3, float %arg4, float %arg5) #0 { +bb: + %tmp = add i32 %arg1, %arg2 + %tmp7 = extractelement <4 x float> %arg6, i32 0 + %tmp8 = extractelement <4 x float> %arg6, i32 1 + %tmp9 = extractelement <4 x float> %arg6, i32 2 + %tmp10 = extractelement <4 x float> %arg6, i32 3 + %tmp11 = bitcast float %arg5 to i32 + br label %bb12 + +bb12: ; preds = %bb145, %bb + %tmp13 = phi float [ 0.000000e+00, %bb ], [ %tmp338, %bb145 ] + %tmp14 = phi float [ 0.000000e+00, %bb ], [ %tmp337, %bb145 ] + %tmp15 = phi float [ 0.000000e+00, %bb ], [ %tmp336, %bb145 ] + %tmp16 = phi float [ 0.000000e+00, %bb ], [ %tmp339, %bb145 ] + %tmp17 = phi float [ 0.000000e+00, %bb ], [ %tmp335, %bb145 ] + %tmp18 = phi float [ 0.000000e+00, %bb ], [ %tmp334, %bb145 ] + %tmp19 = phi float [ 0.000000e+00, %bb ], [ %tmp333, %bb145 ] + %tmp20 = phi float [ 0.000000e+00, %bb ], [ %tmp340, %bb145 ] + %tmp21 = phi float [ 0.000000e+00, %bb ], [ %tmp332, %bb145 ] + %tmp22 = phi float [ 0.000000e+00, %bb ], [ %tmp331, %bb145 ] + %tmp23 = phi float [ 0.000000e+00, %bb ], [ %tmp330, %bb145 ] + %tmp24 = phi float [ 0.000000e+00, %bb ], [ %tmp341, %bb145 ] + 
%tmp25 = phi float [ 0.000000e+00, %bb ], [ %tmp329, %bb145 ] + %tmp26 = phi float [ 0.000000e+00, %bb ], [ %tmp328, %bb145 ] + %tmp27 = phi float [ 0.000000e+00, %bb ], [ %tmp327, %bb145 ] + %tmp28 = phi float [ 0.000000e+00, %bb ], [ %tmp342, %bb145 ] + %tmp29 = phi float [ 0.000000e+00, %bb ], [ %tmp326, %bb145 ] + %tmp30 = phi float [ 0.000000e+00, %bb ], [ %tmp325, %bb145 ] + %tmp31 = phi float [ 0.000000e+00, %bb ], [ %tmp324, %bb145 ] + %tmp32 = phi float [ 0.000000e+00, %bb ], [ %tmp343, %bb145 ] + %tmp33 = phi float [ 0.000000e+00, %bb ], [ %tmp323, %bb145 ] + %tmp34 = phi float [ 0.000000e+00, %bb ], [ %tmp322, %bb145 ] + %tmp35 = phi float [ 0.000000e+00, %bb ], [ %tmp321, %bb145 ] + %tmp36 = phi float [ 0.000000e+00, %bb ], [ %tmp344, %bb145 ] + %tmp37 = phi float [ 0.000000e+00, %bb ], [ %tmp320, %bb145 ] + %tmp38 = phi float [ 0.000000e+00, %bb ], [ %tmp319, %bb145 ] + %tmp39 = phi float [ 0.000000e+00, %bb ], [ %tmp318, %bb145 ] + %tmp40 = phi float [ 0.000000e+00, %bb ], [ %tmp345, %bb145 ] + %tmp41 = phi float [ 0.000000e+00, %bb ], [ %tmp317, %bb145 ] + %tmp42 = phi float [ 0.000000e+00, %bb ], [ %tmp316, %bb145 ] + %tmp43 = phi float [ 0.000000e+00, %bb ], [ %tmp315, %bb145 ] + %tmp44 = phi float [ 0.000000e+00, %bb ], [ %tmp346, %bb145 ] + %tmp45 = phi float [ 0.000000e+00, %bb ], [ %tmp314, %bb145 ] + %tmp46 = phi float [ 0.000000e+00, %bb ], [ %tmp313, %bb145 ] + %tmp47 = phi float [ 0.000000e+00, %bb ], [ %tmp312, %bb145 ] + %tmp48 = phi float [ 0.000000e+00, %bb ], [ %tmp347, %bb145 ] + %tmp49 = phi float [ 0.000000e+00, %bb ], [ %tmp311, %bb145 ] + %tmp50 = phi float [ 0.000000e+00, %bb ], [ %tmp310, %bb145 ] + %tmp51 = phi float [ 0.000000e+00, %bb ], [ %tmp309, %bb145 ] + %tmp52 = phi float [ 0.000000e+00, %bb ], [ %tmp348, %bb145 ] + %tmp53 = phi float [ 0.000000e+00, %bb ], [ %tmp308, %bb145 ] + %tmp54 = phi float [ 0.000000e+00, %bb ], [ %tmp307, %bb145 ] + %tmp55 = phi float [ 0.000000e+00, %bb ], [ %tmp306, %bb145 ] + %tmp56 = phi 
float [ 0.000000e+00, %bb ], [ %tmp349, %bb145 ] + %tmp57 = phi float [ 0.000000e+00, %bb ], [ %tmp305, %bb145 ] + %tmp58 = phi float [ 0.000000e+00, %bb ], [ %tmp304, %bb145 ] + %tmp59 = phi float [ 0.000000e+00, %bb ], [ %tmp303, %bb145 ] + %tmp60 = phi float [ 0.000000e+00, %bb ], [ %tmp350, %bb145 ] + %tmp61 = phi float [ 0.000000e+00, %bb ], [ %tmp302, %bb145 ] + %tmp62 = phi float [ 0.000000e+00, %bb ], [ %tmp301, %bb145 ] + %tmp63 = phi float [ 0.000000e+00, %bb ], [ %tmp300, %bb145 ] + %tmp64 = phi float [ 0.000000e+00, %bb ], [ %tmp351, %bb145 ] + %tmp65 = phi float [ 0.000000e+00, %bb ], [ %tmp299, %bb145 ] + %tmp66 = phi float [ 0.000000e+00, %bb ], [ %tmp298, %bb145 ] + %tmp67 = phi float [ 0.000000e+00, %bb ], [ %tmp297, %bb145 ] + %tmp68 = phi float [ 0.000000e+00, %bb ], [ %tmp352, %bb145 ] + %tmp69 = phi float [ 0.000000e+00, %bb ], [ %tmp296, %bb145 ] + %tmp70 = phi float [ 0.000000e+00, %bb ], [ %tmp295, %bb145 ] + %tmp71 = phi float [ 0.000000e+00, %bb ], [ %tmp294, %bb145 ] + %tmp72 = phi float [ 0.000000e+00, %bb ], [ %tmp353, %bb145 ] + %tmp73 = phi float [ 0.000000e+00, %bb ], [ %tmp293, %bb145 ] + %tmp74 = phi float [ 0.000000e+00, %bb ], [ %tmp292, %bb145 ] + %tmp75 = phi float [ 0.000000e+00, %bb ], [ %tmp291, %bb145 ] + %tmp76 = phi float [ 0.000000e+00, %bb ], [ %tmp354, %bb145 ] + %tmp77 = phi float [ 0.000000e+00, %bb ], [ %tmp290, %bb145 ] + %tmp78 = phi float [ 0.000000e+00, %bb ], [ %tmp289, %bb145 ] + %tmp79 = phi float [ 0.000000e+00, %bb ], [ %tmp288, %bb145 ] + %tmp80 = phi float [ 0.000000e+00, %bb ], [ %tmp355, %bb145 ] + %tmp81 = phi float [ 0.000000e+00, %bb ], [ %tmp287, %bb145 ] + %tmp82 = phi float [ 0.000000e+00, %bb ], [ %tmp286, %bb145 ] + %tmp83 = phi float [ 0.000000e+00, %bb ], [ %tmp285, %bb145 ] + %tmp84 = phi float [ 0.000000e+00, %bb ], [ %tmp356, %bb145 ] + %tmp85 = phi float [ 0.000000e+00, %bb ], [ %tmp284, %bb145 ] + %tmp86 = phi float [ 0.000000e+00, %bb ], [ %tmp283, %bb145 ] + %tmp87 = phi float [ 
0.000000e+00, %bb ], [ %tmp282, %bb145 ] + %tmp88 = phi float [ 0.000000e+00, %bb ], [ %tmp357, %bb145 ] + %tmp89 = phi float [ 0.000000e+00, %bb ], [ %tmp281, %bb145 ] + %tmp90 = phi float [ 0.000000e+00, %bb ], [ %tmp280, %bb145 ] + %tmp91 = phi float [ 0.000000e+00, %bb ], [ %tmp279, %bb145 ] + %tmp92 = phi float [ 0.000000e+00, %bb ], [ %tmp358, %bb145 ] + %tmp93 = phi float [ 0.000000e+00, %bb ], [ %tmp359, %bb145 ] + %tmp94 = phi float [ 0.000000e+00, %bb ], [ %tmp360, %bb145 ] + %tmp95 = phi float [ 0.000000e+00, %bb ], [ %tmp409, %bb145 ] + %tmp96 = phi float [ 0.000000e+00, %bb ], [ %tmp361, %bb145 ] + %tmp97 = phi float [ 0.000000e+00, %bb ], [ %tmp362, %bb145 ] + %tmp98 = phi float [ 0.000000e+00, %bb ], [ %tmp363, %bb145 ] + %tmp99 = phi float [ 0.000000e+00, %bb ], [ %tmp364, %bb145 ] + %tmp100 = phi float [ 0.000000e+00, %bb ], [ %tmp365, %bb145 ] + %tmp101 = phi float [ 0.000000e+00, %bb ], [ %tmp366, %bb145 ] + %tmp102 = phi float [ 0.000000e+00, %bb ], [ %tmp367, %bb145 ] + %tmp103 = phi float [ 0.000000e+00, %bb ], [ %tmp368, %bb145 ] + %tmp104 = phi float [ 0.000000e+00, %bb ], [ %tmp369, %bb145 ] + %tmp105 = phi float [ 0.000000e+00, %bb ], [ %tmp370, %bb145 ] + %tmp106 = phi float [ 0.000000e+00, %bb ], [ %tmp371, %bb145 ] + %tmp107 = phi float [ 0.000000e+00, %bb ], [ %tmp372, %bb145 ] + %tmp108 = phi float [ 0.000000e+00, %bb ], [ %tmp373, %bb145 ] + %tmp109 = phi float [ 0.000000e+00, %bb ], [ %tmp374, %bb145 ] + %tmp110 = phi float [ 0.000000e+00, %bb ], [ %tmp375, %bb145 ] + %tmp111 = phi float [ 0.000000e+00, %bb ], [ %tmp376, %bb145 ] + %tmp112 = phi float [ 0.000000e+00, %bb ], [ %tmp377, %bb145 ] + %tmp113 = phi float [ 0.000000e+00, %bb ], [ %tmp378, %bb145 ] + %tmp114 = phi float [ 0.000000e+00, %bb ], [ %tmp379, %bb145 ] + %tmp115 = phi float [ 0.000000e+00, %bb ], [ %tmp380, %bb145 ] + %tmp116 = phi float [ 0.000000e+00, %bb ], [ %tmp381, %bb145 ] + %tmp117 = phi float [ 0.000000e+00, %bb ], [ %tmp382, %bb145 ] + %tmp118 = phi 
float [ 0.000000e+00, %bb ], [ %tmp383, %bb145 ] + %tmp119 = phi float [ 0.000000e+00, %bb ], [ %tmp384, %bb145 ] + %tmp120 = phi float [ 0.000000e+00, %bb ], [ %tmp385, %bb145 ] + %tmp121 = phi float [ 0.000000e+00, %bb ], [ %tmp386, %bb145 ] + %tmp122 = phi float [ 0.000000e+00, %bb ], [ %tmp387, %bb145 ] + %tmp123 = phi float [ 0.000000e+00, %bb ], [ %tmp388, %bb145 ] + %tmp124 = phi float [ 0.000000e+00, %bb ], [ %tmp389, %bb145 ] + %tmp125 = phi float [ 0.000000e+00, %bb ], [ %tmp390, %bb145 ] + %tmp126 = phi float [ 0.000000e+00, %bb ], [ %tmp391, %bb145 ] + %tmp127 = phi float [ 0.000000e+00, %bb ], [ %tmp392, %bb145 ] + %tmp128 = phi float [ 0.000000e+00, %bb ], [ %tmp393, %bb145 ] + %tmp129 = phi float [ 0.000000e+00, %bb ], [ %tmp394, %bb145 ] + %tmp130 = phi float [ 0.000000e+00, %bb ], [ %tmp395, %bb145 ] + %tmp131 = phi float [ 0.000000e+00, %bb ], [ %tmp396, %bb145 ] + %tmp132 = phi float [ 0.000000e+00, %bb ], [ %tmp397, %bb145 ] + %tmp133 = phi float [ 0.000000e+00, %bb ], [ %tmp398, %bb145 ] + %tmp134 = phi float [ 0.000000e+00, %bb ], [ %tmp399, %bb145 ] + %tmp135 = phi float [ 0.000000e+00, %bb ], [ %tmp400, %bb145 ] + %tmp136 = phi float [ 0.000000e+00, %bb ], [ %tmp401, %bb145 ] + %tmp137 = phi float [ 0.000000e+00, %bb ], [ %tmp402, %bb145 ] + %tmp138 = phi float [ 0.000000e+00, %bb ], [ %tmp403, %bb145 ] + %tmp139 = phi float [ 0.000000e+00, %bb ], [ %tmp404, %bb145 ] + %tmp140 = phi float [ 0.000000e+00, %bb ], [ %tmp405, %bb145 ] + %tmp141 = phi float [ 0.000000e+00, %bb ], [ %tmp406, %bb145 ] + %tmp142 = bitcast float %tmp95 to i32 + %tmp143 = icmp sgt i32 %tmp142, 125 + br i1 %tmp143, label %bb144, label %bb145 + +bb144: ; preds = %bb12 + store volatile float %arg3, float addrspace(1)* %arg + store volatile float %tmp91, float addrspace(1)* %arg + store volatile float %tmp90, float addrspace(1)* %arg + store volatile float %tmp89, float addrspace(1)* %arg + store volatile float %tmp87, float addrspace(1)* %arg + store volatile float 
%tmp86, float addrspace(1)* %arg + store volatile float %tmp85, float addrspace(1)* %arg + store volatile float %tmp83, float addrspace(1)* %arg + store volatile float %tmp82, float addrspace(1)* %arg + store volatile float %tmp81, float addrspace(1)* %arg + store volatile float %tmp79, float addrspace(1)* %arg + store volatile float %tmp78, float addrspace(1)* %arg + store volatile float %tmp77, float addrspace(1)* %arg + store volatile float %tmp75, float addrspace(1)* %arg + store volatile float %tmp74, float addrspace(1)* %arg + store volatile float %tmp73, float addrspace(1)* %arg + store volatile float %tmp71, float addrspace(1)* %arg + store volatile float %tmp70, float addrspace(1)* %arg + store volatile float %tmp69, float addrspace(1)* %arg + store volatile float %tmp67, float addrspace(1)* %arg + store volatile float %tmp66, float addrspace(1)* %arg + store volatile float %tmp65, float addrspace(1)* %arg + store volatile float %tmp63, float addrspace(1)* %arg + store volatile float %tmp62, float addrspace(1)* %arg + store volatile float %tmp61, float addrspace(1)* %arg + store volatile float %tmp59, float addrspace(1)* %arg + store volatile float %tmp58, float addrspace(1)* %arg + store volatile float %tmp57, float addrspace(1)* %arg + store volatile float %tmp55, float addrspace(1)* %arg + store volatile float %tmp54, float addrspace(1)* %arg + store volatile float %tmp53, float addrspace(1)* %arg + store volatile float %tmp51, float addrspace(1)* %arg + store volatile float %tmp50, float addrspace(1)* %arg + store volatile float %tmp49, float addrspace(1)* %arg + store volatile float %tmp47, float addrspace(1)* %arg + store volatile float %tmp46, float addrspace(1)* %arg + store volatile float %tmp45, float addrspace(1)* %arg + store volatile float %tmp43, float addrspace(1)* %arg + store volatile float %tmp42, float addrspace(1)* %arg + store volatile float %tmp41, float addrspace(1)* %arg + store volatile float %tmp39, float addrspace(1)* %arg + 
store volatile float %tmp38, float addrspace(1)* %arg + store volatile float %tmp37, float addrspace(1)* %arg + store volatile float %tmp35, float addrspace(1)* %arg + store volatile float %tmp34, float addrspace(1)* %arg + store volatile float %tmp33, float addrspace(1)* %arg + store volatile float %tmp31, float addrspace(1)* %arg + store volatile float %tmp30, float addrspace(1)* %arg + store volatile float %tmp29, float addrspace(1)* %arg + store volatile float %tmp27, float addrspace(1)* %arg + store volatile float %tmp26, float addrspace(1)* %arg + store volatile float %tmp25, float addrspace(1)* %arg + store volatile float %tmp23, float addrspace(1)* %arg + store volatile float %tmp22, float addrspace(1)* %arg + store volatile float %tmp21, float addrspace(1)* %arg + store volatile float %tmp19, float addrspace(1)* %arg + store volatile float %tmp18, float addrspace(1)* %arg + store volatile float %tmp17, float addrspace(1)* %arg + store volatile float %tmp15, float addrspace(1)* %arg + store volatile float %tmp14, float addrspace(1)* %arg + store volatile float %tmp13, float addrspace(1)* %arg + store volatile float %tmp16, float addrspace(1)* %arg + store volatile float %tmp20, float addrspace(1)* %arg + store volatile float %tmp24, float addrspace(1)* %arg + store volatile float %tmp28, float addrspace(1)* %arg + store volatile float %tmp32, float addrspace(1)* %arg + store volatile float %tmp36, float addrspace(1)* %arg + store volatile float %tmp40, float addrspace(1)* %arg + store volatile float %tmp44, float addrspace(1)* %arg + store volatile float %tmp48, float addrspace(1)* %arg + store volatile float %tmp52, float addrspace(1)* %arg + store volatile float %tmp56, float addrspace(1)* %arg + store volatile float %tmp60, float addrspace(1)* %arg + store volatile float %tmp64, float addrspace(1)* %arg + store volatile float %tmp68, float addrspace(1)* %arg + store volatile float %tmp72, float addrspace(1)* %arg + store volatile float %tmp76, float 
addrspace(1)* %arg + store volatile float %tmp80, float addrspace(1)* %arg + store volatile float %tmp84, float addrspace(1)* %arg + store volatile float %tmp88, float addrspace(1)* %arg + store volatile float %tmp92, float addrspace(1)* %arg + store volatile float %tmp93, float addrspace(1)* %arg + store volatile float %tmp94, float addrspace(1)* %arg + store volatile float %tmp96, float addrspace(1)* %arg + store volatile float %tmp97, float addrspace(1)* %arg + store volatile float %tmp98, float addrspace(1)* %arg + store volatile float %tmp99, float addrspace(1)* %arg + store volatile float %tmp100, float addrspace(1)* %arg + store volatile float %tmp101, float addrspace(1)* %arg + store volatile float %tmp102, float addrspace(1)* %arg + store volatile float %tmp103, float addrspace(1)* %arg + store volatile float %tmp104, float addrspace(1)* %arg + store volatile float %tmp105, float addrspace(1)* %arg + store volatile float %tmp106, float addrspace(1)* %arg + store volatile float %tmp107, float addrspace(1)* %arg + store volatile float %tmp108, float addrspace(1)* %arg + store volatile float %tmp109, float addrspace(1)* %arg + store volatile float %tmp110, float addrspace(1)* %arg + store volatile float %tmp111, float addrspace(1)* %arg + store volatile float %tmp112, float addrspace(1)* %arg + store volatile float %tmp113, float addrspace(1)* %arg + store volatile float %tmp114, float addrspace(1)* %arg + store volatile float %tmp115, float addrspace(1)* %arg + store volatile float %tmp116, float addrspace(1)* %arg + store volatile float %tmp117, float addrspace(1)* %arg + store volatile float %tmp118, float addrspace(1)* %arg + store volatile float %tmp119, float addrspace(1)* %arg + store volatile float %tmp120, float addrspace(1)* %arg + store volatile float %tmp121, float addrspace(1)* %arg + store volatile float %tmp122, float addrspace(1)* %arg + store volatile float %tmp123, float addrspace(1)* %arg + store volatile float %tmp124, float addrspace(1)* 
%arg + store volatile float %tmp125, float addrspace(1)* %arg + store volatile float %tmp126, float addrspace(1)* %arg + store volatile float %tmp127, float addrspace(1)* %arg + store volatile float %tmp128, float addrspace(1)* %arg + store volatile float %tmp129, float addrspace(1)* %arg + store volatile float %tmp130, float addrspace(1)* %arg + store volatile float %tmp131, float addrspace(1)* %arg + store volatile float %tmp132, float addrspace(1)* %arg + store volatile float %tmp133, float addrspace(1)* %arg + store volatile float %tmp134, float addrspace(1)* %arg + store volatile float %tmp135, float addrspace(1)* %arg + store volatile float %tmp136, float addrspace(1)* %arg + store volatile float %tmp137, float addrspace(1)* %arg + store volatile float %tmp138, float addrspace(1)* %arg + store volatile float %tmp139, float addrspace(1)* %arg + store volatile float %arg4, float addrspace(1)* %arg + store volatile float %tmp7, float addrspace(1)* %arg + store volatile float %tmp8, float addrspace(1)* %arg + store volatile float %tmp9, float addrspace(1)* %arg + store volatile float %tmp10, float addrspace(1)* %arg + ret void + +bb145: ; preds = %bb12 + %tmp146 = bitcast float %tmp95 to i32 + %tmp147 = bitcast float %tmp95 to i32 + %tmp148 = add i32 %tmp11, %tmp147 + %tmp149 = bitcast i32 %tmp148 to float + %tmp150 = insertelement <128 x float> undef, float %tmp91, i32 0 + %tmp151 = insertelement <128 x float> %tmp150, float %tmp90, i32 1 + %tmp152 = insertelement <128 x float> %tmp151, float %tmp89, i32 2 + %tmp153 = insertelement <128 x float> %tmp152, float %tmp87, i32 3 + %tmp154 = insertelement <128 x float> %tmp153, float %tmp86, i32 4 + %tmp155 = insertelement <128 x float> %tmp154, float %tmp85, i32 5 + %tmp156 = insertelement <128 x float> %tmp155, float %tmp83, i32 6 + %tmp157 = insertelement <128 x float> %tmp156, float %tmp82, i32 7 + %tmp158 = insertelement <128 x float> %tmp157, float %tmp81, i32 8 + %tmp159 = insertelement <128 x float> %tmp158, 
float %tmp79, i32 9 + %tmp160 = insertelement <128 x float> %tmp159, float %tmp78, i32 10 + %tmp161 = insertelement <128 x float> %tmp160, float %tmp77, i32 11 + %tmp162 = insertelement <128 x float> %tmp161, float %tmp75, i32 12 + %tmp163 = insertelement <128 x float> %tmp162, float %tmp74, i32 13 + %tmp164 = insertelement <128 x float> %tmp163, float %tmp73, i32 14 + %tmp165 = insertelement <128 x float> %tmp164, float %tmp71, i32 15 + %tmp166 = insertelement <128 x float> %tmp165, float %tmp70, i32 16 + %tmp167 = insertelement <128 x float> %tmp166, float %tmp69, i32 17 + %tmp168 = insertelement <128 x float> %tmp167, float %tmp67, i32 18 + %tmp169 = insertelement <128 x float> %tmp168, float %tmp66, i32 19 + %tmp170 = insertelement <128 x float> %tmp169, float %tmp65, i32 20 + %tmp171 = insertelement <128 x float> %tmp170, float %tmp63, i32 21 + %tmp172 = insertelement <128 x float> %tmp171, float %tmp62, i32 22 + %tmp173 = insertelement <128 x float> %tmp172, float %tmp61, i32 23 + %tmp174 = insertelement <128 x float> %tmp173, float %tmp59, i32 24 + %tmp175 = insertelement <128 x float> %tmp174, float %tmp58, i32 25 + %tmp176 = insertelement <128 x float> %tmp175, float %tmp57, i32 26 + %tmp177 = insertelement <128 x float> %tmp176, float %tmp55, i32 27 + %tmp178 = insertelement <128 x float> %tmp177, float %tmp54, i32 28 + %tmp179 = insertelement <128 x float> %tmp178, float %tmp53, i32 29 + %tmp180 = insertelement <128 x float> %tmp179, float %tmp51, i32 30 + %tmp181 = insertelement <128 x float> %tmp180, float %tmp50, i32 31 + %tmp182 = insertelement <128 x float> %tmp181, float %tmp49, i32 32 + %tmp183 = insertelement <128 x float> %tmp182, float %tmp47, i32 33 + %tmp184 = insertelement <128 x float> %tmp183, float %tmp46, i32 34 + %tmp185 = insertelement <128 x float> %tmp184, float %tmp45, i32 35 + %tmp186 = insertelement <128 x float> %tmp185, float %tmp43, i32 36 + %tmp187 = insertelement <128 x float> %tmp186, float %tmp42, i32 37 + %tmp188 = 
insertelement <128 x float> %tmp187, float %tmp41, i32 38 + %tmp189 = insertelement <128 x float> %tmp188, float %tmp39, i32 39 + %tmp190 = insertelement <128 x float> %tmp189, float %tmp38, i32 40 + %tmp191 = insertelement <128 x float> %tmp190, float %tmp37, i32 41 + %tmp192 = insertelement <128 x float> %tmp191, float %tmp35, i32 42 + %tmp193 = insertelement <128 x float> %tmp192, float %tmp34, i32 43 + %tmp194 = insertelement <128 x float> %tmp193, float %tmp33, i32 44 + %tmp195 = insertelement <128 x float> %tmp194, float %tmp31, i32 45 + %tmp196 = insertelement <128 x float> %tmp195, float %tmp30, i32 46 + %tmp197 = insertelement <128 x float> %tmp196, float %tmp29, i32 47 + %tmp198 = insertelement <128 x float> %tmp197, float %tmp27, i32 48 + %tmp199 = insertelement <128 x float> %tmp198, float %tmp26, i32 49 + %tmp200 = insertelement <128 x float> %tmp199, float %tmp25, i32 50 + %tmp201 = insertelement <128 x float> %tmp200, float %tmp23, i32 51 + %tmp202 = insertelement <128 x float> %tmp201, float %tmp22, i32 52 + %tmp203 = insertelement <128 x float> %tmp202, float %tmp21, i32 53 + %tmp204 = insertelement <128 x float> %tmp203, float %tmp19, i32 54 + %tmp205 = insertelement <128 x float> %tmp204, float %tmp18, i32 55 + %tmp206 = insertelement <128 x float> %tmp205, float %tmp17, i32 56 + %tmp207 = insertelement <128 x float> %tmp206, float %tmp15, i32 57 + %tmp208 = insertelement <128 x float> %tmp207, float %tmp14, i32 58 + %tmp209 = insertelement <128 x float> %tmp208, float %tmp13, i32 59 + %tmp210 = insertelement <128 x float> %tmp209, float %tmp16, i32 60 + %tmp211 = insertelement <128 x float> %tmp210, float %tmp20, i32 61 + %tmp212 = insertelement <128 x float> %tmp211, float %tmp24, i32 62 + %tmp213 = insertelement <128 x float> %tmp212, float %tmp28, i32 63 + %tmp214 = insertelement <128 x float> %tmp213, float %tmp32, i32 64 + %tmp215 = insertelement <128 x float> %tmp214, float %tmp36, i32 65 + %tmp216 = insertelement <128 x float> %tmp215, 
float %tmp40, i32 66 + %tmp217 = insertelement <128 x float> %tmp216, float %tmp44, i32 67 + %tmp218 = insertelement <128 x float> %tmp217, float %tmp48, i32 68 + %tmp219 = insertelement <128 x float> %tmp218, float %tmp52, i32 69 + %tmp220 = insertelement <128 x float> %tmp219, float %tmp56, i32 70 + %tmp221 = insertelement <128 x float> %tmp220, float %tmp60, i32 71 + %tmp222 = insertelement <128 x float> %tmp221, float %tmp64, i32 72 + %tmp223 = insertelement <128 x float> %tmp222, float %tmp68, i32 73 + %tmp224 = insertelement <128 x float> %tmp223, float %tmp72, i32 74 + %tmp225 = insertelement <128 x float> %tmp224, float %tmp76, i32 75 + %tmp226 = insertelement <128 x float> %tmp225, float %tmp80, i32 76 + %tmp227 = insertelement <128 x float> %tmp226, float %tmp84, i32 77 + %tmp228 = insertelement <128 x float> %tmp227, float %tmp88, i32 78 + %tmp229 = insertelement <128 x float> %tmp228, float %tmp92, i32 79 + %tmp230 = insertelement <128 x float> %tmp229, float %tmp93, i32 80 + %tmp231 = insertelement <128 x float> %tmp230, float %tmp94, i32 81 + %tmp232 = insertelement <128 x float> %tmp231, float %tmp96, i32 82 + %tmp233 = insertelement <128 x float> %tmp232, float %tmp97, i32 83 + %tmp234 = insertelement <128 x float> %tmp233, float %tmp98, i32 84 + %tmp235 = insertelement <128 x float> %tmp234, float %tmp99, i32 85 + %tmp236 = insertelement <128 x float> %tmp235, float %tmp100, i32 86 + %tmp237 = insertelement <128 x float> %tmp236, float %tmp101, i32 87 + %tmp238 = insertelement <128 x float> %tmp237, float %tmp102, i32 88 + %tmp239 = insertelement <128 x float> %tmp238, float %tmp103, i32 89 + %tmp240 = insertelement <128 x float> %tmp239, float %tmp104, i32 90 + %tmp241 = insertelement <128 x float> %tmp240, float %tmp105, i32 91 + %tmp242 = insertelement <128 x float> %tmp241, float %tmp106, i32 92 + %tmp243 = insertelement <128 x float> %tmp242, float %tmp107, i32 93 + %tmp244 = insertelement <128 x float> %tmp243, float %tmp108, i32 94 + %tmp245 
= insertelement <128 x float> %tmp244, float %tmp109, i32 95 + %tmp246 = insertelement <128 x float> %tmp245, float %tmp110, i32 96 + %tmp247 = insertelement <128 x float> %tmp246, float %tmp111, i32 97 + %tmp248 = insertelement <128 x float> %tmp247, float %tmp112, i32 98 + %tmp249 = insertelement <128 x float> %tmp248, float %tmp113, i32 99 + %tmp250 = insertelement <128 x float> %tmp249, float %tmp114, i32 100 + %tmp251 = insertelement <128 x float> %tmp250, float %tmp115, i32 101 + %tmp252 = insertelement <128 x float> %tmp251, float %tmp116, i32 102 + %tmp253 = insertelement <128 x float> %tmp252, float %tmp117, i32 103 + %tmp254 = insertelement <128 x float> %tmp253, float %tmp118, i32 104 + %tmp255 = insertelement <128 x float> %tmp254, float %tmp119, i32 105 + %tmp256 = insertelement <128 x float> %tmp255, float %tmp120, i32 106 + %tmp257 = insertelement <128 x float> %tmp256, float %tmp121, i32 107 + %tmp258 = insertelement <128 x float> %tmp257, float %tmp122, i32 108 + %tmp259 = insertelement <128 x float> %tmp258, float %tmp123, i32 109 + %tmp260 = insertelement <128 x float> %tmp259, float %tmp124, i32 110 + %tmp261 = insertelement <128 x float> %tmp260, float %tmp125, i32 111 + %tmp262 = insertelement <128 x float> %tmp261, float %tmp126, i32 112 + %tmp263 = insertelement <128 x float> %tmp262, float %tmp127, i32 113 + %tmp264 = insertelement <128 x float> %tmp263, float %tmp128, i32 114 + %tmp265 = insertelement <128 x float> %tmp264, float %tmp129, i32 115 + %tmp266 = insertelement <128 x float> %tmp265, float %tmp130, i32 116 + %tmp267 = insertelement <128 x float> %tmp266, float %tmp131, i32 117 + %tmp268 = insertelement <128 x float> %tmp267, float %tmp132, i32 118 + %tmp269 = insertelement <128 x float> %tmp268, float %tmp133, i32 119 + %tmp270 = insertelement <128 x float> %tmp269, float %tmp134, i32 120 + %tmp271 = insertelement <128 x float> %tmp270, float %tmp135, i32 121 + %tmp272 = insertelement <128 x float> %tmp271, float %tmp136, i32 
122 + %tmp273 = insertelement <128 x float> %tmp272, float %tmp137, i32 123 + %tmp274 = insertelement <128 x float> %tmp273, float %tmp138, i32 124 + %tmp275 = insertelement <128 x float> %tmp274, float %tmp139, i32 125 + %tmp276 = insertelement <128 x float> %tmp275, float %tmp140, i32 126 + %tmp277 = insertelement <128 x float> %tmp276, float %tmp141, i32 127 + %tmp278 = insertelement <128 x float> %tmp277, float %tmp149, i32 %tmp146 + %tmp279 = extractelement <128 x float> %tmp278, i32 0 + %tmp280 = extractelement <128 x float> %tmp278, i32 1 + %tmp281 = extractelement <128 x float> %tmp278, i32 2 + %tmp282 = extractelement <128 x float> %tmp278, i32 3 + %tmp283 = extractelement <128 x float> %tmp278, i32 4 + %tmp284 = extractelement <128 x float> %tmp278, i32 5 + %tmp285 = extractelement <128 x float> %tmp278, i32 6 + %tmp286 = extractelement <128 x float> %tmp278, i32 7 + %tmp287 = extractelement <128 x float> %tmp278, i32 8 + %tmp288 = extractelement <128 x float> %tmp278, i32 9 + %tmp289 = extractelement <128 x float> %tmp278, i32 10 + %tmp290 = extractelement <128 x float> %tmp278, i32 11 + %tmp291 = extractelement <128 x float> %tmp278, i32 12 + %tmp292 = extractelement <128 x float> %tmp278, i32 13 + %tmp293 = extractelement <128 x float> %tmp278, i32 14 + %tmp294 = extractelement <128 x float> %tmp278, i32 15 + %tmp295 = extractelement <128 x float> %tmp278, i32 16 + %tmp296 = extractelement <128 x float> %tmp278, i32 17 + %tmp297 = extractelement <128 x float> %tmp278, i32 18 + %tmp298 = extractelement <128 x float> %tmp278, i32 19 + %tmp299 = extractelement <128 x float> %tmp278, i32 20 + %tmp300 = extractelement <128 x float> %tmp278, i32 21 + %tmp301 = extractelement <128 x float> %tmp278, i32 22 + %tmp302 = extractelement <128 x float> %tmp278, i32 23 + %tmp303 = extractelement <128 x float> %tmp278, i32 24 + %tmp304 = extractelement <128 x float> %tmp278, i32 25 + %tmp305 = extractelement <128 x float> %tmp278, i32 26 + %tmp306 = extractelement 
<128 x float> %tmp278, i32 27 + %tmp307 = extractelement <128 x float> %tmp278, i32 28 + %tmp308 = extractelement <128 x float> %tmp278, i32 29 + %tmp309 = extractelement <128 x float> %tmp278, i32 30 + %tmp310 = extractelement <128 x float> %tmp278, i32 31 + %tmp311 = extractelement <128 x float> %tmp278, i32 32 + %tmp312 = extractelement <128 x float> %tmp278, i32 33 + %tmp313 = extractelement <128 x float> %tmp278, i32 34 + %tmp314 = extractelement <128 x float> %tmp278, i32 35 + %tmp315 = extractelement <128 x float> %tmp278, i32 36 + %tmp316 = extractelement <128 x float> %tmp278, i32 37 + %tmp317 = extractelement <128 x float> %tmp278, i32 38 + %tmp318 = extractelement <128 x float> %tmp278, i32 39 + %tmp319 = extractelement <128 x float> %tmp278, i32 40 + %tmp320 = extractelement <128 x float> %tmp278, i32 41 + %tmp321 = extractelement <128 x float> %tmp278, i32 42 + %tmp322 = extractelement <128 x float> %tmp278, i32 43 + %tmp323 = extractelement <128 x float> %tmp278, i32 44 + %tmp324 = extractelement <128 x float> %tmp278, i32 45 + %tmp325 = extractelement <128 x float> %tmp278, i32 46 + %tmp326 = extractelement <128 x float> %tmp278, i32 47 + %tmp327 = extractelement <128 x float> %tmp278, i32 48 + %tmp328 = extractelement <128 x float> %tmp278, i32 49 + %tmp329 = extractelement <128 x float> %tmp278, i32 50 + %tmp330 = extractelement <128 x float> %tmp278, i32 51 + %tmp331 = extractelement <128 x float> %tmp278, i32 52 + %tmp332 = extractelement <128 x float> %tmp278, i32 53 + %tmp333 = extractelement <128 x float> %tmp278, i32 54 + %tmp334 = extractelement <128 x float> %tmp278, i32 55 + %tmp335 = extractelement <128 x float> %tmp278, i32 56 + %tmp336 = extractelement <128 x float> %tmp278, i32 57 + %tmp337 = extractelement <128 x float> %tmp278, i32 58 + %tmp338 = extractelement <128 x float> %tmp278, i32 59 + %tmp339 = extractelement <128 x float> %tmp278, i32 60 + %tmp340 = extractelement <128 x float> %tmp278, i32 61 + %tmp341 = extractelement <128 
x float> %tmp278, i32 62 + %tmp342 = extractelement <128 x float> %tmp278, i32 63 + %tmp343 = extractelement <128 x float> %tmp278, i32 64 + %tmp344 = extractelement <128 x float> %tmp278, i32 65 + %tmp345 = extractelement <128 x float> %tmp278, i32 66 + %tmp346 = extractelement <128 x float> %tmp278, i32 67 + %tmp347 = extractelement <128 x float> %tmp278, i32 68 + %tmp348 = extractelement <128 x float> %tmp278, i32 69 + %tmp349 = extractelement <128 x float> %tmp278, i32 70 + %tmp350 = extractelement <128 x float> %tmp278, i32 71 + %tmp351 = extractelement <128 x float> %tmp278, i32 72 + %tmp352 = extractelement <128 x float> %tmp278, i32 73 + %tmp353 = extractelement <128 x float> %tmp278, i32 74 + %tmp354 = extractelement <128 x float> %tmp278, i32 75 + %tmp355 = extractelement <128 x float> %tmp278, i32 76 + %tmp356 = extractelement <128 x float> %tmp278, i32 77 + %tmp357 = extractelement <128 x float> %tmp278, i32 78 + %tmp358 = extractelement <128 x float> %tmp278, i32 79 + %tmp359 = extractelement <128 x float> %tmp278, i32 80 + %tmp360 = extractelement <128 x float> %tmp278, i32 81 + %tmp361 = extractelement <128 x float> %tmp278, i32 82 + %tmp362 = extractelement <128 x float> %tmp278, i32 83 + %tmp363 = extractelement <128 x float> %tmp278, i32 84 + %tmp364 = extractelement <128 x float> %tmp278, i32 85 + %tmp365 = extractelement <128 x float> %tmp278, i32 86 + %tmp366 = extractelement <128 x float> %tmp278, i32 87 + %tmp367 = extractelement <128 x float> %tmp278, i32 88 + %tmp368 = extractelement <128 x float> %tmp278, i32 89 + %tmp369 = extractelement <128 x float> %tmp278, i32 90 + %tmp370 = extractelement <128 x float> %tmp278, i32 91 + %tmp371 = extractelement <128 x float> %tmp278, i32 92 + %tmp372 = extractelement <128 x float> %tmp278, i32 93 + %tmp373 = extractelement <128 x float> %tmp278, i32 94 + %tmp374 = extractelement <128 x float> %tmp278, i32 95 + %tmp375 = extractelement <128 x float> %tmp278, i32 96 + %tmp376 = extractelement <128 x 
float> %tmp278, i32 97 + %tmp377 = extractelement <128 x float> %tmp278, i32 98 + %tmp378 = extractelement <128 x float> %tmp278, i32 99 + %tmp379 = extractelement <128 x float> %tmp278, i32 100 + %tmp380 = extractelement <128 x float> %tmp278, i32 101 + %tmp381 = extractelement <128 x float> %tmp278, i32 102 + %tmp382 = extractelement <128 x float> %tmp278, i32 103 + %tmp383 = extractelement <128 x float> %tmp278, i32 104 + %tmp384 = extractelement <128 x float> %tmp278, i32 105 + %tmp385 = extractelement <128 x float> %tmp278, i32 106 + %tmp386 = extractelement <128 x float> %tmp278, i32 107 + %tmp387 = extractelement <128 x float> %tmp278, i32 108 + %tmp388 = extractelement <128 x float> %tmp278, i32 109 + %tmp389 = extractelement <128 x float> %tmp278, i32 110 + %tmp390 = extractelement <128 x float> %tmp278, i32 111 + %tmp391 = extractelement <128 x float> %tmp278, i32 112 + %tmp392 = extractelement <128 x float> %tmp278, i32 113 + %tmp393 = extractelement <128 x float> %tmp278, i32 114 + %tmp394 = extractelement <128 x float> %tmp278, i32 115 + %tmp395 = extractelement <128 x float> %tmp278, i32 116 + %tmp396 = extractelement <128 x float> %tmp278, i32 117 + %tmp397 = extractelement <128 x float> %tmp278, i32 118 + %tmp398 = extractelement <128 x float> %tmp278, i32 119 + %tmp399 = extractelement <128 x float> %tmp278, i32 120 + %tmp400 = extractelement <128 x float> %tmp278, i32 121 + %tmp401 = extractelement <128 x float> %tmp278, i32 122 + %tmp402 = extractelement <128 x float> %tmp278, i32 123 + %tmp403 = extractelement <128 x float> %tmp278, i32 124 + %tmp404 = extractelement <128 x float> %tmp278, i32 125 + %tmp405 = extractelement <128 x float> %tmp278, i32 126 + %tmp406 = extractelement <128 x float> %tmp278, i32 127 + %tmp407 = bitcast float %tmp95 to i32 + %tmp408 = add i32 %tmp407, 1 + %tmp409 = bitcast i32 %tmp408 to float + br label %bb12 +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git 
a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll index 5ce65371b01c..ef492cef5767 100644 --- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -1,7 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling < %s | FileCheck %s - -; FIXME: Enable -verify-instructions +; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; This ends up using all 255 registers and requires register ; scavenging which will fail to find an unsued register. @@ -11,9 +9,19 @@ ; FIXME: The same register is initialized to 0 for every spill. -; CHECK-LABEL: {{^}}main: -; CHECK: NumVgprs: 256 -; CHECK: ScratchSize: 1024 +; GCN-LABEL: {{^}}main: + +; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s11, 0x80f000 +; VI-NEXT: s_mov_b32 s11, 0x800000 + +; s12 is offset user SGPR +; GCN: buffer_store_dword {{v[0-9]+}}, s[8:11], s12 offset:{{[0-9]+}} ; 4-byte Folded Spill + +; GCN: NumVgprs: 256 +; GCN: ScratchSize: 1024 define void @main([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <16 x i8>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 { bb: From d4a0a430cc9eca9157aed5751616515db30b8b39 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 30 Nov 2015 21:15:57 +0000 Subject: [PATCH 020/186] AMDGPU: Rename enums to be consistent with HSA code object terminology git-svn-id: 
https://llvm.org/svn/llvm-project/llvm/trunk@254330 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 4 ++-- lib/Target/AMDGPU/SIISelLowering.cpp | 26 +++++++++------------- lib/Target/AMDGPU/SIInstrInfo.cpp | 19 +++++++++------- lib/Target/AMDGPU/SIRegisterInfo.cpp | 28 ++++++++++++------------ lib/Target/AMDGPU/SIRegisterInfo.h | 22 ++++++++++--------- 5 files changed, 49 insertions(+), 50 deletions(-) diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 85a06882ffe3..aa3909060e52 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1066,8 +1066,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, static_cast(Subtarget->getRegisterInfo()); const SIMachineFunctionInfo *Info = MF.getInfo(); - unsigned ScratchOffsetReg - = TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); + unsigned ScratchOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); SOffset = CurDAG->getRegister(ScratchOffsetReg, MVT::i32); diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 51cbc95bc07c..de52b483bbbc 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -514,7 +514,7 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, MachineFunction &MF = DAG.getMachineFunction(); const SIRegisterInfo *TRI = static_cast(Subtarget->getRegisterInfo()); - unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); + unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); Type *Ty = VT.getTypeForEVT(*DAG.getContext()); @@ -628,7 +628,7 @@ SDValue SITargetLowering::LowerFormalArguments( Info->NumUserSGPRs += 4; unsigned InputPtrReg = - TRI->getPreloadedValue(MF, 
SIRegisterInfo::INPUT_PTR); + TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); unsigned InputPtrRegLo = TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0); unsigned InputPtrRegHi = @@ -641,14 +641,8 @@ SDValue SITargetLowering::LowerFormalArguments( const SIMachineFunctionInfo *MFI = MF.getInfo(); if (MFI->hasDispatchPtr()) { - unsigned DispatchPtrReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR); - unsigned DispatchPtrRegLo = - TRI->getPhysRegSubReg(DispatchPtrReg, &AMDGPU::SReg_32RegClass, 0); - unsigned DispatchPtrRegHi = - TRI->getPhysRegSubReg(DispatchPtrReg, &AMDGPU::SReg_32RegClass, 1); - CCInfo.AllocateReg(DispatchPtrRegLo); - CCInfo.AllocateReg(DispatchPtrRegHi); + unsigned DispatchPtrReg + = TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR); MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass); } } @@ -1110,22 +1104,22 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, getImplicitParameterOffset(MFI, GRID_DIM)); case Intrinsic::r600_read_tgid_x: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT); case Intrinsic::r600_read_tgid_y: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Y), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT); case Intrinsic::r600_read_tgid_z: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Z), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT); case Intrinsic::r600_read_tidig_x: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT); case Intrinsic::r600_read_tidig_y: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - 
TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT); case Intrinsic::r600_read_tidig_z: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT); case AMDGPUIntrinsic::SI_load_const: { SDValue Ops[] = { Op.getOperand(1), diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index b7d2a4712759..5350edbd74e2 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -551,8 +551,8 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); - unsigned ScratchOffsetPreloadReg - = RI.getPreloadedValue(*MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); + unsigned ScratchOffsetPreloadReg = RI.getPreloadedValue( + *MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize()); MFI->setHasSpilledVGPRs(); @@ -638,8 +638,8 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); - unsigned ScratchOffsetPreloadReg - = RI.getPreloadedValue(*MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); + unsigned ScratchOffsetPreloadReg = RI.getPreloadedValue( + *MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize()); BuildMI(MBB, MI, DL, get(Opcode), DestReg) @@ -678,11 +678,14 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, if (MFI->getShaderType() == ShaderType::COMPUTE && WorkGroupSize > WavefrontSize) { - unsigned TIDIGXReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_X); - unsigned TIDIGYReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Y); - unsigned TIDIGZReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Z); + unsigned TIDIGXReg + = TRI->getPreloadedValue(*MF, 
SIRegisterInfo::WORKGROUP_ID_X); + unsigned TIDIGYReg + = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y); + unsigned TIDIGZReg + = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z); unsigned InputPtrReg = - TRI->getPreloadedValue(*MF, SIRegisterInfo::INPUT_PTR); + TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { if (!Entry.isLiveIn(Reg)) Entry.addLiveIn(Reg); diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index b392c86fa2e1..68629cd50995 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -72,7 +72,7 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { unsigned ScratchRSrcReg = MFI->getScratchRSrcReg(); if (ScratchRSrcReg != AMDGPU::NoRegister) { unsigned ScratchOffsetPreloadReg - = getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); + = getPreloadedValue(MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); // We will need to use this user SGPR argument for spilling, and thus never // want it to be spilled. 
reserveRegisterTuples(Reserved, ScratchOffsetPreloadReg); @@ -532,30 +532,30 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, const AMDGPUSubtarget &ST = MF.getSubtarget(); const SIMachineFunctionInfo *MFI = MF.getInfo(); switch (Value) { - case SIRegisterInfo::TGID_X: + case SIRegisterInfo::WORKGROUP_ID_X: return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0); - case SIRegisterInfo::TGID_Y: + case SIRegisterInfo::WORKGROUP_ID_Y: return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1); - case SIRegisterInfo::TGID_Z: + case SIRegisterInfo::WORKGROUP_ID_Z: return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2); - case SIRegisterInfo::SCRATCH_WAVE_OFFSET: + case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET: if (MFI->getShaderType() != ShaderType::COMPUTE) return MFI->ScratchOffsetReg; return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4); - case SIRegisterInfo::SCRATCH_PTR: - return AMDGPU::SGPR2_SGPR3; - case SIRegisterInfo::INPUT_PTR: - if (ST.isAmdHsaOS()) - return MFI->hasDispatchPtr() ? AMDGPU::SGPR2_SGPR3 : AMDGPU::SGPR0_SGPR1; - return AMDGPU::SGPR0_SGPR1; + case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER: + llvm_unreachable("currently unused"); + case SIRegisterInfo::KERNARG_SEGMENT_PTR: + return ST.isAmdHsaOS() ? 
AMDGPU::SGPR2_SGPR3 : AMDGPU::SGPR0_SGPR1; case SIRegisterInfo::DISPATCH_PTR: assert(MFI->hasDispatchPtr()); return AMDGPU::SGPR0_SGPR1; - case SIRegisterInfo::TIDIG_X: + case SIRegisterInfo::QUEUE_PTR: + llvm_unreachable("not implemented"); + case SIRegisterInfo::WORKITEM_ID_X: return AMDGPU::VGPR0; - case SIRegisterInfo::TIDIG_Y: + case SIRegisterInfo::WORKITEM_ID_Y: return AMDGPU::VGPR1; - case SIRegisterInfo::TIDIG_Z: + case SIRegisterInfo::WORKITEM_ID_Z: return AMDGPU::VGPR2; } llvm_unreachable("unexpected preloaded value type"); diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h index 36f6d1c7a261..bb93242b0f88 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -93,23 +93,25 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { /// \returns True if operands defined with this operand type can accept /// an inline constant. i.e. An integer value in the range (-16, 64) or - /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. + /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. bool opCanUseInlineConstant(unsigned OpType) const; enum PreloadedValue { // SGPRS: - SCRATCH_PTR = 0, + PRIVATE_SEGMENT_BUFFER = 0, DISPATCH_PTR = 1, - INPUT_PTR = 3, - TGID_X = 10, - TGID_Y = 11, - TGID_Z = 12, - SCRATCH_WAVE_OFFSET = 14, + QUEUE_PTR = 2, + KERNARG_SEGMENT_PTR = 3, + WORKGROUP_ID_X = 10, + WORKGROUP_ID_Y = 11, + WORKGROUP_ID_Z = 12, + PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14, + // VGPRS: FIRST_VGPR_VALUE = 15, - TIDIG_X = FIRST_VGPR_VALUE, - TIDIG_Y = 16, - TIDIG_Z = 17, + WORKITEM_ID_X = FIRST_VGPR_VALUE, + WORKITEM_ID_Y = 16, + WORKITEM_ID_Z = 17 }; /// \brief Returns the physical register that \p Value is stored in. 
From 0f1b95f8188f2c2ce4a93394ca185c707463c75e Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 30 Nov 2015 21:16:03 +0000 Subject: [PATCH 021/186] AMDGPU: Rework how private buffer passed for HSA If we know we have stack objects, we reserve the registers that the private buffer resource and wave offset are passed and use them directly. If not, reserve the last 5 SGPRs just in case we need to spill. After register allocation, try to pick the next available registers instead of the last SGPRs, and then insert copies from the inputs to the reserved registers in the prologue. This also only selectively enables all of the input registers which are really required instead of always enabling them. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254331 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 66 +++- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 6 +- lib/Target/AMDGPU/SIFrameLowering.cpp | 168 +++++++++- lib/Target/AMDGPU/SIISelLowering.cpp | 160 ++++++++-- lib/Target/AMDGPU/SIInstrInfo.cpp | 14 +- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 80 ++++- lib/Target/AMDGPU/SIMachineFunctionInfo.h | 113 ++++++- lib/Target/AMDGPU/SIRegisterInfo.cpp | 80 +++-- lib/Target/AMDGPU/SIRegisterInfo.h | 9 + test/CodeGen/AMDGPU/hsa.ll | 4 +- test/CodeGen/AMDGPU/large-alloca-compute.ll | 43 ++- test/CodeGen/AMDGPU/large-alloca-graphics.ll | 8 +- .../AMDGPU/llvm.AMDGPU.read.workdim.ll | 37 +++ .../AMDGPU/llvm.amdgcn.dispatch.ptr.ll | 2 +- test/CodeGen/AMDGPU/llvm.dbg.value.ll | 4 +- .../AMDGPU/llvm.r600.read.local.size.ll | 184 +++++++++++ .../AMDGPU/local-memory-two-objects.ll | 2 +- test/CodeGen/AMDGPU/local-memory.ll | 4 +- ...vgpr-spill-emergency-stack-slot-compute.ll | 8 +- .../AMDGPU/vgpr-spill-emergency-stack-slot.ll | 12 +- test/CodeGen/AMDGPU/work-item-intrinsics.ll | 293 ++++++++---------- 21 files changed, 1000 insertions(+), 297 deletions(-) create mode 100644 test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll create mode
100644 test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 314ef721c1fc..b96c48f0561d 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -452,18 +452,27 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, S_00B848_FLOAT_MODE(ProgInfo.FloatMode) | S_00B848_PRIV(ProgInfo.Priv) | S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) | - S_00B848_IEEE_MODE(ProgInfo.DebugMode) | + S_00B848_DEBUG_MODE(ProgInfo.DebugMode) | S_00B848_IEEE_MODE(ProgInfo.IEEEMode); + // 0 = X, 1 = XY, 2 = XYZ + unsigned TIDIGCompCnt = 0; + if (MFI->hasWorkItemIDZ()) + TIDIGCompCnt = 2; + else if (MFI->hasWorkItemIDY()) + TIDIGCompCnt = 1; + ProgInfo.ComputePGMRSrc2 = S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) | - S_00B84C_USER_SGPR(MFI->NumUserSGPRs) | - S_00B84C_TGID_X_EN(1) | - S_00B84C_TGID_Y_EN(1) | - S_00B84C_TGID_Z_EN(1) | - S_00B84C_TG_SIZE_EN(1) | - S_00B84C_TIDIG_COMP_CNT(2) | - S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks); + S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) | + S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) | + S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) | + S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) | + S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) | + S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) | + S_00B84C_EXCP_EN_MSB(0) | + S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks) | + S_00B84C_EXCP_EN(0); } static unsigned getRsrcReg(unsigned ShaderType) { @@ -524,9 +533,44 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, header.compute_pgm_resource_registers = KernelInfo.ComputePGMRSrc1 | (KernelInfo.ComputePGMRSrc2 << 32); - header.code_properties = - AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR | - AMD_CODE_PROPERTY_IS_PTR64; + header.code_properties = AMD_CODE_PROPERTY_IS_PTR64; + + if (MFI->hasPrivateSegmentBuffer()) { + header.code_properties |= + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; + } + + if 
(MFI->hasDispatchPtr()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; + + if (MFI->hasQueuePtr()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; + + if (MFI->hasKernargSegmentPtr()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; + + if (MFI->hasDispatchID()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; + + if (MFI->hasFlatScratchInit()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; + + // TODO: Private segment size + + if (MFI->hasGridWorkgroupCountX()) { + header.code_properties |= + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X; + } + + if (MFI->hasGridWorkgroupCountY()) { + header.code_properties |= + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y; + } + + if (MFI->hasGridWorkgroupCountZ()) { + header.code_properties |= + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z; + } if (MFI->hasDispatchPtr()) header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index aa3909060e52..6aa4fddd3ec4 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1062,14 +1062,10 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, SDLoc DL(Addr); MachineFunction &MF = CurDAG->getMachineFunction(); - const SIRegisterInfo *TRI = - static_cast(Subtarget->getRegisterInfo()); const SIMachineFunctionInfo *Info = MF.getInfo(); - unsigned ScratchOffsetReg = TRI->getPreloadedValue( - MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); - SOffset = CurDAG->getRegister(ScratchOffsetReg, MVT::i32); + SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32); // (add n0, c1) if (CurDAG->isBaseWithConstantOffset(Addr)) { diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp 
b/lib/Target/AMDGPU/SIFrameLowering.cpp index 6aff4b5700d4..6b3c81c3af74 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -36,6 +36,16 @@ static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo, return true; } +static ArrayRef getAllSGPR128() { + return makeArrayRef(AMDGPU::SReg_128RegClass.begin(), + AMDGPU::SReg_128RegClass.getNumRegs()); +} + +static ArrayRef getAllSGPRs() { + return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), + AMDGPU::SGPR_32RegClass.getNumRegs()); +} + void SIFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { if (!MF.getFrameInfo()->hasStackObjects()) @@ -43,7 +53,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); - const SIMachineFunctionInfo *MFI = MF.getInfo(); + SIMachineFunctionInfo *MFI = MF.getInfo(); // If we only have SGPR spills, we won't actually be using scratch memory // since these spill to VGPRs. @@ -56,31 +66,159 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, const SIInstrInfo *TII = static_cast(MF.getSubtarget().getInstrInfo()); const SIRegisterInfo *TRI = &TII->getRegisterInfo(); + const AMDGPUSubtarget &ST = MF.getSubtarget(); // We need to insert initialization of the scratch resource descriptor. 
unsigned ScratchRsrcReg = MFI->getScratchRSrcReg(); assert(ScratchRsrcReg != AMDGPU::NoRegister); - uint64_t Rsrc23 = TII->getScratchRsrcWords23(); + unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); + assert(ScratchWaveOffsetReg != AMDGPU::NoRegister); + + unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + + unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; + if (ST.isAmdHsaOS()) { + PreloadedPrivateBufferReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + } + + // If we reserved the original input registers, we don't need to copy to the + // reserved registers. + if (ScratchRsrcReg == PreloadedPrivateBufferReg) { + // We should always reserve these 5 registers at the same time. + assert(ScratchWaveOffsetReg == PreloadedScratchWaveOffsetReg && + "scratch wave offset and private segment buffer inconsistent"); + return; + } + + + // We added live-ins during argument lowering, but since they were not used + // they were deleted. We're adding the uses now, so add them back. + MachineRegisterInfo &MRI = MF.getRegInfo(); + MRI.addLiveIn(PreloadedScratchWaveOffsetReg); + MBB.addLiveIn(PreloadedScratchWaveOffsetReg); + + if (ST.isAmdHsaOS()) { + MRI.addLiveIn(PreloadedPrivateBufferReg); + MBB.addLiveIn(PreloadedPrivateBufferReg); + } + + // We reserved the last registers for this. Shift it down to the end of those + // which were actually used. + // + // FIXME: It might be safer to use a pseudoregister before replacement. + + // FIXME: We should be able to eliminate unused input registers. We only + // cannot do this for the resources required for scratch access. For now we + // skip over user SGPRs and may leave unused holes. + + // We find the resource first because it has an alignment requirement. 
+ if (ScratchRsrcReg == TRI->reservedPrivateSegmentBufferReg(MF)) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + + unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4; + // Skip the last 2 elements because the last one is reserved for VCC, and + // this is the 2nd to last element already. + for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) { + // Pick the first unallocated one. Make sure we don't clobber the other + // reserved input we needed. + if (!MRI.isPhysRegUsed(Reg)) { + assert(MRI.isAllocatable(Reg)); + MRI.replaceRegWith(ScratchRsrcReg, Reg); + ScratchRsrcReg = Reg; + MFI->setScratchRSrcReg(ScratchRsrcReg); + break; + } + } + } + + if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + // Skip the last 6 elements; presumably these cover the registers reserved + // at the end of the SGPR file (NOTE(review): confirm what the 6 covers). + unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); + for (MCPhysReg Reg : getAllSGPRs().drop_back(6).slice(NumPreloaded)) { + // Pick the first unallocated SGPR. Be careful not to pick an alias of the + // scratch descriptor, since we haven't added its uses yet. 
+ if (!MRI.isPhysRegUsed(Reg)) { + assert(MRI.isAllocatable(Reg) && + !TRI->isSubRegisterEq(ScratchRsrcReg, Reg)); + + MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); + ScratchWaveOffsetReg = Reg; + MFI->setScratchWaveOffsetReg(ScratchWaveOffsetReg); + break; + } + } + } + + + assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg)); + + const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); MachineBasicBlock::iterator I = MBB.begin(); DebugLoc DL; - unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); - unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); - unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); - unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0) - .addExternalSymbol("SCRATCH_RSRC_DWORD0"); + if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) { + // Make sure we emit the copy for the offset first. We may have chosen to copy + // the buffer resource into a register that aliases the input offset register. + BuildMI(MBB, I, DL, SMovB32, ScratchWaveOffsetReg) + .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill); + } - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1) - .addExternalSymbol("SCRATCH_RSRC_DWORD1"); + if (ST.isAmdHsaOS()) { + // Insert copies from argument register. 
+ assert( + !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchRsrcReg) && + !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchWaveOffsetReg)); + + unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); + unsigned Rsrc23 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3); + + unsigned Lo = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub0_sub1); + unsigned Hi = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub2_sub3); + + const MCInstrDesc &SMovB64 = TII->get(AMDGPU::S_MOV_B64); + + BuildMI(MBB, I, DL, SMovB64, Rsrc01) + .addReg(Lo, RegState::Kill); + BuildMI(MBB, I, DL, SMovB64, Rsrc23) + .addReg(Hi, RegState::Kill); + } else { + unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); + unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); + unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); + unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); + + // Use relocations to get the pointer, and setup the other bits manually. + uint64_t Rsrc23 = TII->getScratchRsrcWords23(); + BuildMI(MBB, I, DL, SMovB32, Rsrc0) + .addExternalSymbol("SCRATCH_RSRC_DWORD0") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Rsrc1) + .addExternalSymbol("SCRATCH_RSRC_DWORD1") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Rsrc2) + .addImm(Rsrc23 & 0xffffffff) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Rsrc3) + .addImm(Rsrc23 >> 32) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + } - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2) - .addImm(Rsrc23 & 0xffffffff); + // Make the register selected live throughout the function. 
+ for (MachineBasicBlock &OtherBB : MF) { + if (&OtherBB == &MBB) + continue; - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3) - .addImm(Rsrc23 >> 32); + OtherBB.addLiveIn(ScratchRsrcReg); + OtherBB.addLiveIn(ScratchWaveOffsetReg); + } } void SIFrameLowering::processFunctionBeforeFrameFinalized( diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index de52b483bbbc..05b54d0bae51 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -542,6 +542,11 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, Align); // Alignment } +static ArrayRef getAllSGPRs() { + return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), + AMDGPU::SGPR_32RegClass.getNumRegs()); +} + SDValue SITargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, SDLoc DL, SelectionDAG &DAG, @@ -619,37 +624,28 @@ SDValue SITargetLowering::LowerFormalArguments( CCInfo.AllocateReg(AMDGPU::VGPR1); } - // The pointer to the list of arguments is stored in SGPR0, SGPR1 - // The pointer to the scratch buffer is stored in SGPR2, SGPR3 if (Info->getShaderType() == ShaderType::COMPUTE) { - if (Subtarget->isAmdHsaOS()) - Info->NumUserSGPRs += 4; // FIXME: Need to support scratch buffers. - else - Info->NumUserSGPRs += 4; - - unsigned InputPtrReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); - unsigned InputPtrRegLo = - TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0); - unsigned InputPtrRegHi = - TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1); - - CCInfo.AllocateReg(InputPtrRegLo); - CCInfo.AllocateReg(InputPtrRegHi); - MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); + getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, + Splits); + } - const SIMachineFunctionInfo *MFI = MF.getInfo(); + // FIXME: How should these inputs interact with inreg / custom SGPR inputs? 
+ if (Info->hasPrivateSegmentBuffer()) { + unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI); + MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass); + CCInfo.AllocateReg(PrivateSegmentBufferReg); + } - if (MFI->hasDispatchPtr()) { - unsigned DispatchPtrReg - = TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR); - MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass); - } + if (Info->hasDispatchPtr()) { + unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI); + MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(DispatchPtrReg); } - if (Info->getShaderType() == ShaderType::COMPUTE) { - getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, - Splits); + if (Info->hasKernargSegmentPtr()) { + unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI); + MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(InputPtrReg); } AnalyzeFormalArguments(CCInfo, Splits); @@ -739,14 +735,114 @@ SDValue SITargetLowering::LowerFormalArguments( InVals.push_back(Val); } - if (Info->getShaderType() != ShaderType::COMPUTE) { - unsigned ScratchIdx = CCInfo.getFirstUnallocated(makeArrayRef( - AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs())); - Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx); + // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read + // these from the dispatch pointer. + + // Start adding system SGPRs. 
+ if (Info->hasWorkGroupIDX()) { + unsigned Reg = Info->addWorkGroupIDX(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } else + llvm_unreachable("work group id x is always enabled"); + + if (Info->hasWorkGroupIDY()) { + unsigned Reg = Info->addWorkGroupIDY(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasWorkGroupIDZ()) { + unsigned Reg = Info->addWorkGroupIDZ(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); } - if (MF.getFrameInfo()->hasStackObjects() || ST.isVGPRSpillingEnabled(Info)) - Info->setScratchRSrcReg(TRI); + if (Info->hasWorkGroupInfo()) { + unsigned Reg = Info->addWorkGroupInfo(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasPrivateSegmentWaveByteOffset()) { + // Scratch wave offset passed in system SGPR. + unsigned PrivateSegmentWaveByteOffsetReg + = Info->addPrivateSegmentWaveByteOffset(); + + MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass); + CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); + } + + // Now that we've figured out where the scratch register inputs are, see if we + // should reserve the arguments and use them directly. + + bool HasStackObjects = MF.getFrameInfo()->hasStackObjects(); + + if (ST.isAmdHsaOS()) { + // TODO: Assume we will spill without optimizations. + if (HasStackObjects) { + // If we have stack objects, we unquestionably need the private buffer + // resource. For the HSA ABI, this will be the first 4 user SGPR + // inputs. We can reserve those and use them directly. 
+ + unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + Info->setScratchRSrcReg(PrivateSegmentBufferReg); + + unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg); + } else { + unsigned ReservedBufferReg + = TRI->reservedPrivateSegmentBufferReg(MF); + unsigned ReservedOffsetReg + = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); + + // We tentatively reserve the last registers (skipping the last two + // which may contain VCC). After register allocation, we'll replace + // these with the ones immediately after those which were really + // allocated. In the prologue copies will be inserted from the argument + // to these reserved registers. + Info->setScratchRSrcReg(ReservedBufferReg); + Info->setScratchWaveOffsetReg(ReservedOffsetReg); + } + } else { + unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF); + + // Without HSA, relocations are used for the scratch pointer and the + // buffer resource setup is always inserted in the prologue. Scratch wave + // offset is still in an input SGPR. 
+ Info->setScratchRSrcReg(ReservedBufferReg); + + if (HasStackObjects) { + unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg); + } else { + unsigned ReservedOffsetReg + = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); + Info->setScratchWaveOffsetReg(ReservedOffsetReg); + } + } + + if (Info->hasWorkItemIDX()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } else + llvm_unreachable("workitem id x should always be enabled"); + + if (Info->hasWorkItemIDY()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasWorkItemIDZ()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } if (Chains.empty()) return Chain; diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 5350edbd74e2..e1668649139d 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -551,16 +551,13 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); - unsigned ScratchOffsetPreloadReg = RI.getPreloadedValue( - *MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); - unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize()); MFI->setHasSpilledVGPRs(); BuildMI(MBB, MI, DL, get(Opcode)) .addReg(SrcReg) // src .addFrameIndex(FrameIndex) // frame_idx - .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc - .addReg(ScratchOffsetPreloadReg) // scratch_offset + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset .addMemOperand(MMO); } @@ -638,14 +635,11 @@ void 
SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); - unsigned ScratchOffsetPreloadReg = RI.getPreloadedValue( - *MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); - unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize()); BuildMI(MBB, MI, DL, get(Opcode), DestReg) .addFrameIndex(FrameIndex) // frame_idx - .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc - .addReg(ScratchOffsetPreloadReg) // scratch_offset + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset .addMemOperand(MMO); } diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index d042844aa138..935aad427198 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -30,15 +30,33 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), TIDReg(AMDGPU::NoRegister), ScratchRSrcReg(AMDGPU::NoRegister), + ScratchWaveOffsetReg(AMDGPU::NoRegister), + PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister), + DispatchPtrUserSGPR(AMDGPU::NoRegister), + QueuePtrUserSGPR(AMDGPU::NoRegister), + KernargSegmentPtrUserSGPR(AMDGPU::NoRegister), + DispatchIDUserSGPR(AMDGPU::NoRegister), + FlatScratchInitUserSGPR(AMDGPU::NoRegister), + PrivateSegmentSizeUserSGPR(AMDGPU::NoRegister), + GridWorkGroupCountXUserSGPR(AMDGPU::NoRegister), + GridWorkGroupCountYUserSGPR(AMDGPU::NoRegister), + GridWorkGroupCountZUserSGPR(AMDGPU::NoRegister), + WorkGroupIDXSystemSGPR(AMDGPU::NoRegister), + WorkGroupIDYSystemSGPR(AMDGPU::NoRegister), + WorkGroupIDZSystemSGPR(AMDGPU::NoRegister), + WorkGroupInfoSystemSGPR(AMDGPU::NoRegister), + PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister), LDSWaveSpillSize(0), PSInputAddr(0), NumUserSGPRs(0), + NumSystemSGPRs(0), HasSpilledSGPRs(false), HasSpilledVGPRs(false), + PrivateSegmentBuffer(false), DispatchPtr(false), 
QueuePtr(false), DispatchID(false), - KernargSegmentPtr(true), + KernargSegmentPtr(false), FlatScratchInit(false), GridWorkgroupCountX(false), GridWorkgroupCountY(false), @@ -47,13 +65,17 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) WorkGroupIDY(false), WorkGroupIDZ(false), WorkGroupInfo(false), + PrivateSegmentWaveByteOffset(false), WorkItemIDX(true), WorkItemIDY(false), WorkItemIDZ(false) { + const AMDGPUSubtarget &ST = MF.getSubtarget(); const Function *F = MF.getFunction(); - if (F->hasFnAttribute("amdgpu-dispatch-ptr")) - DispatchPtr = true; + const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + + if (getShaderType() == ShaderType::COMPUTE) + KernargSegmentPtr = true; if (F->hasFnAttribute("amdgpu-work-group-id-y")) WorkGroupIDY = true; @@ -66,14 +88,54 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) if (F->hasFnAttribute("amdgpu-work-item-id-z")) WorkItemIDZ = true; + + bool MaySpill = ST.isVGPRSpillingEnabled(this); + bool HasStackObjects = FrameInfo->hasStackObjects(); + + if (HasStackObjects || MaySpill) + PrivateSegmentWaveByteOffset = true; + + if (ST.isAmdHsaOS()) { + if (HasStackObjects || MaySpill) + PrivateSegmentBuffer = true; + + if (F->hasFnAttribute("amdgpu-dispatch-ptr")) + DispatchPtr = true; + } + + // X, XY, and XYZ are the only supported combinations, so make sure Y is + // enabled if Z is. 
+ if (WorkItemIDZ) + WorkItemIDY = true; +} + +unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( + const SIRegisterInfo &TRI) { + PrivateSegmentBufferUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass); + NumUserSGPRs += 4; + return PrivateSegmentBufferUserSGPR; +} + +unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) { + DispatchPtrUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return DispatchPtrUserSGPR; +} + +unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) { + QueuePtrUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return QueuePtrUserSGPR; } -void SIMachineFunctionInfo::setScratchRSrcReg(const SIRegisterInfo *TRI) { - // We need to round up to next multiple of 4. - unsigned NextSReg128 = RoundUpToAlignment(NumUserSGPRs + 5, 4); - unsigned RegSub0 = AMDGPU::SReg_32RegClass.getRegister(NextSReg128); - ScratchRSrcReg = TRI->getMatchingSuperReg(RegSub0, AMDGPU::sub0, - &AMDGPU::SReg_128RegClass); +unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) { + KernargSegmentPtrUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return KernargSegmentPtrUserSGPR; } SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 377c5ce94846..9c528d63bd0e 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -26,10 +26,36 @@ class MachineRegisterInfo; /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which /// tells the hardware which interpolation parameters to load. 
class SIMachineFunctionInfo : public AMDGPUMachineFunction { + // FIXME: This should be removed and getPreloadedValue moved here. + friend struct SIRegisterInfo; void anchor() override; unsigned TIDReg; + + // Registers that may be reserved for spilling purposes. These may be the same + // as the input registers. unsigned ScratchRSrcReg; + unsigned ScratchWaveOffsetReg; + + // Input registers setup for the HSA ABI. + // User SGPRs in allocation order. + unsigned PrivateSegmentBufferUserSGPR; + unsigned DispatchPtrUserSGPR; + unsigned QueuePtrUserSGPR; + unsigned KernargSegmentPtrUserSGPR; + unsigned DispatchIDUserSGPR; + unsigned FlatScratchInitUserSGPR; + unsigned PrivateSegmentSizeUserSGPR; + unsigned GridWorkGroupCountXUserSGPR; + unsigned GridWorkGroupCountYUserSGPR; + unsigned GridWorkGroupCountZUserSGPR; + + // System SGPRs in allocation order. + unsigned WorkGroupIDXSystemSGPR; + unsigned WorkGroupIDYSystemSGPR; + unsigned WorkGroupIDZSystemSGPR; + unsigned WorkGroupInfoSystemSGPR; + unsigned PrivateSegmentWaveByteOffsetSystemSGPR; public: // FIXME: Make private @@ -38,12 +64,14 @@ class SIMachineFunctionInfo : public AMDGPUMachineFunction { std::map LaneVGPRs; unsigned ScratchOffsetReg; unsigned NumUserSGPRs; + unsigned NumSystemSGPRs; private: bool HasSpilledSGPRs; bool HasSpilledVGPRs; - // Feature bits required for inputs passed in user / system SGPRs. + // Feature bits required for inputs passed in user SGPRs. + bool PrivateSegmentBuffer : 1; bool DispatchPtr : 1; bool QueuePtr : 1; bool DispatchID : 1; @@ -53,15 +81,27 @@ class SIMachineFunctionInfo : public AMDGPUMachineFunction { bool GridWorkgroupCountY : 1; bool GridWorkgroupCountZ : 1; + // Feature bits required for inputs passed in system SGPRs. bool WorkGroupIDX : 1; // Always initialized. bool WorkGroupIDY : 1; bool WorkGroupIDZ : 1; bool WorkGroupInfo : 1; + bool PrivateSegmentWaveByteOffset : 1; bool WorkItemIDX : 1; // Always initialized. 
bool WorkItemIDY : 1; bool WorkItemIDZ : 1; + + MCPhysReg getNextUserSGPR() const { + assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs"); + return AMDGPU::SGPR0 + NumUserSGPRs; + } + + MCPhysReg getNextSystemSGPR() const { + return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs; + } + public: struct SpilledReg { unsigned VGPR; @@ -80,6 +120,47 @@ class SIMachineFunctionInfo : public AMDGPUMachineFunction { unsigned getTIDReg() const { return TIDReg; }; void setTIDReg(unsigned Reg) { TIDReg = Reg; } + // Add user SGPRs. + unsigned addPrivateSegmentBuffer(const SIRegisterInfo &TRI); + unsigned addDispatchPtr(const SIRegisterInfo &TRI); + unsigned addQueuePtr(const SIRegisterInfo &TRI); + unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI); + + // Add system SGPRs. + unsigned addWorkGroupIDX() { + WorkGroupIDXSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupIDXSystemSGPR; + } + + unsigned addWorkGroupIDY() { + WorkGroupIDYSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupIDYSystemSGPR; + } + + unsigned addWorkGroupIDZ() { + WorkGroupIDZSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupIDZSystemSGPR; + } + + unsigned addWorkGroupInfo() { + WorkGroupInfoSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupInfoSystemSGPR; + } + + unsigned addPrivateSegmentWaveByteOffset() { + PrivateSegmentWaveByteOffsetSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return PrivateSegmentWaveByteOffsetSystemSGPR; + } + + bool hasPrivateSegmentBuffer() const { + return PrivateSegmentBuffer; + } + bool hasDispatchPtr() const { return DispatchPtr; } @@ -128,6 +209,10 @@ class SIMachineFunctionInfo : public AMDGPUMachineFunction { return WorkGroupInfo; } + bool hasPrivateSegmentWaveByteOffset() const { + return PrivateSegmentWaveByteOffset; + } + bool hasWorkItemIDX() const { return WorkItemIDX; } @@ -140,13 +225,37 @@ class SIMachineFunctionInfo : public 
AMDGPUMachineFunction { return WorkItemIDZ; } + unsigned getNumUserSGPRs() const { + return NumUserSGPRs; + } + + unsigned getNumPreloadedSGPRs() const { + return NumUserSGPRs + NumSystemSGPRs; + } + + unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const { + return PrivateSegmentWaveByteOffsetSystemSGPR; + } + /// \brief Returns the physical register reserved for use as the resource /// descriptor for scratch accesses. unsigned getScratchRSrcReg() const { return ScratchRSrcReg; } - void setScratchRSrcReg(const SIRegisterInfo *TRI); + void setScratchRSrcReg(unsigned Reg) { + assert(Reg != AMDGPU::NoRegister && "Should never be unset"); + ScratchRSrcReg = Reg; + } + + unsigned getScratchWaveOffsetReg() const { + return ScratchWaveOffsetReg; + } + + void setScratchWaveOffsetReg(unsigned Reg) { + assert(Reg != AMDGPU::NoRegister && "Should never be unset"); + ScratchWaveOffsetReg = Reg; + } bool hasSpilledSGPRs() const { return HasSpilledSGPRs; diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index 68629cd50995..d9799812d453 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -32,6 +32,40 @@ void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) co Reserved.set(*R); } +unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( + const MachineFunction &MF) const { + const AMDGPUSubtarget &ST = MF.getSubtarget(); + if (ST.hasSGPRInitBug()) { + unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 4; + unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); + return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass); + } + + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // 98/99 need to be reserved for flat_scr, and 100/101 for vcc. This is the + // next sgpr128 down. 
+ return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95; + } + + return AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99; +} + +unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( + const MachineFunction &MF) const { + const AMDGPUSubtarget &ST = MF.getSubtarget(); + if (ST.hasSGPRInitBug()) { + unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 5; + return AMDGPU::SGPR_32RegClass.getRegister(Idx); + } + + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // Next register before reservations for flat_scr and vcc. + return AMDGPU::SGPR97; + } + + return AMDGPU::SGPR95; +} + BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); @@ -69,19 +103,20 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { } const SIMachineFunctionInfo *MFI = MF.getInfo(); + + unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); + if (ScratchWaveOffsetReg != AMDGPU::NoRegister) { + // Reserve 1 SGPR for scratch wave offset in case we need to spill. + reserveRegisterTuples(Reserved, ScratchWaveOffsetReg); + } + unsigned ScratchRSrcReg = MFI->getScratchRSrcReg(); if (ScratchRSrcReg != AMDGPU::NoRegister) { - unsigned ScratchOffsetPreloadReg - = getPreloadedValue(MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); - // We will need to use this user SGPR argument for spilling, and thus never - // want it to be spilled. - reserveRegisterTuples(Reserved, ScratchOffsetPreloadReg); - // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need // to spill. // TODO: May need to reserve a VGPR if doing LDS spilling. 
reserveRegisterTuples(Reserved, ScratchRSrcReg); - assert(!isSubRegister(ScratchRSrcReg, ScratchOffsetPreloadReg)); + assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg)); } return Reserved; @@ -204,11 +239,10 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, unsigned SubReg = NumSubRegs > 1 ? getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) : Value; - bool IsKill = (i == e - 1); BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) .addReg(SubReg, getDefRegState(IsLoad)) - .addReg(ScratchRsrcReg, getKillRegState(IsKill)) + .addReg(ScratchRsrcReg) .addReg(SOffset) .addImm(Offset) .addImm(0) // glc @@ -526,6 +560,9 @@ bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { return OpType == AMDGPU::OPERAND_REG_INLINE_C; } +// FIXME: Most of these are flexible with HSA and we don't need to reserve them +// as input registers if unused. Whether the dispatch ptr is necessary should be +// easy to detect from used intrinsics. Scratch setup is harder to know. 
unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, enum PreloadedValue Value) const { @@ -533,29 +570,36 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); switch (Value) { case SIRegisterInfo::WORKGROUP_ID_X: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0); + assert(MFI->hasWorkGroupIDX()); + return MFI->WorkGroupIDXSystemSGPR; case SIRegisterInfo::WORKGROUP_ID_Y: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1); + assert(MFI->hasWorkGroupIDY()); + return MFI->WorkGroupIDYSystemSGPR; case SIRegisterInfo::WORKGROUP_ID_Z: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2); + assert(MFI->hasWorkGroupIDZ()); + return MFI->WorkGroupIDZSystemSGPR; case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET: - if (MFI->getShaderType() != ShaderType::COMPUTE) - return MFI->ScratchOffsetReg; - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4); + return MFI->PrivateSegmentWaveByteOffsetSystemSGPR; case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER: - llvm_unreachable("currently unused"); + assert(ST.isAmdHsaOS() && "Non-HSA ABI currently uses relocations"); + assert(MFI->hasPrivateSegmentBuffer()); + return MFI->PrivateSegmentBufferUserSGPR; case SIRegisterInfo::KERNARG_SEGMENT_PTR: - return ST.isAmdHsaOS() ? 
AMDGPU::SGPR2_SGPR3 : AMDGPU::SGPR0_SGPR1; + assert(MFI->hasKernargSegmentPtr()); + return MFI->KernargSegmentPtrUserSGPR; case SIRegisterInfo::DISPATCH_PTR: assert(MFI->hasDispatchPtr()); - return AMDGPU::SGPR0_SGPR1; + return MFI->DispatchPtrUserSGPR; case SIRegisterInfo::QUEUE_PTR: llvm_unreachable("not implemented"); case SIRegisterInfo::WORKITEM_ID_X: + assert(MFI->hasWorkItemIDX()); return AMDGPU::VGPR0; case SIRegisterInfo::WORKITEM_ID_Y: + assert(MFI->hasWorkItemIDY()); return AMDGPU::VGPR1; case SIRegisterInfo::WORKITEM_ID_Z: + assert(MFI->hasWorkItemIDZ()); return AMDGPU::VGPR2; } llvm_unreachable("unexpected preloaded value type"); diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h index bb93242b0f88..eafe4053e878 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -29,6 +29,15 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { public: SIRegisterInfo(); + /// Return the end register initially reserved for the scratch buffer in case + /// spilling is needed. + unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const; + + /// Return the end register initially reserved for the scratch wave offset in + /// case spilling is needed. 
+ unsigned reservedPrivateSegmentWaveByteOffsetReg( + const MachineFunction &MF) const; + BitVector getReservedRegs(const MachineFunction &MF) const override; unsigned getRegPressureSetLimit(const MachineFunction &MF, diff --git a/test/CodeGen/AMDGPU/hsa.ll b/test/CodeGen/AMDGPU/hsa.ll index ab87fdbc00da..d9bb586163dc 100644 --- a/test/CodeGen/AMDGPU/hsa.ll +++ b/test/CodeGen/AMDGPU/hsa.ll @@ -38,8 +38,10 @@ ; HSA: .amdgpu_hsa_kernel simple ; HSA: {{^}}simple: ; HSA: .amd_kernel_code_t +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_kernarg_segment_ptr = 1 ; HSA: .end_amd_kernel_code_t -; HSA: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[0:1], 0x0 +; HSA: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 ; Make sure we are setting the ATC bit: ; HSA-CI: s_mov_b32 s[[HI:[0-9]]], 0x100f000 diff --git a/test/CodeGen/AMDGPU/large-alloca-compute.ll b/test/CodeGen/AMDGPU/large-alloca-compute.ll index 5e8cf5bb3d25..c348a2e7980f 100644 --- a/test/CodeGen/AMDGPU/large-alloca-compute.ll +++ b/test/CodeGen/AMDGPU/large-alloca-compute.ll @@ -1,31 +1,46 @@ ; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s -; XUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s -; XUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s +; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s +; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s ; FIXME: align on alloca seems to be ignored for private_segment_alignment ; ALL-LABEL: 
{{^}}large_alloca_compute_shader: -; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN: s_mov_b32 s14, -1 -; CI: s_mov_b32 s15, 0x80f000 -; VI: s_mov_b32 s15, 0x800000 +; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN: s_mov_b32 s10, -1 +; CI: s_mov_b32 s11, 0x80f000 +; VI: s_mov_b32 s11, 0x800000 ; GCNHSA: .amd_kernel_code_t + +; GCNHSA: compute_pgm_rsrc2_scratch_en = 1 +; GCNHSA: compute_pgm_rsrc2_user_sgpr = 6 +; GCNHSA: compute_pgm_rsrc2_tgid_x_en = 1 +; GCNHSA: compute_pgm_rsrc2_tgid_y_en = 0 +; GCNHSA: compute_pgm_rsrc2_tgid_z_en = 0 +; GCNHSA: compute_pgm_rsrc2_tg_size_en = 0 +; GCNHSA: compute_pgm_rsrc2_tidig_comp_cnt = 0 + +; GCNHSA: enable_sgpr_private_segment_buffer = 1 +; GCNHSA: enable_sgpr_dispatch_ptr = 0 +; GCNHSA: enable_sgpr_queue_ptr = 0 +; GCNHSA: enable_sgpr_kernarg_segment_ptr = 1 +; GCNHSA: enable_sgpr_dispatch_id = 0 +; GCNHSA: enable_sgpr_flat_scratch_init = 0 +; GCNHSA: enable_sgpr_private_segment_size = 0 +; GCNHSA: enable_sgpr_grid_workgroup_count_x = 0 +; GCNHSA: enable_sgpr_grid_workgroup_count_y = 0 +; GCNHSA: enable_sgpr_grid_workgroup_count_z = 0 +; GCNHSA: workitem_private_segment_byte_size = 0 ; GCNHSA: private_segment_alignment = 4 ; GCNHSA: .end_amd_kernel_code_t -; GCNHSA: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GCNHSA: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GCNHSA: s_mov_b32 s10, -1 -; CIHSA: s_mov_b32 s11, 0x180f000 -; VIHSA: s_mov_b32 s11, 0x11800000 -; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s6 offen -; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s6 offen +; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s7 offen +; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s7 offen ; Scratch size = alloca size + emergency stack slot ; ALL: ; ScratchSize: 32772 diff --git a/test/CodeGen/AMDGPU/large-alloca-graphics.ll b/test/CodeGen/AMDGPU/large-alloca-graphics.ll index 208b9a10050c..141ee2560152 
100644 --- a/test/CodeGen/AMDGPU/large-alloca-graphics.ll +++ b/test/CodeGen/AMDGPU/large-alloca-graphics.ll @@ -8,8 +8,8 @@ ; CI: s_mov_b32 s11, 0x80f000 ; VI: s_mov_b32 s11, 0x800000 -; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen ; ALL: ; ScratchSize: 32772 define void @large_alloca_pixel_shader(i32 %x, i32 %y) #1 { @@ -29,8 +29,8 @@ define void @large_alloca_pixel_shader(i32 %x, i32 %y) #1 { ; CI: s_mov_b32 s11, 0x80f000 ; VI: s_mov_b32 s11, 0x800000 -; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen ; ALL: ; ScratchSize: 32772 define void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #1 { diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll new file mode 100644 index 000000000000..6dc9d050eee6 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll @@ -0,0 +1,37 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}read_workdim: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[2].Z + +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb +; VI-NOHSA: s_load_dword 
[[VAL:s[0-9]+]], s[0:1], 0x2c +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] +define void @read_workdim(i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.AMDGPU.read.workdim() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}read_workdim_known_bits: +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c +; GCN-NOT: 0xff +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @read_workdim_known_bits(i32 addrspace(1)* %out) { +entry: + %dim = call i32 @llvm.AMDGPU.read.workdim() #0 + %shl = shl i32 %dim, 24 + %shr = lshr i32 %shl, 24 + store i32 %shr, i32 addrspace(1)* %out + ret void +} + +declare i32 @llvm.AMDGPU.read.workdim() #0 + +attributes #0 = { readnone } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll index 719f7ffe0f1c..dc95cd1ee012 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll @@ -2,7 +2,7 @@ ; GCN-LABEL: {{^}}test: ; GCN: enable_sgpr_dispatch_ptr = 1 -; GCN: s_load_dword s{{[0-9]+}}, s[0:1], 0x0 +; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 define void @test(i32 addrspace(1)* %out) { %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 %header_ptr = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)* diff --git a/test/CodeGen/AMDGPU/llvm.dbg.value.ll b/test/CodeGen/AMDGPU/llvm.dbg.value.ll index c5aba2b76b89..cc109327d929 100644 --- a/test/CodeGen/AMDGPU/llvm.dbg.value.ll +++ b/test/CodeGen/AMDGPU/llvm.dbg.value.ll @@ -1,8 +1,8 @@ ; RUN: llc -O0 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck %s ; CHECK-LABEL: {{^}}test_debug_value: -; CHECK: s_load_dwordx2 -; CHECK: DEBUG_VALUE: test_debug_value:globalptr_arg <- %SGPR0_SGPR1 +; CHECK: s_load_dwordx2 s[4:5] +; CHECK: DEBUG_VALUE: 
test_debug_value:globalptr_arg <- %SGPR4_SGPR5 ; CHECK: buffer_store_dword ; CHECK: s_endpgm define void @test_debug_value(i32 addrspace(1)* nocapture %globalptr_arg) #0 !dbg !4 { diff --git a/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll new file mode 100644 index 000000000000..f2a7256e812d --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll @@ -0,0 +1,184 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}local_size_x: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[1].Z + +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 +; CI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x1 +; VI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x4 + +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @local_size_x(i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.local.size.x() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_y: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[1].W + +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @local_size_y(i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.local.size.y() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_z: +; EG: 
MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[2].X + +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @local_size_z(i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.local.size.z() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_xy: +; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6 +; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7 +; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18 +; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c +; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]] +; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VY]] +; GCN: buffer_store_dword [[VAL]] +define void @local_size_xy(i32 addrspace(1)* %out) { +entry: + %x = call i32 @llvm.r600.read.local.size.x() #0 + %y = call i32 @llvm.r600.read.local.size.y() #0 + %val = mul i32 %x, %y + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_xz: + +; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6 +; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8 +; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18 +; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20 +; HSA-DAG: s_and_b32 [[X:s[0-9]+]], [[XY]], 0xffff +; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] +; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VZ]] +; GCN: buffer_store_dword [[VAL]] +define void @local_size_xz(i32 addrspace(1)* %out) { +entry: + %x = call i32 @llvm.r600.read.local.size.x() #0 + %z = call i32 @llvm.r600.read.local.size.z() #0 + %val = mul i32 %x, %z + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_yz: +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 1 + +; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7 +; SI-NOHSA-DAG: s_load_dword 
[[Z:s[0-9]+]], s[0:1], 0x8 +; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c +; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20 +; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] +; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[Y]], [[VZ]] +; GCN: buffer_store_dword [[VAL]] +define void @local_size_yz(i32 addrspace(1)* %out) { +entry: + %y = call i32 @llvm.r600.read.local.size.y() #0 + %z = call i32 @llvm.r600.read.local.size.z() #0 + %val = mul i32 %y, %z + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_xyz: +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 1 + +; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6 +; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7 +; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8 +; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18 +; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c +; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20 +; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]] +; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] +; GCN: v_mad_u32_u24 [[VAL:v[0-9]+]], [[X]], [[VY]], [[VZ]] +; GCN: buffer_store_dword [[VAL]] +define void @local_size_xyz(i32 addrspace(1)* %out) { +entry: + %x = call i32 @llvm.r600.read.local.size.x() #0 + %y = call i32 @llvm.r600.read.local.size.y() #0 + %z = call i32 @llvm.r600.read.local.size.z() #0 + %xy = mul i32 %x, %y + %xyz = add i32 %xy, %z + store i32 %xyz, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_x_known_bits: +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 +; GCN-NOT: 0xffff +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NEXT: buffer_store_dword [[VVAL]] +define void @local_size_x_known_bits(i32 addrspace(1)* %out) { +entry: + %size = call i32 @llvm.r600.read.local.size.x() #0 + %shl = shl i32 %size, 16 + %shr = lshr i32 %shl, 16 + store i32 %shr, i32 addrspace(1)* %out + ret void +} + +; 
FUNC-LABEL: {{^}}local_size_y_known_bits: +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c +; GCN-NOT: 0xffff +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NEXT: buffer_store_dword [[VVAL]] +define void @local_size_y_known_bits(i32 addrspace(1)* %out) { +entry: + %size = call i32 @llvm.r600.read.local.size.y() #0 + %shl = shl i32 %size, 16 + %shr = lshr i32 %shl, 16 + store i32 %shr, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_z_known_bits: +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 +; GCN-NOT: 0xffff +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NEXT: buffer_store_dword [[VVAL]] +define void @local_size_z_known_bits(i32 addrspace(1)* %out) { +entry: + %size = call i32 @llvm.r600.read.local.size.z() #0 + %shl = shl i32 %size, 16 + %shr = lshr i32 %shl, 16 + store i32 %shr, i32 addrspace(1)* %out + ret void +} + +declare i32 @llvm.r600.read.local.size.x() #0 +declare i32 @llvm.r600.read.local.size.y() #0 +declare i32 @llvm.r600.read.local.size.z() #0 + +attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/local-memory-two-objects.ll b/test/CodeGen/AMDGPU/local-memory-two-objects.ll index 7f31ef45b628..6b52b80ba082 100644 --- a/test/CodeGen/AMDGPU/local-memory-two-objects.ll +++ b/test/CodeGen/AMDGPU/local-memory-two-objects.ll @@ -10,7 +10,7 @@ ; EG: .long 166120 ; EG-NEXT: .long 8 ; GCN: .long 47180 -; GCN-NEXT: .long 38792 +; GCN-NEXT: .long 32900 ; EG: {{^}}local_memory_two_objects: diff --git a/test/CodeGen/AMDGPU/local-memory.ll b/test/CodeGen/AMDGPU/local-memory.ll index 9494ed75bd0c..9ffb59e70920 100644 --- a/test/CodeGen/AMDGPU/local-memory.ll +++ b/test/CodeGen/AMDGPU/local-memory.ll @@ -9,9 +9,9 @@ ; EG: .long 166120 ; EG-NEXT: .long 128 ; SI: .long 47180 -; SI-NEXT: .long 71560 +; SI-NEXT: .long 65668 ; CI: .long 47180 -; CI-NEXT: .long 38792 +; CI-NEXT: .long 32900 ; FUNC-LABEL: 
{{^}}local_memory: diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll index 2cbb67d85fba..cd7c78f408dd 100644 --- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll +++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll @@ -17,16 +17,18 @@ declare i32 @llvm.r600.read.tgid.z() #1 ; GCN-LABEL: {{^}}spill_vgpr_compute: +; GCN: s_mov_b32 s16, s3 ; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_mov_b32 s15, 0x80f000 ; VI-NEXT: s_mov_b32 s15, 0x800000 -; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s8 offset:{{[0-9]+}} ; 4-byte Folded Spill -; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s8 offen offset:{{[0-9]+}} -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s8 offen offset:{{[0-9]+}} +; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill + +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} +; GCN: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} ; GCN: NumVgprs: 256 ; GCN: ScratchSize: 1024 diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll index ef492cef5767..16abb89bb0b8 100644 --- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -11,14 +11,14 @@ ; GCN-LABEL: {{^}}main: -; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GCN-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GCN-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s11, 0x80f000 -; VI-NEXT: s_mov_b32 s11, 0x800000 +; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_mov_b32 s15, 0x80f000 +; VI-NEXT: s_mov_b32 s15, 0x800000 ; s12 is offset user 
SGPR -; GCN: buffer_store_dword {{v[0-9]+}}, s[8:11], s12 offset:{{[0-9]+}} ; 4-byte Folded Spill +; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Spill ; GCN: NumVgprs: 256 ; GCN: ScratchSize: 1024 diff --git a/test/CodeGen/AMDGPU/work-item-intrinsics.ll b/test/CodeGen/AMDGPU/work-item-intrinsics.ll index ebe73429da9c..a704a23b0f92 100644 --- a/test/CodeGen/AMDGPU/work-item-intrinsics.ll +++ b/test/CodeGen/AMDGPU/work-item-intrinsics.ll @@ -1,5 +1,7 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=HSA -check-prefix=CI-HSA -check-prefix=FUNC %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=HSA -check-prefix=VI-HSA -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s @@ -7,9 +9,26 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV [[VAL]], KC0[0].X -; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; HSA: .amd_kernel_code_t + +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 0 +; HSA: enable_sgpr_queue_ptr = 0 +; HSA: enable_sgpr_kernarg_segment_ptr 
= 1 +; HSA: enable_sgpr_dispatch_id = 0 +; HSA: enable_sgpr_flat_scratch_init = 0 +; HSA: enable_sgpr_private_segment_size = 0 +; HSA: enable_sgpr_grid_workgroup_count_x = 0 +; HSA: enable_sgpr_grid_workgroup_count_y = 0 +; HSA: enable_sgpr_grid_workgroup_count_z = 0 + +; HSA: .end_amd_kernel_code_t + + +; GCN-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] + define void @ngroups_x (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.ngroups.x() #0 @@ -21,10 +40,10 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV [[VAL]], KC0[0].Y -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @ngroups_y (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.ngroups.y() #0 @@ -36,10 +55,10 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV [[VAL]], KC0[0].Z -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @ngroups_z (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.ngroups.z() #0 @@ -51,10 +70,10 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV [[VAL]], KC0[0].W -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc -; GCN: v_mov_b32_e32 
[[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @global_size_x (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.global.size.x() #0 @@ -66,10 +85,10 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV [[VAL]], KC0[1].X -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @global_size_y (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.global.size.y() #0 @@ -81,10 +100,10 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV [[VAL]], KC0[1].Y -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @global_size_z (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.global.size.z() #0 @@ -92,74 +111,33 @@ entry: ret void } -; FUNC-LABEL: {{^}}local_size_x: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].Z - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @local_size_x (i32 addrspace(1)* %out) { -entry: - 
%0 = call i32 @llvm.r600.read.local.size.x() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_size_y: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].W - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @local_size_y (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.local.size.y() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_size_z: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[2].X - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @local_size_z (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.local.size.z() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}get_work_dim: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[2].Z - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @get_work_dim (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.AMDGPU.read.workdim() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; The tgid values are stored in sgprs offset by the number of user sgprs. -; Currently we always use exactly 2 user sgprs for the pointer to the -; kernel arguments, but this may change in the future. +; The tgid values are stored in sgprs offset by the number of user +; sgprs. 
; FUNC-LABEL: {{^}}tgid_x: -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s4 +; HSA: .amd_kernel_code_t +; HSA: compute_pgm_rsrc2_user_sgpr = 6 +; HSA: compute_pgm_rsrc2_tgid_x_en = 1 +; HSA: compute_pgm_rsrc2_tgid_y_en = 0 +; HSA: compute_pgm_rsrc2_tgid_z_en = 0 +; HSA: compute_pgm_rsrc2_tg_size_en = 0 +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0 +; HSA: enable_sgpr_grid_workgroup_count_x = 0 +; HSA: enable_sgpr_grid_workgroup_count_y = 0 +; HSA: enable_sgpr_grid_workgroup_count_z = 0 +; HSA: .end_amd_kernel_code_t + +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s2{{$}} +; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6{{$}} ; GCN: buffer_store_dword [[VVAL]] -define void @tgid_x (i32 addrspace(1)* %out) { + +; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6 +; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 +define void @tgid_x(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.x() #0 store i32 %0, i32 addrspace(1)* %out @@ -167,9 +145,25 @@ entry: } ; FUNC-LABEL: {{^}}tgid_y: -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s5 +; HSA: compute_pgm_rsrc2_user_sgpr = 6 +; HSA: compute_pgm_rsrc2_tgid_x_en = 1 +; HSA: compute_pgm_rsrc2_tgid_y_en = 1 +; HSA: compute_pgm_rsrc2_tgid_z_en = 0 +; HSA: compute_pgm_rsrc2_tg_size_en = 0 +; HSA: enable_sgpr_grid_workgroup_count_x = 0 +; HSA: enable_sgpr_grid_workgroup_count_y = 0 +; HSA: enable_sgpr_grid_workgroup_count_z = 0 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3 +; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s7 ; GCN: buffer_store_dword [[VVAL]] -define void @tgid_y (i32 addrspace(1)* %out) { + +; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6 +; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 +define void @tgid_y(i32 addrspace(1)* %out) { 
entry: %0 = call i32 @llvm.r600.read.tgid.y() #0 store i32 %0, i32 addrspace(1)* %out @@ -177,102 +171,83 @@ entry: } ; FUNC-LABEL: {{^}}tgid_z: -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6 +; HSA: compute_pgm_rsrc2_user_sgpr = 6 +; HSA: compute_pgm_rsrc2_tgid_x_en = 1 +; HSA: compute_pgm_rsrc2_tgid_y_en = 0 +; HSA: compute_pgm_rsrc2_tgid_z_en = 1 +; HSA: compute_pgm_rsrc2_tg_size_en = 0 +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0 +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 0 +; HSA: enable_sgpr_queue_ptr = 0 +; HSA: enable_sgpr_kernarg_segment_ptr = 1 +; HSA: enable_sgpr_dispatch_id = 0 +; HSA: enable_sgpr_flat_scratch_init = 0 +; HSA: enable_sgpr_private_segment_size = 0 +; HSA: enable_sgpr_grid_workgroup_count_x = 0 +; HSA: enable_sgpr_grid_workgroup_count_y = 0 +; HSA: enable_sgpr_grid_workgroup_count_z = 0 + +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3{{$}} +; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s7{{$}} ; GCN: buffer_store_dword [[VVAL]] -define void @tgid_z (i32 addrspace(1)* %out) { + +; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6 +; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 +define void @tgid_z(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.z() #0 store i32 %0, i32 addrspace(1)* %out ret void } +; GCN-NOHSA: .section .AMDGPU.config +; GCN-NOHSA: .long 47180 +; GCN-NOHSA-NEXT: .long 132{{$}} + ; FUNC-LABEL: {{^}}tidig_x: +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0 ; GCN: buffer_store_dword v0 -define void @tidig_x (i32 addrspace(1)* %out) { +define void @tidig_x(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.x() #0 store i32 %0, i32 addrspace(1)* %out ret void } +; GCN-NOHSA: .section .AMDGPU.config +; GCN-NOHSA: .long 47180 +; GCN-NOHSA-NEXT: .long 2180{{$}} + ; FUNC-LABEL: {{^}}tidig_y: + +; HSA: 
compute_pgm_rsrc2_tidig_comp_cnt = 1 ; GCN: buffer_store_dword v1 -define void @tidig_y (i32 addrspace(1)* %out) { +define void @tidig_y(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.y() #0 store i32 %0, i32 addrspace(1)* %out ret void } +; GCN-NOHSA: .section .AMDGPU.config +; GCN-NOHSA: .long 47180 +; GCN-NOHSA-NEXT: .long 4228{{$}} + ; FUNC-LABEL: {{^}}tidig_z: +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 2 ; GCN: buffer_store_dword v2 -define void @tidig_z (i32 addrspace(1)* %out) { +define void @tidig_z(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.z() #0 store i32 %0, i32 addrspace(1)* %out ret void } -; FUNC-LABEL: {{^}}local_size_x_known_bits: -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 -; GCN-NOT: 0xffff -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN-NEXT: buffer_store_dword [[VVAL]] -define void @local_size_x_known_bits(i32 addrspace(1)* %out) { -entry: - %size = call i32 @llvm.r600.read.local.size.x() #0 - %shl = shl i32 %size, 16 - %shr = lshr i32 %shl, 16 - store i32 %shr, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_size_y_known_bits: -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c -; GCN-NOT: 0xffff -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN-NEXT: buffer_store_dword [[VVAL]] -define void @local_size_y_known_bits(i32 addrspace(1)* %out) { -entry: - %size = call i32 @llvm.r600.read.local.size.y() #0 - %shl = shl i32 %size, 16 - %shr = lshr i32 %shl, 16 - store i32 %shr, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_size_z_known_bits: -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 -; GCN-NOT: 0xffff -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN-NEXT: buffer_store_dword [[VVAL]] -define void @local_size_z_known_bits(i32 addrspace(1)* %out) { -entry: - %size = call i32 
@llvm.r600.read.local.size.z() #0 - %shl = shl i32 %size, 16 - %shr = lshr i32 %shl, 16 - store i32 %shr, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}get_work_dim_known_bits: -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c -; GCN-NOT: 0xff -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @get_work_dim_known_bits(i32 addrspace(1)* %out) { -entry: - %dim = call i32 @llvm.AMDGPU.read.workdim() #0 - %shl = shl i32 %dim, 24 - %shr = lshr i32 %shl, 24 - store i32 %shr, i32 addrspace(1)* %out - ret void -} - declare i32 @llvm.r600.read.ngroups.x() #0 declare i32 @llvm.r600.read.ngroups.y() #0 declare i32 @llvm.r600.read.ngroups.z() #0 @@ -281,10 +256,6 @@ declare i32 @llvm.r600.read.global.size.x() #0 declare i32 @llvm.r600.read.global.size.y() #0 declare i32 @llvm.r600.read.global.size.z() #0 -declare i32 @llvm.r600.read.local.size.x() #0 -declare i32 @llvm.r600.read.local.size.y() #0 -declare i32 @llvm.r600.read.local.size.z() #0 - declare i32 @llvm.r600.read.tgid.x() #0 declare i32 @llvm.r600.read.tgid.y() #0 declare i32 @llvm.r600.read.tgid.z() #0 From 7a47a7be128018d0b3353d8213068cc57e022d87 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 30 Nov 2015 21:16:07 +0000 Subject: [PATCH 022/186] AMDGPU: Error if too many user SGPRs used git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254332 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 5 +++++ lib/Target/AMDGPU/AMDGPUSubtarget.h | 3 +++ 2 files changed, 8 insertions(+) diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index b96c48f0561d..87a0cf20d176 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -401,6 +401,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.NumSGPR = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; } + if 
(MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) { + LLVMContext &Ctx = MF.getFunction()->getContext(); + Ctx.emitError("too many user SGPRs used"); + } + ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4; ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8; // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 9921630326b4..971b5179b13c 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -303,6 +303,9 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { return isAmdHsaOS() ? 0 : 36; } + unsigned getMaxNumUserSGPRs() const { + return 16; + } }; } // End namespace llvm From 504d89597a278daef61805ae47c96cadb290e9ae Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 30 Nov 2015 21:32:10 +0000 Subject: [PATCH 023/186] AMDGPU: Fix unused function git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254333 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/SIISelLowering.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 05b54d0bae51..1b0cc87206f4 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -542,11 +542,6 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, Align); // Alignment } -static ArrayRef getAllSGPRs() { - return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), - AMDGPU::SGPR_32RegClass.getNumRegs()); -} - SDValue SITargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, SDLoc DL, SelectionDAG &DAG, From f44d69a9c3a9c295184011ba37ad36321ced7531 Mon Sep 17 00:00:00 2001 From: Cong Hou Date: Mon, 30 Nov 2015 21:46:08 +0000 Subject: [PATCH 024/186] [X86] Update test/CodeGen/X86/avg.ll with the help of update_llc_test_checks.py. NFC. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254334 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/avg.ll | 597 +++++++++++++++++++++++----------------- 1 file changed, 347 insertions(+), 250 deletions(-) diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll index ce2bf0fdad19..f1c636a73305 100644 --- a/test/CodeGen/X86/avg.ll +++ b/test/CodeGen/X86/avg.ll @@ -1,24 +1,31 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512bw | FileCheck %s --check-prefix=AVX2 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512BW define void @avg_v4i8(<4 x i8>* %a, <4 x i8>* %b) { -; SSE2-LABEL: avg_v4i8 -; SSE2: # BB#0: -; SSE2-NEXT: movd (%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movd (%rsi), %xmm1 # xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: pavgb %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, (%rax) -; SSE2-NEXT: retq +; SSE2-LABEL: avg_v4i8: +; SSE2: # BB#0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: pavgb %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, (%rax) +; SSE2-NEXT: retq ; -; AVX2-LABEL: avg_v4i8 +; AVX2-LABEL: avg_v4i8: ; AVX2: # BB#0: -; AVX2-NEXT: vmovd (%rdi), %xmm0 -; AVX2-NEXT: vmovd (%rsi), %xmm1 -; AVX2-NEXT: vpavgb %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovd %xmm0, (%rax) -; AVX2-NEXT: retq +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vpavgb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, (%rax) 
+; AVX2-NEXT: retq ; +; AVX512BW-LABEL: avg_v4i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovd (%rdi), %xmm0 +; AVX512BW-NEXT: vmovd (%rsi), %xmm1 +; AVX512BW-NEXT: vpavgb %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, (%rax) +; AVX512BW-NEXT: retq %1 = load <4 x i8>, <4 x i8>* %a %2 = load <4 x i8>, <4 x i8>* %b %3 = zext <4 x i8> %1 to <4 x i32> @@ -32,22 +39,29 @@ define void @avg_v4i8(<4 x i8>* %a, <4 x i8>* %b) { } define void @avg_v8i8(<8 x i8>* %a, <8 x i8>* %b) { -; SSE2-LABEL: avg_v8i8 +; SSE2-LABEL: avg_v8i8: ; SSE2: # BB#0: -; SSE2-NEXT: movq (%rdi), %xmm0 # xmm0 = mem[0],zero -; SSE2-NEXT: movq (%rsi), %xmm1 # xmm1 = mem[0],zero -; SSE2-NEXT: pavgb %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: pavgb %xmm0, %xmm1 +; SSE2-NEXT: movq %xmm1, (%rax) +; SSE2-NEXT: retq ; -; AVX2-LABEL: avg_v8i8 +; AVX2-LABEL: avg_v8i8: ; AVX2: # BB#0: -; AVX2-NEXT: vmovq (%rdi), %xmm0 -; AVX2-NEXT: vmovq (%rsi), %xmm1 -; AVX2-NEXT: vpavgb %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovq %xmm0, (%rax) -; AVX2-NEXT: retq +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vpavgb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: retq ; +; AVX512BW-LABEL: avg_v8i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovq (%rdi), %xmm0 +; AVX512BW-NEXT: vmovq (%rsi), %xmm1 +; AVX512BW-NEXT: vpavgb %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rax) +; AVX512BW-NEXT: retq %1 = load <8 x i8>, <8 x i8>* %a %2 = load <8 x i8>, <8 x i8>* %b %3 = zext <8 x i8> %1 to <8 x i32> @@ -61,20 +75,19 @@ define void @avg_v8i8(<8 x i8>* %a, <8 x i8>* %b) { } define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) { -; SSE2-LABEL: avg_v16i8 +; SSE2-LABEL: avg_v16i8: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: pavgb (%rdi), %xmm0 -; SSE2-NEXT: movdqu %xmm0, (%rax) -; SSE2-NEXT: retq -; -; 
AVX2-LABEL: avg_v16i8 -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rsi), %xmm0 -; AVX2-NEXT: vpavgb (%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqu %xmm0, (%rax) -; AVX2-NEXT: retq -; +; SSE2-NEXT: movdqa (%rsi), %xmm0 +; SSE2-NEXT: pavgb (%rdi), %xmm0 +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; AVX-LABEL: avg_v16i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rsi), %xmm0 +; AVX-NEXT: vpavgb (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqu %xmm0, (%rax) +; AVX-NEXT: retq %1 = load <16 x i8>, <16 x i8>* %a %2 = load <16 x i8>, <16 x i8>* %b %3 = zext <16 x i8> %1 to <16 x i32> @@ -88,12 +101,20 @@ define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) { } define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) { -; AVX2-LABEL: avg_v32i8 +; AVX2-LABEL: avg_v32i8: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vmovdqa (%rsi), %ymm0 +; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; +; AVX512BW-LABEL: avg_v32i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512BW-NEXT: vpavgb (%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) +; AVX512BW-NEXT: retq %1 = load <32 x i8>, <32 x i8>* %a %2 = load <32 x i8>, <32 x i8>* %b %3 = zext <32 x i8> %1 to <32 x i32> @@ -107,13 +128,12 @@ define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) { } define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) { -; AVX512BW-LABEL: avg_v64i8 -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0 -; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax) -; AVX512BW-NEXT: retq -; +; AVX512BW-LABEL: avg_v64i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0 +; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax) +; AVX512BW-NEXT: retq %1 = load <64 x i8>, <64 x i8>* %a %2 = load <64 x i8>, <64 x i8>* %b %3 = zext <64 x i8> %1 to <64 
x i32> @@ -127,22 +147,29 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) { } define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) { -; SSE2-LABEL: avg_v4i16 +; SSE2-LABEL: avg_v4i16: ; SSE2: # BB#0: -; SSE2-NEXT: movq (%rdi), %xmm0 # xmm0 = mem[0],zero -; SSE2-NEXT: movq (%rsi), %xmm1 # xmm1 = mem[0],zero -; SSE2-NEXT: pavgw %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: pavgw %xmm0, %xmm1 +; SSE2-NEXT: movq %xmm1, (%rax) +; SSE2-NEXT: retq ; -; AVX2-LABEL: avg_v4i16 -; AVX2: # BB#0: -; AVX2-NEXT: vmovq (%rdi), %xmm0 -; AVX2-NEXT: vmovq (%rsi), %xmm1 -; AVX2-NEXT: vpavgw %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovq %xmm0, (%rax) -; AVX2-NEXT: retq +; AVX2-LABEL: avg_v4i16: +; AVX2: # BB#0: +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vpavgw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: retq ; +; AVX512BW-LABEL: avg_v4i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovq (%rdi), %xmm0 +; AVX512BW-NEXT: vmovq (%rsi), %xmm1 +; AVX512BW-NEXT: vpavgw %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rax) +; AVX512BW-NEXT: retq %1 = load <4 x i16>, <4 x i16>* %a %2 = load <4 x i16>, <4 x i16>* %b %3 = zext <4 x i16> %1 to <4 x i32> @@ -156,20 +183,19 @@ define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) { } define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) { -; SSE2-LABEL: avg_v8i16 +; SSE2-LABEL: avg_v8i16: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: pavgw (%rdi), %xmm0 -; SSE2-NEXT: movdqu %xmm0, (%rax) -; SSE2-NEXT: retq -; -; AVX2-LABEL: avg_v8i16 -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rsi), %xmm0 -; AVX2-NEXT: vpavgw (%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqu %xmm0, (%rax) -; AVX2-NEXT: retq -; +; SSE2-NEXT: movdqa (%rsi), %xmm0 +; SSE2-NEXT: pavgw (%rdi), %xmm0 +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; 
AVX-LABEL: avg_v8i16: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rsi), %xmm0 +; AVX-NEXT: vpavgw (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqu %xmm0, (%rax) +; AVX-NEXT: retq %1 = load <8 x i16>, <8 x i16>* %a %2 = load <8 x i16>, <8 x i16>* %b %3 = zext <8 x i16> %1 to <8 x i32> @@ -183,12 +209,20 @@ define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) { } define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) { -; AVX2-LABEL: avg_v16i16 +; AVX2-LABEL: avg_v16i16: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vmovdqa (%rsi), %ymm0 +; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; +; AVX512BW-LABEL: avg_v16i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512BW-NEXT: vpavgw (%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) +; AVX512BW-NEXT: retq %1 = load <16 x i16>, <16 x i16>* %a %2 = load <16 x i16>, <16 x i16>* %b %3 = zext <16 x i16> %1 to <16 x i32> @@ -202,13 +236,12 @@ define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) { } define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) { -; AVX512BW-LABEL: avg_v32i16 -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu16 (%rsi), %zmm0 -; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax) -; AVX512BW-NEXT: retq -; +; AVX512BW-LABEL: avg_v32i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu16 (%rsi), %zmm0 +; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax) +; AVX512BW-NEXT: retq %1 = load <32 x i16>, <32 x i16>* %a %2 = load <32 x i16>, <32 x i16>* %b %3 = zext <32 x i16> %1 to <32 x i32> @@ -222,22 +255,29 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) { } define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) { -; SSE2-LABEL: avg_v4i8_2 +; SSE2-LABEL: avg_v4i8_2: ; SSE2: # BB#0: -; SSE2-NEXT: movd (%rdi), %xmm0 -; SSE2-NEXT: movd (%rsi), 
%xmm1 -; SSE2-NEXT: pavgb %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: pavgb %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, (%rax) +; SSE2-NEXT: retq ; -; AVX2-LABEL: avg_v4i8_2 -; AVX2: # BB#0: -; AVX2-NEXT: vmovd (%rdi), %xmm0 -; AVX2-NEXT: vmovd (%rsi), %xmm1 -; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, (%rax) -; AVX2-NEXT: retq +; AVX2-LABEL: avg_v4i8_2: +; AVX2: # BB#0: +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, (%rax) +; AVX2-NEXT: retq ; +; AVX512BW-LABEL: avg_v4i8_2: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovd (%rdi), %xmm0 +; AVX512BW-NEXT: vmovd (%rsi), %xmm1 +; AVX512BW-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, (%rax) +; AVX512BW-NEXT: retq %1 = load <4 x i8>, <4 x i8>* %a %2 = load <4 x i8>, <4 x i8>* %b %3 = zext <4 x i8> %1 to <4 x i32> @@ -251,22 +291,29 @@ define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) { } define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) { -; SSE2-LABEL: avg_v8i8_2 +; SSE2-LABEL: avg_v8i8_2: ; SSE2: # BB#0: -; SSE2-NEXT: movq (%rdi), %xmm0 # xmm0 = mem[0],zero -; SSE2-NEXT: movq (%rsi), %xmm1 # xmm1 = mem[0],zero -; SSE2-NEXT: pavgb %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: pavgb %xmm0, %xmm1 +; SSE2-NEXT: movq %xmm1, (%rax) +; SSE2-NEXT: retq ; -; AVX2-LABEL: avg_v8i8_2 +; AVX2-LABEL: avg_v8i8_2: ; AVX2: # BB#0: -; AVX2-NEXT: vmovq (%rdi), %xmm0 -; AVX2-NEXT: vmovq (%rsi), %xmm1 -; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, (%rax) -; AVX2-NEXT: retq +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; 
AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: retq ; +; AVX512BW-LABEL: avg_v8i8_2: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovq (%rdi), %xmm0 +; AVX512BW-NEXT: vmovq (%rsi), %xmm1 +; AVX512BW-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rax) +; AVX512BW-NEXT: retq %1 = load <8 x i8>, <8 x i8>* %a %2 = load <8 x i8>, <8 x i8>* %b %3 = zext <8 x i8> %1 to <8 x i32> @@ -280,20 +327,19 @@ define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) { } define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) { -; SSE2-LABEL: avg_v16i8_2 +; SSE2-LABEL: avg_v16i8_2: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: pavgb (%rsi), %xmm0 -; SSE2-NEXT: movdqu %xmm0, (%rax) -; SSE2-NEXT: retq -; -; AVX2-LABEL: avg_v16i8_2 -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vpavgb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqu %xmm0, (%rax) -; AVX2-NEXT: retq -; +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: pavgb (%rsi), %xmm0 +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; AVX-LABEL: avg_v16i8_2: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpavgb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqu %xmm0, (%rax) +; AVX-NEXT: retq %1 = load <16 x i8>, <16 x i8>* %a %2 = load <16 x i8>, <16 x i8>* %b %3 = zext <16 x i8> %1 to <16 x i32> @@ -307,12 +353,20 @@ define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) { } define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) { -; AVX2-LABEL: avg_v32i8_2 +; AVX2-LABEL: avg_v32i8_2: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; +; AVX512BW-LABEL: avg_v32i8_2: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpavgb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, 
(%rax) +; AVX512BW-NEXT: retq %1 = load <32 x i8>, <32 x i8>* %a %2 = load <32 x i8>, <32 x i8>* %b %3 = zext <32 x i8> %1 to <32 x i32> @@ -326,13 +380,12 @@ define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) { } define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) { -; AVX512BW-LABEL: avg_v64i8_2 +; AVX512BW-LABEL: avg_v64i8_2: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0 -; AVX512BW-NEXT: vpavgb %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax) -; AVX512BW-NEXT: retq -; +; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0 +; AVX512BW-NEXT: vpavgb %zmm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax) +; AVX512BW-NEXT: retq %1 = load <64 x i8>, <64 x i8>* %a %2 = load <64 x i8>, <64 x i8>* %b %3 = zext <64 x i8> %1 to <64 x i32> @@ -347,22 +400,29 @@ define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) { define void @avg_v4i16_2(<4 x i16>* %a, <4 x i16>* %b) { -; SSE2-LABEL: avg_v4i16_2 +; SSE2-LABEL: avg_v4i16_2: ; SSE2: # BB#0: -; SSE2-NEXT: movq (%rdi), %xmm0 # xmm0 = mem[0],zero -; SSE2-NEXT: movq (%rsi), %xmm1 # xmm1 = mem[0],zero -; SSE2-NEXT: pavgw %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: pavgw %xmm0, %xmm1 +; SSE2-NEXT: movq %xmm1, (%rax) +; SSE2-NEXT: retq ; -; AVX2-LABEL: avg_v4i16_2 +; AVX2-LABEL: avg_v4i16_2: ; AVX2: # BB#0: -; AVX2-NEXT: vmovq (%rdi), %xmm0 -; AVX2-NEXT: vmovq (%rsi), %xmm1 -; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, (%rax) -; AVX2-NEXT: retq +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: retq ; +; AVX512BW-LABEL: avg_v4i16_2: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovq (%rdi), %xmm0 +; AVX512BW-NEXT: vmovq (%rsi), %xmm1 +; AVX512BW-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rax) +; 
AVX512BW-NEXT: retq %1 = load <4 x i16>, <4 x i16>* %a %2 = load <4 x i16>, <4 x i16>* %b %3 = zext <4 x i16> %1 to <4 x i32> @@ -376,20 +436,19 @@ define void @avg_v4i16_2(<4 x i16>* %a, <4 x i16>* %b) { } define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) { -; SSE2-LABEL: avg_v8i16_2 +; SSE2-LABEL: avg_v8i16_2: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: pavgw (%rsi), %xmm0 -; SSE2-NEXT: movdqu %xmm0, (%rax) -; SSE2-NEXT: retq -; -; AVX2-LABEL: avg_v8i16_2 -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vpavgw (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqu %xmm0, (%rax) -; AVX2-NEXT: retq -; +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: pavgw (%rsi), %xmm0 +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; AVX-LABEL: avg_v8i16_2: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpavgw (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqu %xmm0, (%rax) +; AVX-NEXT: retq %1 = load <8 x i16>, <8 x i16>* %a %2 = load <8 x i16>, <8 x i16>* %b %3 = zext <8 x i16> %1 to <8 x i32> @@ -403,12 +462,20 @@ define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) { } define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) { -; AVX2-LABEL: avg_v16i16_2 +; AVX2-LABEL: avg_v16i16_2: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; +; AVX512BW-LABEL: avg_v16i16_2: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpavgw (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) +; AVX512BW-NEXT: retq %1 = load <16 x i16>, <16 x i16>* %a %2 = load <16 x i16>, <16 x i16>* %b %3 = zext <16 x i16> %1 to <16 x i32> @@ -422,13 +489,12 @@ define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) { } define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) { -; 
AVX512BW-LABEL: avg_v32i16_2 +; AVX512BW-LABEL: avg_v32i16_2: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 -; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax) -; AVX512BW-NEXT: retq -; +; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax) +; AVX512BW-NEXT: retq %1 = load <32 x i16>, <32 x i16>* %a %2 = load <32 x i16>, <32 x i16>* %b %3 = zext <32 x i16> %1 to <32 x i32> @@ -442,20 +508,26 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) { } define void @avg_v4i8_const(<4 x i8>* %a) { -; SSE2-LABEL: avg_v4i8_const +; SSE2-LABEL: avg_v4i8_const: ; SSE2: # BB#0: -; SSE2-NEXT: movd (%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: pavgb {{.*}}, %xmm0 -; SSE2-NEXT: movd %xmm0, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0 +; SSE2-NEXT: movd %xmm0, (%rax) +; SSE2-NEXT: retq ; -; AVX2-LABEL: avg_v4i8_const +; AVX2-LABEL: avg_v4i8_const: ; AVX2: # BB#0: -; AVX2-NEXT: vmovd (%rdi), %xmm0 -; AVX2-NEXT: vpavgb {{.*}}, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, (%rax) -; AVX2-NEXT: retq +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, (%rax) +; AVX2-NEXT: retq ; +; AVX512BW-LABEL: avg_v4i8_const: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovd (%rdi), %xmm0 +; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, (%rax) +; AVX512BW-NEXT: retq %1 = load <4 x i8>, <4 x i8>* %a %2 = zext <4 x i8> %1 to <4 x i32> %3 = add nuw nsw <4 x i32> %2, @@ -466,20 +538,26 @@ define void @avg_v4i8_const(<4 x i8>* %a) { } define void @avg_v8i8_const(<8 x i8>* %a) { -; SSE2-LABEL: avg_v8i8_const +; SSE2-LABEL: avg_v8i8_const: ; SSE2: # BB#0: -; SSE2-NEXT: movq (%rdi), %xmm0 # xmm0 = mem[0],zero -; SSE2-NEXT: pavgb {{.*}}, %xmm0 -; SSE2-NEXT: movq %xmm0, (%rax) -; 
SSE2-NEXT: retq +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0 +; SSE2-NEXT: movq %xmm0, (%rax) +; SSE2-NEXT: retq ; -; AVX2-LABEL: avg_v8i8_const +; AVX2-LABEL: avg_v8i8_const: ; AVX2: # BB#0: -; AVX2-NEXT: vmovq (%rdi), %xmm0 -; AVX2-NEXT: vpavgb {{.*}}, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, (%rax) -; AVX2-NEXT: retq +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: retq ; +; AVX512BW-LABEL: avg_v8i8_const: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovq (%rdi), %xmm0 +; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rax) +; AVX512BW-NEXT: retq %1 = load <8 x i8>, <8 x i8>* %a %2 = zext <8 x i8> %1 to <8 x i32> %3 = add nuw nsw <8 x i32> %2, @@ -490,20 +568,19 @@ define void @avg_v8i8_const(<8 x i8>* %a) { } define void @avg_v16i8_const(<16 x i8>* %a) { -; SSE2-LABEL: avg_v16i8_const +; SSE2-LABEL: avg_v16i8_const: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: pavgb {{.*}}, %xmm0 -; SSE2-NEXT: movdqu %xmm0, (%rax) -; SSE2-NEXT: retq -; -; AVX2-LABEL: avg_v16i8_const -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vpavgb {{.*}}, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqu %xmm0, (%rax) -; AVX2-NEXT: retq -; +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0 +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; AVX-LABEL: avg_v16i8_const: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmovdqu %xmm0, (%rax) +; AVX-NEXT: retq %1 = load <16 x i8>, <16 x i8>* %a %2 = zext <16 x i8> %1 to <16 x i32> %3 = add nuw nsw <16 x i32> %2, @@ -514,53 +591,66 @@ define void @avg_v16i8_const(<16 x i8>* %a) { } define void @avg_v32i8_const(<32 x i8>* %a) { -; AVX2-LABEL: avg_v32i8_const +; AVX2-LABEL: avg_v32i8_const: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpavgb 
{{.*}}, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpavgb {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; +; AVX512BW-LABEL: avg_v32i8_const: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) +; AVX512BW-NEXT: retq %1 = load <32 x i8>, <32 x i8>* %a %2 = zext <32 x i8> %1 to <32 x i32> %3 = add nuw nsw <32 x i32> %2, - %4 = lshr <32 x i32> %3, + %4 = lshr <32 x i32> %3, %5 = trunc <32 x i32> %4 to <32 x i8> store <32 x i8> %5, <32 x i8>* undef, align 4 ret void } define void @avg_v64i8_const(<64 x i8>* %a) { -; AVX512BW-LABEL: avg_v64i8_const -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0 -; AVX512BW-NEXT: vpavgb {{.*}}, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax) -; AVX512BW-NEXT: retq -; +; AVX512BW-LABEL: avg_v64i8_const: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0 +; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax) +; AVX512BW-NEXT: retq %1 = load <64 x i8>, <64 x i8>* %a %2 = zext <64 x i8> %1 to <64 x i32> %3 = add nuw nsw <64 x i32> %2, - %4 = lshr <64 x i32> %3, + %4 = lshr <64 x i32> %3, %5 = trunc <64 x i32> %4 to <64 x i8> store <64 x i8> %5, <64 x i8>* undef, align 4 ret void } define void @avg_v4i16_const(<4 x i16>* %a) { -; SSE2-LABEL: avg_v4i16_const +; SSE2-LABEL: avg_v4i16_const: ; SSE2: # BB#0: -; SSE2-NEXT: movq (%rdi), %xmm0 -; SSE2-NEXT: pavgw {{.*}}, %xmm0 -; SSE2-NEXT: movq %xmm0, (%rax) -; SSE2-NEXT: retq +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: pavgw {{.*}}(%rip), %xmm0 +; SSE2-NEXT: movq %xmm0, (%rax) +; SSE2-NEXT: retq ; -; AVX2-LABEL: avg_v4i16_const +; AVX2-LABEL: avg_v4i16_const: ; AVX2: # BB#0: -; AVX2-NEXT: vmovq (%rdi), %xmm0 -; AVX2-NEXT: vpavgw {{.*}}, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, (%rax) -; AVX2-NEXT: 
retq +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: retq ; +; AVX512BW-LABEL: avg_v4i16_const: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovq (%rdi), %xmm0 +; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rax) +; AVX512BW-NEXT: retq %1 = load <4 x i16>, <4 x i16>* %a %2 = zext <4 x i16> %1 to <4 x i32> %3 = add nuw nsw <4 x i32> %2, @@ -571,20 +661,19 @@ define void @avg_v4i16_const(<4 x i16>* %a) { } define void @avg_v8i16_const(<8 x i16>* %a) { -; SSE2-LABEL: avg_v8i16_const +; SSE2-LABEL: avg_v8i16_const: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: pavgw {{.*}}, %xmm0 -; SSE2-NEXT: movdqu %xmm0, (%rax) -; SSE2-NEXT: retq -; -; AVX2-LABEL: avg_v8i16_const -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vpavgw {{.*}}, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqu %xmm0, (%rax) -; AVX2-NEXT: retq -; +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: pavgw {{.*}}(%rip), %xmm0 +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; AVX-LABEL: avg_v8i16_const: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmovdqu %xmm0, (%rax) +; AVX-NEXT: retq %1 = load <8 x i16>, <8 x i16>* %a %2 = zext <8 x i16> %1 to <8 x i32> %3 = add nuw nsw <8 x i32> %2, @@ -595,12 +684,20 @@ define void @avg_v8i16_const(<8 x i16>* %a) { } define void @avg_v16i16_const(<16 x i16>* %a) { -; AVX2-LABEL: avg_v16i16_const +; AVX2-LABEL: avg_v16i16_const: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpavgw {{.*}}, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; +; AVX512BW-LABEL: avg_v16i16_const: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpavgw 
{{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) +; AVX512BW-NEXT: retq %1 = load <16 x i16>, <16 x i16>* %a %2 = zext <16 x i16> %1 to <16 x i32> %3 = add nuw nsw <16 x i32> %2, @@ -611,16 +708,16 @@ define void @avg_v16i16_const(<16 x i16>* %a) { } define void @avg_v32i16_const(<32 x i16>* %a) { -; AVX512BW-LABEL: avg_v32i16_const +; AVX512BW-LABEL: avg_v32i16_const: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 -; AVX512BW-NEXT: vpavgw {{.*}}, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax) -; +; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax) +; AVX512BW-NEXT: retq %1 = load <32 x i16>, <32 x i16>* %a %2 = zext <32 x i16> %1 to <32 x i32> %3 = add nuw nsw <32 x i32> %2, - %4 = lshr <32 x i32> %3, + %4 = lshr <32 x i32> %3, %5 = trunc <32 x i32> %4 to <32 x i16> store <32 x i16> %5, <32 x i16>* undef, align 4 ret void From ae5d4bfbbefb29e8759287f13bc1d3bb44474044 Mon Sep 17 00:00:00 2001 From: Paul Robinson Date: Mon, 30 Nov 2015 21:56:16 +0000 Subject: [PATCH 025/186] Have 'optnone' respect the -fast-isel=false option. This is primarily useful for debugging optnone v. ISel issues. 
Differential Revision: http://reviews.llvm.org/D14792 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254335 91177308-0d34-0410-b5e6-96231b3b80d8 --- docs/LangRef.rst | 6 +++--- include/llvm/Target/TargetMachine.h | 3 +++ lib/CodeGen/LLVMTargetMachine.cpp | 3 ++- lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 10 +++++++--- test/CodeGen/Mips/emergency-spill-slot-near-fp.ll | 4 ++-- test/Feature/optnone-llc.ll | 8 +++++++- 6 files changed, 24 insertions(+), 10 deletions(-) diff --git a/docs/LangRef.rst b/docs/LangRef.rst index c7ea1c1bf236..be8e63bf071f 100644 --- a/docs/LangRef.rst +++ b/docs/LangRef.rst @@ -1289,9 +1289,9 @@ example: that are recognized by LLVM to handle asynchronous exceptions, such as SEH, will still provide their implementation defined semantics. ``optnone`` - This function attribute indicates that the function is not optimized - by any optimization or code generator passes with the - exception of interprocedural optimization passes. + This function attribute indicates that most optimization passes will skip + this function, with the exception of interprocedural optimization passes. + Code generation defaults to the "fast" instruction selector. This attribute cannot be used together with the ``alwaysinline`` attribute; this attribute is also incompatible with the ``minsize`` attribute and the ``optsize`` attribute. diff --git a/include/llvm/Target/TargetMachine.h b/include/llvm/Target/TargetMachine.h index b40e4a69a4d2..b7760a61806f 100644 --- a/include/llvm/Target/TargetMachine.h +++ b/include/llvm/Target/TargetMachine.h @@ -102,6 +102,7 @@ class TargetMachine { const MCSubtargetInfo *STI; unsigned RequireStructuredCFG : 1; + unsigned O0WantsFastISel : 1; /// This API is here to support the C API, deprecated in 3.7 release. /// This should never be used outside of legacy existing client. 
@@ -190,6 +191,8 @@ class TargetMachine { void setOptLevel(CodeGenOpt::Level Level) const; void setFastISel(bool Enable) { Options.EnableFastISel = Enable; } + bool getO0WantsFastISel() { return O0WantsFastISel; } + void setO0WantsFastISel(bool Enable) { O0WantsFastISel = Enable; } bool shouldPrintMachineCode() const { return Options.PrintMachineCode; } diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp index 5b8c8258b285..da24cb17918b 100644 --- a/lib/CodeGen/LLVMTargetMachine.cpp +++ b/lib/CodeGen/LLVMTargetMachine.cpp @@ -125,9 +125,10 @@ addPassesToGenerateCode(LLVMTargetMachine *TM, PassManagerBase &PM, PM.add(new MachineFunctionAnalysis(*TM, MFInitializer)); // Enable FastISel with -fast, but allow that to be overridden. + TM->setO0WantsFastISel(EnableFastISelOption != cl::BOU_FALSE); if (EnableFastISelOption == cl::BOU_TRUE || (TM->getOptLevel() == CodeGenOpt::None && - EnableFastISelOption != cl::BOU_FALSE)) + TM->getO0WantsFastISel())) TM->setFastISel(true); // Ask the target for an isel. diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index ebf071cb9946..3bbe5d4203bb 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -264,13 +264,17 @@ namespace llvm { return; IS.OptLevel = NewOptLevel; IS.TM.setOptLevel(NewOptLevel); - SavedFastISel = IS.TM.Options.EnableFastISel; - if (NewOptLevel == CodeGenOpt::None) - IS.TM.setFastISel(true); DEBUG(dbgs() << "\nChanging optimization level for Function " << IS.MF->getFunction()->getName() << "\n"); DEBUG(dbgs() << "\tBefore: -O" << SavedOptLevel << " ; After: -O" << NewOptLevel << "\n"); + SavedFastISel = IS.TM.Options.EnableFastISel; + if (NewOptLevel == CodeGenOpt::None) { + IS.TM.setFastISel(IS.TM.getO0WantsFastISel()); + DEBUG(dbgs() << "\tFastISel is " + << (IS.TM.Options.EnableFastISel ? 
"enabled" : "disabled") + << "\n"); + } } ~OptLevelChanger() { diff --git a/test/CodeGen/Mips/emergency-spill-slot-near-fp.ll b/test/CodeGen/Mips/emergency-spill-slot-near-fp.ll index 58dd16c9f9c8..54092b4e3ebe 100644 --- a/test/CodeGen/Mips/emergency-spill-slot-near-fp.ll +++ b/test/CodeGen/Mips/emergency-spill-slot-near-fp.ll @@ -1,5 +1,5 @@ ; Check that register scavenging spill slot is close to $fp. -; RUN: llc -march=mipsel -O0 -fast-isel=false < %s | FileCheck %s +; RUN: llc -march=mipsel -O0 < %s | FileCheck %s ; CHECK: sw ${{.*}}, 8($sp) ; CHECK: lw ${{.*}}, 8($sp) @@ -31,4 +31,4 @@ entry: ret i32 0 } -attributes #0 = { noinline optnone "no-frame-pointer-elim"="true" } +attributes #0 = { noinline "no-frame-pointer-elim"="true" } diff --git a/test/Feature/optnone-llc.ll b/test/Feature/optnone-llc.ll index 015cc842d9f8..94f61efea4aa 100644 --- a/test/Feature/optnone-llc.ll +++ b/test/Feature/optnone-llc.ll @@ -3,11 +3,13 @@ ; RUN: llc -O2 -debug %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=LLC-Ox ; RUN: llc -O3 -debug %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=LLC-Ox ; RUN: llc -misched-postra -debug %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=LLC-MORE +; RUN: llc -O1 -debug-only=isel %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=FAST +; RUN: llc -O1 -debug-only=isel -fast-isel=false %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=NOFAST ; REQUIRES: asserts, default_triple ; This test verifies that we don't run Machine Function optimizations -; on optnone functions. +; on optnone functions, and that we can turn off FastISel. ; Function Attrs: noinline optnone define i32 @_Z3fooi(i32 %x) #0 { @@ -52,3 +54,7 @@ attributes #0 = { optnone noinline } ; Alternate post-RA scheduler. ; LLC-MORE: Skipping pass 'PostRA Machine Instruction Scheduler' + +; Selectively disable FastISel for optnone functions. 
+; FAST: FastISel is enabled +; NOFAST: FastISel is disabled From a586fd2c5665c75c1b2870b1361d794ac25caf9e Mon Sep 17 00:00:00 2001 From: Rafael Espindola Date: Mon, 30 Nov 2015 22:01:43 +0000 Subject: [PATCH 026/186] Start deciding earlier what to link. A traditional linker is roughly split into symbol resolution and "copying stuff". The two tasks are badly mixed in lib/Linker. This starts splitting them apart. With this patch there are no direct calls to linkGlobalValueBody or linkGlobalValueProto. Everything is linked via MapValue. This also includes a few fixes: * A GV goes undefined if the comdat is dropped (comdat11.ll). * We error if an internal GV goes undefined (comdat13.ll). * We don't link an unused comdat. The first two match the behavior of an ELF linker. The third one is equivalent to running globaldce on the input. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254336 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Linker/LinkModules.cpp | 157 ++++++++++++++------------- lib/Transforms/Utils/ValueMapper.cpp | 6 +- test/Linker/Inputs/comdat11.ll | 9 ++ test/Linker/Inputs/comdat13.ll | 9 ++ test/Linker/comdat11.ll | 13 +++ test/Linker/comdat12.ll | 8 ++ test/Linker/comdat13.ll | 13 +++ test/Linker/comdat9.ll | 3 + 8 files changed, 138 insertions(+), 80 deletions(-) create mode 100644 test/Linker/Inputs/comdat11.ll create mode 100644 test/Linker/Inputs/comdat13.ll create mode 100644 test/Linker/comdat11.ll create mode 100644 test/Linker/comdat12.ll create mode 100644 test/Linker/comdat13.ll diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp index cdf1decc8131..aeaa7eb90903 100644 --- a/lib/Linker/LinkModules.cpp +++ b/lib/Linker/LinkModules.cpp @@ -436,6 +436,8 @@ class ModuleLinker { /// references. 
bool DoneLinkingBodies; + bool HasError = false; + public: ModuleLinker(Module *dstM, Linker::IdentifiedStructTypeSet &Set, Module *srcM, DiagnosticHandlerFunction DiagnosticHandler, unsigned Flags, @@ -483,6 +485,7 @@ class ModuleLinker { /// Helper method for setting a message and returning an error code. bool emitError(const Twine &Message) { DiagnosticHandler(LinkDiagnosticInfo(DS_Error, Message)); + HasError = true; return true; } @@ -531,6 +534,7 @@ class ModuleLinker { void upgradeMismatchedGlobalArray(StringRef Name); void upgradeMismatchedGlobals(); + bool linkIfNeeded(GlobalValue &GV); bool linkAppendingVarProto(GlobalVariable *DstGV, const GlobalVariable *SrcGV); @@ -904,16 +908,12 @@ Value *ModuleLinker::materializeDeclFor(Value *V) { if (doneLinkingBodies()) return nullptr; - GlobalValue *DGV = copyGlobalValueProto(TypeMap, SGV); - - if (Comdat *SC = SGV->getComdat()) { - if (auto *DGO = dyn_cast(DGV)) { - Comdat *DC = DstM->getOrInsertComdat(SC->getName()); - DGO->setComdat(DC); - } - } - - return DGV; + linkGlobalValueProto(SGV); + if (HasError) + return nullptr; + Value *Ret = ValueMap[SGV]; + assert(Ret); + return Ret; } void ValueMaterializerTy::materializeInitFor(GlobalValue *New, @@ -922,15 +922,31 @@ void ValueMaterializerTy::materializeInitFor(GlobalValue *New, } void ModuleLinker::materializeInitFor(GlobalValue *New, GlobalValue *Old) { + if (auto *F = dyn_cast(New)) { + if (!F->isDeclaration()) + return; + } else if (auto *V = dyn_cast(New)) { + if (V->hasInitializer()) + return; + } else { + auto *A = cast(New); + if (A->getAliasee()) + return; + } + + if (Old->isDeclaration()) + return; + if (isPerformingImport() && !doImportAsDefinition(Old)) return; - // Skip declarations that ValueMaterializer may have created in - // case we link in only some of SrcM. 
- if (shouldLinkOnlyNeeded() && Old->isDeclaration()) + if (DoNotLinkFromSource.count(Old)) { + if (!New->hasExternalLinkage() && !New->hasExternalWeakLinkage() && + !New->hasAppendingLinkage()) + emitError("Declaration points to discarded value"); return; + } - assert(!Old->isDeclaration() && "users should not pass down decls"); linkGlobalValueBody(*Old); } @@ -1405,7 +1421,6 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) { std::tie(SK, LinkFromSrc) = ComdatsChosen[SC]; C = DstM->getOrInsertComdat(SC->getName()); C->setSelectionKind(SK); - ComdatMembers[SC].push_back(SGV); } else if (DGV) { if (shouldLinkFromSource(LinkFromSrc, *DGV, *SGV)) return true; @@ -1425,31 +1440,12 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) { if (DGV) HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr(); - if (!LinkFromSrc && !DGV) - return false; - GlobalValue *NewGV; - if (!LinkFromSrc) { + if (!LinkFromSrc && DGV) { NewGV = DGV; // When linking from source we setVisibility from copyGlobalValueProto. setVisibility(NewGV, SGV, DGV); } else { - // If the GV is to be lazily linked, don't create it just yet. - // The ValueMaterializerTy will deal with creating it if it's used. - if (!DGV && !shouldOverrideFromSrc() && SGV != ImportFunction && - (SGV->hasLocalLinkage() || SGV->hasLinkOnceLinkage() || - SGV->hasAvailableExternallyLinkage())) { - DoNotLinkFromSource.insert(SGV); - return false; - } - - // When we only want to link in unresolved dependencies, blacklist - // the symbol unless unless DestM has a matching declaration (DGV). 
- if (shouldLinkOnlyNeeded() && !(DGV && DGV->isDeclaration())) { - DoNotLinkFromSource.insert(SGV); - return false; - } - NewGV = copyGlobalValueProto(TypeMap, SGV, DGV); if (isPerformingImport() && !doImportAsDefinition(SGV)) @@ -1459,7 +1455,7 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) { NewGV->setUnnamedAddr(HasUnnamedAddr); if (auto *NewGO = dyn_cast(NewGV)) { - if (C) + if (C && LinkFromSrc) NewGO->setComdat(C); if (DGV && DGV->hasCommonLinkage() && SGV->hasCommonLinkage()) @@ -1842,6 +1838,38 @@ static std::string mergeTriples(const Triple &SrcTriple, const Triple &DstTriple return DstTriple.str(); } +bool ModuleLinker::linkIfNeeded(GlobalValue &GV) { + GlobalValue *DGV = getLinkedToGlobal(&GV); + + if (shouldLinkOnlyNeeded() && !(DGV && DGV->isDeclaration())) + return false; + + if (DGV && !GV.hasLocalLinkage()) { + GlobalValue::VisibilityTypes Visibility = + getMinVisibility(DGV->getVisibility(), GV.getVisibility()); + DGV->setVisibility(Visibility); + GV.setVisibility(Visibility); + } + + if (const Comdat *SC = GV.getComdat()) { + bool LinkFromSrc; + Comdat::SelectionKind SK; + std::tie(SK, LinkFromSrc) = ComdatsChosen[SC]; + if (!LinkFromSrc) { + DoNotLinkFromSource.insert(&GV); + return false; + } + } + + if (!DGV && !shouldOverrideFromSrc() && + (GV.hasLocalLinkage() || GV.hasLinkOnceLinkage() || + GV.hasAvailableExternallyLinkage())) { + return false; + } + MapValue(&GV, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer); + return HasError; +} + bool ModuleLinker::run() { assert(DstM && "Null destination module"); assert(SrcM && "Null source module"); @@ -1901,24 +1929,30 @@ bool ModuleLinker::run() { // Upgrade mismatched global arrays. 
upgradeMismatchedGlobals(); + for (GlobalVariable &GV : SrcM->globals()) + if (const Comdat *SC = GV.getComdat()) + ComdatMembers[SC].push_back(&GV); + + for (Function &SF : *SrcM) + if (const Comdat *SC = SF.getComdat()) + ComdatMembers[SC].push_back(&SF); + + for (GlobalAlias &GA : SrcM->aliases()) + if (const Comdat *SC = GA.getComdat()) + ComdatMembers[SC].push_back(&GA); + // Insert all of the globals in src into the DstM module... without linking // initializers (which could refer to functions not yet mapped over). for (GlobalVariable &GV : SrcM->globals()) - if (linkGlobalValueProto(&GV)) + if (linkIfNeeded(GV)) return true; - // Link the functions together between the two modules, without doing function - // bodies... this just adds external function prototypes to the DstM - // function... We do this so that when we begin processing function bodies, - // all of the global values that may be referenced are available in our - // ValueMap. - for (Function &F :*SrcM) - if (linkGlobalValueProto(&F)) + for (Function &SF : *SrcM) + if (linkIfNeeded(SF)) return true; - // If there were any aliases, link them now. for (GlobalAlias &GA : SrcM->aliases()) - if (linkGlobalValueProto(&GA)) + if (linkIfNeeded(GA)) return true; for (AppendingVarInfo &AppendingVar : AppendingVars) @@ -1933,37 +1967,6 @@ bool ModuleLinker::run() { MapValue(GV, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer); } - // Link in the function bodies that are defined in the source module into - // DstM. - for (Function &SF : *SrcM) { - // Skip if no body (function is external). - if (SF.isDeclaration()) - continue; - - // Skip if not linking from source. - if (DoNotLinkFromSource.count(&SF)) - continue; - - if (linkGlobalValueBody(SF)) - return true; - } - - // Resolve all uses of aliases with aliasees. 
- for (GlobalAlias &Src : SrcM->aliases()) { - if (DoNotLinkFromSource.count(&Src)) - continue; - linkGlobalValueBody(Src); - } - - // Update the initializers in the DstM module now that all globals that may - // be referenced are in DstM. - for (GlobalVariable &Src : SrcM->globals()) { - // Only process initialized GV's or ones not already in dest. - if (!Src.hasInitializer() || DoNotLinkFromSource.count(&Src)) - continue; - linkGlobalValueBody(Src); - } - // Note that we are done linking global value bodies. This prevents // metadata linking from creating new references. DoneLinkingBodies = true; diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp index 0a63c1d5153c..00a8984845dd 100644 --- a/lib/Transforms/Utils/ValueMapper.cpp +++ b/lib/Transforms/Utils/ValueMapper.cpp @@ -41,9 +41,9 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, if (Value *NewV = Materializer->materializeDeclFor(const_cast(V))) { VM[V] = NewV; - if (auto *GV = dyn_cast(V)) - Materializer->materializeInitFor(cast(NewV), - const_cast(GV)); + if (auto *NewGV = dyn_cast(NewV)) + Materializer->materializeInitFor( + NewGV, const_cast(cast(V))); return NewV; } } diff --git a/test/Linker/Inputs/comdat11.ll b/test/Linker/Inputs/comdat11.ll new file mode 100644 index 000000000000..5b7f74cf0b24 --- /dev/null +++ b/test/Linker/Inputs/comdat11.ll @@ -0,0 +1,9 @@ +$foo = comdat any +@foo = global i8 1, comdat +define void @zed() { + call void @bar() + ret void +} +define void @bar() comdat($foo) { + ret void +} diff --git a/test/Linker/Inputs/comdat13.ll b/test/Linker/Inputs/comdat13.ll new file mode 100644 index 000000000000..a2d16bd261b5 --- /dev/null +++ b/test/Linker/Inputs/comdat13.ll @@ -0,0 +1,9 @@ +$foo = comdat any +@foo = global i8 1, comdat +define void @zed() { + call void @bar() + ret void +} +define internal void @bar() comdat($foo) { + ret void +} diff --git a/test/Linker/comdat11.ll b/test/Linker/comdat11.ll new file 
mode 100644 index 000000000000..dbade4104fe3 --- /dev/null +++ b/test/Linker/comdat11.ll @@ -0,0 +1,13 @@ +; RUN: llvm-link -S %s %p/Inputs/comdat11.ll -o - | FileCheck %s + +$foo = comdat any +@foo = global i8 0, comdat + +; CHECK: @foo = global i8 0, comdat + +; CHECK: define void @zed() { +; CHECK: call void @bar() +; CHECK: ret void +; CHECK: } + +; CHECK: declare void @bar() diff --git a/test/Linker/comdat12.ll b/test/Linker/comdat12.ll new file mode 100644 index 000000000000..d06e222b63ac --- /dev/null +++ b/test/Linker/comdat12.ll @@ -0,0 +1,8 @@ +; RUN: llvm-link %s -S -o - | FileCheck %s + +$foo = comdat largest +define internal void @foo() comdat($foo) { + ret void +} + +; CHECK-NOT: foo diff --git a/test/Linker/comdat13.ll b/test/Linker/comdat13.ll new file mode 100644 index 000000000000..a8e51f04ae11 --- /dev/null +++ b/test/Linker/comdat13.ll @@ -0,0 +1,13 @@ +; RUN: not llvm-link -S %s %p/Inputs/comdat13.ll -o %t.ll 2>&1 | FileCheck %s + +; In Inputs/comdat13.ll a function not in the $foo comdat (zed) references an +; internal function in the comdat $foo. +; We might want to have the verifier reject that, but for now we at least check +; that the linker produces an error. +; This is the IR equivalent of the "relocation refers to discarded section" in +; an ELF linker. 
+ +; CHECK: Declaration points to discarded value + +$foo = comdat any +@foo = global i8 0, comdat diff --git a/test/Linker/comdat9.ll b/test/Linker/comdat9.ll index 274957401aac..4f6f2cfb845d 100644 --- a/test/Linker/comdat9.ll +++ b/test/Linker/comdat9.ll @@ -14,6 +14,9 @@ $f2 = comdat largest define internal void @f2() comdat($f2) { ret void } +define void @f3() comdat($f2) { + ret void +} ; CHECK-DAG: $f2 = comdat largest ; CHECK-DAG: define internal void @f2() comdat { From 0f73ee481b5c64201c9be66a309d353ee0a677c8 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 30 Nov 2015 22:22:06 +0000 Subject: [PATCH 027/186] [X86][FMA4] Prefer FMA4 to FMA We currently output FMA instructions on targets which support both FMA4 + FMA (i.e. later Bulldozer CPUs bdver2/bdver3/bdver4). This patch flips this so FMA4 is preferred; this is for several reasons: 1 - FMA4 is non-destructive reducing the need for mov instructions. 2 - It's more straightforward to commute and fold inputs (although the recent work on FMA has reduced this difference). 3 - All supported targets have FMA4 performance equal to or better than FMA - Piledriver (bdver2) in particular has half the throughput when executing FMA instructions. It looks like no future AMD processor lines will support FMA4 after the Bulldozer series so we're not causing problems for later CPUs. 
Differential Revision: http://reviews.llvm.org/D14997 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254339 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86Subtarget.h | 7 ++++--- test/CodeGen/X86/fma-commute-x86.ll | 2 +- test/CodeGen/X86/fma_patterns.ll | 2 +- test/CodeGen/X86/fma_patterns_wide.ll | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 425bc2482e9b..eb0199aecbeb 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -354,9 +354,10 @@ class X86Subtarget final : public X86GenSubtargetInfo { bool hasXSAVEC() const { return HasXSAVEC; } bool hasXSAVES() const { return HasXSAVES; } bool hasPCLMUL() const { return HasPCLMUL; } - bool hasFMA() const { return HasFMA; } - // FIXME: Favor FMA when both are enabled. Is this the right thing to do? - bool hasFMA4() const { return HasFMA4 && !HasFMA; } + // Prefer FMA4 to FMA - its better for commutation/memory folding and + // has equal or better performance on all supported targets. 
+ bool hasFMA() const { return HasFMA && !HasFMA4; } + bool hasFMA4() const { return HasFMA4; } bool hasXOP() const { return HasXOP; } bool hasTBM() const { return HasTBM; } bool hasMOVBE() const { return HasMOVBE; } diff --git a/test/CodeGen/X86/fma-commute-x86.ll b/test/CodeGen/X86/fma-commute-x86.ll index 9e57b00bc0b4..9a368792133b 100644 --- a/test/CodeGen/X86/fma-commute-x86.ll +++ b/test/CodeGen/X86/fma-commute-x86.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-pc-win32 -mattr=+fma,+fma4 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-pc-win32 -mattr=+fma | FileCheck %s ; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 -mattr=-fma4 | FileCheck %s attributes #0 = { nounwind } diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll index 942d799d9761..e3295e45823c 100644 --- a/test/CodeGen/X86/fma_patterns.ll +++ b/test/CodeGen/X86/fma_patterns.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=FMA -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=FMA +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=FMA4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=FMA4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 diff --git a/test/CodeGen/X86/fma_patterns_wide.ll b/test/CodeGen/X86/fma_patterns_wide.ll index de77a27ad2b6..f412c174fe37 100644 --- a/test/CodeGen/X86/fma_patterns_wide.ll +++ b/test/CodeGen/X86/fma_patterns_wide.ll @@ -1,6 
+1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -fp-contract=fast | FileCheck %s --check-prefix=FMA -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast | FileCheck %s --check-prefix=FMA +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast | FileCheck %s --check-prefix=FMA4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -fp-contract=fast | FileCheck %s --check-prefix=FMA4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -fp-contract=fast | FileCheck %s --check-prefix=AVX512 From 7cb4a767f86b71670480bdfb99710371d6c93d1c Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Mon, 30 Nov 2015 22:39:36 +0000 Subject: [PATCH 028/186] [InstCombine] add tests to show potential vector IR shuffle transforms git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254342 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../InstCombine/insert-extract-shuffle.ll | 53 +++++++++++++++++-- 1 file changed, 48 insertions(+), 5 deletions(-) diff --git a/test/Transforms/InstCombine/insert-extract-shuffle.ll b/test/Transforms/InstCombine/insert-extract-shuffle.ll index 8929c82def7b..6841cd64abe3 100644 --- a/test/Transforms/InstCombine/insert-extract-shuffle.ll +++ b/test/Transforms/InstCombine/insert-extract-shuffle.ll @@ -24,14 +24,57 @@ define <4 x i16> @test2(<8 x i16> %in, <8 x i16> %in2) { ret <4 x i16> %vec.3 } -define <2 x i64> @test_vcopyq_lane_p64(<2 x i64> %a, <1 x i64> %b) #0 { +define <2 x i64> @test_vcopyq_lane_p64(<2 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: @test_vcopyq_lane_p64 -; CHECK: extractelement -; CHECK: insertelement -; CHECK-NOT: shufflevector -entry: +; CHECK-NEXT: extractelement +; CHECK-NEXT: insertelement +; CHECK-NEXT: ret <2 x i64> %res %elt = extractelement <1 x i64> %b, i32 0 %res = insertelement <2 x i64> %a, i64 %elt, i32 1 ret <2 x i64> %res } +; 
PR2109: https://llvm.org/bugs/show_bug.cgi?id=2109 + +define <4 x float> @widen_extract2(<4 x float> %ins, <2 x float> %ext) { +; CHECK-LABEL: @widen_extract2( +; CHECK-NEXT: extractelement +; CHECK-NEXT: extractelement +; CHECK-NEXT: insertelement +; CHECK-NEXT: insertelement +; CHECK-NEXT: ret <4 x float> %i2 + %e1 = extractelement <2 x float> %ext, i32 0 + %e2 = extractelement <2 x float> %ext, i32 1 + %i1 = insertelement <4 x float> %ins, float %e1, i32 1 + %i2 = insertelement <4 x float> %i1, float %e2, i32 3 + ret <4 x float> %i2 +} + +define <4 x float> @widen_extract3(<4 x float> %ins, <3 x float> %ext) { +; CHECK-LABEL: @widen_extract3( +; CHECK-NEXT: extractelement +; CHECK-NEXT: extractelement +; CHECK-NEXT: extractelement +; CHECK-NEXT: insertelement +; CHECK-NEXT: insertelement +; CHECK-NEXT: insertelement +; CHECK-NEXT: ret <4 x float> %i3 + %e1 = extractelement <3 x float> %ext, i32 0 + %e2 = extractelement <3 x float> %ext, i32 1 + %e3 = extractelement <3 x float> %ext, i32 2 + %i1 = insertelement <4 x float> %ins, float %e1, i32 2 + %i2 = insertelement <4 x float> %i1, float %e2, i32 1 + %i3 = insertelement <4 x float> %i2, float %e3, i32 0 + ret <4 x float> %i3 +} + +define <8 x float> @too_wide(<8 x float> %ins, <2 x float> %ext) { +; CHECK-LABEL: @too_wide( +; CHECK-NEXT: extractelement +; CHECK-NEXT: insertelement +; CHECK-NEXT: ret <8 x float> %i1 + %e1 = extractelement <2 x float> %ext, i32 0 + %i1 = insertelement <8 x float> %ins, float %e1, i32 2 + ret <8 x float> %i1 +} + From 36d161829490a6e8655db29b13bb247ea4717800 Mon Sep 17 00:00:00 2001 From: Rafael Espindola Date: Mon, 30 Nov 2015 23:05:25 +0000 Subject: [PATCH 029/186] Disable a consistency check. Trying to figure out why it fails on a bot but passes locally. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254344 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Linker/LinkModules.cpp | 6 +----- test/Linker/Inputs/comdat13.ll | 9 --------- test/Linker/comdat13.ll | 13 ------------- 3 files changed, 1 insertion(+), 27 deletions(-) delete mode 100644 test/Linker/Inputs/comdat13.ll delete mode 100644 test/Linker/comdat13.ll diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp index aeaa7eb90903..edee55a1f9f6 100644 --- a/lib/Linker/LinkModules.cpp +++ b/lib/Linker/LinkModules.cpp @@ -940,12 +940,8 @@ void ModuleLinker::materializeInitFor(GlobalValue *New, GlobalValue *Old) { if (isPerformingImport() && !doImportAsDefinition(Old)) return; - if (DoNotLinkFromSource.count(Old)) { - if (!New->hasExternalLinkage() && !New->hasExternalWeakLinkage() && - !New->hasAppendingLinkage()) - emitError("Declaration points to discarded value"); + if (DoNotLinkFromSource.count(Old)) return; - } linkGlobalValueBody(*Old); } diff --git a/test/Linker/Inputs/comdat13.ll b/test/Linker/Inputs/comdat13.ll deleted file mode 100644 index a2d16bd261b5..000000000000 --- a/test/Linker/Inputs/comdat13.ll +++ /dev/null @@ -1,9 +0,0 @@ -$foo = comdat any -@foo = global i8 1, comdat -define void @zed() { - call void @bar() - ret void -} -define internal void @bar() comdat($foo) { - ret void -} diff --git a/test/Linker/comdat13.ll b/test/Linker/comdat13.ll deleted file mode 100644 index a8e51f04ae11..000000000000 --- a/test/Linker/comdat13.ll +++ /dev/null @@ -1,13 +0,0 @@ -; RUN: not llvm-link -S %s %p/Inputs/comdat13.ll -o %t.ll 2>&1 | FileCheck %s - -; In Inputs/comdat13.ll a function not in the $foo comdat (zed) references an -; internal function in the comdat $foo. -; We might want to have the verifier reject that, but for now we at least check -; that the linker produces an error. -; This is the IR equivalent of the "relocation refers to discarded section" in -; an ELF linker. 
- -; CHECK: Declaration points to discarded value - -$foo = comdat any -@foo = global i8 0, comdat From 7c4c88cf5f63239ee60f335af4d66ed363c92310 Mon Sep 17 00:00:00 2001 From: Rafael Espindola Date: Mon, 30 Nov 2015 23:54:19 +0000 Subject: [PATCH 030/186] This reverts commit r254336 and r254344. They broke a bot and I am debugging why. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254347 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Linker/LinkModules.cpp | 153 ++++++++++++++------------- lib/Transforms/Utils/ValueMapper.cpp | 6 +- test/Linker/Inputs/comdat11.ll | 9 -- test/Linker/comdat11.ll | 13 --- test/Linker/comdat12.ll | 8 -- test/Linker/comdat9.ll | 3 - 6 files changed, 80 insertions(+), 112 deletions(-) delete mode 100644 test/Linker/Inputs/comdat11.ll delete mode 100644 test/Linker/comdat11.ll delete mode 100644 test/Linker/comdat12.ll diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp index edee55a1f9f6..cdf1decc8131 100644 --- a/lib/Linker/LinkModules.cpp +++ b/lib/Linker/LinkModules.cpp @@ -436,8 +436,6 @@ class ModuleLinker { /// references. bool DoneLinkingBodies; - bool HasError = false; - public: ModuleLinker(Module *dstM, Linker::IdentifiedStructTypeSet &Set, Module *srcM, DiagnosticHandlerFunction DiagnosticHandler, unsigned Flags, @@ -485,7 +483,6 @@ class ModuleLinker { /// Helper method for setting a message and returning an error code. 
bool emitError(const Twine &Message) { DiagnosticHandler(LinkDiagnosticInfo(DS_Error, Message)); - HasError = true; return true; } @@ -534,7 +531,6 @@ class ModuleLinker { void upgradeMismatchedGlobalArray(StringRef Name); void upgradeMismatchedGlobals(); - bool linkIfNeeded(GlobalValue &GV); bool linkAppendingVarProto(GlobalVariable *DstGV, const GlobalVariable *SrcGV); @@ -908,12 +904,16 @@ Value *ModuleLinker::materializeDeclFor(Value *V) { if (doneLinkingBodies()) return nullptr; - linkGlobalValueProto(SGV); - if (HasError) - return nullptr; - Value *Ret = ValueMap[SGV]; - assert(Ret); - return Ret; + GlobalValue *DGV = copyGlobalValueProto(TypeMap, SGV); + + if (Comdat *SC = SGV->getComdat()) { + if (auto *DGO = dyn_cast(DGV)) { + Comdat *DC = DstM->getOrInsertComdat(SC->getName()); + DGO->setComdat(DC); + } + } + + return DGV; } void ValueMaterializerTy::materializeInitFor(GlobalValue *New, @@ -922,27 +922,15 @@ void ValueMaterializerTy::materializeInitFor(GlobalValue *New, } void ModuleLinker::materializeInitFor(GlobalValue *New, GlobalValue *Old) { - if (auto *F = dyn_cast(New)) { - if (!F->isDeclaration()) - return; - } else if (auto *V = dyn_cast(New)) { - if (V->hasInitializer()) - return; - } else { - auto *A = cast(New); - if (A->getAliasee()) - return; - } - - if (Old->isDeclaration()) - return; - if (isPerformingImport() && !doImportAsDefinition(Old)) return; - if (DoNotLinkFromSource.count(Old)) + // Skip declarations that ValueMaterializer may have created in + // case we link in only some of SrcM. 
+ if (shouldLinkOnlyNeeded() && Old->isDeclaration()) return; + assert(!Old->isDeclaration() && "users should not pass down decls"); linkGlobalValueBody(*Old); } @@ -1417,6 +1405,7 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) { std::tie(SK, LinkFromSrc) = ComdatsChosen[SC]; C = DstM->getOrInsertComdat(SC->getName()); C->setSelectionKind(SK); + ComdatMembers[SC].push_back(SGV); } else if (DGV) { if (shouldLinkFromSource(LinkFromSrc, *DGV, *SGV)) return true; @@ -1436,12 +1425,31 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) { if (DGV) HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr(); + if (!LinkFromSrc && !DGV) + return false; + GlobalValue *NewGV; - if (!LinkFromSrc && DGV) { + if (!LinkFromSrc) { NewGV = DGV; // When linking from source we setVisibility from copyGlobalValueProto. setVisibility(NewGV, SGV, DGV); } else { + // If the GV is to be lazily linked, don't create it just yet. + // The ValueMaterializerTy will deal with creating it if it's used. + if (!DGV && !shouldOverrideFromSrc() && SGV != ImportFunction && + (SGV->hasLocalLinkage() || SGV->hasLinkOnceLinkage() || + SGV->hasAvailableExternallyLinkage())) { + DoNotLinkFromSource.insert(SGV); + return false; + } + + // When we only want to link in unresolved dependencies, blacklist + // the symbol unless unless DestM has a matching declaration (DGV). 
+ if (shouldLinkOnlyNeeded() && !(DGV && DGV->isDeclaration())) { + DoNotLinkFromSource.insert(SGV); + return false; + } + NewGV = copyGlobalValueProto(TypeMap, SGV, DGV); if (isPerformingImport() && !doImportAsDefinition(SGV)) @@ -1451,7 +1459,7 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) { NewGV->setUnnamedAddr(HasUnnamedAddr); if (auto *NewGO = dyn_cast(NewGV)) { - if (C && LinkFromSrc) + if (C) NewGO->setComdat(C); if (DGV && DGV->hasCommonLinkage() && SGV->hasCommonLinkage()) @@ -1834,38 +1842,6 @@ static std::string mergeTriples(const Triple &SrcTriple, const Triple &DstTriple return DstTriple.str(); } -bool ModuleLinker::linkIfNeeded(GlobalValue &GV) { - GlobalValue *DGV = getLinkedToGlobal(&GV); - - if (shouldLinkOnlyNeeded() && !(DGV && DGV->isDeclaration())) - return false; - - if (DGV && !GV.hasLocalLinkage()) { - GlobalValue::VisibilityTypes Visibility = - getMinVisibility(DGV->getVisibility(), GV.getVisibility()); - DGV->setVisibility(Visibility); - GV.setVisibility(Visibility); - } - - if (const Comdat *SC = GV.getComdat()) { - bool LinkFromSrc; - Comdat::SelectionKind SK; - std::tie(SK, LinkFromSrc) = ComdatsChosen[SC]; - if (!LinkFromSrc) { - DoNotLinkFromSource.insert(&GV); - return false; - } - } - - if (!DGV && !shouldOverrideFromSrc() && - (GV.hasLocalLinkage() || GV.hasLinkOnceLinkage() || - GV.hasAvailableExternallyLinkage())) { - return false; - } - MapValue(&GV, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer); - return HasError; -} - bool ModuleLinker::run() { assert(DstM && "Null destination module"); assert(SrcM && "Null source module"); @@ -1925,30 +1901,24 @@ bool ModuleLinker::run() { // Upgrade mismatched global arrays. 
upgradeMismatchedGlobals(); - for (GlobalVariable &GV : SrcM->globals()) - if (const Comdat *SC = GV.getComdat()) - ComdatMembers[SC].push_back(&GV); - - for (Function &SF : *SrcM) - if (const Comdat *SC = SF.getComdat()) - ComdatMembers[SC].push_back(&SF); - - for (GlobalAlias &GA : SrcM->aliases()) - if (const Comdat *SC = GA.getComdat()) - ComdatMembers[SC].push_back(&GA); - // Insert all of the globals in src into the DstM module... without linking // initializers (which could refer to functions not yet mapped over). for (GlobalVariable &GV : SrcM->globals()) - if (linkIfNeeded(GV)) + if (linkGlobalValueProto(&GV)) return true; - for (Function &SF : *SrcM) - if (linkIfNeeded(SF)) + // Link the functions together between the two modules, without doing function + // bodies... this just adds external function prototypes to the DstM + // function... We do this so that when we begin processing function bodies, + // all of the global values that may be referenced are available in our + // ValueMap. + for (Function &F :*SrcM) + if (linkGlobalValueProto(&F)) return true; + // If there were any aliases, link them now. for (GlobalAlias &GA : SrcM->aliases()) - if (linkIfNeeded(GA)) + if (linkGlobalValueProto(&GA)) return true; for (AppendingVarInfo &AppendingVar : AppendingVars) @@ -1963,6 +1933,37 @@ bool ModuleLinker::run() { MapValue(GV, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer); } + // Link in the function bodies that are defined in the source module into + // DstM. + for (Function &SF : *SrcM) { + // Skip if no body (function is external). + if (SF.isDeclaration()) + continue; + + // Skip if not linking from source. + if (DoNotLinkFromSource.count(&SF)) + continue; + + if (linkGlobalValueBody(SF)) + return true; + } + + // Resolve all uses of aliases with aliasees. 
+ for (GlobalAlias &Src : SrcM->aliases()) { + if (DoNotLinkFromSource.count(&Src)) + continue; + linkGlobalValueBody(Src); + } + + // Update the initializers in the DstM module now that all globals that may + // be referenced are in DstM. + for (GlobalVariable &Src : SrcM->globals()) { + // Only process initialized GV's or ones not already in dest. + if (!Src.hasInitializer() || DoNotLinkFromSource.count(&Src)) + continue; + linkGlobalValueBody(Src); + } + // Note that we are done linking global value bodies. This prevents // metadata linking from creating new references. DoneLinkingBodies = true; diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp index 00a8984845dd..0a63c1d5153c 100644 --- a/lib/Transforms/Utils/ValueMapper.cpp +++ b/lib/Transforms/Utils/ValueMapper.cpp @@ -41,9 +41,9 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, if (Value *NewV = Materializer->materializeDeclFor(const_cast(V))) { VM[V] = NewV; - if (auto *NewGV = dyn_cast(NewV)) - Materializer->materializeInitFor( - NewGV, const_cast(cast(V))); + if (auto *GV = dyn_cast(V)) + Materializer->materializeInitFor(cast(NewV), + const_cast(GV)); return NewV; } } diff --git a/test/Linker/Inputs/comdat11.ll b/test/Linker/Inputs/comdat11.ll deleted file mode 100644 index 5b7f74cf0b24..000000000000 --- a/test/Linker/Inputs/comdat11.ll +++ /dev/null @@ -1,9 +0,0 @@ -$foo = comdat any -@foo = global i8 1, comdat -define void @zed() { - call void @bar() - ret void -} -define void @bar() comdat($foo) { - ret void -} diff --git a/test/Linker/comdat11.ll b/test/Linker/comdat11.ll deleted file mode 100644 index dbade4104fe3..000000000000 --- a/test/Linker/comdat11.ll +++ /dev/null @@ -1,13 +0,0 @@ -; RUN: llvm-link -S %s %p/Inputs/comdat11.ll -o - | FileCheck %s - -$foo = comdat any -@foo = global i8 0, comdat - -; CHECK: @foo = global i8 0, comdat - -; CHECK: define void @zed() { -; CHECK: call void @bar() -; CHECK: ret void -; CHECK: } - 
-; CHECK: declare void @bar() diff --git a/test/Linker/comdat12.ll b/test/Linker/comdat12.ll deleted file mode 100644 index d06e222b63ac..000000000000 --- a/test/Linker/comdat12.ll +++ /dev/null @@ -1,8 +0,0 @@ -; RUN: llvm-link %s -S -o - | FileCheck %s - -$foo = comdat largest -define internal void @foo() comdat($foo) { - ret void -} - -; CHECK-NOT: foo diff --git a/test/Linker/comdat9.ll b/test/Linker/comdat9.ll index 4f6f2cfb845d..274957401aac 100644 --- a/test/Linker/comdat9.ll +++ b/test/Linker/comdat9.ll @@ -14,9 +14,6 @@ $f2 = comdat largest define internal void @f2() comdat($f2) { ret void } -define void @f3() comdat($f2) { - ret void -} ; CHECK-DAG: $f2 = comdat largest ; CHECK-DAG: define internal void @f2() comdat { From 92989cbe8452a9406f6e0d3c5709b5745f27158a Mon Sep 17 00:00:00 2001 From: Cong Hou Date: Tue, 1 Dec 2015 00:02:51 +0000 Subject: [PATCH 031/186] Replace all weight-based interfaces in MBB with probability-based interfaces, and update all uses of old interfaces. The patch in http://reviews.llvm.org/D13745 is broken into four parts: 1. New interfaces without functional changes (http://reviews.llvm.org/D13908). 2. Use new interfaces in SelectionDAG, while in other passes treat probabilities as weights (http://reviews.llvm.org/D14361). 3. Use new interfaces in all other passes. 4. Remove old interfaces. This patch is 3+4 above. In this patch, MBB won't provide weight-based interfaces any more, which are totally replaced by probability-based ones. The interface addSuccessor() is redesigned so that the default probability is unknown. We allow unknown probabilities but don't allow using it together with known probabilities in successor list. That is to say, we either have a list of successors with all known probabilities, or all unknown probabilities. In the latter case, we assume each successor has 1/N probability where N is the number of successors. 
An assertion checks if the user is attempting to add a successor with the disallowed mixed use as stated above. This can help us catch many misuses. All uses of weight-based interfaces are now updated to use probability-based ones. Differential revision: http://reviews.llvm.org/D14973 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254348 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/BranchProbabilityInfo.h | 3 + include/llvm/CodeGen/MachineBasicBlock.h | 46 +----- .../CodeGen/MachineBranchProbabilityInfo.h | 22 +-- include/llvm/Support/BranchProbability.h | 31 +++- lib/Analysis/BranchProbabilityInfo.cpp | 6 + lib/CodeGen/BranchFolding.cpp | 9 +- lib/CodeGen/IfConversion.cpp | 155 ++++++++---------- lib/CodeGen/MIRParser/MIParser.cpp | 3 +- lib/CodeGen/MIRPrinter.cpp | 4 +- lib/CodeGen/MachineBasicBlock.cpp | 142 ++++------------ lib/CodeGen/MachineBlockPlacement.cpp | 67 +++----- lib/CodeGen/MachineBranchProbabilityInfo.cpp | 82 +++------ lib/CodeGen/TailDuplication.cpp | 6 +- lib/Support/BranchProbability.cpp | 20 ++- lib/Target/AMDGPU/AMDILCFGStructurizer.cpp | 6 +- lib/Target/ARM/ARMConstantIslandPass.cpp | 3 +- lib/Target/ARM/ARMISelLowering.cpp | 2 +- lib/Target/Hexagon/HexagonCFGOptimizer.cpp | 6 +- lib/Target/Mips/MipsLongBranch.cpp | 3 +- test/CodeGen/ARM/ifcvt-branch-weight-bug.ll | 14 +- test/CodeGen/ARM/ifcvt-branch-weight.ll | 2 +- test/CodeGen/ARM/ifcvt-iter-indbr.ll | 10 +- test/CodeGen/ARM/tail-merge-branch-weight.ll | 2 +- test/CodeGen/ARM/taildup-branch-weight.ll | 4 +- test/CodeGen/Generic/MachineBranchProb.ll | 8 +- test/CodeGen/Hexagon/ifcvt-edge-weight.ll | 2 +- test/CodeGen/MIR/X86/newline-handling.mir | 4 +- .../X86/successor-basic-blocks-weights.mir | 6 +- .../MIR/X86/successor-basic-blocks.mir | 4 +- test/CodeGen/X86/MachineBranchProb.ll | 4 +- test/CodeGen/X86/catchpad-weight.ll | 2 +- test/CodeGen/X86/stack-protector-weight.ll | 4 +- test/CodeGen/X86/switch-edge-weight.ll | 22 +-- 
test/CodeGen/X86/switch-jump-table.ll | 8 +- 34 files changed, 292 insertions(+), 420 deletions(-) diff --git a/include/llvm/Analysis/BranchProbabilityInfo.h b/include/llvm/Analysis/BranchProbabilityInfo.h index 89dec14b2b3e..69dae5e90785 100644 --- a/include/llvm/Analysis/BranchProbabilityInfo.h +++ b/include/llvm/Analysis/BranchProbabilityInfo.h @@ -61,6 +61,9 @@ class BranchProbabilityInfo { BranchProbability getEdgeProbability(const BasicBlock *Src, const BasicBlock *Dst) const; + BranchProbability getEdgeProbability(const BasicBlock *Src, + succ_const_iterator Dst) const; + /// \brief Test if an edge is hot relative to other out-edges of the Src. /// /// Check whether this edge out of the source block is 'hot'. We define hot diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h index a2b1a850ec76..ac87f4f901f5 100644 --- a/include/llvm/CodeGen/MachineBasicBlock.h +++ b/include/llvm/CodeGen/MachineBasicBlock.h @@ -91,13 +91,6 @@ class MachineBasicBlock std::vector Predecessors; std::vector Successors; - /// Keep track of the weights to the successors. This vector has the same - /// order as Successors, or it is empty if we don't use it (disable - /// optimization). - std::vector Weights; - typedef std::vector::iterator weight_iterator; - typedef std::vector::const_iterator const_weight_iterator; - /// Keep track of the probabilities to the successors. This vector has the /// same order as Successors, or it is empty if we don't use it (disable /// optimization). @@ -440,26 +433,16 @@ class MachineBasicBlock // Machine-CFG mutators - /// Add Succ as a successor of this MachineBasicBlock. The Predecessors list - /// of Succ is automatically updated. WEIGHT parameter is stored in Weights - /// list and it may be used by MachineBranchProbabilityInfo analysis to - /// calculate branch probability. - /// - /// Note that duplicate Machine CFG edges are not allowed. 
- void addSuccessor(MachineBasicBlock *Succ, uint32_t Weight = 0); - - /// Add Succ as a successor of this MachineBasicBlock. The Predecessors list - /// of Succ is automatically updated. The weight is not provided because BPI - /// is not available (e.g. -O0 is used), in which case edge weights won't be - /// used. Using this interface can save some space. - void addSuccessorWithoutWeight(MachineBasicBlock *Succ); - /// Add Succ as a successor of this MachineBasicBlock. The Predecessors list /// of Succ is automatically updated. PROB parameter is stored in - /// Probabilities list. + /// Probabilities list. The default probability is set as unknown. Mixing + /// known and unknown probabilities in successor list is not allowed. When all + /// successors have unknown probabilities, 1 / N is returned as the + /// probability for each successor, where N is the number of successors. /// /// Note that duplicate Machine CFG edges are not allowed. - void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob); + void addSuccessor(MachineBasicBlock *Succ, + BranchProbability Prob = BranchProbability::getUnknown()); /// Add Succ as a successor of this MachineBasicBlock. The Predecessors list /// of Succ is automatically updated. The probability is not provided because @@ -467,9 +450,6 @@ class MachineBasicBlock /// won't be used. Using this interface can save some space. void addSuccessorWithoutProb(MachineBasicBlock *Succ); - /// Set successor weight of a given iterator. - void setSuccWeight(succ_iterator I, uint32_t Weight); - /// Set successor probability of a given iterator. void setSuccProbability(succ_iterator I, BranchProbability Prob); @@ -488,7 +468,7 @@ class MachineBasicBlock /// Return the iterator to the element after the one removed. succ_iterator removeSuccessor(succ_iterator I); - /// Replace successor OLD with NEW and update weight info. + /// Replace successor OLD with NEW and update probability info. 
void replaceSuccessor(MachineBasicBlock *Old, MachineBasicBlock *New); /// Transfers all the successors from MBB to this machine basic block (i.e., @@ -500,9 +480,6 @@ class MachineBasicBlock /// operands in the successor blocks which refer to FromMBB to refer to this. void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB); - /// Return true if any of the successors have weights attached to them. - bool hasSuccessorWeights() const { return !Weights.empty(); } - /// Return true if any of the successors have probabilities attached to them. bool hasSuccessorProbabilities() const { return !Probs.empty(); } @@ -759,10 +736,6 @@ class MachineBasicBlock private: - /// Return weight iterator corresponding to the I successor iterator. - weight_iterator getWeightIterator(succ_iterator I); - const_weight_iterator getWeightIterator(const_succ_iterator I) const; - /// Return probability iterator corresponding to the I successor iterator. probability_iterator getProbabilityIterator(succ_iterator I); const_probability_iterator @@ -771,11 +744,6 @@ class MachineBasicBlock friend class MachineBranchProbabilityInfo; friend class MIPrinter; - /// Return weight of the edge from this block to MBB. This method should NOT - /// be called directly, but by using getEdgeWeight method from - /// MachineBranchProbabilityInfo class. - uint32_t getSuccWeight(const_succ_iterator Succ) const; - /// Return probability of the edge from this block to MBB. This method should /// NOT be called directly, but by using getEdgeProbability method from /// MachineBranchProbabilityInfo class. 
diff --git a/include/llvm/CodeGen/MachineBranchProbabilityInfo.h b/include/llvm/CodeGen/MachineBranchProbabilityInfo.h index 058ab32f3aa9..608e8d257874 100644 --- a/include/llvm/CodeGen/MachineBranchProbabilityInfo.h +++ b/include/llvm/CodeGen/MachineBranchProbabilityInfo.h @@ -55,10 +55,15 @@ class MachineBranchProbabilityInfo : public ImmutablePass { uint32_t getEdgeWeight(const MachineBasicBlock *Src, MachineBasicBlock::const_succ_iterator Dst) const; - // Get sum of the block successors' weights, potentially scaling them to fit - // within 32-bits. If scaling is required, sets Scale based on the necessary - // adjustment. Any edge weights used with the sum should be divided by Scale. - uint32_t getSumForBlock(const MachineBasicBlock *MBB, uint32_t &Scale) const; + // Return edge probability. + BranchProbability getEdgeProbability(const MachineBasicBlock *Src, + const MachineBasicBlock *Dst) const; + + // Same as above, but using a const_succ_iterator from Src. This is faster + // when the iterator is already available. + BranchProbability + getEdgeProbability(const MachineBasicBlock *Src, + MachineBasicBlock::const_succ_iterator Dst) const; // A 'Hot' edge is an edge which probability is >= 80%. bool isEdgeHot(const MachineBasicBlock *Src, @@ -68,15 +73,6 @@ class MachineBranchProbabilityInfo : public ImmutablePass { // NB: This routine's complexity is linear on the number of successors. MachineBasicBlock *getHotSucc(MachineBasicBlock *MBB) const; - // Return a probability as a fraction between 0 (0% probability) and - // 1 (100% probability), however the value is never equal to 0, and can be 1 - // only iff SRC block has only one successor. - // NB: This routine's complexity is linear on the number of successors of - // Src. Querying sequentially for each successor's probability is a quadratic - // query pattern. 
- BranchProbability getEdgeProbability(const MachineBasicBlock *Src, - const MachineBasicBlock *Dst) const; - // Print value between 0 (0% probability) and 1 (100% probability), // however the value is never equal to 0, and can be 1 only iff SRC block // has only one successor. diff --git a/include/llvm/Support/BranchProbability.h b/include/llvm/Support/BranchProbability.h index 3620d4d5d772..2548384f346b 100644 --- a/include/llvm/Support/BranchProbability.h +++ b/include/llvm/Support/BranchProbability.h @@ -53,6 +53,9 @@ class BranchProbability { // Create a BranchProbability object with the given numerator and 1<<31 // as denominator. static BranchProbability getRaw(uint32_t N) { return BranchProbability(N); } + // Create a BranchProbability object from 64-bit integers. + static BranchProbability getBranchProbability(uint64_t Numerator, + uint64_t Denominator); // Normalize given probabilties so that the sum of them becomes approximate // one. @@ -131,10 +134,30 @@ class BranchProbability { bool operator==(BranchProbability RHS) const { return N == RHS.N; } bool operator!=(BranchProbability RHS) const { return !(*this == RHS); } - bool operator<(BranchProbability RHS) const { return N < RHS.N; } - bool operator>(BranchProbability RHS) const { return RHS < *this; } - bool operator<=(BranchProbability RHS) const { return !(RHS < *this); } - bool operator>=(BranchProbability RHS) const { return !(*this < RHS); } + + bool operator<(BranchProbability RHS) const { + assert(N != UnknownN && RHS.N != UnknownN && + "Unknown probability cannot participate in comparisons."); + return N < RHS.N; + } + + bool operator>(BranchProbability RHS) const { + assert(N != UnknownN && RHS.N != UnknownN && + "Unknown probability cannot participate in comparisons."); + return RHS < *this; + } + + bool operator<=(BranchProbability RHS) const { + assert(N != UnknownN && RHS.N != UnknownN && + "Unknown probability cannot participate in comparisons."); + return !(RHS < *this); + } + + bool 
operator>=(BranchProbability RHS) const { + assert(N != UnknownN && RHS.N != UnknownN && + "Unknown probability cannot participate in comparisons."); + return !(*this < RHS); + } }; inline raw_ostream &operator<<(raw_ostream &OS, BranchProbability Prob) { diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp index f48394698699..6cdf43a06a9f 100644 --- a/lib/Analysis/BranchProbabilityInfo.cpp +++ b/lib/Analysis/BranchProbabilityInfo.cpp @@ -647,6 +647,12 @@ getEdgeProbability(const BasicBlock *Src, const BasicBlock *Dst) const { return BranchProbability(N, D); } +BranchProbability +BranchProbabilityInfo::getEdgeProbability(const BasicBlock *Src, + succ_const_iterator Dst) const { + return getEdgeProbability(Src, Dst.getSuccessorIndex()); +} + raw_ostream & BranchProbabilityInfo::printEdgeProbability(raw_ostream &OS, const BasicBlock *Src, diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp index 0b2495cc996e..ba21d9cc9d55 100644 --- a/lib/CodeGen/BranchFolding.cpp +++ b/lib/CodeGen/BranchFolding.cpp @@ -1099,13 +1099,16 @@ void BranchFolder::setCommonTailEdgeWeights(MachineBasicBlock &TailMBB) { if (TailMBB.succ_size() <= 1) return; - auto MaxEdgeFreq = *std::max_element(EdgeFreqLs.begin(), EdgeFreqLs.end()); - uint64_t Scale = MaxEdgeFreq.getFrequency() / UINT32_MAX + 1; + auto SumEdgeFreq = + std::accumulate(EdgeFreqLs.begin(), EdgeFreqLs.end(), BlockFrequency(0)) + .getFrequency(); auto EdgeFreq = EdgeFreqLs.begin(); for (auto SuccI = TailMBB.succ_begin(), SuccE = TailMBB.succ_end(); SuccI != SuccE; ++SuccI, ++EdgeFreq) - TailMBB.setSuccWeight(SuccI, EdgeFreq->getFrequency() / Scale); + TailMBB.setSuccProbability( + SuccI, BranchProbability::getBranchProbability(EdgeFreq->getFrequency(), + SumEdgeFreq)); } //===----------------------------------------------------------------------===// diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp index 0b2f3ea165f8..ff28f95cc33d 100644 --- 
a/lib/CodeGen/IfConversion.cpp +++ b/lib/CodeGen/IfConversion.cpp @@ -32,6 +32,7 @@ #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include using namespace llvm; @@ -1151,28 +1152,6 @@ bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) { return true; } -/// Scale down weights to fit into uint32_t. NewTrue is the new weight -/// for successor TrueBB, and NewFalse is the new weight for successor -/// FalseBB. -static void ScaleWeights(uint64_t NewTrue, uint64_t NewFalse, - MachineBasicBlock *MBB, - const MachineBasicBlock *TrueBB, - const MachineBasicBlock *FalseBB, - const MachineBranchProbabilityInfo *MBPI) { - uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse; - uint32_t Scale = (NewMax / UINT32_MAX) + 1; - for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(), - SE = MBB->succ_end(); - SI != SE; ++SI) { - if (*SI == TrueBB) - MBB->setSuccWeight(SI, (uint32_t)(NewTrue / Scale)); - else if (*SI == FalseBB) - MBB->setSuccWeight(SI, (uint32_t)(NewFalse / Scale)); - else - MBB->setSuccWeight(SI, MBPI->getEdgeWeight(MBB, SI) / Scale); - } -} - /// IfConvertTriangle - If convert a triangle sub-CFG. /// bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { @@ -1229,16 +1208,14 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { DontKill.clear(); bool HasEarlyExit = CvtBBI->FalseBB != nullptr; - uint64_t CvtNext = 0, CvtFalse = 0, BBNext = 0, BBCvt = 0, SumWeight = 0; - uint32_t WeightScale = 0; + BranchProbability CvtNext, CvtFalse, BBNext, BBCvt; if (HasEarlyExit) { - // Get weights before modifying CvtBBI->BB and BBI.BB. 
- CvtNext = MBPI->getEdgeWeight(CvtBBI->BB, NextBBI->BB); - CvtFalse = MBPI->getEdgeWeight(CvtBBI->BB, CvtBBI->FalseBB); - BBNext = MBPI->getEdgeWeight(BBI.BB, NextBBI->BB); - BBCvt = MBPI->getEdgeWeight(BBI.BB, CvtBBI->BB); - SumWeight = MBPI->getSumForBlock(CvtBBI->BB, WeightScale); + // Get probabilities before modifying CvtBBI->BB and BBI.BB. + CvtNext = MBPI->getEdgeProbability(CvtBBI->BB, NextBBI->BB); + CvtFalse = MBPI->getEdgeProbability(CvtBBI->BB, CvtBBI->FalseBB); + BBNext = MBPI->getEdgeProbability(BBI.BB, NextBBI->BB); + BBCvt = MBPI->getEdgeProbability(BBI.BB, CvtBBI->BB); } if (CvtBBI->BB->pred_size() > 1) { @@ -1266,22 +1243,24 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { CvtBBI->BrCond.end()); if (TII->ReverseBranchCondition(RevCond)) llvm_unreachable("Unable to reverse branch condition!"); + + // Update the edge probability for both CvtBBI->FalseBB and NextBBI. + // NewNext = New_Prob(BBI.BB, NextBBI->BB) = + // Prob(BBI.BB, NextBBI->BB) + + // Prob(BBI.BB, CvtBBI->BB) * Prob(CvtBBI->BB, NextBBI->BB) + // NewFalse = New_Prob(BBI.BB, CvtBBI->FalseBB) = + // Prob(BBI.BB, CvtBBI->BB) * Prob(CvtBBI->BB, CvtBBI->FalseBB) + auto NewTrueBB = getNextBlock(BBI.BB); + auto NewNext = BBNext + BBCvt * CvtNext; + auto NewTrueBBIter = + std::find(BBI.BB->succ_begin(), BBI.BB->succ_end(), NewTrueBB); + assert(NewTrueBBIter != BBI.BB->succ_end() && + "NewTrueBB is not a successor of BBI.BB."); + BBI.BB->setSuccProbability(NewTrueBBIter, NewNext); + + auto NewFalse = BBCvt * CvtFalse; TII->InsertBranch(*BBI.BB, CvtBBI->FalseBB, nullptr, RevCond, dl); - BBI.BB->addSuccessor(CvtBBI->FalseBB); - // Update the edge weight for both CvtBBI->FalseBB and NextBBI. 
- // New_Weight(BBI.BB, NextBBI->BB) = - // Weight(BBI.BB, NextBBI->BB) * getSumForBlock(CvtBBI->BB) + - // Weight(BBI.BB, CvtBBI->BB) * Weight(CvtBBI->BB, NextBBI->BB) - // New_Weight(BBI.BB, CvtBBI->FalseBB) = - // Weight(BBI.BB, CvtBBI->BB) * Weight(CvtBBI->BB, CvtBBI->FalseBB) - - uint64_t NewNext = BBNext * SumWeight + (BBCvt * CvtNext) / WeightScale; - uint64_t NewFalse = (BBCvt * CvtFalse) / WeightScale; - // We need to scale down all weights of BBI.BB to fit uint32_t. - // Here BBI.BB is connected to CvtBBI->FalseBB and will fall through to - // the next block. - ScaleWeights(NewNext, NewFalse, BBI.BB, getNextBlock(BBI.BB), - CvtBBI->FalseBB, MBPI); + BBI.BB->addSuccessor(CvtBBI->FalseBB, NewFalse); } // Merge in the 'false' block if the 'false' block has no other @@ -1524,7 +1503,7 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind, MergeBlocks(BBI, TailBBI); TailBBI.IsDone = true; } else { - BBI.BB->addSuccessor(TailBB); + BBI.BB->addSuccessor(TailBB, BranchProbability::getOne()); InsertUncondBranch(BBI.BB, TailBB, TII); BBI.HasFallThrough = false; } @@ -1688,21 +1667,26 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) { FromBBI.BB->succ_end()); MachineBasicBlock *NBB = getNextBlock(FromBBI.BB); MachineBasicBlock *FallThrough = FromBBI.HasFallThrough ? NBB : nullptr; - - // The edge weight from ToBBI.BB to FromBBI.BB, which is only needed when + // The edge probability from ToBBI.BB to FromBBI.BB, which is only needed when // AddEdges is true and FromBBI.BB is a successor of ToBBI.BB. - uint32_t To2FromWeight = 0; - // WeightScale and SumWeight are for calculating successor probabilities of - // FromBBI.BB. 
- uint32_t WeightScale = 0; - uint32_t SumWeight = 0; + auto To2FromProb = BranchProbability::getZero(); if (AddEdges && ToBBI.BB->isSuccessor(FromBBI.BB)) { - To2FromWeight = MBPI->getEdgeWeight(ToBBI.BB, FromBBI.BB); - // Set the edge weight from ToBBI.BB to FromBBI.BB to zero to avoid the edge - // weight being merged to other edges when this edge is removed later. - ToBBI.BB->setSuccWeight( - std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), FromBBI.BB), 0); - SumWeight = MBPI->getSumForBlock(FromBBI.BB, WeightScale); + To2FromProb = MBPI->getEdgeProbability(ToBBI.BB, FromBBI.BB); + // Set the edge probability from ToBBI.BB to FromBBI.BB to zero to avoid the + // edge probability being merged to other edges when this edge is removed + // later. + ToBBI.BB->setSuccProbability( + std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), FromBBI.BB), + BranchProbability::getZero()); + } + + if (AddEdges && ToBBI.BB->isSuccessor(FromBBI.BB)) { + // Set the edge probability from ToBBI.BB to FromBBI.BB to zero to avoid the + // edge probability being merged to other edges when this edge is removed + // later. + ToBBI.BB->setSuccProbability( + std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), FromBBI.BB), + BranchProbability::getZero()); } for (unsigned i = 0, e = FromSuccs.size(); i != e; ++i) { @@ -1711,39 +1695,38 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) { if (Succ == FallThrough) continue; - uint32_t NewWeight = 0; + auto NewProb = BranchProbability::getZero(); if (AddEdges) { - // Calculate the edge weight for the edge from ToBBI.BB to Succ, which is - // a portion of the edge weight from FromBBI.BB to Succ. The portion ratio - // is the edge probability from ToBBI.BB to FromBBI.BB (if FromBBI is a - // successor of ToBBI.BB. See comment below for excepion). 
- NewWeight = MBPI->getEdgeWeight(FromBBI.BB, Succ); + // Calculate the edge probability for the edge from ToBBI.BB to Succ, + // which is a portion of the edge probability from FromBBI.BB to Succ. The + // portion ratio is the edge probability from ToBBI.BB to FromBBI.BB (if + // FromBBI is a successor of ToBBI.BB. See comment below for excepion). + NewProb = MBPI->getEdgeProbability(FromBBI.BB, Succ); - // To2FromWeight is 0 when FromBBI.BB is not a successor of ToBBI.BB. This + // To2FromProb is 0 when FromBBI.BB is not a successor of ToBBI.BB. This // only happens when if-converting a diamond CFG and FromBBI.BB is the // tail BB. In this case FromBBI.BB post-dominates ToBBI.BB and hence we - // could just use the weights on FromBBI.BB's out-edges when adding new - // successors. - if (To2FromWeight > 0) { - BranchProbability Prob(NewWeight / WeightScale, SumWeight); - NewWeight = Prob.scale(To2FromWeight); - } + // could just use the probabilities on FromBBI.BB's out-edges when adding + // new successors. + if (!To2FromProb.isZero()) + NewProb *= To2FromProb; } FromBBI.BB->removeSuccessor(Succ); if (AddEdges) { - // If the edge from ToBBI.BB to Succ already exists, update the weight of - // this edge by adding NewWeight to it. An example is shown below, in - // which A is ToBBI.BB and B is FromBBI.BB. In this case we don't have to - // set C as A's successor as it already is. We only need to update the - // edge weight on A->C. Note that B will not be immediately removed from - // A's successors. It is possible that B->D is not removed either if D is - // a fallthrough of B. Later the edge A->D (generated here) and B->D will - // be combined into one edge. To maintain correct edge weight of this - // combined edge, we need to set the edge weight of A->B to zero, which is - // already done above. The edge weight on A->D is calculated by scaling - // the original weight on A->B by the probability of B->D. 
+ // If the edge from ToBBI.BB to Succ already exists, update the + // probability of this edge by adding NewWeight to it. An example is shown + // below, in which A is ToBBI.BB and B is FromBBI.BB. In this case we + // don't have to set C as A's successor as it already is. We only need to + // update the edge probability on A->C. Note that B will not be + // immediately removed from A's successors. It is possible that B->D is + // not removed either if D is a fallthrough of B. Later the edge A->D + // (generated here) and B->D will be combined into one edge. To maintain + // correct edge probability of this combined edge, we need to set the edge + // probability of A->B to zero, which is already done above. The edge + // probability on A->D is calculated by scaling the original probability + // on A->B by the probability of B->D. // // Before ifcvt: After ifcvt (assume B->D is kept): // @@ -1755,11 +1738,11 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) { // C D C D // if (ToBBI.BB->isSuccessor(Succ)) - ToBBI.BB->setSuccWeight( + ToBBI.BB->setSuccProbability( std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), Succ), - MBPI->getEdgeWeight(ToBBI.BB, Succ) + NewWeight); + MBPI->getEdgeProbability(ToBBI.BB, Succ) + NewProb); else - ToBBI.BB->addSuccessor(Succ, NewWeight); + ToBBI.BB->addSuccessor(Succ, NewProb); } } diff --git a/lib/CodeGen/MIRParser/MIParser.cpp b/lib/CodeGen/MIRParser/MIParser.cpp index 5a8e96df7603..c9c2d62cec30 100644 --- a/lib/CodeGen/MIRParser/MIParser.cpp +++ b/lib/CodeGen/MIRParser/MIParser.cpp @@ -459,8 +459,9 @@ bool MIParser::parseBasicBlockSuccessors(MachineBasicBlock &MBB) { if (expectAndConsume(MIToken::rparen)) return true; } - MBB.addSuccessor(SuccMBB, Weight); + MBB.addSuccessor(SuccMBB, BranchProbability::getRaw(Weight)); } while (consumeIfPresent(MIToken::comma)); + MBB.normalizeSuccProbs(); return false; } diff --git a/lib/CodeGen/MIRPrinter.cpp b/lib/CodeGen/MIRPrinter.cpp index 
0be7807064fb..175cb0d51437 100644 --- a/lib/CodeGen/MIRPrinter.cpp +++ b/lib/CodeGen/MIRPrinter.cpp @@ -461,8 +461,8 @@ void MIPrinter::print(const MachineBasicBlock &MBB) { if (I != MBB.succ_begin()) OS << ", "; printMBBReference(**I); - if (MBB.hasSuccessorWeights()) - OS << '(' << MBB.getSuccWeight(I) << ')'; + if (MBB.hasSuccessorProbabilities()) + OS << '(' << MBB.getSuccProbability(I) << ')'; } OS << "\n"; HasLineAttributes = true; diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index 602b75182fca..c9c6a9d62462 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -319,8 +319,8 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST, OS << " Successors according to CFG:"; for (const_succ_iterator SI = succ_begin(), E = succ_end(); SI != E; ++SI) { OS << " BB#" << (*SI)->getNumber(); - if (!Weights.empty()) - OS << '(' << *getWeightIterator(SI) << ')'; + if (!Probs.empty()) + OS << '(' << *getProbabilityIterator(SI) << ')'; } OS << '\n'; } @@ -506,34 +506,16 @@ void MachineBasicBlock::updateTerminator() { } } -void MachineBasicBlock::addSuccessor(MachineBasicBlock *Succ, uint32_t Weight) { - // Weight list is either empty (if successor list isn't empty, this means - // disabled optimization) or has the same size as successor list. - if (!(Weights.empty() && !Successors.empty())) - Weights.push_back(Weight); - Successors.push_back(Succ); - Succ->addPredecessor(this); -} - -void MachineBasicBlock::addSuccessorWithoutWeight(MachineBasicBlock *Succ) { - // We need to make sure weight list is either empty or has the same size of - // successor list. When this function is called, we can safely delete all - // weight in the list. 
- Weights.clear(); - Successors.push_back(Succ); - Succ->addPredecessor(this); -} - void MachineBasicBlock::addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob) { // Probability list is either empty (if successor list isn't empty, this means // disabled optimization) or has the same size as successor list. if (!(Probs.empty() && !Successors.empty())) { + assert((Probs.empty() || (Prob.isUnknown() && Probs.back().isUnknown()) || + (!Prob.isUnknown() && !Probs.back().isUnknown())) && + "Successors with both known and unknwon probabilities are not " + "allowed."); Probs.push_back(Prob); - // FIXME: Temporarily use the numerator of the probability to represent edge - // weight. This will be removed once all weight-version interfaces in MBB - // are replaced with probability-version interfaces. - Weights.push_back(Prob.getNumerator()); } Successors.push_back(Succ); Succ->addPredecessor(this); @@ -544,7 +526,6 @@ void MachineBasicBlock::addSuccessorWithoutProb(MachineBasicBlock *Succ) { // of successor list. When this function is called, we can safely delete all // probability in the list. Probs.clear(); - Weights.clear(); Successors.push_back(Succ); Succ->addPredecessor(this); } @@ -558,23 +539,12 @@ MachineBasicBlock::succ_iterator MachineBasicBlock::removeSuccessor(succ_iterator I) { assert(I != Successors.end() && "Not a current successor!"); - // If Weight list is empty it means we don't use it (disabled optimization). - if (!Weights.empty()) { - weight_iterator WI = getWeightIterator(I); - Weights.erase(WI); - } - - // FIXME: Temporarily comment the following code as probabilities are now only - // used during instruction lowering, but this interface is called in later - // passes. Uncomment it once all edge weights are replaced with probabilities. -#if 0 // If probability list is empty it means we don't use it (disabled // optimization). 
if (!Probs.empty()) { probability_iterator WI = getProbabilityIterator(I); Probs.erase(WI); } -#endif (*I)->removePredecessor(this); return Successors.erase(I); @@ -611,17 +581,12 @@ void MachineBasicBlock::replaceSuccessor(MachineBasicBlock *Old, } // New is already a successor. - // Update its weight instead of adding a duplicate edge. - if (!Weights.empty()) - *getWeightIterator(NewI) += *getWeightIterator(OldI); - // FIXME: Temporarily comment the following code as probabilities are now only - // used during instruction lowering, but this interface is called in later - // passes. Uncomment it once all edge weights are replaced with probabilities. -#if 0 // Update its probability instead of adding a duplicate edge. - if (!Probs.empty()) - *getProbabilityIterator(NewI) += *getProbabilityIterator(OldI); -#endif + if (!Probs.empty()) { + auto ProbIter = getProbabilityIterator(NewI); + if (!ProbIter->isUnknown()) + *ProbIter += *getProbabilityIterator(OldI); + } removeSuccessor(OldI); } @@ -641,13 +606,14 @@ void MachineBasicBlock::transferSuccessors(MachineBasicBlock *FromMBB) { while (!FromMBB->succ_empty()) { MachineBasicBlock *Succ = *FromMBB->succ_begin(); - uint32_t Weight = 0; - // If Weight list is empty it means we don't use it (disabled optimization). - if (!FromMBB->Weights.empty()) - Weight = *FromMBB->Weights.begin(); + // If probability list is empty it means we don't use it (disabled optimization). 
+ if (!FromMBB->Probs.empty()) { + auto Prob = *FromMBB->Probs.begin(); + addSuccessor(Succ, Prob); + } else + addSuccessorWithoutProb(Succ); - addSuccessor(Succ, Weight); FromMBB->removeSuccessor(Succ); } } @@ -659,10 +625,11 @@ MachineBasicBlock::transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB) { while (!FromMBB->succ_empty()) { MachineBasicBlock *Succ = *FromMBB->succ_begin(); - uint32_t Weight = 0; - if (!FromMBB->Weights.empty()) - Weight = *FromMBB->Weights.begin(); - addSuccessor(Succ, Weight); + if (!FromMBB->Probs.empty()) { + auto Prob = *FromMBB->Probs.begin(); + addSuccessor(Succ, Prob); + } else + addSuccessorWithoutProb(Succ); FromMBB->removeSuccessor(Succ); // Fix up any PHI nodes in the successor. @@ -1146,80 +1113,37 @@ MachineBasicBlock::findDebugLoc(instr_iterator MBBI) { return DL; } -/// Return weight of the edge from this block to MBB. -uint32_t MachineBasicBlock::getSuccWeight(const_succ_iterator Succ) const { - if (Weights.empty()) - return 0; - - return *getWeightIterator(Succ); -} - -/// Return probability of the edge from this block to MBB. If probability list -/// is empty, return a default probability which is 1/N, where N is the number -/// of successors. If the probability of the given successor is unknown, then -/// sum up all known probabilities and return the complement of the sum divided -/// by the number of unknown probabilities. +/// Return probability of the edge from this block to MBB. BranchProbability MachineBasicBlock::getSuccProbability(const_succ_iterator Succ) const { - if (Probs.empty()) + if (Probs.empty() || Probs.back().isUnknown()) return BranchProbability(1, succ_size()); - auto Prob = *getProbabilityIterator(Succ); - assert(!Prob.isUnknown()); - return Prob; -} - -/// Set successor weight of a given iterator. 
-void MachineBasicBlock::setSuccWeight(succ_iterator I, uint32_t Weight) { - if (Weights.empty()) - return; - *getWeightIterator(I) = Weight; + return *getProbabilityIterator(Succ); } /// Set successor probability of a given iterator. void MachineBasicBlock::setSuccProbability(succ_iterator I, BranchProbability Prob) { assert(!Prob.isUnknown()); - if (Probs.empty() || Weights.empty()) + if (Probs.empty()) return; *getProbabilityIterator(I) = Prob; - // FIXME: Temporarily use the numerator of the probability to represent edge - // weight. This will be removed once all weight-version interfaces in MBB - // are replaces with probability-version interfaces. - *getWeightIterator(I) = Prob.getNumerator(); -} - -/// Return wight iterator corresonding to the I successor iterator. -MachineBasicBlock::weight_iterator MachineBasicBlock:: -getWeightIterator(MachineBasicBlock::succ_iterator I) { - assert(Weights.size() == Successors.size() && "Async weight list!"); - size_t index = std::distance(Successors.begin(), I); - assert(index < Weights.size() && "Not a current successor!"); - return Weights.begin() + index; } -/// Return wight iterator corresonding to the I successor iterator. -MachineBasicBlock::const_weight_iterator MachineBasicBlock:: -getWeightIterator(MachineBasicBlock::const_succ_iterator I) const { - assert(Weights.size() == Successors.size() && "Async weight list!"); - const size_t index = std::distance(Successors.begin(), I); - assert(index < Weights.size() && "Not a current successor!"); - return Weights.begin() + index; -} - -/// Return probability iterator corresonding to the I successor iterator. 
-MachineBasicBlock::probability_iterator -MachineBasicBlock::getProbabilityIterator(MachineBasicBlock::succ_iterator I) { +/// Return probability iterator corresonding to the I successor iterator +MachineBasicBlock::const_probability_iterator +MachineBasicBlock::getProbabilityIterator( + MachineBasicBlock::const_succ_iterator I) const { assert(Probs.size() == Successors.size() && "Async probability list!"); const size_t index = std::distance(Successors.begin(), I); assert(index < Probs.size() && "Not a current successor!"); return Probs.begin() + index; } -/// Return probability iterator corresonding to the I successor iterator -MachineBasicBlock::const_probability_iterator -MachineBasicBlock::getProbabilityIterator( - MachineBasicBlock::const_succ_iterator I) const { +/// Return probability iterator corresonding to the I successor iterator. +MachineBasicBlock::probability_iterator +MachineBasicBlock::getProbabilityIterator(MachineBasicBlock::succ_iterator I) { assert(Probs.size() == Successors.size() && "Async probability list!"); const size_t index = std::distance(Successors.begin(), I); assert(index < Probs.size() && "Not a current successor!"); diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp index fba33eb93d5f..ddddd483e801 100644 --- a/lib/CodeGen/MachineBlockPlacement.cpp +++ b/lib/CodeGen/MachineBlockPlacement.cpp @@ -380,19 +380,11 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, const BranchProbability HotProb(4, 5); // 80% MachineBasicBlock *BestSucc = nullptr; - // FIXME: Due to the performance of the probability and weight routines in - // the MBPI analysis, we manually compute probabilities using the edge - // weights. This is suboptimal as it means that the somewhat subtle - // definition of edge weight semantics is encoded here as well. We should - // improve the MBPI interface to efficiently support query patterns such as - // this. 
- uint32_t BestWeight = 0; - uint32_t WeightScale = 0; - uint32_t SumWeight = MBPI->getSumForBlock(BB, WeightScale); - - // Adjust sum of weights by excluding weights on edges pointing to blocks that - // is either not in BlockFilter or is already in the current chain. Consider - // the following CFG: + auto BestProb = BranchProbability::getZero(); + + // Adjust edge probabilities by excluding edges pointing to blocks that is + // either not in BlockFilter or is already in the current chain. Consider the + // following CFG: // // --->A // | / \ @@ -406,7 +398,7 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, // HotProb). If we exclude E that is not in BlockFilter when calculating the // probability of C->D, D will be selected and we will get A C D B as the // layout of this loop. - uint32_t AdjustedSumWeight = SumWeight; + auto AdjustedSumProb = BranchProbability::getOne(); SmallVector Successors; for (MachineBasicBlock *Succ : BB->successors()) { bool SkipSucc = false; @@ -424,15 +416,16 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, } } if (SkipSucc) - AdjustedSumWeight -= MBPI->getEdgeWeight(BB, Succ) / WeightScale; + AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ); else Successors.push_back(Succ); } DEBUG(dbgs() << "Attempting merge from: " << getBlockName(BB) << "\n"); for (MachineBasicBlock *Succ : Successors) { - uint32_t SuccWeight = MBPI->getEdgeWeight(BB, Succ); - BranchProbability SuccProb(SuccWeight / WeightScale, AdjustedSumWeight); + BranchProbability SuccProb( + MBPI->getEdgeProbability(BB, Succ).getNumerator(), + AdjustedSumProb.getNumerator()); // If we outline optional branches, look whether Succ is unavoidable, i.e. // dominates all terminators of the MachineFunction. If it does, other @@ -470,7 +463,7 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, // Make sure that a hot successor doesn't have a globally more // important predecessor. 
- BranchProbability RealSuccProb(SuccWeight / WeightScale, SumWeight); + auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ); BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb * HotProb.getCompl(); bool BadCFGConflict = false; @@ -496,10 +489,10 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, << " (prob)" << (SuccChain.LoopPredecessors != 0 ? " (CFG break)" : "") << "\n"); - if (BestSucc && BestWeight >= SuccWeight) + if (BestSucc && BestProb >= SuccProb) continue; BestSucc = Succ; - BestWeight = SuccWeight; + BestProb = SuccProb; } return BestSucc; } @@ -728,11 +721,6 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L, MachineBasicBlock *OldExitingBB = ExitingBB; BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq; bool HasLoopingSucc = false; - // FIXME: Due to the performance of the probability and weight routines in - // the MBPI analysis, we use the internal weights and manually compute the - // probabilities to avoid quadratic behavior. 
- uint32_t WeightScale = 0; - uint32_t SumWeight = MBPI->getSumForBlock(MBB, WeightScale); for (MachineBasicBlock *Succ : MBB->successors()) { if (Succ->isEHPad()) continue; @@ -746,10 +734,10 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L, continue; } - uint32_t SuccWeight = MBPI->getEdgeWeight(MBB, Succ); + auto SuccProb = MBPI->getEdgeProbability(MBB, Succ); if (LoopBlockSet.count(Succ)) { DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> " - << getBlockName(Succ) << " (" << SuccWeight << ")\n"); + << getBlockName(Succ) << " (" << SuccProb << ")\n"); HasLoopingSucc = true; continue; } @@ -761,7 +749,6 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L, BlocksExitingToOuterLoop.insert(MBB); } - BranchProbability SuccProb(SuccWeight / WeightScale, SumWeight); BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb; DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> " << getBlockName(Succ) << " [L:" << SuccLoopDepth << "] ("; @@ -904,21 +891,17 @@ void MachineBlockPlacement::rotateLoopWithProfile( // edge from the tail of the loop chain. 
SmallVector, 4> ExitsWithFreq; for (auto BB : LoopChain) { - uint32_t LargestExitEdgeWeight = 0; + auto LargestExitEdgeProb = BranchProbability::getZero(); for (auto *Succ : BB->successors()) { BlockChain *SuccChain = BlockToChain[Succ]; if (!LoopBlockSet.count(Succ) && (!SuccChain || Succ == *SuccChain->begin())) { - uint32_t SuccWeight = MBPI->getEdgeWeight(BB, Succ); - LargestExitEdgeWeight = std::max(LargestExitEdgeWeight, SuccWeight); + auto SuccProb = MBPI->getEdgeProbability(BB, Succ); + LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb); } } - if (LargestExitEdgeWeight > 0) { - uint32_t WeightScale = 0; - uint32_t SumWeight = MBPI->getSumForBlock(BB, WeightScale); - auto ExitFreq = - MBFI->getBlockFreq(BB) * - BranchProbability(LargestExitEdgeWeight / WeightScale, SumWeight); + if (LargestExitEdgeProb > BranchProbability::getZero()) { + auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb; ExitsWithFreq.emplace_back(BB, ExitFreq); } } @@ -1290,14 +1273,16 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { } // If PrevBB has a two-way branch, try to re-order the branches - // such that we branch to the successor with higher weight first. + // such that we branch to the successor with higher probability first. 
if (TBB && !Cond.empty() && FBB && - MBPI->getEdgeWeight(PrevBB, FBB) > MBPI->getEdgeWeight(PrevBB, TBB) && + MBPI->getEdgeProbability(PrevBB, FBB) > + MBPI->getEdgeProbability(PrevBB, TBB) && !TII->ReverseBranchCondition(Cond)) { DEBUG(dbgs() << "Reverse order of the two branches: " << getBlockName(PrevBB) << "\n"); - DEBUG(dbgs() << " Edge weight: " << MBPI->getEdgeWeight(PrevBB, FBB) - << " vs " << MBPI->getEdgeWeight(PrevBB, TBB) << "\n"); + DEBUG(dbgs() << " Edge probability: " + << MBPI->getEdgeProbability(PrevBB, FBB) << " vs " + << MBPI->getEdgeProbability(PrevBB, TBB) << "\n"); DebugLoc dl; // FIXME: this is nowhere TII->RemoveBranch(*PrevBB); TII->InsertBranch(*PrevBB, FBB, TBB, Cond, dl); diff --git a/lib/CodeGen/MachineBranchProbabilityInfo.cpp b/lib/CodeGen/MachineBranchProbabilityInfo.cpp index 6fbc2be70486..5478dcba261a 100644 --- a/lib/CodeGen/MachineBranchProbabilityInfo.cpp +++ b/lib/CodeGen/MachineBranchProbabilityInfo.cpp @@ -28,91 +28,61 @@ char MachineBranchProbabilityInfo::ID = 0; void MachineBranchProbabilityInfo::anchor() { } -uint32_t MachineBranchProbabilityInfo:: -getSumForBlock(const MachineBasicBlock *MBB, uint32_t &Scale) const { - // First we compute the sum with 64-bits of precision, ensuring that cannot - // overflow by bounding the number of weights considered. Hopefully no one - // actually needs 2^32 successors. - assert(MBB->succ_size() < UINT32_MAX); - uint64_t Sum = 0; - Scale = 1; - for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(), - E = MBB->succ_end(); I != E; ++I) { - uint32_t Weight = getEdgeWeight(MBB, I); - Sum += Weight; - } - - // If the computed sum fits in 32-bits, we're done. 
- if (Sum <= UINT32_MAX) - return Sum; +uint32_t MachineBranchProbabilityInfo::getEdgeWeight( + const MachineBasicBlock *Src, + MachineBasicBlock::const_succ_iterator Dst) const { + return Src->getSuccProbability(Dst).getNumerator(); +} - // Otherwise, compute the scale necessary to cause the weights to fit, and - // re-sum with that scale applied. - assert((Sum / UINT32_MAX) < UINT32_MAX); - Scale = (Sum / UINT32_MAX) + 1; - Sum = 0; - for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(), - E = MBB->succ_end(); I != E; ++I) { - uint32_t Weight = getEdgeWeight(MBB, I); - Sum += Weight / Scale; - } - assert(Sum <= UINT32_MAX); - return Sum; +uint32_t MachineBranchProbabilityInfo::getEdgeWeight( + const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { + // This is a linear search. Try to use the const_succ_iterator version when + // possible. + return getEdgeWeight(Src, std::find(Src->succ_begin(), Src->succ_end(), Dst)); } -uint32_t MachineBranchProbabilityInfo:: -getEdgeWeight(const MachineBasicBlock *Src, - MachineBasicBlock::const_succ_iterator Dst) const { - uint32_t Weight = Src->getSuccWeight(Dst); - if (!Weight) - return DEFAULT_WEIGHT; - return Weight; +BranchProbability MachineBranchProbabilityInfo::getEdgeProbability( + const MachineBasicBlock *Src, + MachineBasicBlock::const_succ_iterator Dst) const { + return Src->getSuccProbability(Dst); } -uint32_t MachineBranchProbabilityInfo:: -getEdgeWeight(const MachineBasicBlock *Src, - const MachineBasicBlock *Dst) const { +BranchProbability MachineBranchProbabilityInfo::getEdgeProbability( + const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { // This is a linear search. Try to use the const_succ_iterator version when // possible. 
- return getEdgeWeight(Src, std::find(Src->succ_begin(), Src->succ_end(), Dst)); + return getEdgeProbability(Src, + std::find(Src->succ_begin(), Src->succ_end(), Dst)); } bool MachineBranchProbabilityInfo::isEdgeHot(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { // Hot probability is at least 4/5 = 80% - // FIXME: Compare against a static "hot" BranchProbability. - return getEdgeProbability(Src, Dst) > BranchProbability(4, 5); + static BranchProbability HotProb(4, 5); + return getEdgeProbability(Src, Dst) > HotProb; } MachineBasicBlock * MachineBranchProbabilityInfo::getHotSucc(MachineBasicBlock *MBB) const { - uint32_t MaxWeight = 0; + auto MaxProb = BranchProbability::getZero(); MachineBasicBlock *MaxSucc = nullptr; for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(), E = MBB->succ_end(); I != E; ++I) { - uint32_t Weight = getEdgeWeight(MBB, I); - if (Weight > MaxWeight) { - MaxWeight = Weight; + auto Prob = getEdgeProbability(MBB, I); + if (Prob > MaxProb) { + MaxProb = Prob; MaxSucc = *I; } } - if (getEdgeProbability(MBB, MaxSucc) >= BranchProbability(4, 5)) + static BranchProbability HotProb(4, 5); + if (getEdgeProbability(MBB, MaxSucc) >= HotProb) return MaxSucc; return nullptr; } -BranchProbability MachineBranchProbabilityInfo::getEdgeProbability( - const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { - uint32_t Scale = 1; - uint32_t D = getSumForBlock(Src, Scale); - uint32_t N = getEdgeWeight(Src, Dst) / Scale; - - return BranchProbability(N, D); -} - raw_ostream &MachineBranchProbabilityInfo::printEdgeProbability( raw_ostream &OS, const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { diff --git a/lib/CodeGen/TailDuplication.cpp b/lib/CodeGen/TailDuplication.cpp index ff86dabfac59..1f5b54866ac6 100644 --- a/lib/CodeGen/TailDuplication.cpp +++ b/lib/CodeGen/TailDuplication.cpp @@ -745,12 +745,12 @@ TailDuplicatePass::duplicateSimpleBB(MachineBasicBlock *TailBB, if (PredTBB) 
TII->InsertBranch(*PredBB, PredTBB, PredFBB, PredCond, DebugLoc()); - uint32_t Weight = MBPI->getEdgeWeight(PredBB, TailBB); + auto Prob = MBPI->getEdgeProbability(PredBB, TailBB); PredBB->removeSuccessor(TailBB); unsigned NumSuccessors = PredBB->succ_size(); assert(NumSuccessors <= 1); if (NumSuccessors == 0 || *PredBB->succ_begin() != NewTarget) - PredBB->addSuccessor(NewTarget, Weight); + PredBB->addSuccessor(NewTarget, Prob); TDBBs.push_back(PredBB); } @@ -858,7 +858,7 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, "TailDuplicate called on block with multiple successors!"); for (MachineBasicBlock::succ_iterator I = TailBB->succ_begin(), E = TailBB->succ_end(); I != E; ++I) - PredBB->addSuccessor(*I, MBPI->getEdgeWeight(TailBB, I)); + PredBB->addSuccessor(*I, MBPI->getEdgeProbability(TailBB, I)); Changed = true; ++NumTailDups; diff --git a/lib/Support/BranchProbability.cpp b/lib/Support/BranchProbability.cpp index 3b0f6e6f06e4..771d02c0aa3c 100644 --- a/lib/Support/BranchProbability.cpp +++ b/lib/Support/BranchProbability.cpp @@ -22,11 +22,14 @@ using namespace llvm; const uint32_t BranchProbability::D; raw_ostream &BranchProbability::print(raw_ostream &OS) const { + if (isUnknown()) + return OS << "?%"; + // Get a percentage rounded to two decimal digits. This avoids // implementation-defined rounding inside printf. 
double Percent = rint(((double)N / D) * 100.0 * 100.0) / 100.0; - OS << format("0x%08" PRIx32 " / 0x%08" PRIx32 " = %.2f%%", N, D, Percent); - return OS; + return OS << format("0x%08" PRIx32 " / 0x%08" PRIx32 " = %.2f%%", N, D, + Percent); } void BranchProbability::dump() const { print(dbgs()) << '\n'; } @@ -43,6 +46,19 @@ BranchProbability::BranchProbability(uint32_t Numerator, uint32_t Denominator) { } } +BranchProbability +BranchProbability::getBranchProbability(uint64_t Numerator, + uint64_t Denominator) { + assert(Numerator <= Denominator && "Probability cannot be bigger than 1!"); + // Scale down Denominator to fit in a 32-bit integer. + int Scale = 0; + while (Denominator > UINT32_MAX) { + Denominator >>= 1; + Scale++; + } + return BranchProbability(Numerator >> Scale, Denominator); +} + // If ConstD is not zero, then replace D by ConstD so that division and modulo // operations by D can be optimized, in case this function is not inlined by the // compiler. diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index f1b383017901..cdbd12092150 100644 --- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -1570,8 +1570,7 @@ void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk, insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc()); insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc()); - DstBlk->addSuccessor(LandMBB); - DstBlk->removeSuccessor(DstBlk); + DstBlk->replaceSuccessor(DstBlk, LandMBB); } @@ -1666,8 +1665,7 @@ AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB, replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB); //srcBlk, oldBlk, newBlk - PredMBB->removeSuccessor(MBB); - PredMBB->addSuccessor(CloneMBB); + PredMBB->replaceSuccessor(MBB, CloneMBB); // add all successor to cloneBlk cloneSuccessorList(CloneMBB, MBB); diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp index 
0bf2d374df6a..e89757c19ecc 100644 --- a/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -2274,8 +2274,7 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) { // Update the CFG. NewBB->addSuccessor(BB); - JTBB->removeSuccessor(BB); - JTBB->addSuccessor(NewBB); + JTBB->replaceSuccessor(BB, NewBB); ++NumJTInserted; return NewBB; diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 0cc41812d71c..e8f3ab65bdbe 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -7346,7 +7346,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, } } - BB->addSuccessor(DispatchBB); + BB->addSuccessor(DispatchBB, BranchProbability::getZero()); // Find the invoke call and mark all of the callee-saved registers as // 'implicit defined' so that they're spilled. This prevents code from diff --git a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp index 96bb61750805..efafdd007289 100644 --- a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp +++ b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp @@ -186,13 +186,11 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) { if (case1 || case2) { InvertAndChangeJumpTarget(MI, UncondTarget); - MBB->removeSuccessor(JumpAroundTarget); - MBB->addSuccessor(UncondTarget); + MBB->replaceSuccessor(JumpAroundTarget, UncondTarget); // Remove the unconditional branch in LayoutSucc. LayoutSucc->erase(LayoutSucc->begin()); - LayoutSucc->removeSuccessor(UncondTarget); - LayoutSucc->addSuccessor(JumpAroundTarget); + LayoutSucc->replaceSuccessor(UncondTarget, JumpAroundTarget); // This code performs the conversion for case 2, which moves // the block to the fall-thru case (BB3 in the code above). 
diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp index d09843ed0e53..e75858a181e5 100644 --- a/lib/Target/Mips/MipsLongBranch.cpp +++ b/lib/Target/Mips/MipsLongBranch.cpp @@ -262,8 +262,7 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { static_cast(Subtarget.getInstrInfo()); MF->insert(FallThroughMBB, LongBrMBB); - MBB->removeSuccessor(TgtMBB); - MBB->addSuccessor(LongBrMBB); + MBB->replaceSuccessor(TgtMBB, LongBrMBB); if (IsPIC) { MachineBasicBlock *BalTgtMBB = MF->CreateMachineBasicBlock(BB); diff --git a/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll b/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll index e17da7a97205..a44c9721d6c1 100644 --- a/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll +++ b/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll @@ -14,15 +14,15 @@ entry: br i1 undef, label %for.end, label %for.body ; Before if conversion, we have -; for.body -> lor.lhs.false.i (62) -; -> for.cond.backedge (62) -; lor.lhs.false.i -> for.cond.backedge (1048575) -; -> cond.false.i (1) +; for.body -> lor.lhs.false.i (50%) +; -> for.cond.backedge (50%) +; lor.lhs.false.i -> for.cond.backedge (100%) +; -> cond.false.i (0%) ; Afer if conversion, we have -; for.body -> for.cond.backedge (130023362) -; -> cond.false.i (62) +; for.body -> for.cond.backedge (100%) +; -> cond.false.i (0%) ; CHECK: BB#1: derived from LLVM BB %for.body -; CHECK: Successors according to CFG: BB#2(4294967291) BB#4(2048) +; CHECK: Successors according to CFG: BB#2(0x7ffffc00 / 0x80000000 = 100.00%) BB#4(0x00000400 / 0x80000000 = 0.00%) for.body: br i1 undef, label %for.cond.backedge, label %lor.lhs.false.i, !prof !1 diff --git a/test/CodeGen/ARM/ifcvt-branch-weight.ll b/test/CodeGen/ARM/ifcvt-branch-weight.ll index f2a1229d0d8a..0de039cde23c 100644 --- a/test/CodeGen/ARM/ifcvt-branch-weight.ll +++ b/test/CodeGen/ARM/ifcvt-branch-weight.ll @@ -19,7 +19,7 @@ bb: br i1 %9, label %return, label %bb2 ; CHECK: BB#2: derived from LLVM BB %bb2 -; CHECK: Successors according 
to CFG: BB#3(4294967289) BB#4(4294967287) +; CHECK: Successors according to CFG: BB#3({{[0-9a-fx/= ]+}}50.00%) BB#4({{[0-9a-fx/= ]+}}50.00%) bb2: %v10 = icmp eq i32 %3, 16 diff --git a/test/CodeGen/ARM/ifcvt-iter-indbr.ll b/test/CodeGen/ARM/ifcvt-iter-indbr.ll index 6ce9bcb56ef4..a96b6e8a1e83 100644 --- a/test/CodeGen/ARM/ifcvt-iter-indbr.ll +++ b/test/CodeGen/ARM/ifcvt-iter-indbr.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple thumbv7s-apple-darwin -asm-verbose=false | FileCheck %s -; RUN: llc < %s -mtriple thumbv7s-apple-darwin -asm-verbose=false -print-machineinstrs=if-converter 2>&1 | FileCheck --check-prefix=CHECK-WEIGHT %s +; RUN: llc < %s -mtriple thumbv7s-apple-darwin -asm-verbose=false -print-machineinstrs=if-converter 2>&1 | FileCheck --check-prefix=CHECK-PROB %s declare i32 @foo(i32) declare i8* @bar(i32, i8*, i8*) @@ -29,10 +29,10 @@ declare i8* @bar(i32, i8*, i8*) ; CHECK-NEXT: [[FOOCALL]]: ; CHECK-NEXT: blx _foo ; -; CHECK-WEIGHT: BB#0: -; CHECK-WEIGHT: Successors according to CFG: BB#1(1073741824) BB#2(536870912) BB#4(536870912) -; CHECK-WEIGHT: BB#1: -; CHECK-WEIGHT: Successors according to CFG: BB#2(1610612736) BB#4(536870912) +; CHECK-PROB: BB#0: +; CHECK-PROB: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}50.00%) BB#2({{[0-9a-fx/= ]+}}25.00%) BB#4({{[0-9a-fx/= ]+}}25.00%) +; CHECK-PROB: BB#1: +; CHECK-PROB: Successors according to CFG: BB#2({{[0-9a-fx/= ]+}}75.00%) BB#4({{[0-9a-fx/= ]+}}25.00%) define i32 @test(i32 %a, i32 %a2, i32* %p, i32* %p2) { entry: diff --git a/test/CodeGen/ARM/tail-merge-branch-weight.ll b/test/CodeGen/ARM/tail-merge-branch-weight.ll index 95b0a202e7ff..f83f28815793 100644 --- a/test/CodeGen/ARM/tail-merge-branch-weight.ll +++ b/test/CodeGen/ARM/tail-merge-branch-weight.ll @@ -9,7 +9,7 @@ ; = 0.2 * 0.4 + 0.8 * 0.7 = 0.64 ; CHECK: # Machine code for function test0: -; CHECK: Successors according to CFG: BB#{{[0-9]+}}(13) BB#{{[0-9]+}}(24) +; CHECK: Successors according to CFG: BB#{{[0-9]+}}({{[0-9a-fx/= ]+}}20.00%) 
BB#{{[0-9]+}}({{[0-9a-fx/= ]+}}80.00%) ; CHECK: BB#{{[0-9]+}}: ; CHECK: BB#{{[0-9]+}}: ; CHECK: # End machine code for function test0. diff --git a/test/CodeGen/ARM/taildup-branch-weight.ll b/test/CodeGen/ARM/taildup-branch-weight.ll index 576c120b444e..799ef62416e6 100644 --- a/test/CodeGen/ARM/taildup-branch-weight.ll +++ b/test/CodeGen/ARM/taildup-branch-weight.ll @@ -3,7 +3,7 @@ ; RUN: | FileCheck %s ; CHECK: Machine code for function test0: -; CHECK: Successors according to CFG: BB#1(67108864) BB#2(2080374784) +; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}3.12%) BB#2({{[0-9a-fx/= ]+}}96.88%) define void @test0(i32 %a, i32 %b, i32* %c, i32* %d) { entry: @@ -30,7 +30,7 @@ B4: !0 = !{!"branch_weights", i32 4, i32 124} ; CHECK: Machine code for function test1: -; CHECK: Successors according to CFG: BB#1(67108864) BB#2(2080374784) +; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}3.12%) BB#2({{[0-9a-fx/= ]+}}96.88%) @g0 = common global i32 0, align 4 diff --git a/test/CodeGen/Generic/MachineBranchProb.ll b/test/CodeGen/Generic/MachineBranchProb.ll index 5a4a4672f7eb..ae3c8da21471 100644 --- a/test/CodeGen/Generic/MachineBranchProb.ll +++ b/test/CodeGen/Generic/MachineBranchProb.ll @@ -16,11 +16,11 @@ entry: i64 5, label %sw.bb1 ], !prof !0 ; CHECK: BB#0: derived from LLVM BB %entry -; CHECK: Successors according to CFG: BB#2(1616928864) BB#4(530554784) +; CHECK: Successors according to CFG: BB#2({{[0-9a-fx/= ]+}}75.29%) BB#4({{[0-9a-fx/= ]+}}24.71%) ; CHECK: BB#4: derived from LLVM BB %entry -; CHECK: Successors according to CFG: BB#1(252645135) BB#5(277909649) +; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}47.62%) BB#5({{[0-9a-fx/= ]+}}52.38%) ; CHECK: BB#5: derived from LLVM BB %entry -; CHECK: Successors according to CFG: BB#1(101058054) BB#3(176851595) +; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}36.36%) BB#3({{[0-9a-fx/= ]+}}63.64%) sw.bb: br label %return @@ -62,7 +62,7 @@ return: ret void ; CHECK-LABEL: 
Machine code for function left_leaning_weight_balanced_tree: ; CHECK: BB#0: derived from LLVM BB %entry ; CHECK-NOT: Successors -; CHECK: Successors according to CFG: BB#8(852677332) BB#9(1294806318) +; CHECK: Successors according to CFG: BB#8({{[0-9a-fx/= ]+}}39.71%) BB#9({{[0-9a-fx/= ]+}}60.29%) } !1 = !{!"branch_weights", diff --git a/test/CodeGen/Hexagon/ifcvt-edge-weight.ll b/test/CodeGen/Hexagon/ifcvt-edge-weight.ll index f84fd95e4fbd..341567e1d02f 100644 --- a/test/CodeGen/Hexagon/ifcvt-edge-weight.ll +++ b/test/CodeGen/Hexagon/ifcvt-edge-weight.ll @@ -2,7 +2,7 @@ ; Check that the edge weights are updated correctly after if-conversion. ; CHECK: BB#3: -; CHECK: Successors according to CFG: BB#2(214748365) BB#1(1932735283) +; CHECK: Successors according to CFG: BB#2({{[0-9a-fx/= ]+}}10.00%) BB#1({{[0-9a-fx/= ]+}}90.00%) @a = external global i32 @d = external global i32 diff --git a/test/CodeGen/MIR/X86/newline-handling.mir b/test/CodeGen/MIR/X86/newline-handling.mir index b5ed3b7f27e1..bce06d540114 100644 --- a/test/CodeGen/MIR/X86/newline-handling.mir +++ b/test/CodeGen/MIR/X86/newline-handling.mir @@ -35,7 +35,7 @@ liveins: # CHECK-LABEL: name: foo # CHECK: body: | # CHECK-NEXT: bb.0.entry: -# CHECK-NEXT: successors: %bb.1.less(0), %bb.2.exit(0) +# CHECK-NEXT: successors: %bb.1.less(0x40000000 / 0x80000000 = 50.00%), %bb.2.exit(0x40000000 / 0x80000000 = 50.00%) # CHECK-NEXT: liveins: %edi # CHECK: CMP32ri8 %edi, 10, implicit-def %eflags # CHECK-NEXT: JG_1 %bb.2.exit, implicit killed %eflags @@ -79,7 +79,7 @@ liveins: # CHECK-LABEL: name: bar # CHECK: body: | # CHECK-NEXT: bb.0.entry: -# CHECK-NEXT: successors: %bb.1.less(0), %bb.2.exit(0) +# CHECK-NEXT: successors: %bb.1.less(0x40000000 / 0x80000000 = 50.00%), %bb.2.exit(0x40000000 / 0x80000000 = 50.00%) # CHECK-NEXT: liveins: %edi # CHECK: CMP32ri8 %edi, 10, implicit-def %eflags # CHECK-NEXT: JG_1 %bb.2.exit, implicit killed %eflags diff --git a/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir 
b/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir index fc5e5d640f7f..64af6121189a 100644 --- a/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir +++ b/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir @@ -1,6 +1,6 @@ # RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s # This test ensures that the MIR parser parses basic block successors and -# weights correctly. +# probabilities correctly. --- | @@ -21,10 +21,10 @@ name: foo body: | ; CHECK-LABEL: bb.0.entry: - ; CHECK: successors: %bb.1.less(16), %bb.2.exit(32) + ; CHECK: successors: %bb.1.less({{[0-9a-fx/= ]+}}33.00%), %bb.2.exit({{[0-9a-fx/= ]+}}67.00%) ; CHECK-LABEL: bb.1.less: bb.0.entry: - successors: %bb.1.less (16), %bb.2.exit(32) + successors: %bb.1.less (33), %bb.2.exit(67) liveins: %edi CMP32ri8 %edi, 10, implicit-def %eflags diff --git a/test/CodeGen/MIR/X86/successor-basic-blocks.mir b/test/CodeGen/MIR/X86/successor-basic-blocks.mir index aa80fe9fbeef..a6c14f70bc7c 100644 --- a/test/CodeGen/MIR/X86/successor-basic-blocks.mir +++ b/test/CodeGen/MIR/X86/successor-basic-blocks.mir @@ -32,7 +32,7 @@ name: foo body: | ; CHECK-LABEL: bb.0.entry: - ; CHECK: successors: %bb.1.less(0), %bb.2.exit(0) + ; CHECK: successors: %bb.1.less(0x40000000 / 0x80000000 = 50.00%), %bb.2.exit(0x40000000 / 0x80000000 = 50.00%) ; CHECK-LABEL: bb.1.less: bb.0.entry: successors: %bb.1.less, %bb.2.exit @@ -58,7 +58,7 @@ body: | ; Verify that we can have multiple lists of successors that will be merged ; into one. 
; CHECK-LABEL: bb.0.entry: - ; CHECK: successors: %bb.1(0), %bb.2(0) + ; CHECK: successors: %bb.1(0x80000000 / 0x80000000 = 100.00%), %bb.2(0x00000000 / 0x80000000 = 0.00%) bb.0.entry: liveins: %edi successors: %bb.1 diff --git a/test/CodeGen/X86/MachineBranchProb.ll b/test/CodeGen/X86/MachineBranchProb.ll index da0bf517ecfa..ee1c658d4c55 100644 --- a/test/CodeGen/X86/MachineBranchProb.ll +++ b/test/CodeGen/X86/MachineBranchProb.ll @@ -18,9 +18,9 @@ for.cond2: ; preds = %for.inc, %for.cond %or.cond = or i1 %tobool, %cmp4 br i1 %or.cond, label %for.inc20, label %for.inc, !prof !0 ; CHECK: BB#1: derived from LLVM BB %for.cond2 -; CHECK: Successors according to CFG: BB#3(32756933) BB#4(2114726715) +; CHECK: Successors according to CFG: BB#3({{[0-9a-fx/= ]+}}1.53%) BB#4({{[0-9a-fx/= ]+}}98.47%) ; CHECK: BB#4: derived from LLVM BB %for.cond2 -; CHECK: Successors according to CFG: BB#3(33264335) BB#2(2114219313) +; CHECK: Successors according to CFG: BB#3({{[0-9a-fx/= ]+}}1.55%) BB#2({{[0-9a-fx/= ]+}}98.45%) for.inc: ; preds = %for.cond2 %shl = shl i32 %bit.0, 1 diff --git a/test/CodeGen/X86/catchpad-weight.ll b/test/CodeGen/X86/catchpad-weight.ll index 9b06f2abc81c..e8b416845ec1 100644 --- a/test/CodeGen/X86/catchpad-weight.ll +++ b/test/CodeGen/X86/catchpad-weight.ll @@ -2,7 +2,7 @@ ; Check if the edge weight to the catchpad is calculated correctly. 
-; CHECK: Successors according to CFG: BB#3(2147481600) BB#1(2048) BB#4(1024) BB#6(512) BB#8(256) +; CHECK: Successors according to CFG: BB#3(0x7ffff100 / 0x80000000 = 100.00%) BB#1(0x00000800 / 0x80000000 = 0.00%) BB#4(0x00000400 / 0x80000000 = 0.00%) BB#6(0x00000200 / 0x80000000 = 0.00%) BB#8(0x00000100 / 0x80000000 = 0.00%) target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64--windows-msvc18.0.0" diff --git a/test/CodeGen/X86/stack-protector-weight.ll b/test/CodeGen/X86/stack-protector-weight.ll index 16877ef70a31..dea66d28e3dd 100644 --- a/test/CodeGen/X86/stack-protector-weight.ll +++ b/test/CodeGen/X86/stack-protector-weight.ll @@ -2,13 +2,13 @@ ; RUN: llc -mtriple=x86_64-apple-darwin -print-machineinstrs=expand-isel-pseudos -enable-selectiondag-sp=false %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=IR ; SELDAG: # Machine code for function test_branch_weights: -; SELDAG: Successors according to CFG: BB#[[SUCCESS:[0-9]+]](2147481600) BB#[[FAILURE:[0-9]+]](2048) +; SELDAG: Successors according to CFG: BB#[[SUCCESS:[0-9]+]]({{[0-9a-fx/= ]+}}100.00%) BB#[[FAILURE:[0-9]+]] ; SELDAG: BB#[[FAILURE]]: ; SELDAG: CALL64pcrel32 ; SELDAG: BB#[[SUCCESS]]: ; IR: # Machine code for function test_branch_weights: -; IR: Successors according to CFG: BB#[[SUCCESS:[0-9]+]](2147481600) BB#[[FAILURE:[0-9]+]](2048) +; IR: Successors according to CFG: BB#[[SUCCESS:[0-9]+]]({{[0-9a-fx/= ]+}}100.00%) BB#[[FAILURE:[0-9]+]] ; IR: BB#[[SUCCESS]]: ; IR: BB#[[FAILURE]]: ; IR: CALL64pcrel32 diff --git a/test/CodeGen/X86/switch-edge-weight.ll b/test/CodeGen/X86/switch-edge-weight.ll index 9026f6f05f0e..6f594868c7ad 100644 --- a/test/CodeGen/X86/switch-edge-weight.ll +++ b/test/CodeGen/X86/switch-edge-weight.ll @@ -34,22 +34,22 @@ sw.epilog: ; CHECK: BB#0: ; BB#0 to BB#4: [0, 1133] (65 = 60 + 5) ; BB#0 to BB#5: [1134, UINT32_MAX] (25 = 20 + 5) -; CHECK: Successors according to CFG: BB#4(1550960411) BB#5(596523235) +; CHECK: Successors according to CFG: 
BB#4({{[0-9a-fx/= ]+}}72.22%) BB#5({{[0-9a-fx/= ]+}}27.78%) ; ; CHECK: BB#4: ; BB#4 to BB#1: [155, 159] (50) ; BB#4 to BB#5: [0, 1133] - [155, 159] (15 = 10 + 5) -; CHECK: Successors according to CFG: BB#1(1193046470) BB#7(357913941) +; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}76.92%) BB#7({{[0-9a-fx/= ]+}}23.08%) ; ; CHECK: BB#5: ; BB#5 to BB#1: {1140} (10) ; BB#5 to BB#6: [1134, UINT32_MAX] - {1140} (15 = 10 + 5) -; CHECK: Successors according to CFG: BB#1(238609294) BB#6(357913941) +; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}40.00%) BB#6({{[0-9a-fx/= ]+}}60.00%) ; ; CHECK: BB#6: ; BB#6 to BB#1: {1134} (10) ; BB#6 to BB#2: [1134, UINT32_MAX] - {1134, 1140} (5) -; CHECK: Successors according to CFG: BB#1(238609294) BB#2(119304647) +; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}66.67%) BB#2({{[0-9a-fx/= ]+}}33.33%) } ; CHECK-LABEL: test2 @@ -102,7 +102,7 @@ sw.epilog: ; CHECK: BB#0: ; BB#0 to BB#6: {0} + [15, UINT32_MAX] (5) ; BB#0 to BB#8: [1, 14] (jump table) (65 = 60 + 5) -; CHECK: Successors according to CFG: BB#6(153391689) BB#8(1994091957) +; CHECK: Successors according to CFG: BB#6({{[0-9a-fx/= ]+}}7.14%) BB#8({{[0-9a-fx/= ]+}}92.86%) ; ; CHECK: BB#8: ; BB#8 to BB#1: {1} (10) @@ -111,7 +111,7 @@ sw.epilog: ; BB#8 to BB#3: {11} (10) ; BB#8 to BB#4: {12} (10) ; BB#8 to BB#5: {13, 14} (20) -; CHECK: Successors according to CFG: BB#1(306783378) BB#6(153391689) BB#2(306783378) BB#3(306783378) BB#4(306783378) BB#5(613566756) +; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}14.29%) BB#6({{[0-9a-fx/= ]+}}7.14%) BB#2({{[0-9a-fx/= ]+}}14.29%) BB#3({{[0-9a-fx/= ]+}}14.29%) BB#4({{[0-9a-fx/= ]+}}14.29%) BB#5({{[0-9a-fx/= ]+}}28.57%) } ; CHECK-LABEL: test3 @@ -163,7 +163,7 @@ sw.epilog: ; CHECK: BB#0: ; BB#0 to BB#6: [0, 9] + [15, UINT32_MAX] {10} ; BB#0 to BB#8: [10, 14] (jump table) (50) -; CHECK: Successors according to CFG: BB#6(357913941) BB#8(1789569705) +; CHECK: Successors according to CFG: 
BB#6({{[0-9a-fx/= ]+}}16.67%) BB#8({{[0-9a-fx/= ]+}}83.33%) ; ; CHECK: BB#8: ; BB#8 to BB#1: {10} (10) @@ -171,7 +171,7 @@ sw.epilog: ; BB#8 to BB#3: {12} (10) ; BB#8 to BB#4: {13} (10) ; BB#8 to BB#5: {14} (10) -; CHECK: Successors according to CFG: BB#1(357913941) BB#2(357913941) BB#3(357913941) BB#4(357913941) BB#5(357913941) +; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}20.00%) BB#2({{[0-9a-fx/= ]+}}20.00%) BB#3({{[0-9a-fx/= ]+}}20.00%) BB#4({{[0-9a-fx/= ]+}}20.00%) BB#5({{[0-9a-fx/= ]+}}20.00%) } ; CHECK-LABEL: test4 @@ -216,12 +216,12 @@ sw.epilog: ; CHECK: BB#0: ; BB#0 to BB#6: [0, 110] + [116, UINT32_MAX] (20) ; BB#0 to BB#7: [111, 115] (bit test) (50) -; CHECK: Successors according to CFG: BB#6(613566756) BB#7(1533916890) +; CHECK: Successors according to CFG: BB#6({{[0-9a-fx/= ]+}}28.57%) BB#7({{[0-9a-fx/= ]+}}71.43%) ; ; CHECK: BB#7: ; BB#7 to BB#2: {111, 114, 115} (30) ; BB#7 to BB#3: {112, 113} (20) -; CHECK: Successors according to CFG: BB#2(920350134) BB#3(613566756) +; CHECK: Successors according to CFG: BB#2({{[0-9a-fx/= ]+}}60.00%) BB#3({{[0-9a-fx/= ]+}}40.00%) } ; CHECK-LABEL: test5 @@ -273,7 +273,7 @@ sw.epilog: ; CHECK: BB#0: ; BB#0 to BB#6: [10, UINT32_MAX] (15) ; BB#0 to BB#8: [1, 5, 7, 9] (jump table) (45) -; CHECK: Successors according to CFG: BB#8(536870912) BB#9(1610612734) +; CHECK: Successors according to CFG: BB#8({{[0-9a-fx/= ]+}}25.00%) BB#9({{[0-9a-fx/= ]+}}75.00%) } !1 = !{!"branch_weights", i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10} diff --git a/test/CodeGen/X86/switch-jump-table.ll b/test/CodeGen/X86/switch-jump-table.ll index 3cfee1cd80e6..896a067da230 100644 --- a/test/CodeGen/X86/switch-jump-table.ll +++ b/test/CodeGen/X86/switch-jump-table.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=i686-pc-gnu-linux < %s | FileCheck %s -check-prefix=CHECK -; RUN: llc -mtriple=i686-pc-gnu-linux -print-machineinstrs=expand-isel-pseudos %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=CHECK-JT-WEIGHT +; 
RUN: llc -mtriple=i686-pc-gnu-linux -print-machineinstrs=expand-isel-pseudos %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=CHECK-JT-PROB ; An unreachable default destination is replaced with the most popular case label. @@ -54,9 +54,9 @@ default: ; Check if branch probabilities are correctly assigned to the jump table. define void @bar(i32 %x, i32* %to) { -; CHECK-JT-WEIGHT-LABEL: bar: -; CHECK-JT-WEIGHT: Successors according to CFG: BB#6(306783378) BB#8(1840700268) -; CHECK-JT-WEIGHT: Successors according to CFG: BB#1(306783378) BB#2(306783378) BB#3(306783378) BB#4(306783378) BB#5(613566756) +; CHECK-JT-PROB-LABEL: bar: +; CHECK-JT-PROB: Successors according to CFG: BB#6({{[0-9a-fx/= ]+}}14.29%) BB#8({{[0-9a-fx/= ]+}}85.71%) +; CHECK-JT-PROB: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}16.67%) BB#2({{[0-9a-fx/= ]+}}16.67%) BB#3({{[0-9a-fx/= ]+}}16.67%) BB#4({{[0-9a-fx/= ]+}}16.67%) BB#5({{[0-9a-fx/= ]+}}33.33%) entry: switch i32 %x, label %default [ From 531c21057053b9dd7d77ef59ddd92a1d99f0e1bc Mon Sep 17 00:00:00 2001 From: Evgeniy Stepanov Date: Tue, 1 Dec 2015 00:06:13 +0000 Subject: [PATCH 032/186] [safestack] Fix handling of array allocas. The current code does not take alloca array size into account and, as a result, considers any access past the first array element to be unsafe. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254350 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Instrumentation/SafeStack.cpp | 22 +++++++-- test/Transforms/SafeStack/array.ll | 48 ++++++++++++++++++++ 2 files changed, 65 insertions(+), 5 deletions(-) diff --git a/lib/Transforms/Instrumentation/SafeStack.cpp b/lib/Transforms/Instrumentation/SafeStack.cpp index 6071ca5a8754..f8c4058ae22a 100644 --- a/lib/Transforms/Instrumentation/SafeStack.cpp +++ b/lib/Transforms/Instrumentation/SafeStack.cpp @@ -118,6 +118,10 @@ class SafeStack : public FunctionPass { SmallVectorImpl &Returns, SmallVectorImpl &StackRestorePoints); + /// \brief Calculate the allocation size of a given alloca. Returns 0 if the + /// size cannot be statically determined. + uint64_t getStaticAllocaAllocationSize(const AllocaInst* AI); + /// \brief Allocate space for all static allocas in \p StaticAllocas, /// replace allocas with pointers into the unsafe stack and generate code to /// restore the stack pointer before all return instructions in \p Returns. 
@@ -177,6 +181,17 @@ class SafeStack : public FunctionPass { bool runOnFunction(Function &F) override; }; // class SafeStack +uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) { + uint64_t Size = DL->getTypeAllocSize(AI->getAllocatedType()); + if (AI->isArrayAllocation()) { + auto C = dyn_cast(AI->getArraySize()); + if (!C) + return 0; + Size *= C->getZExtValue(); + } + return Size; +} + bool SafeStack::IsAccessSafe(Value *Addr, uint64_t Size, const AllocaInst *AI) { AllocaOffsetRewriter Rewriter(*SE, AI); const SCEV *Expr = Rewriter.visit(SE->getSCEV(Addr)); @@ -187,8 +202,7 @@ bool SafeStack::IsAccessSafe(Value *Addr, uint64_t Size, const AllocaInst *AI) { ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, Size)); ConstantRange AccessRange = AccessStartRange.add(SizeRange); ConstantRange AllocaRange = ConstantRange( - APInt(BitWidth, 0), - APInt(BitWidth, DL->getTypeStoreSize(AI->getAllocatedType()))); + APInt(BitWidth, 0), APInt(BitWidth, getStaticAllocaAllocationSize(AI))); bool Safe = AllocaRange.contains(AccessRange); DEBUG(dbgs() << "[SafeStack] Alloca " << *AI << "\n" @@ -463,10 +477,8 @@ SafeStack::moveStaticAllocasToUnsafeStack(IRBuilder<> &IRB, Function &F, for (AllocaInst *AI : StaticAllocas) { IRB.SetInsertPoint(AI); - auto CArraySize = cast(AI->getArraySize()); Type *Ty = AI->getAllocatedType(); - - uint64_t Size = DL->getTypeAllocSize(Ty) * CArraySize->getZExtValue(); + uint64_t Size = getStaticAllocaAllocationSize(AI); if (Size == 0) Size = 1; // Don't create zero-sized stack objects. diff --git a/test/Transforms/SafeStack/array.ll b/test/Transforms/SafeStack/array.ll index 6036bfc2c9c5..b2454dc2bb9e 100644 --- a/test/Transforms/SafeStack/array.ll +++ b/test/Transforms/SafeStack/array.ll @@ -35,4 +35,52 @@ entry: ret void } +; Load from an array at a fixed offset, no overflow. 
+define i8 @StaticArrayFixedSafe() nounwind uwtable safestack { +entry: + ; CHECK-LABEL: define i8 @StaticArrayFixedSafe( + ; CHECK-NOT: __safestack_unsafe_stack_ptr + ; CHECK: ret i8 + %buf = alloca i8, i32 4, align 1 + %gep = getelementptr inbounds i8, i8* %buf, i32 2 + %x = load i8, i8* %gep, align 1 + ret i8 %x +} + +; Load from an array at a fixed offset with overflow. +define i8 @StaticArrayFixedUnsafe() nounwind uwtable safestack { +entry: + ; CHECK-LABEL: define i8 @StaticArrayFixedUnsafe( + ; CHECK: __safestack_unsafe_stack_ptr + ; CHECK: ret i8 + %buf = alloca i8, i32 4, align 1 + %gep = getelementptr inbounds i8, i8* %buf, i32 5 + %x = load i8, i8* %gep, align 1 + ret i8 %x +} + +; Load from an array at an unknown offset. +define i8 @StaticArrayVariableUnsafe(i32 %ofs) nounwind uwtable safestack { +entry: + ; CHECK-LABEL: define i8 @StaticArrayVariableUnsafe( + ; CHECK: __safestack_unsafe_stack_ptr + ; CHECK: ret i8 + %buf = alloca i8, i32 4, align 1 + %gep = getelementptr inbounds i8, i8* %buf, i32 %ofs + %x = load i8, i8* %gep, align 1 + ret i8 %x +} + +; Load from an array of an unknown size. +define i8 @DynamicArrayUnsafe(i32 %sz) nounwind uwtable safestack { +entry: + ; CHECK-LABEL: define i8 @DynamicArrayUnsafe( + ; CHECK: __safestack_unsafe_stack_ptr + ; CHECK: ret i8 + %buf = alloca i8, i32 %sz, align 1 + %gep = getelementptr inbounds i8, i8* %buf, i32 2 + %x = load i8, i8* %gep, align 1 + ret i8 %x +} + declare i8* @strcpy(i8*, i8*) From 390ced11080a9faa958b4b6e08e8d7fd40347ac2 Mon Sep 17 00:00:00 2001 From: Evgeniy Stepanov Date: Tue, 1 Dec 2015 00:34:30 +0000 Subject: [PATCH 033/186] Extend debug info for function parameters in SDAG. SDAG currently can emit debug location for function parameters when an llvm.dbg.declare points to either a function argument SSA temp, or to an AllocaInst. This change extends this logic by adding a fallback case when neither of the above is true. 
This is required for SafeStack, which may copy the contents of a byval function argument into something that is not an alloca, and then describe the target as the new location of the said argument. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254352 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../SelectionDAG/SelectionDAGBuilder.cpp | 27 +++--- test/DebugInfo/Generic/safestack-byval.ll | 91 +++++++++++++++++++ 2 files changed, 102 insertions(+), 16 deletions(-) create mode 100644 test/DebugInfo/Generic/safestack-byval.ll diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index f754e24e3231..d880bcfbdf64 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -4463,22 +4463,17 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { Address = BCI->getOperand(0); // Parameters are handled specially. bool isParameter = Variable->isParameter() || isa(Address); - - const AllocaInst *AI = dyn_cast(Address); - - if (isParameter && !AI) { - FrameIndexSDNode *FINode = dyn_cast(N.getNode()); - if (FINode) - // Byval parameter. We have a frame index at this point. - SDV = DAG.getFrameIndexDbgValue( - Variable, Expression, FINode->getIndex(), 0, dl, SDNodeOrder); - else { - // Address is an argument, so try to emit its dbg value using - // virtual register info from the FuncInfo.ValueMap. - EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, false, - N); - return nullptr; - } + auto FINode = dyn_cast(N.getNode()); + if (isParameter && FINode) { + // Byval parameter. We have a frame index at this point. + SDV = DAG.getFrameIndexDbgValue(Variable, Expression, + FINode->getIndex(), 0, dl, SDNodeOrder); + } else if (isa(Address)) { + // Address is an argument, so try to emit its dbg value using + // virtual register info from the FuncInfo.ValueMap. 
+ EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, false, + N); + return nullptr; } else { SDV = DAG.getDbgValue(Variable, Expression, N.getNode(), N.getResNo(), true, 0, dl, SDNodeOrder); diff --git a/test/DebugInfo/Generic/safestack-byval.ll b/test/DebugInfo/Generic/safestack-byval.ll new file mode 100644 index 000000000000..01ef064a78b6 --- /dev/null +++ b/test/DebugInfo/Generic/safestack-byval.ll @@ -0,0 +1,91 @@ +; Test dwarf codegen for DILocalVariable of a byval function argument that +; points to neither an argument nor an alloca. This kind of IR is generated by +; SafeStack for unsafe byval arguments. +; RUN: llc -stop-after expand-isel-pseudos %s -o /dev/null | FileCheck %s + +; This was built by compiling the following source with SafeStack and +; simplifying the result a little. +; struct S { +; int a[100]; +; }; +; +; int f(S zzz, unsigned long len) { +; return zzz.a[len]; +; } + +; CHECK: ![[ZZZ:.*]] = !DILocalVariable(name: "zzz", +; CHECK: ![[ZZZ_EXPR:.*]] = !DIExpression(DW_OP_deref, DW_OP_minus, 400) +; CHECK: DBG_VALUE {{.*}} ![[ZZZ]], ![[ZZZ_EXPR]] + +%struct.S = type { [100 x i32] } + +@__safestack_unsafe_stack_ptr = external thread_local(initialexec) global i8* + +; Function Attrs: norecurse nounwind readonly safestack uwtable +define i32 @_Z1f1Sm(%struct.S* byval nocapture readonly align 8 %zzz, i64 %len) #0 !dbg !12 { +entry: + %unsafe_stack_ptr = load i8*, i8** @__safestack_unsafe_stack_ptr, !dbg !22 + %unsafe_stack_static_top = getelementptr i8, i8* %unsafe_stack_ptr, i32 -400, !dbg !22 + store i8* %unsafe_stack_static_top, i8** @__safestack_unsafe_stack_ptr, !dbg !22 +; !17 describes "zzz" + call void @llvm.dbg.declare(metadata i8* %unsafe_stack_ptr, metadata !17, metadata !23), !dbg !22 + %0 = getelementptr i8, i8* %unsafe_stack_ptr, i32 -400, !dbg !22 + %zzz.unsafe-byval = bitcast i8* %0 to %struct.S*, !dbg !22 + %1 = bitcast %struct.S* %zzz to i8*, !dbg !24 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 400, 
i32 8, i1 false), !dbg !24 + tail call void @llvm.dbg.value(metadata i64 %len, i64 0, metadata !18, metadata !25), !dbg !24 + %arrayidx = getelementptr inbounds %struct.S, %struct.S* %zzz.unsafe-byval, i64 0, i32 0, i64 %len, !dbg !26 + %2 = load i32, i32* %arrayidx, align 4, !dbg !26, !tbaa !27 + store i8* %unsafe_stack_ptr, i8** @__safestack_unsafe_stack_ptr, !dbg !31 + ret i32 %2, !dbg !31 +} + +; Function Attrs: nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 + +; Function Attrs: nounwind readnone +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1 + +; Function Attrs: argmemonly nounwind +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #2 + +attributes #0 = { norecurse nounwind readonly safestack uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { argmemonly nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!19, !20} +!llvm.ident = !{!21} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.8.0 (trunk 254107) (llvm/trunk 254109)", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, retainedTypes: !3, subprograms: !11) +!1 = !DIFile(filename: "../llvm/1.cc", directory: "/tmp/build") +!2 = !{} +!3 = !{!4} +!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "S", file: !1, line: 4, size: 3200, align: 32, elements: !5, identifier: "_ZTS1S") +!5 = !{!6} +!6 = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !"_ZTS1S", file: !1, line: 5, baseType: !7, size: 3200, align: 32) +!7 = !DICompositeType(tag: DW_TAG_array_type, baseType: !8, size: 3200, align: 32, elements: !9) +!8 = !DIBasicType(name: 
"int", size: 32, align: 32, encoding: DW_ATE_signed) +!9 = !{!10} +!10 = !DISubrange(count: 100) +!11 = !{!12} +!12 = distinct !DISubprogram(name: "f", linkageName: "_Z1f1Sm", scope: !1, file: !1, line: 8, type: !13, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, variables: !16) +!13 = !DISubroutineType(types: !14) +!14 = !{!8, !"_ZTS1S", !15} +!15 = !DIBasicType(name: "long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned) +!16 = !{!17, !18} +!17 = !DILocalVariable(name: "zzz", arg: 1, scope: !12, file: !1, line: 8, type: !"_ZTS1S") +!18 = !DILocalVariable(name: "len", arg: 2, scope: !12, file: !1, line: 8, type: !15) +!19 = !{i32 2, !"Dwarf Version", i32 4} +!20 = !{i32 2, !"Debug Info Version", i32 3} +!21 = !{!"clang version 3.8.0 (trunk 254107) (llvm/trunk 254109)"} +!22 = !DILocation(line: 8, column: 9, scope: !12) +!23 = !DIExpression(DW_OP_deref, DW_OP_minus, 400) +!24 = !DILocation(line: 8, column: 28, scope: !12) +!25 = !DIExpression() +!26 = !DILocation(line: 9, column: 10, scope: !12) +!27 = !{!28, !28, i64 0} +!28 = !{!"int", !29, i64 0} +!29 = !{!"omnipotent char", !30, i64 0} +!30 = !{!"Simple C/C++ TBAA"} +!31 = !DILocation(line: 9, column: 3, scope: !12) From dd121fc8471347915e14b54c50409b27813e990b Mon Sep 17 00:00:00 2001 From: Evgeniy Stepanov Date: Tue, 1 Dec 2015 00:40:05 +0000 Subject: [PATCH 034/186] [safestack] Protect byval function arguments. Detect unsafe byval function arguments and move them to the unsafe stack. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254353 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Transforms/Utils/Local.h | 18 ++- lib/Transforms/Instrumentation/SafeStack.cpp | 149 +++++++++++++------ lib/Transforms/Utils/Local.cpp | 16 +- test/Transforms/SafeStack/byval.ll | 51 +++++++ test/Transforms/SafeStack/debug-loc.ll | 118 +++++++-------- 5 files changed, 241 insertions(+), 111 deletions(-) create mode 100644 test/Transforms/SafeStack/byval.ll diff --git a/include/llvm/Transforms/Utils/Local.h b/include/llvm/Transforms/Utils/Local.h index b7d67eaea3a0..1d707a1e5307 100644 --- a/include/llvm/Transforms/Utils/Local.h +++ b/include/llvm/Transforms/Utils/Local.h @@ -271,10 +271,20 @@ bool LowerDbgDeclare(Function &F); /// an alloca, if any. DbgDeclareInst *FindAllocaDbgDeclare(Value *V); -/// \brief Replaces llvm.dbg.declare instruction when an alloca is replaced with -/// a new value. If Deref is true, an additional DW_OP_deref is prepended to the -/// expression. If Offset is non-zero, a constant displacement is added to the -/// expression (after the optional Deref). Offset can be negative. +/// \brief Replaces llvm.dbg.declare instruction when the address it describes +/// is replaced with a new value. If Deref is true, an additional DW_OP_deref is +/// prepended to the expression. If Offset is non-zero, a constant displacement +/// is added to the expression (after the optional Deref). Offset can be +/// negative. +bool replaceDbgDeclare(Value *Address, Value *NewAddress, + Instruction *InsertBefore, DIBuilder &Builder, + bool Deref, int Offset); + +/// \brief Replaces llvm.dbg.declare instruction when the alloca it describes +/// is replaced with a new value. If Deref is true, an additional DW_OP_deref is +/// prepended to the expression. If Offset is non-zero, a constant displacement +/// is added to the expression (after the optional Deref). Offset can be +/// negative. New llvm.dbg.declare is inserted immediately before AI. 
bool replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, DIBuilder &Builder, bool Deref, int Offset = 0); diff --git a/lib/Transforms/Instrumentation/SafeStack.cpp b/lib/Transforms/Instrumentation/SafeStack.cpp index f8c4058ae22a..4441663fc6de 100644 --- a/lib/Transforms/Instrumentation/SafeStack.cpp +++ b/lib/Transforms/Instrumentation/SafeStack.cpp @@ -57,6 +57,7 @@ STATISTIC(NumUnsafeStackRestorePointsFunctions, STATISTIC(NumAllocas, "Total number of allocas"); STATISTIC(NumUnsafeStaticAllocas, "Number of unsafe static allocas"); STATISTIC(NumUnsafeDynamicAllocas, "Number of unsafe dynamic allocas"); +STATISTIC(NumUnsafeByValArguments, "Number of unsafe byval arguments"); STATISTIC(NumUnsafeStackRestorePoints, "Number of setjmps and landingpads"); } // namespace llvm @@ -68,14 +69,14 @@ namespace { /// /// The implementation simply replaces all mentions of the alloca with zero. class AllocaOffsetRewriter : public SCEVRewriteVisitor { - const AllocaInst *AI; + const Value *AllocaPtr; public: - AllocaOffsetRewriter(ScalarEvolution &SE, const AllocaInst *AI) - : SCEVRewriteVisitor(SE), AI(AI) {} + AllocaOffsetRewriter(ScalarEvolution &SE, const Value *AllocaPtr) + : SCEVRewriteVisitor(SE), AllocaPtr(AllocaPtr) {} const SCEV *visitUnknown(const SCEVUnknown *Expr) { - if (Expr->getValue() == AI) + if (Expr->getValue() == AllocaPtr) return SE.getZero(Expr->getType()); return Expr; } @@ -115,6 +116,7 @@ class SafeStack : public FunctionPass { /// given function and append them to the respective vectors. void findInsts(Function &F, SmallVectorImpl &StaticAllocas, SmallVectorImpl &DynamicAllocas, + SmallVectorImpl &ByValArguments, SmallVectorImpl &Returns, SmallVectorImpl &StackRestorePoints); @@ -130,6 +132,7 @@ class SafeStack : public FunctionPass { /// allocas are allocated. 
Value *moveStaticAllocasToUnsafeStack(IRBuilder<> &IRB, Function &F, ArrayRef StaticAllocas, + ArrayRef ByValArguments, ArrayRef Returns); /// \brief Generate code to restore the stack after all stack restore points @@ -149,11 +152,12 @@ class SafeStack : public FunctionPass { AllocaInst *DynamicTop, ArrayRef DynamicAllocas); - bool IsSafeStackAlloca(const AllocaInst *AI); + bool IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize); bool IsMemIntrinsicSafe(const MemIntrinsic *MI, const Use &U, - const AllocaInst *AI); - bool IsAccessSafe(Value *Addr, uint64_t Size, const AllocaInst *AI); + const Value *AllocaPtr, uint64_t AllocaSize); + bool IsAccessSafe(Value *Addr, uint64_t Size, const Value *AllocaPtr, + uint64_t AllocaSize); public: static char ID; // Pass identification, replacement for typeid. @@ -192,20 +196,23 @@ uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) { return Size; } -bool SafeStack::IsAccessSafe(Value *Addr, uint64_t Size, const AllocaInst *AI) { - AllocaOffsetRewriter Rewriter(*SE, AI); +bool SafeStack::IsAccessSafe(Value *Addr, uint64_t AccessSize, + const Value *AllocaPtr, uint64_t AllocaSize) { + AllocaOffsetRewriter Rewriter(*SE, AllocaPtr); const SCEV *Expr = Rewriter.visit(SE->getSCEV(Addr)); uint64_t BitWidth = SE->getTypeSizeInBits(Expr->getType()); ConstantRange AccessStartRange = SE->getUnsignedRange(Expr); ConstantRange SizeRange = - ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, Size)); + ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, AccessSize)); ConstantRange AccessRange = AccessStartRange.add(SizeRange); - ConstantRange AllocaRange = ConstantRange( - APInt(BitWidth, 0), APInt(BitWidth, getStaticAllocaAllocationSize(AI))); + ConstantRange AllocaRange = + ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, AllocaSize)); bool Safe = AllocaRange.contains(AccessRange); - DEBUG(dbgs() << "[SafeStack] Alloca " << *AI << "\n" + DEBUG(dbgs() << "[SafeStack] " + << (isa(AllocaPtr) ? 
"Alloca " : "ByValArgument ") + << *AllocaPtr << "\n" << " Access " << *Addr << "\n" << " SCEV " << *Expr << " U: " << SE->getUnsignedRange(Expr) @@ -218,36 +225,38 @@ bool SafeStack::IsAccessSafe(Value *Addr, uint64_t Size, const AllocaInst *AI) { } bool SafeStack::IsMemIntrinsicSafe(const MemIntrinsic *MI, const Use &U, - const AllocaInst *AI) { + const Value *AllocaPtr, + uint64_t AllocaSize) { // All MemIntrinsics have destination address in Arg0 and size in Arg2. if (MI->getRawDest() != U) return true; const auto *Len = dyn_cast(MI->getLength()); // Non-constant size => unsafe. FIXME: try SCEV getRange. if (!Len) return false; - return IsAccessSafe(U, Len->getZExtValue(), AI); + return IsAccessSafe(U, Len->getZExtValue(), AllocaPtr, AllocaSize); } -/// Check whether a given alloca instruction (AI) should be put on the safe +/// Check whether a given allocation must be put on the safe /// stack or not. The function analyzes all uses of AI and checks whether it is /// only accessed in a memory safe way (as decided statically). -bool SafeStack::IsSafeStackAlloca(const AllocaInst *AI) { +bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) { // Go through all uses of this alloca and check whether all accesses to the // allocated object are statically known to be memory safe and, hence, the // object can be placed on the safe stack. SmallPtrSet Visited; - SmallVector WorkList; - WorkList.push_back(AI); + SmallVector WorkList; + WorkList.push_back(AllocaPtr); // A DFS search through all uses of the alloca in bitcasts/PHI/GEPs/etc. 
while (!WorkList.empty()) { - const Instruction *V = WorkList.pop_back_val(); + const Value *V = WorkList.pop_back_val(); for (const Use &UI : V->uses()) { auto I = cast(UI.getUser()); assert(V == UI.get()); switch (I->getOpcode()) { case Instruction::Load: { - if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getType()), AI)) + if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getType()), AllocaPtr, + AllocaSize)) return false; break; } @@ -257,13 +266,13 @@ bool SafeStack::IsSafeStackAlloca(const AllocaInst *AI) { case Instruction::Store: { if (V == I->getOperand(0)) { // Stored the pointer - conservatively assume it may be unsafe. - DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AI + DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AllocaPtr << "\n store of address: " << *I << "\n"); return false; } - if (!IsAccessSafe( - UI, DL->getTypeStoreSize(I->getOperand(0)->getType()), AI)) + if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getOperand(0)->getType()), + AllocaPtr, AllocaSize)) return false; break; } @@ -283,8 +292,8 @@ bool SafeStack::IsSafeStackAlloca(const AllocaInst *AI) { } if (const MemIntrinsic *MI = dyn_cast(I)) { - if (!IsMemIntrinsicSafe(MI, UI, AI)) { - DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AI + if (!IsMemIntrinsicSafe(MI, UI, AllocaPtr, AllocaSize)) { + DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AllocaPtr << "\n unsafe memintrinsic: " << *I << "\n"); return false; @@ -302,9 +311,9 @@ bool SafeStack::IsSafeStackAlloca(const AllocaInst *AI) { ImmutableCallSite::arg_iterator B = CS.arg_begin(), E = CS.arg_end(); for (ImmutableCallSite::arg_iterator A = B; A != E; ++A) if (A->get() == V) - if (!(CS.doesNotCapture(A - B) && - (CS.doesNotAccessMemory(A - B) || CS.doesNotAccessMemory()))) { - DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AI + if (!(CS.doesNotCapture(A - B) && (CS.doesNotAccessMemory(A - B) || + CS.doesNotAccessMemory()))) { + DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AllocaPtr << "\n unsafe call: " << *I << 
"\n"); return false; } @@ -355,13 +364,15 @@ Value *SafeStack::getOrCreateUnsafeStackPtr(IRBuilder<> &IRB, Function &F) { void SafeStack::findInsts(Function &F, SmallVectorImpl &StaticAllocas, SmallVectorImpl &DynamicAllocas, + SmallVectorImpl &ByValArguments, SmallVectorImpl &Returns, SmallVectorImpl &StackRestorePoints) { for (Instruction &I : instructions(&F)) { if (auto AI = dyn_cast(&I)) { ++NumAllocas; - if (IsSafeStackAlloca(AI)) + uint64_t Size = getStaticAllocaAllocationSize(AI); + if (IsSafeStackAlloca(AI, Size)) continue; if (AI->isStaticAlloca()) { @@ -386,6 +397,17 @@ void SafeStack::findInsts(Function &F, "gcroot intrinsic not compatible with safestack attribute"); } } + for (Argument &Arg : F.args()) { + if (!Arg.hasByValAttr()) + continue; + uint64_t Size = + DL->getTypeStoreSize(Arg.getType()->getPointerElementType()); + if (IsSafeStackAlloca(&Arg, Size)) + continue; + + ++NumUnsafeByValArguments; + ByValArguments.push_back(&Arg); + } } AllocaInst * @@ -420,7 +442,7 @@ SafeStack::createStackRestorePoints(IRBuilder<> &IRB, Function &F, for (Instruction *I : StackRestorePoints) { ++NumUnsafeStackRestorePoints; - IRB.SetInsertPoint(cast(I->getNextNode())); + IRB.SetInsertPoint(I->getNextNode()); Value *CurrentTop = DynamicTop ? 
IRB.CreateLoad(DynamicTop) : StaticTop; IRB.CreateStore(CurrentTop, UnsafeStackPtr); } @@ -428,11 +450,10 @@ SafeStack::createStackRestorePoints(IRBuilder<> &IRB, Function &F, return DynamicTop; } -Value * -SafeStack::moveStaticAllocasToUnsafeStack(IRBuilder<> &IRB, Function &F, - ArrayRef StaticAllocas, - ArrayRef Returns) { - if (StaticAllocas.empty()) +Value *SafeStack::moveStaticAllocasToUnsafeStack( + IRBuilder<> &IRB, Function &F, ArrayRef StaticAllocas, + ArrayRef ByValArguments, ArrayRef Returns) { + if (StaticAllocas.empty() && ByValArguments.empty()) return nullptr; DIBuilder DIB(*F.getParent()); @@ -454,6 +475,13 @@ SafeStack::moveStaticAllocasToUnsafeStack(IRBuilder<> &IRB, Function &F, // Compute maximum alignment among static objects on the unsafe stack. unsigned MaxAlignment = 0; + for (Argument *Arg : ByValArguments) { + Type *Ty = Arg->getType()->getPointerElementType(); + unsigned Align = std::max((unsigned)DL->getPrefTypeAlignment(Ty), + Arg->getParamAlignment()); + if (Align > MaxAlignment) + MaxAlignment = Align; + } for (AllocaInst *AI : StaticAllocas) { Type *Ty = AI->getAllocatedType(); unsigned Align = @@ -465,15 +493,46 @@ SafeStack::moveStaticAllocasToUnsafeStack(IRBuilder<> &IRB, Function &F, if (MaxAlignment > StackAlignment) { // Re-align the base pointer according to the max requested alignment. assert(isPowerOf2_32(MaxAlignment)); - IRB.SetInsertPoint(cast(BasePointer->getNextNode())); + IRB.SetInsertPoint(BasePointer->getNextNode()); BasePointer = cast(IRB.CreateIntToPtr( IRB.CreateAnd(IRB.CreatePtrToInt(BasePointer, IntPtrTy), ConstantInt::get(IntPtrTy, ~uint64_t(MaxAlignment - 1))), StackPtrTy)); } - // Allocate space for every unsafe static AllocaInst on the unsafe stack. int64_t StaticOffset = 0; // Current stack top. 
+ IRB.SetInsertPoint(BasePointer->getNextNode()); + + for (Argument *Arg : ByValArguments) { + Type *Ty = Arg->getType()->getPointerElementType(); + + uint64_t Size = DL->getTypeStoreSize(Ty); + if (Size == 0) + Size = 1; // Don't create zero-sized stack objects. + + // Ensure the object is properly aligned. + unsigned Align = std::max((unsigned)DL->getPrefTypeAlignment(Ty), + Arg->getParamAlignment()); + + // Add alignment. + // NOTE: we ensure that BasePointer itself is aligned to >= Align. + StaticOffset += Size; + StaticOffset = RoundUpToAlignment(StaticOffset, Align); + + Value *Off = IRB.CreateGEP(BasePointer, // BasePointer is i8* + ConstantInt::get(Int32Ty, -StaticOffset)); + Value *NewArg = IRB.CreateBitCast(Off, Arg->getType(), + Arg->getName() + ".unsafe-byval"); + + // Replace alloc with the new location. + replaceDbgDeclare(Arg, BasePointer, BasePointer->getNextNode(), DIB, + /*Deref=*/true, -StaticOffset); + Arg->replaceAllUsesWith(NewArg); + IRB.SetInsertPoint(cast(NewArg)->getNextNode()); + IRB.CreateMemCpy(Off, Arg, Size, Arg->getParamAlignment()); + } + + // Allocate space for every unsafe static AllocaInst on the unsafe stack. for (AllocaInst *AI : StaticAllocas) { IRB.SetInsertPoint(AI); @@ -509,7 +568,7 @@ SafeStack::moveStaticAllocasToUnsafeStack(IRBuilder<> &IRB, Function &F, StaticOffset = RoundUpToAlignment(StaticOffset, StackAlignment); // Update shadow stack pointer in the function epilogue. 
- IRB.SetInsertPoint(cast(BasePointer->getNextNode())); + IRB.SetInsertPoint(BasePointer->getNextNode()); Value *StaticTop = IRB.CreateGEP(BasePointer, ConstantInt::get(Int32Ty, -StaticOffset), @@ -621,6 +680,7 @@ bool SafeStack::runOnFunction(Function &F) { SmallVector StaticAllocas; SmallVector DynamicAllocas; + SmallVector ByValArguments; SmallVector Returns; // Collect all points where stack gets unwound and needs to be restored @@ -632,13 +692,15 @@ bool SafeStack::runOnFunction(Function &F) { // Find all static and dynamic alloca instructions that must be moved to the // unsafe stack, all return instructions and stack restore points. - findInsts(F, StaticAllocas, DynamicAllocas, Returns, StackRestorePoints); + findInsts(F, StaticAllocas, DynamicAllocas, ByValArguments, Returns, + StackRestorePoints); if (StaticAllocas.empty() && DynamicAllocas.empty() && - StackRestorePoints.empty()) + ByValArguments.empty() && StackRestorePoints.empty()) return false; // Nothing to do in this function. - if (!StaticAllocas.empty() || !DynamicAllocas.empty()) + if (!StaticAllocas.empty() || !DynamicAllocas.empty() || + !ByValArguments.empty()) ++NumUnsafeStackFunctions; // This function has the unsafe stack. if (!StackRestorePoints.empty()) @@ -648,7 +710,8 @@ bool SafeStack::runOnFunction(Function &F) { UnsafeStackPtr = getOrCreateUnsafeStackPtr(IRB, F); // The top of the unsafe stack after all unsafe static allocas are allocated. - Value *StaticTop = moveStaticAllocasToUnsafeStack(IRB, F, StaticAllocas, Returns); + Value *StaticTop = moveStaticAllocasToUnsafeStack(IRB, F, StaticAllocas, + ByValArguments, Returns); // Safe stack object that stores the current unsafe stack top. It is updated // as unsafe dynamic (non-constant-sized) allocas are allocated and freed. 
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index 0bd5fa9f8777..623da675e05b 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -1136,9 +1136,10 @@ DbgDeclareInst *llvm::FindAllocaDbgDeclare(Value *V) { return nullptr; } -bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, - DIBuilder &Builder, bool Deref, int Offset) { - DbgDeclareInst *DDI = FindAllocaDbgDeclare(AI); +bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress, + Instruction *InsertBefore, DIBuilder &Builder, + bool Deref, int Offset) { + DbgDeclareInst *DDI = FindAllocaDbgDeclare(Address); if (!DDI) return false; DebugLoc Loc = DDI->getDebugLoc(); @@ -1168,12 +1169,17 @@ bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, // Insert llvm.dbg.declare immediately after the original alloca, and remove // old llvm.dbg.declare. - Builder.insertDeclare(NewAllocaAddress, DIVar, DIExpr, Loc, - AI->getNextNode()); + Builder.insertDeclare(NewAddress, DIVar, DIExpr, Loc, InsertBefore); DDI->eraseFromParent(); return true; } +bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, + DIBuilder &Builder, bool Deref, int Offset) { + return replaceDbgDeclare(AI, NewAllocaAddress, AI->getNextNode(), Builder, + Deref, Offset); +} + /// changeToUnreachable - Insert an unreachable instruction before the specified /// instruction, making it and the rest of the code in the block dead. 
static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) { diff --git a/test/Transforms/SafeStack/byval.ll b/test/Transforms/SafeStack/byval.ll new file mode 100644 index 000000000000..f9a06e54d2df --- /dev/null +++ b/test/Transforms/SafeStack/byval.ll @@ -0,0 +1,51 @@ +; RUN: opt -safe-stack -S -mtriple=i386-pc-linux-gnu < %s -o - | FileCheck %s +; RUN: opt -safe-stack -S -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.S = type { [100 x i32] } + +; Safe access to a byval argument. +define i32 @ByValSafe(%struct.S* byval nocapture readonly align 8 %zzz) norecurse nounwind readonly safestack uwtable { +entry: + ; CHECK-LABEL: @ByValSafe + ; CHECK-NOT: __safestack_unsafe_stack_ptr + ; CHECK: ret i32 + %arrayidx = getelementptr inbounds %struct.S, %struct.S* %zzz, i64 0, i32 0, i64 3 + %0 = load i32, i32* %arrayidx, align 4 + ret i32 %0 +} + +; Unsafe access to a byval argument. +; Argument is copied to the unsafe stack. +define i32 @ByValUnsafe(%struct.S* byval nocapture readonly align 8 %zzz, i64 %idx) norecurse nounwind readonly safestack uwtable { +entry: + ; CHECK-LABEL: @ByValUnsafe + ; CHECK: %[[A:.*]] = load {{.*}} @__safestack_unsafe_stack_ptr + ; CHECK: store {{.*}} @__safestack_unsafe_stack_ptr + ; CHECK: %[[B:.*]] = getelementptr i8, i8* %[[A]], i32 -400 + ; CHECK: %[[C:.*]] = bitcast %struct.S* %zzz to i8* + ; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[B]], i8* %[[C]], i64 400, i32 8, i1 false) + ; CHECK: ret i32 + %arrayidx = getelementptr inbounds %struct.S, %struct.S* %zzz, i64 0, i32 0, i64 %idx + %0 = load i32, i32* %arrayidx, align 4 + ret i32 %0 +} + +; Highly aligned byval argument. 
+define i32 @ByValUnsafeAligned(%struct.S* byval nocapture readonly align 64 %zzz, i64 %idx) norecurse nounwind readonly safestack uwtable { +entry: + ; CHECK-LABEL: @ByValUnsafeAligned + ; CHECK: %[[A:.*]] = load {{.*}} @__safestack_unsafe_stack_ptr + ; CHECK: %[[B:.*]] = ptrtoint i8* %[[A]] to i64 + ; CHECK: and i64 %[[B]], -64 + ; CHECK: ret i32 + %arrayidx = getelementptr inbounds %struct.S, %struct.S* %zzz, i64 0, i32 0, i64 0 + %0 = load i32, i32* %arrayidx, align 64 + %arrayidx2 = getelementptr inbounds %struct.S, %struct.S* %zzz, i64 0, i32 0, i64 %idx + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + ret i32 %add +} + diff --git a/test/Transforms/SafeStack/debug-loc.ll b/test/Transforms/SafeStack/debug-loc.ll index 57d565f8cfd0..e72d0e9d2ff2 100644 --- a/test/Transforms/SafeStack/debug-loc.ll +++ b/test/Transforms/SafeStack/debug-loc.ll @@ -1,83 +1,83 @@ ; RUN: opt -safe-stack -S -mtriple=i386-pc-linux-gnu < %s -o - | FileCheck %s ; Test debug location for the local variables moved onto the unsafe stack. -; CHECK: define void @f -; CHECK: %[[USP:.*]] = load i8*, i8** @__safestack_unsafe_stack_ptr -; dbg.declare for %buf is gone; replaced with dbg.declare based off the unsafe stack pointer -; CHECK-NOT: @llvm.dbg.declare.*%buf -; CHECK: call void @llvm.dbg.declare(metadata i8* %[[USP]], metadata ![[VAR:.*]], metadata ![[EXPR:.*]]) +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" -; dbg.declare appears before the first use of %buf -; CHECK: getelementptr{{.*}}%buf -; CHECK: call{{.*}}@Capture -; CHECK: ret void +%struct.S = type { [100 x i8] } -; dbg.declare describes "buf"... -; CHECK: ![[VAR]] = !DILocalVariable(name: "buf" +; Function Attrs: safestack uwtable +define void @f(%struct.S* byval align 8 %zzz) #0 !dbg !12 { +; CHECK: define void @f -; ... 
as an offset from the unsafe stack pointer -; CHECK: ![[EXPR]] = !DIExpression(DW_OP_deref, DW_OP_minus, 400) +entry: +; CHECK: %[[USP:.*]] = load i8*, i8** @__safestack_unsafe_stack_ptr + %xxx = alloca %struct.S, align 1 + call void @llvm.dbg.declare(metadata %struct.S* %zzz, metadata !18, metadata !19), !dbg !20 + call void @llvm.dbg.declare(metadata %struct.S* %xxx, metadata !21, metadata !19), !dbg !22 -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" +; dbg.declare for %zzz and %xxx are gone; replaced with dbg.declare based off the unsafe stack pointer +; CHECK-NOT: call void @llvm.dbg.declare +; CHECK: call void @llvm.dbg.declare(metadata i8* %[[USP]], metadata ![[VAR_ARG:.*]], metadata ![[EXPR_ARG:.*]]) +; CHECK-NOT: call void @llvm.dbg.declare +; CHECK: call void @llvm.dbg.declare(metadata i8* %[[USP]], metadata ![[VAR_LOCAL:.*]], metadata ![[EXPR_LOCAL:.*]]) +; CHECK-NOT: call void @llvm.dbg.declare -; Function Attrs: safestack uwtable -define void @f() #0 !dbg !4 { -entry: - %buf = alloca [100 x i32], align 16 - %0 = bitcast [100 x i32]* %buf to i8*, !dbg !16 - call void @llvm.lifetime.start(i64 400, i8* %0) #4, !dbg !16 - tail call void @llvm.dbg.declare(metadata [100 x i32]* %buf, metadata !8, metadata !17), !dbg !18 + call void @Capture(%struct.S* %zzz), !dbg !23 + call void @Capture(%struct.S* %xxx), !dbg !24 +; dbg.declare appears before the first use +; CHECK: call void @Capture +; CHECK: call void @Capture - %arraydecay = getelementptr inbounds [100 x i32], [100 x i32]* %buf, i64 0, i64 0, !dbg !19 - call void @Capture(i32* %arraydecay), !dbg !20 - call void @llvm.lifetime.end(i64 400, i8* %0) #4, !dbg !21 - ret void, !dbg !21 + ret void, !dbg !25 } -; Function Attrs: nounwind argmemonly -declare void @llvm.lifetime.start(i64, i8* nocapture) #1 +; CHECK-DAG: ![[VAR_ARG]] = !DILocalVariable(name: "zzz" +; 100 aligned up to 8 +; CHECK-DAG: ![[EXPR_ARG]] = !DIExpression(DW_OP_deref, DW_OP_minus, 
104 -; Function Attrs: nounwind readnone -declare void @llvm.dbg.declare(metadata, metadata, metadata) #2 +; CHECK-DAG: ![[VAR_LOCAL]] = !DILocalVariable(name: "xxx" +; CHECK-DAG: ![[EXPR_LOCAL]] = !DIExpression(DW_OP_deref, DW_OP_minus, 208 -declare void @Capture(i32*) #3 +; Function Attrs: nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 -; Function Attrs: nounwind argmemonly -declare void @llvm.lifetime.end(i64, i8* nocapture) #1 +declare void @Capture(%struct.S*) #2 -attributes #0 = { safestack uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind argmemonly } -attributes #2 = { nounwind readnone } -attributes #3 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #4 = { nounwind } +attributes #0 = { safestack uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } !llvm.dbg.cu = 
!{!0} -!llvm.module.flags = !{!13, !14} -!llvm.ident = !{!15} +!llvm.module.flags = !{!15, !16} +!llvm.ident = !{!17} -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.8.0 (trunk 248518) (llvm/trunk 248512)", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, subprograms: !3) -!1 = !DIFile(filename: "1.cc", directory: "/tmp") +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.8.0 (trunk 254019) (llvm/trunk 254036)", isOptimized: false, runtimeVersion: 0, emissionKind: 1, enums: !2, retainedTypes: !3, subprograms: !11) +!1 = !DIFile(filename: "../llvm/2.cc", directory: "/code/build-llvm") !2 = !{} !3 = !{!4} -!4 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 4, type: !5, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: true, variables: !7) -!5 = !DISubroutineType(types: !6) -!6 = !{null} -!7 = !{!8} -!8 = !DILocalVariable(name: "buf", scope: !4, file: !1, line: 5, type: !9) -!9 = !DICompositeType(tag: DW_TAG_array_type, baseType: !10, size: 3200, align: 32, elements: !11) -!10 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "S", file: !1, line: 4, size: 800, align: 8, elements: !5, identifier: "_ZTS1S") +!5 = !{!6} +!6 = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !"_ZTS1S", file: !1, line: 5, baseType: !7, size: 800, align: 8) +!7 = !DICompositeType(tag: DW_TAG_array_type, baseType: !8, size: 800, align: 8, elements: !9) +!8 = !DIBasicType(name: "char", size: 8, align: 8, encoding: DW_ATE_signed_char) +!9 = !{!10} +!10 = !DISubrange(count: 100) !11 = !{!12} -!12 = !DISubrange(count: 100) -!13 = !{i32 2, !"Dwarf Version", i32 4} -!14 = !{i32 2, !"Debug Info Version", i32 3} -!15 = !{!"clang version 3.8.0 (trunk 248518) (llvm/trunk 248512)"} -!16 = !DILocation(line: 5, column: 3, scope: !4) -!17 = !DIExpression() 
-!18 = !DILocation(line: 5, column: 7, scope: !4) -!19 = !DILocation(line: 6, column: 11, scope: !4) -!20 = !DILocation(line: 6, column: 3, scope: !4) -!21 = !DILocation(line: 7, column: 1, scope: !4) +!12 = distinct !DISubprogram(name: "f", linkageName: "_Z1f1S", scope: !1, file: !1, line: 10, type: !13, isLocal: false, isDefinition: true, scopeLine: 10, flags: DIFlagPrototyped, isOptimized: false, variables: !2) +!13 = !DISubroutineType(types: !14) +!14 = !{null, !"_ZTS1S"} +!15 = !{i32 2, !"Dwarf Version", i32 4} +!16 = !{i32 2, !"Debug Info Version", i32 3} +!17 = !{!"clang version 3.8.0 (trunk 254019) (llvm/trunk 254036)"} +!18 = !DILocalVariable(name: "zzz", arg: 1, scope: !12, file: !1, line: 10, type: !"_ZTS1S") +!19 = !DIExpression() +!20 = !DILocation(line: 10, column: 10, scope: !12) +!21 = !DILocalVariable(name: "xxx", scope: !12, file: !1, line: 11, type: !"_ZTS1S") +!22 = !DILocation(line: 11, column: 5, scope: !12) +!23 = !DILocation(line: 12, column: 3, scope: !12) +!24 = !DILocation(line: 13, column: 3, scope: !12) +!25 = !DILocation(line: 14, column: 1, scope: !12) From 629180472c0c66f4c44ef9bf06949e79331cd031 Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Tue, 1 Dec 2015 00:48:34 +0000 Subject: [PATCH 035/186] llvm-dwp: Initial layout git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254354 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/LLVMBuild.txt | 1 + tools/Makefile | 2 +- tools/llvm-dwp/CMakeLists.txt | 13 +++++++++++++ tools/llvm-dwp/LLVMBuild.txt | 23 +++++++++++++++++++++++ tools/llvm-dwp/llvm-dwp.cpp | 2 ++ 5 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 tools/llvm-dwp/CMakeLists.txt create mode 100644 tools/llvm-dwp/LLVMBuild.txt create mode 100644 tools/llvm-dwp/llvm-dwp.cpp diff --git a/tools/LLVMBuild.txt b/tools/LLVMBuild.txt index d6c08d25d944..d4b014771859 100644 --- a/tools/LLVMBuild.txt +++ b/tools/LLVMBuild.txt @@ -28,6 +28,7 @@ subdirectories = llvm-diff llvm-dis llvm-dwarfdump + llvm-dwp 
llvm-extract llvm-jitlistener llvm-link diff --git a/tools/Makefile b/tools/Makefile index a2ec8b065ebd..92d495451879 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -33,7 +33,7 @@ PARALLEL_DIRS := opt llvm-as llvm-dis llc llvm-ar llvm-nm llvm-link \ llvm-dwarfdump llvm-cov llvm-size llvm-stress llvm-mcmarkup \ llvm-profdata llvm-symbolizer obj2yaml yaml2obj llvm-c-test \ llvm-cxxdump verify-uselistorder dsymutil llvm-pdbdump \ - llvm-split sancov + llvm-split sancov llvm-dwp # If Intel JIT Events support is configured, build an extra tool to test it. ifeq ($(USE_INTEL_JITEVENTS), 1) diff --git a/tools/llvm-dwp/CMakeLists.txt b/tools/llvm-dwp/CMakeLists.txt new file mode 100644 index 000000000000..b29c00d49c3d --- /dev/null +++ b/tools/llvm-dwp/CMakeLists.txt @@ -0,0 +1,13 @@ +set(LLVM_LINK_COMPONENTS + ${LLVM_TARGETS_TO_BUILD} + AsmPrinter + DebugInfoDWARF + MC + Object + Support + Target + ) + +add_llvm_tool(llvm-dwp + llvm-dwp.cpp + ) diff --git a/tools/llvm-dwp/LLVMBuild.txt b/tools/llvm-dwp/LLVMBuild.txt new file mode 100644 index 000000000000..345a73757255 --- /dev/null +++ b/tools/llvm-dwp/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./tools/llvm-dwp/LLVMBuild.txt ---------------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Tool +name = llvm-dwp +parent = Tools +required_libraries = AsmPrinter DebugInfoDWARF MC Object Support all-targets + diff --git a/tools/llvm-dwp/llvm-dwp.cpp b/tools/llvm-dwp/llvm-dwp.cpp new file mode 100644 index 000000000000..b2f997621bea --- /dev/null +++ b/tools/llvm-dwp/llvm-dwp.cpp @@ -0,0 +1,2 @@ +int main() { +} From bb823837e04dc83f63c5a6c021aae05655b68dc9 Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Tue, 1 Dec 2015 00:48:39 +0000 Subject: [PATCH 036/186] [llvm-dwp] Initial partial prototype This just concatenates the common DWP sections without doing any of the fancy DWP things like: 1) update str_offsets 2) deduplicating strings 3) merging/creating cu/tu_index Patches for these will follow shortly. (also not sure about target triple/object file type for this tool - do I really need a whole triple just to write an object file that contains purely static/hardcoded bytes in each section? 
& I guess I should just pick it based on the first input, maybe, rather than hardcoding for now - but we only produce .dwo on ELF platforms with objcopy for now anyway) git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254355 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/ADT/StringMap.h | 7 ++ include/llvm/ADT/StringSet.h | 5 + test/tools/llvm-dwp/Inputs/simple/a.cpp | 2 + test/tools/llvm-dwp/Inputs/simple/a.dwo | Bin 0 -> 1193 bytes test/tools/llvm-dwp/Inputs/simple/b.cpp | 3 + test/tools/llvm-dwp/Inputs/simple/b.dwo | Bin 0 -> 1241 bytes test/tools/llvm-dwp/X86/lit.local.cfg | 4 + test/tools/llvm-dwp/X86/simple.test | 58 ++++++++++ tools/llvm-dwp/llvm-dwp.cpp | 144 +++++++++++++++++++++++- 9 files changed, 222 insertions(+), 1 deletion(-) create mode 100644 test/tools/llvm-dwp/Inputs/simple/a.cpp create mode 100644 test/tools/llvm-dwp/Inputs/simple/a.dwo create mode 100644 test/tools/llvm-dwp/Inputs/simple/b.cpp create mode 100644 test/tools/llvm-dwp/Inputs/simple/b.dwo create mode 100644 test/tools/llvm-dwp/X86/lit.local.cfg create mode 100644 test/tools/llvm-dwp/X86/simple.test diff --git a/include/llvm/ADT/StringMap.h b/include/llvm/ADT/StringMap.h index 194235fac570..700bb9e10ef7 100644 --- a/include/llvm/ADT/StringMap.h +++ b/include/llvm/ADT/StringMap.h @@ -232,6 +232,13 @@ class StringMap : public StringMapImpl { : StringMapImpl(InitialSize, static_cast(sizeof(MapEntryTy))), Allocator(A) {} + StringMap(std::initializer_list> List) + : StringMapImpl(static_cast(sizeof(MapEntryTy))) { + for (const auto &P : List) { + insert(P); + } + } + StringMap(StringMap &&RHS) : StringMapImpl(std::move(RHS)), Allocator(std::move(RHS.Allocator)) {} diff --git a/include/llvm/ADT/StringSet.h b/include/llvm/ADT/StringSet.h index 3e0cc200b6dd..08626dc7af84 100644 --- a/include/llvm/ADT/StringSet.h +++ b/include/llvm/ADT/StringSet.h @@ -23,6 +23,11 @@ namespace llvm { class StringSet : public llvm::StringMap { typedef llvm::StringMap base; public: + StringSet() 
= default; + StringSet(std::initializer_list S) { + for (StringRef X : S) + insert(X); + } std::pair insert(StringRef Key) { assert(!Key.empty()); diff --git a/test/tools/llvm-dwp/Inputs/simple/a.cpp b/test/tools/llvm-dwp/Inputs/simple/a.cpp new file mode 100644 index 000000000000..f85d105a99f8 --- /dev/null +++ b/test/tools/llvm-dwp/Inputs/simple/a.cpp @@ -0,0 +1,2 @@ +struct foo { }; +foo a; diff --git a/test/tools/llvm-dwp/Inputs/simple/a.dwo b/test/tools/llvm-dwp/Inputs/simple/a.dwo new file mode 100644 index 0000000000000000000000000000000000000000..7bdb2a7b9f826e0dfcbafb94b64ef9285b42f4c6 GIT binary patch literal 1193 zcmbtTT}s115T4ytTP=#EFN!Y#5n3N&wTf1;EsBC5;tgtAQ!S=RNqaHDd+~HrgfYZvW8JtxbWTT1RyNqS7Wk=hfB>29j%Xx`; ztjk<&eH3I?6(f32BQ!qp%mZ2`ghABrI>U;t@#a?^&{jhjM7`I?imvhIPce}cC_(S# zbta1nev7=PXgt;DIj34rK*K)(`l2?8$Uoe;;P`O|#`UQC`(i%oJ~8?yuwUDTBP@Dj W+#_lezMoh|yOF`TrpWUre(OK6DqH*j literal 0 HcmV?d00001 diff --git a/test/tools/llvm-dwp/Inputs/simple/b.cpp b/test/tools/llvm-dwp/Inputs/simple/b.cpp new file mode 100644 index 000000000000..fedcc160f41e --- /dev/null +++ b/test/tools/llvm-dwp/Inputs/simple/b.cpp @@ -0,0 +1,3 @@ +struct bar { }; +void b(bar) { +} diff --git a/test/tools/llvm-dwp/Inputs/simple/b.dwo b/test/tools/llvm-dwp/Inputs/simple/b.dwo new file mode 100644 index 0000000000000000000000000000000000000000..f41243dc722b011d346dccede7fd91268525d578 GIT binary patch literal 1241 zcmbtU%`O8`6h8M(>#rfHT2yRiBf(;{8ll7(f`nLDk=RtHrZmQkOsiOl+FI}gEG>Bo zPayFgmNw4my)(wmBo=OR?m6FgzVkC}r`PM-8=Nr+4+9f^P{;zj#7L`4TQC5#95{n^(LJ-yMq zI&D>L^sPYn<1jjO?DM0F>(`9K3m*&tF{!|cU#~ZvRx?gFyt*I9Rof1ncARk=p3}v+ z!@tiMc8GnIQ9mn+eg~>{3e&_De#^j;cw6Da`p}m#AI5R%odhFsGC{kOz){H)U3g}N zi*9w+0cKQ=#P4+BSrab0Rc(M7l_T+}B(qVa2vjZQHAO>kN(wtlPOT@*6%E0HcuC+{ z=K9gokGD@^@nQfFpEUnN)+f!AE&g;`;wMA%k~StjX%ph*GRi~^(h4J>_#04C-$$xq AOaK4? 
literal 0 HcmV?d00001 diff --git a/test/tools/llvm-dwp/X86/lit.local.cfg b/test/tools/llvm-dwp/X86/lit.local.cfg new file mode 100644 index 000000000000..05f8b38b3346 --- /dev/null +++ b/test/tools/llvm-dwp/X86/lit.local.cfg @@ -0,0 +1,4 @@ +if not 'X86' in config.root.targets: + config.unsupported = True + +config.suffixes = ['.test', '.cpp', '.m', '.s'] diff --git a/test/tools/llvm-dwp/X86/simple.test b/test/tools/llvm-dwp/X86/simple.test new file mode 100644 index 000000000000..26215bdc8c91 --- /dev/null +++ b/test/tools/llvm-dwp/X86/simple.test @@ -0,0 +1,58 @@ +RUN: llvm-dwp %p/../Inputs/simple/a.dwo %p/../Inputs/simple/b.dwo -o %t +RUN: llvm-dwarfdump %t | FileCheck %s + +FIXME: For some reason, piping straight from llvm-dwp to llvm-dwarfdump doesn't behave well - looks like dwarfdump is reading/closes before dwp has finished. + +DWP from non-type-unit debug info for these two translation units: +a.cpp: + struct foo { }; + foo a; + +b.cpp: + struct bar { }; + void b(bar) { + } + +CHECK: .debug_abbrev.dwo contents: +CHECK: Abbrev table for offset: 0x00000000 +CHECK: DW_TAG_compile_unit +CHECK: DW_TAG_variable +CHECK: DW_TAG_structure_type +CHECK: Abbrev table for offset: 0x00000031 +CHECK: DW_TAG_compile_unit +CHECK: DW_TAG_structure_type +CHECK: DW_TAG_subprogram +CHECK: DW_TAG_formal_parameter + +CHECK: .debug_info.dwo contents: +CHECK: 0x00000000: Compile Unit: length = 0x00000025 version = 0x0004 abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x00000029) +CHECK: DW_TAG_compile_unit +CHECK: DW_AT_name {{.*}} "a.cpp" +CHECK: DW_TAG_variable +CHECK: DW_AT_name {{.*}} "a" +CHECK: DW_TAG_structure_type +CHECK: DW_AT_name {{.*}} "foo" + +FIXME: Using cu_index, identify that abbr_offset is 0x0031, not 0x0000 +CHECK: 0x00000029: Compile Unit: length = 0x00000031 version = 0x0004 abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x0000005e) +FIXME: Using cu_index, use strings based on the right str index offset +CHECK: DW_AT_name {{.*}} "a.cpp" +FIXME: Using 
cu_index to find the right abbrevs at abbr_offset, this abbrevation should actually be structure_type +CHECK: DW_TAG_variable + +CHECK: .debug_cu_index contents: +FIXME: Emit and verify the cu_index contents + +CHECK: .debug_str.dwo contents: +CHECK: "clang version +CHECK: 0x[[ACPP:.*]]: "a.cpp" +FIXME: Remove duplicates +CHECK: "clang version +CHECK: 0x[[BCPP:.*]]: "b.cpp" + +CHECK: .debug_str_offsets.dwo contents: +CHECK: : 00000000 +CHECK: : [[ACPP]] +CHECK: : 00000000 +FIXME: Update str offset indexes, this should be BCPP \/ +CHECK: : [[ACPP]] diff --git a/tools/llvm-dwp/llvm-dwp.cpp b/tools/llvm-dwp/llvm-dwp.cpp index b2f997621bea..7f9f6678db0b 100644 --- a/tools/llvm-dwp/llvm-dwp.cpp +++ b/tools/llvm-dwp/llvm-dwp.cpp @@ -1,2 +1,144 @@ -int main() { +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Options.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Support/TargetSelect.h" +#include +#include +#include + +using namespace llvm; +using namespace cl; + +OptionCategory DwpCategory("Specific Options"); +static list InputFiles(Positional, OneOrMore, + desc(""), cat(DwpCategory)); + +static opt OutputFilename(Required, "o", desc("Specify the output file."), + value_desc("filename"), cat(DwpCategory)); + +static int error(const Twine &Error, const Twine &Context) { + errs() << Twine("while processing ") + Context + ":\n"; + errs() << Twine("error: ") + Error + "\n"; + return 1; +} + +static std::error_code writeSection(MCStreamer &Out, MCSection 
*OutSection, + const object::SectionRef &Sym) { + StringRef Contents; + if (auto Err = Sym.getContents(Contents)) + return Err; + Out.SwitchSection(OutSection); + Out.EmitBytes(Contents); + return std::error_code(); +} + +static std::error_code write(MCStreamer &Out, ArrayRef Inputs) { + for (const auto &Input : Inputs) { + auto ErrOrObj = object::ObjectFile::createObjectFile(Input); + if (!ErrOrObj) + return ErrOrObj.getError(); + const auto *Obj = ErrOrObj->getBinary(); + for (const auto &Section : Obj->sections()) { + const auto &MCOFI = *Out.getContext().getObjectFileInfo(); + static const StringMap KnownSections = { + {"debug_info.dwo", MCOFI.getDwarfInfoDWOSection()}, + {"debug_types.dwo", MCOFI.getDwarfTypesDWOSection()}, + {"debug_str_offsets.dwo", MCOFI.getDwarfStrOffDWOSection()}, + {"debug_str.dwo", MCOFI.getDwarfStrDWOSection()}, + {"debug_loc.dwo", MCOFI.getDwarfLocDWOSection()}, + {"debug_abbrev.dwo", MCOFI.getDwarfAbbrevDWOSection()}}; + StringRef Name; + if (std::error_code Err = Section.getName(Name)) + return Err; + if (MCSection *OutSection = + KnownSections.lookup(Name.substr(Name.find_first_not_of("._")))) + if (auto Err = writeSection(Out, OutSection, Section)) + return Err; + } + } + return std::error_code(); +} + +int main(int argc, char** argv) { + + ParseCommandLineOptions(argc, argv, "merge split dwarf (.dwo) files"); + + llvm::InitializeAllTargetInfos(); + llvm::InitializeAllTargetMCs(); + llvm::InitializeAllTargets(); + llvm::InitializeAllAsmPrinters(); + + std::string ErrorStr; + StringRef Context = "dwarf streamer init"; + + Triple TheTriple("x86_64-linux-gnu"); + + // Get the target. + const Target *TheTarget = + TargetRegistry::lookupTarget("", TheTriple, ErrorStr); + if (!TheTarget) + return error(ErrorStr, Context); + std::string TripleName = TheTriple.getTriple(); + + // Create all the MC Objects. 
+ std::unique_ptr MRI(TheTarget->createMCRegInfo(TripleName)); + if (!MRI) + return error(Twine("no register info for target ") + TripleName, Context); + + std::unique_ptr MAI(TheTarget->createMCAsmInfo(*MRI, TripleName)); + if (!MAI) + return error("no asm info for target " + TripleName, Context); + + MCObjectFileInfo MOFI; + MCContext MC(MAI.get(), MRI.get(), &MOFI); + MOFI.InitMCObjectFileInfo(TheTriple, Reloc::Default, CodeModel::Default, + MC); + + auto MAB = TheTarget->createMCAsmBackend(*MRI, TripleName, ""); + if (!MAB) + return error("no asm backend for target " + TripleName, Context); + + std::unique_ptr MII(TheTarget->createMCInstrInfo()); + if (!MII) + return error("no instr info info for target " + TripleName, Context); + + std::unique_ptr MSTI( + TheTarget->createMCSubtargetInfo(TripleName, "", "")); + if (!MSTI) + return error("no subtarget info for target " + TripleName, Context); + + MCCodeEmitter *MCE = TheTarget->createMCCodeEmitter(*MII, *MRI, MC); + if (!MCE) + return error("no code emitter for target " + TripleName, Context); + + // Create the output file. + std::error_code EC; + raw_fd_ostream OutFile(OutputFilename, EC, sys::fs::F_None); + if (EC) + return error(Twine(OutputFilename) + ": " + EC.message(), Context); + + std::unique_ptr MS(TheTarget->createMCObjectStreamer( + TheTriple, MC, *MAB, OutFile, MCE, *MSTI, false, + /*DWARFMustBeAtTheEnd*/ false)); + if (!MS) + return error("no object streamer for target " + TripleName, Context); + + if (auto Err = write(*MS, InputFiles)) + return error(Err.message(), "Writing DWP file"); + + MS->Finish(); } From 08802fa03338ebbe7f759586ef039d53f0d53194 Mon Sep 17 00:00:00 2001 From: Cong Hou Date: Tue, 1 Dec 2015 00:55:42 +0000 Subject: [PATCH 037/186] Fix a bug in MachineBlockPlacement that may cause assertion failure during BranchProbability construction. The root cause is the rounding behavior in BranchProbability construction. We may consider to use truncation instead in the future. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254356 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/MachineBlockPlacement.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp index ddddd483e801..fcddf346cf68 100644 --- a/lib/CodeGen/MachineBlockPlacement.cpp +++ b/lib/CodeGen/MachineBlockPlacement.cpp @@ -423,9 +423,13 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, DEBUG(dbgs() << "Attempting merge from: " << getBlockName(BB) << "\n"); for (MachineBasicBlock *Succ : Successors) { - BranchProbability SuccProb( - MBPI->getEdgeProbability(BB, Succ).getNumerator(), - AdjustedSumProb.getNumerator()); + BranchProbability SuccProb; + uint32_t SuccProbN = MBPI->getEdgeProbability(BB, Succ).getNumerator(); + uint32_t SuccProbD = AdjustedSumProb.getNumerator(); + if (SuccProbN >= SuccProbD) + SuccProb = BranchProbability::getOne(); + else + SuccProb = BranchProbability(SuccProbN, SuccProbD); // If we outline optional branches, look whether Succ is unavoidable, i.e. // dominates all terminators of the MachineFunction. 
If it does, other From a825191bcb5a553d4333464592353842d601988f Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Tue, 1 Dec 2015 00:57:05 +0000 Subject: [PATCH 038/186] [llvm-dwp] Add missing dependency from llvm tests on the llvm-dwp tool git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254357 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 9bee504efece..138450ba8e02 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -39,6 +39,7 @@ set(LLVM_TEST_DEPENDS llvm-dis llvm-dsymutil llvm-dwarfdump + llvm-dwp llvm-extract llvm-lib llvm-link From 316c5ff46f55ca4bff15829f504f72da48acfc1d Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Tue, 1 Dec 2015 01:07:20 +0000 Subject: [PATCH 039/186] [llvm-dwp] Add missing Makefile for the old configure+make build git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254358 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/llvm-dwp/Makefile | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 tools/llvm-dwp/Makefile diff --git a/tools/llvm-dwp/Makefile b/tools/llvm-dwp/Makefile new file mode 100644 index 000000000000..826371ecf915 --- /dev/null +++ b/tools/llvm-dwp/Makefile @@ -0,0 +1,18 @@ +##===- tools/llvm-dwp/Makefile -----------------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL := ../.. +TOOLNAME := llvm-dwp +LINK_COMPONENTS := all-targets AsmPrinter DebugInfoDWARF MC Object Support + +# This tool has no plugins, optimize startup time. 
+TOOL_NO_EXPORTS := 1 + +include $(LEVEL)/Makefile.common + From be0d74465db88aa17438edbd8389180b19c47bf3 Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Tue, 1 Dec 2015 01:14:58 +0000 Subject: [PATCH 040/186] check-llvm: Introduce the new feature "tls". git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254360 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/lit.cfg | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/lit.cfg b/test/lit.cfg index 3f710debf23a..b77f892d4d92 100644 --- a/test/lit.cfg +++ b/test/lit.cfg @@ -374,6 +374,10 @@ if config.target_triple: if config.host_triple == config.target_triple: config.available_features.add("native") +# Not set for targeting tls-incapable targets. +if not re.match(r'.*-cygwin$', config.target_triple): + config.available_features.add('tls') + import subprocess def have_ld_plugin_support(): From a1d72288c959c435c11fc74f5338aeb693709e01 Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Tue, 1 Dec 2015 01:15:03 +0000 Subject: [PATCH 041/186] llvm/test/DebugInfo/Generic/safestack-byval.ll is using tls. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254361 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/DebugInfo/Generic/safestack-byval.ll | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/DebugInfo/Generic/safestack-byval.ll b/test/DebugInfo/Generic/safestack-byval.ll index 01ef064a78b6..24df3815068e 100644 --- a/test/DebugInfo/Generic/safestack-byval.ll +++ b/test/DebugInfo/Generic/safestack-byval.ll @@ -13,6 +13,8 @@ ; return zzz.a[len]; ; } +; REQUIRES: tls + ; CHECK: ![[ZZZ:.*]] = !DILocalVariable(name: "zzz", ; CHECK: ![[ZZZ_EXPR:.*]] = !DIExpression(DW_OP_deref, DW_OP_minus, 400) ; CHECK: DBG_VALUE {{.*}} ![[ZZZ]], ![[ZZZ_EXPR]] From be0d94b85bd29d2947d2cbeef741b2d736daf143 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 1 Dec 2015 02:14:33 +0000 Subject: [PATCH 042/186] Squelch unused variable warning in SIRegisterInfo.cpp. 
Patch by Justin Lebar git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254362 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/SIRegisterInfo.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index d9799812d453..bf87f0225272 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -566,8 +566,9 @@ bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, enum PreloadedValue Value) const { - const AMDGPUSubtarget &ST = MF.getSubtarget(); const SIMachineFunctionInfo *MFI = MF.getInfo(); + const AMDGPUSubtarget &ST = MF.getSubtarget(); + (void)ST; switch (Value) { case SIRegisterInfo::WORKGROUP_ID_X: assert(MFI->hasWorkGroupIDX()); From e1ef1867304e6858a290939d38193ce6e4cfe1ad Mon Sep 17 00:00:00 2001 From: Davide Italiano Date: Tue, 1 Dec 2015 02:35:04 +0000 Subject: [PATCH 043/186] [Windows] Simplify assertion code. NFC. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254363 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Support/Windows/DynamicLibrary.inc | 6 ++---- lib/Support/Windows/Signals.inc | 5 +---- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/lib/Support/Windows/DynamicLibrary.inc b/lib/Support/Windows/DynamicLibrary.inc index 17418b015c75..77146d47cf29 100644 --- a/lib/Support/Windows/DynamicLibrary.inc +++ b/lib/Support/Windows/DynamicLibrary.inc @@ -61,10 +61,8 @@ DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename, OpenedHandles = new DenseSet(); if (!fEnumerateLoadedModules) { - if (!loadDebugHelp()) { - assert(false && "These APIs should always be available"); - return DynamicLibrary(); - } + assert(loadDebugHelp() && "These APIs should always be available"); + return DynamicLibrary(); } fEnumerateLoadedModules(GetCurrentProcess(), ELM_Callback, 0); diff --git a/lib/Support/Windows/Signals.inc b/lib/Support/Windows/Signals.inc index f40ca72996a1..d109a66d7035 100644 --- a/lib/Support/Windows/Signals.inc +++ b/lib/Support/Windows/Signals.inc @@ -405,10 +405,7 @@ static void RegisterHandler() { // If we cannot load up the APIs (which would be unexpected as they should // exist on every version of Windows we support), we will bail out since // there would be nothing to report. - if (!load64BitDebugHelp()) { - assert(false && "These APIs should always be available"); - return; - } + assert(load64BitDebugHelp() && "These APIs should always be available"); if (RegisteredUnhandledExceptionFilter) { EnterCriticalSection(&CriticalSection); From 5d2885f554db8eb89a3e2b376c4d6507d456eed2 Mon Sep 17 00:00:00 2001 From: Davide Italiano Date: Tue, 1 Dec 2015 02:38:42 +0000 Subject: [PATCH 044/186] [Windows] Follow-up r254363, remove return. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254364 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Support/Windows/DynamicLibrary.inc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/Support/Windows/DynamicLibrary.inc b/lib/Support/Windows/DynamicLibrary.inc index 77146d47cf29..e612283e630f 100644 --- a/lib/Support/Windows/DynamicLibrary.inc +++ b/lib/Support/Windows/DynamicLibrary.inc @@ -60,10 +60,8 @@ DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename, if (OpenedHandles == 0) OpenedHandles = new DenseSet(); - if (!fEnumerateLoadedModules) { + if (!fEnumerateLoadedModules) assert(loadDebugHelp() && "These APIs should always be available"); - return DynamicLibrary(); - } fEnumerateLoadedModules(GetCurrentProcess(), ELM_Callback, 0); // Dummy library that represents "search all handles". From 8e83fe2e97c232086674f9f04b00043bccb89629 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Tue, 1 Dec 2015 03:49:42 +0000 Subject: [PATCH 045/186] Revert r254348: "Replace all weight-based interfaces in MBB with probability-based interfaces, and update all uses of old interfaces." and the follow-up r254356: "Fix a bug in MachineBlockPlacement that may cause assertion failure during BranchProbability construction." Asserts were firing in Chromium builds. See PR25687. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254366 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/BranchProbabilityInfo.h | 3 - include/llvm/CodeGen/MachineBasicBlock.h | 46 +++++- .../CodeGen/MachineBranchProbabilityInfo.h | 22 ++- include/llvm/Support/BranchProbability.h | 31 +--- lib/Analysis/BranchProbabilityInfo.cpp | 6 - lib/CodeGen/BranchFolding.cpp | 9 +- lib/CodeGen/IfConversion.cpp | 155 ++++++++++-------- lib/CodeGen/MIRParser/MIParser.cpp | 3 +- lib/CodeGen/MIRPrinter.cpp | 4 +- lib/CodeGen/MachineBasicBlock.cpp | 142 ++++++++++++---- lib/CodeGen/MachineBlockPlacement.cpp | 71 ++++---- lib/CodeGen/MachineBranchProbabilityInfo.cpp | 82 ++++++--- lib/CodeGen/TailDuplication.cpp | 6 +- lib/Support/BranchProbability.cpp | 20 +-- lib/Target/AMDGPU/AMDILCFGStructurizer.cpp | 6 +- lib/Target/ARM/ARMConstantIslandPass.cpp | 3 +- lib/Target/ARM/ARMISelLowering.cpp | 2 +- lib/Target/Hexagon/HexagonCFGOptimizer.cpp | 6 +- lib/Target/Mips/MipsLongBranch.cpp | 3 +- test/CodeGen/ARM/ifcvt-branch-weight-bug.ll | 14 +- test/CodeGen/ARM/ifcvt-branch-weight.ll | 2 +- test/CodeGen/ARM/ifcvt-iter-indbr.ll | 10 +- test/CodeGen/ARM/tail-merge-branch-weight.ll | 2 +- test/CodeGen/ARM/taildup-branch-weight.ll | 4 +- test/CodeGen/Generic/MachineBranchProb.ll | 8 +- test/CodeGen/Hexagon/ifcvt-edge-weight.ll | 2 +- test/CodeGen/MIR/X86/newline-handling.mir | 4 +- .../X86/successor-basic-blocks-weights.mir | 6 +- .../MIR/X86/successor-basic-blocks.mir | 4 +- test/CodeGen/X86/MachineBranchProb.ll | 4 +- test/CodeGen/X86/catchpad-weight.ll | 2 +- test/CodeGen/X86/stack-protector-weight.ll | 4 +- test/CodeGen/X86/switch-edge-weight.ll | 22 +-- test/CodeGen/X86/switch-jump-table.ll | 8 +- 34 files changed, 420 insertions(+), 296 deletions(-) diff --git a/include/llvm/Analysis/BranchProbabilityInfo.h b/include/llvm/Analysis/BranchProbabilityInfo.h index 69dae5e90785..89dec14b2b3e 100644 --- a/include/llvm/Analysis/BranchProbabilityInfo.h +++ 
b/include/llvm/Analysis/BranchProbabilityInfo.h @@ -61,9 +61,6 @@ class BranchProbabilityInfo { BranchProbability getEdgeProbability(const BasicBlock *Src, const BasicBlock *Dst) const; - BranchProbability getEdgeProbability(const BasicBlock *Src, - succ_const_iterator Dst) const; - /// \brief Test if an edge is hot relative to other out-edges of the Src. /// /// Check whether this edge out of the source block is 'hot'. We define hot diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h index ac87f4f901f5..a2b1a850ec76 100644 --- a/include/llvm/CodeGen/MachineBasicBlock.h +++ b/include/llvm/CodeGen/MachineBasicBlock.h @@ -91,6 +91,13 @@ class MachineBasicBlock std::vector Predecessors; std::vector Successors; + /// Keep track of the weights to the successors. This vector has the same + /// order as Successors, or it is empty if we don't use it (disable + /// optimization). + std::vector Weights; + typedef std::vector::iterator weight_iterator; + typedef std::vector::const_iterator const_weight_iterator; + /// Keep track of the probabilities to the successors. This vector has the /// same order as Successors, or it is empty if we don't use it (disable /// optimization). @@ -433,16 +440,26 @@ class MachineBasicBlock // Machine-CFG mutators + /// Add Succ as a successor of this MachineBasicBlock. The Predecessors list + /// of Succ is automatically updated. WEIGHT parameter is stored in Weights + /// list and it may be used by MachineBranchProbabilityInfo analysis to + /// calculate branch probability. + /// + /// Note that duplicate Machine CFG edges are not allowed. + void addSuccessor(MachineBasicBlock *Succ, uint32_t Weight = 0); + + /// Add Succ as a successor of this MachineBasicBlock. The Predecessors list + /// of Succ is automatically updated. The weight is not provided because BPI + /// is not available (e.g. -O0 is used), in which case edge weights won't be + /// used. Using this interface can save some space. 
+ void addSuccessorWithoutWeight(MachineBasicBlock *Succ); + /// Add Succ as a successor of this MachineBasicBlock. The Predecessors list /// of Succ is automatically updated. PROB parameter is stored in - /// Probabilities list. The default probability is set as unknown. Mixing - /// known and unknown probabilities in successor list is not allowed. When all - /// successors have unknown probabilities, 1 / N is returned as the - /// probability for each successor, where N is the number of successors. + /// Probabilities list. /// /// Note that duplicate Machine CFG edges are not allowed. - void addSuccessor(MachineBasicBlock *Succ, - BranchProbability Prob = BranchProbability::getUnknown()); + void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob); /// Add Succ as a successor of this MachineBasicBlock. The Predecessors list /// of Succ is automatically updated. The probability is not provided because @@ -450,6 +467,9 @@ class MachineBasicBlock /// won't be used. Using this interface can save some space. void addSuccessorWithoutProb(MachineBasicBlock *Succ); + /// Set successor weight of a given iterator. + void setSuccWeight(succ_iterator I, uint32_t Weight); + /// Set successor probability of a given iterator. void setSuccProbability(succ_iterator I, BranchProbability Prob); @@ -468,7 +488,7 @@ class MachineBasicBlock /// Return the iterator to the element after the one removed. succ_iterator removeSuccessor(succ_iterator I); - /// Replace successor OLD with NEW and update probability info. + /// Replace successor OLD with NEW and update weight info. void replaceSuccessor(MachineBasicBlock *Old, MachineBasicBlock *New); /// Transfers all the successors from MBB to this machine basic block (i.e., @@ -480,6 +500,9 @@ class MachineBasicBlock /// operands in the successor blocks which refer to FromMBB to refer to this. void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB); + /// Return true if any of the successors have weights attached to them. 
+ bool hasSuccessorWeights() const { return !Weights.empty(); } + /// Return true if any of the successors have probabilities attached to them. bool hasSuccessorProbabilities() const { return !Probs.empty(); } @@ -736,6 +759,10 @@ class MachineBasicBlock private: + /// Return weight iterator corresponding to the I successor iterator. + weight_iterator getWeightIterator(succ_iterator I); + const_weight_iterator getWeightIterator(const_succ_iterator I) const; + /// Return probability iterator corresponding to the I successor iterator. probability_iterator getProbabilityIterator(succ_iterator I); const_probability_iterator @@ -744,6 +771,11 @@ class MachineBasicBlock friend class MachineBranchProbabilityInfo; friend class MIPrinter; + /// Return weight of the edge from this block to MBB. This method should NOT + /// be called directly, but by using getEdgeWeight method from + /// MachineBranchProbabilityInfo class. + uint32_t getSuccWeight(const_succ_iterator Succ) const; + /// Return probability of the edge from this block to MBB. This method should /// NOT be called directly, but by using getEdgeProbability method from /// MachineBranchProbabilityInfo class. diff --git a/include/llvm/CodeGen/MachineBranchProbabilityInfo.h b/include/llvm/CodeGen/MachineBranchProbabilityInfo.h index 608e8d257874..058ab32f3aa9 100644 --- a/include/llvm/CodeGen/MachineBranchProbabilityInfo.h +++ b/include/llvm/CodeGen/MachineBranchProbabilityInfo.h @@ -55,15 +55,10 @@ class MachineBranchProbabilityInfo : public ImmutablePass { uint32_t getEdgeWeight(const MachineBasicBlock *Src, MachineBasicBlock::const_succ_iterator Dst) const; - // Return edge probability. - BranchProbability getEdgeProbability(const MachineBasicBlock *Src, - const MachineBasicBlock *Dst) const; - - // Same as above, but using a const_succ_iterator from Src. This is faster - // when the iterator is already available. 
- BranchProbability - getEdgeProbability(const MachineBasicBlock *Src, - MachineBasicBlock::const_succ_iterator Dst) const; + // Get sum of the block successors' weights, potentially scaling them to fit + // within 32-bits. If scaling is required, sets Scale based on the necessary + // adjustment. Any edge weights used with the sum should be divided by Scale. + uint32_t getSumForBlock(const MachineBasicBlock *MBB, uint32_t &Scale) const; // A 'Hot' edge is an edge which probability is >= 80%. bool isEdgeHot(const MachineBasicBlock *Src, @@ -73,6 +68,15 @@ class MachineBranchProbabilityInfo : public ImmutablePass { // NB: This routine's complexity is linear on the number of successors. MachineBasicBlock *getHotSucc(MachineBasicBlock *MBB) const; + // Return a probability as a fraction between 0 (0% probability) and + // 1 (100% probability), however the value is never equal to 0, and can be 1 + // only iff SRC block has only one successor. + // NB: This routine's complexity is linear on the number of successors of + // Src. Querying sequentially for each successor's probability is a quadratic + // query pattern. + BranchProbability getEdgeProbability(const MachineBasicBlock *Src, + const MachineBasicBlock *Dst) const; + // Print value between 0 (0% probability) and 1 (100% probability), // however the value is never equal to 0, and can be 1 only iff SRC block // has only one successor. diff --git a/include/llvm/Support/BranchProbability.h b/include/llvm/Support/BranchProbability.h index 2548384f346b..3620d4d5d772 100644 --- a/include/llvm/Support/BranchProbability.h +++ b/include/llvm/Support/BranchProbability.h @@ -53,9 +53,6 @@ class BranchProbability { // Create a BranchProbability object with the given numerator and 1<<31 // as denominator. static BranchProbability getRaw(uint32_t N) { return BranchProbability(N); } - // Create a BranchProbability object from 64-bit integers. 
- static BranchProbability getBranchProbability(uint64_t Numerator, - uint64_t Denominator); // Normalize given probabilties so that the sum of them becomes approximate // one. @@ -134,30 +131,10 @@ class BranchProbability { bool operator==(BranchProbability RHS) const { return N == RHS.N; } bool operator!=(BranchProbability RHS) const { return !(*this == RHS); } - - bool operator<(BranchProbability RHS) const { - assert(N != UnknownN && RHS.N != UnknownN && - "Unknown probability cannot participate in comparisons."); - return N < RHS.N; - } - - bool operator>(BranchProbability RHS) const { - assert(N != UnknownN && RHS.N != UnknownN && - "Unknown probability cannot participate in comparisons."); - return RHS < *this; - } - - bool operator<=(BranchProbability RHS) const { - assert(N != UnknownN && RHS.N != UnknownN && - "Unknown probability cannot participate in comparisons."); - return !(RHS < *this); - } - - bool operator>=(BranchProbability RHS) const { - assert(N != UnknownN && RHS.N != UnknownN && - "Unknown probability cannot participate in comparisons."); - return !(*this < RHS); - } + bool operator<(BranchProbability RHS) const { return N < RHS.N; } + bool operator>(BranchProbability RHS) const { return RHS < *this; } + bool operator<=(BranchProbability RHS) const { return !(RHS < *this); } + bool operator>=(BranchProbability RHS) const { return !(*this < RHS); } }; inline raw_ostream &operator<<(raw_ostream &OS, BranchProbability Prob) { diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp index 6cdf43a06a9f..f48394698699 100644 --- a/lib/Analysis/BranchProbabilityInfo.cpp +++ b/lib/Analysis/BranchProbabilityInfo.cpp @@ -647,12 +647,6 @@ getEdgeProbability(const BasicBlock *Src, const BasicBlock *Dst) const { return BranchProbability(N, D); } -BranchProbability -BranchProbabilityInfo::getEdgeProbability(const BasicBlock *Src, - succ_const_iterator Dst) const { - return getEdgeProbability(Src, 
Dst.getSuccessorIndex()); -} - raw_ostream & BranchProbabilityInfo::printEdgeProbability(raw_ostream &OS, const BasicBlock *Src, diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp index ba21d9cc9d55..0b2495cc996e 100644 --- a/lib/CodeGen/BranchFolding.cpp +++ b/lib/CodeGen/BranchFolding.cpp @@ -1099,16 +1099,13 @@ void BranchFolder::setCommonTailEdgeWeights(MachineBasicBlock &TailMBB) { if (TailMBB.succ_size() <= 1) return; - auto SumEdgeFreq = - std::accumulate(EdgeFreqLs.begin(), EdgeFreqLs.end(), BlockFrequency(0)) - .getFrequency(); + auto MaxEdgeFreq = *std::max_element(EdgeFreqLs.begin(), EdgeFreqLs.end()); + uint64_t Scale = MaxEdgeFreq.getFrequency() / UINT32_MAX + 1; auto EdgeFreq = EdgeFreqLs.begin(); for (auto SuccI = TailMBB.succ_begin(), SuccE = TailMBB.succ_end(); SuccI != SuccE; ++SuccI, ++EdgeFreq) - TailMBB.setSuccProbability( - SuccI, BranchProbability::getBranchProbability(EdgeFreq->getFrequency(), - SumEdgeFreq)); + TailMBB.setSuccWeight(SuccI, EdgeFreq->getFrequency() / Scale); } //===----------------------------------------------------------------------===// diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp index ff28f95cc33d..0b2f3ea165f8 100644 --- a/lib/CodeGen/IfConversion.cpp +++ b/lib/CodeGen/IfConversion.cpp @@ -32,7 +32,6 @@ #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" -#include using namespace llvm; @@ -1152,6 +1151,28 @@ bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) { return true; } +/// Scale down weights to fit into uint32_t. NewTrue is the new weight +/// for successor TrueBB, and NewFalse is the new weight for successor +/// FalseBB. +static void ScaleWeights(uint64_t NewTrue, uint64_t NewFalse, + MachineBasicBlock *MBB, + const MachineBasicBlock *TrueBB, + const MachineBasicBlock *FalseBB, + const MachineBranchProbabilityInfo *MBPI) { + uint64_t NewMax = (NewTrue > NewFalse) ? 
NewTrue : NewFalse; + uint32_t Scale = (NewMax / UINT32_MAX) + 1; + for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(), + SE = MBB->succ_end(); + SI != SE; ++SI) { + if (*SI == TrueBB) + MBB->setSuccWeight(SI, (uint32_t)(NewTrue / Scale)); + else if (*SI == FalseBB) + MBB->setSuccWeight(SI, (uint32_t)(NewFalse / Scale)); + else + MBB->setSuccWeight(SI, MBPI->getEdgeWeight(MBB, SI) / Scale); + } +} + /// IfConvertTriangle - If convert a triangle sub-CFG. /// bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { @@ -1208,14 +1229,16 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { DontKill.clear(); bool HasEarlyExit = CvtBBI->FalseBB != nullptr; - BranchProbability CvtNext, CvtFalse, BBNext, BBCvt; + uint64_t CvtNext = 0, CvtFalse = 0, BBNext = 0, BBCvt = 0, SumWeight = 0; + uint32_t WeightScale = 0; if (HasEarlyExit) { - // Get probabilities before modifying CvtBBI->BB and BBI.BB. - CvtNext = MBPI->getEdgeProbability(CvtBBI->BB, NextBBI->BB); - CvtFalse = MBPI->getEdgeProbability(CvtBBI->BB, CvtBBI->FalseBB); - BBNext = MBPI->getEdgeProbability(BBI.BB, NextBBI->BB); - BBCvt = MBPI->getEdgeProbability(BBI.BB, CvtBBI->BB); + // Get weights before modifying CvtBBI->BB and BBI.BB. + CvtNext = MBPI->getEdgeWeight(CvtBBI->BB, NextBBI->BB); + CvtFalse = MBPI->getEdgeWeight(CvtBBI->BB, CvtBBI->FalseBB); + BBNext = MBPI->getEdgeWeight(BBI.BB, NextBBI->BB); + BBCvt = MBPI->getEdgeWeight(BBI.BB, CvtBBI->BB); + SumWeight = MBPI->getSumForBlock(CvtBBI->BB, WeightScale); } if (CvtBBI->BB->pred_size() > 1) { @@ -1243,24 +1266,22 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { CvtBBI->BrCond.end()); if (TII->ReverseBranchCondition(RevCond)) llvm_unreachable("Unable to reverse branch condition!"); - - // Update the edge probability for both CvtBBI->FalseBB and NextBBI. 
- // NewNext = New_Prob(BBI.BB, NextBBI->BB) = - // Prob(BBI.BB, NextBBI->BB) + - // Prob(BBI.BB, CvtBBI->BB) * Prob(CvtBBI->BB, NextBBI->BB) - // NewFalse = New_Prob(BBI.BB, CvtBBI->FalseBB) = - // Prob(BBI.BB, CvtBBI->BB) * Prob(CvtBBI->BB, CvtBBI->FalseBB) - auto NewTrueBB = getNextBlock(BBI.BB); - auto NewNext = BBNext + BBCvt * CvtNext; - auto NewTrueBBIter = - std::find(BBI.BB->succ_begin(), BBI.BB->succ_end(), NewTrueBB); - assert(NewTrueBBIter != BBI.BB->succ_end() && - "NewTrueBB is not a successor of BBI.BB."); - BBI.BB->setSuccProbability(NewTrueBBIter, NewNext); - - auto NewFalse = BBCvt * CvtFalse; TII->InsertBranch(*BBI.BB, CvtBBI->FalseBB, nullptr, RevCond, dl); - BBI.BB->addSuccessor(CvtBBI->FalseBB, NewFalse); + BBI.BB->addSuccessor(CvtBBI->FalseBB); + // Update the edge weight for both CvtBBI->FalseBB and NextBBI. + // New_Weight(BBI.BB, NextBBI->BB) = + // Weight(BBI.BB, NextBBI->BB) * getSumForBlock(CvtBBI->BB) + + // Weight(BBI.BB, CvtBBI->BB) * Weight(CvtBBI->BB, NextBBI->BB) + // New_Weight(BBI.BB, CvtBBI->FalseBB) = + // Weight(BBI.BB, CvtBBI->BB) * Weight(CvtBBI->BB, CvtBBI->FalseBB) + + uint64_t NewNext = BBNext * SumWeight + (BBCvt * CvtNext) / WeightScale; + uint64_t NewFalse = (BBCvt * CvtFalse) / WeightScale; + // We need to scale down all weights of BBI.BB to fit uint32_t. + // Here BBI.BB is connected to CvtBBI->FalseBB and will fall through to + // the next block. 
+ ScaleWeights(NewNext, NewFalse, BBI.BB, getNextBlock(BBI.BB), + CvtBBI->FalseBB, MBPI); } // Merge in the 'false' block if the 'false' block has no other @@ -1503,7 +1524,7 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind, MergeBlocks(BBI, TailBBI); TailBBI.IsDone = true; } else { - BBI.BB->addSuccessor(TailBB, BranchProbability::getOne()); + BBI.BB->addSuccessor(TailBB); InsertUncondBranch(BBI.BB, TailBB, TII); BBI.HasFallThrough = false; } @@ -1667,26 +1688,21 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) { FromBBI.BB->succ_end()); MachineBasicBlock *NBB = getNextBlock(FromBBI.BB); MachineBasicBlock *FallThrough = FromBBI.HasFallThrough ? NBB : nullptr; - // The edge probability from ToBBI.BB to FromBBI.BB, which is only needed when - // AddEdges is true and FromBBI.BB is a successor of ToBBI.BB. - auto To2FromProb = BranchProbability::getZero(); - if (AddEdges && ToBBI.BB->isSuccessor(FromBBI.BB)) { - To2FromProb = MBPI->getEdgeProbability(ToBBI.BB, FromBBI.BB); - // Set the edge probability from ToBBI.BB to FromBBI.BB to zero to avoid the - // edge probability being merged to other edges when this edge is removed - // later. - ToBBI.BB->setSuccProbability( - std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), FromBBI.BB), - BranchProbability::getZero()); - } + // The edge weight from ToBBI.BB to FromBBI.BB, which is only needed when + // AddEdges is true and FromBBI.BB is a successor of ToBBI.BB. + uint32_t To2FromWeight = 0; + // WeightScale and SumWeight are for calculating successor probabilities of + // FromBBI.BB. + uint32_t WeightScale = 0; + uint32_t SumWeight = 0; if (AddEdges && ToBBI.BB->isSuccessor(FromBBI.BB)) { - // Set the edge probability from ToBBI.BB to FromBBI.BB to zero to avoid the - // edge probability being merged to other edges when this edge is removed - // later. 
- ToBBI.BB->setSuccProbability( - std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), FromBBI.BB), - BranchProbability::getZero()); + To2FromWeight = MBPI->getEdgeWeight(ToBBI.BB, FromBBI.BB); + // Set the edge weight from ToBBI.BB to FromBBI.BB to zero to avoid the edge + // weight being merged to other edges when this edge is removed later. + ToBBI.BB->setSuccWeight( + std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), FromBBI.BB), 0); + SumWeight = MBPI->getSumForBlock(FromBBI.BB, WeightScale); } for (unsigned i = 0, e = FromSuccs.size(); i != e; ++i) { @@ -1695,38 +1711,39 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) { if (Succ == FallThrough) continue; - auto NewProb = BranchProbability::getZero(); + uint32_t NewWeight = 0; if (AddEdges) { - // Calculate the edge probability for the edge from ToBBI.BB to Succ, - // which is a portion of the edge probability from FromBBI.BB to Succ. The - // portion ratio is the edge probability from ToBBI.BB to FromBBI.BB (if - // FromBBI is a successor of ToBBI.BB. See comment below for excepion). - NewProb = MBPI->getEdgeProbability(FromBBI.BB, Succ); + // Calculate the edge weight for the edge from ToBBI.BB to Succ, which is + // a portion of the edge weight from FromBBI.BB to Succ. The portion ratio + // is the edge probability from ToBBI.BB to FromBBI.BB (if FromBBI is a + // successor of ToBBI.BB. See comment below for excepion). + NewWeight = MBPI->getEdgeWeight(FromBBI.BB, Succ); - // To2FromProb is 0 when FromBBI.BB is not a successor of ToBBI.BB. This + // To2FromWeight is 0 when FromBBI.BB is not a successor of ToBBI.BB. This // only happens when if-converting a diamond CFG and FromBBI.BB is the // tail BB. In this case FromBBI.BB post-dominates ToBBI.BB and hence we - // could just use the probabilities on FromBBI.BB's out-edges when adding - // new successors. 
- if (!To2FromProb.isZero()) - NewProb *= To2FromProb; + // could just use the weights on FromBBI.BB's out-edges when adding new + // successors. + if (To2FromWeight > 0) { + BranchProbability Prob(NewWeight / WeightScale, SumWeight); + NewWeight = Prob.scale(To2FromWeight); + } } FromBBI.BB->removeSuccessor(Succ); if (AddEdges) { - // If the edge from ToBBI.BB to Succ already exists, update the - // probability of this edge by adding NewWeight to it. An example is shown - // below, in which A is ToBBI.BB and B is FromBBI.BB. In this case we - // don't have to set C as A's successor as it already is. We only need to - // update the edge probability on A->C. Note that B will not be - // immediately removed from A's successors. It is possible that B->D is - // not removed either if D is a fallthrough of B. Later the edge A->D - // (generated here) and B->D will be combined into one edge. To maintain - // correct edge probability of this combined edge, we need to set the edge - // probability of A->B to zero, which is already done above. The edge - // probability on A->D is calculated by scaling the original probability - // on A->B by the probability of B->D. + // If the edge from ToBBI.BB to Succ already exists, update the weight of + // this edge by adding NewWeight to it. An example is shown below, in + // which A is ToBBI.BB and B is FromBBI.BB. In this case we don't have to + // set C as A's successor as it already is. We only need to update the + // edge weight on A->C. Note that B will not be immediately removed from + // A's successors. It is possible that B->D is not removed either if D is + // a fallthrough of B. Later the edge A->D (generated here) and B->D will + // be combined into one edge. To maintain correct edge weight of this + // combined edge, we need to set the edge weight of A->B to zero, which is + // already done above. The edge weight on A->D is calculated by scaling + // the original weight on A->B by the probability of B->D. 
// // Before ifcvt: After ifcvt (assume B->D is kept): // @@ -1738,11 +1755,11 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) { // C D C D // if (ToBBI.BB->isSuccessor(Succ)) - ToBBI.BB->setSuccProbability( + ToBBI.BB->setSuccWeight( std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), Succ), - MBPI->getEdgeProbability(ToBBI.BB, Succ) + NewProb); + MBPI->getEdgeWeight(ToBBI.BB, Succ) + NewWeight); else - ToBBI.BB->addSuccessor(Succ, NewProb); + ToBBI.BB->addSuccessor(Succ, NewWeight); } } diff --git a/lib/CodeGen/MIRParser/MIParser.cpp b/lib/CodeGen/MIRParser/MIParser.cpp index c9c2d62cec30..5a8e96df7603 100644 --- a/lib/CodeGen/MIRParser/MIParser.cpp +++ b/lib/CodeGen/MIRParser/MIParser.cpp @@ -459,9 +459,8 @@ bool MIParser::parseBasicBlockSuccessors(MachineBasicBlock &MBB) { if (expectAndConsume(MIToken::rparen)) return true; } - MBB.addSuccessor(SuccMBB, BranchProbability::getRaw(Weight)); + MBB.addSuccessor(SuccMBB, Weight); } while (consumeIfPresent(MIToken::comma)); - MBB.normalizeSuccProbs(); return false; } diff --git a/lib/CodeGen/MIRPrinter.cpp b/lib/CodeGen/MIRPrinter.cpp index 175cb0d51437..0be7807064fb 100644 --- a/lib/CodeGen/MIRPrinter.cpp +++ b/lib/CodeGen/MIRPrinter.cpp @@ -461,8 +461,8 @@ void MIPrinter::print(const MachineBasicBlock &MBB) { if (I != MBB.succ_begin()) OS << ", "; printMBBReference(**I); - if (MBB.hasSuccessorProbabilities()) - OS << '(' << MBB.getSuccProbability(I) << ')'; + if (MBB.hasSuccessorWeights()) + OS << '(' << MBB.getSuccWeight(I) << ')'; } OS << "\n"; HasLineAttributes = true; diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index c9c6a9d62462..602b75182fca 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -319,8 +319,8 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST, OS << " Successors according to CFG:"; for (const_succ_iterator SI = succ_begin(), E = succ_end(); SI != E; ++SI) { OS << 
" BB#" << (*SI)->getNumber(); - if (!Probs.empty()) - OS << '(' << *getProbabilityIterator(SI) << ')'; + if (!Weights.empty()) + OS << '(' << *getWeightIterator(SI) << ')'; } OS << '\n'; } @@ -506,16 +506,34 @@ void MachineBasicBlock::updateTerminator() { } } +void MachineBasicBlock::addSuccessor(MachineBasicBlock *Succ, uint32_t Weight) { + // Weight list is either empty (if successor list isn't empty, this means + // disabled optimization) or has the same size as successor list. + if (!(Weights.empty() && !Successors.empty())) + Weights.push_back(Weight); + Successors.push_back(Succ); + Succ->addPredecessor(this); +} + +void MachineBasicBlock::addSuccessorWithoutWeight(MachineBasicBlock *Succ) { + // We need to make sure weight list is either empty or has the same size of + // successor list. When this function is called, we can safely delete all + // weight in the list. + Weights.clear(); + Successors.push_back(Succ); + Succ->addPredecessor(this); +} + void MachineBasicBlock::addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob) { // Probability list is either empty (if successor list isn't empty, this means // disabled optimization) or has the same size as successor list. if (!(Probs.empty() && !Successors.empty())) { - assert((Probs.empty() || (Prob.isUnknown() && Probs.back().isUnknown()) || - (!Prob.isUnknown() && !Probs.back().isUnknown())) && - "Successors with both known and unknwon probabilities are not " - "allowed."); Probs.push_back(Prob); + // FIXME: Temporarily use the numerator of the probability to represent edge + // weight. This will be removed once all weight-version interfaces in MBB + // are replaced with probability-version interfaces. + Weights.push_back(Prob.getNumerator()); } Successors.push_back(Succ); Succ->addPredecessor(this); @@ -526,6 +544,7 @@ void MachineBasicBlock::addSuccessorWithoutProb(MachineBasicBlock *Succ) { // of successor list. When this function is called, we can safely delete all // probability in the list. 
Probs.clear(); + Weights.clear(); Successors.push_back(Succ); Succ->addPredecessor(this); } @@ -539,12 +558,23 @@ MachineBasicBlock::succ_iterator MachineBasicBlock::removeSuccessor(succ_iterator I) { assert(I != Successors.end() && "Not a current successor!"); + // If Weight list is empty it means we don't use it (disabled optimization). + if (!Weights.empty()) { + weight_iterator WI = getWeightIterator(I); + Weights.erase(WI); + } + + // FIXME: Temporarily comment the following code as probabilities are now only + // used during instruction lowering, but this interface is called in later + // passes. Uncomment it once all edge weights are replaced with probabilities. +#if 0 // If probability list is empty it means we don't use it (disabled // optimization). if (!Probs.empty()) { probability_iterator WI = getProbabilityIterator(I); Probs.erase(WI); } +#endif (*I)->removePredecessor(this); return Successors.erase(I); @@ -581,12 +611,17 @@ void MachineBasicBlock::replaceSuccessor(MachineBasicBlock *Old, } // New is already a successor. + // Update its weight instead of adding a duplicate edge. + if (!Weights.empty()) + *getWeightIterator(NewI) += *getWeightIterator(OldI); + // FIXME: Temporarily comment the following code as probabilities are now only + // used during instruction lowering, but this interface is called in later + // passes. Uncomment it once all edge weights are replaced with probabilities. +#if 0 // Update its probability instead of adding a duplicate edge. 
- if (!Probs.empty()) { - auto ProbIter = getProbabilityIterator(NewI); - if (!ProbIter->isUnknown()) - *ProbIter += *getProbabilityIterator(OldI); - } + if (!Probs.empty()) + *getProbabilityIterator(NewI) += *getProbabilityIterator(OldI); +#endif removeSuccessor(OldI); } @@ -606,14 +641,13 @@ void MachineBasicBlock::transferSuccessors(MachineBasicBlock *FromMBB) { while (!FromMBB->succ_empty()) { MachineBasicBlock *Succ = *FromMBB->succ_begin(); + uint32_t Weight = 0; - // If probability list is empty it means we don't use it (disabled optimization). - if (!FromMBB->Probs.empty()) { - auto Prob = *FromMBB->Probs.begin(); - addSuccessor(Succ, Prob); - } else - addSuccessorWithoutProb(Succ); + // If Weight list is empty it means we don't use it (disabled optimization). + if (!FromMBB->Weights.empty()) + Weight = *FromMBB->Weights.begin(); + addSuccessor(Succ, Weight); FromMBB->removeSuccessor(Succ); } } @@ -625,11 +659,10 @@ MachineBasicBlock::transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB) { while (!FromMBB->succ_empty()) { MachineBasicBlock *Succ = *FromMBB->succ_begin(); - if (!FromMBB->Probs.empty()) { - auto Prob = *FromMBB->Probs.begin(); - addSuccessor(Succ, Prob); - } else - addSuccessorWithoutProb(Succ); + uint32_t Weight = 0; + if (!FromMBB->Weights.empty()) + Weight = *FromMBB->Weights.begin(); + addSuccessor(Succ, Weight); FromMBB->removeSuccessor(Succ); // Fix up any PHI nodes in the successor. @@ -1113,32 +1146,65 @@ MachineBasicBlock::findDebugLoc(instr_iterator MBBI) { return DL; } -/// Return probability of the edge from this block to MBB. +/// Return weight of the edge from this block to MBB. +uint32_t MachineBasicBlock::getSuccWeight(const_succ_iterator Succ) const { + if (Weights.empty()) + return 0; + + return *getWeightIterator(Succ); +} + +/// Return probability of the edge from this block to MBB. If probability list +/// is empty, return a default probability which is 1/N, where N is the number +/// of successors. 
If the probability of the given successor is unknown, then +/// sum up all known probabilities and return the complement of the sum divided +/// by the number of unknown probabilities. BranchProbability MachineBasicBlock::getSuccProbability(const_succ_iterator Succ) const { - if (Probs.empty() || Probs.back().isUnknown()) + if (Probs.empty()) return BranchProbability(1, succ_size()); - return *getProbabilityIterator(Succ); + auto Prob = *getProbabilityIterator(Succ); + assert(!Prob.isUnknown()); + return Prob; +} + +/// Set successor weight of a given iterator. +void MachineBasicBlock::setSuccWeight(succ_iterator I, uint32_t Weight) { + if (Weights.empty()) + return; + *getWeightIterator(I) = Weight; } /// Set successor probability of a given iterator. void MachineBasicBlock::setSuccProbability(succ_iterator I, BranchProbability Prob) { assert(!Prob.isUnknown()); - if (Probs.empty()) + if (Probs.empty() || Weights.empty()) return; *getProbabilityIterator(I) = Prob; + // FIXME: Temporarily use the numerator of the probability to represent edge + // weight. This will be removed once all weight-version interfaces in MBB + // are replaces with probability-version interfaces. + *getWeightIterator(I) = Prob.getNumerator(); } -/// Return probability iterator corresonding to the I successor iterator -MachineBasicBlock::const_probability_iterator -MachineBasicBlock::getProbabilityIterator( - MachineBasicBlock::const_succ_iterator I) const { - assert(Probs.size() == Successors.size() && "Async probability list!"); +/// Return wight iterator corresonding to the I successor iterator. 
+MachineBasicBlock::weight_iterator MachineBasicBlock:: +getWeightIterator(MachineBasicBlock::succ_iterator I) { + assert(Weights.size() == Successors.size() && "Async weight list!"); + size_t index = std::distance(Successors.begin(), I); + assert(index < Weights.size() && "Not a current successor!"); + return Weights.begin() + index; +} + +/// Return wight iterator corresonding to the I successor iterator. +MachineBasicBlock::const_weight_iterator MachineBasicBlock:: +getWeightIterator(MachineBasicBlock::const_succ_iterator I) const { + assert(Weights.size() == Successors.size() && "Async weight list!"); const size_t index = std::distance(Successors.begin(), I); - assert(index < Probs.size() && "Not a current successor!"); - return Probs.begin() + index; + assert(index < Weights.size() && "Not a current successor!"); + return Weights.begin() + index; } /// Return probability iterator corresonding to the I successor iterator. @@ -1150,6 +1216,16 @@ MachineBasicBlock::getProbabilityIterator(MachineBasicBlock::succ_iterator I) { return Probs.begin() + index; } +/// Return probability iterator corresonding to the I successor iterator +MachineBasicBlock::const_probability_iterator +MachineBasicBlock::getProbabilityIterator( + MachineBasicBlock::const_succ_iterator I) const { + assert(Probs.size() == Successors.size() && "Async probability list!"); + const size_t index = std::distance(Successors.begin(), I); + assert(index < Probs.size() && "Not a current successor!"); + return Probs.begin() + index; +} + /// Return whether (physical) register "Reg" has been ined and not ed /// as of just before "MI". 
/// diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp index fcddf346cf68..fba33eb93d5f 100644 --- a/lib/CodeGen/MachineBlockPlacement.cpp +++ b/lib/CodeGen/MachineBlockPlacement.cpp @@ -380,11 +380,19 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, const BranchProbability HotProb(4, 5); // 80% MachineBasicBlock *BestSucc = nullptr; - auto BestProb = BranchProbability::getZero(); - - // Adjust edge probabilities by excluding edges pointing to blocks that is - // either not in BlockFilter or is already in the current chain. Consider the - // following CFG: + // FIXME: Due to the performance of the probability and weight routines in + // the MBPI analysis, we manually compute probabilities using the edge + // weights. This is suboptimal as it means that the somewhat subtle + // definition of edge weight semantics is encoded here as well. We should + // improve the MBPI interface to efficiently support query patterns such as + // this. + uint32_t BestWeight = 0; + uint32_t WeightScale = 0; + uint32_t SumWeight = MBPI->getSumForBlock(BB, WeightScale); + + // Adjust sum of weights by excluding weights on edges pointing to blocks that + // is either not in BlockFilter or is already in the current chain. Consider + // the following CFG: // // --->A // | / \ @@ -398,7 +406,7 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, // HotProb). If we exclude E that is not in BlockFilter when calculating the // probability of C->D, D will be selected and we will get A C D B as the // layout of this loop. 
- auto AdjustedSumProb = BranchProbability::getOne(); + uint32_t AdjustedSumWeight = SumWeight; SmallVector Successors; for (MachineBasicBlock *Succ : BB->successors()) { bool SkipSucc = false; @@ -416,20 +424,15 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, } } if (SkipSucc) - AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ); + AdjustedSumWeight -= MBPI->getEdgeWeight(BB, Succ) / WeightScale; else Successors.push_back(Succ); } DEBUG(dbgs() << "Attempting merge from: " << getBlockName(BB) << "\n"); for (MachineBasicBlock *Succ : Successors) { - BranchProbability SuccProb; - uint32_t SuccProbN = MBPI->getEdgeProbability(BB, Succ).getNumerator(); - uint32_t SuccProbD = AdjustedSumProb.getNumerator(); - if (SuccProbN >= SuccProbD) - SuccProb = BranchProbability::getOne(); - else - SuccProb = BranchProbability(SuccProbN, SuccProbD); + uint32_t SuccWeight = MBPI->getEdgeWeight(BB, Succ); + BranchProbability SuccProb(SuccWeight / WeightScale, AdjustedSumWeight); // If we outline optional branches, look whether Succ is unavoidable, i.e. // dominates all terminators of the MachineFunction. If it does, other @@ -467,7 +470,7 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, // Make sure that a hot successor doesn't have a globally more // important predecessor. - auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ); + BranchProbability RealSuccProb(SuccWeight / WeightScale, SumWeight); BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb * HotProb.getCompl(); bool BadCFGConflict = false; @@ -493,10 +496,10 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, << " (prob)" << (SuccChain.LoopPredecessors != 0 ? 
" (CFG break)" : "") << "\n"); - if (BestSucc && BestProb >= SuccProb) + if (BestSucc && BestWeight >= SuccWeight) continue; BestSucc = Succ; - BestProb = SuccProb; + BestWeight = SuccWeight; } return BestSucc; } @@ -725,6 +728,11 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L, MachineBasicBlock *OldExitingBB = ExitingBB; BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq; bool HasLoopingSucc = false; + // FIXME: Due to the performance of the probability and weight routines in + // the MBPI analysis, we use the internal weights and manually compute the + // probabilities to avoid quadratic behavior. + uint32_t WeightScale = 0; + uint32_t SumWeight = MBPI->getSumForBlock(MBB, WeightScale); for (MachineBasicBlock *Succ : MBB->successors()) { if (Succ->isEHPad()) continue; @@ -738,10 +746,10 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L, continue; } - auto SuccProb = MBPI->getEdgeProbability(MBB, Succ); + uint32_t SuccWeight = MBPI->getEdgeWeight(MBB, Succ); if (LoopBlockSet.count(Succ)) { DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> " - << getBlockName(Succ) << " (" << SuccProb << ")\n"); + << getBlockName(Succ) << " (" << SuccWeight << ")\n"); HasLoopingSucc = true; continue; } @@ -753,6 +761,7 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L, BlocksExitingToOuterLoop.insert(MBB); } + BranchProbability SuccProb(SuccWeight / WeightScale, SumWeight); BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb; DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> " << getBlockName(Succ) << " [L:" << SuccLoopDepth << "] ("; @@ -895,17 +904,21 @@ void MachineBlockPlacement::rotateLoopWithProfile( // edge from the tail of the loop chain. 
SmallVector, 4> ExitsWithFreq; for (auto BB : LoopChain) { - auto LargestExitEdgeProb = BranchProbability::getZero(); + uint32_t LargestExitEdgeWeight = 0; for (auto *Succ : BB->successors()) { BlockChain *SuccChain = BlockToChain[Succ]; if (!LoopBlockSet.count(Succ) && (!SuccChain || Succ == *SuccChain->begin())) { - auto SuccProb = MBPI->getEdgeProbability(BB, Succ); - LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb); + uint32_t SuccWeight = MBPI->getEdgeWeight(BB, Succ); + LargestExitEdgeWeight = std::max(LargestExitEdgeWeight, SuccWeight); } } - if (LargestExitEdgeProb > BranchProbability::getZero()) { - auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb; + if (LargestExitEdgeWeight > 0) { + uint32_t WeightScale = 0; + uint32_t SumWeight = MBPI->getSumForBlock(BB, WeightScale); + auto ExitFreq = + MBFI->getBlockFreq(BB) * + BranchProbability(LargestExitEdgeWeight / WeightScale, SumWeight); ExitsWithFreq.emplace_back(BB, ExitFreq); } } @@ -1277,16 +1290,14 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { } // If PrevBB has a two-way branch, try to re-order the branches - // such that we branch to the successor with higher probability first. + // such that we branch to the successor with higher weight first. 
if (TBB && !Cond.empty() && FBB && - MBPI->getEdgeProbability(PrevBB, FBB) > - MBPI->getEdgeProbability(PrevBB, TBB) && + MBPI->getEdgeWeight(PrevBB, FBB) > MBPI->getEdgeWeight(PrevBB, TBB) && !TII->ReverseBranchCondition(Cond)) { DEBUG(dbgs() << "Reverse order of the two branches: " << getBlockName(PrevBB) << "\n"); - DEBUG(dbgs() << " Edge probability: " - << MBPI->getEdgeProbability(PrevBB, FBB) << " vs " - << MBPI->getEdgeProbability(PrevBB, TBB) << "\n"); + DEBUG(dbgs() << " Edge weight: " << MBPI->getEdgeWeight(PrevBB, FBB) + << " vs " << MBPI->getEdgeWeight(PrevBB, TBB) << "\n"); DebugLoc dl; // FIXME: this is nowhere TII->RemoveBranch(*PrevBB); TII->InsertBranch(*PrevBB, FBB, TBB, Cond, dl); diff --git a/lib/CodeGen/MachineBranchProbabilityInfo.cpp b/lib/CodeGen/MachineBranchProbabilityInfo.cpp index 5478dcba261a..6fbc2be70486 100644 --- a/lib/CodeGen/MachineBranchProbabilityInfo.cpp +++ b/lib/CodeGen/MachineBranchProbabilityInfo.cpp @@ -28,61 +28,91 @@ char MachineBranchProbabilityInfo::ID = 0; void MachineBranchProbabilityInfo::anchor() { } -uint32_t MachineBranchProbabilityInfo::getEdgeWeight( - const MachineBasicBlock *Src, - MachineBasicBlock::const_succ_iterator Dst) const { - return Src->getSuccProbability(Dst).getNumerator(); -} +uint32_t MachineBranchProbabilityInfo:: +getSumForBlock(const MachineBasicBlock *MBB, uint32_t &Scale) const { + // First we compute the sum with 64-bits of precision, ensuring that cannot + // overflow by bounding the number of weights considered. Hopefully no one + // actually needs 2^32 successors. + assert(MBB->succ_size() < UINT32_MAX); + uint64_t Sum = 0; + Scale = 1; + for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(), + E = MBB->succ_end(); I != E; ++I) { + uint32_t Weight = getEdgeWeight(MBB, I); + Sum += Weight; + } -uint32_t MachineBranchProbabilityInfo::getEdgeWeight( - const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { - // This is a linear search. 
Try to use the const_succ_iterator version when - // possible. - return getEdgeWeight(Src, std::find(Src->succ_begin(), Src->succ_end(), Dst)); + // If the computed sum fits in 32-bits, we're done. + if (Sum <= UINT32_MAX) + return Sum; + + // Otherwise, compute the scale necessary to cause the weights to fit, and + // re-sum with that scale applied. + assert((Sum / UINT32_MAX) < UINT32_MAX); + Scale = (Sum / UINT32_MAX) + 1; + Sum = 0; + for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(), + E = MBB->succ_end(); I != E; ++I) { + uint32_t Weight = getEdgeWeight(MBB, I); + Sum += Weight / Scale; + } + assert(Sum <= UINT32_MAX); + return Sum; } -BranchProbability MachineBranchProbabilityInfo::getEdgeProbability( - const MachineBasicBlock *Src, - MachineBasicBlock::const_succ_iterator Dst) const { - return Src->getSuccProbability(Dst); +uint32_t MachineBranchProbabilityInfo:: +getEdgeWeight(const MachineBasicBlock *Src, + MachineBasicBlock::const_succ_iterator Dst) const { + uint32_t Weight = Src->getSuccWeight(Dst); + if (!Weight) + return DEFAULT_WEIGHT; + return Weight; } -BranchProbability MachineBranchProbabilityInfo::getEdgeProbability( - const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { +uint32_t MachineBranchProbabilityInfo:: +getEdgeWeight(const MachineBasicBlock *Src, + const MachineBasicBlock *Dst) const { // This is a linear search. Try to use the const_succ_iterator version when // possible. - return getEdgeProbability(Src, - std::find(Src->succ_begin(), Src->succ_end(), Dst)); + return getEdgeWeight(Src, std::find(Src->succ_begin(), Src->succ_end(), Dst)); } bool MachineBranchProbabilityInfo::isEdgeHot(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { // Hot probability is at least 4/5 = 80% - static BranchProbability HotProb(4, 5); - return getEdgeProbability(Src, Dst) > HotProb; + // FIXME: Compare against a static "hot" BranchProbability. 
+ return getEdgeProbability(Src, Dst) > BranchProbability(4, 5); } MachineBasicBlock * MachineBranchProbabilityInfo::getHotSucc(MachineBasicBlock *MBB) const { - auto MaxProb = BranchProbability::getZero(); + uint32_t MaxWeight = 0; MachineBasicBlock *MaxSucc = nullptr; for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(), E = MBB->succ_end(); I != E; ++I) { - auto Prob = getEdgeProbability(MBB, I); - if (Prob > MaxProb) { - MaxProb = Prob; + uint32_t Weight = getEdgeWeight(MBB, I); + if (Weight > MaxWeight) { + MaxWeight = Weight; MaxSucc = *I; } } - static BranchProbability HotProb(4, 5); - if (getEdgeProbability(MBB, MaxSucc) >= HotProb) + if (getEdgeProbability(MBB, MaxSucc) >= BranchProbability(4, 5)) return MaxSucc; return nullptr; } +BranchProbability MachineBranchProbabilityInfo::getEdgeProbability( + const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { + uint32_t Scale = 1; + uint32_t D = getSumForBlock(Src, Scale); + uint32_t N = getEdgeWeight(Src, Dst) / Scale; + + return BranchProbability(N, D); +} + raw_ostream &MachineBranchProbabilityInfo::printEdgeProbability( raw_ostream &OS, const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { diff --git a/lib/CodeGen/TailDuplication.cpp b/lib/CodeGen/TailDuplication.cpp index 1f5b54866ac6..ff86dabfac59 100644 --- a/lib/CodeGen/TailDuplication.cpp +++ b/lib/CodeGen/TailDuplication.cpp @@ -745,12 +745,12 @@ TailDuplicatePass::duplicateSimpleBB(MachineBasicBlock *TailBB, if (PredTBB) TII->InsertBranch(*PredBB, PredTBB, PredFBB, PredCond, DebugLoc()); - auto Prob = MBPI->getEdgeProbability(PredBB, TailBB); + uint32_t Weight = MBPI->getEdgeWeight(PredBB, TailBB); PredBB->removeSuccessor(TailBB); unsigned NumSuccessors = PredBB->succ_size(); assert(NumSuccessors <= 1); if (NumSuccessors == 0 || *PredBB->succ_begin() != NewTarget) - PredBB->addSuccessor(NewTarget, Prob); + PredBB->addSuccessor(NewTarget, Weight); TDBBs.push_back(PredBB); } @@ -858,7 +858,7 @@ 
TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, "TailDuplicate called on block with multiple successors!"); for (MachineBasicBlock::succ_iterator I = TailBB->succ_begin(), E = TailBB->succ_end(); I != E; ++I) - PredBB->addSuccessor(*I, MBPI->getEdgeProbability(TailBB, I)); + PredBB->addSuccessor(*I, MBPI->getEdgeWeight(TailBB, I)); Changed = true; ++NumTailDups; diff --git a/lib/Support/BranchProbability.cpp b/lib/Support/BranchProbability.cpp index 771d02c0aa3c..3b0f6e6f06e4 100644 --- a/lib/Support/BranchProbability.cpp +++ b/lib/Support/BranchProbability.cpp @@ -22,14 +22,11 @@ using namespace llvm; const uint32_t BranchProbability::D; raw_ostream &BranchProbability::print(raw_ostream &OS) const { - if (isUnknown()) - return OS << "?%"; - // Get a percentage rounded to two decimal digits. This avoids // implementation-defined rounding inside printf. double Percent = rint(((double)N / D) * 100.0 * 100.0) / 100.0; - return OS << format("0x%08" PRIx32 " / 0x%08" PRIx32 " = %.2f%%", N, D, - Percent); + OS << format("0x%08" PRIx32 " / 0x%08" PRIx32 " = %.2f%%", N, D, Percent); + return OS; } void BranchProbability::dump() const { print(dbgs()) << '\n'; } @@ -46,19 +43,6 @@ BranchProbability::BranchProbability(uint32_t Numerator, uint32_t Denominator) { } } -BranchProbability -BranchProbability::getBranchProbability(uint64_t Numerator, - uint64_t Denominator) { - assert(Numerator <= Denominator && "Probability cannot be bigger than 1!"); - // Scale down Denominator to fit in a 32-bit integer. - int Scale = 0; - while (Denominator > UINT32_MAX) { - Denominator >>= 1; - Scale++; - } - return BranchProbability(Numerator >> Scale, Denominator); -} - // If ConstD is not zero, then replace D by ConstD so that division and modulo // operations by D can be optimized, in case this function is not inlined by the // compiler. 
diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index cdbd12092150..f1b383017901 100644 --- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -1570,7 +1570,8 @@ void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk, insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc()); insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc()); - DstBlk->replaceSuccessor(DstBlk, LandMBB); + DstBlk->addSuccessor(LandMBB); + DstBlk->removeSuccessor(DstBlk); } @@ -1665,7 +1666,8 @@ AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB, replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB); //srcBlk, oldBlk, newBlk - PredMBB->replaceSuccessor(MBB, CloneMBB); + PredMBB->removeSuccessor(MBB); + PredMBB->addSuccessor(CloneMBB); // add all successor to cloneBlk cloneSuccessorList(CloneMBB, MBB); diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp index e89757c19ecc..0bf2d374df6a 100644 --- a/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -2274,7 +2274,8 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) { // Update the CFG. NewBB->addSuccessor(BB); - JTBB->replaceSuccessor(BB, NewBB); + JTBB->removeSuccessor(BB); + JTBB->addSuccessor(NewBB); ++NumJTInserted; return NewBB; diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index e8f3ab65bdbe..0cc41812d71c 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -7346,7 +7346,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, } } - BB->addSuccessor(DispatchBB, BranchProbability::getZero()); + BB->addSuccessor(DispatchBB); // Find the invoke call and mark all of the callee-saved registers as // 'implicit defined' so that they're spilled. 
This prevents code from diff --git a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp index efafdd007289..96bb61750805 100644 --- a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp +++ b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp @@ -186,11 +186,13 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) { if (case1 || case2) { InvertAndChangeJumpTarget(MI, UncondTarget); - MBB->replaceSuccessor(JumpAroundTarget, UncondTarget); + MBB->removeSuccessor(JumpAroundTarget); + MBB->addSuccessor(UncondTarget); // Remove the unconditional branch in LayoutSucc. LayoutSucc->erase(LayoutSucc->begin()); - LayoutSucc->replaceSuccessor(UncondTarget, JumpAroundTarget); + LayoutSucc->removeSuccessor(UncondTarget); + LayoutSucc->addSuccessor(JumpAroundTarget); // This code performs the conversion for case 2, which moves // the block to the fall-thru case (BB3 in the code above). diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp index e75858a181e5..d09843ed0e53 100644 --- a/lib/Target/Mips/MipsLongBranch.cpp +++ b/lib/Target/Mips/MipsLongBranch.cpp @@ -262,7 +262,8 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { static_cast(Subtarget.getInstrInfo()); MF->insert(FallThroughMBB, LongBrMBB); - MBB->replaceSuccessor(TgtMBB, LongBrMBB); + MBB->removeSuccessor(TgtMBB); + MBB->addSuccessor(LongBrMBB); if (IsPIC) { MachineBasicBlock *BalTgtMBB = MF->CreateMachineBasicBlock(BB); diff --git a/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll b/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll index a44c9721d6c1..e17da7a97205 100644 --- a/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll +++ b/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll @@ -14,15 +14,15 @@ entry: br i1 undef, label %for.end, label %for.body ; Before if conversion, we have -; for.body -> lor.lhs.false.i (50%) -; -> for.cond.backedge (50%) -; lor.lhs.false.i -> for.cond.backedge (100%) -; -> cond.false.i (0%) +; for.body -> lor.lhs.false.i (62) +; -> 
for.cond.backedge (62) +; lor.lhs.false.i -> for.cond.backedge (1048575) +; -> cond.false.i (1) ; Afer if conversion, we have -; for.body -> for.cond.backedge (100%) -; -> cond.false.i (0%) +; for.body -> for.cond.backedge (130023362) +; -> cond.false.i (62) ; CHECK: BB#1: derived from LLVM BB %for.body -; CHECK: Successors according to CFG: BB#2(0x7ffffc00 / 0x80000000 = 100.00%) BB#4(0x00000400 / 0x80000000 = 0.00%) +; CHECK: Successors according to CFG: BB#2(4294967291) BB#4(2048) for.body: br i1 undef, label %for.cond.backedge, label %lor.lhs.false.i, !prof !1 diff --git a/test/CodeGen/ARM/ifcvt-branch-weight.ll b/test/CodeGen/ARM/ifcvt-branch-weight.ll index 0de039cde23c..f2a1229d0d8a 100644 --- a/test/CodeGen/ARM/ifcvt-branch-weight.ll +++ b/test/CodeGen/ARM/ifcvt-branch-weight.ll @@ -19,7 +19,7 @@ bb: br i1 %9, label %return, label %bb2 ; CHECK: BB#2: derived from LLVM BB %bb2 -; CHECK: Successors according to CFG: BB#3({{[0-9a-fx/= ]+}}50.00%) BB#4({{[0-9a-fx/= ]+}}50.00%) +; CHECK: Successors according to CFG: BB#3(4294967289) BB#4(4294967287) bb2: %v10 = icmp eq i32 %3, 16 diff --git a/test/CodeGen/ARM/ifcvt-iter-indbr.ll b/test/CodeGen/ARM/ifcvt-iter-indbr.ll index a96b6e8a1e83..6ce9bcb56ef4 100644 --- a/test/CodeGen/ARM/ifcvt-iter-indbr.ll +++ b/test/CodeGen/ARM/ifcvt-iter-indbr.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple thumbv7s-apple-darwin -asm-verbose=false | FileCheck %s -; RUN: llc < %s -mtriple thumbv7s-apple-darwin -asm-verbose=false -print-machineinstrs=if-converter 2>&1 | FileCheck --check-prefix=CHECK-PROB %s +; RUN: llc < %s -mtriple thumbv7s-apple-darwin -asm-verbose=false -print-machineinstrs=if-converter 2>&1 | FileCheck --check-prefix=CHECK-WEIGHT %s declare i32 @foo(i32) declare i8* @bar(i32, i8*, i8*) @@ -29,10 +29,10 @@ declare i8* @bar(i32, i8*, i8*) ; CHECK-NEXT: [[FOOCALL]]: ; CHECK-NEXT: blx _foo ; -; CHECK-PROB: BB#0: -; CHECK-PROB: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}50.00%) BB#2({{[0-9a-fx/= ]+}}25.00%) 
BB#4({{[0-9a-fx/= ]+}}25.00%) -; CHECK-PROB: BB#1: -; CHECK-PROB: Successors according to CFG: BB#2({{[0-9a-fx/= ]+}}75.00%) BB#4({{[0-9a-fx/= ]+}}25.00%) +; CHECK-WEIGHT: BB#0: +; CHECK-WEIGHT: Successors according to CFG: BB#1(1073741824) BB#2(536870912) BB#4(536870912) +; CHECK-WEIGHT: BB#1: +; CHECK-WEIGHT: Successors according to CFG: BB#2(1610612736) BB#4(536870912) define i32 @test(i32 %a, i32 %a2, i32* %p, i32* %p2) { entry: diff --git a/test/CodeGen/ARM/tail-merge-branch-weight.ll b/test/CodeGen/ARM/tail-merge-branch-weight.ll index f83f28815793..95b0a202e7ff 100644 --- a/test/CodeGen/ARM/tail-merge-branch-weight.ll +++ b/test/CodeGen/ARM/tail-merge-branch-weight.ll @@ -9,7 +9,7 @@ ; = 0.2 * 0.4 + 0.8 * 0.7 = 0.64 ; CHECK: # Machine code for function test0: -; CHECK: Successors according to CFG: BB#{{[0-9]+}}({{[0-9a-fx/= ]+}}20.00%) BB#{{[0-9]+}}({{[0-9a-fx/= ]+}}80.00%) +; CHECK: Successors according to CFG: BB#{{[0-9]+}}(13) BB#{{[0-9]+}}(24) ; CHECK: BB#{{[0-9]+}}: ; CHECK: BB#{{[0-9]+}}: ; CHECK: # End machine code for function test0. 
diff --git a/test/CodeGen/ARM/taildup-branch-weight.ll b/test/CodeGen/ARM/taildup-branch-weight.ll index 799ef62416e6..576c120b444e 100644 --- a/test/CodeGen/ARM/taildup-branch-weight.ll +++ b/test/CodeGen/ARM/taildup-branch-weight.ll @@ -3,7 +3,7 @@ ; RUN: | FileCheck %s ; CHECK: Machine code for function test0: -; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}3.12%) BB#2({{[0-9a-fx/= ]+}}96.88%) +; CHECK: Successors according to CFG: BB#1(67108864) BB#2(2080374784) define void @test0(i32 %a, i32 %b, i32* %c, i32* %d) { entry: @@ -30,7 +30,7 @@ B4: !0 = !{!"branch_weights", i32 4, i32 124} ; CHECK: Machine code for function test1: -; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}3.12%) BB#2({{[0-9a-fx/= ]+}}96.88%) +; CHECK: Successors according to CFG: BB#1(67108864) BB#2(2080374784) @g0 = common global i32 0, align 4 diff --git a/test/CodeGen/Generic/MachineBranchProb.ll b/test/CodeGen/Generic/MachineBranchProb.ll index ae3c8da21471..5a4a4672f7eb 100644 --- a/test/CodeGen/Generic/MachineBranchProb.ll +++ b/test/CodeGen/Generic/MachineBranchProb.ll @@ -16,11 +16,11 @@ entry: i64 5, label %sw.bb1 ], !prof !0 ; CHECK: BB#0: derived from LLVM BB %entry -; CHECK: Successors according to CFG: BB#2({{[0-9a-fx/= ]+}}75.29%) BB#4({{[0-9a-fx/= ]+}}24.71%) +; CHECK: Successors according to CFG: BB#2(1616928864) BB#4(530554784) ; CHECK: BB#4: derived from LLVM BB %entry -; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}47.62%) BB#5({{[0-9a-fx/= ]+}}52.38%) +; CHECK: Successors according to CFG: BB#1(252645135) BB#5(277909649) ; CHECK: BB#5: derived from LLVM BB %entry -; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}36.36%) BB#3({{[0-9a-fx/= ]+}}63.64%) +; CHECK: Successors according to CFG: BB#1(101058054) BB#3(176851595) sw.bb: br label %return @@ -62,7 +62,7 @@ return: ret void ; CHECK-LABEL: Machine code for function left_leaning_weight_balanced_tree: ; CHECK: BB#0: derived from LLVM BB %entry ; CHECK-NOT: Successors -; 
CHECK: Successors according to CFG: BB#8({{[0-9a-fx/= ]+}}39.71%) BB#9({{[0-9a-fx/= ]+}}60.29%) +; CHECK: Successors according to CFG: BB#8(852677332) BB#9(1294806318) } !1 = !{!"branch_weights", diff --git a/test/CodeGen/Hexagon/ifcvt-edge-weight.ll b/test/CodeGen/Hexagon/ifcvt-edge-weight.ll index 341567e1d02f..f84fd95e4fbd 100644 --- a/test/CodeGen/Hexagon/ifcvt-edge-weight.ll +++ b/test/CodeGen/Hexagon/ifcvt-edge-weight.ll @@ -2,7 +2,7 @@ ; Check that the edge weights are updated correctly after if-conversion. ; CHECK: BB#3: -; CHECK: Successors according to CFG: BB#2({{[0-9a-fx/= ]+}}10.00%) BB#1({{[0-9a-fx/= ]+}}90.00%) +; CHECK: Successors according to CFG: BB#2(214748365) BB#1(1932735283) @a = external global i32 @d = external global i32 diff --git a/test/CodeGen/MIR/X86/newline-handling.mir b/test/CodeGen/MIR/X86/newline-handling.mir index bce06d540114..b5ed3b7f27e1 100644 --- a/test/CodeGen/MIR/X86/newline-handling.mir +++ b/test/CodeGen/MIR/X86/newline-handling.mir @@ -35,7 +35,7 @@ liveins: # CHECK-LABEL: name: foo # CHECK: body: | # CHECK-NEXT: bb.0.entry: -# CHECK-NEXT: successors: %bb.1.less(0x40000000 / 0x80000000 = 50.00%), %bb.2.exit(0x40000000 / 0x80000000 = 50.00%) +# CHECK-NEXT: successors: %bb.1.less(0), %bb.2.exit(0) # CHECK-NEXT: liveins: %edi # CHECK: CMP32ri8 %edi, 10, implicit-def %eflags # CHECK-NEXT: JG_1 %bb.2.exit, implicit killed %eflags @@ -79,7 +79,7 @@ liveins: # CHECK-LABEL: name: bar # CHECK: body: | # CHECK-NEXT: bb.0.entry: -# CHECK-NEXT: successors: %bb.1.less(0x40000000 / 0x80000000 = 50.00%), %bb.2.exit(0x40000000 / 0x80000000 = 50.00%) +# CHECK-NEXT: successors: %bb.1.less(0), %bb.2.exit(0) # CHECK-NEXT: liveins: %edi # CHECK: CMP32ri8 %edi, 10, implicit-def %eflags # CHECK-NEXT: JG_1 %bb.2.exit, implicit killed %eflags diff --git a/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir b/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir index 64af6121189a..fc5e5d640f7f 100644 --- 
a/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir +++ b/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir @@ -1,6 +1,6 @@ # RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s # This test ensures that the MIR parser parses basic block successors and -# probabilities correctly. +# weights correctly. --- | @@ -21,10 +21,10 @@ name: foo body: | ; CHECK-LABEL: bb.0.entry: - ; CHECK: successors: %bb.1.less({{[0-9a-fx/= ]+}}33.00%), %bb.2.exit({{[0-9a-fx/= ]+}}67.00%) + ; CHECK: successors: %bb.1.less(16), %bb.2.exit(32) ; CHECK-LABEL: bb.1.less: bb.0.entry: - successors: %bb.1.less (33), %bb.2.exit(67) + successors: %bb.1.less (16), %bb.2.exit(32) liveins: %edi CMP32ri8 %edi, 10, implicit-def %eflags diff --git a/test/CodeGen/MIR/X86/successor-basic-blocks.mir b/test/CodeGen/MIR/X86/successor-basic-blocks.mir index a6c14f70bc7c..aa80fe9fbeef 100644 --- a/test/CodeGen/MIR/X86/successor-basic-blocks.mir +++ b/test/CodeGen/MIR/X86/successor-basic-blocks.mir @@ -32,7 +32,7 @@ name: foo body: | ; CHECK-LABEL: bb.0.entry: - ; CHECK: successors: %bb.1.less(0x40000000 / 0x80000000 = 50.00%), %bb.2.exit(0x40000000 / 0x80000000 = 50.00%) + ; CHECK: successors: %bb.1.less(0), %bb.2.exit(0) ; CHECK-LABEL: bb.1.less: bb.0.entry: successors: %bb.1.less, %bb.2.exit @@ -58,7 +58,7 @@ body: | ; Verify that we can have multiple lists of successors that will be merged ; into one. 
; CHECK-LABEL: bb.0.entry: - ; CHECK: successors: %bb.1(0x80000000 / 0x80000000 = 100.00%), %bb.2(0x00000000 / 0x80000000 = 0.00%) + ; CHECK: successors: %bb.1(0), %bb.2(0) bb.0.entry: liveins: %edi successors: %bb.1 diff --git a/test/CodeGen/X86/MachineBranchProb.ll b/test/CodeGen/X86/MachineBranchProb.ll index ee1c658d4c55..da0bf517ecfa 100644 --- a/test/CodeGen/X86/MachineBranchProb.ll +++ b/test/CodeGen/X86/MachineBranchProb.ll @@ -18,9 +18,9 @@ for.cond2: ; preds = %for.inc, %for.cond %or.cond = or i1 %tobool, %cmp4 br i1 %or.cond, label %for.inc20, label %for.inc, !prof !0 ; CHECK: BB#1: derived from LLVM BB %for.cond2 -; CHECK: Successors according to CFG: BB#3({{[0-9a-fx/= ]+}}1.53%) BB#4({{[0-9a-fx/= ]+}}98.47%) +; CHECK: Successors according to CFG: BB#3(32756933) BB#4(2114726715) ; CHECK: BB#4: derived from LLVM BB %for.cond2 -; CHECK: Successors according to CFG: BB#3({{[0-9a-fx/= ]+}}1.55%) BB#2({{[0-9a-fx/= ]+}}98.45%) +; CHECK: Successors according to CFG: BB#3(33264335) BB#2(2114219313) for.inc: ; preds = %for.cond2 %shl = shl i32 %bit.0, 1 diff --git a/test/CodeGen/X86/catchpad-weight.ll b/test/CodeGen/X86/catchpad-weight.ll index e8b416845ec1..9b06f2abc81c 100644 --- a/test/CodeGen/X86/catchpad-weight.ll +++ b/test/CodeGen/X86/catchpad-weight.ll @@ -2,7 +2,7 @@ ; Check if the edge weight to the catchpad is calculated correctly. 
-; CHECK: Successors according to CFG: BB#3(0x7ffff100 / 0x80000000 = 100.00%) BB#1(0x00000800 / 0x80000000 = 0.00%) BB#4(0x00000400 / 0x80000000 = 0.00%) BB#6(0x00000200 / 0x80000000 = 0.00%) BB#8(0x00000100 / 0x80000000 = 0.00%) +; CHECK: Successors according to CFG: BB#3(2147481600) BB#1(2048) BB#4(1024) BB#6(512) BB#8(256) target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64--windows-msvc18.0.0" diff --git a/test/CodeGen/X86/stack-protector-weight.ll b/test/CodeGen/X86/stack-protector-weight.ll index dea66d28e3dd..16877ef70a31 100644 --- a/test/CodeGen/X86/stack-protector-weight.ll +++ b/test/CodeGen/X86/stack-protector-weight.ll @@ -2,13 +2,13 @@ ; RUN: llc -mtriple=x86_64-apple-darwin -print-machineinstrs=expand-isel-pseudos -enable-selectiondag-sp=false %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=IR ; SELDAG: # Machine code for function test_branch_weights: -; SELDAG: Successors according to CFG: BB#[[SUCCESS:[0-9]+]]({{[0-9a-fx/= ]+}}100.00%) BB#[[FAILURE:[0-9]+]] +; SELDAG: Successors according to CFG: BB#[[SUCCESS:[0-9]+]](2147481600) BB#[[FAILURE:[0-9]+]](2048) ; SELDAG: BB#[[FAILURE]]: ; SELDAG: CALL64pcrel32 ; SELDAG: BB#[[SUCCESS]]: ; IR: # Machine code for function test_branch_weights: -; IR: Successors according to CFG: BB#[[SUCCESS:[0-9]+]]({{[0-9a-fx/= ]+}}100.00%) BB#[[FAILURE:[0-9]+]] +; IR: Successors according to CFG: BB#[[SUCCESS:[0-9]+]](2147481600) BB#[[FAILURE:[0-9]+]](2048) ; IR: BB#[[SUCCESS]]: ; IR: BB#[[FAILURE]]: ; IR: CALL64pcrel32 diff --git a/test/CodeGen/X86/switch-edge-weight.ll b/test/CodeGen/X86/switch-edge-weight.ll index 6f594868c7ad..9026f6f05f0e 100644 --- a/test/CodeGen/X86/switch-edge-weight.ll +++ b/test/CodeGen/X86/switch-edge-weight.ll @@ -34,22 +34,22 @@ sw.epilog: ; CHECK: BB#0: ; BB#0 to BB#4: [0, 1133] (65 = 60 + 5) ; BB#0 to BB#5: [1134, UINT32_MAX] (25 = 20 + 5) -; CHECK: Successors according to CFG: BB#4({{[0-9a-fx/= ]+}}72.22%) BB#5({{[0-9a-fx/= ]+}}27.78%) +; CHECK: 
Successors according to CFG: BB#4(1550960411) BB#5(596523235) ; ; CHECK: BB#4: ; BB#4 to BB#1: [155, 159] (50) ; BB#4 to BB#5: [0, 1133] - [155, 159] (15 = 10 + 5) -; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}76.92%) BB#7({{[0-9a-fx/= ]+}}23.08%) +; CHECK: Successors according to CFG: BB#1(1193046470) BB#7(357913941) ; ; CHECK: BB#5: ; BB#5 to BB#1: {1140} (10) ; BB#5 to BB#6: [1134, UINT32_MAX] - {1140} (15 = 10 + 5) -; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}40.00%) BB#6({{[0-9a-fx/= ]+}}60.00%) +; CHECK: Successors according to CFG: BB#1(238609294) BB#6(357913941) ; ; CHECK: BB#6: ; BB#6 to BB#1: {1134} (10) ; BB#6 to BB#2: [1134, UINT32_MAX] - {1134, 1140} (5) -; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}66.67%) BB#2({{[0-9a-fx/= ]+}}33.33%) +; CHECK: Successors according to CFG: BB#1(238609294) BB#2(119304647) } ; CHECK-LABEL: test2 @@ -102,7 +102,7 @@ sw.epilog: ; CHECK: BB#0: ; BB#0 to BB#6: {0} + [15, UINT32_MAX] (5) ; BB#0 to BB#8: [1, 14] (jump table) (65 = 60 + 5) -; CHECK: Successors according to CFG: BB#6({{[0-9a-fx/= ]+}}7.14%) BB#8({{[0-9a-fx/= ]+}}92.86% +; CHECK: Successors according to CFG: BB#6(153391689) BB#8(1994091957) ; ; CHECK: BB#8: ; BB#8 to BB#1: {1} (10) @@ -111,7 +111,7 @@ sw.epilog: ; BB#8 to BB#3: {11} (10) ; BB#8 to BB#4: {12} (10) ; BB#8 to BB#5: {13, 14} (20) -; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}14.29%) BB#6({{[0-9a-fx/= ]+}}7.14%) BB#2({{[0-9a-fx/= ]+}}14.29%) BB#3({{[0-9a-fx/= ]+}}14.29%) BB#4({{[0-9a-fx/= ]+}}14.29%) BB#5({{[0-9a-fx/= ]+}}28.57%) +; CHECK: Successors according to CFG: BB#1(306783378) BB#6(153391689) BB#2(306783378) BB#3(306783378) BB#4(306783378) BB#5(613566756) } ; CHECK-LABEL: test3 @@ -163,7 +163,7 @@ sw.epilog: ; CHECK: BB#0: ; BB#0 to BB#6: [0, 9] + [15, UINT32_MAX] {10} ; BB#0 to BB#8: [10, 14] (jump table) (50) -; CHECK: Successors according to CFG: BB#6({{[0-9a-fx/= ]+}}16.67%) BB#8({{[0-9a-fx/= ]+}}83.33%) +; CHECK: Successors 
according to CFG: BB#6(357913941) BB#8(1789569705) ; ; CHECK: BB#8: ; BB#8 to BB#1: {10} (10) @@ -171,7 +171,7 @@ sw.epilog: ; BB#8 to BB#3: {12} (10) ; BB#8 to BB#4: {13} (10) ; BB#8 to BB#5: {14} (10) -; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}20.00%) BB#2({{[0-9a-fx/= ]+}}20.00%) BB#3({{[0-9a-fx/= ]+}}20.00%) BB#4({{[0-9a-fx/= ]+}}20.00%) BB#5({{[0-9a-fx/= ]+}}20.00%) +; CHECK: Successors according to CFG: BB#1(357913941) BB#2(357913941) BB#3(357913941) BB#4(357913941) BB#5(357913941) } ; CHECK-LABEL: test4 @@ -216,12 +216,12 @@ sw.epilog: ; CHECK: BB#0: ; BB#0 to BB#6: [0, 110] + [116, UINT32_MAX] (20) ; BB#0 to BB#7: [111, 115] (bit test) (50) -; CHECK: Successors according to CFG: BB#6({{[0-9a-fx/= ]+}}28.57%) BB#7({{[0-9a-fx/= ]+}}71.43%) +; CHECK: Successors according to CFG: BB#6(613566756) BB#7(1533916890) ; ; CHECK: BB#7: ; BB#7 to BB#2: {111, 114, 115} (30) ; BB#7 to BB#3: {112, 113} (20) -; CHECK: Successors according to CFG: BB#2({{[0-9a-fx/= ]+}}60.00%) BB#3({{[0-9a-fx/= ]+}}40.00%) +; CHECK: Successors according to CFG: BB#2(920350134) BB#3(613566756) } ; CHECK-LABEL: test5 @@ -273,7 +273,7 @@ sw.epilog: ; CHECK: BB#0: ; BB#0 to BB#6: [10, UINT32_MAX] (15) ; BB#0 to BB#8: [1, 5, 7, 9] (jump table) (45) -; CHECK: Successors according to CFG: BB#8({{[0-9a-fx/= ]+}}25.00%) BB#9({{[0-9a-fx/= ]+}}75.00%) +; CHECK: Successors according to CFG: BB#8(536870912) BB#9(1610612734) } !1 = !{!"branch_weights", i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10} diff --git a/test/CodeGen/X86/switch-jump-table.ll b/test/CodeGen/X86/switch-jump-table.ll index 896a067da230..3cfee1cd80e6 100644 --- a/test/CodeGen/X86/switch-jump-table.ll +++ b/test/CodeGen/X86/switch-jump-table.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=i686-pc-gnu-linux < %s | FileCheck %s -check-prefix=CHECK -; RUN: llc -mtriple=i686-pc-gnu-linux -print-machineinstrs=expand-isel-pseudos %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=CHECK-JT-PROB +; RUN: llc 
-mtriple=i686-pc-gnu-linux -print-machineinstrs=expand-isel-pseudos %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=CHECK-JT-WEIGHT ; An unreachable default destination is replaced with the most popular case label. @@ -54,9 +54,9 @@ default: ; Check if branch probabilities are correctly assigned to the jump table. define void @bar(i32 %x, i32* %to) { -; CHECK-JT-PROB-LABEL: bar: -; CHECK-JT-PROB: Successors according to CFG: BB#6({{[0-9a-fx/= ]+}}14.29%) BB#8({{[0-9a-fx/= ]+}}85.71%) -; CHECK-JT-PROB: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}16.67%) BB#2({{[0-9a-fx/= ]+}}16.67%) BB#3({{[0-9a-fx/= ]+}}16.67%) BB#4({{[0-9a-fx/= ]+}}16.67%) BB#5({{[0-9a-fx/= ]+}}33.33%) +; CHECK-JT-WEIGHT-LABEL: bar: +; CHECK-JT-WEIGHT: Successors according to CFG: BB#6(306783378) BB#8(1840700268) +; CHECK-JT-WEIGHT: Successors according to CFG: BB#1(306783378) BB#2(306783378) BB#3(306783378) BB#4(306783378) BB#5(613566756) entry: switch i32 %x, label %default [ From 85e0db955c7d5f7b1baf3d337ff0fb0cd8fb0404 Mon Sep 17 00:00:00 2001 From: Matthias Braun Date: Tue, 1 Dec 2015 04:19:56 +0000 Subject: [PATCH 046/186] RegisterPressure: Split RegisterOperands analysis code from result object; NFC This is in preparation to expose the RegisterOperands class as RegisterPressure API. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254368 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/RegisterPressure.cpp | 103 ++++++++++++++++++------------- 1 file changed, 60 insertions(+), 43 deletions(-) diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp index 6ff16e551be2..3c9da13c4a39 100644 --- a/lib/CodeGen/RegisterPressure.cpp +++ b/lib/CodeGen/RegisterPressure.cpp @@ -314,71 +314,88 @@ static bool containsReg(ArrayRef RegUnits, unsigned RegUnit) { } namespace { -/// Collect this instruction's unique uses and defs into SmallVectors for -/// processing defs and uses in order. 
-/// -/// FIXME: always ignore tied opers -class RegisterOperands { - const TargetRegisterInfo *TRI; - const MachineRegisterInfo *MRI; - bool IgnoreDead; +/// List of register defined and used by a machine instruction. +class RegisterOperands { public: SmallVector Uses; SmallVector Defs; SmallVector DeadDefs; - RegisterOperands(const TargetRegisterInfo *tri, - const MachineRegisterInfo *mri, bool ID = false): - TRI(tri), MRI(mri), IgnoreDead(ID) {} + void collect(const MachineInstr &MI, const TargetRegisterInfo &TRI, + const MachineRegisterInfo &MRI, bool IgnoreDead = false); +}; - /// Push this operand's register onto the correct vector. - void collect(const MachineOperand &MO) { +/// Collect this instruction's unique uses and defs into SmallVectors for +/// processing defs and uses in order. +/// +/// FIXME: always ignore tied opers +class RegisterOperandsCollector { + RegisterOperands &RegOpers; + const TargetRegisterInfo &TRI; + const MachineRegisterInfo &MRI; + bool IgnoreDead; + + RegisterOperandsCollector(RegisterOperands &RegOpers, + const TargetRegisterInfo &TRI, + const MachineRegisterInfo &MRI, + bool IgnoreDead) + : RegOpers(RegOpers), TRI(TRI), MRI(MRI), IgnoreDead(IgnoreDead) {} + + void collectInstr(const MachineInstr &MI) const { + for (ConstMIBundleOperands OperI(&MI); OperI.isValid(); ++OperI) + collectOperand(*OperI); + + // Remove redundant physreg dead defs. + SmallVectorImpl::iterator I = + std::remove_if(RegOpers.DeadDefs.begin(), RegOpers.DeadDefs.end(), + std::bind1st(std::ptr_fun(containsReg), RegOpers.Defs)); + RegOpers.DeadDefs.erase(I, RegOpers.DeadDefs.end()); + } + + /// Push this operand's register onto the correct vectors. 
+ void collectOperand(const MachineOperand &MO) const { if (!MO.isReg() || !MO.getReg()) return; + unsigned Reg = MO.getReg(); if (MO.readsReg()) - pushRegUnits(MO.getReg(), Uses); + pushRegUnits(Reg, RegOpers.Uses); if (MO.isDef()) { if (MO.isDead()) { if (!IgnoreDead) - pushRegUnits(MO.getReg(), DeadDefs); - } - else - pushRegUnits(MO.getReg(), Defs); + pushRegUnits(Reg, RegOpers.DeadDefs); + } else + pushRegUnits(Reg, RegOpers.Defs); } } -protected: - void pushRegUnits(unsigned Reg, SmallVectorImpl &RegUnits) { + void pushRegUnits(unsigned Reg, SmallVectorImpl &RegUnits) const { if (TargetRegisterInfo::isVirtualRegister(Reg)) { if (containsReg(RegUnits, Reg)) return; RegUnits.push_back(Reg); - } - else if (MRI->isAllocatable(Reg)) { - for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) { + } else if (MRI.isAllocatable(Reg)) { + for (MCRegUnitIterator Units(Reg, &TRI); Units.isValid(); ++Units) { if (containsReg(RegUnits, *Units)) continue; RegUnits.push_back(*Units); } } } + + friend class RegisterOperands; }; -} // namespace -/// Collect physical and virtual register operands. -static void collectOperands(const MachineInstr *MI, - RegisterOperands &RegOpers) { - for (ConstMIBundleOperands OperI(MI); OperI.isValid(); ++OperI) - RegOpers.collect(*OperI); - - // Remove redundant physreg dead defs. - SmallVectorImpl::iterator I = - std::remove_if(RegOpers.DeadDefs.begin(), RegOpers.DeadDefs.end(), - std::bind1st(std::ptr_fun(containsReg), RegOpers.Defs)); - RegOpers.DeadDefs.erase(I, RegOpers.DeadDefs.end()); +void RegisterOperands::collect(const MachineInstr &MI, + const TargetRegisterInfo &TRI, + const MachineRegisterInfo &MRI, + bool IgnoreDead) { + RegisterOperandsCollector Collector(*this, TRI, MRI, IgnoreDead); + Collector.collectInstr(MI); } +} // namespace + /// Initialize an array of N PressureDiffs. 
void PressureDiffs::init(unsigned N) { Size = N; @@ -505,8 +522,8 @@ bool RegPressureTracker::recede(SmallVectorImpl *LiveUses, if (RequireIntervals && isTopClosed()) static_cast(P).openTop(SlotIdx); - RegisterOperands RegOpers(TRI, MRI); - collectOperands(CurrPos, RegOpers); + RegisterOperands RegOpers; + RegOpers.collect(*CurrPos, *TRI, *MRI); if (PDiff) collectPDiff(*PDiff, RegOpers, MRI); @@ -594,8 +611,8 @@ bool RegPressureTracker::advance() { static_cast(P).openBottom(CurrPos); } - RegisterOperands RegOpers(TRI, MRI); - collectOperands(CurrPos, RegOpers); + RegisterOperands RegOpers; + RegOpers.collect(*CurrPos, *TRI, *MRI); for (unsigned i = 0, e = RegOpers.Uses.size(); i < e; ++i) { unsigned Reg = RegOpers.Uses[i]; @@ -728,8 +745,8 @@ void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) { assert(!MI->isDebugValue() && "Expect a nondebug instruction."); // Account for register pressure similar to RegPressureTracker::recede(). - RegisterOperands RegOpers(TRI, MRI, /*IgnoreDead=*/true); - collectOperands(MI, RegOpers); + RegisterOperands RegOpers; + RegOpers.collect(*MI, *TRI, *MRI, /*IgnoreDead=*/true); // Boost max pressure for all dead defs together. // Since CurrSetPressure and MaxSetPressure @@ -923,8 +940,8 @@ void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) { assert(!MI->isDebugValue() && "Expect a nondebug instruction."); // Account for register pressure similar to RegPressureTracker::recede(). - RegisterOperands RegOpers(TRI, MRI); - collectOperands(MI, RegOpers); + RegisterOperands RegOpers; + RegOpers.collect(*MI, *TRI, *MRI); // Kill liveness at last uses. Assume allocatable physregs are single-use // rather than checking LiveIntervals. 
From acac7f59d5e9e0da8ae63d26ca2ae946902d784c Mon Sep 17 00:00:00 2001 From: Matthias Braun Date: Tue, 1 Dec 2015 04:19:58 +0000 Subject: [PATCH 047/186] RegisterPressure: There is no need to make discoverLive{In|Out} public git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254369 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/CodeGen/RegisterPressure.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/llvm/CodeGen/RegisterPressure.h b/include/llvm/CodeGen/RegisterPressure.h index 166bd8686891..b6e0a20526fc 100644 --- a/include/llvm/CodeGen/RegisterPressure.h +++ b/include/llvm/CodeGen/RegisterPressure.h @@ -365,9 +365,6 @@ class RegPressureTracker { return CurrSetPressure; } - void discoverLiveOut(unsigned Reg); - void discoverLiveIn(unsigned Reg); - bool isTopClosed() const; bool isBottomClosed() const; @@ -442,6 +439,9 @@ class RegPressureTracker { void dump() const; protected: + void discoverLiveOut(unsigned Reg); + void discoverLiveIn(unsigned Reg); + const LiveRange *getLiveRange(unsigned Reg) const; void increaseRegPressure(ArrayRef Regs); From 4bee6aee5acbc40b98a19e09739470ae1fdd46c9 Mon Sep 17 00:00:00 2001 From: Matthias Braun Date: Tue, 1 Dec 2015 04:20:01 +0000 Subject: [PATCH 048/186] RegisterPressure: There is no need to make getCurSlot() public git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254370 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/CodeGen/RegisterPressure.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/llvm/CodeGen/RegisterPressure.h b/include/llvm/CodeGen/RegisterPressure.h index b6e0a20526fc..42131c831ea1 100644 --- a/include/llvm/CodeGen/RegisterPressure.h +++ b/include/llvm/CodeGen/RegisterPressure.h @@ -328,10 +328,6 @@ class RegPressureTracker { // position changes while pressure does not. 
void setPos(MachineBasicBlock::const_iterator Pos) { CurrPos = Pos; } - /// \brief Get the SlotIndex for the first nondebug instruction including or - /// after the current position. - SlotIndex getCurrSlot() const; - /// Recede across the previous instruction. bool recede(SmallVectorImpl *LiveUses = nullptr, PressureDiff *PDiff = nullptr); @@ -442,6 +438,10 @@ class RegPressureTracker { void discoverLiveOut(unsigned Reg); void discoverLiveIn(unsigned Reg); + /// \brief Get the SlotIndex for the first nondebug instruction including or + /// after the current position. + SlotIndex getCurrSlot() const; + const LiveRange *getLiveRange(unsigned Reg) const; void increaseRegPressure(ArrayRef Regs); From bce24cfb9ee40462a5fdc09503dfd3305e45326c Mon Sep 17 00:00:00 2001 From: Matthias Braun Date: Tue, 1 Dec 2015 04:20:04 +0000 Subject: [PATCH 049/186] RegisterPressure: Remove support for recede()/advance() at MBB boundaries Nobody was checking the return value of recede()/advance() so we can simply replace this code with asserts. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254371 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/CodeGen/RegisterPressure.h | 7 +++---- lib/CodeGen/RegisterPressure.cpp | 23 +++++------------------ 2 files changed, 8 insertions(+), 22 deletions(-) diff --git a/include/llvm/CodeGen/RegisterPressure.h b/include/llvm/CodeGen/RegisterPressure.h index 42131c831ea1..e296701d8e8c 100644 --- a/include/llvm/CodeGen/RegisterPressure.h +++ b/include/llvm/CodeGen/RegisterPressure.h @@ -329,11 +329,11 @@ class RegPressureTracker { void setPos(MachineBasicBlock::const_iterator Pos) { CurrPos = Pos; } /// Recede across the previous instruction. - bool recede(SmallVectorImpl *LiveUses = nullptr, + void recede(SmallVectorImpl *LiveUses = nullptr, PressureDiff *PDiff = nullptr); /// Advance across the current instruction. - bool advance(); + void advance(); /// Finalize the region boundaries and recored live ins and live outs. 
void closeRegion(); @@ -350,8 +350,7 @@ class RegPressureTracker { ArrayRef getLiveThru() const { return LiveThruPressure; } /// Get the resulting register pressure over the traversed region. - /// This result is complete if either advance() or recede() has returned true, - /// or if closeRegion() was explicitly invoked. + /// This result is complete if closeRegion() was explicitly invoked. RegisterPressure &getPressure() { return P; } const RegisterPressure &getPressure() const { return P; } diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp index 3c9da13c4a39..18002c8fbd56 100644 --- a/lib/CodeGen/RegisterPressure.cpp +++ b/lib/CodeGen/RegisterPressure.cpp @@ -491,13 +491,9 @@ void RegPressureTracker::discoverLiveOut(unsigned Reg) { /// registers that are both defined and used by the instruction. If a pressure /// difference pointer is provided record the changes is pressure caused by this /// instruction independent of liveness. -bool RegPressureTracker::recede(SmallVectorImpl *LiveUses, +void RegPressureTracker::recede(SmallVectorImpl *LiveUses, PressureDiff *PDiff) { - // Check for the top of the analyzable region. - if (CurrPos == MBB->begin()) { - closeRegion(); - return false; - } + assert(CurrPos != MBB->begin()); if (!isBottomClosed()) closeBottom(); @@ -509,11 +505,8 @@ bool RegPressureTracker::recede(SmallVectorImpl *LiveUses, do --CurrPos; while (CurrPos != MBB->begin() && CurrPos->isDebugValue()); + assert(!CurrPos->isDebugValue()); - if (CurrPos->isDebugValue()) { - closeRegion(); - return false; - } SlotIndex SlotIdx; if (RequireIntervals) SlotIdx = LIS->getInstructionIndex(CurrPos).getRegSlot(); @@ -584,18 +577,13 @@ bool RegPressureTracker::recede(SmallVectorImpl *LiveUses, UntiedDefs.insert(Reg); } } - return true; } /// Advance across the current instruction. 
-bool RegPressureTracker::advance() { +void RegPressureTracker::advance() { assert(!TrackUntiedDefs && "unsupported mode"); - // Check for the bottom of the analyzable region. - if (CurrPos == MBB->end()) { - closeRegion(); - return false; - } + assert(CurrPos != MBB->end()); if (!isTopClosed()) closeTop(); @@ -653,7 +641,6 @@ bool RegPressureTracker::advance() { do ++CurrPos; while (CurrPos != MBB->end() && CurrPos->isDebugValue()); - return true; } /// Find the max change in excess pressure across all sets. From 104db6b94a127e4e68db7528f50d3fcf24d543bd Mon Sep 17 00:00:00 2001 From: Matthias Braun Date: Tue, 1 Dec 2015 04:20:06 +0000 Subject: [PATCH 050/186] RegisterPressure: If we do not collect dead defs the list must be empty git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254372 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/RegisterPressure.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp index 18002c8fbd56..8eba0edc8bfe 100644 --- a/lib/CodeGen/RegisterPressure.cpp +++ b/lib/CodeGen/RegisterPressure.cpp @@ -734,11 +734,7 @@ void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) { // Account for register pressure similar to RegPressureTracker::recede(). RegisterOperands RegOpers; RegOpers.collect(*MI, *TRI, *MRI, /*IgnoreDead=*/true); - - // Boost max pressure for all dead defs together. - // Since CurrSetPressure and MaxSetPressure - increaseRegPressure(RegOpers.DeadDefs); - decreaseRegPressure(RegOpers.DeadDefs); + assert(RegOpers.DeadDefs.size() == 0); // Kill liveness at live defs. 
for (unsigned i = 0, e = RegOpers.Defs.size(); i < e; ++i) { From 442a04a282ce8f9998be131be50f5a5c73d7791f Mon Sep 17 00:00:00 2001 From: Colin LeMahieu Date: Tue, 1 Dec 2015 04:56:25 +0000 Subject: [PATCH 051/186] [Hexagon] Disabling failing safestack test git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254375 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/DebugInfo/Generic/safestack-byval.ll | 1 + 1 file changed, 1 insertion(+) diff --git a/test/DebugInfo/Generic/safestack-byval.ll b/test/DebugInfo/Generic/safestack-byval.ll index 24df3815068e..1329a95a2201 100644 --- a/test/DebugInfo/Generic/safestack-byval.ll +++ b/test/DebugInfo/Generic/safestack-byval.ll @@ -2,6 +2,7 @@ ; points to neither an argument nor an alloca. This kind of IR is generated by ; SafeStack for unsafe byval arguments. ; RUN: llc -stop-after expand-isel-pseudos %s -o /dev/null | FileCheck %s +; XFAIL: hexagon ; This was built by compiling the following source with SafeStack and ; simplifying the result a little. From 5155021519d454bbd00404a5a0ac66377dffe7be Mon Sep 17 00:00:00 2001 From: Cong Hou Date: Tue, 1 Dec 2015 05:29:22 +0000 Subject: [PATCH 052/186] Replace all weight-based interfaces in MBB with probability-based interfaces, and update all uses of old interfaces. (This is the second attempt to submit this patch. The first caused two assertion failures and was reverted. See https://llvm.org/bugs/show_bug.cgi?id=25687) The patch in http://reviews.llvm.org/D13745 is broken into four parts: 1. New interfaces without functional changes (http://reviews.llvm.org/D13908). 2. Use new interfaces in SelectionDAG, while in other passes treat probabilities as weights (http://reviews.llvm.org/D14361). 3. Use new interfaces in all other passes. 4. Remove old interfaces. This patch is 3+4 above. In this patch, MBB won't provide weight-based interfaces any more, which are totally replaced by probability-based ones. 
The interface addSuccessor() is redesigned so that the default probability is unknown. We allow unknown probabilities but don't allow using it together with known probabilities in successor list. That is to say, we either have a list of successors with all known probabilities, or all unknown probabilities. In the latter case, we assume each successor has 1/N probability where N is the number of successors. An assertion checks if the user is attempting to add a successor with the disallowed mixed use as stated above. This can help us catch many misuses. All uses of weight-based interfaces are now updated to use probability-based ones. Differential revision: http://reviews.llvm.org/D14973 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254377 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/BranchProbabilityInfo.h | 3 + include/llvm/CodeGen/MachineBasicBlock.h | 46 +----- .../CodeGen/MachineBranchProbabilityInfo.h | 22 +-- include/llvm/Support/BranchProbability.h | 31 +++- lib/Analysis/BranchProbabilityInfo.cpp | 6 + lib/CodeGen/BranchFolding.cpp | 16 +- lib/CodeGen/IfConversion.cpp | 155 ++++++++---------- lib/CodeGen/MIRParser/MIParser.cpp | 3 +- lib/CodeGen/MIRPrinter.cpp | 4 +- lib/CodeGen/MachineBasicBlock.cpp | 142 ++++------------ lib/CodeGen/MachineBlockPlacement.cpp | 71 ++++---- lib/CodeGen/MachineBranchProbabilityInfo.cpp | 82 +++------ lib/CodeGen/TailDuplication.cpp | 6 +- lib/Support/BranchProbability.cpp | 20 ++- lib/Target/AMDGPU/AMDILCFGStructurizer.cpp | 6 +- lib/Target/ARM/ARMConstantIslandPass.cpp | 3 +- lib/Target/ARM/ARMISelLowering.cpp | 2 +- lib/Target/Hexagon/HexagonCFGOptimizer.cpp | 6 +- lib/Target/Mips/MipsLongBranch.cpp | 3 +- test/CodeGen/ARM/ifcvt-branch-weight-bug.ll | 14 +- test/CodeGen/ARM/ifcvt-branch-weight.ll | 2 +- test/CodeGen/ARM/ifcvt-iter-indbr.ll | 10 +- test/CodeGen/ARM/tail-merge-branch-weight.ll | 2 +- test/CodeGen/ARM/taildup-branch-weight.ll | 4 +- test/CodeGen/Generic/MachineBranchProb.ll | 8 +- 
test/CodeGen/Hexagon/ifcvt-edge-weight.ll | 2 +- test/CodeGen/MIR/X86/newline-handling.mir | 4 +- .../X86/successor-basic-blocks-weights.mir | 6 +- .../MIR/X86/successor-basic-blocks.mir | 4 +- test/CodeGen/X86/MachineBranchProb.ll | 4 +- test/CodeGen/X86/catchpad-weight.ll | 2 +- test/CodeGen/X86/stack-protector-weight.ll | 4 +- test/CodeGen/X86/switch-edge-weight.ll | 22 +-- test/CodeGen/X86/switch-jump-table.ll | 8 +- 34 files changed, 301 insertions(+), 422 deletions(-) diff --git a/include/llvm/Analysis/BranchProbabilityInfo.h b/include/llvm/Analysis/BranchProbabilityInfo.h index 89dec14b2b3e..69dae5e90785 100644 --- a/include/llvm/Analysis/BranchProbabilityInfo.h +++ b/include/llvm/Analysis/BranchProbabilityInfo.h @@ -61,6 +61,9 @@ class BranchProbabilityInfo { BranchProbability getEdgeProbability(const BasicBlock *Src, const BasicBlock *Dst) const; + BranchProbability getEdgeProbability(const BasicBlock *Src, + succ_const_iterator Dst) const; + /// \brief Test if an edge is hot relative to other out-edges of the Src. /// /// Check whether this edge out of the source block is 'hot'. We define hot diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h index a2b1a850ec76..ac87f4f901f5 100644 --- a/include/llvm/CodeGen/MachineBasicBlock.h +++ b/include/llvm/CodeGen/MachineBasicBlock.h @@ -91,13 +91,6 @@ class MachineBasicBlock std::vector Predecessors; std::vector Successors; - /// Keep track of the weights to the successors. This vector has the same - /// order as Successors, or it is empty if we don't use it (disable - /// optimization). - std::vector Weights; - typedef std::vector::iterator weight_iterator; - typedef std::vector::const_iterator const_weight_iterator; - /// Keep track of the probabilities to the successors. This vector has the /// same order as Successors, or it is empty if we don't use it (disable /// optimization). 
@@ -440,26 +433,16 @@ class MachineBasicBlock // Machine-CFG mutators - /// Add Succ as a successor of this MachineBasicBlock. The Predecessors list - /// of Succ is automatically updated. WEIGHT parameter is stored in Weights - /// list and it may be used by MachineBranchProbabilityInfo analysis to - /// calculate branch probability. - /// - /// Note that duplicate Machine CFG edges are not allowed. - void addSuccessor(MachineBasicBlock *Succ, uint32_t Weight = 0); - - /// Add Succ as a successor of this MachineBasicBlock. The Predecessors list - /// of Succ is automatically updated. The weight is not provided because BPI - /// is not available (e.g. -O0 is used), in which case edge weights won't be - /// used. Using this interface can save some space. - void addSuccessorWithoutWeight(MachineBasicBlock *Succ); - /// Add Succ as a successor of this MachineBasicBlock. The Predecessors list /// of Succ is automatically updated. PROB parameter is stored in - /// Probabilities list. + /// Probabilities list. The default probability is set as unknown. Mixing + /// known and unknown probabilities in successor list is not allowed. When all + /// successors have unknown probabilities, 1 / N is returned as the + /// probability for each successor, where N is the number of successors. /// /// Note that duplicate Machine CFG edges are not allowed. - void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob); + void addSuccessor(MachineBasicBlock *Succ, + BranchProbability Prob = BranchProbability::getUnknown()); /// Add Succ as a successor of this MachineBasicBlock. The Predecessors list /// of Succ is automatically updated. The probability is not provided because @@ -467,9 +450,6 @@ class MachineBasicBlock /// won't be used. Using this interface can save some space. void addSuccessorWithoutProb(MachineBasicBlock *Succ); - /// Set successor weight of a given iterator. 
- void setSuccWeight(succ_iterator I, uint32_t Weight); - /// Set successor probability of a given iterator. void setSuccProbability(succ_iterator I, BranchProbability Prob); @@ -488,7 +468,7 @@ class MachineBasicBlock /// Return the iterator to the element after the one removed. succ_iterator removeSuccessor(succ_iterator I); - /// Replace successor OLD with NEW and update weight info. + /// Replace successor OLD with NEW and update probability info. void replaceSuccessor(MachineBasicBlock *Old, MachineBasicBlock *New); /// Transfers all the successors from MBB to this machine basic block (i.e., @@ -500,9 +480,6 @@ class MachineBasicBlock /// operands in the successor blocks which refer to FromMBB to refer to this. void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB); - /// Return true if any of the successors have weights attached to them. - bool hasSuccessorWeights() const { return !Weights.empty(); } - /// Return true if any of the successors have probabilities attached to them. bool hasSuccessorProbabilities() const { return !Probs.empty(); } @@ -759,10 +736,6 @@ class MachineBasicBlock private: - /// Return weight iterator corresponding to the I successor iterator. - weight_iterator getWeightIterator(succ_iterator I); - const_weight_iterator getWeightIterator(const_succ_iterator I) const; - /// Return probability iterator corresponding to the I successor iterator. probability_iterator getProbabilityIterator(succ_iterator I); const_probability_iterator @@ -771,11 +744,6 @@ class MachineBasicBlock friend class MachineBranchProbabilityInfo; friend class MIPrinter; - /// Return weight of the edge from this block to MBB. This method should NOT - /// be called directly, but by using getEdgeWeight method from - /// MachineBranchProbabilityInfo class. - uint32_t getSuccWeight(const_succ_iterator Succ) const; - /// Return probability of the edge from this block to MBB. 
This method should /// NOT be called directly, but by using getEdgeProbability method from /// MachineBranchProbabilityInfo class. diff --git a/include/llvm/CodeGen/MachineBranchProbabilityInfo.h b/include/llvm/CodeGen/MachineBranchProbabilityInfo.h index 058ab32f3aa9..608e8d257874 100644 --- a/include/llvm/CodeGen/MachineBranchProbabilityInfo.h +++ b/include/llvm/CodeGen/MachineBranchProbabilityInfo.h @@ -55,10 +55,15 @@ class MachineBranchProbabilityInfo : public ImmutablePass { uint32_t getEdgeWeight(const MachineBasicBlock *Src, MachineBasicBlock::const_succ_iterator Dst) const; - // Get sum of the block successors' weights, potentially scaling them to fit - // within 32-bits. If scaling is required, sets Scale based on the necessary - // adjustment. Any edge weights used with the sum should be divided by Scale. - uint32_t getSumForBlock(const MachineBasicBlock *MBB, uint32_t &Scale) const; + // Return edge probability. + BranchProbability getEdgeProbability(const MachineBasicBlock *Src, + const MachineBasicBlock *Dst) const; + + // Same as above, but using a const_succ_iterator from Src. This is faster + // when the iterator is already available. + BranchProbability + getEdgeProbability(const MachineBasicBlock *Src, + MachineBasicBlock::const_succ_iterator Dst) const; // A 'Hot' edge is an edge which probability is >= 80%. bool isEdgeHot(const MachineBasicBlock *Src, @@ -68,15 +73,6 @@ class MachineBranchProbabilityInfo : public ImmutablePass { // NB: This routine's complexity is linear on the number of successors. MachineBasicBlock *getHotSucc(MachineBasicBlock *MBB) const; - // Return a probability as a fraction between 0 (0% probability) and - // 1 (100% probability), however the value is never equal to 0, and can be 1 - // only iff SRC block has only one successor. - // NB: This routine's complexity is linear on the number of successors of - // Src. Querying sequentially for each successor's probability is a quadratic - // query pattern. 
- BranchProbability getEdgeProbability(const MachineBasicBlock *Src, - const MachineBasicBlock *Dst) const; - // Print value between 0 (0% probability) and 1 (100% probability), // however the value is never equal to 0, and can be 1 only iff SRC block // has only one successor. diff --git a/include/llvm/Support/BranchProbability.h b/include/llvm/Support/BranchProbability.h index 3620d4d5d772..2548384f346b 100644 --- a/include/llvm/Support/BranchProbability.h +++ b/include/llvm/Support/BranchProbability.h @@ -53,6 +53,9 @@ class BranchProbability { // Create a BranchProbability object with the given numerator and 1<<31 // as denominator. static BranchProbability getRaw(uint32_t N) { return BranchProbability(N); } + // Create a BranchProbability object from 64-bit integers. + static BranchProbability getBranchProbability(uint64_t Numerator, + uint64_t Denominator); // Normalize given probabilties so that the sum of them becomes approximate // one. @@ -131,10 +134,30 @@ class BranchProbability { bool operator==(BranchProbability RHS) const { return N == RHS.N; } bool operator!=(BranchProbability RHS) const { return !(*this == RHS); } - bool operator<(BranchProbability RHS) const { return N < RHS.N; } - bool operator>(BranchProbability RHS) const { return RHS < *this; } - bool operator<=(BranchProbability RHS) const { return !(RHS < *this); } - bool operator>=(BranchProbability RHS) const { return !(*this < RHS); } + + bool operator<(BranchProbability RHS) const { + assert(N != UnknownN && RHS.N != UnknownN && + "Unknown probability cannot participate in comparisons."); + return N < RHS.N; + } + + bool operator>(BranchProbability RHS) const { + assert(N != UnknownN && RHS.N != UnknownN && + "Unknown probability cannot participate in comparisons."); + return RHS < *this; + } + + bool operator<=(BranchProbability RHS) const { + assert(N != UnknownN && RHS.N != UnknownN && + "Unknown probability cannot participate in comparisons."); + return !(RHS < *this); + } + + bool 
operator>=(BranchProbability RHS) const { + assert(N != UnknownN && RHS.N != UnknownN && + "Unknown probability cannot participate in comparisons."); + return !(*this < RHS); + } }; inline raw_ostream &operator<<(raw_ostream &OS, BranchProbability Prob) { diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp index f48394698699..6cdf43a06a9f 100644 --- a/lib/Analysis/BranchProbabilityInfo.cpp +++ b/lib/Analysis/BranchProbabilityInfo.cpp @@ -647,6 +647,12 @@ getEdgeProbability(const BasicBlock *Src, const BasicBlock *Dst) const { return BranchProbability(N, D); } +BranchProbability +BranchProbabilityInfo::getEdgeProbability(const BasicBlock *Src, + succ_const_iterator Dst) const { + return getEdgeProbability(Src, Dst.getSuccessorIndex()); +} + raw_ostream & BranchProbabilityInfo::printEdgeProbability(raw_ostream &OS, const BasicBlock *Src, diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp index 0b2495cc996e..54d92ad67a97 100644 --- a/lib/CodeGen/BranchFolding.cpp +++ b/lib/CodeGen/BranchFolding.cpp @@ -1099,13 +1099,19 @@ void BranchFolder::setCommonTailEdgeWeights(MachineBasicBlock &TailMBB) { if (TailMBB.succ_size() <= 1) return; - auto MaxEdgeFreq = *std::max_element(EdgeFreqLs.begin(), EdgeFreqLs.end()); - uint64_t Scale = MaxEdgeFreq.getFrequency() / UINT32_MAX + 1; + auto SumEdgeFreq = + std::accumulate(EdgeFreqLs.begin(), EdgeFreqLs.end(), BlockFrequency(0)) + .getFrequency(); auto EdgeFreq = EdgeFreqLs.begin(); - for (auto SuccI = TailMBB.succ_begin(), SuccE = TailMBB.succ_end(); - SuccI != SuccE; ++SuccI, ++EdgeFreq) - TailMBB.setSuccWeight(SuccI, EdgeFreq->getFrequency() / Scale); + if (SumEdgeFreq > 0) { + for (auto SuccI = TailMBB.succ_begin(), SuccE = TailMBB.succ_end(); + SuccI != SuccE; ++SuccI, ++EdgeFreq) { + auto Prob = BranchProbability::getBranchProbability( + EdgeFreq->getFrequency(), SumEdgeFreq); + TailMBB.setSuccProbability(SuccI, Prob); + } + } } 
//===----------------------------------------------------------------------===// diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp index 0b2f3ea165f8..ff28f95cc33d 100644 --- a/lib/CodeGen/IfConversion.cpp +++ b/lib/CodeGen/IfConversion.cpp @@ -32,6 +32,7 @@ #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include using namespace llvm; @@ -1151,28 +1152,6 @@ bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) { return true; } -/// Scale down weights to fit into uint32_t. NewTrue is the new weight -/// for successor TrueBB, and NewFalse is the new weight for successor -/// FalseBB. -static void ScaleWeights(uint64_t NewTrue, uint64_t NewFalse, - MachineBasicBlock *MBB, - const MachineBasicBlock *TrueBB, - const MachineBasicBlock *FalseBB, - const MachineBranchProbabilityInfo *MBPI) { - uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse; - uint32_t Scale = (NewMax / UINT32_MAX) + 1; - for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(), - SE = MBB->succ_end(); - SI != SE; ++SI) { - if (*SI == TrueBB) - MBB->setSuccWeight(SI, (uint32_t)(NewTrue / Scale)); - else if (*SI == FalseBB) - MBB->setSuccWeight(SI, (uint32_t)(NewFalse / Scale)); - else - MBB->setSuccWeight(SI, MBPI->getEdgeWeight(MBB, SI) / Scale); - } -} - /// IfConvertTriangle - If convert a triangle sub-CFG. /// bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { @@ -1229,16 +1208,14 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { DontKill.clear(); bool HasEarlyExit = CvtBBI->FalseBB != nullptr; - uint64_t CvtNext = 0, CvtFalse = 0, BBNext = 0, BBCvt = 0, SumWeight = 0; - uint32_t WeightScale = 0; + BranchProbability CvtNext, CvtFalse, BBNext, BBCvt; if (HasEarlyExit) { - // Get weights before modifying CvtBBI->BB and BBI.BB. 
- CvtNext = MBPI->getEdgeWeight(CvtBBI->BB, NextBBI->BB); - CvtFalse = MBPI->getEdgeWeight(CvtBBI->BB, CvtBBI->FalseBB); - BBNext = MBPI->getEdgeWeight(BBI.BB, NextBBI->BB); - BBCvt = MBPI->getEdgeWeight(BBI.BB, CvtBBI->BB); - SumWeight = MBPI->getSumForBlock(CvtBBI->BB, WeightScale); + // Get probabilities before modifying CvtBBI->BB and BBI.BB. + CvtNext = MBPI->getEdgeProbability(CvtBBI->BB, NextBBI->BB); + CvtFalse = MBPI->getEdgeProbability(CvtBBI->BB, CvtBBI->FalseBB); + BBNext = MBPI->getEdgeProbability(BBI.BB, NextBBI->BB); + BBCvt = MBPI->getEdgeProbability(BBI.BB, CvtBBI->BB); } if (CvtBBI->BB->pred_size() > 1) { @@ -1266,22 +1243,24 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { CvtBBI->BrCond.end()); if (TII->ReverseBranchCondition(RevCond)) llvm_unreachable("Unable to reverse branch condition!"); + + // Update the edge probability for both CvtBBI->FalseBB and NextBBI. + // NewNext = New_Prob(BBI.BB, NextBBI->BB) = + // Prob(BBI.BB, NextBBI->BB) + + // Prob(BBI.BB, CvtBBI->BB) * Prob(CvtBBI->BB, NextBBI->BB) + // NewFalse = New_Prob(BBI.BB, CvtBBI->FalseBB) = + // Prob(BBI.BB, CvtBBI->BB) * Prob(CvtBBI->BB, CvtBBI->FalseBB) + auto NewTrueBB = getNextBlock(BBI.BB); + auto NewNext = BBNext + BBCvt * CvtNext; + auto NewTrueBBIter = + std::find(BBI.BB->succ_begin(), BBI.BB->succ_end(), NewTrueBB); + assert(NewTrueBBIter != BBI.BB->succ_end() && + "NewTrueBB is not a successor of BBI.BB."); + BBI.BB->setSuccProbability(NewTrueBBIter, NewNext); + + auto NewFalse = BBCvt * CvtFalse; TII->InsertBranch(*BBI.BB, CvtBBI->FalseBB, nullptr, RevCond, dl); - BBI.BB->addSuccessor(CvtBBI->FalseBB); - // Update the edge weight for both CvtBBI->FalseBB and NextBBI. 
- // New_Weight(BBI.BB, NextBBI->BB) = - // Weight(BBI.BB, NextBBI->BB) * getSumForBlock(CvtBBI->BB) + - // Weight(BBI.BB, CvtBBI->BB) * Weight(CvtBBI->BB, NextBBI->BB) - // New_Weight(BBI.BB, CvtBBI->FalseBB) = - // Weight(BBI.BB, CvtBBI->BB) * Weight(CvtBBI->BB, CvtBBI->FalseBB) - - uint64_t NewNext = BBNext * SumWeight + (BBCvt * CvtNext) / WeightScale; - uint64_t NewFalse = (BBCvt * CvtFalse) / WeightScale; - // We need to scale down all weights of BBI.BB to fit uint32_t. - // Here BBI.BB is connected to CvtBBI->FalseBB and will fall through to - // the next block. - ScaleWeights(NewNext, NewFalse, BBI.BB, getNextBlock(BBI.BB), - CvtBBI->FalseBB, MBPI); + BBI.BB->addSuccessor(CvtBBI->FalseBB, NewFalse); } // Merge in the 'false' block if the 'false' block has no other @@ -1524,7 +1503,7 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind, MergeBlocks(BBI, TailBBI); TailBBI.IsDone = true; } else { - BBI.BB->addSuccessor(TailBB); + BBI.BB->addSuccessor(TailBB, BranchProbability::getOne()); InsertUncondBranch(BBI.BB, TailBB, TII); BBI.HasFallThrough = false; } @@ -1688,21 +1667,26 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) { FromBBI.BB->succ_end()); MachineBasicBlock *NBB = getNextBlock(FromBBI.BB); MachineBasicBlock *FallThrough = FromBBI.HasFallThrough ? NBB : nullptr; - - // The edge weight from ToBBI.BB to FromBBI.BB, which is only needed when + // The edge probability from ToBBI.BB to FromBBI.BB, which is only needed when // AddEdges is true and FromBBI.BB is a successor of ToBBI.BB. - uint32_t To2FromWeight = 0; - // WeightScale and SumWeight are for calculating successor probabilities of - // FromBBI.BB. 
- uint32_t WeightScale = 0; - uint32_t SumWeight = 0; + auto To2FromProb = BranchProbability::getZero(); if (AddEdges && ToBBI.BB->isSuccessor(FromBBI.BB)) { - To2FromWeight = MBPI->getEdgeWeight(ToBBI.BB, FromBBI.BB); - // Set the edge weight from ToBBI.BB to FromBBI.BB to zero to avoid the edge - // weight being merged to other edges when this edge is removed later. - ToBBI.BB->setSuccWeight( - std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), FromBBI.BB), 0); - SumWeight = MBPI->getSumForBlock(FromBBI.BB, WeightScale); + To2FromProb = MBPI->getEdgeProbability(ToBBI.BB, FromBBI.BB); + // Set the edge probability from ToBBI.BB to FromBBI.BB to zero to avoid the + // edge probability being merged to other edges when this edge is removed + // later. + ToBBI.BB->setSuccProbability( + std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), FromBBI.BB), + BranchProbability::getZero()); + } + + if (AddEdges && ToBBI.BB->isSuccessor(FromBBI.BB)) { + // Set the edge probability from ToBBI.BB to FromBBI.BB to zero to avoid the + // edge probability being merged to other edges when this edge is removed + // later. + ToBBI.BB->setSuccProbability( + std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), FromBBI.BB), + BranchProbability::getZero()); } for (unsigned i = 0, e = FromSuccs.size(); i != e; ++i) { @@ -1711,39 +1695,38 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) { if (Succ == FallThrough) continue; - uint32_t NewWeight = 0; + auto NewProb = BranchProbability::getZero(); if (AddEdges) { - // Calculate the edge weight for the edge from ToBBI.BB to Succ, which is - // a portion of the edge weight from FromBBI.BB to Succ. The portion ratio - // is the edge probability from ToBBI.BB to FromBBI.BB (if FromBBI is a - // successor of ToBBI.BB. See comment below for excepion). 
- NewWeight = MBPI->getEdgeWeight(FromBBI.BB, Succ); + // Calculate the edge probability for the edge from ToBBI.BB to Succ, + // which is a portion of the edge probability from FromBBI.BB to Succ. The + // portion ratio is the edge probability from ToBBI.BB to FromBBI.BB (if + // FromBBI is a successor of ToBBI.BB. See comment below for excepion). + NewProb = MBPI->getEdgeProbability(FromBBI.BB, Succ); - // To2FromWeight is 0 when FromBBI.BB is not a successor of ToBBI.BB. This + // To2FromProb is 0 when FromBBI.BB is not a successor of ToBBI.BB. This // only happens when if-converting a diamond CFG and FromBBI.BB is the // tail BB. In this case FromBBI.BB post-dominates ToBBI.BB and hence we - // could just use the weights on FromBBI.BB's out-edges when adding new - // successors. - if (To2FromWeight > 0) { - BranchProbability Prob(NewWeight / WeightScale, SumWeight); - NewWeight = Prob.scale(To2FromWeight); - } + // could just use the probabilities on FromBBI.BB's out-edges when adding + // new successors. + if (!To2FromProb.isZero()) + NewProb *= To2FromProb; } FromBBI.BB->removeSuccessor(Succ); if (AddEdges) { - // If the edge from ToBBI.BB to Succ already exists, update the weight of - // this edge by adding NewWeight to it. An example is shown below, in - // which A is ToBBI.BB and B is FromBBI.BB. In this case we don't have to - // set C as A's successor as it already is. We only need to update the - // edge weight on A->C. Note that B will not be immediately removed from - // A's successors. It is possible that B->D is not removed either if D is - // a fallthrough of B. Later the edge A->D (generated here) and B->D will - // be combined into one edge. To maintain correct edge weight of this - // combined edge, we need to set the edge weight of A->B to zero, which is - // already done above. The edge weight on A->D is calculated by scaling - // the original weight on A->B by the probability of B->D. 
+ // If the edge from ToBBI.BB to Succ already exists, update the + // probability of this edge by adding NewWeight to it. An example is shown + // below, in which A is ToBBI.BB and B is FromBBI.BB. In this case we + // don't have to set C as A's successor as it already is. We only need to + // update the edge probability on A->C. Note that B will not be + // immediately removed from A's successors. It is possible that B->D is + // not removed either if D is a fallthrough of B. Later the edge A->D + // (generated here) and B->D will be combined into one edge. To maintain + // correct edge probability of this combined edge, we need to set the edge + // probability of A->B to zero, which is already done above. The edge + // probability on A->D is calculated by scaling the original probability + // on A->B by the probability of B->D. // // Before ifcvt: After ifcvt (assume B->D is kept): // @@ -1755,11 +1738,11 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) { // C D C D // if (ToBBI.BB->isSuccessor(Succ)) - ToBBI.BB->setSuccWeight( + ToBBI.BB->setSuccProbability( std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), Succ), - MBPI->getEdgeWeight(ToBBI.BB, Succ) + NewWeight); + MBPI->getEdgeProbability(ToBBI.BB, Succ) + NewProb); else - ToBBI.BB->addSuccessor(Succ, NewWeight); + ToBBI.BB->addSuccessor(Succ, NewProb); } } diff --git a/lib/CodeGen/MIRParser/MIParser.cpp b/lib/CodeGen/MIRParser/MIParser.cpp index 5a8e96df7603..c9c2d62cec30 100644 --- a/lib/CodeGen/MIRParser/MIParser.cpp +++ b/lib/CodeGen/MIRParser/MIParser.cpp @@ -459,8 +459,9 @@ bool MIParser::parseBasicBlockSuccessors(MachineBasicBlock &MBB) { if (expectAndConsume(MIToken::rparen)) return true; } - MBB.addSuccessor(SuccMBB, Weight); + MBB.addSuccessor(SuccMBB, BranchProbability::getRaw(Weight)); } while (consumeIfPresent(MIToken::comma)); + MBB.normalizeSuccProbs(); return false; } diff --git a/lib/CodeGen/MIRPrinter.cpp b/lib/CodeGen/MIRPrinter.cpp index 
0be7807064fb..175cb0d51437 100644 --- a/lib/CodeGen/MIRPrinter.cpp +++ b/lib/CodeGen/MIRPrinter.cpp @@ -461,8 +461,8 @@ void MIPrinter::print(const MachineBasicBlock &MBB) { if (I != MBB.succ_begin()) OS << ", "; printMBBReference(**I); - if (MBB.hasSuccessorWeights()) - OS << '(' << MBB.getSuccWeight(I) << ')'; + if (MBB.hasSuccessorProbabilities()) + OS << '(' << MBB.getSuccProbability(I) << ')'; } OS << "\n"; HasLineAttributes = true; diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index 602b75182fca..c9c6a9d62462 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -319,8 +319,8 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST, OS << " Successors according to CFG:"; for (const_succ_iterator SI = succ_begin(), E = succ_end(); SI != E; ++SI) { OS << " BB#" << (*SI)->getNumber(); - if (!Weights.empty()) - OS << '(' << *getWeightIterator(SI) << ')'; + if (!Probs.empty()) + OS << '(' << *getProbabilityIterator(SI) << ')'; } OS << '\n'; } @@ -506,34 +506,16 @@ void MachineBasicBlock::updateTerminator() { } } -void MachineBasicBlock::addSuccessor(MachineBasicBlock *Succ, uint32_t Weight) { - // Weight list is either empty (if successor list isn't empty, this means - // disabled optimization) or has the same size as successor list. - if (!(Weights.empty() && !Successors.empty())) - Weights.push_back(Weight); - Successors.push_back(Succ); - Succ->addPredecessor(this); -} - -void MachineBasicBlock::addSuccessorWithoutWeight(MachineBasicBlock *Succ) { - // We need to make sure weight list is either empty or has the same size of - // successor list. When this function is called, we can safely delete all - // weight in the list. 
- Weights.clear(); - Successors.push_back(Succ); - Succ->addPredecessor(this); -} - void MachineBasicBlock::addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob) { // Probability list is either empty (if successor list isn't empty, this means // disabled optimization) or has the same size as successor list. if (!(Probs.empty() && !Successors.empty())) { + assert((Probs.empty() || (Prob.isUnknown() && Probs.back().isUnknown()) || + (!Prob.isUnknown() && !Probs.back().isUnknown())) && + "Successors with both known and unknwon probabilities are not " + "allowed."); Probs.push_back(Prob); - // FIXME: Temporarily use the numerator of the probability to represent edge - // weight. This will be removed once all weight-version interfaces in MBB - // are replaced with probability-version interfaces. - Weights.push_back(Prob.getNumerator()); } Successors.push_back(Succ); Succ->addPredecessor(this); @@ -544,7 +526,6 @@ void MachineBasicBlock::addSuccessorWithoutProb(MachineBasicBlock *Succ) { // of successor list. When this function is called, we can safely delete all // probability in the list. Probs.clear(); - Weights.clear(); Successors.push_back(Succ); Succ->addPredecessor(this); } @@ -558,23 +539,12 @@ MachineBasicBlock::succ_iterator MachineBasicBlock::removeSuccessor(succ_iterator I) { assert(I != Successors.end() && "Not a current successor!"); - // If Weight list is empty it means we don't use it (disabled optimization). - if (!Weights.empty()) { - weight_iterator WI = getWeightIterator(I); - Weights.erase(WI); - } - - // FIXME: Temporarily comment the following code as probabilities are now only - // used during instruction lowering, but this interface is called in later - // passes. Uncomment it once all edge weights are replaced with probabilities. -#if 0 // If probability list is empty it means we don't use it (disabled // optimization). 
if (!Probs.empty()) { probability_iterator WI = getProbabilityIterator(I); Probs.erase(WI); } -#endif (*I)->removePredecessor(this); return Successors.erase(I); @@ -611,17 +581,12 @@ void MachineBasicBlock::replaceSuccessor(MachineBasicBlock *Old, } // New is already a successor. - // Update its weight instead of adding a duplicate edge. - if (!Weights.empty()) - *getWeightIterator(NewI) += *getWeightIterator(OldI); - // FIXME: Temporarily comment the following code as probabilities are now only - // used during instruction lowering, but this interface is called in later - // passes. Uncomment it once all edge weights are replaced with probabilities. -#if 0 // Update its probability instead of adding a duplicate edge. - if (!Probs.empty()) - *getProbabilityIterator(NewI) += *getProbabilityIterator(OldI); -#endif + if (!Probs.empty()) { + auto ProbIter = getProbabilityIterator(NewI); + if (!ProbIter->isUnknown()) + *ProbIter += *getProbabilityIterator(OldI); + } removeSuccessor(OldI); } @@ -641,13 +606,14 @@ void MachineBasicBlock::transferSuccessors(MachineBasicBlock *FromMBB) { while (!FromMBB->succ_empty()) { MachineBasicBlock *Succ = *FromMBB->succ_begin(); - uint32_t Weight = 0; - // If Weight list is empty it means we don't use it (disabled optimization). - if (!FromMBB->Weights.empty()) - Weight = *FromMBB->Weights.begin(); + // If probability list is empty it means we don't use it (disabled optimization). 
+ if (!FromMBB->Probs.empty()) { + auto Prob = *FromMBB->Probs.begin(); + addSuccessor(Succ, Prob); + } else + addSuccessorWithoutProb(Succ); - addSuccessor(Succ, Weight); FromMBB->removeSuccessor(Succ); } } @@ -659,10 +625,11 @@ MachineBasicBlock::transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB) { while (!FromMBB->succ_empty()) { MachineBasicBlock *Succ = *FromMBB->succ_begin(); - uint32_t Weight = 0; - if (!FromMBB->Weights.empty()) - Weight = *FromMBB->Weights.begin(); - addSuccessor(Succ, Weight); + if (!FromMBB->Probs.empty()) { + auto Prob = *FromMBB->Probs.begin(); + addSuccessor(Succ, Prob); + } else + addSuccessorWithoutProb(Succ); FromMBB->removeSuccessor(Succ); // Fix up any PHI nodes in the successor. @@ -1146,80 +1113,37 @@ MachineBasicBlock::findDebugLoc(instr_iterator MBBI) { return DL; } -/// Return weight of the edge from this block to MBB. -uint32_t MachineBasicBlock::getSuccWeight(const_succ_iterator Succ) const { - if (Weights.empty()) - return 0; - - return *getWeightIterator(Succ); -} - -/// Return probability of the edge from this block to MBB. If probability list -/// is empty, return a default probability which is 1/N, where N is the number -/// of successors. If the probability of the given successor is unknown, then -/// sum up all known probabilities and return the complement of the sum divided -/// by the number of unknown probabilities. +/// Return probability of the edge from this block to MBB. BranchProbability MachineBasicBlock::getSuccProbability(const_succ_iterator Succ) const { - if (Probs.empty()) + if (Probs.empty() || Probs.back().isUnknown()) return BranchProbability(1, succ_size()); - auto Prob = *getProbabilityIterator(Succ); - assert(!Prob.isUnknown()); - return Prob; -} - -/// Set successor weight of a given iterator. 
-void MachineBasicBlock::setSuccWeight(succ_iterator I, uint32_t Weight) { - if (Weights.empty()) - return; - *getWeightIterator(I) = Weight; + return *getProbabilityIterator(Succ); } /// Set successor probability of a given iterator. void MachineBasicBlock::setSuccProbability(succ_iterator I, BranchProbability Prob) { assert(!Prob.isUnknown()); - if (Probs.empty() || Weights.empty()) + if (Probs.empty()) return; *getProbabilityIterator(I) = Prob; - // FIXME: Temporarily use the numerator of the probability to represent edge - // weight. This will be removed once all weight-version interfaces in MBB - // are replaces with probability-version interfaces. - *getWeightIterator(I) = Prob.getNumerator(); -} - -/// Return wight iterator corresonding to the I successor iterator. -MachineBasicBlock::weight_iterator MachineBasicBlock:: -getWeightIterator(MachineBasicBlock::succ_iterator I) { - assert(Weights.size() == Successors.size() && "Async weight list!"); - size_t index = std::distance(Successors.begin(), I); - assert(index < Weights.size() && "Not a current successor!"); - return Weights.begin() + index; } -/// Return wight iterator corresonding to the I successor iterator. -MachineBasicBlock::const_weight_iterator MachineBasicBlock:: -getWeightIterator(MachineBasicBlock::const_succ_iterator I) const { - assert(Weights.size() == Successors.size() && "Async weight list!"); - const size_t index = std::distance(Successors.begin(), I); - assert(index < Weights.size() && "Not a current successor!"); - return Weights.begin() + index; -} - -/// Return probability iterator corresonding to the I successor iterator. 
-MachineBasicBlock::probability_iterator -MachineBasicBlock::getProbabilityIterator(MachineBasicBlock::succ_iterator I) { +/// Return probability iterator corresonding to the I successor iterator +MachineBasicBlock::const_probability_iterator +MachineBasicBlock::getProbabilityIterator( + MachineBasicBlock::const_succ_iterator I) const { assert(Probs.size() == Successors.size() && "Async probability list!"); const size_t index = std::distance(Successors.begin(), I); assert(index < Probs.size() && "Not a current successor!"); return Probs.begin() + index; } -/// Return probability iterator corresonding to the I successor iterator -MachineBasicBlock::const_probability_iterator -MachineBasicBlock::getProbabilityIterator( - MachineBasicBlock::const_succ_iterator I) const { +/// Return probability iterator corresonding to the I successor iterator. +MachineBasicBlock::probability_iterator +MachineBasicBlock::getProbabilityIterator(MachineBasicBlock::succ_iterator I) { assert(Probs.size() == Successors.size() && "Async probability list!"); const size_t index = std::distance(Successors.begin(), I); assert(index < Probs.size() && "Not a current successor!"); diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp index fba33eb93d5f..fcddf346cf68 100644 --- a/lib/CodeGen/MachineBlockPlacement.cpp +++ b/lib/CodeGen/MachineBlockPlacement.cpp @@ -380,19 +380,11 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, const BranchProbability HotProb(4, 5); // 80% MachineBasicBlock *BestSucc = nullptr; - // FIXME: Due to the performance of the probability and weight routines in - // the MBPI analysis, we manually compute probabilities using the edge - // weights. This is suboptimal as it means that the somewhat subtle - // definition of edge weight semantics is encoded here as well. We should - // improve the MBPI interface to efficiently support query patterns such as - // this. 
- uint32_t BestWeight = 0; - uint32_t WeightScale = 0; - uint32_t SumWeight = MBPI->getSumForBlock(BB, WeightScale); - - // Adjust sum of weights by excluding weights on edges pointing to blocks that - // is either not in BlockFilter or is already in the current chain. Consider - // the following CFG: + auto BestProb = BranchProbability::getZero(); + + // Adjust edge probabilities by excluding edges pointing to blocks that is + // either not in BlockFilter or is already in the current chain. Consider the + // following CFG: // // --->A // | / \ @@ -406,7 +398,7 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, // HotProb). If we exclude E that is not in BlockFilter when calculating the // probability of C->D, D will be selected and we will get A C D B as the // layout of this loop. - uint32_t AdjustedSumWeight = SumWeight; + auto AdjustedSumProb = BranchProbability::getOne(); SmallVector Successors; for (MachineBasicBlock *Succ : BB->successors()) { bool SkipSucc = false; @@ -424,15 +416,20 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, } } if (SkipSucc) - AdjustedSumWeight -= MBPI->getEdgeWeight(BB, Succ) / WeightScale; + AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ); else Successors.push_back(Succ); } DEBUG(dbgs() << "Attempting merge from: " << getBlockName(BB) << "\n"); for (MachineBasicBlock *Succ : Successors) { - uint32_t SuccWeight = MBPI->getEdgeWeight(BB, Succ); - BranchProbability SuccProb(SuccWeight / WeightScale, AdjustedSumWeight); + BranchProbability SuccProb; + uint32_t SuccProbN = MBPI->getEdgeProbability(BB, Succ).getNumerator(); + uint32_t SuccProbD = AdjustedSumProb.getNumerator(); + if (SuccProbN >= SuccProbD) + SuccProb = BranchProbability::getOne(); + else + SuccProb = BranchProbability(SuccProbN, SuccProbD); // If we outline optional branches, look whether Succ is unavoidable, i.e. // dominates all terminators of the MachineFunction. 
If it does, other @@ -470,7 +467,7 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, // Make sure that a hot successor doesn't have a globally more // important predecessor. - BranchProbability RealSuccProb(SuccWeight / WeightScale, SumWeight); + auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ); BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb * HotProb.getCompl(); bool BadCFGConflict = false; @@ -496,10 +493,10 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, << " (prob)" << (SuccChain.LoopPredecessors != 0 ? " (CFG break)" : "") << "\n"); - if (BestSucc && BestWeight >= SuccWeight) + if (BestSucc && BestProb >= SuccProb) continue; BestSucc = Succ; - BestWeight = SuccWeight; + BestProb = SuccProb; } return BestSucc; } @@ -728,11 +725,6 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L, MachineBasicBlock *OldExitingBB = ExitingBB; BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq; bool HasLoopingSucc = false; - // FIXME: Due to the performance of the probability and weight routines in - // the MBPI analysis, we use the internal weights and manually compute the - // probabilities to avoid quadratic behavior. 
- uint32_t WeightScale = 0; - uint32_t SumWeight = MBPI->getSumForBlock(MBB, WeightScale); for (MachineBasicBlock *Succ : MBB->successors()) { if (Succ->isEHPad()) continue; @@ -746,10 +738,10 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L, continue; } - uint32_t SuccWeight = MBPI->getEdgeWeight(MBB, Succ); + auto SuccProb = MBPI->getEdgeProbability(MBB, Succ); if (LoopBlockSet.count(Succ)) { DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> " - << getBlockName(Succ) << " (" << SuccWeight << ")\n"); + << getBlockName(Succ) << " (" << SuccProb << ")\n"); HasLoopingSucc = true; continue; } @@ -761,7 +753,6 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L, BlocksExitingToOuterLoop.insert(MBB); } - BranchProbability SuccProb(SuccWeight / WeightScale, SumWeight); BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb; DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> " << getBlockName(Succ) << " [L:" << SuccLoopDepth << "] ("; @@ -904,21 +895,17 @@ void MachineBlockPlacement::rotateLoopWithProfile( // edge from the tail of the loop chain. 
SmallVector, 4> ExitsWithFreq; for (auto BB : LoopChain) { - uint32_t LargestExitEdgeWeight = 0; + auto LargestExitEdgeProb = BranchProbability::getZero(); for (auto *Succ : BB->successors()) { BlockChain *SuccChain = BlockToChain[Succ]; if (!LoopBlockSet.count(Succ) && (!SuccChain || Succ == *SuccChain->begin())) { - uint32_t SuccWeight = MBPI->getEdgeWeight(BB, Succ); - LargestExitEdgeWeight = std::max(LargestExitEdgeWeight, SuccWeight); + auto SuccProb = MBPI->getEdgeProbability(BB, Succ); + LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb); } } - if (LargestExitEdgeWeight > 0) { - uint32_t WeightScale = 0; - uint32_t SumWeight = MBPI->getSumForBlock(BB, WeightScale); - auto ExitFreq = - MBFI->getBlockFreq(BB) * - BranchProbability(LargestExitEdgeWeight / WeightScale, SumWeight); + if (LargestExitEdgeProb > BranchProbability::getZero()) { + auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb; ExitsWithFreq.emplace_back(BB, ExitFreq); } } @@ -1290,14 +1277,16 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { } // If PrevBB has a two-way branch, try to re-order the branches - // such that we branch to the successor with higher weight first. + // such that we branch to the successor with higher probability first. 
if (TBB && !Cond.empty() && FBB && - MBPI->getEdgeWeight(PrevBB, FBB) > MBPI->getEdgeWeight(PrevBB, TBB) && + MBPI->getEdgeProbability(PrevBB, FBB) > + MBPI->getEdgeProbability(PrevBB, TBB) && !TII->ReverseBranchCondition(Cond)) { DEBUG(dbgs() << "Reverse order of the two branches: " << getBlockName(PrevBB) << "\n"); - DEBUG(dbgs() << " Edge weight: " << MBPI->getEdgeWeight(PrevBB, FBB) - << " vs " << MBPI->getEdgeWeight(PrevBB, TBB) << "\n"); + DEBUG(dbgs() << " Edge probability: " + << MBPI->getEdgeProbability(PrevBB, FBB) << " vs " + << MBPI->getEdgeProbability(PrevBB, TBB) << "\n"); DebugLoc dl; // FIXME: this is nowhere TII->RemoveBranch(*PrevBB); TII->InsertBranch(*PrevBB, FBB, TBB, Cond, dl); diff --git a/lib/CodeGen/MachineBranchProbabilityInfo.cpp b/lib/CodeGen/MachineBranchProbabilityInfo.cpp index 6fbc2be70486..5478dcba261a 100644 --- a/lib/CodeGen/MachineBranchProbabilityInfo.cpp +++ b/lib/CodeGen/MachineBranchProbabilityInfo.cpp @@ -28,91 +28,61 @@ char MachineBranchProbabilityInfo::ID = 0; void MachineBranchProbabilityInfo::anchor() { } -uint32_t MachineBranchProbabilityInfo:: -getSumForBlock(const MachineBasicBlock *MBB, uint32_t &Scale) const { - // First we compute the sum with 64-bits of precision, ensuring that cannot - // overflow by bounding the number of weights considered. Hopefully no one - // actually needs 2^32 successors. - assert(MBB->succ_size() < UINT32_MAX); - uint64_t Sum = 0; - Scale = 1; - for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(), - E = MBB->succ_end(); I != E; ++I) { - uint32_t Weight = getEdgeWeight(MBB, I); - Sum += Weight; - } - - // If the computed sum fits in 32-bits, we're done. 
- if (Sum <= UINT32_MAX) - return Sum; +uint32_t MachineBranchProbabilityInfo::getEdgeWeight( + const MachineBasicBlock *Src, + MachineBasicBlock::const_succ_iterator Dst) const { + return Src->getSuccProbability(Dst).getNumerator(); +} - // Otherwise, compute the scale necessary to cause the weights to fit, and - // re-sum with that scale applied. - assert((Sum / UINT32_MAX) < UINT32_MAX); - Scale = (Sum / UINT32_MAX) + 1; - Sum = 0; - for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(), - E = MBB->succ_end(); I != E; ++I) { - uint32_t Weight = getEdgeWeight(MBB, I); - Sum += Weight / Scale; - } - assert(Sum <= UINT32_MAX); - return Sum; +uint32_t MachineBranchProbabilityInfo::getEdgeWeight( + const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { + // This is a linear search. Try to use the const_succ_iterator version when + // possible. + return getEdgeWeight(Src, std::find(Src->succ_begin(), Src->succ_end(), Dst)); } -uint32_t MachineBranchProbabilityInfo:: -getEdgeWeight(const MachineBasicBlock *Src, - MachineBasicBlock::const_succ_iterator Dst) const { - uint32_t Weight = Src->getSuccWeight(Dst); - if (!Weight) - return DEFAULT_WEIGHT; - return Weight; +BranchProbability MachineBranchProbabilityInfo::getEdgeProbability( + const MachineBasicBlock *Src, + MachineBasicBlock::const_succ_iterator Dst) const { + return Src->getSuccProbability(Dst); } -uint32_t MachineBranchProbabilityInfo:: -getEdgeWeight(const MachineBasicBlock *Src, - const MachineBasicBlock *Dst) const { +BranchProbability MachineBranchProbabilityInfo::getEdgeProbability( + const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { // This is a linear search. Try to use the const_succ_iterator version when // possible. 
- return getEdgeWeight(Src, std::find(Src->succ_begin(), Src->succ_end(), Dst)); + return getEdgeProbability(Src, + std::find(Src->succ_begin(), Src->succ_end(), Dst)); } bool MachineBranchProbabilityInfo::isEdgeHot(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { // Hot probability is at least 4/5 = 80% - // FIXME: Compare against a static "hot" BranchProbability. - return getEdgeProbability(Src, Dst) > BranchProbability(4, 5); + static BranchProbability HotProb(4, 5); + return getEdgeProbability(Src, Dst) > HotProb; } MachineBasicBlock * MachineBranchProbabilityInfo::getHotSucc(MachineBasicBlock *MBB) const { - uint32_t MaxWeight = 0; + auto MaxProb = BranchProbability::getZero(); MachineBasicBlock *MaxSucc = nullptr; for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(), E = MBB->succ_end(); I != E; ++I) { - uint32_t Weight = getEdgeWeight(MBB, I); - if (Weight > MaxWeight) { - MaxWeight = Weight; + auto Prob = getEdgeProbability(MBB, I); + if (Prob > MaxProb) { + MaxProb = Prob; MaxSucc = *I; } } - if (getEdgeProbability(MBB, MaxSucc) >= BranchProbability(4, 5)) + static BranchProbability HotProb(4, 5); + if (getEdgeProbability(MBB, MaxSucc) >= HotProb) return MaxSucc; return nullptr; } -BranchProbability MachineBranchProbabilityInfo::getEdgeProbability( - const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { - uint32_t Scale = 1; - uint32_t D = getSumForBlock(Src, Scale); - uint32_t N = getEdgeWeight(Src, Dst) / Scale; - - return BranchProbability(N, D); -} - raw_ostream &MachineBranchProbabilityInfo::printEdgeProbability( raw_ostream &OS, const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { diff --git a/lib/CodeGen/TailDuplication.cpp b/lib/CodeGen/TailDuplication.cpp index ff86dabfac59..1f5b54866ac6 100644 --- a/lib/CodeGen/TailDuplication.cpp +++ b/lib/CodeGen/TailDuplication.cpp @@ -745,12 +745,12 @@ TailDuplicatePass::duplicateSimpleBB(MachineBasicBlock *TailBB, if (PredTBB) 
TII->InsertBranch(*PredBB, PredTBB, PredFBB, PredCond, DebugLoc()); - uint32_t Weight = MBPI->getEdgeWeight(PredBB, TailBB); + auto Prob = MBPI->getEdgeProbability(PredBB, TailBB); PredBB->removeSuccessor(TailBB); unsigned NumSuccessors = PredBB->succ_size(); assert(NumSuccessors <= 1); if (NumSuccessors == 0 || *PredBB->succ_begin() != NewTarget) - PredBB->addSuccessor(NewTarget, Weight); + PredBB->addSuccessor(NewTarget, Prob); TDBBs.push_back(PredBB); } @@ -858,7 +858,7 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, "TailDuplicate called on block with multiple successors!"); for (MachineBasicBlock::succ_iterator I = TailBB->succ_begin(), E = TailBB->succ_end(); I != E; ++I) - PredBB->addSuccessor(*I, MBPI->getEdgeWeight(TailBB, I)); + PredBB->addSuccessor(*I, MBPI->getEdgeProbability(TailBB, I)); Changed = true; ++NumTailDups; diff --git a/lib/Support/BranchProbability.cpp b/lib/Support/BranchProbability.cpp index 3b0f6e6f06e4..771d02c0aa3c 100644 --- a/lib/Support/BranchProbability.cpp +++ b/lib/Support/BranchProbability.cpp @@ -22,11 +22,14 @@ using namespace llvm; const uint32_t BranchProbability::D; raw_ostream &BranchProbability::print(raw_ostream &OS) const { + if (isUnknown()) + return OS << "?%"; + // Get a percentage rounded to two decimal digits. This avoids // implementation-defined rounding inside printf. 
double Percent = rint(((double)N / D) * 100.0 * 100.0) / 100.0; - OS << format("0x%08" PRIx32 " / 0x%08" PRIx32 " = %.2f%%", N, D, Percent); - return OS; + return OS << format("0x%08" PRIx32 " / 0x%08" PRIx32 " = %.2f%%", N, D, + Percent); } void BranchProbability::dump() const { print(dbgs()) << '\n'; } @@ -43,6 +46,19 @@ BranchProbability::BranchProbability(uint32_t Numerator, uint32_t Denominator) { } } +BranchProbability +BranchProbability::getBranchProbability(uint64_t Numerator, + uint64_t Denominator) { + assert(Numerator <= Denominator && "Probability cannot be bigger than 1!"); + // Scale down Denominator to fit in a 32-bit integer. + int Scale = 0; + while (Denominator > UINT32_MAX) { + Denominator >>= 1; + Scale++; + } + return BranchProbability(Numerator >> Scale, Denominator); +} + // If ConstD is not zero, then replace D by ConstD so that division and modulo // operations by D can be optimized, in case this function is not inlined by the // compiler. diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index f1b383017901..cdbd12092150 100644 --- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -1570,8 +1570,7 @@ void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk, insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc()); insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc()); - DstBlk->addSuccessor(LandMBB); - DstBlk->removeSuccessor(DstBlk); + DstBlk->replaceSuccessor(DstBlk, LandMBB); } @@ -1666,8 +1665,7 @@ AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB, replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB); //srcBlk, oldBlk, newBlk - PredMBB->removeSuccessor(MBB); - PredMBB->addSuccessor(CloneMBB); + PredMBB->replaceSuccessor(MBB, CloneMBB); // add all successor to cloneBlk cloneSuccessorList(CloneMBB, MBB); diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp index 
0bf2d374df6a..e89757c19ecc 100644 --- a/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -2274,8 +2274,7 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) { // Update the CFG. NewBB->addSuccessor(BB); - JTBB->removeSuccessor(BB); - JTBB->addSuccessor(NewBB); + JTBB->replaceSuccessor(BB, NewBB); ++NumJTInserted; return NewBB; diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 0cc41812d71c..e8f3ab65bdbe 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -7346,7 +7346,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, } } - BB->addSuccessor(DispatchBB); + BB->addSuccessor(DispatchBB, BranchProbability::getZero()); // Find the invoke call and mark all of the callee-saved registers as // 'implicit defined' so that they're spilled. This prevents code from diff --git a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp index 96bb61750805..efafdd007289 100644 --- a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp +++ b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp @@ -186,13 +186,11 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) { if (case1 || case2) { InvertAndChangeJumpTarget(MI, UncondTarget); - MBB->removeSuccessor(JumpAroundTarget); - MBB->addSuccessor(UncondTarget); + MBB->replaceSuccessor(JumpAroundTarget, UncondTarget); // Remove the unconditional branch in LayoutSucc. LayoutSucc->erase(LayoutSucc->begin()); - LayoutSucc->removeSuccessor(UncondTarget); - LayoutSucc->addSuccessor(JumpAroundTarget); + LayoutSucc->replaceSuccessor(UncondTarget, JumpAroundTarget); // This code performs the conversion for case 2, which moves // the block to the fall-thru case (BB3 in the code above). 
diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp index d09843ed0e53..e75858a181e5 100644 --- a/lib/Target/Mips/MipsLongBranch.cpp +++ b/lib/Target/Mips/MipsLongBranch.cpp @@ -262,8 +262,7 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { static_cast(Subtarget.getInstrInfo()); MF->insert(FallThroughMBB, LongBrMBB); - MBB->removeSuccessor(TgtMBB); - MBB->addSuccessor(LongBrMBB); + MBB->replaceSuccessor(TgtMBB, LongBrMBB); if (IsPIC) { MachineBasicBlock *BalTgtMBB = MF->CreateMachineBasicBlock(BB); diff --git a/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll b/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll index e17da7a97205..a44c9721d6c1 100644 --- a/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll +++ b/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll @@ -14,15 +14,15 @@ entry: br i1 undef, label %for.end, label %for.body ; Before if conversion, we have -; for.body -> lor.lhs.false.i (62) -; -> for.cond.backedge (62) -; lor.lhs.false.i -> for.cond.backedge (1048575) -; -> cond.false.i (1) +; for.body -> lor.lhs.false.i (50%) +; -> for.cond.backedge (50%) +; lor.lhs.false.i -> for.cond.backedge (100%) +; -> cond.false.i (0%) ; Afer if conversion, we have -; for.body -> for.cond.backedge (130023362) -; -> cond.false.i (62) +; for.body -> for.cond.backedge (100%) +; -> cond.false.i (0%) ; CHECK: BB#1: derived from LLVM BB %for.body -; CHECK: Successors according to CFG: BB#2(4294967291) BB#4(2048) +; CHECK: Successors according to CFG: BB#2(0x7ffffc00 / 0x80000000 = 100.00%) BB#4(0x00000400 / 0x80000000 = 0.00%) for.body: br i1 undef, label %for.cond.backedge, label %lor.lhs.false.i, !prof !1 diff --git a/test/CodeGen/ARM/ifcvt-branch-weight.ll b/test/CodeGen/ARM/ifcvt-branch-weight.ll index f2a1229d0d8a..0de039cde23c 100644 --- a/test/CodeGen/ARM/ifcvt-branch-weight.ll +++ b/test/CodeGen/ARM/ifcvt-branch-weight.ll @@ -19,7 +19,7 @@ bb: br i1 %9, label %return, label %bb2 ; CHECK: BB#2: derived from LLVM BB %bb2 -; CHECK: Successors according 
to CFG: BB#3(4294967289) BB#4(4294967287) +; CHECK: Successors according to CFG: BB#3({{[0-9a-fx/= ]+}}50.00%) BB#4({{[0-9a-fx/= ]+}}50.00%) bb2: %v10 = icmp eq i32 %3, 16 diff --git a/test/CodeGen/ARM/ifcvt-iter-indbr.ll b/test/CodeGen/ARM/ifcvt-iter-indbr.ll index 6ce9bcb56ef4..a96b6e8a1e83 100644 --- a/test/CodeGen/ARM/ifcvt-iter-indbr.ll +++ b/test/CodeGen/ARM/ifcvt-iter-indbr.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple thumbv7s-apple-darwin -asm-verbose=false | FileCheck %s -; RUN: llc < %s -mtriple thumbv7s-apple-darwin -asm-verbose=false -print-machineinstrs=if-converter 2>&1 | FileCheck --check-prefix=CHECK-WEIGHT %s +; RUN: llc < %s -mtriple thumbv7s-apple-darwin -asm-verbose=false -print-machineinstrs=if-converter 2>&1 | FileCheck --check-prefix=CHECK-PROB %s declare i32 @foo(i32) declare i8* @bar(i32, i8*, i8*) @@ -29,10 +29,10 @@ declare i8* @bar(i32, i8*, i8*) ; CHECK-NEXT: [[FOOCALL]]: ; CHECK-NEXT: blx _foo ; -; CHECK-WEIGHT: BB#0: -; CHECK-WEIGHT: Successors according to CFG: BB#1(1073741824) BB#2(536870912) BB#4(536870912) -; CHECK-WEIGHT: BB#1: -; CHECK-WEIGHT: Successors according to CFG: BB#2(1610612736) BB#4(536870912) +; CHECK-PROB: BB#0: +; CHECK-PROB: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}50.00%) BB#2({{[0-9a-fx/= ]+}}25.00%) BB#4({{[0-9a-fx/= ]+}}25.00%) +; CHECK-PROB: BB#1: +; CHECK-PROB: Successors according to CFG: BB#2({{[0-9a-fx/= ]+}}75.00%) BB#4({{[0-9a-fx/= ]+}}25.00%) define i32 @test(i32 %a, i32 %a2, i32* %p, i32* %p2) { entry: diff --git a/test/CodeGen/ARM/tail-merge-branch-weight.ll b/test/CodeGen/ARM/tail-merge-branch-weight.ll index 95b0a202e7ff..f83f28815793 100644 --- a/test/CodeGen/ARM/tail-merge-branch-weight.ll +++ b/test/CodeGen/ARM/tail-merge-branch-weight.ll @@ -9,7 +9,7 @@ ; = 0.2 * 0.4 + 0.8 * 0.7 = 0.64 ; CHECK: # Machine code for function test0: -; CHECK: Successors according to CFG: BB#{{[0-9]+}}(13) BB#{{[0-9]+}}(24) +; CHECK: Successors according to CFG: BB#{{[0-9]+}}({{[0-9a-fx/= ]+}}20.00%) 
BB#{{[0-9]+}}({{[0-9a-fx/= ]+}}80.00%) ; CHECK: BB#{{[0-9]+}}: ; CHECK: BB#{{[0-9]+}}: ; CHECK: # End machine code for function test0. diff --git a/test/CodeGen/ARM/taildup-branch-weight.ll b/test/CodeGen/ARM/taildup-branch-weight.ll index 576c120b444e..799ef62416e6 100644 --- a/test/CodeGen/ARM/taildup-branch-weight.ll +++ b/test/CodeGen/ARM/taildup-branch-weight.ll @@ -3,7 +3,7 @@ ; RUN: | FileCheck %s ; CHECK: Machine code for function test0: -; CHECK: Successors according to CFG: BB#1(67108864) BB#2(2080374784) +; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}3.12%) BB#2({{[0-9a-fx/= ]+}}96.88%) define void @test0(i32 %a, i32 %b, i32* %c, i32* %d) { entry: @@ -30,7 +30,7 @@ B4: !0 = !{!"branch_weights", i32 4, i32 124} ; CHECK: Machine code for function test1: -; CHECK: Successors according to CFG: BB#1(67108864) BB#2(2080374784) +; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}3.12%) BB#2({{[0-9a-fx/= ]+}}96.88%) @g0 = common global i32 0, align 4 diff --git a/test/CodeGen/Generic/MachineBranchProb.ll b/test/CodeGen/Generic/MachineBranchProb.ll index 5a4a4672f7eb..ae3c8da21471 100644 --- a/test/CodeGen/Generic/MachineBranchProb.ll +++ b/test/CodeGen/Generic/MachineBranchProb.ll @@ -16,11 +16,11 @@ entry: i64 5, label %sw.bb1 ], !prof !0 ; CHECK: BB#0: derived from LLVM BB %entry -; CHECK: Successors according to CFG: BB#2(1616928864) BB#4(530554784) +; CHECK: Successors according to CFG: BB#2({{[0-9a-fx/= ]+}}75.29%) BB#4({{[0-9a-fx/= ]+}}24.71%) ; CHECK: BB#4: derived from LLVM BB %entry -; CHECK: Successors according to CFG: BB#1(252645135) BB#5(277909649) +; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}47.62%) BB#5({{[0-9a-fx/= ]+}}52.38%) ; CHECK: BB#5: derived from LLVM BB %entry -; CHECK: Successors according to CFG: BB#1(101058054) BB#3(176851595) +; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}36.36%) BB#3({{[0-9a-fx/= ]+}}63.64%) sw.bb: br label %return @@ -62,7 +62,7 @@ return: ret void ; CHECK-LABEL: 
Machine code for function left_leaning_weight_balanced_tree: ; CHECK: BB#0: derived from LLVM BB %entry ; CHECK-NOT: Successors -; CHECK: Successors according to CFG: BB#8(852677332) BB#9(1294806318) +; CHECK: Successors according to CFG: BB#8({{[0-9a-fx/= ]+}}39.71%) BB#9({{[0-9a-fx/= ]+}}60.29%) } !1 = !{!"branch_weights", diff --git a/test/CodeGen/Hexagon/ifcvt-edge-weight.ll b/test/CodeGen/Hexagon/ifcvt-edge-weight.ll index f84fd95e4fbd..341567e1d02f 100644 --- a/test/CodeGen/Hexagon/ifcvt-edge-weight.ll +++ b/test/CodeGen/Hexagon/ifcvt-edge-weight.ll @@ -2,7 +2,7 @@ ; Check that the edge weights are updated correctly after if-conversion. ; CHECK: BB#3: -; CHECK: Successors according to CFG: BB#2(214748365) BB#1(1932735283) +; CHECK: Successors according to CFG: BB#2({{[0-9a-fx/= ]+}}10.00%) BB#1({{[0-9a-fx/= ]+}}90.00%) @a = external global i32 @d = external global i32 diff --git a/test/CodeGen/MIR/X86/newline-handling.mir b/test/CodeGen/MIR/X86/newline-handling.mir index b5ed3b7f27e1..bce06d540114 100644 --- a/test/CodeGen/MIR/X86/newline-handling.mir +++ b/test/CodeGen/MIR/X86/newline-handling.mir @@ -35,7 +35,7 @@ liveins: # CHECK-LABEL: name: foo # CHECK: body: | # CHECK-NEXT: bb.0.entry: -# CHECK-NEXT: successors: %bb.1.less(0), %bb.2.exit(0) +# CHECK-NEXT: successors: %bb.1.less(0x40000000 / 0x80000000 = 50.00%), %bb.2.exit(0x40000000 / 0x80000000 = 50.00%) # CHECK-NEXT: liveins: %edi # CHECK: CMP32ri8 %edi, 10, implicit-def %eflags # CHECK-NEXT: JG_1 %bb.2.exit, implicit killed %eflags @@ -79,7 +79,7 @@ liveins: # CHECK-LABEL: name: bar # CHECK: body: | # CHECK-NEXT: bb.0.entry: -# CHECK-NEXT: successors: %bb.1.less(0), %bb.2.exit(0) +# CHECK-NEXT: successors: %bb.1.less(0x40000000 / 0x80000000 = 50.00%), %bb.2.exit(0x40000000 / 0x80000000 = 50.00%) # CHECK-NEXT: liveins: %edi # CHECK: CMP32ri8 %edi, 10, implicit-def %eflags # CHECK-NEXT: JG_1 %bb.2.exit, implicit killed %eflags diff --git a/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir 
b/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir index fc5e5d640f7f..64af6121189a 100644 --- a/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir +++ b/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir @@ -1,6 +1,6 @@ # RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s # This test ensures that the MIR parser parses basic block successors and -# weights correctly. +# probabilities correctly. --- | @@ -21,10 +21,10 @@ name: foo body: | ; CHECK-LABEL: bb.0.entry: - ; CHECK: successors: %bb.1.less(16), %bb.2.exit(32) + ; CHECK: successors: %bb.1.less({{[0-9a-fx/= ]+}}33.00%), %bb.2.exit({{[0-9a-fx/= ]+}}67.00%) ; CHECK-LABEL: bb.1.less: bb.0.entry: - successors: %bb.1.less (16), %bb.2.exit(32) + successors: %bb.1.less (33), %bb.2.exit(67) liveins: %edi CMP32ri8 %edi, 10, implicit-def %eflags diff --git a/test/CodeGen/MIR/X86/successor-basic-blocks.mir b/test/CodeGen/MIR/X86/successor-basic-blocks.mir index aa80fe9fbeef..a6c14f70bc7c 100644 --- a/test/CodeGen/MIR/X86/successor-basic-blocks.mir +++ b/test/CodeGen/MIR/X86/successor-basic-blocks.mir @@ -32,7 +32,7 @@ name: foo body: | ; CHECK-LABEL: bb.0.entry: - ; CHECK: successors: %bb.1.less(0), %bb.2.exit(0) + ; CHECK: successors: %bb.1.less(0x40000000 / 0x80000000 = 50.00%), %bb.2.exit(0x40000000 / 0x80000000 = 50.00%) ; CHECK-LABEL: bb.1.less: bb.0.entry: successors: %bb.1.less, %bb.2.exit @@ -58,7 +58,7 @@ body: | ; Verify that we can have multiple lists of successors that will be merged ; into one. 
; CHECK-LABEL: bb.0.entry: - ; CHECK: successors: %bb.1(0), %bb.2(0) + ; CHECK: successors: %bb.1(0x80000000 / 0x80000000 = 100.00%), %bb.2(0x00000000 / 0x80000000 = 0.00%) bb.0.entry: liveins: %edi successors: %bb.1 diff --git a/test/CodeGen/X86/MachineBranchProb.ll b/test/CodeGen/X86/MachineBranchProb.ll index da0bf517ecfa..ee1c658d4c55 100644 --- a/test/CodeGen/X86/MachineBranchProb.ll +++ b/test/CodeGen/X86/MachineBranchProb.ll @@ -18,9 +18,9 @@ for.cond2: ; preds = %for.inc, %for.cond %or.cond = or i1 %tobool, %cmp4 br i1 %or.cond, label %for.inc20, label %for.inc, !prof !0 ; CHECK: BB#1: derived from LLVM BB %for.cond2 -; CHECK: Successors according to CFG: BB#3(32756933) BB#4(2114726715) +; CHECK: Successors according to CFG: BB#3({{[0-9a-fx/= ]+}}1.53%) BB#4({{[0-9a-fx/= ]+}}98.47%) ; CHECK: BB#4: derived from LLVM BB %for.cond2 -; CHECK: Successors according to CFG: BB#3(33264335) BB#2(2114219313) +; CHECK: Successors according to CFG: BB#3({{[0-9a-fx/= ]+}}1.55%) BB#2({{[0-9a-fx/= ]+}}98.45%) for.inc: ; preds = %for.cond2 %shl = shl i32 %bit.0, 1 diff --git a/test/CodeGen/X86/catchpad-weight.ll b/test/CodeGen/X86/catchpad-weight.ll index 9b06f2abc81c..e8b416845ec1 100644 --- a/test/CodeGen/X86/catchpad-weight.ll +++ b/test/CodeGen/X86/catchpad-weight.ll @@ -2,7 +2,7 @@ ; Check if the edge weight to the catchpad is calculated correctly. 
-; CHECK: Successors according to CFG: BB#3(2147481600) BB#1(2048) BB#4(1024) BB#6(512) BB#8(256) +; CHECK: Successors according to CFG: BB#3(0x7ffff100 / 0x80000000 = 100.00%) BB#1(0x00000800 / 0x80000000 = 0.00%) BB#4(0x00000400 / 0x80000000 = 0.00%) BB#6(0x00000200 / 0x80000000 = 0.00%) BB#8(0x00000100 / 0x80000000 = 0.00%) target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64--windows-msvc18.0.0" diff --git a/test/CodeGen/X86/stack-protector-weight.ll b/test/CodeGen/X86/stack-protector-weight.ll index 16877ef70a31..dea66d28e3dd 100644 --- a/test/CodeGen/X86/stack-protector-weight.ll +++ b/test/CodeGen/X86/stack-protector-weight.ll @@ -2,13 +2,13 @@ ; RUN: llc -mtriple=x86_64-apple-darwin -print-machineinstrs=expand-isel-pseudos -enable-selectiondag-sp=false %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=IR ; SELDAG: # Machine code for function test_branch_weights: -; SELDAG: Successors according to CFG: BB#[[SUCCESS:[0-9]+]](2147481600) BB#[[FAILURE:[0-9]+]](2048) +; SELDAG: Successors according to CFG: BB#[[SUCCESS:[0-9]+]]({{[0-9a-fx/= ]+}}100.00%) BB#[[FAILURE:[0-9]+]] ; SELDAG: BB#[[FAILURE]]: ; SELDAG: CALL64pcrel32 ; SELDAG: BB#[[SUCCESS]]: ; IR: # Machine code for function test_branch_weights: -; IR: Successors according to CFG: BB#[[SUCCESS:[0-9]+]](2147481600) BB#[[FAILURE:[0-9]+]](2048) +; IR: Successors according to CFG: BB#[[SUCCESS:[0-9]+]]({{[0-9a-fx/= ]+}}100.00%) BB#[[FAILURE:[0-9]+]] ; IR: BB#[[SUCCESS]]: ; IR: BB#[[FAILURE]]: ; IR: CALL64pcrel32 diff --git a/test/CodeGen/X86/switch-edge-weight.ll b/test/CodeGen/X86/switch-edge-weight.ll index 9026f6f05f0e..6f594868c7ad 100644 --- a/test/CodeGen/X86/switch-edge-weight.ll +++ b/test/CodeGen/X86/switch-edge-weight.ll @@ -34,22 +34,22 @@ sw.epilog: ; CHECK: BB#0: ; BB#0 to BB#4: [0, 1133] (65 = 60 + 5) ; BB#0 to BB#5: [1134, UINT32_MAX] (25 = 20 + 5) -; CHECK: Successors according to CFG: BB#4(1550960411) BB#5(596523235) +; CHECK: Successors according to CFG: 
BB#4({{[0-9a-fx/= ]+}}72.22%) BB#5({{[0-9a-fx/= ]+}}27.78%) ; ; CHECK: BB#4: ; BB#4 to BB#1: [155, 159] (50) ; BB#4 to BB#5: [0, 1133] - [155, 159] (15 = 10 + 5) -; CHECK: Successors according to CFG: BB#1(1193046470) BB#7(357913941) +; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}76.92%) BB#7({{[0-9a-fx/= ]+}}23.08%) ; ; CHECK: BB#5: ; BB#5 to BB#1: {1140} (10) ; BB#5 to BB#6: [1134, UINT32_MAX] - {1140} (15 = 10 + 5) -; CHECK: Successors according to CFG: BB#1(238609294) BB#6(357913941) +; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}40.00%) BB#6({{[0-9a-fx/= ]+}}60.00%) ; ; CHECK: BB#6: ; BB#6 to BB#1: {1134} (10) ; BB#6 to BB#2: [1134, UINT32_MAX] - {1134, 1140} (5) -; CHECK: Successors according to CFG: BB#1(238609294) BB#2(119304647) +; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}66.67%) BB#2({{[0-9a-fx/= ]+}}33.33%) } ; CHECK-LABEL: test2 @@ -102,7 +102,7 @@ sw.epilog: ; CHECK: BB#0: ; BB#0 to BB#6: {0} + [15, UINT32_MAX] (5) ; BB#0 to BB#8: [1, 14] (jump table) (65 = 60 + 5) -; CHECK: Successors according to CFG: BB#6(153391689) BB#8(1994091957) +; CHECK: Successors according to CFG: BB#6({{[0-9a-fx/= ]+}}7.14%) BB#8({{[0-9a-fx/= ]+}}92.86% ; ; CHECK: BB#8: ; BB#8 to BB#1: {1} (10) @@ -111,7 +111,7 @@ sw.epilog: ; BB#8 to BB#3: {11} (10) ; BB#8 to BB#4: {12} (10) ; BB#8 to BB#5: {13, 14} (20) -; CHECK: Successors according to CFG: BB#1(306783378) BB#6(153391689) BB#2(306783378) BB#3(306783378) BB#4(306783378) BB#5(613566756) +; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}14.29%) BB#6({{[0-9a-fx/= ]+}}7.14%) BB#2({{[0-9a-fx/= ]+}}14.29%) BB#3({{[0-9a-fx/= ]+}}14.29%) BB#4({{[0-9a-fx/= ]+}}14.29%) BB#5({{[0-9a-fx/= ]+}}28.57%) } ; CHECK-LABEL: test3 @@ -163,7 +163,7 @@ sw.epilog: ; CHECK: BB#0: ; BB#0 to BB#6: [0, 9] + [15, UINT32_MAX] {10} ; BB#0 to BB#8: [10, 14] (jump table) (50) -; CHECK: Successors according to CFG: BB#6(357913941) BB#8(1789569705) +; CHECK: Successors according to CFG: 
BB#6({{[0-9a-fx/= ]+}}16.67%) BB#8({{[0-9a-fx/= ]+}}83.33%) ; ; CHECK: BB#8: ; BB#8 to BB#1: {10} (10) @@ -171,7 +171,7 @@ sw.epilog: ; BB#8 to BB#3: {12} (10) ; BB#8 to BB#4: {13} (10) ; BB#8 to BB#5: {14} (10) -; CHECK: Successors according to CFG: BB#1(357913941) BB#2(357913941) BB#3(357913941) BB#4(357913941) BB#5(357913941) +; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}20.00%) BB#2({{[0-9a-fx/= ]+}}20.00%) BB#3({{[0-9a-fx/= ]+}}20.00%) BB#4({{[0-9a-fx/= ]+}}20.00%) BB#5({{[0-9a-fx/= ]+}}20.00%) } ; CHECK-LABEL: test4 @@ -216,12 +216,12 @@ sw.epilog: ; CHECK: BB#0: ; BB#0 to BB#6: [0, 110] + [116, UINT32_MAX] (20) ; BB#0 to BB#7: [111, 115] (bit test) (50) -; CHECK: Successors according to CFG: BB#6(613566756) BB#7(1533916890) +; CHECK: Successors according to CFG: BB#6({{[0-9a-fx/= ]+}}28.57%) BB#7({{[0-9a-fx/= ]+}}71.43%) ; ; CHECK: BB#7: ; BB#7 to BB#2: {111, 114, 115} (30) ; BB#7 to BB#3: {112, 113} (20) -; CHECK: Successors according to CFG: BB#2(920350134) BB#3(613566756) +; CHECK: Successors according to CFG: BB#2({{[0-9a-fx/= ]+}}60.00%) BB#3({{[0-9a-fx/= ]+}}40.00%) } ; CHECK-LABEL: test5 @@ -273,7 +273,7 @@ sw.epilog: ; CHECK: BB#0: ; BB#0 to BB#6: [10, UINT32_MAX] (15) ; BB#0 to BB#8: [1, 5, 7, 9] (jump table) (45) -; CHECK: Successors according to CFG: BB#8(536870912) BB#9(1610612734) +; CHECK: Successors according to CFG: BB#8({{[0-9a-fx/= ]+}}25.00%) BB#9({{[0-9a-fx/= ]+}}75.00%) } !1 = !{!"branch_weights", i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10} diff --git a/test/CodeGen/X86/switch-jump-table.ll b/test/CodeGen/X86/switch-jump-table.ll index 3cfee1cd80e6..896a067da230 100644 --- a/test/CodeGen/X86/switch-jump-table.ll +++ b/test/CodeGen/X86/switch-jump-table.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=i686-pc-gnu-linux < %s | FileCheck %s -check-prefix=CHECK -; RUN: llc -mtriple=i686-pc-gnu-linux -print-machineinstrs=expand-isel-pseudos %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=CHECK-JT-WEIGHT +; 
RUN: llc -mtriple=i686-pc-gnu-linux -print-machineinstrs=expand-isel-pseudos %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=CHECK-JT-PROB ; An unreachable default destination is replaced with the most popular case label. @@ -54,9 +54,9 @@ default: ; Check if branch probabilities are correctly assigned to the jump table. define void @bar(i32 %x, i32* %to) { -; CHECK-JT-WEIGHT-LABEL: bar: -; CHECK-JT-WEIGHT: Successors according to CFG: BB#6(306783378) BB#8(1840700268) -; CHECK-JT-WEIGHT: Successors according to CFG: BB#1(306783378) BB#2(306783378) BB#3(306783378) BB#4(306783378) BB#5(613566756) +; CHECK-JT-PROB-LABEL: bar: +; CHECK-JT-PROB: Successors according to CFG: BB#6({{[0-9a-fx/= ]+}}14.29%) BB#8({{[0-9a-fx/= ]+}}85.71%) +; CHECK-JT-PROB: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}16.67%) BB#2({{[0-9a-fx/= ]+}}16.67%) BB#3({{[0-9a-fx/= ]+}}16.67%) BB#4({{[0-9a-fx/= ]+}}16.67%) BB#5({{[0-9a-fx/= ]+}}33.33%) entry: switch i32 %x, label %default [ From ed8c8fab46478302926233d2339c7c28b3711edc Mon Sep 17 00:00:00 2001 From: Davide Italiano Date: Tue, 1 Dec 2015 05:33:24 +0000 Subject: [PATCH 053/186] [Windows] Partially revert r254363 until I can test the right fix. 
Reported by: David Blaikie git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254378 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Support/Windows/DynamicLibrary.inc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lib/Support/Windows/DynamicLibrary.inc b/lib/Support/Windows/DynamicLibrary.inc index e612283e630f..17418b015c75 100644 --- a/lib/Support/Windows/DynamicLibrary.inc +++ b/lib/Support/Windows/DynamicLibrary.inc @@ -60,8 +60,12 @@ DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename, if (OpenedHandles == 0) OpenedHandles = new DenseSet(); - if (!fEnumerateLoadedModules) - assert(loadDebugHelp() && "These APIs should always be available"); + if (!fEnumerateLoadedModules) { + if (!loadDebugHelp()) { + assert(false && "These APIs should always be available"); + return DynamicLibrary(); + } + } fEnumerateLoadedModules(GetCurrentProcess(), ELM_Callback, 0); // Dummy library that represents "search all handles". From 9654ce949f87b5e7713504ad79fb6bee41b3ce1c Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 1 Dec 2015 06:12:59 +0000 Subject: [PATCH 054/186] Use array_lengthof instead of manually calculating it. 
NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254380 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Support/Unix/Signals.inc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc index de98d4adf996..061cdb3da216 100644 --- a/lib/Support/Unix/Signals.inc +++ b/lib/Support/Unix/Signals.inc @@ -86,12 +86,11 @@ static unsigned NumRegisteredSignals = 0; static struct { struct sigaction SA; int SigNo; -} RegisteredSignalInfo[(sizeof(IntSigs)+sizeof(KillSigs))/sizeof(KillSigs[0])]; +} RegisteredSignalInfo[array_lengthof(IntSigs) + array_lengthof(KillSigs)]; static void RegisterHandler(int Signal) { - assert(NumRegisteredSignals < - sizeof(RegisteredSignalInfo)/sizeof(RegisteredSignalInfo[0]) && + assert(NumRegisteredSignals < array_lengthof(RegisteredSignalInfo) && "Out of space for signal handlers!"); struct sigaction NewHandler; From 08bbdebfc9c6ec9a83db984d97687414ef9e2989 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 1 Dec 2015 06:13:01 +0000 Subject: [PATCH 055/186] [ARM] Use range-based for loops to avoid the need for calculating an array size that I would have otherwise converted to array_lengthof. NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254381 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/ARMFrameLowering.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index 5b3229456317..c5990bb7d1fb 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -1889,10 +1889,9 @@ void ARMFrameLowering::adjustForSegmentedStacks( // first in the list. 
MachineBasicBlock *AddedBlocks[] = {PrevStackMBB, McrMBB, GetMBB, AllocMBB, PostStackMBB}; - const int NbAddedBlocks = sizeof(AddedBlocks) / sizeof(AddedBlocks[0]); - for (int Idx = 0; Idx < NbAddedBlocks; ++Idx) - BeforePrologueRegion.insert(AddedBlocks[Idx]); + for (MachineBasicBlock *B : AddedBlocks) + BeforePrologueRegion.insert(B); for (const auto &LI : PrologueMBB.liveins()) { for (MachineBasicBlock *PredBB : BeforePrologueRegion) @@ -1901,9 +1900,9 @@ void ARMFrameLowering::adjustForSegmentedStacks( // Remove the newly added blocks from the list, since we know // we do not have to do the following updates for them. - for (int Idx = 0; Idx < NbAddedBlocks; ++Idx) { - BeforePrologueRegion.erase(AddedBlocks[Idx]); - MF.insert(PrologueMBB.getIterator(), AddedBlocks[Idx]); + for (MachineBasicBlock *B : AddedBlocks) { + BeforePrologueRegion.erase(B); + MF.insert(PrologueMBB.getIterator(), B); } for (MachineBasicBlock *MBB : BeforePrologueRegion) { From 586117052154a19a311af41201d360c698d33499 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 1 Dec 2015 06:13:04 +0000 Subject: [PATCH 056/186] [Hexagon] Use ArrayRef to avoid needing to calculate an array size. Interestingly the original code may have had a bug because it was passing the byte size of a uint16_t array instead of the number of entries. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254382 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../Disassembler/HexagonDisassembler.cpp | 27 ++++++++----------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp index 4036650bf74b..51bce8fb01fc 100644 --- a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp +++ b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp @@ -471,12 +471,13 @@ extern const MCInstrDesc HexagonInsts[]; } static DecodeStatus DecodeRegisterClass(MCInst &Inst, unsigned RegNo, - const uint16_t Table[], size_t Size) { - if (RegNo < Size) { + ArrayRef Table) { + if (RegNo < Table.size()) { Inst.addOperand(MCOperand::createReg(Table[RegNo])); return MCDisassembler::Success; - } else - return MCDisassembler::Fail; + } + + return MCDisassembler::Fail; } static DecodeStatus DecodeIntRegsLow8RegisterClass(MCInst &Inst, unsigned RegNo, @@ -497,8 +498,7 @@ static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo, Hexagon::R25, Hexagon::R26, Hexagon::R27, Hexagon::R28, Hexagon::R29, Hexagon::R30, Hexagon::R31}; - return (DecodeRegisterClass(Inst, RegNo, IntRegDecoderTable, - sizeof(IntRegDecoderTable))); + return (DecodeRegisterClass(Inst, RegNo, IntRegDecoderTable)); } static DecodeStatus DecodeVectorRegsRegisterClass(MCInst &Inst, unsigned RegNo, @@ -513,8 +513,7 @@ static DecodeStatus DecodeVectorRegsRegisterClass(MCInst &Inst, unsigned RegNo, Hexagon::V25, Hexagon::V26, Hexagon::V27, Hexagon::V28, Hexagon::V29, Hexagon::V30, Hexagon::V31}; - return (DecodeRegisterClass(Inst, RegNo, VecRegDecoderTable, - sizeof(VecRegDecoderTable))); + return (DecodeRegisterClass(Inst, RegNo, VecRegDecoderTable)); } static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo, @@ -526,8 +525,7 @@ static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo, Hexagon::D8, 
Hexagon::D9, Hexagon::D10, Hexagon::D11, Hexagon::D12, Hexagon::D13, Hexagon::D14, Hexagon::D15}; - return (DecodeRegisterClass(Inst, RegNo >> 1, DoubleRegDecoderTable, - sizeof(DoubleRegDecoderTable))); + return (DecodeRegisterClass(Inst, RegNo >> 1, DoubleRegDecoderTable)); } static DecodeStatus DecodeVecDblRegsRegisterClass(MCInst &Inst, unsigned RegNo, @@ -539,8 +537,7 @@ static DecodeStatus DecodeVecDblRegsRegisterClass(MCInst &Inst, unsigned RegNo, Hexagon::W8, Hexagon::W9, Hexagon::W10, Hexagon::W11, Hexagon::W12, Hexagon::W13, Hexagon::W14, Hexagon::W15}; - return (DecodeRegisterClass(Inst, RegNo >> 1, VecDblRegDecoderTable, - sizeof(VecDblRegDecoderTable))); + return (DecodeRegisterClass(Inst, RegNo >> 1, VecDblRegDecoderTable)); } static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo, @@ -549,8 +546,7 @@ static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo, static const uint16_t PredRegDecoderTable[] = {Hexagon::P0, Hexagon::P1, Hexagon::P2, Hexagon::P3}; - return (DecodeRegisterClass(Inst, RegNo, PredRegDecoderTable, - sizeof(PredRegDecoderTable))); + return (DecodeRegisterClass(Inst, RegNo, PredRegDecoderTable)); } static DecodeStatus DecodeVecPredRegsRegisterClass(MCInst &Inst, unsigned RegNo, @@ -559,8 +555,7 @@ static DecodeStatus DecodeVecPredRegsRegisterClass(MCInst &Inst, unsigned RegNo, static const uint16_t VecPredRegDecoderTable[] = {Hexagon::Q0, Hexagon::Q1, Hexagon::Q2, Hexagon::Q3}; - return (DecodeRegisterClass(Inst, RegNo, VecPredRegDecoderTable, - sizeof(VecPredRegDecoderTable))); + return (DecodeRegisterClass(Inst, RegNo, VecPredRegDecoderTable)); } static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo, From cac7246b4c5ae1841bbcfd9a9ea30f062b2f1963 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 1 Dec 2015 06:13:06 +0000 Subject: [PATCH 057/186] Use array_lengthof instead of manually calculating it. 
NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254383 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp index 51bce8fb01fc..ee7f569c4bfd 100644 --- a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp +++ b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp @@ -568,7 +568,7 @@ static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo, Hexagon::CS0, Hexagon::CS1, Hexagon::UPCL, Hexagon::UPC }; - if (RegNo >= sizeof(CtrlRegDecoderTable) / sizeof(CtrlRegDecoderTable[0])) + if (RegNo >= array_lengthof(CtrlRegDecoderTable)) return MCDisassembler::Fail; if (CtrlRegDecoderTable[RegNo] == Hexagon::NoRegister) @@ -592,7 +592,7 @@ static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo, Hexagon::UPC, Hexagon::NoRegister }; - if (RegNo >= sizeof(CtrlReg64DecoderTable) / sizeof(CtrlReg64DecoderTable[0])) + if (RegNo >= array_lengthof(CtrlReg64DecoderTable)) return MCDisassembler::Fail; if (CtrlReg64DecoderTable[RegNo] == Hexagon::NoRegister) From 97beeddf5b474f56e63f9ebb956deea036f4bec4 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 1 Dec 2015 06:13:08 +0000 Subject: [PATCH 058/186] [Hexagon] Use array_lengthof and const correct and type correct the array and array size. 
NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254384 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../Hexagon/Disassembler/HexagonDisassembler.cpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp index ee7f569c4bfd..1db59e1dd99d 100644 --- a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp +++ b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp @@ -779,7 +779,7 @@ static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address, // Please note that the instructions must be ordered in the descending order // of their opcode. // HexagonII::INST_ICLASS_ST -static unsigned int StoreConditionalOpcodeData[][2] = { +static const unsigned int StoreConditionalOpcodeData[][2] = { {S4_pstorerdfnew_abs, 0xafc02084}, {S4_pstorerdtnew_abs, 0xafc02080}, {S4_pstorerdf_abs, 0xafc00084}, @@ -825,18 +825,16 @@ static unsigned int LoadStoreOpcodeData[][2] = {{L4_loadrd_abs, 0x49c00000}, {S2_storerfabs, 0x48600000}, {S2_storerhabs, 0x48400000}, {S2_storerbabs, 0x48000000}}; -static int NumCondS = - sizeof(StoreConditionalOpcodeData) / sizeof(StoreConditionalOpcodeData[0]); -static int NumLS = sizeof(LoadStoreOpcodeData) / sizeof(LoadStoreOpcodeData[0]); +static const size_t NumCondS = array_lengthof(StoreConditionalOpcodeData); +static const size_t NumLS = array_lengthof(LoadStoreOpcodeData); static DecodeStatus decodeSpecial(MCInst &MI, uint32_t insn) { unsigned MachineOpcode = 0; unsigned LLVMOpcode = 0; - int i; if ((insn & HexagonII::INST_ICLASS_MASK) == HexagonII::INST_ICLASS_ST) { - for (i = 0; i < NumCondS; ++i) { + for (size_t i = 0; i < NumCondS; ++i) { if ((insn & StoreConditionalOpcodeData[i][1]) == StoreConditionalOpcodeData[i][1]) { MachineOpcode = StoreConditionalOpcodeData[i][1]; @@ -846,7 +844,7 @@ static DecodeStatus decodeSpecial(MCInst &MI, uint32_t insn) { } } if ((insn & 
HexagonII::INST_ICLASS_MASK) == HexagonII::INST_ICLASS_LD_ST_2) { - for (i = 0; i < NumLS; ++i) { + for (size_t i = 0; i < NumLS; ++i) { if ((insn & LoadStoreOpcodeData[i][1]) == LoadStoreOpcodeData[i][1]) { MachineOpcode = LoadStoreOpcodeData[i][1]; LLVMOpcode = LoadStoreOpcodeData[i][0]; From b6f477da5ec08012d2fa8739139eac13123411da Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 1 Dec 2015 06:13:10 +0000 Subject: [PATCH 059/186] [Hexagon] Use std::begin() and std::end() instead of doing the same manually. NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254385 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp index 34817cd98f2f..e6194f61a6ba 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp @@ -81,8 +81,7 @@ static const std::pair opcodeData[] = { std::make_pair((unsigned)V4_SS2_storewi1, 4352)}; static std::map - subinstOpcodeMap(opcodeData, - opcodeData + sizeof(opcodeData) / sizeof(opcodeData[0])); + subinstOpcodeMap(std::begin(opcodeData), std::end(opcodeData)); bool HexagonMCInstrInfo::isDuplexPairMatch(unsigned Ga, unsigned Gb) { switch (Ga) { From 9a293e525c52342f7d286ac999dc84d04c8897e9 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 1 Dec 2015 06:13:13 +0000 Subject: [PATCH 060/186] [X86] Use array_lengthof instead of calculating manually. Also change index types to size_t to match. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254386 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86InstrInfo.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index aaeef465bf50..04b68cd509fb 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -3517,23 +3517,23 @@ unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(MachineInstr *MI, bool IsIntrinOpcode; isFMA3(Opc, &IsIntrinOpcode); - unsigned GroupsNum; + size_t GroupsNum; const unsigned (*OpcodeGroups)[3]; if (IsIntrinOpcode) { - GroupsNum = sizeof(IntrinOpcodeGroups) / sizeof(IntrinOpcodeGroups[0]); + GroupsNum = array_lengthof(IntrinOpcodeGroups); OpcodeGroups = IntrinOpcodeGroups; } else { - GroupsNum = sizeof(RegularOpcodeGroups) / sizeof(RegularOpcodeGroups[0]); + GroupsNum = array_lengthof(RegularOpcodeGroups); OpcodeGroups = RegularOpcodeGroups; } const unsigned *FoundOpcodesGroup = nullptr; - unsigned FormIndex; + size_t FormIndex; // Look for the input opcode in the corresponding opcodes table. - unsigned GroupIndex = 0; - for (; GroupIndex < GroupsNum && !FoundOpcodesGroup; GroupIndex++) { - for (FormIndex = 0; FormIndex < FormsNum; FormIndex++) { + for (size_t GroupIndex = 0; GroupIndex < GroupsNum && !FoundOpcodesGroup; + ++GroupIndex) { + for (FormIndex = 0; FormIndex < FormsNum; ++FormIndex) { if (OpcodeGroups[GroupIndex][FormIndex] == Opc) { FoundOpcodesGroup = OpcodeGroups[GroupIndex]; break; From b676925e7b5b51a5dda9b5af06594a830ae8915a Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 1 Dec 2015 06:13:15 +0000 Subject: [PATCH 061/186] [X86] Use range-based for loops. 
NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254387 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86InstrInfo.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 04b68cd509fb..12da3a9319e6 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -6715,16 +6715,16 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = { // domains, but they require a bit more work than just switching opcodes. static const uint16_t *lookup(unsigned opcode, unsigned domain) { - for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i) - if (ReplaceableInstrs[i][domain-1] == opcode) - return ReplaceableInstrs[i]; + for (const uint16_t (&Row)[3] : ReplaceableInstrs) + if (Row[domain-1] == opcode) + return Row; return nullptr; } static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) { - for (unsigned i = 0, e = array_lengthof(ReplaceableInstrsAVX2); i != e; ++i) - if (ReplaceableInstrsAVX2[i][domain-1] == opcode) - return ReplaceableInstrsAVX2[i]; + for (const uint16_t (&Row)[3] : ReplaceableInstrsAVX2) + if (Row[domain-1] == opcode) + return Row; return nullptr; } From c8fda7a60ac134e3658d9db112319897b8a6f36f Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 1 Dec 2015 06:13:16 +0000 Subject: [PATCH 062/186] [X86] Fix patterns for memory forms of FP FSUBR and FDIVR. They need to have memory on the left hand side of the fsub/fdiv operations in their patterns. Not sure how to test this. I noticed by inspection in the isel tables where the same pattern tried to produce DIV and DIVR or SUB and SUBR. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254388 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86InstrFPStack.td | 108 +++++++++++++++++++----------- 1 file changed, 69 insertions(+), 39 deletions(-) diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td index 51648c6c567e..8fecc9b1a3fd 100644 --- a/lib/Target/X86/X86InstrFPStack.td +++ b/lib/Target/X86/X86InstrFPStack.td @@ -137,69 +137,99 @@ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP, // The FopST0 series are not included here because of the irregularities // in where the 'r' goes in assembly output. // These instructions cannot address 80-bit memory. -multiclass FPBinary { +multiclass FPBinary { // ST(0) = ST(0) + [mem] def _Fp32m : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, f32mem:$src2), OneArgFPRW, - [(set RFP32:$dst, - (OpNode RFP32:$src1, (loadf32 addr:$src2)))]>; + [!if(Forward, + (set RFP32:$dst, + (OpNode RFP32:$src1, (loadf32 addr:$src2))), + (set RFP32:$dst, + (OpNode (loadf32 addr:$src2), RFP32:$src1)))]>; def _Fp64m : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, f64mem:$src2), OneArgFPRW, - [(set RFP64:$dst, - (OpNode RFP64:$src1, (loadf64 addr:$src2)))]>; + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (loadf64 addr:$src2))), + (set RFP64:$dst, + (OpNode (loadf64 addr:$src2), RFP64:$src1)))]>; def _Fp64m32: FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, f32mem:$src2), OneArgFPRW, - [(set RFP64:$dst, - (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2))))]>; + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2)))), + (set RFP64:$dst, + (OpNode (f64 (extloadf32 addr:$src2)), RFP64:$src1)))]>; def _Fp80m32: FpI_<(outs RFP80:$dst), (ins RFP80:$src1, f32mem:$src2), OneArgFPRW, - [(set RFP80:$dst, - (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2))))]>; + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2)))), + (set RFP80:$dst, + (OpNode (f80 
(extloadf32 addr:$src2)), RFP80:$src1)))]>; def _Fp80m64: FpI_<(outs RFP80:$dst), (ins RFP80:$src1, f64mem:$src2), OneArgFPRW, - [(set RFP80:$dst, - (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2))))]>; + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2)))), + (set RFP80:$dst, + (OpNode (f80 (extloadf64 addr:$src2)), RFP80:$src1)))]>; +let mayLoad = 1 in def _F32m : FPI<0xD8, fp, (outs), (ins f32mem:$src), - !strconcat("f", asmstring, "{s}\t$src")> { - let mayLoad = 1; -} + !strconcat("f", asmstring, "{s}\t$src")>; +let mayLoad = 1 in def _F64m : FPI<0xDC, fp, (outs), (ins f64mem:$src), - !strconcat("f", asmstring, "{l}\t$src")> { - let mayLoad = 1; -} + !strconcat("f", asmstring, "{l}\t$src")>; // ST(0) = ST(0) + [memint] def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2), OneArgFPRW, - [(set RFP32:$dst, (OpNode RFP32:$src1, - (X86fild addr:$src2, i16)))]>; + [!if(Forward, + (set RFP32:$dst, + (OpNode RFP32:$src1, (X86fild addr:$src2, i16))), + (set RFP32:$dst, + (OpNode (X86fild addr:$src2, i16), RFP32:$src1)))]>; def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2), OneArgFPRW, - [(set RFP32:$dst, (OpNode RFP32:$src1, - (X86fild addr:$src2, i32)))]>; + [!if(Forward, + (set RFP32:$dst, + (OpNode RFP32:$src1, (X86fild addr:$src2, i32))), + (set RFP32:$dst, + (OpNode (X86fild addr:$src2, i32), RFP32:$src1)))]>; def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2), OneArgFPRW, - [(set RFP64:$dst, (OpNode RFP64:$src1, - (X86fild addr:$src2, i16)))]>; + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (X86fild addr:$src2, i16))), + (set RFP64:$dst, + (OpNode (X86fild addr:$src2, i16), RFP64:$src1)))]>; def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2), OneArgFPRW, - [(set RFP64:$dst, (OpNode RFP64:$src1, - (X86fild addr:$src2, i32)))]>; + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (X86fild addr:$src2, i32))), + (set 
RFP64:$dst, + (OpNode (X86fild addr:$src2, i32), RFP64:$src1)))]>; def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2), - OneArgFPRW, - [(set RFP80:$dst, (OpNode RFP80:$src1, - (X86fild addr:$src2, i16)))]>; + OneArgFPRW, + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (X86fild addr:$src2, i16))), + (set RFP80:$dst, + (OpNode (X86fild addr:$src2, i16), RFP80:$src1)))]>; def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2), - OneArgFPRW, - [(set RFP80:$dst, (OpNode RFP80:$src1, - (X86fild addr:$src2, i32)))]>; + OneArgFPRW, + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (X86fild addr:$src2, i32))), + (set RFP80:$dst, + (OpNode (X86fild addr:$src2, i32), RFP80:$src1)))]>; +let mayLoad = 1 in def _FI16m : FPI<0xDE, fp, (outs), (ins i16mem:$src), - !strconcat("fi", asmstring, "{s}\t$src")> { - let mayLoad = 1; -} + !strconcat("fi", asmstring, "{s}\t$src")>; +let mayLoad = 1 in def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src), - !strconcat("fi", asmstring, "{l}\t$src")> { - let mayLoad = 1; -} + !strconcat("fi", asmstring, "{l}\t$src")>; } let Defs = [FPSW] in { @@ -213,14 +243,14 @@ defm DIV : FPBinary_rr; let SchedRW = [WriteFAddLd] in { defm ADD : FPBinary; defm SUB : FPBinary; -defm SUBR: FPBinary; +defm SUBR: FPBinary; } let SchedRW = [WriteFMulLd] in { defm MUL : FPBinary; } let SchedRW = [WriteFDivLd] in { defm DIV : FPBinary; -defm DIVR: FPBinary; +defm DIVR: FPBinary; } } From b1ce35cdfb58d4e93de58445aef744cded16b386 Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Tue, 1 Dec 2015 07:49:23 +0000 Subject: [PATCH 063/186] Introduce a range version of std::any_of, and use it in SCEV Reviewers: dblaikie, pcc Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D15063 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254390 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/ADT/STLExtras.h | 8 ++++++++ lib/Analysis/ScalarEvolution.cpp | 7 +++---- 
tools/llvm-pdbdump/LinePrinter.cpp | 9 +++------ 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/include/llvm/ADT/STLExtras.h b/include/llvm/ADT/STLExtras.h index 1bd3b291e0ef..3655a20d8831 100644 --- a/include/llvm/ADT/STLExtras.h +++ b/include/llvm/ADT/STLExtras.h @@ -371,6 +371,14 @@ bool all_of(R &&Range, UnaryPredicate &&P) { std::forward(P)); } +/// Provide wrappers to std::any_of which take ranges instead of having to pass +/// begin/end explicitly. +template +bool any_of(R &&Range, UnaryPredicate &&P) { + return std::any_of(Range.begin(), Range.end(), + std::forward(P)); +} + //===----------------------------------------------------------------------===// // Extra additions to //===----------------------------------------------------------------------===// diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 23daeb67d653..4c8b6e7de84e 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -8403,8 +8403,7 @@ const SCEV *SCEVAddRecExpr::getNumIterationsInRange(ConstantRange Range, // The only time we can solve this is when we have all constant indices. // Otherwise, we cannot determine the overflow conditions. 
- if (std::any_of(op_begin(), op_end(), - [](const SCEV *Op) { return !isa(Op);})) + if (any_of(operands(), [](const SCEV *Op) { return !isa(Op); })) return SE.getCouldNotCompute(); // Okay at this point we know that all elements of the chrec are constants and @@ -9694,8 +9693,8 @@ bool SCEVUnionPredicate::implies(const SCEVPredicate *N) const { return false; auto &SCEVPreds = ScevPredsIt->second; - return std::any_of(SCEVPreds.begin(), SCEVPreds.end(), - [N](const SCEVPredicate *I) { return I->implies(N); }); + return any_of(SCEVPreds, + [N](const SCEVPredicate *I) { return I->implies(N); }); } const SCEV *SCEVUnionPredicate::getExpr() const { return nullptr; } diff --git a/tools/llvm-pdbdump/LinePrinter.cpp b/tools/llvm-pdbdump/LinePrinter.cpp index 4f3ee54c7691..a43727f02b5e 100644 --- a/tools/llvm-pdbdump/LinePrinter.cpp +++ b/tools/llvm-pdbdump/LinePrinter.cpp @@ -11,15 +11,12 @@ #include "llvm-pdbdump.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Support/Regex.h" #include namespace { -template bool any_of_range(T &&R, Pred P) { - return std::any_of(R.begin(), R.end(), P); -} - bool IsItemExcluded(llvm::StringRef Item, std::list &IncludeFilters, std::list &ExcludeFilters) { @@ -30,10 +27,10 @@ bool IsItemExcluded(llvm::StringRef Item, // Include takes priority over exclude. If the user specified include // filters, and none of them include this item, them item is gone. 
- if (!IncludeFilters.empty() && !any_of_range(IncludeFilters, match_pred)) + if (!IncludeFilters.empty() && !any_of(IncludeFilters, match_pred)) return true; - if (any_of_range(ExcludeFilters, match_pred)) + if (any_of(ExcludeFilters, match_pred)) return true; return false; From 94214379c485ff7c89cf49c4c56aca8ec790ec4a Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Tue, 1 Dec 2015 07:49:27 +0000 Subject: [PATCH 064/186] Introduce a range version of std::find, and use in SCEV Reviewers: dblaikie, pcc Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D15064 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254391 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/ADT/STLExtras.h | 7 +++++++ lib/Analysis/ScalarEvolution.cpp | 3 +-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/include/llvm/ADT/STLExtras.h b/include/llvm/ADT/STLExtras.h index 3655a20d8831..d4360fa8d218 100644 --- a/include/llvm/ADT/STLExtras.h +++ b/include/llvm/ADT/STLExtras.h @@ -379,6 +379,13 @@ bool any_of(R &&Range, UnaryPredicate &&P) { std::forward(P)); } +/// Provide wrappers to std::find which take ranges instead of having to pass +/// begin/end explicitly. 
+template +auto find(R &&Range, const T &val) -> decltype(Range.begin()) { + return std::find(Range.begin(), Range.end(), val); +} + //===----------------------------------------------------------------------===// // Extra additions to //===----------------------------------------------------------------------===// diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 4c8b6e7de84e..d04028b15e2f 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -7964,8 +7964,7 @@ static bool IsMaxConsistingOf(const SCEV *MaybeMaxExpr, const MaxExprType *MaxExpr = dyn_cast(MaybeMaxExpr); if (!MaxExpr) return false; - auto It = std::find(MaxExpr->op_begin(), MaxExpr->op_end(), Candidate); - return It != MaxExpr->op_end(); + return find(MaxExpr->operands(), Candidate) != MaxExpr->op_end(); } From eade7630a7b4cac1dbefbe3600458b4267a8ff3a Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Tue, 1 Dec 2015 10:07:37 +0000 Subject: [PATCH 065/186] Move llvm/test/DebugInfo/Generic/safestack-byval.ll to X86. It depends on x86-64. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254396 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/DebugInfo/{Generic => X86}/safestack-byval.ll | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test/DebugInfo/{Generic => X86}/safestack-byval.ll (100%) diff --git a/test/DebugInfo/Generic/safestack-byval.ll b/test/DebugInfo/X86/safestack-byval.ll similarity index 100% rename from test/DebugInfo/Generic/safestack-byval.ll rename to test/DebugInfo/X86/safestack-byval.ll From 175c9617cce54dd0b30b990863d066f52d73f93e Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Tue, 1 Dec 2015 10:07:41 +0000 Subject: [PATCH 066/186] llvm/test/DebugInfo/X86/safestack-byval.ll: Give an explicit triple for now. It crashes for targeting *-win32. Also revert r254375 and r254361. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254397 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/DebugInfo/X86/safestack-byval.ll | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/test/DebugInfo/X86/safestack-byval.ll b/test/DebugInfo/X86/safestack-byval.ll index 1329a95a2201..f1f6b6c1d911 100644 --- a/test/DebugInfo/X86/safestack-byval.ll +++ b/test/DebugInfo/X86/safestack-byval.ll @@ -1,8 +1,7 @@ ; Test dwarf codegen for DILocalVariable of a byval function argument that ; points to neither an argument nor an alloca. This kind of IR is generated by ; SafeStack for unsafe byval arguments. -; RUN: llc -stop-after expand-isel-pseudos %s -o /dev/null | FileCheck %s -; XFAIL: hexagon +; RUN: llc -mtriple=x86_64-unknown-unknown -stop-after expand-isel-pseudos %s -o /dev/null | FileCheck %s ; This was built by compiling the following source with SafeStack and ; simplifying the result a little. @@ -14,8 +13,6 @@ ; return zzz.a[len]; ; } -; REQUIRES: tls - ; CHECK: ![[ZZZ:.*]] = !DILocalVariable(name: "zzz", ; CHECK: ![[ZZZ_EXPR:.*]] = !DIExpression(DW_OP_deref, DW_OP_minus, 400) ; CHECK: DBG_VALUE {{.*}} ![[ZZZ]], ![[ZZZ_EXPR]] From f025ca33c1fee1854bfa4476a771d3fff1c8b0b9 Mon Sep 17 00:00:00 2001 From: Oliver Stannard Date: Tue, 1 Dec 2015 10:23:06 +0000 Subject: [PATCH 067/186] [ARM] Add subtarget features for ARMv8.2-A This adds subtarget features for ARMv8.2-A, which builds on (and requires the features from) ARMv8.1-A. Most assembler-visible features of ARMv8.2-A are system instructions, and are all required parts of the architecture, so just depend on the HasV8_2aOps subtarget feature. There is also one large, optional feature, which adds 16-bit floating point versions of all existing floating-point instructions (VFP and SIMD), this is represented by the FeatureFullFP16 subtarget feature. 
Differential Revision: http://reviews.llvm.org/D15036 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254399 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/ARM.td | 6 ++++++ lib/Target/ARM/ARMInstrInfo.td | 6 +++++- lib/Target/ARM/ARMSubtarget.cpp | 2 ++ lib/Target/ARM/ARMSubtarget.h | 9 +++++++-- 4 files changed, 20 insertions(+), 3 deletions(-) diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index ceb48d83cd84..57d5429e0aab 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -62,6 +62,9 @@ def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true", def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true", "Enable ARMv8 FP", [FeatureVFP4]>; +def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", + "Enable full half-precision floating point", + [FeatureFPARMv8]>; def FeatureD16 : SubtargetFeature<"d16", "HasD16", "true", "Restrict FP to 16 double registers">; def FeatureHWDiv : SubtargetFeature<"hwdiv", "HasHardwareDivide", "true", @@ -212,6 +215,9 @@ def HasV8Ops : SubtargetFeature<"v8", "HasV8Ops", "true", def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true", "Support ARM v8.1a instructions", [HasV8Ops]>; +def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true", + "Support ARM v8.2a instructions", + [HasV8_1aOps]>; //===----------------------------------------------------------------------===// diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 238dc338d141..4c7107aee6a2 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -215,6 +215,8 @@ def PreV8 : Predicate<"!Subtarget->hasV8Ops()">, AssemblerPredicate<"!HasV8Ops", "armv7 or earlier">; def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">, AssemblerPredicate<"HasV8_1aOps", "armv8.1a">; +def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">, + AssemblerPredicate<"HasV8_2aOps", "armv8.2a">; def NoVFP : Predicate<"!Subtarget->hasVFP2()">; def HasVFP2 : 
Predicate<"Subtarget->hasVFP2()">, AssemblerPredicate<"FeatureVFP2", "VFP2">; @@ -234,7 +236,9 @@ def HasCrypto : Predicate<"Subtarget->hasCrypto()">, def HasCRC : Predicate<"Subtarget->hasCRC()">, AssemblerPredicate<"FeatureCRC", "crc">; def HasFP16 : Predicate<"Subtarget->hasFP16()">, - AssemblerPredicate<"FeatureFP16","half-float">; + AssemblerPredicate<"FeatureFP16","half-float conversions">; +def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, + AssemblerPredicate<"FeatureFullFP16","full half-float">; def HasDivide : Predicate<"Subtarget->hasDivide()">, AssemblerPredicate<"FeatureHWDiv", "divide in THUMB">; def HasDivideInARM : Predicate<"Subtarget->hasDivideInARMMode()">, diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index 9e3cd36d49ef..bb6ae28065bd 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -112,6 +112,7 @@ void ARMSubtarget::initializeEnvironment() { HasV7Ops = false; HasV8Ops = false; HasV8_1aOps = false; + HasV8_2aOps = false; HasVFPv2 = false; HasVFPv3 = false; HasVFPv4 = false; @@ -130,6 +131,7 @@ void ARMSubtarget::initializeEnvironment() { NoMovt = false; SupportsTailCall = false; HasFP16 = false; + HasFullFP16 = false; HasD16 = false; HasHardwareDivide = false; HasHardwareDivideInARM = false; diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index c194149e8452..3addd4175a04 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -77,6 +77,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo { bool HasV7Ops; bool HasV8Ops; bool HasV8_1aOps; + bool HasV8_2aOps; /// HasVFPv2, HasVFPv3, HasVFPv4, HasFPARMv8, HasNEON - Specify what /// floating point ISAs are supported. @@ -130,10 +131,12 @@ class ARMSubtarget : public ARMGenSubtargetInfo { /// Thumb. 
bool SupportsTailCall; - /// HasFP16 - True if subtarget supports half-precision FP (We support VFP+HF - /// only so far) + /// HasFP16 - True if subtarget supports half-precision FP conversions bool HasFP16; + /// HasFullFP16 - True if subtarget supports half-precision FP operations + bool HasFullFP16; + /// HasD16 - True if subtarget is limited to 16 double precision /// FP registers for VFPv3. bool HasD16; @@ -309,6 +312,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo { bool hasV7Ops() const { return HasV7Ops; } bool hasV8Ops() const { return HasV8Ops; } bool hasV8_1aOps() const { return HasV8_1aOps; } + bool hasV8_2aOps() const { return HasV8_2aOps; } bool isCortexA5() const { return ARMProcFamily == CortexA5; } bool isCortexA7() const { return ARMProcFamily == CortexA7; } @@ -362,6 +366,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo { bool hasFP16() const { return HasFP16; } bool hasD16() const { return HasD16; } + bool hasFullFP16() const { return HasFullFP16; } const Triple &getTargetTriple() const { return TargetTriple; } From 27fff2c5ff6be081149b7e72ff8e667cb423be7d Mon Sep 17 00:00:00 2001 From: Oliver Stannard Date: Tue, 1 Dec 2015 10:33:56 +0000 Subject: [PATCH 068/186] [ARM] Add ARMv8.2-A to TargetParser Add ARMv8.2-A to TargetParser, so that it can be used by the clang command-line options and the .arch directive. Most testing of this will be done in clang, checking that the command-line options that this enables work. 
Differential Revision: http://reviews.llvm.org/D15037 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254400 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/ADT/Triple.h | 1 + include/llvm/Support/ARMTargetParser.def | 4 ++ include/llvm/Support/TargetParser.h | 1 + lib/Support/TargetParser.cpp | 3 ++ lib/Support/Triple.cpp | 2 + lib/Target/ARM/ARM.td | 12 +++++ lib/Target/ARM/ARMSubtarget.h | 2 +- lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 1 + .../ARM/MCTargetDesc/ARMELFStreamer.cpp | 1 + test/MC/ARM/directive-arch-armv8.2-a.s | 46 +++++++++++++++++++ 10 files changed, 72 insertions(+), 1 deletion(-) create mode 100644 test/MC/ARM/directive-arch-armv8.2-a.s diff --git a/include/llvm/ADT/Triple.h b/include/llvm/ADT/Triple.h index e50cec1f5e80..e01db0a61fd5 100644 --- a/include/llvm/ADT/Triple.h +++ b/include/llvm/ADT/Triple.h @@ -93,6 +93,7 @@ class Triple { enum SubArchType { NoSubArch, + ARMSubArch_v8_2a, ARMSubArch_v8_1a, ARMSubArch_v8, ARMSubArch_v7, diff --git a/include/llvm/Support/ARMTargetParser.def b/include/llvm/Support/ARMTargetParser.def index f76ac8899359..bc007923b383 100644 --- a/include/llvm/Support/ARMTargetParser.def +++ b/include/llvm/Support/ARMTargetParser.def @@ -88,6 +88,9 @@ ARM_ARCH("armv8-a", AK_ARMV8A, "8-A", "v8", ARMBuildAttrs::CPUArch::v8, ARM_ARCH("armv8.1-a", AK_ARMV8_1A, "8.1-A", "v8.1a", ARMBuildAttrs::CPUArch::v8, FK_CRYPTO_NEON_FP_ARMV8, (AEK_SEC | AEK_MP | AEK_VIRT | AEK_HWDIVARM | AEK_HWDIV | AEK_DSP | AEK_CRC)) +ARM_ARCH("armv8.2-a", AK_ARMV8_2A, "8.2-A", "v8.2a", ARMBuildAttrs::CPUArch::v8, + FK_CRYPTO_NEON_FP_ARMV8, (AEK_SEC | AEK_MP | AEK_VIRT | AEK_HWDIVARM | + AEK_HWDIV | AEK_DSP | AEK_CRC)) // Non-standard Arch names. 
ARM_ARCH("iwmmxt", AK_IWMMXT, "iwmmxt", "", ARMBuildAttrs::CPUArch::v5TE, FK_NONE, AEK_NONE) @@ -115,6 +118,7 @@ ARM_ARCH_EXT_NAME("mp", AEK_MP, nullptr, nullptr) ARM_ARCH_EXT_NAME("simd", AEK_SIMD, nullptr, nullptr) ARM_ARCH_EXT_NAME("sec", AEK_SEC, nullptr, nullptr) ARM_ARCH_EXT_NAME("virt", AEK_VIRT, nullptr, nullptr) +ARM_ARCH_EXT_NAME("fp16", AEK_FP16, "+fullfp16", "-fullfp16") ARM_ARCH_EXT_NAME("os", AEK_OS, nullptr, nullptr) ARM_ARCH_EXT_NAME("iwmmxt", AEK_IWMMXT, nullptr, nullptr) ARM_ARCH_EXT_NAME("iwmmxt2", AEK_IWMMXT2, nullptr, nullptr) diff --git a/include/llvm/Support/TargetParser.h b/include/llvm/Support/TargetParser.h index 6ca0281515e2..c21019d0c5b8 100644 --- a/include/llvm/Support/TargetParser.h +++ b/include/llvm/Support/TargetParser.h @@ -82,6 +82,7 @@ enum ArchExtKind : unsigned { AEK_SEC = 0x100, AEK_VIRT = 0x200, AEK_DSP = 0x400, + AEK_FP16 = 0x800, // Unsupported extensions. AEK_OS = 0x8000000, AEK_IWMMXT = 0x10000000, diff --git a/lib/Support/TargetParser.cpp b/lib/Support/TargetParser.cpp index 3aa55b3c8850..aa3a4235d794 100644 --- a/lib/Support/TargetParser.cpp +++ b/lib/Support/TargetParser.cpp @@ -410,6 +410,7 @@ static StringRef getArchSynonym(StringRef Arch) { .Case("v7em", "v7e-m") .Cases("v8", "v8a", "aarch64", "arm64", "v8-a") .Case("v8.1a", "v8.1-a") + .Case("v8.2a", "v8.2-a") .Default(Arch); } @@ -554,6 +555,7 @@ unsigned llvm::ARM::parseArchProfile(StringRef Arch) { case ARM::AK_ARMV7K: case ARM::AK_ARMV8A: case ARM::AK_ARMV8_1A: + case ARM::AK_ARMV8_2A: return ARM::PK_A; } return ARM::PK_INVALID; @@ -594,6 +596,7 @@ unsigned llvm::ARM::parseArchVersion(StringRef Arch) { return 7; case ARM::AK_ARMV8A: case ARM::AK_ARMV8_1A: + case ARM::AK_ARMV8_2A: return 8; } return 0; diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp index f1f2d26b4e70..ed91c209d545 100644 --- a/lib/Support/Triple.cpp +++ b/lib/Support/Triple.cpp @@ -519,6 +519,8 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) { return 
Triple::ARMSubArch_v8; case ARM::AK_ARMV8_1A: return Triple::ARMSubArch_v8_1a; + case ARM::AK_ARMV8_2A: + return Triple::ARMSubArch_v8_2a; default: return Triple::NoSubArch; } diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index 57d5429e0aab..a0fc5f68eb25 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -360,6 +360,18 @@ def ARMv81a : Architecture<"armv8.1-a", "ARMv81a", [HasV8_1aOps, FeatureCrypto, FeatureCRC]>; +def ARMv82a : Architecture<"armv8.2-a", "ARMv82a", [HasV8_2aOps, + FeatureAClass, + FeatureDB, + FeatureFPARMv8, + FeatureNEON, + FeatureDSP, + FeatureTrustZone, + FeatureMP, + FeatureVirtualization, + FeatureCrypto, + FeatureCRC]>; + // Aliases def IWMMXT : Architecture<"iwmmxt", "ARMv5te", [ARMv5te]>; def IWMMXT2 : Architecture<"iwmmxt2", "ARMv5te", [ARMv5te]>; diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index 3addd4175a04..3ad35d24ebab 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -52,7 +52,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo { enum ARMArchEnum { ARMv2, ARMv2a, ARMv3, ARMv3m, ARMv4, ARMv4t, ARMv5, ARMv5t, ARMv5te, ARMv5tej, ARMv6, ARMv6k, ARMv6kz, ARMv6t2, ARMv6m, ARMv6sm, ARMv7a, ARMv7r, - ARMv7m, ARMv7em, ARMv8a, ARMv81a + ARMv7m, ARMv7em, ARMv8a, ARMv81a, ARMv82a }; /// ARMProcFamily - ARM processor family: Cortex-A8, Cortex-A9, and others. diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index ba144458386d..8341fbc4efd1 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -9930,6 +9930,7 @@ static const struct { { ARM::AEK_SEC, Feature_HasV6K, {ARM::FeatureTrustZone} }, // FIXME: Only available in A-class, isel not predicated { ARM::AEK_VIRT, Feature_HasV7, {ARM::FeatureVirtualization} }, + { ARM::AEK_FP16, Feature_HasV8_2a, {ARM::FeatureFPARMv8, ARM::FeatureFullFP16} }, // FIXME: Unsupported extensions. 
{ ARM::AEK_OS, Feature_None, {} }, { ARM::AEK_IWMMXT, Feature_None, {} }, diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index 42591c25d6e6..f316ad17576a 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -749,6 +749,7 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() { case ARM::AK_ARMV8A: case ARM::AK_ARMV8_1A: + case ARM::AK_ARMV8_2A: setAttributeItem(CPU_arch_profile, ApplicationProfile, false); setAttributeItem(ARM_ISA_use, Allowed, false); setAttributeItem(THUMB_ISA_use, AllowThumb32, false); diff --git a/test/MC/ARM/directive-arch-armv8.2-a.s b/test/MC/ARM/directive-arch-armv8.2-a.s new file mode 100644 index 000000000000..c9f4469fb0ae --- /dev/null +++ b/test/MC/ARM/directive-arch-armv8.2-a.s @@ -0,0 +1,46 @@ +@ Test the .arch directive for armv8.2-a + +@ This test case will check the default .ARM.attributes value for the +@ armv8.2-a architecture.
+ +@ RUN: llvm-mc -triple arm-eabi -filetype asm %s \ +@ RUN: | FileCheck %s -check-prefix CHECK-ASM +@ RUN: llvm-mc -triple arm-eabi -filetype obj %s \ +@ RUN: | llvm-readobj -arm-attributes | FileCheck %s -check-prefix CHECK-ATTR + + .syntax unified + .arch armv8.2-a + +@ CHECK-ASM: .arch armv8.2-a + +@ CHECK-ATTR: FileAttributes { +@ CHECK-ATTR: Attribute { +@ CHECK-ATTR: TagName: CPU_name +@ CHECK-ATTR: Value: 8.2-A +@ CHECK-ATTR: } +@ CHECK-ATTR: Attribute { +@ CHECK-ATTR: TagName: CPU_arch +@ CHECK-ATTR: Description: ARM v8 +@ CHECK-ATTR: } +@ CHECK-ATTR: Attribute { +@ CHECK-ATTR: TagName: CPU_arch_profile +@ CHECK-ATTR: Description: Application +@ CHECK-ATTR: } +@ CHECK-ATTR: Attribute { +@ CHECK-ATTR: TagName: ARM_ISA_use +@ CHECK-ATTR: Description: Permitted +@ CHECK-ATTR: } +@ CHECK-ATTR: Attribute { +@ CHECK-ATTR: TagName: THUMB_ISA_use +@ CHECK-ATTR: Description: Thumb-2 +@ CHECK-ATTR: } +@ CHECK-ATTR: Attribute { +@ CHECK-ATTR: TagName: MPextension_use +@ CHECK-ATTR: Description: Permitted +@ CHECK-ATTR: } +@ CHECK-ATTR: Attribute { +@ CHECK-ATTR: TagName: Virtualization_use +@ CHECK-ATTR: Description: TrustZone + Virtualization Extensions +@ CHECK-ATTR: } +@ CHECK-ATTR: } + From ce8e2a0d91724b07d3a262c6005e5705a3b7837e Mon Sep 17 00:00:00 2001 From: Oliver Stannard Date: Tue, 1 Dec 2015 10:48:51 +0000 Subject: [PATCH 069/186] [AArch64] Add ARMv8.2-A Statistical Profiling Extension The Statistical Profiling Extension is an optional extension to ARMv8.2-A. Since it is an optional extension, I have added the FeatureSPE subtarget feature to control it. The assembler-visible parts of this extension are the new "psb csync" instruction, which is equivalent to "hint #17", and a number of system registers. 
Differential Revision: http://reviews.llvm.org/D15021 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254401 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AArch64/AArch64.td | 3 + lib/Target/AArch64/AArch64InstrFormats.td | 19 ++++ lib/Target/AArch64/AArch64InstrInfo.td | 5 + lib/Target/AArch64/AArch64Subtarget.h | 2 + .../AArch64/AsmParser/AArch64AsmParser.cpp | 73 ++++++++++++++- .../InstPrinter/AArch64InstPrinter.cpp | 13 +++ .../AArch64/InstPrinter/AArch64InstPrinter.h | 3 + lib/Target/AArch64/Utils/AArch64BaseInfo.cpp | 23 +++++ lib/Target/AArch64/Utils/AArch64BaseInfo.h | 30 ++++++ .../AArch64/armv8.2a-statistical-profiling.s | 87 ++++++++++++++++++ .../armv8.2a-statistical-profiling.txt | 91 +++++++++++++++++++ utils/TableGen/AsmWriterEmitter.cpp | 12 ++- 12 files changed, 355 insertions(+), 6 deletions(-) create mode 100644 test/MC/AArch64/armv8.2a-statistical-profiling.s create mode 100644 test/MC/Disassembler/AArch64/armv8.2a-statistical-profiling.txt diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td index e82cdd00ba1e..5c19b3efdb11 100644 --- a/lib/Target/AArch64/AArch64.td +++ b/lib/Target/AArch64/AArch64.td @@ -38,6 +38,9 @@ def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true", def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", "Full FP16", [FeatureFPARMv8]>; +def FeatureSPE : SubtargetFeature<"spe", "HasSPE", "true", + "Enable Statistical Profiling extension">; + /// Cyclone has register move instructions which are "free". 
def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", "Has zero-cycle register moves">; diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td index 752a153c0574..5eef82153e39 100644 --- a/lib/Target/AArch64/AArch64InstrFormats.td +++ b/lib/Target/AArch64/AArch64InstrFormats.td @@ -911,6 +911,25 @@ def msr_sysreg_op : Operand { let PrintMethod = "printMSRSystemRegister"; } +def PSBHintOperand : AsmOperandClass { + let Name = "PSBHint"; + let ParserMethod = "tryParsePSBHint"; +} +def psbhint_op : Operand { + let ParserMatchClass = PSBHintOperand; + let PrintMethod = "printPSBHintOp"; + let MCOperandPredicate = [{ + // Check, if operand is valid, to fix exhaustive aliasing in disassembly. + // "psb" is an alias to "hint" only for certain values of CRm:Op2 fields. + if (!MCOp.isImm()) + return false; + bool ValidNamed; + (void)AArch64PSBHint::PSBHintMapper().toString(MCOp.getImm(), + STI.getFeatureBits(), ValidNamed); + return ValidNamed; + }]; +} + class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg), "mrs", "\t$Rt, $systemreg"> { bits<16> systemreg; diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index 0c43003975c5..881f55ebeef9 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -29,6 +29,8 @@ def HasCRC : Predicate<"Subtarget->hasCRC()">, def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">; def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, AssemblerPredicate<"FeatureFullFP16", "fullfp16">; +def HasSPE : Predicate<"Subtarget->hasSPE()">, + AssemblerPredicate<"FeatureSPE", "spe">; def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; @@ -382,6 +384,9 @@ def : InstAlias<"wfi", (HINT 0b011)>; def : InstAlias<"sev", (HINT 0b100)>; def : InstAlias<"sevl", (HINT 0b101)>; +// v8.2a Statistical Profiling extension +def : 
InstAlias<"psb $op", (HINT psbhint_op:$op)>, Requires<[HasSPE]>; + // As far as LLVM is concerned this writes to the system's exclusive monitors. let mayLoad = 1, mayStore = 1 in def CLREX : CRmSystemI; diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index 9aa6ef9ab670..73daf6051b70 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -47,6 +47,7 @@ class AArch64Subtarget : public AArch64GenSubtargetInfo { bool HasCRC; bool HasPerfMon; bool HasFullFP16; + bool HasSPE; // HasZeroCycleRegMove - Has zero-cycle register mov instructions. bool HasZeroCycleRegMove; @@ -124,6 +125,7 @@ class AArch64Subtarget : public AArch64GenSubtargetInfo { bool hasPerfMon() const { return HasPerfMon; } bool hasFullFP16() const { return HasFullFP16; } + bool hasSPE() const { return HasSPE; } bool isLittleEndian() const { return IsLittle; } diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 165843fc84c9..f0ad855ed5e6 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -100,6 +100,7 @@ class AArch64AsmParser : public MCTargetAsmParser { OperandMatchResultTy tryParseSysReg(OperandVector &Operands); OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands); OperandMatchResultTy tryParsePrefetch(OperandVector &Operands); + OperandMatchResultTy tryParsePSBHint(OperandVector &Operands); OperandMatchResultTy tryParseAdrpLabel(OperandVector &Operands); OperandMatchResultTy tryParseAdrLabel(OperandVector &Operands); OperandMatchResultTy tryParseFPImm(OperandVector &Operands); @@ -159,7 +160,8 @@ class AArch64Operand : public MCParsedAsmOperand { k_Prefetch, k_ShiftExtend, k_FPImm, - k_Barrier + k_Barrier, + k_PSBHint, } Kind; SMLoc StartLoc, EndLoc; @@ -227,6 +229,12 @@ class AArch64Operand : public MCParsedAsmOperand { unsigned Length; }; + struct PSBHintOp { + 
unsigned Val; + const char *Data; + unsigned Length; + }; + struct ShiftExtendOp { AArch64_AM::ShiftExtendType Type; unsigned Amount; @@ -250,6 +258,7 @@ class AArch64Operand : public MCParsedAsmOperand { struct SysRegOp SysReg; struct SysCRImmOp SysCRImm; struct PrefetchOp Prefetch; + struct PSBHintOp PSBHint; struct ShiftExtendOp ShiftExtend; }; @@ -301,6 +310,9 @@ class AArch64Operand : public MCParsedAsmOperand { case k_Prefetch: Prefetch = o.Prefetch; break; + case k_PSBHint: + PSBHint = o.PSBHint; + break; case k_ShiftExtend: ShiftExtend = o.ShiftExtend; break; @@ -392,6 +404,16 @@ class AArch64Operand : public MCParsedAsmOperand { return Prefetch.Val; } + unsigned getPSBHint() const { + assert(Kind == k_PSBHint && "Invalid access!"); + return PSBHint.Val; + } + + StringRef getPSBHintName() const { + assert(Kind == k_PSBHint && "Invalid access!"); + return StringRef(PSBHint.Data, PSBHint.Length); + } + StringRef getPrefetchName() const { assert(Kind == k_Prefetch && "Invalid access!"); return StringRef(Prefetch.Data, Prefetch.Length); @@ -961,6 +983,7 @@ class AArch64Operand : public MCParsedAsmOperand { } bool isSysCR() const { return Kind == k_SysCR; } bool isPrefetch() const { return Kind == k_Prefetch; } + bool isPSBHint() const { return Kind == k_PSBHint; } bool isShiftExtend() const { return Kind == k_ShiftExtend; } bool isShifter() const { if (!isShiftExtend()) @@ -1534,6 +1557,11 @@ class AArch64Operand : public MCParsedAsmOperand { Inst.addOperand(MCOperand::createImm(getPrefetch())); } + void addPSBHintOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(getPSBHint())); + } + void addShifterOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); unsigned Imm = @@ -1730,6 +1758,19 @@ class AArch64Operand : public MCParsedAsmOperand { return Op; } + static std::unique_ptr CreatePSBHint(unsigned Val, + StringRef Str, + SMLoc S, + 
MCContext &Ctx) { + auto Op = make_unique(k_PSBHint, Ctx); + Op->PSBHint.Val = Val; + Op->PSBHint.Data = Str.data(); + Op->PSBHint.Length = Str.size(); + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + static std::unique_ptr CreateShiftExtend(AArch64_AM::ShiftExtendType ShOp, unsigned Val, bool HasExplicitAmount, SMLoc S, SMLoc E, MCContext &Ctx) { @@ -1803,6 +1844,10 @@ void AArch64Operand::print(raw_ostream &OS) const { OS << ""; break; } + case k_PSBHint: { + OS << getPSBHintName(); + break; + } case k_ShiftExtend: { OS << "<" << AArch64_AM::getShiftExtendName(getShiftExtendType()) << " #" << getShiftExtendAmount(); @@ -2069,6 +2114,32 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { return MatchOperand_Success; } +/// tryParsePSBHint - Try to parse a PSB operand, mapped to Hint command +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParsePSBHint(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); + SMLoc S = getLoc(); + const AsmToken &Tok = Parser.getTok(); + if (Tok.isNot(AsmToken::Identifier)) { + TokError("invalid operand for instruction"); + return MatchOperand_ParseFail; + } + + bool Valid; + auto Mapper = AArch64PSBHint::PSBHintMapper(); + unsigned psbhint = + Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid); + if (!Valid) { + TokError("invalid operand for instruction"); + return MatchOperand_ParseFail; + } + + Parser.Lex(); // Eat identifier token. + Operands.push_back(AArch64Operand::CreatePSBHint(psbhint, Tok.getString(), + S, getContext())); + return MatchOperand_Success; +} + /// tryParseAdrpLabel - Parse and validate a source label for the ADRP /// instruction. 
AArch64AsmParser::OperandMatchResultTy diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp index d8937b57e490..480ed0d263ac 100644 --- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp @@ -1144,6 +1144,19 @@ void AArch64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum, O << '#' << prfop; } +void AArch64InstPrinter::printPSBHintOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned psbhintop = MI->getOperand(OpNum).getImm(); + bool Valid; + StringRef Name = + AArch64PSBHint::PSBHintMapper().toString(psbhintop, STI.getFeatureBits(), Valid); + if (Valid) + O << Name; + else + O << '#' << psbhintop; +} + void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h index a94721816d33..a767aa451c6a 100644 --- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h @@ -123,6 +123,9 @@ class AArch64InstPrinter : public MCInstPrinter { void printPrefetchOp(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); + void printPSBHintOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printFPImmOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp index f657eaab8151..78f5289ec26d 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp @@ -154,6 +154,14 @@ const AArch64NamedImmMapper::Mapping AArch64PState::PStateMapper::PStateMappings AArch64PState::PStateMapper::PStateMapper() : 
AArch64NamedImmMapper(PStateMappings, 0) {} +const AArch64NamedImmMapper::Mapping AArch64PSBHint::PSBHintMapper::PSBHintMappings[] = { + // v8.2a "Statistical Profiling" extension-specific PSB operand + {"csync", CSync, {AArch64::FeatureSPE}}, +}; + +AArch64PSBHint::PSBHintMapper::PSBHintMapper() + : AArch64NamedImmMapper(PSBHintMappings, 0) {} + const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSMappings[] = { {"mdccsr_el0", MDCCSR_EL0, {}}, {"dbgdtrrx_el0", DBGDTRRX_EL0, {}}, @@ -808,6 +816,21 @@ const AArch64NamedImmMapper::Mapping AArch64SysReg::SysRegMapper::SysRegMappings // v8.2a registers {"uao", UAO, {AArch64::HasV8_2aOps}}, + + // v8.2a "Statistical Profiling extension" registers + {"pmblimitr_el1", PMBLIMITR_EL1, {AArch64::FeatureSPE}}, + {"pmbptr_el1", PMBPTR_EL1, {AArch64::FeatureSPE}}, + {"pmbsr_el1", PMBSR_EL1, {AArch64::FeatureSPE}}, + {"pmbidr_el1", PMBIDR_EL1, {AArch64::FeatureSPE}}, + {"pmscr_el2", PMSCR_EL2, {AArch64::FeatureSPE}}, + {"pmscr_el12", PMSCR_EL12, {AArch64::FeatureSPE}}, + {"pmscr_el1", PMSCR_EL1, {AArch64::FeatureSPE}}, + {"pmsicr_el1", PMSICR_EL1, {AArch64::FeatureSPE}}, + {"pmsirr_el1", PMSIRR_EL1, {AArch64::FeatureSPE}}, + {"pmsfcr_el1", PMSFCR_EL1, {AArch64::FeatureSPE}}, + {"pmsevfr_el1", PMSEVFR_EL1, {AArch64::FeatureSPE}}, + {"pmslatfr_el1", PMSLATFR_EL1, {AArch64::FeatureSPE}}, + {"pmsidr_el1", PMSIDR_EL1, {AArch64::FeatureSPE}}, }; uint32_t diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h index 5a6b54bbee83..f649cb9b8a8d 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -478,6 +478,21 @@ namespace AArch64PState { } +namespace AArch64PSBHint { + enum PSBHintValues { + Invalid = -1, + // v8.2a "Statistical Profiling" extension-specific PSB operands + CSync = 0x11, // psb csync = hint #0x11 + }; + + struct PSBHintMapper : AArch64NamedImmMapper { + const static Mapping PSBHintMappings[]; + + 
PSBHintMapper(); + }; + +} + namespace AArch64SE { enum ShiftExtSpecifiers { Invalid = -1, @@ -1199,6 +1214,21 @@ namespace AArch64SysReg { // v8.2a registers UAO = 0xc214, // 11 000 0100 0010 100 + // v8.2a "Statistical Profiling extension" registers + PMBLIMITR_EL1 = 0xc4d0, // 11 000 1001 1010 000 + PMBPTR_EL1 = 0xc4d1, // 11 000 1001 1010 001 + PMBSR_EL1 = 0xc4d3, // 11 000 1001 1010 011 + PMBIDR_EL1 = 0xc4d7, // 11 000 1001 1010 111 + PMSCR_EL2 = 0xe4c8, // 11 100 1001 1001 000 + PMSCR_EL12 = 0xecc8, // 11 101 1001 1001 000 + PMSCR_EL1 = 0xc4c8, // 11 000 1001 1001 000 + PMSICR_EL1 = 0xc4ca, // 11 000 1001 1001 010 + PMSIRR_EL1 = 0xc4cb, // 11 000 1001 1001 011 + PMSFCR_EL1 = 0xc4cc, // 11 000 1001 1001 100 + PMSEVFR_EL1 = 0xc4cd, // 11 000 1001 1001 101 + PMSLATFR_EL1 = 0xc4ce, // 11 000 1001 1001 110 + PMSIDR_EL1 = 0xc4cf, // 11 000 1001 1001 111 + // Cyclone specific system registers CPM_IOACC_CTL_EL3 = 0xff90, }; diff --git a/test/MC/AArch64/armv8.2a-statistical-profiling.s b/test/MC/AArch64/armv8.2a-statistical-profiling.s new file mode 100644 index 000000000000..5cb109318786 --- /dev/null +++ b/test/MC/AArch64/armv8.2a-statistical-profiling.s @@ -0,0 +1,87 @@ +// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+spe < %s | FileCheck %s +// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding < %s 2>&1 | FileCheck --check-prefix=NO_SPE %s + + psb csync +// CHECK: psb csync // encoding: [0x3f,0x22,0x03,0xd5] +// NO_SPE: invalid operand for instruction + + msr pmblimitr_el1, x0 + msr pmbptr_el1, x0 + msr pmbsr_el1, x0 + msr pmbidr_el1, x0 + msr pmscr_el2, x0 + msr pmscr_el12, x0 + msr pmscr_el1, x0 + msr pmsicr_el1, x0 + msr pmsirr_el1, x0 + msr pmsfcr_el1, x0 + msr pmsevfr_el1, x0 + msr pmslatfr_el1, x0 + msr pmsidr_el1, x0 +// CHECK: msr PMBLIMITR_EL1, x0 // encoding: [0x00,0x9a,0x18,0xd5] +// CHECK: msr PMBPTR_EL1, x0 // encoding: [0x20,0x9a,0x18,0xd5] +// CHECK: msr PMBSR_EL1, x0 // encoding: [0x60,0x9a,0x18,0xd5] +// CHECK: 
msr PMBIDR_EL1, x0 // encoding: [0xe0,0x9a,0x18,0xd5] +// CHECK: msr PMSCR_EL2, x0 // encoding: [0x00,0x99,0x1c,0xd5] +// CHECK: msr PMSCR_EL12, x0 // encoding: [0x00,0x99,0x1d,0xd5] +// CHECK: msr PMSCR_EL1, x0 // encoding: [0x00,0x99,0x18,0xd5] +// CHECK: msr PMSICR_EL1, x0 // encoding: [0x40,0x99,0x18,0xd5] +// CHECK: msr PMSIRR_EL1, x0 // encoding: [0x60,0x99,0x18,0xd5] +// CHECK: msr PMSFCR_EL1, x0 // encoding: [0x80,0x99,0x18,0xd5] +// CHECK: msr PMSEVFR_EL1, x0 // encoding: [0xa0,0x99,0x18,0xd5] +// CHECK: msr PMSLATFR_EL1, x0 // encoding: [0xc0,0x99,0x18,0xd5] +// CHECK: msr PMSIDR_EL1, x0 // encoding: [0xe0,0x99,0x18,0xd5] +// NO_SPE: error: expected writable system register or pstate +// NO_SPE: error: expected writable system register or pstate +// NO_SPE: error: expected writable system register or pstate +// NO_SPE: error: expected writable system register or pstate +// NO_SPE: error: expected writable system register or pstate +// NO_SPE: error: expected writable system register or pstate +// NO_SPE: error: expected writable system register or pstate +// NO_SPE: error: expected writable system register or pstate +// NO_SPE: error: expected writable system register or pstate +// NO_SPE: error: expected writable system register or pstate +// NO_SPE: error: expected writable system register or pstate +// NO_SPE: error: expected writable system register or pstate +// NO_SPE: error: expected writable system register or pstate + +mrs x0, pmblimitr_el1 + mrs x0, pmbptr_el1 + mrs x0, pmbsr_el1 + mrs x0, pmbidr_el1 + mrs x0, pmscr_el2 + mrs x0, pmscr_el12 + mrs x0, pmscr_el1 + mrs x0, pmsicr_el1 + mrs x0, pmsirr_el1 + mrs x0, pmsfcr_el1 + mrs x0, pmsevfr_el1 + mrs x0, pmslatfr_el1 + mrs x0, pmsidr_el1 + +// CHECK: mrs x0, PMBLIMITR_EL1 // encoding: [0x00,0x9a,0x38,0xd5] +// CHECK: mrs x0, PMBPTR_EL1 // encoding: [0x20,0x9a,0x38,0xd5] +// CHECK: mrs x0, PMBSR_EL1 // encoding: [0x60,0x9a,0x38,0xd5] +// CHECK: mrs x0, PMBIDR_EL1 // encoding: [0xe0,0x9a,0x38,0xd5] 
+// CHECK: mrs x0, PMSCR_EL2 // encoding: [0x00,0x99,0x3c,0xd5] +// CHECK: mrs x0, PMSCR_EL12 // encoding: [0x00,0x99,0x3d,0xd5] +// CHECK: mrs x0, PMSCR_EL1 // encoding: [0x00,0x99,0x38,0xd5] +// CHECK: mrs x0, PMSICR_EL1 // encoding: [0x40,0x99,0x38,0xd5] +// CHECK: mrs x0, PMSIRR_EL1 // encoding: [0x60,0x99,0x38,0xd5] +// CHECK: mrs x0, PMSFCR_EL1 // encoding: [0x80,0x99,0x38,0xd5] +// CHECK: mrs x0, PMSEVFR_EL1 // encoding: [0xa0,0x99,0x38,0xd5] +// CHECK: mrs x0, PMSLATFR_EL1 // encoding: [0xc0,0x99,0x38,0xd5] +// CHECK: mrs x0, PMSIDR_EL1 // encoding: [0xe0,0x99,0x38,0xd5] +// NO_SPE: error: expected readable system register +// NO_SPE: error: expected readable system register +// NO_SPE: error: expected readable system register +// NO_SPE: error: expected readable system register +// NO_SPE: error: expected readable system register +// NO_SPE: error: expected readable system register +// NO_SPE: error: expected readable system register +// NO_SPE: error: expected readable system register +// NO_SPE: error: expected readable system register +// NO_SPE: error: expected readable system register +// NO_SPE: error: expected readable system register +// NO_SPE: error: expected readable system register +// NO_SPE: error: expected readable system register diff --git a/test/MC/Disassembler/AArch64/armv8.2a-statistical-profiling.txt b/test/MC/Disassembler/AArch64/armv8.2a-statistical-profiling.txt new file mode 100644 index 000000000000..e83d750e715e --- /dev/null +++ b/test/MC/Disassembler/AArch64/armv8.2a-statistical-profiling.txt @@ -0,0 +1,91 @@ +# RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+spe --disassemble < %s | FileCheck %s +# RUN: llvm-mc -triple aarch64-none-linux-gnu --disassemble < %s | FileCheck --check-prefix=NO_SPE %s + +[0x1f,0x22,0x03,0xd5] +# CHECK: hint #0x10 +# NO_SPE: hint #0x10 + +[0x3f,0x22,0x03,0xd5] +# CHECK: psb csync +# NO_SPE: hint #0x11 + +[0x00,0x9a,0x18,0xd5] +[0x20,0x9a,0x18,0xd5] +[0x60,0x9a,0x18,0xd5] +[0xe0,0x9a,0x18,0xd5] 
+[0x00,0x99,0x1c,0xd5] +[0x00,0x99,0x1d,0xd5] +[0x00,0x99,0x18,0xd5] +[0x40,0x99,0x18,0xd5] +[0x60,0x99,0x18,0xd5] +[0x80,0x99,0x18,0xd5] +[0xa0,0x99,0x18,0xd5] +[0xc0,0x99,0x18,0xd5] +[0xe0,0x99,0x18,0xd5] +# CHECK: msr PMBLIMITR_EL1, x0 +# NO_SPE: msr S3_0_C9_C10_0, x0 +# CHECK: msr PMBPTR_EL1, x0 +# NO_SPE: msr S3_0_C9_C10_1, x0 +# CHECK: msr PMBSR_EL1, x0 +# NO_SPE: msr S3_0_C9_C10_3, x0 +# CHECK: msr PMBIDR_EL1, x0 +# NO_SPE: msr S3_0_C9_C10_7, x0 +# CHECK: msr PMSCR_EL2, x0 +# NO_SPE: msr S3_4_C9_C9_0, x0 +# CHECK: msr PMSCR_EL12, x0 +# NO_SPE: msr S3_5_C9_C9_0, x0 +# CHECK: msr PMSCR_EL1, x0 +# NO_SPE: msr S3_0_C9_C9_0, x0 +# CHECK: msr PMSICR_EL1, x0 +# NO_SPE: msr S3_0_C9_C9_2, x0 +# CHECK: msr PMSIRR_EL1, x0 +# NO_SPE: msr S3_0_C9_C9_3, x0 +# CHECK: msr PMSFCR_EL1, x0 +# NO_SPE: msr S3_0_C9_C9_4, x0 +# CHECK: msr PMSEVFR_EL1, x0 +# NO_SPE: msr S3_0_C9_C9_5, x0 +# CHECK: msr PMSLATFR_EL1, x0 +# NO_SPE: msr S3_0_C9_C9_6, x0 +# CHECK: msr PMSIDR_EL1, x0 +# NO_SPE: msr S3_0_C9_C9_7, x0 + +[0x00,0x9a,0x38,0xd5] +[0x20,0x9a,0x38,0xd5] +[0x60,0x9a,0x38,0xd5] +[0xe0,0x9a,0x38,0xd5] +[0x00,0x99,0x3c,0xd5] +[0x00,0x99,0x3d,0xd5] +[0x00,0x99,0x38,0xd5] +[0x40,0x99,0x38,0xd5] +[0x60,0x99,0x38,0xd5] +[0x80,0x99,0x38,0xd5] +[0xa0,0x99,0x38,0xd5] +[0xc0,0x99,0x38,0xd5] +[0xe0,0x99,0x38,0xd5] + +# CHECK: mrs x0, PMBLIMITR_EL1 +# NO_SPE: mrs x0, S3_0_C9_C10_0 +# CHECK: mrs x0, PMBPTR_EL1 +# NO_SPE: mrs x0, S3_0_C9_C10_1 +# CHECK: mrs x0, PMBSR_EL1 +# NO_SPE: mrs x0, S3_0_C9_C10_3 +# CHECK: mrs x0, PMBIDR_EL1 +# NO_SPE: mrs x0, S3_0_C9_C10_7 +# CHECK: mrs x0, PMSCR_EL2 +# NO_SPE: mrs x0, S3_4_C9_C9_0 +# CHECK: mrs x0, PMSCR_EL12 +# NO_SPE: mrs x0, S3_5_C9_C9_0 +# CHECK: mrs x0, PMSCR_EL1 +# NO_SPE: mrs x0, S3_0_C9_C9_0 +# CHECK: mrs x0, PMSICR_EL1 +# NO_SPE: mrs x0, S3_0_C9_C9_2 +# CHECK: mrs x0, PMSIRR_EL1 +# NO_SPE: mrs x0, S3_0_C9_C9_3 +# CHECK: mrs x0, PMSFCR_EL1 +# NO_SPE: mrs x0, S3_0_C9_C9_4 +# CHECK: mrs x0, PMSEVFR_EL1 +# NO_SPE: mrs x0, S3_0_C9_C9_5 +# CHECK: mrs 
x0, PMSLATFR_EL1 +# NO_SPE: mrs x0, S3_0_C9_C9_6 +# CHECK: mrs x0, PMSIDR_EL1 +# NO_SPE: mrs x0, S3_0_C9_C9_7 diff --git a/utils/TableGen/AsmWriterEmitter.cpp b/utils/TableGen/AsmWriterEmitter.cpp index 4b543d3f9fc6..a954998d36e9 100644 --- a/utils/TableGen/AsmWriterEmitter.cpp +++ b/utils/TableGen/AsmWriterEmitter.cpp @@ -901,7 +901,7 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { break; // No conditions on this operand at all } Cond = Target.getName() + ClassName + "ValidateMCOperand(" + - Op + ", " + llvm::utostr(Entry) + ")"; + Op + ", STI, " + llvm::utostr(Entry) + ")"; } // for all subcases of ResultOperand::K_Record: IAP.addCond(Cond); @@ -996,8 +996,9 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { if (!MCOpPredicates.empty()) O << "static bool " << Target.getName() << ClassName - << "ValidateMCOperand(\n" - << " const MCOperand &MCOp, unsigned PredicateIndex);\n"; + << "ValidateMCOperand(const MCOperand &MCOp,\n" + << " const MCSubtargetInfo &STI,\n" + << " unsigned PredicateIndex);\n"; O << HeaderO.str(); O.indent(2) << "const char *AsmString;\n"; @@ -1069,8 +1070,9 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { if (!MCOpPredicates.empty()) { O << "static bool " << Target.getName() << ClassName - << "ValidateMCOperand(\n" - << " const MCOperand &MCOp, unsigned PredicateIndex) {\n" + << "ValidateMCOperand(const MCOperand &MCOp,\n" + << " const MCSubtargetInfo &STI,\n" + << " unsigned PredicateIndex) {\n" << " switch (PredicateIndex) {\n" << " default:\n" << " llvm_unreachable(\"Unknown MCOperandPredicate kind\");\n" From 1b91dc22354928116e2af525b10c86a1c645b8a0 Mon Sep 17 00:00:00 2001 From: Cong Hou Date: Tue, 1 Dec 2015 11:05:39 +0000 Subject: [PATCH 070/186] Allow known and unknown probabilities coexist in MBB's successor list. Previously it is not allowed for each MBB to have successors with both known and unknown probabilities. 
However, this may be too strict as at this stage we could not always guarantee that. It is better to remove this restriction now, and I will work on validating MBB's successors' probabilities first (for example, check if the sum is approximately one). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254402 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/MachineBasicBlock.cpp | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index c9c6a9d62462..de91f0db75a8 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -510,13 +510,8 @@ void MachineBasicBlock::addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob) { // Probability list is either empty (if successor list isn't empty, this means // disabled optimization) or has the same size as successor list. - if (!(Probs.empty() && !Successors.empty())) { - assert((Probs.empty() || (Prob.isUnknown() && Probs.back().isUnknown()) || - (!Prob.isUnknown() && !Probs.back().isUnknown())) && - "Successors with both known and unknwon probabilities are not " - "allowed."); + if (!(Probs.empty() && !Successors.empty())) Probs.push_back(Prob); - } Successors.push_back(Succ); Succ->addPredecessor(this); } @@ -1116,10 +1111,24 @@ MachineBasicBlock::findDebugLoc(instr_iterator MBBI) { /// Return probability of the edge from this block to MBB. BranchProbability MachineBasicBlock::getSuccProbability(const_succ_iterator Succ) const { - if (Probs.empty() || Probs.back().isUnknown()) + if (Probs.empty()) return BranchProbability(1, succ_size()); - return *getProbabilityIterator(Succ); + const auto &Prob = *getProbabilityIterator(Succ); + if (Prob.isUnknown()) { + // For unknown probabilities, collect the sum of all known ones, and evenly + // ditribute the complemental of the sum to each unknown probability.
+ unsigned KnownProbNum = 0; + auto Sum = BranchProbability::getZero(); + for (auto &P : Probs) { + if (!P.isUnknown()) { + Sum += P; + KnownProbNum++; + } + } + return Sum.getCompl() / (Probs.size() - KnownProbNum); + } else + return Prob; } /// Set successor probability of a given iterator. From 2fdb3b844fe006f3b28030a5c2e99940daa74d28 Mon Sep 17 00:00:00 2001 From: Yury Gribov Date: Tue, 1 Dec 2015 11:40:55 +0000 Subject: [PATCH 071/186] Introduce new @llvm.get.dynamic.area.offset.i{32, 64} intrinsics. The @llvm.get.dynamic.area.offset.* intrinsic family is used to get the offset from native stack pointer to the address of the most recent dynamic alloca on the caller's stack. These intrinsics are intended for use in combination with @llvm.stacksave and @llvm.stackrestore to get a pointer to the most recent dynamic alloca. This is useful, for example, for AddressSanitizer's stack unpoisoning routines. Patch by Max Ostapenko. Differential Revision: http://reviews.llvm.org/D14983 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254404 91177308-0d34-0410-b5e6-96231b3b80d8 --- docs/LangRef.rst | 42 +++++++++++++++++++ include/llvm/CodeGen/ISDOpcodes.h | 6 +++ include/llvm/IR/Intrinsics.td | 2 + lib/CodeGen/IntrinsicLowering.cpp | 7 ++++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 8 ++++ .../SelectionDAG/SelectionDAGBuilder.cpp | 15 +++++++ .../SelectionDAG/SelectionDAGDumper.cpp | 1 + lib/CodeGen/TargetLoweringBase.cpp | 3 ++ lib/Target/PowerPC/PPCISelLowering.cpp | 20 +++++++++ lib/Target/PowerPC/PPCISelLowering.h | 7 ++++ lib/Target/PowerPC/PPCInstr64Bit.td | 2 + lib/Target/PowerPC/PPCInstrInfo.td | 4 ++ lib/Target/PowerPC/PPCRegisterInfo.cpp | 26 ++++++++++++ lib/Target/PowerPC/PPCRegisterInfo.h | 1 + test/CodeGen/PowerPC/dyn-alloca-offset.ll | 21 ++++++++++ 15 files changed, 165 insertions(+) create mode 100644 test/CodeGen/PowerPC/dyn-alloca-offset.ll diff --git a/docs/LangRef.rst b/docs/LangRef.rst index be8e63bf071f..36bfb5167795 100644 ---
a/docs/LangRef.rst +++ b/docs/LangRef.rst @@ -9337,6 +9337,48 @@ Semantics: See the description for :ref:`llvm.stacksave <int_stacksave>`. +.. _int_get_dynamic_area_offset: + +'``llvm.get.dynamic.area.offset``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare i32 @llvm.get.dynamic.area.offset.i32() + declare i64 @llvm.get.dynamic.area.offset.i64() + + Overview: + """"""""" + + The '``llvm.get.dynamic.area.offset.*``' intrinsic family is used to + get the offset from native stack pointer to the address of the most + recent dynamic alloca on the caller's stack. These intrinsics are + intended for use in combination with + :ref:`llvm.stacksave <int_stacksave>` to get a + pointer to the most recent dynamic alloca. This is useful, for example, + for AddressSanitizer's stack unpoisoning routines. + +Semantics: +"""""""""" + + These intrinsics return a non-negative integer value that can be used to + get the address of the most recent dynamic alloca, allocated by :ref:`alloca <i_alloca>` + on the caller's stack. In particular, for targets where stack grows downwards, + adding this offset to the native stack pointer would get the address of the most + recent dynamic alloca. For targets where stack grows upwards, the situation is a bit more + complicated, because subtracting this value from stack pointer would get the address + one past the end of the most recent dynamic alloca. + + Although for most targets :ref:`llvm.get.dynamic.area.offset <int_get_dynamic_area_offset>` + returns just a zero, for others, such as PowerPC and PowerPC64, it returns a + compile-time-known constant value. + + The return value type of :ref:`llvm.get.dynamic.area.offset <int_get_dynamic_area_offset>` + must match the target's generic address space's (address space 0) pointer type.
+ '``llvm.prefetch``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h index aaf08e14f57d..4be993a9fbbb 100644 --- a/include/llvm/CodeGen/ISDOpcodes.h +++ b/include/llvm/CodeGen/ISDOpcodes.h @@ -754,6 +754,12 @@ namespace ISD { GC_TRANSITION_START, GC_TRANSITION_END, + /// GET_DYNAMIC_AREA_OFFSET - get offset from native SP to the address of + /// the most recent dynamic alloca. For most targets that would be 0, but + /// for some others (e.g. PowerPC, PowerPC64) that would be compile-time + /// known nonzero constant. The only operand here is the chain. + GET_DYNAMIC_AREA_OFFSET, + /// BUILTIN_OP_END - This must be the last enum value in this list. /// The target-specific pre-isel opcode values start here. BUILTIN_OP_END diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td index 917cf56e2e88..e838fb332de9 100644 --- a/include/llvm/IR/Intrinsics.td +++ b/include/llvm/IR/Intrinsics.td @@ -304,6 +304,8 @@ def int_stacksave : Intrinsic<[llvm_ptr_ty]>, def int_stackrestore : Intrinsic<[], [llvm_ptr_ty]>, GCCBuiltin<"__builtin_stack_restore">; +def int_get_dynamic_area_offset : Intrinsic<[llvm_anyint_ty]>; + // IntrReadWriteArgMem is more pessimistic than strictly necessary for prefetch, // however it does conveniently prevent the prefetch from being reordered // with respect to nearby accesses to the same memory. diff --git a/lib/CodeGen/IntrinsicLowering.cpp b/lib/CodeGen/IntrinsicLowering.cpp index 5b895fff5c43..47a9f64e9080 100644 --- a/lib/CodeGen/IntrinsicLowering.cpp +++ b/lib/CodeGen/IntrinsicLowering.cpp @@ -424,6 +424,13 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) { break; } + case Intrinsic::get_dynamic_area_offset: + errs() << "WARNING: this target does not support the custom llvm.get." + "dynamic.area.offset. 
It is being lowered to a constant 0\n"; + // Just lower it to a constant 0 because for most targets + // @llvm.get.dynamic.area.offset is lowered to zero. + CI->replaceAllUsesWith(ConstantInt::get(CI->getType(), 0)); + break; case Intrinsic::returnaddress: case Intrinsic::frameaddress: errs() << "WARNING: this target does not support the llvm." diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index c5810525f3c7..8238cdeb59ca 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1213,6 +1213,10 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { case ISD::STACKSAVE: Action = TLI.getOperationAction(Node->getOpcode(), MVT::Other); break; + case ISD::GET_DYNAMIC_AREA_OFFSET: + Action = TLI.getOperationAction(Node->getOpcode(), + Node->getValueType(0)); + break; case ISD::VAARG: Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); @@ -3295,6 +3299,10 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { Results.push_back(Node->getOperand(0)); } break; + case ISD::GET_DYNAMIC_AREA_OFFSET: + Results.push_back(DAG.getConstant(0, dl, Node->getValueType(0))); + Results.push_back(Results[0].getValue(0)); + break; case ISD::FCOPYSIGN: Results.push_back(ExpandFCOPYSIGN(Node)); break; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index d880bcfbdf64..38b8bced2399 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -4928,6 +4928,21 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { DAG.setRoot(DAG.getNode(ISD::STACKRESTORE, sdl, MVT::Other, getRoot(), Res)); return nullptr; } + case Intrinsic::get_dynamic_area_offset: { + SDValue Op = getRoot(); + EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout()); + EVT ResTy = TLI.getValueType(DAG.getDataLayout(), I.getType()); + // Result type for 
@llvm.get.dynamic.area.offset should match PtrTy for + // target. + if (PtrTy != ResTy) + report_fatal_error("Wrong result type for @llvm.get.dynamic.area.offset" + " intrinsic!"); + Res = DAG.getNode(ISD::GET_DYNAMIC_AREA_OFFSET, sdl, DAG.getVTList(ResTy), + Op); + DAG.setRoot(Op); + setValue(&I, Res); + return nullptr; + } case Intrinsic::stackprotector: { // Emit code into the DAG to store the stack guard onto the stack. MachineFunction &MF = DAG.getMachineFunction(); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 195b48498605..a6f9699bb29c 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -310,6 +310,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::LIFETIME_END: return "lifetime.end"; case ISD::GC_TRANSITION_START: return "gc_transition.start"; case ISD::GC_TRANSITION_END: return "gc_transition.end"; + case ISD::GET_DYNAMIC_AREA_OFFSET: return "get.dynamic.area.offset"; // Bit manipulation case ISD::BITREVERSE: return "bitreverse"; diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp index e348095aa8fc..69c130809bb8 100644 --- a/lib/CodeGen/TargetLoweringBase.cpp +++ b/lib/CodeGen/TargetLoweringBase.cpp @@ -840,6 +840,9 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Expand); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Expand); } + + // For most targets @llvm.get.dynamic.area.offest just returns 0. + setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, VT, Expand); } // Most targets ignore the @llvm.prefetch intrinsic. 
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 1b1e0cf57865..72a3fbe83e13 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -329,6 +329,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::STACKRESTORE , MVT::Other, Custom); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64 , Custom); + setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i32, Custom); + setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i64, Custom); // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); @@ -998,6 +1000,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::Lo: return "PPCISD::Lo"; case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY"; case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC"; + case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET"; case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg"; case PPCISD::SRL: return "PPCISD::SRL"; case PPCISD::SRA: return "PPCISD::SRA"; @@ -5808,6 +5811,22 @@ PPCTargetLowering::LowerReturn(SDValue Chain, return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, RetOps); } +SDValue PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET( + SDValue Op, SelectionDAG &DAG, const PPCSubtarget &Subtarget) const { + SDLoc dl(Op); + + // Get the corect type for integers. + EVT IntVT = Op.getValueType(); + + // Get the inputs. + SDValue Chain = Op.getOperand(0); + SDValue FPSIdx = getFramePointerFrameIndex(DAG); + // Build a DYNAREAOFFSET node. 
+ SDValue Ops[2] = {Chain, FPSIdx}; + SDVTList VTs = DAG.getVTList(IntVT); + return DAG.getNode(PPCISD::DYNAREAOFFSET, dl, VTs, Ops); +} + SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG, const PPCSubtarget &Subtarget) const { // When we pop the dynamic allocation we need to restore the SP link. @@ -7938,6 +7957,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG, Subtarget); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG, Subtarget); + case ISD::GET_DYNAMIC_AREA_OFFSET: return LowerGET_DYNAMIC_AREA_OFFSET(Op, DAG, Subtarget); case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 415c47c286e3..c0aafbac1aa0 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -79,6 +79,11 @@ namespace llvm { /// compute an allocation on the stack. DYNALLOC, + /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to + /// compute an offset from native SP to the address of the most recent + /// dynamic alloca. + DYNAREAOFFSET, + /// GlobalBaseReg - On Darwin, this node represents the result of the mflr /// at function entry, used for PIC code. 
GlobalBaseReg, @@ -728,6 +733,8 @@ namespace llvm { const PPCSubtarget &Subtarget) const; SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG, const PPCSubtarget &Subtarget) const; + SDValue LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op, SelectionDAG &DAG, + const PPCSubtarget &Subtarget) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG, const PPCSubtarget &Subtarget) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index d62833037db5..075e093e41a1 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -369,6 +369,8 @@ let Defs = [X1], Uses = [X1] in def DYNALLOC8 : Pseudo<(outs g8rc:$result), (ins g8rc:$negsize, memri:$fpsi),"#DYNALLOC8", [(set i64:$result, (PPCdynalloc i64:$negsize, iaddr:$fpsi))]>; +def DYNAREAOFFSET8 : Pseudo<(outs i64imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET8", + [(set i64:$result, (PPCdynareaoffset iaddr:$fpsi))]>; let Defs = [LR8] in { def MTLR8 : XFXForm_7_ext<31, 467, 8, (outs), (ins g8rc:$rS), diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index cc1af1a7132f..6c4364aad331 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -226,7 +226,9 @@ def PPCcr6unset : SDNode<"PPCISD::CR6UNSET", SDTNone, // Instructions to support dynamic alloca. def SDTDynOp : SDTypeProfile<1, 2, []>; +def SDTDynAreaOp : SDTypeProfile<1, 1, []>; def PPCdynalloc : SDNode<"PPCISD::DYNALLOC", SDTDynOp, [SDNPHasChain]>; +def PPCdynareaoffset : SDNode<"PPCISD::DYNAREAOFFSET", SDTDynAreaOp, [SDNPHasChain]>; //===----------------------------------------------------------------------===// // PowerPC specific transformation functions and pattern fragments. 
@@ -1029,6 +1031,8 @@ let Defs = [R1], Uses = [R1] in def DYNALLOC : Pseudo<(outs gprc:$result), (ins gprc:$negsize, memri:$fpsi), "#DYNALLOC", [(set i32:$result, (PPCdynalloc i32:$negsize, iaddr:$fpsi))]>; +def DYNAREAOFFSET : Pseudo<(outs i32imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET", + [(set i32:$result, (PPCdynareaoffset iaddr:$fpsi))]>; // SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after // instruction selection into a branch sequence. diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index 6d53f876c062..934bdf622418 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -430,6 +430,27 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const { MBB.erase(II); } +void PPCRegisterInfo::lowerDynamicAreaOffset( + MachineBasicBlock::iterator II) const { + // Get the instruction. + MachineInstr &MI = *II; + // Get the instruction's basic block. + MachineBasicBlock &MBB = *MI.getParent(); + // Get the basic block's function. + MachineFunction &MF = *MBB.getParent(); + // Get the frame info. + MachineFrameInfo *MFI = MF.getFrameInfo(); + const PPCSubtarget &Subtarget = MF.getSubtarget(); + // Get the instruction info. + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + + unsigned maxCallFrameSize = MFI->getMaxCallFrameSize(); + DebugLoc dl = MI.getDebugLoc(); + BuildMI(MBB, II, dl, TII.get(PPC::LI), MI.getOperand(0).getReg()) + .addImm(maxCallFrameSize); + MBB.erase(II); +} + /// lowerCRSpilling - Generate the code for spilling a CR register. Instead of /// reserving a whole register (R0), we scrounge for one here. This generates /// code like this: @@ -754,6 +775,11 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // Get the instruction opcode. 
unsigned OpC = MI.getOpcode(); + if ((OpC == PPC::DYNAREAOFFSET || OpC == PPC::DYNAREAOFFSET8)) { + lowerDynamicAreaOffset(II); + return; + } + // Special case for dynamic alloca. if (FPSI && FrameIndex == FPSI && (OpC == PPC::DYNALLOC || OpC == PPC::DYNALLOC8)) { diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h index 1b1e160d836c..b15fde83c9f3 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/lib/Target/PowerPC/PPCRegisterInfo.h @@ -101,6 +101,7 @@ class PPCRegisterInfo : public PPCGenRegisterInfo { } void lowerDynamicAlloc(MachineBasicBlock::iterator II) const; + void lowerDynamicAreaOffset(MachineBasicBlock::iterator II) const; void lowerCRSpilling(MachineBasicBlock::iterator II, unsigned FrameIndex) const; void lowerCRRestore(MachineBasicBlock::iterator II, diff --git a/test/CodeGen/PowerPC/dyn-alloca-offset.ll b/test/CodeGen/PowerPC/dyn-alloca-offset.ll new file mode 100644 index 000000000000..7159b9da736d --- /dev/null +++ b/test/CodeGen/PowerPC/dyn-alloca-offset.ll @@ -0,0 +1,21 @@ +; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 < %s | FileCheck %s +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +declare i64 @llvm.get.dynamic.area.offset.i64() + +declare i64 @bar(i64) + +attributes #0 = { nounwind } + +; Function Attrs: nounwind sanitize_address uwtable +define signext i64 @foo(i32 signext %N, i32 signext %M) #0 { + %1 = alloca i64, align 32 + %dynamic_area_offset = call i64 @llvm.get.dynamic.area.offset.i64() + %2 = call i64 @bar(i64 %dynamic_area_offset) + ret i64 %2 + +; CHECK-DAG: li [[REG1:[0-9]+]], 112 +; CHECK: blr + +} From c309d733467cec5a6b04706a6bccc0f399ad0fdf Mon Sep 17 00:00:00 2001 From: Hrvoje Varga Date: Tue, 1 Dec 2015 11:59:21 +0000 Subject: [PATCH 072/186] [mips][microMIPS] Implement RECIP.fmt, RINT.fmt, ROUND.L.fmt, ROUND.W.fmt, SEL.fmt, SELEQZ.fmt, 
SELNEQZ.fmt and CLASS.fmt Differential Revision: http://reviews.llvm.org/D13885 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254405 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/Mips/MicroMips32r6InstrFormats.td | 62 +++++++++++++++ lib/Target/Mips/MicroMips32r6InstrInfo.td | 75 +++++++++++++++++++ lib/Target/Mips/MicroMipsInstrFPU.td | 4 +- lib/Target/Mips/Mips32r6InstrInfo.td | 44 ++++++----- lib/Target/Mips/MipsInstrFPU.td | 10 +-- .../Disassembler/Mips/micromips32r6/valid.txt | 16 ++++ .../Disassembler/Mips/micromips64r6/valid.txt | 16 ++++ test/MC/Mips/micromips32r6/valid.s | 16 ++++ test/MC/Mips/micromips64r6/valid.s | 16 ++++ 9 files changed, 233 insertions(+), 26 deletions(-) diff --git a/lib/Target/Mips/MicroMips32r6InstrFormats.td b/lib/Target/Mips/MicroMips32r6InstrFormats.td index c4cdb0c2fadd..349b3b88a07a 100644 --- a/lib/Target/Mips/MicroMips32r6InstrFormats.td +++ b/lib/Target/Mips/MicroMips32r6InstrFormats.td @@ -796,3 +796,65 @@ class POOL32A_WRPGPR_WSBH_FM_MMR6 funct> : MipsR6Inst { let Inst{15-6} = funct; let Inst{5-0} = 0x3c; } + +class POOL32F_RECIP_ROUND_FM_MMR6 fmt, bits<8> funct> + : MMR6Arch, MipsR6Inst { + bits<5> ft; + bits<5> fs; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15} = 0; + let Inst{14} = fmt; + let Inst{13-6} = funct; + let Inst{5-0} = 0b111011; +} + +class POOL32F_RINT_FM_MMR6 fmt> + : MMR6Arch, MipsR6Inst { + bits<5> fs; + bits<5> fd; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = fs; + let Inst{20-16} = fd; + let Inst{15-11} = 0; + let Inst{10-9} = fmt; + let Inst{8-0} = 0b000100000; +} + +class POOL32F_SEL_FM_MMR6 fmt, bits<9> funct> + : MMR6Arch, MipsR6Inst { + bits<5> ft; + bits<5> fs; + bits<5> fd; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15-11} = fd; + let Inst{10-9} = fmt; + let Inst{8-0} = funct; +} + +class POOL32F_CLASS_FM_MMR6 fmt, bits<9> 
funct> + : MMR6Arch, MipsR6Inst { + bits<5> fs; + bits<5> fd; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = fs; + let Inst{20-16} = fd; + let Inst{15-11} = 0b00000; + let Inst{10-9} = fmt; + let Inst{8-0} = funct; +} diff --git a/lib/Target/Mips/MicroMips32r6InstrInfo.td b/lib/Target/Mips/MicroMips32r6InstrInfo.td index 2dbd20cfad99..8c744d8924bb 100644 --- a/lib/Target/Mips/MicroMips32r6InstrInfo.td +++ b/lib/Target/Mips/MicroMips32r6InstrInfo.td @@ -132,6 +132,26 @@ class LLE_MMR6_ENC : LOAD_WORD_EVA_FM_MMR6<0b110>; class LWE_MMR6_ENC : LOAD_WORD_EVA_FM_MMR6<0b111>; class LW_MMR6_ENC : LOAD_WORD_FM_MMR6; class LUI_MMR6_ENC : LOAD_UPPER_IMM_FM_MMR6; +class RECIP_S_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"recip.s", 0, 0b01001000>; +class RECIP_D_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"recip.d", 1, 0b01001000>; +class RINT_S_MMR6_ENC : POOL32F_RINT_FM_MMR6<"rint.s", 0>; +class RINT_D_MMR6_ENC : POOL32F_RINT_FM_MMR6<"rint.d", 1>; +class ROUND_L_S_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.l.s", 0, + 0b11001100>; +class ROUND_L_D_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.l.d", 1, + 0b11001100>; +class ROUND_W_S_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.w.s", 0, + 0b11101100>; +class ROUND_W_D_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.w.d", 1, + 0b11101100>; +class SEL_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"sel.s", 0, 0b010111000>; +class SEL_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"sel.d", 1, 0b010111000>; +class SELEQZ_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"seleqz.s", 0, 0b000111000>; +class SELEQZ_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"seleqz.d", 1, 0b000111000>; +class SELENZ_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"selenz.s", 0, 0b001111000>; +class SELENZ_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"selenz.d", 1, 0b001111000>; +class CLASS_S_MMR6_ENC : POOL32F_CLASS_FM_MMR6<"class.s", 0, 0b001100000>; +class CLASS_D_MMR6_ENC : POOL32F_CLASS_FM_MMR6<"class.d", 1, 0b001100000>; class ADDU16_MMR6_ENC : POOL16A_ADDU16_FM_MMR6; class AND16_MMR6_ENC : 
POOL16C_AND16_FM_MMR6; @@ -724,6 +744,33 @@ class RSQRT_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"rsqrt.s", FGR32Opnd, FGR32Opnd, II_TRUNC>; class RSQRT_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"rsqrt.d", FGR32Opnd, AFGR64Opnd, II_TRUNC>; +class RECIP_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"recip.s", FGR32Opnd, + FGR32Opnd, II_ROUND>; +class RECIP_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"recip.d", FGR32Opnd, FGR32Opnd, + II_ROUND>; +class ROUND_L_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.l.s", FGR64Opnd, + FGR32Opnd, II_ROUND>; +class ROUND_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.l.d", FGR64Opnd, + FGR64Opnd, II_ROUND>; +class ROUND_W_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.w.s", FGR32Opnd, + FGR32Opnd, II_ROUND>; +class ROUND_W_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.w.d", FGR64Opnd, + FGR64Opnd, II_ROUND>; + +class SEL_S_MMR6_DESC : COP1_SEL_DESC_BASE<"sel.s", FGR32Opnd>; +class SEL_D_MMR6_DESC : COP1_SEL_DESC_BASE<"sel.d", FGR64Opnd> { + // We must insert a SUBREG_TO_REG around $fd_in + bit usesCustomInserter = 1; +} + +class SELEQZ_S_MMR6_DESC : SELEQNEZ_DESC_BASE<"seleqz.s", FGR32Opnd>; +class SELEQZ_D_MMR6_DESC : SELEQNEZ_DESC_BASE<"seleqz.d", FGR64Opnd>; +class SELENZ_S_MMR6_DESC : SELEQNEZ_DESC_BASE<"selnez.s", FGR32Opnd>; +class SELENZ_D_MMR6_DESC : SELEQNEZ_DESC_BASE<"selnez.d", FGR64Opnd>; +class RINT_S_MMR6_DESC : CLASS_RINT_DESC_BASE<"rint.s", FGR32Opnd>; +class RINT_D_MMR6_DESC : CLASS_RINT_DESC_BASE<"rint.d", FGR64Opnd>; +class CLASS_S_MMR6_DESC : CLASS_RINT_DESC_BASE<"class.s", FGR32Opnd>; +class CLASS_D_MMR6_DESC : CLASS_RINT_DESC_BASE<"class.d", FGR64Opnd>; class STORE_MMR6_DESC_BASE : Store, MMR6Arch { @@ -1121,6 +1168,34 @@ def SUBU16_MMR6 : StdMMR6Rel, SUBU16_MMR6_DESC, SUBU16_MMR6_ENC, ISA_MICROMIPS32R6; def XOR16_MMR6 : StdMMR6Rel, XOR16_MMR6_DESC, XOR16_MMR6_ENC, ISA_MICROMIPS32R6; +def RECIP_S_MMR6 : StdMMR6Rel, RECIP_S_MMR6_ENC, RECIP_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def RECIP_D_MMR6 : StdMMR6Rel, RECIP_D_MMR6_ENC, RECIP_D_MMR6_DESC, 
ISA_MICROMIPS32R6; +def RINT_S_MMR6 : StdMMR6Rel, RINT_S_MMR6_ENC, RINT_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def RINT_D_MMR6 : StdMMR6Rel, RINT_D_MMR6_ENC, RINT_D_MMR6_DESC, ISA_MICROMIPS32R6; +def ROUND_L_S_MMR6 : StdMMR6Rel, ROUND_L_S_MMR6_ENC, ROUND_L_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def ROUND_L_D_MMR6 : StdMMR6Rel, ROUND_L_D_MMR6_ENC, ROUND_L_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def ROUND_W_S_MMR6 : StdMMR6Rel, ROUND_W_S_MMR6_ENC, ROUND_W_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def ROUND_W_D_MMR6 : StdMMR6Rel, ROUND_W_D_MMR6_ENC, ROUND_W_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def SEL_S_MMR6 : StdMMR6Rel, SEL_S_MMR6_ENC, SEL_S_MMR6_DESC, ISA_MICROMIPS32R6; +def SEL_D_MMR6 : StdMMR6Rel, SEL_D_MMR6_ENC, SEL_D_MMR6_DESC, ISA_MICROMIPS32R6; +def SELEQZ_S_MMR6 : StdMMR6Rel, SELEQZ_S_MMR6_ENC, SELEQZ_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def SELEQZ_D_MMR6 : StdMMR6Rel, SELEQZ_D_MMR6_ENC, SELEQZ_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def SELENZ_S_MMR6 : StdMMR6Rel, SELENZ_S_MMR6_ENC, SELENZ_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def SELENZ_D_MMR6 : StdMMR6Rel, SELENZ_D_MMR6_ENC, SELENZ_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def CLASS_S_MMR6 : StdMMR6Rel, CLASS_S_MMR6_ENC, CLASS_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def CLASS_D_MMR6 : StdMMR6Rel, CLASS_D_MMR6_ENC, CLASS_D_MMR6_DESC, + ISA_MICROMIPS32R6; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MicroMipsInstrFPU.td b/lib/Target/Mips/MicroMipsInstrFPU.td index 120a841c3d9d..756e6c92c1d1 100644 --- a/lib/Target/Mips/MicroMipsInstrFPU.td +++ b/lib/Target/Mips/MicroMipsInstrFPU.td @@ -43,7 +43,7 @@ def BC1T_MM : MMRel, BC1F_FT<"bc1t", brtarget_mm, II_BC1T, MIPS_BRANCH_T>, BC1F_FM_MM<0x1d>, ISA_MIPS1_NOT_32R6_64R6; def CVT_W_S_MM : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>, ROUND_W_FM_MM<0, 0x24>; -def ROUND_W_S_MM : MMRel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>, +def ROUND_W_S_MM : MMRel, StdMMR6Rel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>, 
ROUND_W_FM_MM<0, 0xec>; def CEIL_W_MM : MMRel, ABSS_FT<"ceil.w.d", FGR32Opnd, AFGR64Opnd, II_CEIL>, @@ -52,7 +52,7 @@ def CVT_W_MM : MMRel, ABSS_FT<"cvt.w.d", FGR32Opnd, AFGR64Opnd, II_CVT>, ROUND_W_FM_MM<1, 0x24>; def FLOOR_W_MM : MMRel, ABSS_FT<"floor.w.d", FGR32Opnd, AFGR64Opnd, II_FLOOR>, ROUND_W_FM_MM<1, 0x2c>; -def ROUND_W_MM : MMRel, ABSS_FT<"round.w.d", FGR32Opnd, AFGR64Opnd, II_ROUND>, +def ROUND_W_MM : MMRel, StdMMR6Rel, ABSS_FT<"round.w.d", FGR32Opnd, AFGR64Opnd, II_ROUND>, ROUND_W_FM_MM<1, 0xec>; def TRUNC_W_MM : MMRel, ABSS_FT<"trunc.w.d", FGR32Opnd, AFGR64Opnd, II_TRUNC>, ROUND_W_FM_MM<1, 0xac>; diff --git a/lib/Target/Mips/Mips32r6InstrInfo.td b/lib/Target/Mips/Mips32r6InstrInfo.td index 9dd4d1e034e9..c36a45acbf79 100644 --- a/lib/Target/Mips/Mips32r6InstrInfo.td +++ b/lib/Target/Mips/Mips32r6InstrInfo.td @@ -687,8 +687,10 @@ def BNEZC : BNEZC_ENC, BNEZC_DESC, ISA_MIPS32R6; def BNVC : BNVC_ENC, BNVC_DESC, ISA_MIPS32R6; def BOVC : BOVC_ENC, BOVC_DESC, ISA_MIPS32R6; def CACHE_R6 : R6MMR6Rel, CACHE_ENC, CACHE_DESC, ISA_MIPS32R6; -def CLASS_D : CLASS_D_ENC, CLASS_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def CLASS_S : CLASS_S_ENC, CLASS_S_DESC, ISA_MIPS32R6, HARDFLOAT; +let AdditionalPredicates = [NotInMicroMips] in { + def CLASS_D : CLASS_D_ENC, CLASS_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def CLASS_S : CLASS_S_ENC, CLASS_S_DESC, ISA_MIPS32R6, HARDFLOAT; +} def CLO_R6 : R6MMR6Rel, CLO_R6_ENC, CLO_R6_DESC, ISA_MIPS32R6; def CLZ_R6 : R6MMR6Rel, CLZ_R6_ENC, CLZ_R6_DESC, ISA_MIPS32R6; defm S : CMP_CC_M; @@ -707,14 +709,14 @@ def LWUPC : LWUPC_ENC, LWUPC_DESC, ISA_MIPS32R6; let AdditionalPredicates = [NotInMicroMips] in { def MADDF_S : MADDF_S_ENC, MADDF_S_DESC, ISA_MIPS32R6, HARDFLOAT; def MADDF_D : MADDF_D_ENC, MADDF_D_DESC, ISA_MIPS32R6, HARDFLOAT; - def MAXA_D : MAXA_D_ENC, MAXA_D_DESC, ISA_MIPS32R6, HARDFLOAT; - def MAXA_S : MAXA_S_ENC, MAXA_S_DESC, ISA_MIPS32R6, HARDFLOAT; - def MAX_D : MAX_D_ENC, MAX_D_DESC, ISA_MIPS32R6, HARDFLOAT; - def MAX_S : MAX_S_ENC, 
MAX_S_DESC, ISA_MIPS32R6, HARDFLOAT; - def MINA_D : MINA_D_ENC, MINA_D_DESC, ISA_MIPS32R6, HARDFLOAT; - def MINA_S : MINA_S_ENC, MINA_S_DESC, ISA_MIPS32R6, HARDFLOAT; - def MIN_D : MIN_D_ENC, MIN_D_DESC, ISA_MIPS32R6, HARDFLOAT; - def MIN_S : MIN_S_ENC, MIN_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def MAXA_D : MAXA_D_ENC, MAXA_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def MAXA_S : MAXA_S_ENC, MAXA_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def MAX_D : MAX_D_ENC, MAX_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def MAX_S : MAX_S_ENC, MAX_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def MINA_D : MINA_D_ENC, MINA_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def MINA_S : MINA_S_ENC, MINA_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def MIN_D : MIN_D_ENC, MIN_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def MIN_S : MIN_S_ENC, MIN_S_DESC, ISA_MIPS32R6, HARDFLOAT; } def MOD : R6MMR6Rel, MOD_ENC, MOD_DESC, ISA_MIPS32R6; def MODU : R6MMR6Rel, MODU_ENC, MODU_DESC, ISA_MIPS32R6; @@ -728,21 +730,27 @@ def MUL_R6 : R6MMR6Rel, MUL_R6_ENC, MUL_R6_DESC, ISA_MIPS32R6; def MULU : R6MMR6Rel, MULU_ENC, MULU_DESC, ISA_MIPS32R6; def NAL; // BAL with rd=0 def PREF_R6 : R6MMR6Rel, PREF_ENC, PREF_DESC, ISA_MIPS32R6; -def RINT_D : RINT_D_ENC, RINT_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def RINT_S : RINT_S_ENC, RINT_S_DESC, ISA_MIPS32R6, HARDFLOAT; +let AdditionalPredicates = [NotInMicroMips] in { + def RINT_D : RINT_D_ENC, RINT_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def RINT_S : RINT_S_ENC, RINT_S_DESC, ISA_MIPS32R6, HARDFLOAT; +} def SC_R6 : SC_R6_ENC, SC_R6_DESC, ISA_MIPS32R6; let AdditionalPredicates = [NotInMicroMips] in { def SDBBP_R6 : SDBBP_R6_ENC, SDBBP_R6_DESC, ISA_MIPS32R6; } def SDC2_R6 : SDC2_R6_ENC, SDC2_R6_DESC, ISA_MIPS32R6; def SELEQZ : R6MMR6Rel, SELEQZ_ENC, SELEQZ_DESC, ISA_MIPS32R6, GPR_32; -def SELEQZ_D : SELEQZ_D_ENC, SELEQZ_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def SELEQZ_S : SELEQZ_S_ENC, SELEQZ_S_DESC, ISA_MIPS32R6, HARDFLOAT; +let AdditionalPredicates = [NotInMicroMips] in { + def SELEQZ_D : SELEQZ_D_ENC, SELEQZ_D_DESC, ISA_MIPS32R6, HARDFLOAT; + 
def SELEQZ_S : SELEQZ_S_ENC, SELEQZ_S_DESC, ISA_MIPS32R6, HARDFLOAT; +} def SELNEZ : R6MMR6Rel, SELNEZ_ENC, SELNEZ_DESC, ISA_MIPS32R6, GPR_32; -def SELNEZ_D : SELNEZ_D_ENC, SELNEZ_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def SELNEZ_S : SELNEZ_S_ENC, SELNEZ_S_DESC, ISA_MIPS32R6, HARDFLOAT; -def SEL_D : SEL_D_ENC, SEL_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def SEL_S : SEL_S_ENC, SEL_S_DESC, ISA_MIPS32R6, HARDFLOAT; +let AdditionalPredicates = [NotInMicroMips] in { + def SELNEZ_D : SELNEZ_D_ENC, SELNEZ_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def SELNEZ_S : SELNEZ_S_ENC, SELNEZ_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def SEL_D : SEL_D_ENC, SEL_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def SEL_S : SEL_S_ENC, SEL_S_DESC, ISA_MIPS32R6, HARDFLOAT; +} def SWC2_R6 : SWC2_R6_ENC, SWC2_R6_DESC, ISA_MIPS32R6; //===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td index d1a724944335..377260f89d10 100644 --- a/lib/Target/Mips/MipsInstrFPU.td +++ b/lib/Target/Mips/MipsInstrFPU.td @@ -136,7 +136,7 @@ multiclass ABSS_M { def _D32 : MMRel, ABSS_FT, FGR_32; - def _D64 : ABSS_FT, FGR_64 { + def _D64 : StdMMR6Rel, ABSS_FT, FGR_64 { let DecoderNamespace = "Mips64"; } } @@ -267,31 +267,29 @@ defm D64 : C_COND_M<"d", FGR64Opnd, 17, II_C_CC_D>, ISA_MIPS1_NOT_32R6_64R6, //===----------------------------------------------------------------------===// // Floating Point Instructions //===----------------------------------------------------------------------===// -def ROUND_W_S : MMRel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>, +def ROUND_W_S : MMRel, StdMMR6Rel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>, ABSS_FM<0xc, 16>, ISA_MIPS2; -let AdditionalPredicates = [NotInMicroMips] in { +defm ROUND_W : ROUND_M<"round.w.d", II_ROUND>, ABSS_FM<0xc, 17>, ISA_MIPS2; def TRUNC_W_S : MMRel, StdMMR6Rel, ABSS_FT<"trunc.w.s", FGR32Opnd, FGR32Opnd, II_TRUNC>, ABSS_FM<0xd, 16>, ISA_MIPS2; def CEIL_W_S : MMRel, 
StdMMR6Rel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>, ABSS_FM<0xe, 16>, ISA_MIPS2; def FLOOR_W_S : MMRel, StdMMR6Rel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, II_FLOOR>, ABSS_FM<0xf, 16>, ISA_MIPS2; -} def CVT_W_S : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>, ABSS_FM<0x24, 16>; -defm ROUND_W : ROUND_M<"round.w.d", II_ROUND>, ABSS_FM<0xc, 17>, ISA_MIPS2; defm TRUNC_W : ROUND_M<"trunc.w.d", II_TRUNC>, ABSS_FM<0xd, 17>, ISA_MIPS2; defm CEIL_W : ROUND_M<"ceil.w.d", II_CEIL>, ABSS_FM<0xe, 17>, ISA_MIPS2; defm FLOOR_W : ROUND_M<"floor.w.d", II_FLOOR>, ABSS_FM<0xf, 17>, ISA_MIPS2; defm CVT_W : ROUND_M<"cvt.w.d", II_CVT>, ABSS_FM<0x24, 17>; let DecoderNamespace = "Mips64" in { + let AdditionalPredicates = [NotInMicroMips] in { def ROUND_L_S : ABSS_FT<"round.l.s", FGR64Opnd, FGR32Opnd, II_ROUND>, ABSS_FM<0x8, 16>, FGR_64; def ROUND_L_D64 : ABSS_FT<"round.l.d", FGR64Opnd, FGR64Opnd, II_ROUND>, ABSS_FM<0x8, 17>, FGR_64; - let AdditionalPredicates = [NotInMicroMips] in { def TRUNC_L_S : ABSS_FT<"trunc.l.s", FGR64Opnd, FGR32Opnd, II_TRUNC>, ABSS_FM<0x9, 16>, FGR_64; def TRUNC_L_D64 : ABSS_FT<"trunc.l.d", FGR64Opnd, FGR64Opnd, II_TRUNC>, diff --git a/test/MC/Disassembler/Mips/micromips32r6/valid.txt b/test/MC/Disassembler/Mips/micromips32r6/valid.txt index 82c5d50df92d..5fa2138262a4 100644 --- a/test/MC/Disassembler/Mips/micromips32r6/valid.txt +++ b/test/MC/Disassembler/Mips/micromips32r6/valid.txt @@ -237,3 +237,19 @@ 0xea 0x11 # CHECK: sw16 $4, 4($17) 0xe8 0x11 # CHECK: sw16 $zero, 4($17) 0x45 0x2a # CHECK: swm16 $16, $17, $ra, 8($sp) +0x54 0x44 0x12 0x3b # CHECK: recip.s $f2, $f4 +0x54 0x44 0x52 0x3b # CHECK: recip.d $f2, $f4 +0x54 0x82 0x00 0x20 # CHECK: rint.s $f2, $f4 +0x54 0x82 0x02 0x20 # CHECK: rint.d $f2, $f4 +0x54 0x44 0x33 0x3b # CHECK: round.l.s $f2, $f4 +0x54 0x44 0x73 0x3b # CHECK: round.l.d $f2, $f4 +0x54 0x44 0x3b 0x3b # CHECK: round.w.s $f2, $f4 +0x54 0x44 0x7b 0x3b # CHECK: round.w.d $f2, $f4 +0x54 0x41 0x08 0xb8 # CHECK: sel.s $f1, 
$f1, $f2 +0x54 0x82 0x02 0xb8 # CHECK: sel.d $f0, $f2, $f4 +0x54 0x62 0x08 0x38 # CHECK: seleqz.s $f1, $f2, $f3 +0x55 0x04 0x12 0x38 # CHECK: seleqz.d $f2, $f4, $f8 +0x54 0x62 0x08 0x78 # CHECK: selnez.s $f1, $f2, $f3 +0x55 0x04 0x12 0x78 # CHECK: selnez.d $f2, $f4, $f8 +0x54 0x62 0x00 0x60 # CHECK: class.s $f2, $f3 +0x54 0x82 0x02 0x60 # CHECK: class.d $f2, $f4 diff --git a/test/MC/Disassembler/Mips/micromips64r6/valid.txt b/test/MC/Disassembler/Mips/micromips64r6/valid.txt index fb641c06bfba..10a9687384ea 100644 --- a/test/MC/Disassembler/Mips/micromips64r6/valid.txt +++ b/test/MC/Disassembler/Mips/micromips64r6/valid.txt @@ -150,3 +150,19 @@ 0xea 0x11 # CHECK: sw16 $4, 4($17) 0xe8 0x11 # CHECK: sw16 $zero, 4($17) 0x45 0x2a # CHECK: swm16 $16, $17, $ra, 8($sp) +0x54 0x44 0x12 0x3b # CHECK: recip.s $f2, $f4 +0x54 0x44 0x52 0x3b # CHECK: recip.d $f2, $f4 +0x54 0x82 0x00 0x20 # CHECK: rint.s $f2, $f4 +0x54 0x82 0x02 0x20 # CHECK: rint.d $f2, $f4 +0x54 0x44 0x33 0x3b # CHECK: round.l.s $f2, $f4 +0x54 0x44 0x73 0x3b # CHECK: round.l.d $f2, $f4 +0x54 0x44 0x3b 0x3b # CHECK: round.w.s $f2, $f4 +0x54 0x44 0x7b 0x3b # CHECK: round.w.d $f2, $f4 +0x54 0x41 0x08 0xb8 # CHECK: sel.s $f1, $f1, $f2 +0x54 0x82 0x02 0xb8 # CHECK: sel.d $f0, $f2, $f4 +0x54 0x62 0x08 0x38 # CHECK: seleqz.s $f1, $f2, $f3 +0x55 0x04 0x12 0x38 # CHECK: seleqz.d $f2, $f4, $f8 +0x54 0x62 0x08 0x78 # CHECK: selnez.s $f1, $f2, $f3 +0x55 0x04 0x12 0x78 # CHECK: selnez.d $f2, $f4, $f8 +0x54 0x62 0x00 0x60 # CHECK: class.s $f2, $f3 +0x54 0x82 0x02 0x60 # CHECK: class.d $f2, $f4 diff --git a/test/MC/Mips/micromips32r6/valid.s b/test/MC/Mips/micromips32r6/valid.s index 81d4b6c6d456..194b15e1a4f6 100644 --- a/test/MC/Mips/micromips32r6/valid.s +++ b/test/MC/Mips/micromips32r6/valid.s @@ -230,3 +230,19 @@ lbu $4, 8($5) # CHECK: lbu $4, 8($5) # encoding: [0x14,0x85,0x00,0x08] lbe $4, 8($5) # CHECK: lbe $4, 8($5) # encoding: [0x60,0x85,0x68,0x08] lbue $4, 8($5) # CHECK: lbue $4, 8($5) # encoding: 
[0x60,0x85,0x60,0x08] + recip.s $f2, $f4 # CHECK: recip.s $f2, $f4 # encoding: [0x54,0x44,0x12,0x3b] + recip.d $f2, $f4 # CHECK: recip.d $f2, $f4 # encoding: [0x54,0x44,0x52,0x3b] + rint.s $f2, $f4 # CHECK: rint.s $f2, $f4 # encoding: [0x54,0x82,0x00,0x20] + rint.d $f2, $f4 # CHECK: rint.d $f2, $f4 # encoding: [0x54,0x82,0x02,0x20] + round.l.s $f2, $f4 # CHECK: round.l.s $f2, $f4 # encoding: [0x54,0x44,0x33,0x3b] + round.l.d $f2, $f4 # CHECK: round.l.d $f2, $f4 # encoding: [0x54,0x44,0x73,0x3b] + round.w.s $f2, $f4 # CHECK: round.w.s $f2, $f4 # encoding: [0x54,0x44,0x3b,0x3b] + round.w.d $f2, $f4 # CHECK: round.w.d $f2, $f4 # encoding: [0x54,0x44,0x7b,0x3b] + sel.s $f1, $f1, $f2 # CHECK: sel.s $f1, $f1, $f2 # encoding: [0x54,0x41,0x08,0xb8] + sel.d $f0, $f2, $f4 # CHECK: sel.d $f0, $f2, $f4 # encoding: [0x54,0x82,0x02,0xb8] + seleqz.s $f1, $f2, $f3 # CHECK: seleqz.s $f1, $f2, $f3 # encoding: [0x54,0x62,0x08,0x38] + seleqz.d $f2, $f4, $f8 # CHECK: seleqz.d $f2, $f4, $f8 # encoding: [0x55,0x04,0x12,0x38] + selnez.s $f1, $f2, $f3 # CHECK: selnez.s $f1, $f2, $f3 # encoding: [0x54,0x62,0x08,0x78] + selnez.d $f2, $f4, $f8 # CHECK: selnez.d $f2, $f4, $f8 # encoding: [0x55,0x04,0x12,0x78] + class.s $f2, $f3 # CHECK: class.s $f2, $f3 # encoding: [0x54,0x62,0x00,0x60] + class.d $f2, $f4 # CHECK: class.d $f2, $f4 # encoding: [0x54,0x82,0x02,0x60] diff --git a/test/MC/Mips/micromips64r6/valid.s b/test/MC/Mips/micromips64r6/valid.s index 685810741780..1c8781b6e966 100644 --- a/test/MC/Mips/micromips64r6/valid.s +++ b/test/MC/Mips/micromips64r6/valid.s @@ -130,5 +130,21 @@ a: sw16 $0, 4($17) # CHECK: sw16 $zero, 4($17) # encoding: [0xe8,0x11] swm $16, $17, $ra, 8($sp) # CHECK: swm16 $16, $17, $ra, 8($sp) # encoding: [0x45,0x2a] swm16 $16, $17, $ra, 8($sp) # CHECK: swm16 $16, $17, $ra, 8($sp) # encoding: [0x45,0x2a] + recip.s $f2, $f4 # CHECK: recip.s $f2, $f4 # encoding: [0x54,0x44,0x12,0x3b] + recip.d $f2, $f4 # CHECK: recip.d $f2, $f4 # encoding: [0x54,0x44,0x52,0x3b] + rint.s 
$f2, $f4 # CHECK: rint.s $f2, $f4 # encoding: [0x54,0x82,0x00,0x20] + rint.d $f2, $f4 # CHECK: rint.d $f2, $f4 # encoding: [0x54,0x82,0x02,0x20] + round.l.s $f2, $f4 # CHECK: round.l.s $f2, $f4 # encoding: [0x54,0x44,0x33,0x3b] + round.l.d $f2, $f4 # CHECK: round.l.d $f2, $f4 # encoding: [0x54,0x44,0x73,0x3b] + round.w.s $f2, $f4 # CHECK: round.w.s $f2, $f4 # encoding: [0x54,0x44,0x3b,0x3b] + round.w.d $f2, $f4 # CHECK: round.w.d $f2, $f4 # encoding: [0x54,0x44,0x7b,0x3b] + sel.s $f1, $f1, $f2 # CHECK: sel.s $f1, $f1, $f2 # encoding: [0x54,0x41,0x08,0xb8] + sel.d $f0, $f2, $f4 # CHECK: sel.d $f0, $f2, $f4 # encoding: [0x54,0x82,0x02,0xb8] + seleqz.s $f1, $f2, $f3 # CHECK: seleqz.s $f1, $f2, $f3 # encoding: [0x54,0x62,0x08,0x38] + seleqz.d $f2, $f4, $f8 # CHECK: seleqz.d $f2, $f4, $f8 # encoding: [0x55,0x04,0x12,0x38] + selnez.s $f1, $f2, $f3 # CHECK: selnez.s $f1, $f2, $f3 # encoding: [0x54,0x62,0x08,0x78] + selnez.d $f2, $f4, $f8 # CHECK: selnez.d $f2, $f4, $f8 # encoding: [0x55,0x04,0x12,0x78] + class.s $f2, $f3 # CHECK: class.s $f2, $f3 # encoding: [0x54,0x62,0x00,0x60] + class.d $f2, $f4 # CHECK: class.d $f2, $f4 # encoding: [0x54,0x82,0x02,0x60] 1: From ef5008e6d0966564098a04dbddb058a01c195062 Mon Sep 17 00:00:00 2001 From: Elena Demikhovsky Date: Tue, 1 Dec 2015 12:08:36 +0000 Subject: [PATCH 073/186] Fixed a failure in cost calculation for vector GEP Cost calculation for vector GEP failed with due to invalid cast to GEP index operand. The bug is fixed, added a test. 
http://reviews.llvm.org/D14976 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254408 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../llvm/Analysis/TargetTransformInfoImpl.h | 18 +++++++++++++----- include/llvm/Analysis/VectorUtils.h | 2 +- lib/Analysis/VectorUtils.cpp | 7 ++++--- .../SelectionDAG/SelectionDAGBuilder.cpp | 16 ++++++++-------- test/Analysis/CostModel/X86/vector_gep.ll | 17 +++++++++++++++++ 5 files changed, 43 insertions(+), 17 deletions(-) create mode 100644 test/Analysis/CostModel/X86/vector_gep.ll diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h index 415a85e99069..eedf1a61ba82 100644 --- a/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -22,6 +22,7 @@ #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/Operator.h" #include "llvm/IR/Type.h" +#include "llvm/Analysis/VectorUtils.h" namespace llvm { @@ -415,21 +416,28 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase { (Ptr == nullptr ? 0 : Ptr->getType()->getPointerAddressSpace()); auto GTI = gep_type_begin(PointerType::get(PointeeType, AS), Operands); for (auto I = Operands.begin(); I != Operands.end(); ++I, ++GTI) { + // We assume that the cost of Scalar GEP with constant index and the + // cost of Vector GEP with splat constant index are the same. + const ConstantInt *ConstIdx = dyn_cast(*I); + if (!ConstIdx) + if (auto Splat = getSplatValue(*I)) + ConstIdx = dyn_cast(Splat); if (isa(*GTI)) { int64_t ElementSize = DL.getTypeAllocSize(GTI.getIndexedType()); - if (const ConstantInt *ConstIdx = dyn_cast(*I)) { + if (ConstIdx) BaseOffset += ConstIdx->getSExtValue() * ElementSize; - } else { + else { // Needs scale register. - if (Scale != 0) { + if (Scale != 0) // No addressing mode takes two scale registers. 
return TTI::TCC_Basic; - } Scale = ElementSize; } } else { StructType *STy = cast(*GTI); - uint64_t Field = cast(*I)->getZExtValue(); + // For structures the index is always splat or scalar constant + assert(ConstIdx && "Unexpected GEP index"); + uint64_t Field = ConstIdx->getZExtValue(); BaseOffset += DL.getStructLayout(STy)->getElementOffset(Field); } } diff --git a/include/llvm/Analysis/VectorUtils.h b/include/llvm/Analysis/VectorUtils.h index 48ef76a9c8da..531803adf5e4 100644 --- a/include/llvm/Analysis/VectorUtils.h +++ b/include/llvm/Analysis/VectorUtils.h @@ -86,7 +86,7 @@ Value *findScalarElement(Value *V, unsigned EltNo); /// \brief Get splat value if the input is a splat vector or return nullptr. /// The value may be extracted from a splat constants vector or from /// a sequence of instructions that broadcast a single value into a vector. -Value *getSplatValue(Value *V); +const Value *getSplatValue(const Value *V); /// \brief Compute a map of integer instructions to their minimum legal type /// size. diff --git a/lib/Analysis/VectorUtils.cpp b/lib/Analysis/VectorUtils.cpp index fbf387b3ee20..e461294c1bee 100644 --- a/lib/Analysis/VectorUtils.cpp +++ b/lib/Analysis/VectorUtils.cpp @@ -417,9 +417,10 @@ Value *llvm::findScalarElement(Value *V, unsigned EltNo) { /// the input value is (1) a splat constants vector or (2) a sequence /// of instructions that broadcast a single value into a vector. 
/// -llvm::Value *llvm::getSplatValue(Value *V) { - if (auto *CV = dyn_cast(V)) - return CV->getSplatValue(); +const llvm::Value *llvm::getSplatValue(const Value *V) { + + if (auto *C = dyn_cast(V)) + return C->getSplatValue(); auto *ShuffleInst = dyn_cast(V); if (!ShuffleInst) diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 38b8bced2399..85e7e3c1bc8c 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -3301,18 +3301,18 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I) { // extract the spalt value and use it as a uniform base. // In all other cases the function returns 'false'. // -static bool getUniformBase(Value *& Ptr, SDValue& Base, SDValue& Index, +static bool getUniformBase(const Value *& Ptr, SDValue& Base, SDValue& Index, SelectionDAGBuilder* SDB) { SelectionDAG& DAG = SDB->DAG; LLVMContext &Context = *DAG.getContext(); assert(Ptr->getType()->isVectorTy() && "Uexpected pointer type"); - GetElementPtrInst *GEP = dyn_cast(Ptr); + const GetElementPtrInst *GEP = dyn_cast(Ptr); if (!GEP || GEP->getNumOperands() > 2) return false; - Value *GEPPtr = GEP->getPointerOperand(); + const Value *GEPPtr = GEP->getPointerOperand(); if (!GEPPtr->getType()->isVectorTy()) Ptr = GEPPtr; else if (!(Ptr = getSplatValue(GEPPtr))) @@ -3348,7 +3348,7 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) { SDLoc sdl = getCurSDLoc(); // llvm.masked.scatter.*(Src0, Ptrs, alignemt, Mask) - Value *Ptr = I.getArgOperand(1); + const Value *Ptr = I.getArgOperand(1); SDValue Src0 = getValue(I.getArgOperand(0)); SDValue Mask = getValue(I.getArgOperand(3)); EVT VT = Src0.getValueType(); @@ -3362,10 +3362,10 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) { SDValue Base; SDValue Index; - Value *BasePtr = Ptr; + const Value *BasePtr = Ptr; bool UniformBase = getUniformBase(BasePtr, Base, Index, this); - Value 
*MemOpBasePtr = UniformBase ? BasePtr : nullptr; + const Value *MemOpBasePtr = UniformBase ? BasePtr : nullptr; MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MachinePointerInfo(MemOpBasePtr), MachineMemOperand::MOStore, VT.getStoreSize(), @@ -3425,7 +3425,7 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) { SDLoc sdl = getCurSDLoc(); // @llvm.masked.gather.*(Ptrs, alignment, Mask, Src0) - Value *Ptr = I.getArgOperand(0); + const Value *Ptr = I.getArgOperand(0); SDValue Src0 = getValue(I.getArgOperand(3)); SDValue Mask = getValue(I.getArgOperand(2)); @@ -3442,7 +3442,7 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) { SDValue Root = DAG.getRoot(); SDValue Base; SDValue Index; - Value *BasePtr = Ptr; + const Value *BasePtr = Ptr; bool UniformBase = getUniformBase(BasePtr, Base, Index, this); bool ConstantMemory = false; if (UniformBase && diff --git a/test/Analysis/CostModel/X86/vector_gep.ll b/test/Analysis/CostModel/X86/vector_gep.ll new file mode 100644 index 000000000000..e49f25871d66 --- /dev/null +++ b/test/Analysis/CostModel/X86/vector_gep.ll @@ -0,0 +1,17 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-linux-unknown-unknown -mattr=+avx512f | FileCheck %s + +%struct.S = type { [1000 x i32] } + + +declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>) + +define <4 x i32> @foov(<4 x %struct.S*> %s, i64 %base){ + %temp = insertelement <4 x i64> undef, i64 %base, i32 0 + %vector = shufflevector <4 x i64> %temp, <4 x i64> undef, <4 x i32> zeroinitializer +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds %struct.S + %B = getelementptr inbounds %struct.S, <4 x %struct.S*> %s, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds [1000 x i32] + %arrayidx = getelementptr inbounds [1000 x i32], <4 x [1000 x i32]*> %B, <4 x i64> zeroinitializer, <4 x i64> %vector + %res = call <4 x i32> 
@llvm.masked.gather.v4i32(<4 x i32*> %arrayidx, i32 4, <4 x i1> , <4 x i32> undef) + ret <4 x i32> %res +} From 235dd3bb99ceb616e6688ad74427bc8d519c662f Mon Sep 17 00:00:00 2001 From: Elena Demikhovsky Date: Tue, 1 Dec 2015 12:30:40 +0000 Subject: [PATCH 074/186] Fixed a failure in getSpaltValue() git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254409 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Analysis/VectorUtils.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/Analysis/VectorUtils.cpp b/lib/Analysis/VectorUtils.cpp index e461294c1bee..5fb517e8edb5 100644 --- a/lib/Analysis/VectorUtils.cpp +++ b/lib/Analysis/VectorUtils.cpp @@ -420,7 +420,8 @@ Value *llvm::findScalarElement(Value *V, unsigned EltNo) { const llvm::Value *llvm::getSplatValue(const Value *V) { if (auto *C = dyn_cast(V)) - return C->getSplatValue(); + if (isa(V->getType())) + return C->getSplatValue(); auto *ShuffleInst = dyn_cast(V); if (!ShuffleInst) From 8de1fedc98a9c57039597498438a31ab8b1a6c23 Mon Sep 17 00:00:00 2001 From: Elena Demikhovsky Date: Tue, 1 Dec 2015 12:35:03 +0000 Subject: [PATCH 075/186] AVX-512: regenerated test for avx512 arithmetics, NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254410 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/avx512-arith.ll | 283 ++++++++++++++++++++++++------- 1 file changed, 222 insertions(+), 61 deletions(-) diff --git a/test/CodeGen/X86/avx512-arith.ll b/test/CodeGen/X86/avx512-arith.ll index 522abd261472..d7da77a5eb54 100644 --- a/test/CodeGen/X86/avx512-arith.ll +++ b/test/CodeGen/X86/avx512-arith.ll @@ -1,4 +1,8 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck --check-prefix=CHECK --check-prefix=AVX512F %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vl | FileCheck --check-prefix=CHECK --check-prefix=AVX512VL %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512bw 
| FileCheck --check-prefix=CHECK --check-prefix=AVX512BW %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512dq | FileCheck --check-prefix=CHECK --check-prefix=AVX512DQ %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512dq -mattr=+avx512bw -mattr=+avx512vl| FileCheck --check-prefix=CHECK --check-prefix=SKX %s define <8 x double> @addpd512(<8 x double> %y, <8 x double> %x) { ; CHECK-LABEL: addpd512: @@ -83,18 +87,54 @@ entry: } define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) { -; CHECK-LABEL: imulq512: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmuludq %zmm0, %zmm1, %zmm2 -; CHECK-NEXT: vpsrlq $32, %zmm0, %zmm3 -; CHECK-NEXT: vpmuludq %zmm3, %zmm1, %zmm3 -; CHECK-NEXT: vpsllq $32, %zmm3, %zmm3 -; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2 -; CHECK-NEXT: vpsrlq $32, %zmm1, %zmm1 -; CHECK-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: vpsllq $32, %zmm0, %zmm0 -; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 -; CHECK-NEXT: retq +; AVX512F-LABEL: imulq512: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpmuludq %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vpsrlq $32, %zmm0, %zmm3 +; AVX512F-NEXT: vpmuludq %zmm3, %zmm1, %zmm3 +; AVX512F-NEXT: vpsllq $32, %zmm3, %zmm3 +; AVX512F-NEXT: vpaddq %zmm3, %zmm2, %zmm2 +; AVX512F-NEXT: vpsrlq $32, %zmm1, %zmm1 +; AVX512F-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpsllq $32, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm0, %zmm2, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: imulq512: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpmuludq %zmm0, %zmm1, %zmm2 +; AVX512VL-NEXT: vpsrlq $32, %zmm0, %zmm3 +; AVX512VL-NEXT: vpmuludq %zmm3, %zmm1, %zmm3 +; AVX512VL-NEXT: vpsllq $32, %zmm3, %zmm3 +; AVX512VL-NEXT: vpaddq %zmm3, %zmm2, %zmm2 +; AVX512VL-NEXT: vpsrlq $32, %zmm1, %zmm1 +; AVX512VL-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 +; AVX512VL-NEXT: vpsllq $32, %zmm0, %zmm0 +; AVX512VL-NEXT: vpaddq %zmm0, %zmm2, %zmm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: imulq512: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpmuludq %zmm0, %zmm1, %zmm2 
+; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm3 +; AVX512BW-NEXT: vpmuludq %zmm3, %zmm1, %zmm3 +; AVX512BW-NEXT: vpsllq $32, %zmm3, %zmm3 +; AVX512BW-NEXT: vpaddq %zmm3, %zmm2, %zmm2 +; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpsllq $32, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: imulq512: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: vpmullq %zmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: retq +; +; SKX-LABEL: imulq512: +; SKX: ## BB#0: +; SKX-NEXT: vpmullq %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq %z = mul <8 x i64>%x, %y ret <8 x i64>%z } @@ -463,10 +503,13 @@ entry: ret <8 x i64>%d } -; CHECK-LABEL: test_mask_vaddps -; CHECK: vaddps {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}} -; CHECK: ret define <16 x float> @test_mask_vaddps(<16 x float> %dst, <16 x float> %i, +; CHECK-LABEL: test_mask_vaddps: +; CHECK: ## BB#0: +; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4 +; CHECK-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq <16 x float> %j, <16 x i32> %mask1) nounwind readnone { %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -475,10 +518,13 @@ define <16 x float> @test_mask_vaddps(<16 x float> %dst, <16 x float> %i, ret <16 x float> %r } -; CHECK-LABEL: test_mask_vmulps -; CHECK: vmulps {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}} -; CHECK: ret define <16 x float> @test_mask_vmulps(<16 x float> %dst, <16 x float> %i, +; CHECK-LABEL: test_mask_vmulps: +; CHECK: ## BB#0: +; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4 +; CHECK-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; CHECK-NEXT: vmulps %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq <16 x float> %j, <16 x i32> %mask1) nounwind readnone { %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -487,10 +533,13 @@ define <16 x float> @test_mask_vmulps(<16 x float> %dst, <16 x float> %i, ret <16 x float> %r } -; CHECK-LABEL: test_mask_vminps -; CHECK: 
vminps {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}} -; CHECK: ret define <16 x float> @test_mask_vminps(<16 x float> %dst, <16 x float> %i, +; CHECK-LABEL: test_mask_vminps: +; CHECK: ## BB#0: +; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4 +; CHECK-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; CHECK-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq <16 x float> %j, <16 x i32> %mask1) nounwind readnone { %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -500,10 +549,41 @@ define <16 x float> @test_mask_vminps(<16 x float> %dst, <16 x float> %i, ret <16 x float> %r } -; CHECK-LABEL: test_mask_vminpd -; CHECK: vminpd {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}} -; CHECK: ret define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i, +; AVX512F-LABEL: test_mask_vminpd: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512F-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; AVX512F-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: test_mask_vminpd: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 +; AVX512VL-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: test_mask_vminpd: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512BW-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; AVX512BW-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: test_mask_vminpd: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512DQ-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; AVX512DQ-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} +; AVX512DQ-NEXT: retq +; +; SKX-LABEL: test_mask_vminpd: +; SKX: ## BB#0: +; SKX-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; SKX-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 +; SKX-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} +; SKX-NEXT: retq <8 x double> %j, <8 x i32> %mask1) nounwind readnone { %mask = icmp ne <8 x i32> %mask1, zeroinitializer @@ -513,10 
+593,13 @@ define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i, ret <8 x double> %r } -; CHECK-LABEL: test_mask_vmaxps -; CHECK: vmaxps {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}} -; CHECK: ret define <16 x float> @test_mask_vmaxps(<16 x float> %dst, <16 x float> %i, +; CHECK-LABEL: test_mask_vmaxps: +; CHECK: ## BB#0: +; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4 +; CHECK-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; CHECK-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq <16 x float> %j, <16 x i32> %mask1) nounwind readnone { %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -526,10 +609,41 @@ define <16 x float> @test_mask_vmaxps(<16 x float> %dst, <16 x float> %i, ret <16 x float> %r } -; CHECK-LABEL: test_mask_vmaxpd -; CHECK: vmaxpd {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}} -; CHECK: ret define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i, +; AVX512F-LABEL: test_mask_vmaxpd: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512F-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; AVX512F-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: test_mask_vmaxpd: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 +; AVX512VL-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: test_mask_vmaxpd: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512BW-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; AVX512BW-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: test_mask_vmaxpd: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512DQ-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; AVX512DQ-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} +; AVX512DQ-NEXT: retq +; +; SKX-LABEL: test_mask_vmaxpd: +; SKX: ## BB#0: +; SKX-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; SKX-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 +; SKX-NEXT: vmaxpd %zmm2, %zmm1, 
%zmm0 {%k1} +; SKX-NEXT: retq <8 x double> %j, <8 x i32> %mask1) nounwind readnone { %mask = icmp ne <8 x i32> %mask1, zeroinitializer @@ -539,10 +653,13 @@ define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i, ret <8 x double> %r } -; CHECK-LABEL: test_mask_vsubps -; CHECK: vsubps {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}} -; CHECK: ret define <16 x float> @test_mask_vsubps(<16 x float> %dst, <16 x float> %i, +; CHECK-LABEL: test_mask_vsubps: +; CHECK: ## BB#0: +; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4 +; CHECK-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; CHECK-NEXT: vsubps %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq <16 x float> %j, <16 x i32> %mask1) nounwind readnone { %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -551,10 +668,13 @@ define <16 x float> @test_mask_vsubps(<16 x float> %dst, <16 x float> %i, ret <16 x float> %r } -; CHECK-LABEL: test_mask_vdivps -; CHECK: vdivps {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}} -; CHECK: ret define <16 x float> @test_mask_vdivps(<16 x float> %dst, <16 x float> %i, +; CHECK-LABEL: test_mask_vdivps: +; CHECK: ## BB#0: +; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4 +; CHECK-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; CHECK-NEXT: vdivps %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq <16 x float> %j, <16 x i32> %mask1) nounwind readnone { %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -563,10 +683,13 @@ define <16 x float> @test_mask_vdivps(<16 x float> %dst, <16 x float> %i, ret <16 x float> %r } -; CHECK-LABEL: test_mask_vaddpd -; CHECK: vaddpd {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}} -; CHECK: ret define <8 x double> @test_mask_vaddpd(<8 x double> %dst, <8 x double> %i, +; CHECK-LABEL: test_mask_vaddpd: +; CHECK: ## BB#0: +; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4 +; CHECK-NEXT: vpcmpneqq %zmm4, %zmm3, %k1 +; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq <8 x double> %j, <8 x i64> %mask1) nounwind readnone { %mask = icmp ne <8 x i64> 
%mask1, zeroinitializer @@ -575,10 +698,13 @@ define <8 x double> @test_mask_vaddpd(<8 x double> %dst, <8 x double> %i, ret <8 x double> %r } -; CHECK-LABEL: test_maskz_vaddpd -; CHECK: vaddpd {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} {z}}} -; CHECK: ret define <8 x double> @test_maskz_vaddpd(<8 x double> %i, <8 x double> %j, +; CHECK-LABEL: test_maskz_vaddpd: +; CHECK: ## BB#0: +; CHECK-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; CHECK-NEXT: vpcmpneqq %zmm3, %zmm2, %k1 +; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq <8 x i64> %mask1) nounwind readnone { %mask = icmp ne <8 x i64> %mask1, zeroinitializer %x = fadd <8 x double> %i, %j @@ -586,10 +712,13 @@ define <8 x double> @test_maskz_vaddpd(<8 x double> %i, <8 x double> %j, ret <8 x double> %r } -; CHECK-LABEL: test_mask_fold_vaddpd -; CHECK: vaddpd (%rdi), {{.*%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}.*}} -; CHECK: ret define <8 x double> @test_mask_fold_vaddpd(<8 x double> %dst, <8 x double> %i, +; CHECK-LABEL: test_mask_fold_vaddpd: +; CHECK: ## BB#0: +; CHECK-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; CHECK-NEXT: vpcmpneqq %zmm3, %zmm2, %k1 +; CHECK-NEXT: vaddpd (%rdi), %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq <8 x double>* %j, <8 x i64> %mask1) nounwind { %mask = icmp ne <8 x i64> %mask1, zeroinitializer @@ -599,10 +728,13 @@ define <8 x double> @test_mask_fold_vaddpd(<8 x double> %dst, <8 x double> %i, ret <8 x double> %r } -; CHECK-LABEL: test_maskz_fold_vaddpd -; CHECK: vaddpd (%rdi), {{.*%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} {z}.*}} -; CHECK: ret define <8 x double> @test_maskz_fold_vaddpd(<8 x double> %i, <8 x double>* %j, +; CHECK-LABEL: test_maskz_fold_vaddpd: +; CHECK: ## BB#0: +; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 +; CHECK-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vaddpd (%rdi), %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq <8 x i64> %mask1) nounwind { %mask = icmp ne <8 x i64> %mask1, zeroinitializer %tmp = load <8 x double>, <8 x double>* %j, align 8 @@ -611,10 
+743,11 @@ define <8 x double> @test_maskz_fold_vaddpd(<8 x double> %i, <8 x double>* %j, ret <8 x double> %r } -; CHECK-LABEL: test_broadcast_vaddpd -; CHECK: vaddpd (%rdi){1to8}, %zmm{{.*}} -; CHECK: ret define <8 x double> @test_broadcast_vaddpd(<8 x double> %i, double* %j) nounwind { +; CHECK-LABEL: test_broadcast_vaddpd: +; CHECK: ## BB#0: +; CHECK-NEXT: vaddpd (%rdi){1to8}, %zmm0, %zmm0 +; CHECK-NEXT: retq %tmp = load double, double* %j %b = insertelement <8 x double> undef, double %tmp, i32 0 %c = shufflevector <8 x double> %b, <8 x double> undef, @@ -623,10 +756,14 @@ define <8 x double> @test_broadcast_vaddpd(<8 x double> %i, double* %j) nounwind ret <8 x double> %x } -; CHECK-LABEL: test_mask_broadcast_vaddpd -; CHECK: vaddpd (%rdi){1to8}, %zmm{{.*{%k[1-7]}.*}} -; CHECK: ret define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double> %i, +; CHECK-LABEL: test_mask_broadcast_vaddpd: +; CHECK: ## BB#0: +; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; CHECK-NEXT: vpcmpneqq %zmm0, %zmm2, %k1 +; CHECK-NEXT: vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq double* %j, <8 x i64> %mask1) nounwind { %mask = icmp ne <8 x i64> %mask1, zeroinitializer %tmp = load double, double* %j @@ -638,10 +775,13 @@ define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double> ret <8 x double> %r } -; CHECK-LABEL: test_maskz_broadcast_vaddpd -; CHECK: vaddpd (%rdi){1to8}, %zmm{{.*{%k[1-7]} {z}.*}} -; CHECK: ret define <8 x double> @test_maskz_broadcast_vaddpd(<8 x double> %i, double* %j, +; CHECK-LABEL: test_maskz_broadcast_vaddpd: +; CHECK: ## BB#0: +; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 +; CHECK-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vaddpd (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq <8 x i64> %mask1) nounwind { %mask = icmp ne <8 x i64> %mask1, zeroinitializer %tmp = load double, double* %j @@ -653,10 +793,31 @@ define <8 x double> @test_maskz_broadcast_vaddpd(<8 x double> 
%i, double* %j, ret <8 x double> %r } -; CHECK-LABEL: test_fxor -; CHECK: vpxord -; CHECK: ret define <16 x float> @test_fxor(<16 x float> %a) { +; AVX512F-LABEL: test_fxor: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpxord {{.*}}(%rip), %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: test_fxor: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpxord {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: test_fxor: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpxord {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: test_fxor: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: vxorps {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: retq +; +; SKX-LABEL: test_fxor: +; SKX: ## BB#0: +; SKX-NEXT: vxorps {{.*}}(%rip), %zmm0, %zmm0 +; SKX-NEXT: retq %res = fsub <16 x float> , %a ret <16 x float>%res From 5836554b41d37c6e41cbbb13d1858107e5365960 Mon Sep 17 00:00:00 2001 From: Elena Demikhovsky Date: Tue, 1 Dec 2015 12:43:46 +0000 Subject: [PATCH 076/186] AVX-512: fixed asm string of vsqrtss (vvsqrtss was generated before) git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254411 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86InstrAVX512.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 16b1f3b59b0d..1dfd8d4510f5 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -5911,12 +5911,12 @@ multiclass avx512_sqrt_scalar opc, string OpcodeStr,X86VectorVTInfo _, EVEX_B, EVEX_RC; let isCodeGenOnly = 1 in { - def r : SI; let mayLoad = 1 in - def m : SI; } From f0052c5d2a4706ff0693509f79199b473484918e Mon Sep 17 00:00:00 2001 From: Yury Gribov Date: Tue, 1 Dec 2015 13:24:48 +0000 Subject: [PATCH 077/186] Fix "WARNING: Title underline too short." introduced by r254404. Patch by Max Ostapenko. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254413 91177308-0d34-0410-b5e6-96231b3b80d8 --- docs/LangRef.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/LangRef.rst b/docs/LangRef.rst index 36bfb5167795..cf1ceab1f1c6 100644 --- a/docs/LangRef.rst +++ b/docs/LangRef.rst @@ -9340,7 +9340,7 @@ See the description for :ref:`llvm.stacksave `. .. _int_get_dynamic_area_offset: '``llvm.get.dynamic.area.offset``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: """"""" From 6979eb43962a1a5ad230f01cb5213f3490d22afd Mon Sep 17 00:00:00 2001 From: Chad Rosier Date: Tue, 1 Dec 2015 14:26:35 +0000 Subject: [PATCH 078/186] [LIR] Push check into helper function. NFC. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254416 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index a3658ed64976..9dc41ba2f328 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -257,6 +257,10 @@ static unsigned getStoreStride(const SCEVAddRecExpr *StoreEv) { } bool LoopIdiomRecognize::isLegalStore(StoreInst *SI) { + // Don't touch volatile stores. + if (!SI->isSimple()) + return false; + Value *StoredVal = SI->getValueOperand(); Value *StorePtr = SI->getPointerOperand(); @@ -287,10 +291,6 @@ void LoopIdiomRecognize::collectStores(BasicBlock *BB) { if (!SI) continue; - // Don't touch volatile stores. - if (!SI->isSimple()) - continue; - // Make sure this is a strided store with a constant stride. 
if (!isLegalStore(SI)) continue; From 39e89e8fd833c2e13cf6c29379ff608d6b7c52a2 Mon Sep 17 00:00:00 2001 From: Rafael Espindola Date: Tue, 1 Dec 2015 15:19:48 +0000 Subject: [PATCH 079/186] Bring r254336 back: The difference is that now we don't error on out-of-comdat access to internal global values. We copy them instead. This seems to match the expectation of COFF linkers (see pr25686). Original message: Start deciding earlier what to link. A traditional linker is roughly split in symbol resolution and "copying stuff". The two tasks are badly mixed in lib/Linker. This starts splitting them apart. With this patch there are no direct call to linkGlobalValueBody or linkGlobalValueProto. Everything is linked via WapValue. This also includes a few fixes: * A GV goes undefined if the comdat is dropped (comdat11.ll). * We error if an internal GV goes undefined (comdat13.ll). * We don't link an unused comdat. The first two match the behavior of an ELF linker. The second one is equivalent to running globaldce on the input. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254418 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Linker/LinkModules.cpp | 155 ++++++++++++++------------- lib/Transforms/Utils/ValueMapper.cpp | 6 +- test/Linker/Inputs/comdat11.ll | 9 ++ test/Linker/Inputs/comdat13.ll | 9 ++ test/Linker/comdat11.ll | 13 +++ test/Linker/comdat12.ll | 8 ++ test/Linker/comdat13.ll | 30 ++++++ test/Linker/comdat9.ll | 3 + 8 files changed, 153 insertions(+), 80 deletions(-) create mode 100644 test/Linker/Inputs/comdat11.ll create mode 100644 test/Linker/Inputs/comdat13.ll create mode 100644 test/Linker/comdat11.ll create mode 100644 test/Linker/comdat12.ll create mode 100644 test/Linker/comdat13.ll diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp index cdf1decc8131..680b19ae2337 100644 --- a/lib/Linker/LinkModules.cpp +++ b/lib/Linker/LinkModules.cpp @@ -436,6 +436,8 @@ class ModuleLinker { /// references. 
bool DoneLinkingBodies; + bool HasError = false; + public: ModuleLinker(Module *dstM, Linker::IdentifiedStructTypeSet &Set, Module *srcM, DiagnosticHandlerFunction DiagnosticHandler, unsigned Flags, @@ -483,6 +485,7 @@ class ModuleLinker { /// Helper method for setting a message and returning an error code. bool emitError(const Twine &Message) { DiagnosticHandler(LinkDiagnosticInfo(DS_Error, Message)); + HasError = true; return true; } @@ -531,6 +534,7 @@ class ModuleLinker { void upgradeMismatchedGlobalArray(StringRef Name); void upgradeMismatchedGlobals(); + bool linkIfNeeded(GlobalValue &GV); bool linkAppendingVarProto(GlobalVariable *DstGV, const GlobalVariable *SrcGV); @@ -904,16 +908,12 @@ Value *ModuleLinker::materializeDeclFor(Value *V) { if (doneLinkingBodies()) return nullptr; - GlobalValue *DGV = copyGlobalValueProto(TypeMap, SGV); - - if (Comdat *SC = SGV->getComdat()) { - if (auto *DGO = dyn_cast(DGV)) { - Comdat *DC = DstM->getOrInsertComdat(SC->getName()); - DGO->setComdat(DC); - } - } - - return DGV; + linkGlobalValueProto(SGV); + if (HasError) + return nullptr; + Value *Ret = ValueMap[SGV]; + assert(Ret); + return Ret; } void ValueMaterializerTy::materializeInitFor(GlobalValue *New, @@ -922,15 +922,27 @@ void ValueMaterializerTy::materializeInitFor(GlobalValue *New, } void ModuleLinker::materializeInitFor(GlobalValue *New, GlobalValue *Old) { + if (auto *F = dyn_cast(New)) { + if (!F->isDeclaration()) + return; + } else if (auto *V = dyn_cast(New)) { + if (V->hasInitializer()) + return; + } else { + auto *A = cast(New); + if (A->getAliasee()) + return; + } + + if (Old->isDeclaration()) + return; + if (isPerformingImport() && !doImportAsDefinition(Old)) return; - // Skip declarations that ValueMaterializer may have created in - // case we link in only some of SrcM. 
- if (shouldLinkOnlyNeeded() && Old->isDeclaration()) + if (!New->hasLocalLinkage() && DoNotLinkFromSource.count(Old)) return; - assert(!Old->isDeclaration() && "users should not pass down decls"); linkGlobalValueBody(*Old); } @@ -1405,7 +1417,8 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) { std::tie(SK, LinkFromSrc) = ComdatsChosen[SC]; C = DstM->getOrInsertComdat(SC->getName()); C->setSelectionKind(SK); - ComdatMembers[SC].push_back(SGV); + if (SGV->hasInternalLinkage()) + LinkFromSrc = true; } else if (DGV) { if (shouldLinkFromSource(LinkFromSrc, *DGV, *SGV)) return true; @@ -1425,31 +1438,12 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) { if (DGV) HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr(); - if (!LinkFromSrc && !DGV) - return false; - GlobalValue *NewGV; - if (!LinkFromSrc) { + if (!LinkFromSrc && DGV) { NewGV = DGV; // When linking from source we setVisibility from copyGlobalValueProto. setVisibility(NewGV, SGV, DGV); } else { - // If the GV is to be lazily linked, don't create it just yet. - // The ValueMaterializerTy will deal with creating it if it's used. - if (!DGV && !shouldOverrideFromSrc() && SGV != ImportFunction && - (SGV->hasLocalLinkage() || SGV->hasLinkOnceLinkage() || - SGV->hasAvailableExternallyLinkage())) { - DoNotLinkFromSource.insert(SGV); - return false; - } - - // When we only want to link in unresolved dependencies, blacklist - // the symbol unless unless DestM has a matching declaration (DGV). 
- if (shouldLinkOnlyNeeded() && !(DGV && DGV->isDeclaration())) { - DoNotLinkFromSource.insert(SGV); - return false; - } - NewGV = copyGlobalValueProto(TypeMap, SGV, DGV); if (isPerformingImport() && !doImportAsDefinition(SGV)) @@ -1459,7 +1453,7 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) { NewGV->setUnnamedAddr(HasUnnamedAddr); if (auto *NewGO = dyn_cast(NewGV)) { - if (C) + if (C && LinkFromSrc) NewGO->setComdat(C); if (DGV && DGV->hasCommonLinkage() && SGV->hasCommonLinkage()) @@ -1842,6 +1836,38 @@ static std::string mergeTriples(const Triple &SrcTriple, const Triple &DstTriple return DstTriple.str(); } +bool ModuleLinker::linkIfNeeded(GlobalValue &GV) { + GlobalValue *DGV = getLinkedToGlobal(&GV); + + if (shouldLinkOnlyNeeded() && !(DGV && DGV->isDeclaration())) + return false; + + if (DGV && !GV.hasLocalLinkage()) { + GlobalValue::VisibilityTypes Visibility = + getMinVisibility(DGV->getVisibility(), GV.getVisibility()); + DGV->setVisibility(Visibility); + GV.setVisibility(Visibility); + } + + if (const Comdat *SC = GV.getComdat()) { + bool LinkFromSrc; + Comdat::SelectionKind SK; + std::tie(SK, LinkFromSrc) = ComdatsChosen[SC]; + if (!LinkFromSrc) { + DoNotLinkFromSource.insert(&GV); + return false; + } + } + + if (!DGV && !shouldOverrideFromSrc() && + (GV.hasLocalLinkage() || GV.hasLinkOnceLinkage() || + GV.hasAvailableExternallyLinkage())) { + return false; + } + MapValue(&GV, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer); + return HasError; +} + bool ModuleLinker::run() { assert(DstM && "Null destination module"); assert(SrcM && "Null source module"); @@ -1901,24 +1927,30 @@ bool ModuleLinker::run() { // Upgrade mismatched global arrays. 
upgradeMismatchedGlobals(); + for (GlobalVariable &GV : SrcM->globals()) + if (const Comdat *SC = GV.getComdat()) + ComdatMembers[SC].push_back(&GV); + + for (Function &SF : *SrcM) + if (const Comdat *SC = SF.getComdat()) + ComdatMembers[SC].push_back(&SF); + + for (GlobalAlias &GA : SrcM->aliases()) + if (const Comdat *SC = GA.getComdat()) + ComdatMembers[SC].push_back(&GA); + // Insert all of the globals in src into the DstM module... without linking // initializers (which could refer to functions not yet mapped over). for (GlobalVariable &GV : SrcM->globals()) - if (linkGlobalValueProto(&GV)) + if (linkIfNeeded(GV)) return true; - // Link the functions together between the two modules, without doing function - // bodies... this just adds external function prototypes to the DstM - // function... We do this so that when we begin processing function bodies, - // all of the global values that may be referenced are available in our - // ValueMap. - for (Function &F :*SrcM) - if (linkGlobalValueProto(&F)) + for (Function &SF : *SrcM) + if (linkIfNeeded(SF)) return true; - // If there were any aliases, link them now. for (GlobalAlias &GA : SrcM->aliases()) - if (linkGlobalValueProto(&GA)) + if (linkIfNeeded(GA)) return true; for (AppendingVarInfo &AppendingVar : AppendingVars) @@ -1933,37 +1965,6 @@ bool ModuleLinker::run() { MapValue(GV, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer); } - // Link in the function bodies that are defined in the source module into - // DstM. - for (Function &SF : *SrcM) { - // Skip if no body (function is external). - if (SF.isDeclaration()) - continue; - - // Skip if not linking from source. - if (DoNotLinkFromSource.count(&SF)) - continue; - - if (linkGlobalValueBody(SF)) - return true; - } - - // Resolve all uses of aliases with aliasees. 
- for (GlobalAlias &Src : SrcM->aliases()) { - if (DoNotLinkFromSource.count(&Src)) - continue; - linkGlobalValueBody(Src); - } - - // Update the initializers in the DstM module now that all globals that may - // be referenced are in DstM. - for (GlobalVariable &Src : SrcM->globals()) { - // Only process initialized GV's or ones not already in dest. - if (!Src.hasInitializer() || DoNotLinkFromSource.count(&Src)) - continue; - linkGlobalValueBody(Src); - } - // Note that we are done linking global value bodies. This prevents // metadata linking from creating new references. DoneLinkingBodies = true; diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp index 0a63c1d5153c..00a8984845dd 100644 --- a/lib/Transforms/Utils/ValueMapper.cpp +++ b/lib/Transforms/Utils/ValueMapper.cpp @@ -41,9 +41,9 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, if (Value *NewV = Materializer->materializeDeclFor(const_cast(V))) { VM[V] = NewV; - if (auto *GV = dyn_cast(V)) - Materializer->materializeInitFor(cast(NewV), - const_cast(GV)); + if (auto *NewGV = dyn_cast(NewV)) + Materializer->materializeInitFor( + NewGV, const_cast(cast(V))); return NewV; } } diff --git a/test/Linker/Inputs/comdat11.ll b/test/Linker/Inputs/comdat11.ll new file mode 100644 index 000000000000..5b7f74cf0b24 --- /dev/null +++ b/test/Linker/Inputs/comdat11.ll @@ -0,0 +1,9 @@ +$foo = comdat any +@foo = global i8 1, comdat +define void @zed() { + call void @bar() + ret void +} +define void @bar() comdat($foo) { + ret void +} diff --git a/test/Linker/Inputs/comdat13.ll b/test/Linker/Inputs/comdat13.ll new file mode 100644 index 000000000000..85515210ed7e --- /dev/null +++ b/test/Linker/Inputs/comdat13.ll @@ -0,0 +1,9 @@ +$foo = comdat any +@foo = internal global i8 1, comdat +define i8* @zed() { + call void @bax() + ret i8* @foo +} +define internal void @bax() comdat($foo) { + ret void +} diff --git a/test/Linker/comdat11.ll b/test/Linker/comdat11.ll 
new file mode 100644 index 000000000000..dbade4104fe3 --- /dev/null +++ b/test/Linker/comdat11.ll @@ -0,0 +1,13 @@ +; RUN: llvm-link -S %s %p/Inputs/comdat11.ll -o - | FileCheck %s + +$foo = comdat any +@foo = global i8 0, comdat + +; CHECK: @foo = global i8 0, comdat + +; CHECK: define void @zed() { +; CHECK: call void @bar() +; CHECK: ret void +; CHECK: } + +; CHECK: declare void @bar() diff --git a/test/Linker/comdat12.ll b/test/Linker/comdat12.ll new file mode 100644 index 000000000000..d06e222b63ac --- /dev/null +++ b/test/Linker/comdat12.ll @@ -0,0 +1,8 @@ +; RUN: llvm-link %s -S -o - | FileCheck %s + +$foo = comdat largest +define internal void @foo() comdat($foo) { + ret void +} + +; CHECK-NOT: foo diff --git a/test/Linker/comdat13.ll b/test/Linker/comdat13.ll new file mode 100644 index 000000000000..d1e382a2f278 --- /dev/null +++ b/test/Linker/comdat13.ll @@ -0,0 +1,30 @@ +; RUN: llvm-link -S %s %p/Inputs/comdat13.ll -o - | FileCheck %s + +; In Inputs/comdat13.ll a function not in the $foo comdat (zed) references an +; internal function in the comdat $foo. +; The IR would be ilegal on ELF ("relocation refers to discarded section"), +; but COFF linkers seem to just duplicate the comdat. 
+ +$foo = comdat any +@foo = internal global i8 0, comdat +define i8* @bar() { + ret i8* @foo +} + +; CHECK: $foo = comdat any + +; CHECK: @foo = internal global i8 0, comdat +; CHECK: @foo.1 = internal global i8 1, comdat($foo) + +; CHECK: define i8* @bar() { +; CHECK-NEXT: ret i8* @foo +; CHECK-NEXT: } + +; CHECK: define i8* @zed() { +; CHECK-NEXT: call void @bax() +; CHECK-NEXT: ret i8* @foo.1 +; CHECK-NEXT: } + +; CHECK: define internal void @bax() comdat($foo) { +; CHECK-NEXT: ret void +; CHECK-NEXT: } diff --git a/test/Linker/comdat9.ll b/test/Linker/comdat9.ll index 274957401aac..4f6f2cfb845d 100644 --- a/test/Linker/comdat9.ll +++ b/test/Linker/comdat9.ll @@ -14,6 +14,9 @@ $f2 = comdat largest define internal void @f2() comdat($f2) { ret void } +define void @f3() comdat($f2) { + ret void +} ; CHECK-DAG: $f2 = comdat largest ; CHECK-DAG: define internal void @f2() comdat { From d0903bddbc09ae3fc2aaca4679cb71a5f6158b6e Mon Sep 17 00:00:00 2001 From: Rafael Espindola Date: Tue, 1 Dec 2015 15:46:46 +0000 Subject: [PATCH 080/186] Simplify test. NFC. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254419 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/Linker/global_ctors.ll | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/Linker/global_ctors.ll b/test/Linker/global_ctors.ll index 49df81a00759..cc28471df59d 100644 --- a/test/Linker/global_ctors.ll +++ b/test/Linker/global_ctors.ll @@ -1,6 +1,5 @@ -; RUN: llvm-as %s -o %t.new.bc -; RUN: llvm-link %t.new.bc %S/Inputs/old_global_ctors.3.4.bc | llvm-dis | FileCheck %s -; RUN: llvm-link %S/Inputs/old_global_ctors.3.4.bc %t.new.bc | llvm-dis | FileCheck %s +; RUN: llvm-link -S %s %S/Inputs/old_global_ctors.3.4.bc | FileCheck %s +; RUN: llvm-link -S %S/Inputs/old_global_ctors.3.4.bc %s | FileCheck %s ; old_global_ctors.3.4.bc contains the following LLVM IL, assembled into ; bitcode by llvm-as from 3.4. It uses a two element @llvm.global_ctors array. 
From 2b9196851ccf22799f23c75377672a93a5c02665 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Tue, 1 Dec 2015 16:45:23 +0000 Subject: [PATCH 081/186] [ThinLTO] Remove stale comment (NFC) Stale as of r254036 which added basic profitability check. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254421 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/IPO/FunctionImport.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp index ab0f7114957b..345b2f540adf 100644 --- a/lib/Transforms/IPO/FunctionImport.cpp +++ b/lib/Transforms/IPO/FunctionImport.cpp @@ -136,10 +136,6 @@ bool FunctionImporter::importFunctions(Module &M) { continue; } - // - // No profitability notion right now, just import all the time... - // - // Get the module path from the summary. auto FileName = Summary->modulePath(); DEBUG(dbgs() << "Importing " << CalledFunctionName << " from " << FileName From 3c43768b6387f77f9e11450b49e5ea7b56ce1800 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Tue, 1 Dec 2015 17:12:10 +0000 Subject: [PATCH 082/186] [ThinLTO] Wrap dbgs() output in DEBUG macro Missed in a couple places. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254422 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/IPO/FunctionImport.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp index 345b2f540adf..10bba1939270 100644 --- a/lib/Transforms/IPO/FunctionImport.cpp +++ b/lib/Transforms/IPO/FunctionImport.cpp @@ -124,15 +124,15 @@ bool FunctionImporter::importFunctions(Module &M) { auto *Summary = Info->functionSummary(); if (!Summary) { // FIXME: in case we are lazyloading summaries, we can do it now. 
- dbgs() << "Missing summary for " << CalledFunctionName - << ", error at import?\n"; + DEBUG(dbgs() << "Missing summary for " << CalledFunctionName + << ", error at import?\n"); llvm_unreachable("Missing summary"); } if (Summary->instCount() > ImportInstrLimit) { - dbgs() << "Skip import of " << CalledFunctionName << " with " - << Summary->instCount() << " instructions (limit " - << ImportInstrLimit << ")\n"; + DEBUG(dbgs() << "Skip import of " << CalledFunctionName << " with " + << Summary->instCount() << " instructions (limit " + << ImportInstrLimit << ")\n"); continue; } From 2b3f97809d173409a1442d7925e4a64806a738f7 Mon Sep 17 00:00:00 2001 From: Rafael Espindola Date: Tue, 1 Dec 2015 17:17:04 +0000 Subject: [PATCH 083/186] Make appending var linking less of a special case. It has to be a bit special because: * materializeInitFor is not really supposed to call replaceAllUsesWith. The caller has a plain variable with Dst and expects just the initializer to be set, not for it to be removed. * Calling mutateType as we used to do before gets some type inconsistency which breaks the bitcode writer. * If linkAppendingVarProto create a dest decl with the correct type to avoid the above problems, it needs to put the original dst init in some side table for materializeInitFor to use. In the end the simplest solution seems to be to just have linkAppendingVarProto do all the work and set ValueMap[SrcGV to avoid recursion. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254424 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Linker/LinkModules.cpp | 111 ++++++++++++------------------------- 1 file changed, 36 insertions(+), 75 deletions(-) diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp index 680b19ae2337..f745df56f9aa 100644 --- a/lib/Linker/LinkModules.cpp +++ b/lib/Linker/LinkModules.cpp @@ -401,14 +401,6 @@ class ModuleLinker { /// but this allows us to reuse the ValueMapper code. 
ValueToValueMapTy ValueMap; - struct AppendingVarInfo { - GlobalVariable *NewGV; // New aggregate global in dest module. - const Constant *DstInit; // Old initializer from dest module. - const Constant *SrcInit; // Old initializer from src module. - }; - - std::vector AppendingVars; - // Set of items not to link in from source. SmallPtrSet DoNotLinkFromSource; @@ -541,8 +533,6 @@ class ModuleLinker { bool linkGlobalValueProto(GlobalValue *GV); bool linkModuleFlagsMetadata(); - void linkAppendingVarInit(AppendingVarInfo &AVI); - void linkGlobalInit(GlobalVariable &Dst, GlobalVariable &Src); bool linkFunctionBody(Function &Dst, Function &Src); void linkAliasBody(GlobalAlias &Dst, GlobalAlias &Src); @@ -1318,6 +1308,14 @@ void ModuleLinker::upgradeMismatchedGlobals() { upgradeMismatchedGlobalArray("llvm.global_dtors"); } +static void getArrayElements(const Constant *C, + SmallVectorImpl &Dest) { + unsigned NumElements = cast(C->getType())->getNumElements(); + + for (unsigned i = 0; i != NumElements; ++i) + Dest.push_back(C->getAggregateElement(i)); +} + /// If there were any appending global variables, link them together now. /// Return true on error. 
bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV, @@ -1326,10 +1324,8 @@ bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV, cast(TypeMap.get(SrcGV->getType()->getElementType())); Type *EltTy = SrcTy->getElementType(); - uint64_t NewSize = SrcTy->getNumElements(); if (DstGV) { ArrayType *DstTy = cast(DstGV->getType()->getElementType()); - NewSize += DstTy->getNumElements(); if (!SrcGV->hasAppendingLinkage() || !DstGV->hasAppendingLinkage()) return emitError( @@ -1359,6 +1355,27 @@ bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV, "Appending variables with different section name need to be linked!"); } + SmallVector DstElements; + if (DstGV) + getArrayElements(DstGV->getInitializer(), DstElements); + + SmallVector SrcElements; + getArrayElements(SrcGV->getInitializer(), SrcElements); + + StringRef Name = SrcGV->getName(); + bool IsNewStructor = + (Name == "llvm.global_ctors" || Name == "llvm.global_dtors") && + cast(EltTy)->getNumElements() == 3; + if (IsNewStructor) + SrcElements.erase( + std::remove_if(SrcElements.begin(), SrcElements.end(), + [this](Constant *E) { + auto *Key = dyn_cast( + E->getAggregateElement(2)->stripPointerCasts()); + return DoNotLinkFromSource.count(Key); + }), + SrcElements.end()); + uint64_t NewSize = DstElements.size() + SrcElements.size(); ArrayType *NewType = ArrayType::get(EltTy, NewSize); // Create the new global variable. @@ -1370,24 +1387,22 @@ bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV, // Propagate alignment, visibility and section info. copyGVAttributes(NG, SrcGV); - AppendingVarInfo AVI; - AVI.NewGV = NG; - AVI.DstInit = DstGV ? DstGV->getInitializer() : nullptr; - AVI.SrcInit = SrcGV->getInitializer(); - AppendingVars.push_back(AVI); - // Replace any uses of the two global variables with uses of the new // global. 
ValueMap[SrcGV] = ConstantExpr::getBitCast(NG, TypeMap.get(SrcGV->getType())); + for (auto *V : SrcElements) { + DstElements.push_back( + MapValue(V, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer)); + } + + NG->setInitializer(ConstantArray::get(NewType, DstElements)); + if (DstGV) { DstGV->replaceAllUsesWith(ConstantExpr::getBitCast(NG, DstGV->getType())); DstGV->eraseFromParent(); } - // Track the source variable so we don't try to link it. - DoNotLinkFromSource.insert(SrcGV); - return false; } @@ -1480,57 +1495,6 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) { return false; } -static void getArrayElements(const Constant *C, - SmallVectorImpl &Dest) { - unsigned NumElements = cast(C->getType())->getNumElements(); - - for (unsigned i = 0; i != NumElements; ++i) - Dest.push_back(C->getAggregateElement(i)); -} - -void ModuleLinker::linkAppendingVarInit(AppendingVarInfo &AVI) { - // Merge the initializer. - SmallVector DstElements; - if (AVI.DstInit) - getArrayElements(AVI.DstInit, DstElements); - - SmallVector SrcElements; - getArrayElements(AVI.SrcInit, SrcElements); - - ArrayType *NewType = cast(AVI.NewGV->getType()->getElementType()); - - StringRef Name = AVI.NewGV->getName(); - bool IsNewStructor = - (Name == "llvm.global_ctors" || Name == "llvm.global_dtors") && - cast(NewType->getElementType())->getNumElements() == 3; - - for (auto *V : SrcElements) { - if (IsNewStructor) { - auto *Key = - dyn_cast(V->getAggregateElement(2)->stripPointerCasts()); - if (DoNotLinkFromSource.count(Key)) - continue; - } - DstElements.push_back( - MapValue(V, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer)); - } - if (DstElements.size() != NewType->getNumElements()) { - NewType = ArrayType::get(NewType->getElementType(), DstElements.size()); - GlobalVariable *Old = AVI.NewGV; - GlobalVariable *NG = new GlobalVariable( - *DstM, NewType, Old->isConstant(), Old->getLinkage(), /*init*/ nullptr, - /*name*/ "", Old, Old->getThreadLocalMode(), - 
Old->getType()->getAddressSpace()); - copyGVAttributes(NG, Old); - AVI.NewGV->replaceAllUsesWith( - ConstantExpr::getBitCast(NG, AVI.NewGV->getType())); - AVI.NewGV->eraseFromParent(); - AVI.NewGV = NG; - } - - AVI.NewGV->setInitializer(ConstantArray::get(NewType, DstElements)); -} - /// Update the initializers in the Dest module now that all globals that may be /// referenced are in Dest. void ModuleLinker::linkGlobalInit(GlobalVariable &Dst, GlobalVariable &Src) { @@ -1953,9 +1917,6 @@ bool ModuleLinker::run() { if (linkIfNeeded(GA)) return true; - for (AppendingVarInfo &AppendingVar : AppendingVars) - linkAppendingVarInit(AppendingVar); - for (const auto &Entry : DstM->getComdatSymbolTable()) { const Comdat &C = Entry.getValue(); if (C.getSelectionKind() == Comdat::Any) From 40adef82a5ba533fb6a1c60789f396c2baa976cc Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Tue, 1 Dec 2015 17:27:55 +0000 Subject: [PATCH 084/186] [x86] add a convenience method to check for FMA capability; NFCI git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254425 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 8 +++----- lib/Target/X86/X86Subtarget.h | 1 + 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 4ec4ec280675..7ddcd8fcda48 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1148,7 +1148,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom); - if (Subtarget->hasFMA() || Subtarget->hasFMA4() || Subtarget->hasAVX512()) { + if (Subtarget->hasAnyFMA()) { setOperationAction(ISD::FMA, MVT::v8f32, Legal); setOperationAction(ISD::FMA, MVT::v4f64, Legal); setOperationAction(ISD::FMA, MVT::v4f32, Legal); @@ -20463,7 +20463,7 @@ bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) 
const { return true; } bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { - if (!(Subtarget->hasFMA() || Subtarget->hasFMA4() || Subtarget->hasAVX512())) + if (!Subtarget->hasAnyFMA()) return false; VT = VT.getScalarType(); @@ -26471,9 +26471,7 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, return SDValue(); EVT ScalarVT = VT.getScalarType(); - if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || - (!Subtarget->hasFMA() && !Subtarget->hasFMA4() && - !Subtarget->hasAVX512())) + if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasAnyFMA()) return SDValue(); SDValue A = N->getOperand(0); diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index eb0199aecbeb..353b4f7f5ebd 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -358,6 +358,7 @@ class X86Subtarget final : public X86GenSubtargetInfo { // has equal or better performance on all supported targets. bool hasFMA() const { return HasFMA && !HasFMA4; } bool hasFMA4() const { return HasFMA4; } + bool hasAnyFMA() const { return hasFMA() || hasFMA4() || hasAVX512(); } bool hasXOP() const { return HasXOP; } bool hasTBM() const { return HasTBM; } bool hasMOVBE() const { return HasMOVBE; } From 368c76102b9d3f5a430f842964b12fefe3d3a18c Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Tue, 1 Dec 2015 17:45:17 +0000 Subject: [PATCH 085/186] AMDGPU: Use the default strings for data emission directives Summary: This makes the assembly output look nicer and there is no reason to have custom strings for these. 
Reviewers: arsenm Subscribers: arsenm, llvm-commits Differential Revision: http://reviews.llvm.org/D14671 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254426 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index d79ffdf52a74..01548e50f7c5 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -22,13 +22,6 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() { InlineAsmEnd = ";#ASMEND"; //===--- Data Emission Directives -------------------------------------===// - ZeroDirective = ".zero"; - AsciiDirective = ".ascii\t"; - AscizDirective = ".asciz\t"; - Data8bitsDirective = ".byte\t"; - Data16bitsDirective = ".short\t"; - Data32bitsDirective = ".long\t"; - Data64bitsDirective = ".quad\t"; SunStyleELFSectionSwitchSyntax = true; UsesELFSectionDirectiveForBSS = true; From 76eb0395686d266a7416d578867fc62ef5e4ee82 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Tue, 1 Dec 2015 17:45:22 +0000 Subject: [PATCH 086/186] AMDGPU/SI: Remove REGISTER_STORE/REGISTER_LOAD code which is now dead Reviewers: arsenm Subscribers: arsenm, llvm-commits Differential Revision: http://reviews.llvm.org/D15050 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254427 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 35 ------------------------ lib/Target/AMDGPU/SIISelLowering.cpp | 16 ----------- lib/Target/AMDGPU/SIInstructions.td | 30 -------------------- 3 files changed, 81 deletions(-) diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 6aa4fddd3ec4..710c6771b171 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -458,41 +458,6 @@ SDNode 
*AMDGPUDAGToDAGISel::Select(SDNode *N) { N = glueCopyToM0(N); break; } - case AMDGPUISD::REGISTER_LOAD: { - if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) - break; - SDValue Addr, Offset; - - SDLoc DL(N); - SelectADDRIndirect(N->getOperand(1), Addr, Offset); - const SDValue Ops[] = { - Addr, - Offset, - CurDAG->getTargetConstant(0, DL, MVT::i32), - N->getOperand(0), - }; - return CurDAG->getMachineNode(AMDGPU::SI_RegisterLoad, DL, - CurDAG->getVTList(MVT::i32, MVT::i64, - MVT::Other), - Ops); - } - case AMDGPUISD::REGISTER_STORE: { - if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) - break; - SDValue Addr, Offset; - SelectADDRIndirect(N->getOperand(2), Addr, Offset); - SDLoc DL(N); - const SDValue Ops[] = { - N->getOperand(1), - Addr, - Offset, - CurDAG->getTargetConstant(0, DL, MVT::i32), - N->getOperand(0), - }; - return CurDAG->getMachineNode(AMDGPU::SI_RegisterStorePseudo, DL, - CurDAG->getVTList(MVT::Other), - Ops); - } case AMDGPUISD::BFE_I32: case AMDGPUISD::BFE_U32: { diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 1b0cc87206f4..ab93bceb96ec 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -848,27 +848,11 @@ SDValue SITargetLowering::LowerFormalArguments( MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { - MachineBasicBlock::iterator I = *MI; - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); - switch (MI->getOpcode()) { default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); case AMDGPU::BRANCH: return BB; - case AMDGPU::SI_RegisterStorePseudo: { - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - MachineInstrBuilder MIB = - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore), - Reg); - for (unsigned i = 0, e = 
MI->getNumOperands(); i != e; ++i) - MIB.addOperand(MI->getOperand(i)); - - MI->eraseFromParent(); - break; - } } return BB; } diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index bc2b0c6c07fb..2cee993d751c 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -1942,36 +1942,6 @@ def SI_KILL : InstSI < let Uses = [EXEC], Defs = [EXEC,VCC,M0] in { -//defm SI_ : RegisterLoadStore ; - -let UseNamedOperandTable = 1 in { - -def SI_RegisterLoad : InstSI < - (outs VGPR_32:$dst, SReg_64:$temp), - (ins FRAMEri32:$addr, i32imm:$chan), - "", [] -> { - let isRegisterLoad = 1; - let mayLoad = 1; -} - -class SIRegStore : InstSI < - outs, - (ins VGPR_32:$val, FRAMEri32:$addr, i32imm:$chan), - "", [] -> { - let isRegisterStore = 1; - let mayStore = 1; -} - -let usesCustomInserter = 1 in { -def SI_RegisterStorePseudo : SIRegStore<(outs)>; -} // End usesCustomInserter = 1 -def SI_RegisterStore : SIRegStore<(outs SReg_64:$temp)>; - - -} // End UseNamedOperandTable = 1 - class SI_INDIRECT_SRC : InstSI < (outs VGPR_32:$dst, SReg_64:$temp), (ins rc:$src, VSrc_32:$idx, i32imm:$off), From 0fb89f7b8241946815f6eb9ae59589e184d9b016 Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Tue, 1 Dec 2015 18:07:07 +0000 Subject: [PATCH 087/186] [llvm-dwp] Correctly update debug_str_offsets.dwo when linking dwo files This doesn't deduplicate strings in the debug_str section, nor does it properly wire up the index so that debug_info can /find/ these strings, but it does correct the str_offsets specifically. Follow up patches to address those related/next issues. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254431 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/tools/llvm-dwp/X86/simple.test | 6 +- tools/llvm-dwp/llvm-dwp.cpp | 85 +++++++++++++++++++++++------ 2 files changed, 71 insertions(+), 20 deletions(-) diff --git a/test/tools/llvm-dwp/X86/simple.test b/test/tools/llvm-dwp/X86/simple.test index 26215bdc8c91..d70bdda072c5 100644 --- a/test/tools/llvm-dwp/X86/simple.test +++ b/test/tools/llvm-dwp/X86/simple.test @@ -47,12 +47,12 @@ CHECK: .debug_str.dwo contents: CHECK: "clang version CHECK: 0x[[ACPP:.*]]: "a.cpp" FIXME: Remove duplicates -CHECK: "clang version +CHECK: 0x[[SECONDREV:.*]]: "clang version CHECK: 0x[[BCPP:.*]]: "b.cpp" CHECK: .debug_str_offsets.dwo contents: CHECK: : 00000000 CHECK: : [[ACPP]] -CHECK: : 00000000 +CHECK: : [[SECONDREV]] FIXME: Update str offset indexes, this should be BCPP \/ -CHECK: : [[ACPP]] +CHECK: : [[BCPP]] diff --git a/tools/llvm-dwp/llvm-dwp.cpp b/tools/llvm-dwp/llvm-dwp.cpp index 7f9f6678db0b..c89be222e6c9 100644 --- a/tools/llvm-dwp/llvm-dwp.cpp +++ b/tools/llvm-dwp/llvm-dwp.cpp @@ -9,6 +9,7 @@ #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" #include "llvm/Object/ObjectFile.h" +#include "llvm/Support/DataExtractor.h" #include "llvm/Support/Options.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" @@ -21,6 +22,7 @@ #include using namespace llvm; +using namespace llvm::object; using namespace cl; OptionCategory DwpCategory("Specific Options"); @@ -36,39 +38,88 @@ static int error(const Twine &Error, const Twine &Context) { return 1; } -static std::error_code writeSection(MCStreamer &Out, MCSection *OutSection, - const object::SectionRef &Sym) { - StringRef Contents; - if (auto Err = Sym.getContents(Contents)) - return Err; - Out.SwitchSection(OutSection); - Out.EmitBytes(Contents); +static std::error_code +writeStringsAndOffsets(MCStreamer &Out, StringMap &Strings, + uint32_t &StringOffset, MCSection *StrOffsetSection, + 
StringRef CurStrSection, StringRef CurStrOffsetSection) { + // Could possibly produce an error or warning if one of these was non-null but + // the other was null. + if (CurStrSection.empty() || CurStrOffsetSection.empty()) + return std::error_code(); + + DenseMap OffsetRemapping; + + DataExtractor Data(CurStrSection, true, 0); + uint32_t LocalOffset = 0; + uint32_t PrevOffset = 0; + while (const char *s = Data.getCStr(&LocalOffset)) { + StringRef Str(s, LocalOffset - PrevOffset - 1); + OffsetRemapping[PrevOffset] = StringOffset; + // insert, if successful, write new string to the str.dwo section + StringOffset += Str.size() + 1; + PrevOffset = LocalOffset; + } + + Data = DataExtractor(CurStrOffsetSection, true, 0); + + Out.SwitchSection(StrOffsetSection); + + uint32_t Offset = 0; + uint64_t Size = CurStrOffsetSection.size(); + while (Offset < Size) { + auto OldOffset = Data.getU32(&Offset); + auto NewOffset = OffsetRemapping[OldOffset]; + Out.EmitIntValue(NewOffset, 4); + } + return std::error_code(); } static std::error_code write(MCStreamer &Out, ArrayRef Inputs) { + const auto &MCOFI = *Out.getContext().getObjectFileInfo(); + MCSection *const StrSection = MCOFI.getDwarfStrDWOSection(); + MCSection *const StrOffsetSection = MCOFI.getDwarfStrOffDWOSection(); + const StringMap KnownSections = { + {"debug_info.dwo", MCOFI.getDwarfInfoDWOSection()}, + {"debug_types.dwo", MCOFI.getDwarfTypesDWOSection()}, + {"debug_str_offsets.dwo", StrOffsetSection}, + {"debug_str.dwo", StrSection}, + {"debug_loc.dwo", MCOFI.getDwarfLocDWOSection()}, + {"debug_abbrev.dwo", MCOFI.getDwarfAbbrevDWOSection()}}; + + StringMap Strings; + uint32_t StringOffset = 0; + for (const auto &Input : Inputs) { auto ErrOrObj = object::ObjectFile::createObjectFile(Input); if (!ErrOrObj) return ErrOrObj.getError(); const auto *Obj = ErrOrObj->getBinary(); + StringRef CurStrSection; + StringRef CurStrOffsetSection; for (const auto &Section : Obj->sections()) { - const auto &MCOFI = 
*Out.getContext().getObjectFileInfo(); - static const StringMap KnownSections = { - {"debug_info.dwo", MCOFI.getDwarfInfoDWOSection()}, - {"debug_types.dwo", MCOFI.getDwarfTypesDWOSection()}, - {"debug_str_offsets.dwo", MCOFI.getDwarfStrOffDWOSection()}, - {"debug_str.dwo", MCOFI.getDwarfStrDWOSection()}, - {"debug_loc.dwo", MCOFI.getDwarfLocDWOSection()}, - {"debug_abbrev.dwo", MCOFI.getDwarfAbbrevDWOSection()}}; StringRef Name; if (std::error_code Err = Section.getName(Name)) return Err; if (MCSection *OutSection = - KnownSections.lookup(Name.substr(Name.find_first_not_of("._")))) - if (auto Err = writeSection(Out, OutSection, Section)) + KnownSections.lookup(Name.substr(Name.find_first_not_of("._")))) { + StringRef Contents; + if (auto Err = Section.getContents(Contents)) return Err; + if (OutSection == StrOffsetSection) { + CurStrOffsetSection = Contents; + continue; + } + if (OutSection == StrSection) + CurStrSection = Contents; + Out.SwitchSection(OutSection); + Out.EmitBytes(Contents); + } } + if (auto Err = + writeStringsAndOffsets(Out, Strings, StringOffset, StrOffsetSection, + CurStrSection, CurStrOffsetSection)) + return Err; } return std::error_code(); } From 1d06f0bf4e070385fbb6d112318c190fc9aadda8 Mon Sep 17 00:00:00 2001 From: Rafael Espindola Date: Tue, 1 Dec 2015 18:41:30 +0000 Subject: [PATCH 088/186] Delete the setModule method from the Linker. It was only used from LTO for a debug feature, and LTO can just create another linker. It is pretty odd to have a method to reset the module in the middle of a link. It would make IdentifiedStructTypes inconsistent with the Module for example. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254434 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/LTO/LTOCodeGenerator.h | 4 ++-- include/llvm/Linker/Linker.h | 3 --- lib/LTO/LTOCodeGenerator.cpp | 8 ++++---- lib/Linker/LinkModules.cpp | 4 ---- 4 files changed, 6 insertions(+), 13 deletions(-) diff --git a/include/llvm/LTO/LTOCodeGenerator.h b/include/llvm/LTO/LTOCodeGenerator.h index 0d3c79bf5e84..c322288a1ae9 100644 --- a/include/llvm/LTO/LTOCodeGenerator.h +++ b/include/llvm/LTO/LTOCodeGenerator.h @@ -39,7 +39,6 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringMap.h" -#include "llvm/Linker/Linker.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include @@ -49,6 +48,7 @@ namespace llvm { class LLVMContext; class DiagnosticInfo; class GlobalValue; + class Linker; class Mangler; class MemoryBuffer; class TargetLibraryInfo; @@ -171,7 +171,7 @@ struct LTOCodeGenerator { std::unique_ptr OwnedContext; LLVMContext &Context; std::unique_ptr MergedModule; - Linker IRLinker; + std::unique_ptr IRLinker; std::unique_ptr TargetMach; bool EmitDwarfDebugInfo = false; bool ScopeRestrictionsDone = false; diff --git a/include/llvm/Linker/Linker.h b/include/llvm/Linker/Linker.h index 610b1ddf9893..3f6c7b6c6942 100644 --- a/include/llvm/Linker/Linker.h +++ b/include/llvm/Linker/Linker.h @@ -85,9 +85,6 @@ class Linker { const FunctionInfoIndex *Index = nullptr, Function *FuncToImport = nullptr); - /// \brief Set the composite to the passed-in module. 
- void setModule(Module *Dst); - static bool LinkModules(Module *Dest, Module *Src, DiagnosticHandlerFunction DiagnosticHandler, unsigned Flags = Flags::None); diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp index 931bcf0d23fc..37ee7e8c53cc 100644 --- a/lib/LTO/LTOCodeGenerator.cpp +++ b/lib/LTO/LTOCodeGenerator.cpp @@ -67,14 +67,14 @@ const char* LTOCodeGenerator::getVersionString() { LTOCodeGenerator::LTOCodeGenerator() : Context(getGlobalContext()), MergedModule(new Module("ld-temp.o", Context)), - IRLinker(MergedModule.get()) { + IRLinker(new Linker(MergedModule.get())) { initializeLTOPasses(); } LTOCodeGenerator::LTOCodeGenerator(std::unique_ptr Context) : OwnedContext(std::move(Context)), Context(*OwnedContext), MergedModule(new Module("ld-temp.o", *OwnedContext)), - IRLinker(MergedModule.get()) { + IRLinker(new Linker(MergedModule.get())) { initializeLTOPasses(); } @@ -114,7 +114,7 @@ bool LTOCodeGenerator::addModule(LTOModule *Mod) { assert(&Mod->getModule().getContext() == &Context && "Expected module in same context"); - bool ret = IRLinker.linkInModule(&Mod->getModule()); + bool ret = IRLinker->linkInModule(&Mod->getModule()); const std::vector &undefs = Mod->getAsmUndefinedRefs(); for (int i = 0, e = undefs.size(); i != e; ++i) @@ -130,7 +130,7 @@ void LTOCodeGenerator::setModule(std::unique_ptr Mod) { AsmUndefinedRefs.clear(); MergedModule = Mod->takeModule(); - IRLinker.setModule(MergedModule.get()); + IRLinker = make_unique(MergedModule.get()); const std::vector &Undefs = Mod->getAsmUndefinedRefs(); for (int I = 0, E = Undefs.size(); I != E; ++I) diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp index f745df56f9aa..c57c70e322ab 100644 --- a/lib/Linker/LinkModules.cpp +++ b/lib/Linker/LinkModules.cpp @@ -2071,10 +2071,6 @@ bool Linker::linkInModule(Module *Src, unsigned Flags, return RetCode; } -void Linker::setModule(Module *Dst) { - init(Dst, DiagnosticHandler); -} - 
//===----------------------------------------------------------------------===// // LinkModules entrypoint. //===----------------------------------------------------------------------===// From f0edaf8e00d7977407738b770f1d7adc7f1b25b8 Mon Sep 17 00:00:00 2001 From: Rafael Espindola Date: Tue, 1 Dec 2015 18:46:19 +0000 Subject: [PATCH 089/186] Use a forwarding constructor instead of an init method. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254435 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Linker/Linker.h | 1 - lib/Linker/LinkModules.cpp | 15 +++++---------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/include/llvm/Linker/Linker.h b/include/llvm/Linker/Linker.h index 3f6c7b6c6942..7ac457856a19 100644 --- a/include/llvm/Linker/Linker.h +++ b/include/llvm/Linker/Linker.h @@ -93,7 +93,6 @@ class Linker { unsigned Flags = Flags::None); private: - void init(Module *M, DiagnosticHandlerFunction DiagnosticHandler); Module *Composite; IdentifiedStructTypeSet IdentifiedStructTypes; diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp index c57c70e322ab..9aff43f31990 100644 --- a/lib/Linker/LinkModules.cpp +++ b/lib/Linker/LinkModules.cpp @@ -2032,7 +2032,7 @@ bool Linker::IdentifiedStructTypeSet::hasType(StructType *Ty) { return *I == Ty; } -void Linker::init(Module *M, DiagnosticHandlerFunction DiagnosticHandler) { +Linker::Linker(Module *M, DiagnosticHandlerFunction DiagnosticHandler) { this->Composite = M; this->DiagnosticHandler = DiagnosticHandler; @@ -2046,15 +2046,10 @@ void Linker::init(Module *M, DiagnosticHandlerFunction DiagnosticHandler) { } } -Linker::Linker(Module *M, DiagnosticHandlerFunction DiagnosticHandler) { - init(M, DiagnosticHandler); -} - -Linker::Linker(Module *M) { - init(M, [this](const DiagnosticInfo &DI) { - Composite->getContext().diagnose(DI); - }); -} +Linker::Linker(Module *M) + : Linker(M, [this](const DiagnosticInfo &DI) { + Composite->getContext().diagnose(DI); + }) {} void 
Linker::deleteModule() { delete Composite; From c6cd4955cdd9643c7a3ad7eeb30713a8dcb503dc Mon Sep 17 00:00:00 2001 From: Rafael Espindola Date: Tue, 1 Dec 2015 18:50:35 +0000 Subject: [PATCH 090/186] Delete dead code. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254436 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Linker/Linker.h | 1 - lib/Linker/LinkModules.cpp | 5 ----- 2 files changed, 6 deletions(-) diff --git a/include/llvm/Linker/Linker.h b/include/llvm/Linker/Linker.h index 7ac457856a19..e307e06f50be 100644 --- a/include/llvm/Linker/Linker.h +++ b/include/llvm/Linker/Linker.h @@ -72,7 +72,6 @@ class Linker { Linker(Module *M); Module *getModule() const { return Composite; } - void deleteModule(); /// \brief Link \p Src into the composite. The source is destroyed. /// Passing OverrideSymbols as true will have symbols from Src diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp index 9aff43f31990..7ba622f6ee1d 100644 --- a/lib/Linker/LinkModules.cpp +++ b/lib/Linker/LinkModules.cpp @@ -2051,11 +2051,6 @@ Linker::Linker(Module *M) Composite->getContext().diagnose(DI); }) {} -void Linker::deleteModule() { - delete Composite; - Composite = nullptr; -} - bool Linker::linkInModule(Module *Src, unsigned Flags, const FunctionInfoIndex *Index, Function *FuncToImport) { From 6cb642d909d9120d35d6ce3a499b075cc3b99a5d Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Tue, 1 Dec 2015 19:06:36 +0000 Subject: [PATCH 091/186] [Verifier] Improve error for cross-module refs By including the module name in the error message. This makes the error message much more useful and saves a trip to the debugger. 
Reviewers: dexonsmith Subscribers: dexonsmith, llvm-commits Differential Revision: http://reviews.llvm.org/D14473 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254437 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/IR/Verifier.cpp | 13 +++++++--- unittests/IR/VerifierTest.cpp | 47 +++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 3 deletions(-) diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp index 617d965f4cfc..5cbb597ca269 100644 --- a/lib/IR/Verifier.cpp +++ b/lib/IR/Verifier.cpp @@ -95,6 +95,12 @@ struct VerifierSupport { Write(&*I); } + void Write(const Module *M) { + if (!M) + return; + OS << "; ModuleID = '" << M->getModuleIdentifier() << "'\n"; + } + void Write(const Value *V) { if (!V) return; @@ -1721,7 +1727,8 @@ void Verifier::visitFunction(const Function &F) { auto *Per = dyn_cast(F.getPersonalityFn()->stripPointerCasts()); if (Per) Assert(Per->getParent() == F.getParent(), - "Referencing personality function in another module!", &F, Per); + "Referencing personality function in another module!", + &F, F.getParent(), Per, Per->getParent()); } if (F.isMaterializable()) { @@ -3165,7 +3172,7 @@ void Verifier::visitInstruction(Instruction &I) { " donothing or patchpoint", &I); Assert(F->getParent() == M, "Referencing function in another module!", - &I); + &I, M, F, F->getParent()); } else if (BasicBlock *OpBB = dyn_cast(I.getOperand(i))) { Assert(OpBB->getParent() == BB->getParent(), "Referring to a basic block in another function!", &I); @@ -3173,7 +3180,7 @@ void Verifier::visitInstruction(Instruction &I) { Assert(OpArg->getParent() == BB->getParent(), "Referring to an argument in another function!", &I); } else if (GlobalValue *GV = dyn_cast(I.getOperand(i))) { - Assert(GV->getParent() == M, "Referencing global in another module!", &I); + Assert(GV->getParent() == M, "Referencing global in another module!", &I, M, GV, GV->getParent()); } else if (isa(I.getOperand(i))) { verifyDominatesUse(I, i); } else if 
(isa(I.getOperand(i))) { diff --git a/unittests/IR/VerifierTest.cpp b/unittests/IR/VerifierTest.cpp index 71e3168b686b..4e94b4375f92 100644 --- a/unittests/IR/VerifierTest.cpp +++ b/unittests/IR/VerifierTest.cpp @@ -60,5 +60,52 @@ TEST(VerifierTest, InvalidRetAttribute) { "Attribute 'uwtable' only applies to functions!")); } +TEST(VerifierTest, CrossModuleRef) { + LLVMContext &C = getGlobalContext(); + Module M1("M1", C); + Module M2("M2", C); + Module M3("M2", C); + FunctionType *FTy = FunctionType::get(Type::getInt32Ty(C), /*isVarArg=*/false); + Function *F1 = cast(M1.getOrInsertFunction("foo1", FTy)); + Function *F2 = cast(M2.getOrInsertFunction("foo2", FTy)); + Function *F3 = cast(M3.getOrInsertFunction("foo3", FTy)); + + BasicBlock *Entry1 = BasicBlock::Create(C, "entry", F1); + BasicBlock *Entry3 = BasicBlock::Create(C, "entry", F3); + + // BAD: Referencing function in another module + CallInst::Create(F2,"call",Entry1); + + // BAD: Referencing personality routine in another module + F3->setPersonalityFn(F2); + + // Fill in the body + Constant *ConstZero = ConstantInt::get(Type::getInt32Ty(C), 0); + ReturnInst::Create(C, ConstZero, Entry1); + ReturnInst::Create(C, ConstZero, Entry3); + + std::string Error; + raw_string_ostream ErrorOS(Error); + EXPECT_FALSE(verifyModule(M2, &ErrorOS)); + EXPECT_TRUE(verifyModule(M1, &ErrorOS)); + EXPECT_TRUE(StringRef(ErrorOS.str()).equals( + "Referencing function in another module!\n" + " %call = call i32 @foo2()\n" + "; ModuleID = 'M1'\n" + "i32 ()* @foo2\n" + "; ModuleID = 'M2'\n")); + + Error.clear(); + EXPECT_TRUE(verifyModule(M3, &ErrorOS)); + EXPECT_TRUE(StringRef(ErrorOS.str()).startswith( + "Referencing personality function in another module!")); + + // Erase bad methods to avoid triggering an assertion failure on destruction + F1->eraseFromParent(); + F3->eraseFromParent(); +} + + + } } From 2320de6adb4f38633ae1698ac522def2bf0aacc5 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 1 Dec 2015 19:08:39 +0000 
Subject: [PATCH 092/186] AMDGPU: Report extractelement as free in cost model The cost for scalarized operations is computed as N * (scalar operation cost + 1 extractelement + 1 insertelement). This partially fixes inflating the cost of scalarized operations since every operation is scalarized and free. I don't think we want any cost associated with scalarization, but for now insertelement is still counted. I'm not sure if we should pretend that insertelement is also free, or add a way to compute a custom scalarization cost. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254438 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 11 ++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 2 + .../CostModel/AMDGPU/extractelement.ll | 110 ++++++++++++++++++ test/Analysis/CostModel/AMDGPU/lit.local.cfg | 2 + 4 files changed, 125 insertions(+) create mode 100644 test/Analysis/CostModel/AMDGPU/extractelement.ll create mode 100644 test/Analysis/CostModel/AMDGPU/lit.local.cfg diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 6dacc742b129..4afcc60984fc 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -80,3 +80,14 @@ unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { // Semi-arbitrary large amount. return 64; } + +int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, + unsigned Index) { + switch (Opcode) { + case Instruction::ExtractElement: + // Dynamic indexing isn't free and is best avoided. + return Index == ~0u ? 
2 : 0; + default: + return BaseT::getVectorInstrCost(Opcode, ValTy, Index); + } +} diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index dee0a69d1e68..5a94a0ba4706 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -60,6 +60,8 @@ class AMDGPUTTIImpl : public BasicTTIImplBase { unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); unsigned getMaxInterleaveFactor(unsigned VF); + + int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index); }; } // end namespace llvm diff --git a/test/Analysis/CostModel/AMDGPU/extractelement.ll b/test/Analysis/CostModel/AMDGPU/extractelement.ll new file mode 100644 index 000000000000..c328d7686466 --- /dev/null +++ b/test/Analysis/CostModel/AMDGPU/extractelement.ll @@ -0,0 +1,110 @@ +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s + +; CHECK: 'extractelement_v2i32' +; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i32> +define void @extractelement_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) { + %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr + %elt = extractelement <2 x i32> %vec, i32 1 + store i32 %elt, i32 addrspace(1)* %out + ret void +} + +; CHECK: 'extractelement_v2f32' +; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x float> +define void @extractelement_v2f32(float addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) { + %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr + %elt = extractelement <2 x float> %vec, i32 1 + store float %elt, float addrspace(1)* %out + ret void +} + +; CHECK: 'extractelement_v3i32' +; CHECK: estimated cost of 0 for {{.*}} extractelement <3 x i32> +define void @extractelement_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr) { + %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr + %elt = extractelement <3 x i32> %vec, i32 1 + store i32 
%elt, i32 addrspace(1)* %out + ret void +} + +; CHECK: 'extractelement_v4i32' +; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i32> +define void @extractelement_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr) { + %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr + %elt = extractelement <4 x i32> %vec, i32 1 + store i32 %elt, i32 addrspace(1)* %out + ret void +} + +; CHECK: 'extractelement_v8i32' +; CHECK: estimated cost of 0 for {{.*}} extractelement <8 x i32> +define void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr) { + %vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr + %elt = extractelement <8 x i32> %vec, i32 1 + store i32 %elt, i32 addrspace(1)* %out + ret void +} + +; FIXME: Should be non-0 +; CHECK: 'extractelement_v8i32_dynindex' +; CHECK: estimated cost of 2 for {{.*}} extractelement <8 x i32> +define void @extractelement_v8i32_dynindex(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr, i32 %idx) { + %vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr + %elt = extractelement <8 x i32> %vec, i32 %idx + store i32 %elt, i32 addrspace(1)* %out + ret void +} + +; CHECK: 'extractelement_v2i64' +; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i64> +define void @extractelement_v2i64(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) { + %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr + %elt = extractelement <2 x i64> %vec, i64 1 + store i64 %elt, i64 addrspace(1)* %out + ret void +} + +; CHECK: 'extractelement_v3i64' +; CHECK: estimated cost of 0 for {{.*}} extractelement <3 x i64> +define void @extractelement_v3i64(i64 addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr) { + %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr + %elt = extractelement <3 x i64> %vec, i64 1 + store i64 %elt, i64 addrspace(1)* %out + ret void +} + +; CHECK: 'extractelement_v4i64' +; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i64> +define void @extractelement_v4i64(i64 
addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr) { + %vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr + %elt = extractelement <4 x i64> %vec, i64 1 + store i64 %elt, i64 addrspace(1)* %out + ret void +} + +; CHECK: 'extractelement_v8i64' +; CHECK: estimated cost of 0 for {{.*}} extractelement <8 x i64> +define void @extractelement_v8i64(i64 addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr) { + %vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr + %elt = extractelement <8 x i64> %vec, i64 1 + store i64 %elt, i64 addrspace(1)* %out + ret void +} + +; CHECK: 'extractelement_v4i8' +; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i8> +define void @extractelement_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %vaddr) { + %vec = load <4 x i8>, <4 x i8> addrspace(1)* %vaddr + %elt = extractelement <4 x i8> %vec, i8 1 + store i8 %elt, i8 addrspace(1)* %out + ret void +} + +; CHECK: 'extractelement_v2i16' +; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i16> +define void @extractelement_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) { + %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr + %elt = extractelement <2 x i16> %vec, i16 1 + store i16 %elt, i16 addrspace(1)* %out + ret void +} diff --git a/test/Analysis/CostModel/AMDGPU/lit.local.cfg b/test/Analysis/CostModel/AMDGPU/lit.local.cfg new file mode 100644 index 000000000000..2a665f06be72 --- /dev/null +++ b/test/Analysis/CostModel/AMDGPU/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'AMDGPU' in config.root.targets: + config.unsupported = True From 6a07b977968eafc8ead24c5b973fd44091c8aa73 Mon Sep 17 00:00:00 2001 From: Weiming Zhao Date: Tue, 1 Dec 2015 19:17:49 +0000 Subject: [PATCH 093/186] [AArch64] Fix a corner case in BitField select Summary: When there are no useful bits, BitWidth becomes 0 and APInt will not be happy. See https://llvm.org/bugs/show_bug.cgi?id=25571 We can just mark the operand as IMPLICIT_DEF if none of its bits are used. 
Reviewers: t.p.northover, jmolloy Subscribers: gberry, jmolloy, mgrang, aemerson, llvm-commits, rengolin Differential Revision: http://reviews.llvm.org/D14803 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254440 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 16 +++++++++++----- test/CodeGen/AArch64/bitfield-insert.ll | 22 ++++++++++++++++++++++ 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 4311198403fa..6c868880bcac 100644 --- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -1974,7 +1974,8 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, // f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2 static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, SDValue &Src, unsigned &ImmR, - unsigned &ImmS, SelectionDAG *CurDAG) { + unsigned &ImmS, const APInt &UsefulBits, + SelectionDAG *CurDAG) { assert(N->getOpcode() == ISD::OR && "Expect a OR operation"); // Set Opc @@ -1988,8 +1989,6 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, // Because of simplify-demanded-bits in DAGCombine, involved masks may not // have the expected shape. Try to undo that. - APInt UsefulBits; - getUsefulBits(SDValue(N, 0), UsefulBits); unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros(); unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros(); @@ -2083,11 +2082,18 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) { unsigned Opc; unsigned LSB, MSB; SDValue Opd0, Opd1; + EVT VT = N->getValueType(0); + APInt NUsefulBits; + getUsefulBits(SDValue(N, 0), NUsefulBits); + + // If all bits are not useful, just return UNDEF. 
+ if (!NUsefulBits) + return CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, VT); - if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, CurDAG)) + if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, NUsefulBits, + CurDAG)) return nullptr; - EVT VT = N->getValueType(0); SDLoc dl(N); SDValue Ops[] = { Opd0, Opd1, diff --git a/test/CodeGen/AArch64/bitfield-insert.ll b/test/CodeGen/AArch64/bitfield-insert.ll index 9df51dcc4478..509b547a5c82 100644 --- a/test/CodeGen/AArch64/bitfield-insert.ll +++ b/test/CodeGen/AArch64/bitfield-insert.ll @@ -215,3 +215,25 @@ define void @test_32bit_opnd1_better(i32* %existing, i32* %new) { ret void } + +; Tests when all the bits from one operand are not useful +define i32 @test_nouseful_bits(i8 %a, i32 %b) { +; CHECK-LABEL: test_nouseful_bits: +; CHECK: bfi +; CHECK: bfi +; CHECK: bfi +; CHECK-NOT: bfi +; CHECK-NOT: or +; CHECK: lsl + %conv = zext i8 %a to i32 ; 0 0 0 A + %shl = shl i32 %b, 8 ; B2 B1 B0 0 + %or = or i32 %conv, %shl ; B2 B1 B0 A + %shl.1 = shl i32 %or, 8 ; B1 B0 A 0 + %or.1 = or i32 %conv, %shl.1 ; B1 B0 A A + %shl.2 = shl i32 %or.1, 8 ; B0 A A 0 + %or.2 = or i32 %conv, %shl.2 ; B0 A A A + %shl.3 = shl i32 %or.2, 8 ; A A A 0 + %or.3 = or i32 %conv, %shl.3 ; A A A A + %shl.4 = shl i32 %or.3, 8 ; A A A 0 + ret i32 %shl.4 +} From 8e31526513c32e9e432bcd98abee665aca8c0dcf Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Tue, 1 Dec 2015 19:17:58 +0000 Subject: [PATCH 094/186] [llvm-dwp] Deduplicate strings in the debug_str.dwo section Also, ensure that references to those strings in debug_str_offsets.dwo correctly refer to the deduplicated strings. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254441 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/tools/llvm-dwp/X86/simple.test | 6 ++--- tools/llvm-dwp/llvm-dwp.cpp | 34 +++++++++++++++++------------ 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/test/tools/llvm-dwp/X86/simple.test b/test/tools/llvm-dwp/X86/simple.test index d70bdda072c5..754450f13f3f 100644 --- a/test/tools/llvm-dwp/X86/simple.test +++ b/test/tools/llvm-dwp/X86/simple.test @@ -46,13 +46,11 @@ FIXME: Emit and verify the cu_index contents CHECK: .debug_str.dwo contents: CHECK: "clang version CHECK: 0x[[ACPP:.*]]: "a.cpp" -FIXME: Remove duplicates -CHECK: 0x[[SECONDREV:.*]]: "clang version +CHECK-NOT: "clang version CHECK: 0x[[BCPP:.*]]: "b.cpp" CHECK: .debug_str_offsets.dwo contents: CHECK: : 00000000 CHECK: : [[ACPP]] -CHECK: : [[SECONDREV]] -FIXME: Update str offset indexes, this should be BCPP \/ +CHECK: : 00000000 CHECK: : [[BCPP]] diff --git a/tools/llvm-dwp/llvm-dwp.cpp b/tools/llvm-dwp/llvm-dwp.cpp index c89be222e6c9..9ce37ec2ceee 100644 --- a/tools/llvm-dwp/llvm-dwp.cpp +++ b/tools/llvm-dwp/llvm-dwp.cpp @@ -40,8 +40,9 @@ static int error(const Twine &Error, const Twine &Context) { static std::error_code writeStringsAndOffsets(MCStreamer &Out, StringMap &Strings, - uint32_t &StringOffset, MCSection *StrOffsetSection, - StringRef CurStrSection, StringRef CurStrOffsetSection) { + uint32_t &StringOffset, MCSection *StrSection, + MCSection *StrOffsetSection, StringRef CurStrSection, + StringRef CurStrOffsetSection) { // Could possibly produce an error or warning if one of these was non-null but // the other was null. 
if (CurStrSection.empty() || CurStrOffsetSection.empty()) @@ -54,9 +55,14 @@ writeStringsAndOffsets(MCStreamer &Out, StringMap &Strings, uint32_t PrevOffset = 0; while (const char *s = Data.getCStr(&LocalOffset)) { StringRef Str(s, LocalOffset - PrevOffset - 1); - OffsetRemapping[PrevOffset] = StringOffset; - // insert, if successful, write new string to the str.dwo section - StringOffset += Str.size() + 1; + auto Pair = Strings.insert(std::make_pair(Str, StringOffset)); + if (Pair.second) { + Out.SwitchSection(StrSection); + Out.EmitBytes( + StringRef(Pair.first->getKeyData(), Pair.first->getKeyLength() + 1)); + StringOffset += Str.size() + 1; + } + OffsetRemapping[PrevOffset] = Pair.first->second; PrevOffset = LocalOffset; } @@ -106,19 +112,19 @@ static std::error_code write(MCStreamer &Out, ArrayRef Inputs) { StringRef Contents; if (auto Err = Section.getContents(Contents)) return Err; - if (OutSection == StrOffsetSection) { + if (OutSection == StrOffsetSection) CurStrOffsetSection = Contents; - continue; - } - if (OutSection == StrSection) + else if (OutSection == StrSection) CurStrSection = Contents; - Out.SwitchSection(OutSection); - Out.EmitBytes(Contents); + else { + Out.SwitchSection(OutSection); + Out.EmitBytes(Contents); + } } } - if (auto Err = - writeStringsAndOffsets(Out, Strings, StringOffset, StrOffsetSection, - CurStrSection, CurStrOffsetSection)) + if (auto Err = writeStringsAndOffsets(Out, Strings, StringOffset, + StrSection, StrOffsetSection, + CurStrSection, CurStrOffsetSection)) return Err; } return std::error_code(); From bedc55e06360f5baa33cd63e7543294463773931 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Tue, 1 Dec 2015 19:19:18 +0000 Subject: [PATCH 095/186] fix typo; NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254442 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/TwoAddressInstructionPass.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp 
b/lib/CodeGen/TwoAddressInstructionPass.cpp index c96c813b0c9b..c407d594addd 100644 --- a/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -1181,7 +1181,7 @@ bool TwoAddressInstructionPass::tryInstructionCommute(MachineInstr *MI, unsigned OtherOpIdx = MI->getDesc().getNumDefs(); for (; OtherOpIdx < OpsNum; OtherOpIdx++) { // The call of findCommutedOpIndices below only checks if BaseOpIdx - // and OtherOpIdx are commutable, it does not really searches for + // and OtherOpIdx are commutable, it does not really search for // other commutable operands and does not change the values of passed // variables. if (OtherOpIdx == BaseOpIdx || From 8efaea8c4583732bd030292a59d3081a176242bf Mon Sep 17 00:00:00 2001 From: Artyom Skrobov Date: Tue, 1 Dec 2015 19:25:11 +0000 Subject: [PATCH 096/186] Fix Thumb1 epilogue generation Summary: This had been broken for a very long time, but nobody noticed until D14357 enabled shrink-wrapping by default. Reviewers: jroelofs, qcolombet Subscribers: tyomitch, llvm-commits, rengolin Differential Revision: http://reviews.llvm.org/D14986 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254444 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/Thumb1FrameLowering.cpp | 67 ++++++++++++++++++++----- test/CodeGen/Thumb/pop-special-fixup.ll | 60 ++++++++++++++++++++++ 2 files changed, 115 insertions(+), 12 deletions(-) create mode 100644 test/CodeGen/Thumb/pop-special-fixup.ll diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp index 064cff6f5704..fd96af6cb6e0 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -406,11 +406,15 @@ bool Thumb1FrameLowering::needPopSpecialFixUp(const MachineFunction &MF) const { if (AFI->getArgRegsSaveSize()) return true; - bool IsV4PopReturn = false; + // FIXME: this doesn't make sense, and the following patch will remove it. 
+ if (!STI.hasV4TOps()) return false; + + // LR cannot be encoded with Thumb1, i.e., it requires a special fix-up. for (const CalleeSavedInfo &CSI : MF.getFrameInfo()->getCalleeSavedInfo()) if (CSI.getReg() == ARM::LR) - IsV4PopReturn = true; - return IsV4PopReturn && STI.hasV4TOps() && !STI.hasV5TOps(); + return true; + + return false; } bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, @@ -422,12 +426,45 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, const ThumbRegisterInfo *RegInfo = static_cast(STI.getRegisterInfo()); - // When we need a special fix up for POP, this means that - // we either cannot use PC in POP or we have to update - // SP after poping the return address. - // In other words, we cannot use a pop {pc} like construction - // here, no matter what. + // If MBBI is a return instruction, or is a tPOP followed by a return + // instruction in the successor BB, we may be able to directly restore + // LR in the PC. + // This is only possible with v5T ops (v4T can't change the Thumb bit via + // a POP PC instruction), and only if we do not need to emit any SP update. + // Otherwise, we need a temporary register to pop the value + // and copy that value into LR. auto MBBI = MBB.getFirstTerminator(); + bool CanRestoreDirectly = STI.hasV5TOps() && !ArgRegsSaveSize; + if (CanRestoreDirectly) { + if (MBBI != MBB.end()) + CanRestoreDirectly = (MBBI->getOpcode() == ARM::tBX_RET || + MBBI->getOpcode() == ARM::tPOP_RET); + else { + assert(MBB.back().getOpcode() == ARM::tPOP); + assert(MBB.succ_size() == 1); + if ((*MBB.succ_begin())->begin()->getOpcode() == ARM::tBX_RET) + MBBI--; // Replace the final tPOP with a tPOP_RET. 
+ else + CanRestoreDirectly = false; + } + } + + if (CanRestoreDirectly) { + if (!DoIt || MBBI->getOpcode() == ARM::tPOP_RET) + return true; + MachineInstrBuilder MIB = + AddDefaultPred( + BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP_RET))); + // Copy implicit ops and popped registers, if any. + for (auto MO: MBBI->operands()) + if (MO.isReg() && (MO.isImplicit() || MO.isDef()) && + MO.getReg() != ARM::LR) + MIB.addOperand(MO); + MIB.addReg(ARM::PC, RegState::Define); + // Erase the old instruction (tBX_RET or tPOP). + MBB.erase(MBBI); + return true; + } // Look for a temporary register to use. // First, compute the liveness information. @@ -446,10 +483,10 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, if (MBBI != MBB.end()) { dl = MBBI->getDebugLoc(); auto InstUpToMBBI = MBB.end(); - // The post-decrement is on purpose here. - // We want to have the liveness right before MBBI. - while (InstUpToMBBI-- != MBBI) - UsedRegs.stepBackward(*InstUpToMBBI); + while (InstUpToMBBI != MBBI) + // The pre-decrement is on purpose here. + // We want to have the liveness right before MBBI. + UsedRegs.stepBackward(*--InstUpToMBBI); } // Look for a register that can be directly use in the POP. 
@@ -495,6 +532,12 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, .addReg(PopReg, RegState::Kill)); } + if (MBBI == MBB.end()) { + MachineInstr& Pop = MBB.back(); + assert(Pop.getOpcode() == ARM::tPOP); + Pop.RemoveOperand(Pop.findRegisterDefOperandIdx(ARM::LR)); + } + assert(PopReg && "Do not know how to get LR"); AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP))) .addReg(PopReg, RegState::Define); diff --git a/test/CodeGen/Thumb/pop-special-fixup.ll b/test/CodeGen/Thumb/pop-special-fixup.ll new file mode 100644 index 000000000000..9ba589d6cec3 --- /dev/null +++ b/test/CodeGen/Thumb/pop-special-fixup.ll @@ -0,0 +1,60 @@ +; RUN: llc %s -enable-shrink-wrap=true -o - | FileCheck %s + +target triple = "thumbv6m-none-none-eabi" + +@retval = global i32 0, align 4 + +define i32 @test(i32 %i, i32 %argc, i8** nocapture readonly %argv) { + %1 = icmp sgt i32 %argc, %i + br i1 %1, label %2, label %19 + + %3 = getelementptr inbounds i8*, i8** %argv, i32 %i + %4 = load i8*, i8** %3, align 4 + %5 = load i8, i8* %4, align 1 + %6 = icmp eq i8 %5, 45 + %7 = getelementptr inbounds i8, i8* %4, i32 1 + %. 
= select i1 %6, i8* %7, i8* %4 + %.1 = select i1 %6, i32 -1, i32 1 + %8 = load i8, i8* %., align 1 + %.off2 = add i8 %8, -48 + %9 = icmp ult i8 %.off2, 10 + %.pre = load i32, i32* @retval, align 4 + br i1 %9, label %.lr.ph.preheader, label %.critedge + +.lr.ph.preheader: ; preds = %2 + br label %.lr.ph + +.lr.ph: ; preds = %.lr.ph.preheader, %.lr.ph + %10 = phi i32 [ %14, %.lr.ph ], [ %.pre, %.lr.ph.preheader ] + %11 = phi i8 [ %15, %.lr.ph ], [ %8, %.lr.ph.preheader ] + %valstring.03 = phi i8* [ %13, %.lr.ph ], [ %., %.lr.ph.preheader ] + %12 = zext i8 %11 to i32 + %13 = getelementptr inbounds i8, i8* %valstring.03, i32 1 + %14 = add nsw i32 %10, %12 + store i32 %14, i32* @retval, align 4 + %15 = load i8, i8* %13, align 1 + %.off = add i8 %15, -48 + %16 = icmp ult i8 %.off, 10 + br i1 %16, label %.lr.ph, label %.critedge.loopexit + +.critedge.loopexit: ; preds = %.lr.ph + %.lcssa = phi i32 [ %14, %.lr.ph ] + br label %.critedge + +.critedge: ; preds = %.critedge.loopexit, %2 + %17 = phi i32 [ %.pre, %2 ], [ %.lcssa, %.critedge.loopexit ] + %18 = mul nsw i32 %17, %.1 + store i32 %18, i32* @retval, align 4 + br label %19 + +;