From 0dd0d39ecd32367375708455da1e22bd9e30fa38 Mon Sep 17 00:00:00 2001
From: Zlatko Buljan <Zlatko.Buljan@imgtec.com>
Date: Mon, 30 Nov 2015 08:37:38 +0000
Subject: [PATCH 001/186] [mips][microMIPS] Implement PRECR.QB.PH,
 PRECR_SRA[_R].PH.W, PRECRQ.PH.W, PRECRQ.QB.PH, PRECRQU_S.QB.PH and
 PRECRQ_RS.PH.W instructions Differential Revision:
 http://reviews.llvm.org/D14605

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254291 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Mips/MicroMipsDSPInstrFormats.td   | 12 +++++++++++
 lib/Target/Mips/MicroMipsDSPInstrInfo.td      | 20 +++++++++++++++++++
 lib/Target/Mips/MipsDSPInstrInfo.td           | 16 ++++++++-------
 .../Disassembler/Mips/micromips-dsp/valid.txt |  4 ++++
 .../Mips/micromips-dspr2/valid.txt            |  7 +++++++
 test/MC/Mips/micromips-dsp/valid.s            |  4 ++++
 test/MC/Mips/micromips-dspr2/valid.s          |  7 +++++++
 7 files changed, 63 insertions(+), 7 deletions(-)
diff --git a/lib/Target/Mips/MicroMipsDSPInstrFormats.td b/lib/Target/Mips/MicroMipsDSPInstrFormats.td
index 65c8303f25fe..d3f9fe31afb7 100644
--- a/lib/Target/Mips/MicroMipsDSPInstrFormats.td
+++ b/lib/Target/Mips/MicroMipsDSPInstrFormats.td
@@ -141,3 +141,15 @@ class POOL32A_1RIMM5AC_FMT<string opstr, bits<8> funct> : MMDSPInst<opstr> {
   let Inst{13-6}  = funct;
   let Inst{5-0}   = 0b111100;
 }
+
+class POOL32A_2RSA5_FMT<string opstr, bits<11> op> : MMDSPInst<opstr> {
+  bits<5> rt;
+  bits<5> rs;
+  bits<5> sa;
+
+  let Inst{31-26} = 0b000000;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-11} = sa;
+  let Inst{10-0}  = op;
+}
diff --git a/lib/Target/Mips/MicroMipsDSPInstrInfo.td b/lib/Target/Mips/MicroMipsDSPInstrInfo.td
index b2e5ec61c8b4..f515f380f0db 100644
--- a/lib/Target/Mips/MicroMipsDSPInstrInfo.td
+++ b/lib/Target/Mips/MicroMipsDSPInstrInfo.td
@@ -120,6 +120,16 @@ class MULQ_RS_PH_MM_ENC : POOL32A_3RB0_FMT<"mulq_rs.ph", 0b0100010101>;
 class MULQ_RS_W_MMR2_ENC : POOL32A_3RB0_FMT<"mulq_rs.w", 0b0110010101>;
 class MULQ_S_PH_MMR2_ENC : POOL32A_3RB0_FMT<"mulq_s.ph", 0b0101010101>;
 class MULQ_S_W_MMR2_ENC : POOL32A_3RB0_FMT<"mulq_s.w", 0b0111010101>;
+class PRECR_QB_PH_MMR2_ENC : POOL32A_3RB0_FMT<"precr.qb.ph", 0b0001101101>;
+class PRECR_SRA_PH_W_MMR2_ENC
+    : POOL32A_2RSA5_FMT<"precr_sra.ph.w", 0b01111001101>;
+class PRECR_SRA_R_PH_W_MMR2_ENC
+    : POOL32A_2RSA5_FMT<"precr_sra_r.ph.w", 0b11111001101>;
+class PRECRQ_PH_W_MM_ENC : POOL32A_3RB0_FMT<"precrq.ph.w", 0b0011101101>;
+class PRECRQ_QB_PH_MM_ENC : POOL32A_3RB0_FMT<"precrq.qb.ph", 0b0010101101>;
+class PRECRQU_S_QB_PH_MM_ENC
+    : POOL32A_3RB0_FMT<"precrqu_s.qb.ph", 0b0101101101>;
+class PRECRQ_RS_PH_W_MM_ENC : POOL32A_3RB0_FMT<"precrq_rs.ph.w", 0b0100101101>;
 
 // Instruction desc.
 class ABSQ_S_PH_MM_R2_DESC_BASE<string opstr, SDPatternOperator OpNode,
@@ -354,6 +364,10 @@ def MULEQ_S_W_PHR_MM : DspMMRel, MULEQ_S_W_PHR_MM_ENC, MULEQ_S_W_PHR_DESC;
 def MULEU_S_PH_QBL_MM : DspMMRel, MULEU_S_PH_QBL_MM_ENC, MULEU_S_PH_QBL_DESC;
 def MULEU_S_PH_QBR_MM : DspMMRel, MULEU_S_PH_QBR_MM_ENC, MULEU_S_PH_QBR_DESC;
 def MULQ_RS_PH_MM : DspMMRel, MULQ_RS_PH_MM_ENC, MULQ_RS_PH_DESC;
+def PRECRQ_PH_W_MM : DspMMRel, PRECRQ_PH_W_MM_ENC, PRECRQ_PH_W_DESC;
+def PRECRQ_QB_PH_MM : DspMMRel, PRECRQ_QB_PH_MM_ENC, PRECRQ_QB_PH_DESC;
+def PRECRQU_S_QB_PH_MM : DspMMRel, PRECRQU_S_QB_PH_MM_ENC, PRECRQU_S_QB_PH_DESC;
+def PRECRQ_RS_PH_W_MM : DspMMRel, PRECRQ_RS_PH_W_MM_ENC, PRECRQ_RS_PH_W_DESC;
 // microMIPS DSP Rev 2
 def ABSQ_S_QB_MMR2 : DspMMRel, ABSQ_S_QB_MMR2_ENC, ABSQ_S_QB_MMR2_DESC,
                      ISA_DSPR2;
@@ -398,3 +412,9 @@ def MUL_S_PH_MMR2 : DspMMRel, MUL_S_PH_MMR2_ENC, MUL_S_PH_DESC, ISA_DSPR2;
 def MULQ_RS_W_MMR2 : DspMMRel, MULQ_RS_W_MMR2_ENC, MULQ_RS_W_DESC, ISA_DSPR2;
 def MULQ_S_PH_MMR2 : DspMMRel, MULQ_S_PH_MMR2_ENC, MULQ_S_PH_DESC, ISA_DSPR2;
 def MULQ_S_W_MMR2 : DspMMRel, MULQ_S_W_MMR2_ENC, MULQ_S_W_DESC, ISA_DSPR2;
+def PRECR_QB_PH_MMR2 : DspMMRel, PRECR_QB_PH_MMR2_ENC, PRECR_QB_PH_DESC,
+                       ISA_DSPR2;
+def PRECR_SRA_PH_W_MMR2 : DspMMRel, PRECR_SRA_PH_W_MMR2_ENC,
+                          PRECR_SRA_PH_W_DESC, ISA_DSPR2;
+def PRECR_SRA_R_PH_W_MMR2 : DspMMRel, PRECR_SRA_R_PH_W_MMR2_ENC,
+                            PRECR_SRA_R_PH_W_DESC, ISA_DSPR2;
diff --git a/lib/Target/Mips/MipsDSPInstrInfo.td b/lib/Target/Mips/MipsDSPInstrInfo.td
index 9b4b9d178183..91513f3f2c63 100644
--- a/lib/Target/Mips/MipsDSPInstrInfo.td
+++ b/lib/Target/Mips/MipsDSPInstrInfo.td
@@ -295,6 +295,7 @@ class CMP_EQ_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
   string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
   list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))];
   InstrItinClass Itinerary = itin;
+  string BaseOpcode = instr_asm;
 }
 
 class PRECR_SRA_PH_W_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -306,6 +307,7 @@ class PRECR_SRA_PH_W_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
   list<dag> Pattern = [(set ROT:$rt, (OpNode ROS:$src, ROS:$rs, immZExt5:$sa))];
   InstrItinClass Itinerary = itin;
   string Constraints = "$src = $rt";
+  string BaseOpcode = instr_asm;
 }
 
 class ABSQ_S_PH_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -1105,10 +1107,10 @@ def MODSUB : MODSUB_ENC, MODSUB_DESC;
 def RADDU_W_QB : RADDU_W_QB_ENC, RADDU_W_QB_DESC;
 def ABSQ_S_PH : DspMMRel, ABSQ_S_PH_ENC, ABSQ_S_PH_DESC;
 def ABSQ_S_W : DspMMRel, ABSQ_S_W_ENC, ABSQ_S_W_DESC;
-def PRECRQ_QB_PH : PRECRQ_QB_PH_ENC, PRECRQ_QB_PH_DESC;
-def PRECRQ_PH_W : PRECRQ_PH_W_ENC, PRECRQ_PH_W_DESC;
-def PRECRQ_RS_PH_W : PRECRQ_RS_PH_W_ENC, PRECRQ_RS_PH_W_DESC;
-def PRECRQU_S_QB_PH : PRECRQU_S_QB_PH_ENC, PRECRQU_S_QB_PH_DESC;
+def PRECRQ_QB_PH : DspMMRel, PRECRQ_QB_PH_ENC, PRECRQ_QB_PH_DESC;
+def PRECRQ_PH_W : DspMMRel, PRECRQ_PH_W_ENC, PRECRQ_PH_W_DESC;
+def PRECRQ_RS_PH_W : DspMMRel, PRECRQ_RS_PH_W_ENC, PRECRQ_RS_PH_W_DESC;
+def PRECRQU_S_QB_PH : DspMMRel, PRECRQU_S_QB_PH_ENC, PRECRQU_S_QB_PH_DESC;
 def PRECEQ_W_PHL : DspMMRel, PRECEQ_W_PHL_ENC, PRECEQ_W_PHL_DESC;
 def PRECEQ_W_PHR : DspMMRel, PRECEQ_W_PHR_ENC, PRECEQ_W_PHR_DESC;
 def PRECEQU_PH_QBL : DspMMRel, PRECEQU_PH_QBL_ENC, PRECEQU_PH_QBL_DESC;
@@ -1240,9 +1242,9 @@ def DPSX_W_PH : DspMMRel, DPSX_W_PH_ENC, DPSX_W_PH_DESC;
 def DPSQX_S_W_PH : DspMMRel, DPSQX_S_W_PH_ENC, DPSQX_S_W_PH_DESC;
 def DPSQX_SA_W_PH : DspMMRel, DPSQX_SA_W_PH_ENC, DPSQX_SA_W_PH_DESC;
 def MULSA_W_PH : MULSA_W_PH_ENC, MULSA_W_PH_DESC;
-def PRECR_QB_PH : PRECR_QB_PH_ENC, PRECR_QB_PH_DESC;
-def PRECR_SRA_PH_W : PRECR_SRA_PH_W_ENC, PRECR_SRA_PH_W_DESC;
-def PRECR_SRA_R_PH_W : PRECR_SRA_R_PH_W_ENC, PRECR_SRA_R_PH_W_DESC;
+def PRECR_QB_PH : DspMMRel, PRECR_QB_PH_ENC, PRECR_QB_PH_DESC;
+def PRECR_SRA_PH_W : DspMMRel, PRECR_SRA_PH_W_ENC, PRECR_SRA_PH_W_DESC;
+def PRECR_SRA_R_PH_W : DspMMRel, PRECR_SRA_R_PH_W_ENC, PRECR_SRA_R_PH_W_DESC;
 def SHRA_QB : DspMMRel, SHRA_QB_ENC, SHRA_QB_DESC;
 def SHRAV_QB : DspMMRel, SHRAV_QB_ENC, SHRAV_QB_DESC;
 def SHRA_R_QB : DspMMRel, SHRA_R_QB_ENC, SHRA_R_QB_DESC;
diff --git a/test/MC/Disassembler/Mips/micromips-dsp/valid.txt b/test/MC/Disassembler/Mips/micromips-dsp/valid.txt
index 2bae34e2d833..433022c1e62c 100644
--- a/test/MC/Disassembler/Mips/micromips-dsp/valid.txt
+++ b/test/MC/Disassembler/Mips/micromips-dsp/valid.txt
@@ -42,6 +42,10 @@
 0x01 0xf0 0xb3 0x3c # CHECK: preceu.ph.qbla $15, $16
 0x02 0x32 0xd1 0x3c # CHECK: preceu.ph.qbr $17, $18
 0x02 0x74 0xd3 0x3c # CHECK: preceu.ph.qbra $19, $20
+0x01 0x49 0x40 0xed # CHECK: precrq.ph.w $8, $9, $10
+0x01 0xac 0x58 0xad # CHECK: precrq.qb.ph $11, $12, $13
+0x02 0x0f 0x71 0x6d # CHECK: precrqu_s.qb.ph $14, $15, $16
+0x02 0x72 0x89 0x2d # CHECK: precrq_rs.ph.w $17, $18, $19
 0x00 0x64 0x53 0xb5 # CHECK: shll.ph $3, $4, 5
 0x00 0x64 0x5b 0xb5 # CHECK: shll_s.ph $3, $4, 5
 0x00 0x64 0xa8 0x7c # CHECK: shll.qb $3, $4, 5
diff --git a/test/MC/Disassembler/Mips/micromips-dspr2/valid.txt b/test/MC/Disassembler/Mips/micromips-dspr2/valid.txt
index d35f3f0019d3..8049e0e3174f 100644
--- a/test/MC/Disassembler/Mips/micromips-dspr2/valid.txt
+++ b/test/MC/Disassembler/Mips/micromips-dspr2/valid.txt
@@ -55,6 +55,13 @@
 0x01 0xf0 0xb3 0x3c # CHECK: preceu.ph.qbla $15, $16
 0x02 0x32 0xd1 0x3c # CHECK: preceu.ph.qbr $17, $18
 0x02 0x74 0xd3 0x3c # CHECK: preceu.ph.qbra $19, $20
+0x00 0x62 0x08 0x6d # CHECK: precr.qb.ph $1, $2, $3
+0x00 0x85 0x0b 0xcd # CHECK: precr_sra.ph.w $4, $5, 1
+0x00 0xc7 0x17 0xcd # CHECK: precr_sra_r.ph.w $6, $7, 2
+0x01 0x49 0x40 0xed # CHECK: precrq.ph.w $8, $9, $10
+0x01 0xac 0x58 0xad # CHECK: precrq.qb.ph $11, $12, $13
+0x02 0x0f 0x71 0x6d # CHECK: precrqu_s.qb.ph $14, $15, $16
+0x02 0x72 0x89 0x2d # CHECK: precrq_rs.ph.w $17, $18, $19
 0x00 0x64 0x53 0xb5 # CHECK: shll.ph $3, $4, 5
 0x00 0x64 0x5b 0xb5 # CHECK: shll_s.ph $3, $4, 5
 0x00 0x64 0xa8 0x7c # CHECK: shll.qb $3, $4, 5
diff --git a/test/MC/Mips/micromips-dsp/valid.s b/test/MC/Mips/micromips-dsp/valid.s
index a78f6c1bf21e..24e4a07365ab 100644
--- a/test/MC/Mips/micromips-dsp/valid.s
+++ b/test/MC/Mips/micromips-dsp/valid.s
@@ -43,6 +43,10 @@
   preceu.ph.qbla $15, $16      # CHECK: preceu.ph.qbla $15, $16  # encoding: [0x01,0xf0,0xb3,0x3c]
   preceu.ph.qbr $17, $18       # CHECK: preceu.ph.qbr $17, $18   # encoding: [0x02,0x32,0xd1,0x3c]
   preceu.ph.qbra $19, $20      # CHECK: preceu.ph.qbra $19, $20  # encoding: [0x02,0x74,0xd3,0x3c]
+  precrq.ph.w $8, $9, $10       # CHECK: precrq.ph.w $8, $9, $10       # encoding: [0x01,0x49,0x40,0xed]
+  precrq.qb.ph $11, $12, $13    # CHECK: precrq.qb.ph $11, $12, $13    # encoding: [0x01,0xac,0x58,0xad]
+  precrqu_s.qb.ph $14, $15, $16 # CHECK: precrqu_s.qb.ph $14, $15, $16 # encoding: [0x02,0x0f,0x71,0x6d]
+  precrq_rs.ph.w $17, $18, $19  # CHECK: precrq_rs.ph.w $17, $18, $19  # encoding: [0x02,0x72,0x89,0x2d]
   shll.ph $3, $4, 5            # CHECK: shll.ph $3, $4, 5       # encoding: [0x00,0x64,0x53,0xb5]
   shll_s.ph $3, $4, 5          # CHECK: shll_s.ph $3, $4, 5     # encoding: [0x00,0x64,0x5b,0xb5]
   shll.qb $3, $4, 5            # CHECK: shll.qb $3, $4, 5       # encoding: [0x00,0x64,0xa8,0x7c]
diff --git a/test/MC/Mips/micromips-dspr2/valid.s b/test/MC/Mips/micromips-dspr2/valid.s
index 37949d4f561d..3035ae5ba815 100644
--- a/test/MC/Mips/micromips-dspr2/valid.s
+++ b/test/MC/Mips/micromips-dspr2/valid.s
@@ -56,6 +56,13 @@
   preceu.ph.qbla $15, $16      # CHECK: preceu.ph.qbla $15, $16  # encoding: [0x01,0xf0,0xb3,0x3c]
   preceu.ph.qbr $17, $18       # CHECK: preceu.ph.qbr $17, $18   # encoding: [0x02,0x32,0xd1,0x3c]
   preceu.ph.qbra $19, $20      # CHECK: preceu.ph.qbra $19, $20  # encoding: [0x02,0x74,0xd3,0x3c]
+  precr.qb.ph $1, $2, $3        # CHECK: precr.qb.ph $1, $2, $3        # encoding: [0x00,0x62,0x08,0x6d]
+  precr_sra.ph.w $4, $5, 1      # CHECK: precr_sra.ph.w $4, $5, 1      # encoding: [0x00,0x85,0x0b,0xcd]
+  precr_sra_r.ph.w $6, $7, 2    # CHECK: precr_sra_r.ph.w $6, $7, 2    # encoding: [0x00,0xc7,0x17,0xcd]
+  precrq.ph.w $8, $9, $10       # CHECK: precrq.ph.w $8, $9, $10       # encoding: [0x01,0x49,0x40,0xed]
+  precrq.qb.ph $11, $12, $13    # CHECK: precrq.qb.ph $11, $12, $13    # encoding: [0x01,0xac,0x58,0xad]
+  precrqu_s.qb.ph $14, $15, $16 # CHECK: precrqu_s.qb.ph $14, $15, $16 # encoding: [0x02,0x0f,0x71,0x6d]
+  precrq_rs.ph.w $17, $18, $19  # CHECK: precrq_rs.ph.w $17, $18, $19  # encoding: [0x02,0x72,0x89,0x2d]
   shll.ph $3, $4, 5            # CHECK: shll.ph $3, $4, 5       # encoding: [0x00,0x64,0x53,0xb5]
   shll_s.ph $3, $4, 5          # CHECK: shll_s.ph $3, $4, 5     # encoding: [0x00,0x64,0x5b,0xb5]
   shll.qb $3, $4, 5            # CHECK: shll.qb $3, $4, 5       # encoding: [0x00,0x64,0xa8,0x7c]

From 1d44f044d75cd47bfe717d43b05007f73ed5f3d9 Mon Sep 17 00:00:00 2001
From: Daniel Sanders <daniel.sanders@imgtec.com>
Date: Mon, 30 Nov 2015 09:52:00 +0000
Subject: [PATCH 002/186] [mips][ias] Removed MSA instructions from base
 architecture valid-xfail.s's.

valid-xfail.s is for instructions that should be valid in the given ISA but
incorrectly fail. MSA instructions are correct to fail since MSA is not enabled.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254293 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/MC/Mips/mips32r2/invalid-msa.s | 62 +++++++++++++++++++++++++++++
 test/MC/Mips/mips32r2/valid-xfail.s | 55 -------------------------
 test/MC/Mips/mips32r3/valid-xfail.s | 55 -------------------------
 test/MC/Mips/mips32r5/valid-xfail.s | 55 -------------------------
 test/MC/Mips/mips64r2/valid-xfail.s | 55 -------------------------
 test/MC/Mips/mips64r3/valid-xfail.s | 55 -------------------------
 test/MC/Mips/mips64r5/valid-xfail.s | 55 -------------------------
 7 files changed, 62 insertions(+), 330 deletions(-)
 create mode 100644 test/MC/Mips/mips32r2/invalid-msa.s

diff --git a/test/MC/Mips/mips32r2/invalid-msa.s b/test/MC/Mips/mips32r2/invalid-msa.s
new file mode 100644
index 000000000000..2ad99b147209
--- /dev/null
+++ b/test/MC/Mips/mips32r2/invalid-msa.s
@@ -0,0 +1,62 @@
+# Instructions that are invalid
+#
+# RUN: not llvm-mc %s -triple=mips64-unknown-linux -show-encoding \
+# RUN:     -mcpu=mips32r2 2>%t1
+# RUN: FileCheck %s < %t1
+
+	.set noat
+        and.v           $w10,$w25,$w29 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bmnz.v          $w15,$w2,$w28  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bmz.v           $w13,$w11,$w21 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bsel.v          $w28,$w7,$w0   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        fclass.d        $w14,$w27      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        fclass.w        $w19,$w28      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        fexupl.d        $w10,$w29      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        fexupl.w        $w12,$w27      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        fexupr.d        $w31,$w15      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        fexupr.w        $w29,$w12      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        ffint_s.d       $w1,$w30       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        ffint_s.w       $w16,$w14      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        ffint_u.d       $w23,$w18      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        ffint_u.w       $w19,$w12      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        ffql.d          $w2,$w3        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        ffql.w          $w9,$w0        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        ffqr.d          $w25,$w24      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        ffqr.w          $w10,$w6       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        fill.b          $w9,$v1        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        fill.h          $w9,$8         # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        fill.w          $w31,$15       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        flog2.d         $w12,$w16      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        flog2.w         $w19,$w23      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        frcp.d          $w12,$w4       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        frcp.w          $w30,$w8       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        frint.d         $w20,$w8       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        frint.w         $w11,$w29      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        frsqrt.d        $w29,$w2       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        frsqrt.w        $w9,$w8        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        fsqrt.d         $w3,$w1        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        fsqrt.w         $w5,$w15       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        ftint_s.d       $w31,$w26      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        ftint_s.w       $w27,$w14      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        ftint_u.d       $w5,$w31       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        ftint_u.w       $w12,$w29      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        ftrunc_s.d      $w4,$w22       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        ftrunc_s.w      $w24,$w7       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        ftrunc_u.d      $w20,$w25      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        ftrunc_u.w      $w7,$w26       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        move.v          $w8,$w17       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        nloc.b          $w12,$w30      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        nloc.d          $w16,$w7       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        nloc.h          $w21,$w17      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        nloc.w          $w17,$w16      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        nlzc.b          $w12,$w7       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        nlzc.d          $w14,$w14      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        nlzc.h          $w24,$w24      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        nlzc.w          $w10,$w4       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        nor.v           $w20,$w20,$w15 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        or.v            $w13,$w23,$w12 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        pcnt.b          $w30,$w15      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        pcnt.d          $w5,$w16       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        pcnt.h          $w20,$w24      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        pcnt.w          $w22,$w20      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        xor.v           $w20,$w21,$w30 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
diff --git a/test/MC/Mips/mips32r2/valid-xfail.s b/test/MC/Mips/mips32r2/valid-xfail.s
index 13385d06ce81..658f172aec3d 100644
--- a/test/MC/Mips/mips32r2/valid-xfail.s
+++ b/test/MC/Mips/mips32r2/valid-xfail.s
@@ -28,11 +28,7 @@
         adduh_r.qb      $a0,$9,$12
         addwc           $k0,$s6,$s7
         alnv.ps         $f12,$f18,$f30,$12
-        and.v           $w10,$w25,$w29
         bitrev          $14,$at
-        bmnz.v          $w15,$w2,$w28
-        bmz.v           $w13,$w11,$w21
-        bsel.v          $w28,$w7,$w0
         c.eq.d          $fcc1,$f15,$f15
         c.eq.ps         $fcc5,$f0,$f9
         c.eq.s          $fcc5,$f24,$f17
@@ -124,44 +120,9 @@
         extrv_r.w       $8,$ac1,$s6
         extrv_rs.w      $gp,$ac1,$s6
         extrv_s.h       $s2,$ac1,$14
-        fclass.d        $w14,$w27
-        fclass.w        $w19,$w28
-        fexupl.d        $w10,$w29
-        fexupl.w        $w12,$w27
-        fexupr.d        $w31,$w15
-        fexupr.w        $w29,$w12
-        ffint_s.d       $w1,$w30
-        ffint_s.w       $w16,$w14
-        ffint_u.d       $w23,$w18
-        ffint_u.w       $w19,$w12
-        ffql.d          $w2,$w3
-        ffql.w          $w9,$w0
-        ffqr.d          $w25,$w24
-        ffqr.w          $w10,$w6
-        fill.b          $w9,$v1
-        fill.h          $w9,$8
-        fill.w          $w31,$15
-        flog2.d         $w12,$w16
-        flog2.w         $w19,$w23
         floor.l.d       $f26,$f7
         floor.l.s       $f12,$f5
         fork            $s2,$8,$a0
-        frcp.d          $w12,$w4
-        frcp.w          $w30,$w8
-        frint.d         $w20,$w8
-        frint.w         $w11,$w29
-        frsqrt.d        $w29,$w2
-        frsqrt.w        $w9,$w8
-        fsqrt.d         $w3,$w1
-        fsqrt.w         $w5,$w15
-        ftint_s.d       $w31,$w26
-        ftint_s.w       $w27,$w14
-        ftint_u.d       $w5,$w31
-        ftint_u.w       $w12,$w29
-        ftrunc_s.d      $w4,$w22
-        ftrunc_s.w      $w24,$w7
-        ftrunc_u.d      $w20,$w25
-        ftrunc_u.w      $w7,$w26
         insv            $s2,$at
         iret
         lbe             $14,122($9)
@@ -184,7 +145,6 @@
         mflo            $9,$ac2
         modsub          $a3,$12,$a3
         mov.ps          $f22,$f17
-        move.v          $w8,$w17
         movf.ps         $f10,$f28,$fcc6
         movn.ps         $f31,$f31,$s3
         movt.ps         $f20,$f25,$fcc2
@@ -210,23 +170,9 @@
         mulsa.w.ph      $ac1,$s4,$s6
         mulsaq_s.w.ph   $ac0,$ra,$s2
         neg.ps          $f19,$f13
-        nloc.b          $w12,$w30
-        nloc.d          $w16,$w7
-        nloc.h          $w21,$w17
-        nloc.w          $w17,$w16
-        nlzc.b          $w12,$w7
-        nlzc.d          $w14,$w14
-        nlzc.h          $w24,$w24
-        nlzc.w          $w10,$w4
         nmadd.ps        $f27,$f4,$f9,$f25
         nmsub.ps        $f6,$f12,$f14,$f17
-        nor.v           $w20,$w20,$w15
-        or.v            $w13,$w23,$w12
         packrl.ph       $ra,$24,$14
-        pcnt.b          $w30,$w15
-        pcnt.d          $w5,$w16
-        pcnt.h          $w20,$w24
-        pcnt.w          $w22,$w20
         pick.ph         $ra,$a2,$gp
         pick.qb         $11,$a0,$gp
         pll.ps          $f25,$f9,$f30
@@ -304,5 +250,4 @@
         trunc.l.d       $f23,$f23
         trunc.l.s       $f28,$f31
         wrpgpr          $zero,$13
-        xor.v           $w20,$w21,$w30
         yield           $v1,$s0
diff --git a/test/MC/Mips/mips32r3/valid-xfail.s b/test/MC/Mips/mips32r3/valid-xfail.s
index b0fc3a1d23f7..09e19e8bb3b6 100644
--- a/test/MC/Mips/mips32r3/valid-xfail.s
+++ b/test/MC/Mips/mips32r3/valid-xfail.s
@@ -28,11 +28,7 @@
         adduh_r.qb      $a0,$9,$12
         addwc           $k0,$s6,$s7
         alnv.ps         $f12,$f18,$f30,$12
-        and.v           $w10,$w25,$w29
         bitrev          $14,$at
-        bmnz.v          $w15,$w2,$w28
-        bmz.v           $w13,$w11,$w21
-        bsel.v          $w28,$w7,$w0
         c.eq.d          $fcc1,$f15,$f15
         c.eq.ps         $fcc5,$f0,$f9
         c.eq.s          $fcc5,$f24,$f17
@@ -124,44 +120,9 @@
         extrv_r.w       $8,$ac1,$s6
         extrv_rs.w      $gp,$ac1,$s6
         extrv_s.h       $s2,$ac1,$14
-        fclass.d        $w14,$w27
-        fclass.w        $w19,$w28
-        fexupl.d        $w10,$w29
-        fexupl.w        $w12,$w27
-        fexupr.d        $w31,$w15
-        fexupr.w        $w29,$w12
-        ffint_s.d       $w1,$w30
-        ffint_s.w       $w16,$w14
-        ffint_u.d       $w23,$w18
-        ffint_u.w       $w19,$w12
-        ffql.d          $w2,$w3
-        ffql.w          $w9,$w0
-        ffqr.d          $w25,$w24
-        ffqr.w          $w10,$w6
-        fill.b          $w9,$v1
-        fill.h          $w9,$8
-        fill.w          $w31,$15
-        flog2.d         $w12,$w16
-        flog2.w         $w19,$w23
         floor.l.d       $f26,$f7
         floor.l.s       $f12,$f5
         fork            $s2,$8,$a0
-        frcp.d          $w12,$w4
-        frcp.w          $w30,$w8
-        frint.d         $w20,$w8
-        frint.w         $w11,$w29
-        frsqrt.d        $w29,$w2
-        frsqrt.w        $w9,$w8
-        fsqrt.d         $w3,$w1
-        fsqrt.w         $w5,$w15
-        ftint_s.d       $w31,$w26
-        ftint_s.w       $w27,$w14
-        ftint_u.d       $w5,$w31
-        ftint_u.w       $w12,$w29
-        ftrunc_s.d      $w4,$w22
-        ftrunc_s.w      $w24,$w7
-        ftrunc_u.d      $w20,$w25
-        ftrunc_u.w      $w7,$w26
         insv            $s2,$at
         iret
         lbe             $14,122($9)
@@ -184,7 +145,6 @@
         mflo            $9,$ac2
         modsub          $a3,$12,$a3
         mov.ps          $f22,$f17
-        move.v          $w8,$w17
         movf.ps         $f10,$f28,$fcc6
         movn.ps         $f31,$f31,$s3
         movt.ps         $f20,$f25,$fcc2
@@ -210,23 +170,9 @@
         mulsa.w.ph      $ac1,$s4,$s6
         mulsaq_s.w.ph   $ac0,$ra,$s2
         neg.ps          $f19,$f13
-        nloc.b          $w12,$w30
-        nloc.d          $w16,$w7
-        nloc.h          $w21,$w17
-        nloc.w          $w17,$w16
-        nlzc.b          $w12,$w7
-        nlzc.d          $w14,$w14
-        nlzc.h          $w24,$w24
-        nlzc.w          $w10,$w4
         nmadd.ps        $f27,$f4,$f9,$f25
         nmsub.ps        $f6,$f12,$f14,$f17
-        nor.v           $w20,$w20,$w15
-        or.v            $w13,$w23,$w12
         packrl.ph       $ra,$24,$14
-        pcnt.b          $w30,$w15
-        pcnt.d          $w5,$w16
-        pcnt.h          $w20,$w24
-        pcnt.w          $w22,$w20
         pick.ph         $ra,$a2,$gp
         pick.qb         $11,$a0,$gp
         pll.ps          $f25,$f9,$f30
@@ -304,5 +250,4 @@
         trunc.l.d       $f23,$f23
         trunc.l.s       $f28,$f31
         wrpgpr          $zero,$13
-        xor.v           $w20,$w21,$w30
         yield           $v1,$s0
diff --git a/test/MC/Mips/mips32r5/valid-xfail.s b/test/MC/Mips/mips32r5/valid-xfail.s
index a821dddb85ca..30fc4b98e056 100644
--- a/test/MC/Mips/mips32r5/valid-xfail.s
+++ b/test/MC/Mips/mips32r5/valid-xfail.s
@@ -28,11 +28,7 @@
         adduh_r.qb      $a0,$9,$12
         addwc           $k0,$s6,$s7
         alnv.ps         $f12,$f18,$f30,$12
-        and.v           $w10,$w25,$w29
         bitrev          $14,$at
-        bmnz.v          $w15,$w2,$w28
-        bmz.v           $w13,$w11,$w21
-        bsel.v          $w28,$w7,$w0
         c.eq.d          $fcc1,$f15,$f15
         c.eq.ps         $fcc5,$f0,$f9
         c.eq.s          $fcc5,$f24,$f17
@@ -124,44 +120,9 @@
         extrv_r.w       $8,$ac1,$s6
         extrv_rs.w      $gp,$ac1,$s6
         extrv_s.h       $s2,$ac1,$14
-        fclass.d        $w14,$w27
-        fclass.w        $w19,$w28
-        fexupl.d        $w10,$w29
-        fexupl.w        $w12,$w27
-        fexupr.d        $w31,$w15
-        fexupr.w        $w29,$w12
-        ffint_s.d       $w1,$w30
-        ffint_s.w       $w16,$w14
-        ffint_u.d       $w23,$w18
-        ffint_u.w       $w19,$w12
-        ffql.d          $w2,$w3
-        ffql.w          $w9,$w0
-        ffqr.d          $w25,$w24
-        ffqr.w          $w10,$w6
-        fill.b          $w9,$v1
-        fill.h          $w9,$8
-        fill.w          $w31,$15
-        flog2.d         $w12,$w16
-        flog2.w         $w19,$w23
         floor.l.d       $f26,$f7
         floor.l.s       $f12,$f5
         fork            $s2,$8,$a0
-        frcp.d          $w12,$w4
-        frcp.w          $w30,$w8
-        frint.d         $w20,$w8
-        frint.w         $w11,$w29
-        frsqrt.d        $w29,$w2
-        frsqrt.w        $w9,$w8
-        fsqrt.d         $w3,$w1
-        fsqrt.w         $w5,$w15
-        ftint_s.d       $w31,$w26
-        ftint_s.w       $w27,$w14
-        ftint_u.d       $w5,$w31
-        ftint_u.w       $w12,$w29
-        ftrunc_s.d      $w4,$w22
-        ftrunc_s.w      $w24,$w7
-        ftrunc_u.d      $w20,$w25
-        ftrunc_u.w      $w7,$w26
         insv            $s2,$at
         iret
         lbe             $14,122($9)
@@ -184,7 +145,6 @@
         mflo            $9,$ac2
         modsub          $a3,$12,$a3
         mov.ps          $f22,$f17
-        move.v          $w8,$w17
         movf.ps         $f10,$f28,$fcc6
         movn.ps         $f31,$f31,$s3
         movt.ps         $f20,$f25,$fcc2
@@ -210,23 +170,9 @@
         mulsa.w.ph      $ac1,$s4,$s6
         mulsaq_s.w.ph   $ac0,$ra,$s2
         neg.ps          $f19,$f13
-        nloc.b          $w12,$w30
-        nloc.d          $w16,$w7
-        nloc.h          $w21,$w17
-        nloc.w          $w17,$w16
-        nlzc.b          $w12,$w7
-        nlzc.d          $w14,$w14
-        nlzc.h          $w24,$w24
-        nlzc.w          $w10,$w4
         nmadd.ps        $f27,$f4,$f9,$f25
         nmsub.ps        $f6,$f12,$f14,$f17
-        nor.v           $w20,$w20,$w15
-        or.v            $w13,$w23,$w12
         packrl.ph       $ra,$24,$14
-        pcnt.b          $w30,$w15
-        pcnt.d          $w5,$w16
-        pcnt.h          $w20,$w24
-        pcnt.w          $w22,$w20
         pick.ph         $ra,$a2,$gp
         pick.qb         $11,$a0,$gp
         pll.ps          $f25,$f9,$f30
@@ -304,5 +250,4 @@
         trunc.l.d       $f23,$f23
         trunc.l.s       $f28,$f31
         wrpgpr          $zero,$13
-        xor.v           $w20,$w21,$w30
         yield           $v1,$s0
diff --git a/test/MC/Mips/mips64r2/valid-xfail.s b/test/MC/Mips/mips64r2/valid-xfail.s
index 148758cd3263..5faa29d6468e 100644
--- a/test/MC/Mips/mips64r2/valid-xfail.s
+++ b/test/MC/Mips/mips64r2/valid-xfail.s
@@ -31,11 +31,7 @@
         alnv.ob         $v31,$v23,$v30,$at
         alnv.ob         $v8,$v17,$v30,$a1
         alnv.ps         $f12,$f18,$f30,$12
-        and.v           $w10,$w25,$w29
         bitrev          $14,$at
-        bmnz.v          $w15,$w2,$w28
-        bmz.v           $w13,$w11,$w21
-        bsel.v          $w28,$w7,$w0
         c.eq.d          $fcc1,$f15,$f15
         c.eq.ps         $fcc5,$f0,$f9
         c.eq.s          $fcc5,$f24,$f17
@@ -126,43 +122,7 @@
         extrv_r.w       $8,$ac1,$s6
         extrv_rs.w      $gp,$ac1,$s6
         extrv_s.h       $s2,$ac1,$14
-        fclass.d        $w14,$w27
-        fclass.w        $w19,$w28
-        fexupl.d        $w10,$w29
-        fexupl.w        $w12,$w27
-        fexupr.d        $w31,$w15
-        fexupr.w        $w29,$w12
-        ffint_s.d       $w1,$w30
-        ffint_s.w       $w16,$w14
-        ffint_u.d       $w23,$w18
-        ffint_u.w       $w19,$w12
-        ffql.d          $w2,$w3
-        ffql.w          $w9,$w0
-        ffqr.d          $w25,$w24
-        ffqr.w          $w10,$w6
-        fill.b          $w9,$v1
-        fill.d          $w28,$8
-        fill.h          $w9,$8
-        fill.w          $w31,$15
-        flog2.d         $w12,$w16
-        flog2.w         $w19,$w23
         fork            $s2,$8,$a0
-        frcp.d          $w12,$w4
-        frcp.w          $w30,$w8
-        frint.d         $w20,$w8
-        frint.w         $w11,$w29
-        frsqrt.d        $w29,$w2
-        frsqrt.w        $w9,$w8
-        fsqrt.d         $w3,$w1
-        fsqrt.w         $w5,$w15
-        ftint_s.d       $w31,$w26
-        ftint_s.w       $w27,$w14
-        ftint_u.d       $w5,$w31
-        ftint_u.w       $w12,$w29
-        ftrunc_s.d      $w4,$w22
-        ftrunc_s.w      $w24,$w7
-        ftrunc_u.d      $w20,$w25
-        ftrunc_u.w      $w7,$w26
         insv            $s2,$at
         iret
         lbe $14,122($9)
@@ -212,23 +172,9 @@
         mulsa.w.ph      $ac1,$s4,$s6
         mulsaq_s.w.ph   $ac0,$ra,$s2
         neg.ps          $f19,$f13
-        nloc.b          $w12,$w30
-        nloc.d          $w16,$w7
-        nloc.h          $w21,$w17
-        nloc.w          $w17,$w16
-        nlzc.b          $w12,$w7
-        nlzc.d          $w14,$w14
-        nlzc.h          $w24,$w24
-        nlzc.w          $w10,$w4
         nmadd.ps        $f27,$f4,$f9,$f25
         nmsub.ps        $f6,$f12,$f14,$f17
-        nor.v           $w20,$w20,$w15
-        or.v            $w13,$w23,$w12
         packrl.ph       $ra,$24,$14
-        pcnt.b          $w30,$w15
-        pcnt.d          $w5,$w16
-        pcnt.h          $w20,$w24
-        pcnt.w          $w22,$w20
         pick.ph         $ra,$a2,$gp
         pick.qb         $11,$a0,$gp
         pll.ps          $f25,$f9,$f30
@@ -302,5 +248,4 @@
         tlbinv
         tlbinvf
         wrpgpr          $zero,$13
-        xor.v           $w20,$w21,$w30
         yield           $v1,$s0
diff --git a/test/MC/Mips/mips64r3/valid-xfail.s b/test/MC/Mips/mips64r3/valid-xfail.s
index f2949c4f2dda..dcf66bf97d68 100644
--- a/test/MC/Mips/mips64r3/valid-xfail.s
+++ b/test/MC/Mips/mips64r3/valid-xfail.s
@@ -31,11 +31,7 @@
         alnv.ob         $v31,$v23,$v30,$at
         alnv.ob         $v8,$v17,$v30,$a1
         alnv.ps         $f12,$f18,$f30,$12
-        and.v           $w10,$w25,$w29
         bitrev          $14,$at
-        bmnz.v          $w15,$w2,$w28
-        bmz.v           $w13,$w11,$w21
-        bsel.v          $w28,$w7,$w0
         c.eq.d          $fcc1,$f15,$f15
         c.eq.ps         $fcc5,$f0,$f9
         c.eq.s          $fcc5,$f24,$f17
@@ -126,43 +122,7 @@
         extrv_r.w       $8,$ac1,$s6
         extrv_rs.w      $gp,$ac1,$s6
         extrv_s.h       $s2,$ac1,$14
-        fclass.d        $w14,$w27
-        fclass.w        $w19,$w28
-        fexupl.d        $w10,$w29
-        fexupl.w        $w12,$w27
-        fexupr.d        $w31,$w15
-        fexupr.w        $w29,$w12
-        ffint_s.d       $w1,$w30
-        ffint_s.w       $w16,$w14
-        ffint_u.d       $w23,$w18
-        ffint_u.w       $w19,$w12
-        ffql.d          $w2,$w3
-        ffql.w          $w9,$w0
-        ffqr.d          $w25,$w24
-        ffqr.w          $w10,$w6
-        fill.b          $w9,$v1
-        fill.d          $w28,$8
-        fill.h          $w9,$8
-        fill.w          $w31,$15
-        flog2.d         $w12,$w16
-        flog2.w         $w19,$w23
         fork            $s2,$8,$a0
-        frcp.d          $w12,$w4
-        frcp.w          $w30,$w8
-        frint.d         $w20,$w8
-        frint.w         $w11,$w29
-        frsqrt.d        $w29,$w2
-        frsqrt.w        $w9,$w8
-        fsqrt.d         $w3,$w1
-        fsqrt.w         $w5,$w15
-        ftint_s.d       $w31,$w26
-        ftint_s.w       $w27,$w14
-        ftint_u.d       $w5,$w31
-        ftint_u.w       $w12,$w29
-        ftrunc_s.d      $w4,$w22
-        ftrunc_s.w      $w24,$w7
-        ftrunc_u.d      $w20,$w25
-        ftrunc_u.w      $w7,$w26
         insv            $s2,$at
         iret
         lbe $14,122($9)
@@ -212,23 +172,9 @@
         mulsa.w.ph      $ac1,$s4,$s6
         mulsaq_s.w.ph   $ac0,$ra,$s2
         neg.ps          $f19,$f13
-        nloc.b          $w12,$w30
-        nloc.d          $w16,$w7
-        nloc.h          $w21,$w17
-        nloc.w          $w17,$w16
-        nlzc.b          $w12,$w7
-        nlzc.d          $w14,$w14
-        nlzc.h          $w24,$w24
-        nlzc.w          $w10,$w4
         nmadd.ps        $f27,$f4,$f9,$f25
         nmsub.ps        $f6,$f12,$f14,$f17
-        nor.v           $w20,$w20,$w15
-        or.v            $w13,$w23,$w12
         packrl.ph       $ra,$24,$14
-        pcnt.b          $w30,$w15
-        pcnt.d          $w5,$w16
-        pcnt.h          $w20,$w24
-        pcnt.w          $w22,$w20
         pick.ph         $ra,$a2,$gp
         pick.qb         $11,$a0,$gp
         pll.ps          $f25,$f9,$f30
@@ -302,5 +248,4 @@
         tlbinv
         tlbinvf
         wrpgpr          $zero,$13
-        xor.v           $w20,$w21,$w30
         yield           $v1,$s0
diff --git a/test/MC/Mips/mips64r5/valid-xfail.s b/test/MC/Mips/mips64r5/valid-xfail.s
index 04221ddb8630..0f7788359cf2 100644
--- a/test/MC/Mips/mips64r5/valid-xfail.s
+++ b/test/MC/Mips/mips64r5/valid-xfail.s
@@ -31,11 +31,7 @@
         alnv.ob         $v31,$v23,$v30,$at
         alnv.ob         $v8,$v17,$v30,$a1
         alnv.ps         $f12,$f18,$f30,$12
-        and.v           $w10,$w25,$w29
         bitrev          $14,$at
-        bmnz.v          $w15,$w2,$w28
-        bmz.v           $w13,$w11,$w21
-        bsel.v          $w28,$w7,$w0
         c.eq.d          $fcc1,$f15,$f15
         c.eq.ps         $fcc5,$f0,$f9
         c.eq.s          $fcc5,$f24,$f17
@@ -126,43 +122,7 @@
         extrv_r.w       $8,$ac1,$s6
         extrv_rs.w      $gp,$ac1,$s6
         extrv_s.h       $s2,$ac1,$14
-        fclass.d        $w14,$w27
-        fclass.w        $w19,$w28
-        fexupl.d        $w10,$w29
-        fexupl.w        $w12,$w27
-        fexupr.d        $w31,$w15
-        fexupr.w        $w29,$w12
-        ffint_s.d       $w1,$w30
-        ffint_s.w       $w16,$w14
-        ffint_u.d       $w23,$w18
-        ffint_u.w       $w19,$w12
-        ffql.d          $w2,$w3
-        ffql.w          $w9,$w0
-        ffqr.d          $w25,$w24
-        ffqr.w          $w10,$w6
-        fill.b          $w9,$v1
-        fill.d          $w28,$8
-        fill.h          $w9,$8
-        fill.w          $w31,$15
-        flog2.d         $w12,$w16
-        flog2.w         $w19,$w23
         fork            $s2,$8,$a0
-        frcp.d          $w12,$w4
-        frcp.w          $w30,$w8
-        frint.d         $w20,$w8
-        frint.w         $w11,$w29
-        frsqrt.d        $w29,$w2
-        frsqrt.w        $w9,$w8
-        fsqrt.d         $w3,$w1
-        fsqrt.w         $w5,$w15
-        ftint_s.d       $w31,$w26
-        ftint_s.w       $w27,$w14
-        ftint_u.d       $w5,$w31
-        ftint_u.w       $w12,$w29
-        ftrunc_s.d      $w4,$w22
-        ftrunc_s.w      $w24,$w7
-        ftrunc_u.d      $w20,$w25
-        ftrunc_u.w      $w7,$w26
         insv            $s2,$at
         iret
         lbe $14,122($9)
@@ -212,23 +172,9 @@
         mulsa.w.ph      $ac1,$s4,$s6
         mulsaq_s.w.ph   $ac0,$ra,$s2
         neg.ps          $f19,$f13
-        nloc.b          $w12,$w30
-        nloc.d          $w16,$w7
-        nloc.h          $w21,$w17
-        nloc.w          $w17,$w16
-        nlzc.b          $w12,$w7
-        nlzc.d          $w14,$w14
-        nlzc.h          $w24,$w24
-        nlzc.w          $w10,$w4
         nmadd.ps        $f27,$f4,$f9,$f25
         nmsub.ps        $f6,$f12,$f14,$f17
-        nor.v           $w20,$w20,$w15
-        or.v            $w13,$w23,$w12
         packrl.ph       $ra,$24,$14
-        pcnt.b          $w30,$w15
-        pcnt.d          $w5,$w16
-        pcnt.h          $w20,$w24
-        pcnt.w          $w22,$w20
         pick.ph         $ra,$a2,$gp
         pick.qb         $11,$a0,$gp
         pll.ps          $f25,$f9,$f30
@@ -302,5 +248,4 @@
         tlbinv
         tlbinvf
         wrpgpr          $zero,$13
-        xor.v           $w20,$w21,$w30
         yield           $v1,$s0

From bffca30038b9b302ecaa601025e67f1b578d0103 Mon Sep 17 00:00:00 2001
From: Igor Breger <igor.breger@intel.com>
Date: Mon, 30 Nov 2015 10:40:52 +0000
Subject: [PATCH 003/186] AVX512: regenerate avx512bw intrincics tests results.

Differential Revision: http://reviews.llvm.org/D15069

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254295 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/avx512bw-intrinsics.ll | 1349 +++++++++++++++--------
 1 file changed, 879 insertions(+), 470 deletions(-)

diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index 7b9768c95944..6b032e0e6d78 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -1,15 +1,24 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw --show-mc-encoding| FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512BW
+
 
 define i64 @test_pcmpeq_b(<64 x i8> %a, <64 x i8> %b) {
-; CHECK-LABEL: test_pcmpeq_b
-; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 ##
+; AVX512BW-LABEL: test_pcmpeq_b:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovq %k0, %rax
+; AVX512BW-NEXT:    retq
   %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
   ret i64 %res
 }
 
 define i64 @test_mask_pcmpeq_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
-; CHECK-LABEL: test_mask_pcmpeq_b
-; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} ##
+; AVX512BW-LABEL: test_mask_pcmpeq_b:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rdi, %k1
+; AVX512BW-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovq %k0, %rax
+; AVX512BW-NEXT:    retq
   %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
   ret i64 %res
 }
@@ -17,15 +26,22 @@ define i64 @test_mask_pcmpeq_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
 declare i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8>, <64 x i8>, i64)
 
 define i32 @test_pcmpeq_w(<32 x i16> %a, <32 x i16> %b) {
-; CHECK-LABEL: test_pcmpeq_w
-; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 ##
+; AVX512BW-LABEL: test_pcmpeq_w:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    retq
   %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1)
   ret i32 %res
 }
 
 define i32 @test_mask_pcmpeq_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
-; CHECK-LABEL: test_mask_pcmpeq_w
-; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} ##
+; AVX512BW-LABEL: test_mask_pcmpeq_w:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    retq
   %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask)
   ret i32 %res
 }
@@ -33,15 +49,22 @@ define i32 @test_mask_pcmpeq_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
 declare i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16>, <32 x i16>, i32)
 
 define i64 @test_pcmpgt_b(<64 x i8> %a, <64 x i8> %b) {
-; CHECK-LABEL: test_pcmpgt_b
-; CHECK: vpcmpgtb %zmm1, %zmm0, %k0 ##
+; AVX512BW-LABEL: test_pcmpgt_b:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovq %k0, %rax
+; AVX512BW-NEXT:    retq
   %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
   ret i64 %res
 }
 
 define i64 @test_mask_pcmpgt_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
-; CHECK-LABEL: test_mask_pcmpgt_b
-; CHECK: vpcmpgtb %zmm1, %zmm0, %k0 {%k1} ##
+; AVX512BW-LABEL: test_mask_pcmpgt_b:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rdi, %k1
+; AVX512BW-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovq %k0, %rax
+; AVX512BW-NEXT:    retq
   %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
   ret i64 %res
 }
@@ -49,326 +72,491 @@ define i64 @test_mask_pcmpgt_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
 declare i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8>, <64 x i8>, i64)
 
 define i32 @test_pcmpgt_w(<32 x i16> %a, <32 x i16> %b) {
-; CHECK-LABEL: test_pcmpgt_w
-; CHECK: vpcmpgtw %zmm1, %zmm0, %k0 ##
+; AVX512BW-LABEL: test_pcmpgt_w:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    retq
   %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1)
   ret i32 %res
 }
 
 define i32 @test_mask_pcmpgt_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
-; CHECK-LABEL: test_mask_pcmpgt_w
-; CHECK: vpcmpgtw %zmm1, %zmm0, %k0 {%k1} ##
+; AVX512BW-LABEL: test_mask_pcmpgt_w:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    retq
   %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask)
   ret i32 %res
 }
 
 declare i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16>, <32 x i16>, i32)
 
-define <8 x i64> @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
-; CHECK_LABEL: test_cmp_b_512
-; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 ##
+define i64 @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
+; AVX512BW-LABEL: test_cmp_b_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovq %k0, %rax
+; AVX512BW-NEXT:    vpcmpltb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovq %k0, %rcx
+; AVX512BW-NEXT:    addq %rax, %rcx
+; AVX512BW-NEXT:    vpcmpleb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovq %k0, %rax
+; AVX512BW-NEXT:    addq %rcx, %rax
+; AVX512BW-NEXT:    vpcmpunordb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovq %k0, %rcx
+; AVX512BW-NEXT:    addq %rax, %rcx
+; AVX512BW-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovq %k0, %rax
+; AVX512BW-NEXT:    addq %rcx, %rax
+; AVX512BW-NEXT:    vpcmpnltb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovq %k0, %rcx
+; AVX512BW-NEXT:    addq %rax, %rcx
+; AVX512BW-NEXT:    vpcmpnleb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovq %k0, %rdx
+; AVX512BW-NEXT:    addq %rcx, %rdx
+; AVX512BW-NEXT:    vpcmpordb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovq %k0, %rax
+; AVX512BW-NEXT:    addq %rdx, %rax
+; AVX512BW-NEXT:    retq
   %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
-  %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0
-; CHECK: vpcmpltb %zmm1, %zmm0, %k0 ##
   %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1)
-  %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1
-; CHECK: vpcmpleb %zmm1, %zmm0, %k0 ##
+  %ret1 = add i64 %res0, %res1
   %res2 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 -1)
-  %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2
-; CHECK: vpcmpunordb %zmm1, %zmm0, %k0 ##
+  %ret2 = add i64 %ret1, %res2
   %res3 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 -1)
-  %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3
-; CHECK: vpcmpneqb %zmm1, %zmm0, %k0 ##
+  %ret3 = add i64 %ret2, %res3
   %res4 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 -1)
-  %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4
-; CHECK: vpcmpnltb %zmm1, %zmm0, %k0 ##
+  %ret4 = add i64 %ret3, %res4
   %res5 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 -1)
-  %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5
-; CHECK: vpcmpnleb %zmm1, %zmm0, %k0 ##
+  %ret5 = add i64 %ret4, %res5
   %res6 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 -1)
-  %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6
-; CHECK: vpcmpordb %zmm1, %zmm0, %k0 ##
+  %ret6 = add i64 %ret5, %res6
   %res7 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 -1)
-  %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7
-  ret <8 x i64> %vec7
-}
-
-define <8 x i64> @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
-; CHECK_LABEL: test_mask_cmp_b_512
-; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} ##
+  %ret7 = add i64 %ret6, %res7
+  ret i64 %ret7
+}
+
+define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
+; AVX512BW-LABEL: test_mask_cmp_b_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rdi, %k1
+; AVX512BW-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovq %k0, %rax
+; AVX512BW-NEXT:    vpcmpltb %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovq %k0, %rcx
+; AVX512BW-NEXT:    addq %rax, %rcx
+; AVX512BW-NEXT:    vpcmpleb %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovq %k0, %rax
+; AVX512BW-NEXT:    addq %rcx, %rax
+; AVX512BW-NEXT:    vpcmpunordb %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovq %k0, %rcx
+; AVX512BW-NEXT:    addq %rax, %rcx
+; AVX512BW-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovq %k0, %rax
+; AVX512BW-NEXT:    addq %rcx, %rax
+; AVX512BW-NEXT:    vpcmpnltb %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovq %k0, %rcx
+; AVX512BW-NEXT:    addq %rax, %rcx
+; AVX512BW-NEXT:    vpcmpnleb %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovq %k0, %rdx
+; AVX512BW-NEXT:    addq %rcx, %rdx
+; AVX512BW-NEXT:    vpcmpordb %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovq %k0, %rax
+; AVX512BW-NEXT:    addq %rdx, %rax
+; AVX512BW-NEXT:    retq
   %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
-  %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0
-; CHECK: vpcmpltb %zmm1, %zmm0, %k0 {%k1} ##
   %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask)
-  %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1
-; CHECK: vpcmpleb %zmm1, %zmm0, %k0 {%k1} ##
+  %ret1 = add i64 %res0, %res1
   %res2 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 %mask)
-  %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2
-; CHECK: vpcmpunordb %zmm1, %zmm0, %k0 {%k1} ##
+  %ret2 = add i64 %ret1, %res2
   %res3 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 %mask)
-  %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3
-; CHECK: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} ##
+  %ret3 = add i64 %ret2, %res3
   %res4 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 %mask)
-  %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4
-; CHECK: vpcmpnltb %zmm1, %zmm0, %k0 {%k1} ##
+  %ret4 = add i64 %ret3, %res4
   %res5 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 %mask)
-  %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5
-; CHECK: vpcmpnleb %zmm1, %zmm0, %k0 {%k1} ##
+  %ret5 = add i64 %ret4, %res5
   %res6 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 %mask)
-  %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6
-; CHECK: vpcmpordb %zmm1, %zmm0, %k0 {%k1} ##
+  %ret6 = add i64 %ret5, %res6
   %res7 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 %mask)
-  %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7
-  ret <8 x i64> %vec7
+  %ret7 = add i64 %ret6, %res7
+  ret i64 %ret7
 }
 
 declare i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8>, <64 x i8>, i32, i64) nounwind readnone
 
-define <8 x i64> @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
-; CHECK_LABEL: test_ucmp_b_512
-; CHECK: vpcmpequb %zmm1, %zmm0, %k0 ##
+define i64 @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
+; AVX512BW-LABEL: test_ucmp_b_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpcmpequb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovq %k0, %rax
+; AVX512BW-NEXT:    vpcmpltub %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovq %k0, %rcx
+; AVX512BW-NEXT:    addq %rax, %rcx
+; AVX512BW-NEXT:    vpcmpleub %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovq %k0, %rax
+; AVX512BW-NEXT:    addq %rcx, %rax
+; AVX512BW-NEXT:    vpcmpunordub %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovq %k0, %rcx
+; AVX512BW-NEXT:    addq %rax, %rcx
+; AVX512BW-NEXT:    vpcmpnequb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovq %k0, %rax
+; AVX512BW-NEXT:    addq %rcx, %rax
+; AVX512BW-NEXT:    vpcmpnltub %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovq %k0, %rcx
+; AVX512BW-NEXT:    addq %rax, %rcx
+; AVX512BW-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovq %k0, %rdx
+; AVX512BW-NEXT:    addq %rcx, %rdx
+; AVX512BW-NEXT:    vpcmpordub %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovq %k0, %rax
+; AVX512BW-NEXT:    addq %rdx, %rax
+; AVX512BW-NEXT:    retq
   %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
-  %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0
-; CHECK: vpcmpltub %zmm1, %zmm0, %k0 ##
   %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1)
-  %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1
-; CHECK: vpcmpleub %zmm1, %zmm0, %k0 ##
+  %ret1 = add i64 %res0, %res1
   %res2 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 -1)
-  %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2
-; CHECK: vpcmpunordub %zmm1, %zmm0, %k0 ##
+  %ret2 = add i64 %ret1, %res2
   %res3 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 -1)
-  %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3
-; CHECK: vpcmpnequb %zmm1, %zmm0, %k0 ##
+  %ret3 = add i64 %ret2, %res3
   %res4 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 -1)
-  %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4
-; CHECK: vpcmpnltub %zmm1, %zmm0, %k0 ##
+  %ret4 = add i64 %ret3, %res4
   %res5 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 -1)
-  %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5
-; CHECK: vpcmpnleub %zmm1, %zmm0, %k0 ##
+  %ret5 = add i64 %ret4, %res5
   %res6 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 -1)
-  %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6
-; CHECK: vpcmpordub %zmm1, %zmm0, %k0 ##
+  %ret6 = add i64 %ret5, %res6
   %res7 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 -1)
-  %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7
-  ret <8 x i64> %vec7
-}
-
-define <8 x i64> @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
-; CHECK_LABEL: test_mask_ucmp_b_512
-; CHECK: vpcmpequb %zmm1, %zmm0, %k0 {%k1} ##
+  %ret7 = add i64 %ret6, %res7
+  ret i64 %ret7
+}
+
+define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
+; AVX512BW-LABEL: test_mask_x86_avx512_ucmp_b_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rdi, %k1
+; AVX512BW-NEXT:    vpcmpequb %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovq %k0, %rax
+; AVX512BW-NEXT:    vpcmpltub %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovq %k0, %rcx
+; AVX512BW-NEXT:    addq %rax, %rcx
+; AVX512BW-NEXT:    vpcmpleub %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovq %k0, %rax
+; AVX512BW-NEXT:    addq %rcx, %rax
+; AVX512BW-NEXT:    vpcmpunordub %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovq %k0, %rcx
+; AVX512BW-NEXT:    addq %rax, %rcx
+; AVX512BW-NEXT:    vpcmpnequb %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovq %k0, %rax
+; AVX512BW-NEXT:    addq %rcx, %rax
+; AVX512BW-NEXT:    vpcmpnltub %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovq %k0, %rcx
+; AVX512BW-NEXT:    addq %rax, %rcx
+; AVX512BW-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovq %k0, %rdx
+; AVX512BW-NEXT:    addq %rcx, %rdx
+; AVX512BW-NEXT:    vpcmpordub %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovq %k0, %rax
+; AVX512BW-NEXT:    addq %rdx, %rax
+; AVX512BW-NEXT:    retq
   %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
-  %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0
-; CHECK: vpcmpltub %zmm1, %zmm0, %k0 {%k1} ##
   %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask)
-  %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1
-; CHECK: vpcmpleub %zmm1, %zmm0, %k0 {%k1} ##
+  %ret1 = add i64 %res0, %res1
   %res2 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 %mask)
-  %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2
-; CHECK: vpcmpunordub %zmm1, %zmm0, %k0 {%k1} ##
+  %ret2 = add i64 %ret1, %res2
   %res3 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 %mask)
-  %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3
-; CHECK: vpcmpnequb %zmm1, %zmm0, %k0 {%k1} ##
+  %ret3 = add i64 %ret2, %res3
   %res4 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 %mask)
-  %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4
-; CHECK: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} ##
+  %ret4 = add i64 %ret3, %res4
   %res5 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 %mask)
-  %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5
-; CHECK: vpcmpnleub %zmm1, %zmm0, %k0 {%k1} ##
+  %ret5 = add i64 %ret4, %res5
   %res6 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 %mask)
-  %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6
-; CHECK: vpcmpordub %zmm1, %zmm0, %k0 {%k1} ##
+  %ret6 = add i64 %ret5, %res6
   %res7 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 %mask)
-  %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7
-  ret <8 x i64> %vec7
+  %ret7 = add i64 %ret6, %res7
+  ret i64 %ret7
 }
 
 declare i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8>, <64 x i8>, i32, i64) nounwind readnone
 
-define <8 x i32> @test_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1) {
-; CHECK_LABEL: test_cmp_w_512
-; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 ##
+define i32 @test_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1) {
+; AVX512BW-LABEL: test_cmp_w_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    vpcmpltw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %ecx
+; AVX512BW-NEXT:    addl %eax, %ecx
+; AVX512BW-NEXT:    vpcmplew %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    addl %ecx, %eax
+; AVX512BW-NEXT:    vpcmpunordw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %ecx
+; AVX512BW-NEXT:    addl %eax, %ecx
+; AVX512BW-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    addl %ecx, %eax
+; AVX512BW-NEXT:    vpcmpnltw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %ecx
+; AVX512BW-NEXT:    addl %eax, %ecx
+; AVX512BW-NEXT:    vpcmpnlew %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %edx
+; AVX512BW-NEXT:    addl %ecx, %edx
+; AVX512BW-NEXT:    vpcmpordw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    addl %edx, %eax
+; AVX512BW-NEXT:    retq
   %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1)
-  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
-; CHECK: vpcmpltw %zmm1, %zmm0, %k0 ##
   %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1)
-  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
-; CHECK: vpcmplew %zmm1, %zmm0, %k0 ##
+  %ret1 = add i32 %res0, %res1
   %res2 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 -1)
-  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
-; CHECK: vpcmpunordw %zmm1, %zmm0, %k0 ##
+  %ret2 = add i32 %ret1, %res2
   %res3 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 -1)
-  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
-; CHECK: vpcmpneqw %zmm1, %zmm0, %k0 ##
+  %ret3 = add i32 %ret2, %res3
   %res4 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 -1)
-  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
-; CHECK: vpcmpnltw %zmm1, %zmm0, %k0 ##
+  %ret4 = add i32 %ret3, %res4
   %res5 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 -1)
-  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
-; CHECK: vpcmpnlew %zmm1, %zmm0, %k0 ##
+  %ret5 = add i32 %ret4, %res5
   %res6 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 -1)
-  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
-; CHECK: vpcmpordw %zmm1, %zmm0, %k0 ##
+  %ret6 = add i32 %ret5, %res6
   %res7 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 -1)
-  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
-  ret <8 x i32> %vec7
-}
-
-define <8 x i32> @test_mask_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
-; CHECK_LABEL: test_mask_cmp_w_512
-; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} ##
+  %ret7 = add i32 %ret6, %res7
+  ret i32 %ret7
+}
+
+define i32 @test_mask_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
+; AVX512BW-LABEL: test_mask_cmp_w_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    vpcmpltw %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovd %k0, %ecx
+; AVX512BW-NEXT:    addl %eax, %ecx
+; AVX512BW-NEXT:    vpcmplew %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    addl %ecx, %eax
+; AVX512BW-NEXT:    vpcmpunordw %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovd %k0, %ecx
+; AVX512BW-NEXT:    addl %eax, %ecx
+; AVX512BW-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    addl %ecx, %eax
+; AVX512BW-NEXT:    vpcmpnltw %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovd %k0, %ecx
+; AVX512BW-NEXT:    addl %eax, %ecx
+; AVX512BW-NEXT:    vpcmpnlew %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovd %k0, %edx
+; AVX512BW-NEXT:    addl %ecx, %edx
+; AVX512BW-NEXT:    vpcmpordw %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    addl %edx, %eax
+; AVX512BW-NEXT:    retq
   %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask)
-  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
-; CHECK: vpcmpltw %zmm1, %zmm0, %k0 {%k1} ##
   %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask)
-  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
-; CHECK: vpcmplew %zmm1, %zmm0, %k0 {%k1} ##
+  %ret1 = add i32 %res0, %res1
   %res2 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 %mask)
-  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
-; CHECK: vpcmpunordw %zmm1, %zmm0, %k0 {%k1} ##
+  %ret2 = add i32 %ret1, %res2
   %res3 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 %mask)
-  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
-; CHECK: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} ##
+  %ret3 = add i32 %ret2, %res3
   %res4 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 %mask)
-  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
-; CHECK: vpcmpnltw %zmm1, %zmm0, %k0 {%k1} ##
+  %ret4 = add i32 %ret3, %res4
   %res5 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 %mask)
-  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
-; CHECK: vpcmpnlew %zmm1, %zmm0, %k0 {%k1} ##
+  %ret5 = add i32 %ret4, %res5
   %res6 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 %mask)
-  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
-; CHECK: vpcmpordw %zmm1, %zmm0, %k0 {%k1} ##
+  %ret6 = add i32 %ret5, %res6
   %res7 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 %mask)
-  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
-  ret <8 x i32> %vec7
+  %ret7 = add i32 %ret6, %res7
+  ret i32 %ret7
 }
 
 declare i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16>, <32 x i16>, i32, i32) nounwind readnone
 
-define <8 x i32> @test_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1) {
-; CHECK_LABEL: test_ucmp_w_512
-; CHECK: vpcmpequw %zmm1, %zmm0, %k0 ##
+define i32 @test_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1) {
+; AVX512BW-LABEL: test_ucmp_w_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpcmpequw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    vpcmpltuw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %ecx
+; AVX512BW-NEXT:    addl %eax, %ecx
+; AVX512BW-NEXT:    vpcmpleuw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    addl %ecx, %eax
+; AVX512BW-NEXT:    vpcmpunorduw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %ecx
+; AVX512BW-NEXT:    addl %eax, %ecx
+; AVX512BW-NEXT:    vpcmpnequw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    addl %ecx, %eax
+; AVX512BW-NEXT:    vpcmpnltuw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %ecx
+; AVX512BW-NEXT:    addl %eax, %ecx
+; AVX512BW-NEXT:    vpcmpnleuw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %edx
+; AVX512BW-NEXT:    addl %ecx, %edx
+; AVX512BW-NEXT:    vpcmporduw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    addl %edx, %eax
+; AVX512BW-NEXT:    retq
   %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1)
-  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
-; CHECK: vpcmpltuw %zmm1, %zmm0, %k0 ##
   %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1)
-  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
-; CHECK: vpcmpleuw %zmm1, %zmm0, %k0 ##
+  %ret1 = add i32 %res0, %res1
   %res2 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 -1)
-  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
-; CHECK: vpcmpunorduw %zmm1, %zmm0, %k0 ##
+  %ret2 = add i32 %ret1, %res2
   %res3 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 -1)
-  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
-; CHECK: vpcmpnequw %zmm1, %zmm0, %k0 ##
+  %ret3 = add i32 %ret2, %res3
   %res4 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 -1)
-  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
-; CHECK: vpcmpnltuw %zmm1, %zmm0, %k0 ##
+  %ret4 = add i32 %ret3, %res4
   %res5 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 -1)
-  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
-; CHECK: vpcmpnleuw %zmm1, %zmm0, %k0 ##
+  %ret5 = add i32 %ret4, %res5
   %res6 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 -1)
-  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
-; CHECK: vpcmporduw %zmm1, %zmm0, %k0 ##
+  %ret6 = add i32 %ret5, %res6
   %res7 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 -1)
-  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
-  ret <8 x i32> %vec7
-}
-
-define <8 x i32> @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
-; CHECK_LABEL: test_mask_ucmp_w_512
-; CHECK: vpcmpequw %zmm1, %zmm0, %k0 {%k1} ##
+  %ret7 = add i32 %ret6, %res7
+  ret i32 %ret7
+}
+
+define i32 @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
+; AVX512BW-LABEL: test_mask_ucmp_w_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpcmpequw %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    vpcmpltuw %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovd %k0, %ecx
+; AVX512BW-NEXT:    addl %eax, %ecx
+; AVX512BW-NEXT:    vpcmpleuw %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    addl %ecx, %eax
+; AVX512BW-NEXT:    vpcmpunorduw %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovd %k0, %ecx
+; AVX512BW-NEXT:    addl %eax, %ecx
+; AVX512BW-NEXT:    vpcmpnequw %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    addl %ecx, %eax
+; AVX512BW-NEXT:    vpcmpnltuw %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovd %k0, %ecx
+; AVX512BW-NEXT:    addl %eax, %ecx
+; AVX512BW-NEXT:    vpcmpnleuw %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovd %k0, %edx
+; AVX512BW-NEXT:    addl %ecx, %edx
+; AVX512BW-NEXT:    vpcmporduw %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    addl %edx, %eax
+; AVX512BW-NEXT:    retq
   %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask)
-  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
-; CHECK: vpcmpltuw %zmm1, %zmm0, %k0 {%k1} ##
   %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask)
-  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
-; CHECK: vpcmpleuw %zmm1, %zmm0, %k0 {%k1} ##
+  %ret1 = add i32 %res0, %res1
   %res2 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 %mask)
-  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
-; CHECK: vpcmpunorduw %zmm1, %zmm0, %k0 {%k1} ##
+  %ret2 = add i32 %ret1, %res2
   %res3 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 %mask)
-  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
-; CHECK: vpcmpnequw %zmm1, %zmm0, %k0 {%k1} ##
+  %ret3 = add i32 %ret2, %res3
   %res4 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 %mask)
-  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
-; CHECK: vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} ##
+  %ret4 = add i32 %ret3, %res4
   %res5 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 %mask)
-  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
-; CHECK: vpcmpnleuw %zmm1, %zmm0, %k0 {%k1} ##
+  %ret5 = add i32 %ret4, %res5
   %res6 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 %mask)
-  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
-; CHECK: vpcmporduw %zmm1, %zmm0, %k0 {%k1} ##
+  %ret6 = add i32 %ret5, %res6
   %res7 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 %mask)
-  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
-  ret <8 x i32> %vec7
+  %ret7 = add i32 %ret6, %res7
+  ret i32 %ret7
 }
 
 declare i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16>, <32 x i16>, i32, i32) nounwind readnone
 
 declare <64 x i8> @llvm.x86.avx512.mask.blend.b.512(<64 x i8>, <64 x i8>, i64) nounwind readonly
 
-; CHECK-LABEL: test_x86_mask_blend_w_512
 define <32 x i16> @test_x86_mask_blend_w_512(i32 %mask, <32 x i16> %a1, <32 x i16> %a2) {
-  ; CHECK: vpblendmw
-  %res = call <32 x i16> @llvm.x86.avx512.mask.blend.w.512(<32 x i16> %a1, <32 x i16> %a2, i32 %mask) ; <<32 x i16>> [#uses=1]
+; AVX512BW-LABEL: test_x86_mask_blend_w_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpblendmw %zmm1, %zmm0, %zmm0 {%k1}
+; AVX512BW-NEXT:    retq
+    %res = call <32 x i16> @llvm.x86.avx512.mask.blend.w.512(<32 x i16> %a1, <32 x i16> %a2, i32 %mask) ; <<32 x i16>> [#uses=1]
   ret <32 x i16> %res
 }
 declare <32 x i16> @llvm.x86.avx512.mask.blend.w.512(<32 x i16>, <32 x i16>, i32) nounwind readonly
 
-; CHECK-LABEL: test_x86_mask_blend_b_512
-; CHECK: vpblendmb
 define <64 x i8> @test_x86_mask_blend_b_512(i64 %a0, <64 x i8> %a1, <64 x i8> %a2) {
+; AVX512BW-LABEL: test_x86_mask_blend_b_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rdi, %k1
+; AVX512BW-NEXT:    vpblendmb %zmm1, %zmm0, %zmm0 {%k1}
+; AVX512BW-NEXT:    retq
   %res = call <64 x i8> @llvm.x86.avx512.mask.blend.b.512(<64 x i8> %a1, <64 x i8> %a2, i64 %a0) ; <<64 x i8>> [#uses=1]
   ret <64 x i8> %res
 }
 
 define <32 x i16> @test_mask_packs_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
-  ;CHECK-LABEL: test_mask_packs_epi32_rr_512
-  ;CHECK: vpackssdw       %zmm1, %zmm0, %zmm0  ## encoding: [0x62,0xf1,0x7d,0x48,0x6b,0xc1]
+; AVX512BW-LABEL: test_mask_packs_epi32_rr_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_packs_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) {
-  ;CHECK-LABEL: test_mask_packs_epi32_rrk_512
-  ;CHECK: vpackssdw       %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x6b,0xd1]
+; AVX512BW-LABEL: test_mask_packs_epi32_rrk_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpackssdw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovaps %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_packs_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) {
-  ;CHECK-LABEL: test_mask_packs_epi32_rrkz_512
-  ;CHECK: vpackssdw       %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x6b,0xc1]
+; AVX512BW-LABEL: test_mask_packs_epi32_rrkz_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_packs_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) {
-  ;CHECK-LABEL: test_mask_packs_epi32_rm_512
-  ;CHECK: vpackssdw       (%rdi), %zmm0, %zmm0  ## encoding: [0x62,0xf1,0x7d,0x48,0x6b,0x07]
+; AVX512BW-LABEL: test_mask_packs_epi32_rm_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpackssdw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_packs_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
-  ;CHECK-LABEL: test_mask_packs_epi32_rmk_512
-  ;CHECK: vpackssdw       (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x6b,0x0f]
+; AVX512BW-LABEL: test_mask_packs_epi32_rmk_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpackssdw (%rdi), %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_packs_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i32 %mask) {
-  ;CHECK-LABEL: test_mask_packs_epi32_rmkz_512
-  ;CHECK: vpackssdw       (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x6b,0x07]
+; AVX512BW-LABEL: test_mask_packs_epi32_rmkz_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpackssdw (%rdi), %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_packs_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
-  ;CHECK-LABEL: test_mask_packs_epi32_rmb_512
-  ;CHECK: vpackssdw       (%rdi){1to16}, %zmm0, %zmm0  ## encoding: [0x62,0xf1,0x7d,0x58,0x6b,0x07]
+; AVX512BW-LABEL: test_mask_packs_epi32_rmb_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpackssdw (%rdi){1to16}, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -377,8 +565,12 @@ define <32 x i16> @test_mask_packs_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
 }
 
 define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) {
-  ;CHECK-LABEL: test_mask_packs_epi32_rmbk_512
-  ;CHECK: vpackssdw       (%rdi){1to16}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x59,0x6b,0x0f]
+; AVX512BW-LABEL: test_mask_packs_epi32_rmbk_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpackssdw (%rdi){1to16}, %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -387,8 +579,11 @@ define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <3
 }
 
 define <32 x i16> @test_mask_packs_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) {
-  ;CHECK-LABEL: test_mask_packs_epi32_rmbkz_512
-  ;CHECK: vpackssdw       (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xd9,0x6b,0x07]
+; AVX512BW-LABEL: test_mask_packs_epi32_rmbkz_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpackssdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -399,45 +594,63 @@ define <32 x i16> @test_mask_packs_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i
 declare <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32>, <16 x i32>, <32 x i16>, i32)
 
 define <64 x i8> @test_mask_packs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
-  ;CHECK-LABEL: test_mask_packs_epi16_rr_512
-  ;CHECK: vpacksswb       %zmm1, %zmm0, %zmm0  ## encoding: [0x62,0xf1,0xfd,0x48,0x63,0xc1]
+; AVX512BW-LABEL: test_mask_packs_epi16_rr_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 -1)
   ret <64 x i8> %res
 }
 
 define <64 x i8> @test_mask_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) {
-  ;CHECK-LABEL: test_mask_packs_epi16_rrk_512
-  ;CHECK: vpacksswb       %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x63,0xd1]
+; AVX512BW-LABEL: test_mask_packs_epi16_rrk_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rdi, %k1
+; AVX512BW-NEXT:    vpacksswb %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovaps %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask)
   ret <64 x i8> %res
 }
 
 define <64 x i8> @test_mask_packs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) {
-  ;CHECK-LABEL: test_mask_packs_epi16_rrkz_512
-  ;CHECK: vpacksswb       %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x63,0xc1]
+; AVX512BW-LABEL: test_mask_packs_epi16_rrkz_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rdi, %k1
+; AVX512BW-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
   %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 %mask)
   ret <64 x i8> %res
 }
 
 define <64 x i8> @test_mask_packs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
-  ;CHECK-LABEL: test_mask_packs_epi16_rm_512
-  ;CHECK: vpacksswb       (%rdi), %zmm0, %zmm0  ## encoding: [0x62,0xf1,0xfd,0x48,0x63,0x07]
+; AVX512BW-LABEL: test_mask_packs_epi16_rm_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpacksswb (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 -1)
   ret <64 x i8> %res
 }
 
 define <64 x i8> @test_mask_packs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <64 x i8> %passThru, i64 %mask) {
-  ;CHECK-LABEL: test_mask_packs_epi16_rmk_512
-  ;CHECK: vpacksswb       (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x63,0x0f]
+; AVX512BW-LABEL: test_mask_packs_epi16_rmk_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rsi, %k1
+; AVX512BW-NEXT:    vpacksswb (%rdi), %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask)
   ret <64 x i8> %res
 }
 
 define <64 x i8> @test_mask_packs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i64 %mask) {
-  ;CHECK-LABEL: test_mask_packs_epi16_rmkz_512
-  ;CHECK: vpacksswb       (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x63,0x07]
+; AVX512BW-LABEL: test_mask_packs_epi16_rmkz_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rsi, %k1
+; AVX512BW-NEXT:    vpacksswb (%rdi), %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 %mask)
   ret <64 x i8> %res
@@ -447,53 +660,73 @@ declare <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16>, <32 x i16>, <64
 
 
 define <32 x i16> @test_mask_packus_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
-  ;CHECK-LABEL: test_mask_packus_epi32_rr_512
-  ;CHECK: vpackusdw       %zmm1, %zmm0, %zmm0  
+; AVX512BW-LABEL: test_mask_packus_epi32_rr_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_packus_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) {
-  ;CHECK-LABEL: test_mask_packus_epi32_rrk_512
-  ;CHECK: vpackusdw       %zmm1, %zmm0, %zmm2 {%k1} 
+; AVX512BW-LABEL: test_mask_packus_epi32_rrk_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpackusdw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovaps %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_packus_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) {
-  ;CHECK-LABEL: test_mask_packus_epi32_rrkz_512
-  ;CHECK: vpackusdw       %zmm1, %zmm0, %zmm0 {%k1} {z} 
+; AVX512BW-LABEL: test_mask_packus_epi32_rrkz_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_packus_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) {
-  ;CHECK-LABEL: test_mask_packus_epi32_rm_512
-  ;CHECK: vpackusdw       (%rdi), %zmm0, %zmm0  
+; AVX512BW-LABEL: test_mask_packus_epi32_rm_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpackusdw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_packus_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
-  ;CHECK-LABEL: test_mask_packus_epi32_rmk_512
-  ;CHECK: vpackusdw       (%rdi), %zmm0, %zmm1 {%k1} 
+; AVX512BW-LABEL: test_mask_packus_epi32_rmk_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpackusdw (%rdi), %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_packus_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i32 %mask) {
-  ;CHECK-LABEL: test_mask_packus_epi32_rmkz_512
-  ;CHECK: vpackusdw       (%rdi), %zmm0, %zmm0 {%k1} {z} 
+; AVX512BW-LABEL: test_mask_packus_epi32_rmkz_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpackusdw (%rdi), %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_packus_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
-  ;CHECK-LABEL: test_mask_packus_epi32_rmb_512
-  ;CHECK: vpackusdw       (%rdi){1to16}, %zmm0, %zmm0  
+; AVX512BW-LABEL: test_mask_packus_epi32_rmb_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpackusdw (%rdi){1to16}, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -502,8 +735,12 @@ define <32 x i16> @test_mask_packus_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
 }
 
 define <32 x i16> @test_mask_packus_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) {
-  ;CHECK-LABEL: test_mask_packus_epi32_rmbk_512
-  ;CHECK: vpackusdw       (%rdi){1to16}, %zmm0, %zmm1 {%k1} 
+; AVX512BW-LABEL: test_mask_packus_epi32_rmbk_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpackusdw (%rdi){1to16}, %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -512,8 +749,11 @@ define <32 x i16> @test_mask_packus_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <
 }
 
 define <32 x i16> @test_mask_packus_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) {
-  ;CHECK-LABEL: test_mask_packus_epi32_rmbkz_512
-  ;CHECK: vpackusdw       (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} 
+; AVX512BW-LABEL: test_mask_packus_epi32_rmbkz_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpackusdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -524,45 +764,63 @@ define <32 x i16> @test_mask_packus_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b,
 declare <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32>, <16 x i32>, <32 x i16>, i32)
 
 define <64 x i8> @test_mask_packus_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
-  ;CHECK-LABEL: test_mask_packus_epi16_rr_512
-  ;CHECK: vpackuswb       %zmm1, %zmm0, %zmm0  
+; AVX512BW-LABEL: test_mask_packus_epi16_rr_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 -1)
   ret <64 x i8> %res
 }
 
 define <64 x i8> @test_mask_packus_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) {
-  ;CHECK-LABEL: test_mask_packus_epi16_rrk_512
-  ;CHECK: vpackuswb       %zmm1, %zmm0, %zmm2 {%k1} 
+; AVX512BW-LABEL: test_mask_packus_epi16_rrk_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rdi, %k1
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovaps %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask)
   ret <64 x i8> %res
 }
 
 define <64 x i8> @test_mask_packus_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) {
-  ;CHECK-LABEL: test_mask_packus_epi16_rrkz_512
-  ;CHECK: vpackuswb       %zmm1, %zmm0, %zmm0 {%k1} {z} 
+; AVX512BW-LABEL: test_mask_packus_epi16_rrkz_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rdi, %k1
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
   %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 %mask)
   ret <64 x i8> %res
 }
 
 define <64 x i8> @test_mask_packus_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
-  ;CHECK-LABEL: test_mask_packus_epi16_rm_512
-  ;CHECK: vpackuswb       (%rdi), %zmm0, %zmm0  
+; AVX512BW-LABEL: test_mask_packus_epi16_rm_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpackuswb (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 -1)
   ret <64 x i8> %res
 }
 
 define <64 x i8> @test_mask_packus_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <64 x i8> %passThru, i64 %mask) {
-  ;CHECK-LABEL: test_mask_packus_epi16_rmk_512
-  ;CHECK: vpackuswb       (%rdi), %zmm0, %zmm1 {%k1} 
+; AVX512BW-LABEL: test_mask_packus_epi16_rmk_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rsi, %k1
+; AVX512BW-NEXT:    vpackuswb (%rdi), %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask)
   ret <64 x i8> %res
 }
 
 define <64 x i8> @test_mask_packus_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i64 %mask) {
-  ;CHECK-LABEL: test_mask_packus_epi16_rmkz_512
-  ;CHECK: vpackuswb       (%rdi), %zmm0, %zmm0 {%k1} {z} 
+; AVX512BW-LABEL: test_mask_packus_epi16_rmkz_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rsi, %k1
+; AVX512BW-NEXT:    vpackuswb (%rdi), %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 %mask)
   ret <64 x i8> %res
@@ -571,45 +829,63 @@ define <64 x i8> @test_mask_packus_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %pt
 declare <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16>, <32 x i16>, <64 x i8>, i64)
 
 define <32 x i16> @test_mask_adds_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
-  ;CHECK-LABEL: test_mask_adds_epi16_rr_512
-  ;CHECK: vpaddsw %zmm1, %zmm0, %zmm0     
+; AVX512BW-LABEL: test_mask_adds_epi16_rr_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_adds_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
-  ;CHECK-LABEL: test_mask_adds_epi16_rrk_512
-  ;CHECK: vpaddsw %zmm1, %zmm0, %zmm2 {%k1} 
+; AVX512BW-LABEL: test_mask_adds_epi16_rrk_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpaddsw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovaps %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_adds_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
-  ;CHECK-LABEL: test_mask_adds_epi16_rrkz_512
-  ;CHECK: vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z} 
+; AVX512BW-LABEL: test_mask_adds_epi16_rrkz_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_adds_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
-  ;CHECK-LABEL: test_mask_adds_epi16_rm_512
-  ;CHECK: vpaddsw (%rdi), %zmm0, %zmm0    
+; AVX512BW-LABEL: test_mask_adds_epi16_rm_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpaddsw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_adds_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
-  ;CHECK-LABEL: test_mask_adds_epi16_rmk_512
-  ;CHECK: vpaddsw (%rdi), %zmm0, %zmm1 {%k1} 
+; AVX512BW-LABEL: test_mask_adds_epi16_rmk_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpaddsw (%rdi), %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_adds_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
-  ;CHECK-LABEL: test_mask_adds_epi16_rmkz_512
-  ;CHECK: vpaddsw (%rdi), %zmm0, %zmm0 {%k1} {z} 
+; AVX512BW-LABEL: test_mask_adds_epi16_rmkz_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpaddsw (%rdi), %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
   ret <32 x i16> %res
@@ -618,45 +894,63 @@ define <32 x i16> @test_mask_adds_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr
 declare <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
 define <32 x i16> @test_mask_subs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
-  ;CHECK-LABEL: test_mask_subs_epi16_rr_512
-  ;CHECK: vpsubsw %zmm1, %zmm0, %zmm0     
+; AVX512BW-LABEL: test_mask_subs_epi16_rr_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_subs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
-  ;CHECK-LABEL: test_mask_subs_epi16_rrk_512
-  ;CHECK: vpsubsw %zmm1, %zmm0, %zmm2 {%k1} 
+; AVX512BW-LABEL: test_mask_subs_epi16_rrk_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpsubsw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovaps %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_subs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
-  ;CHECK-LABEL: test_mask_subs_epi16_rrkz_512
-  ;CHECK: vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z} 
+; AVX512BW-LABEL: test_mask_subs_epi16_rrkz_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_subs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
-  ;CHECK-LABEL: test_mask_subs_epi16_rm_512
-  ;CHECK: vpsubsw (%rdi), %zmm0, %zmm0    
+; AVX512BW-LABEL: test_mask_subs_epi16_rm_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpsubsw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_subs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
-  ;CHECK-LABEL: test_mask_subs_epi16_rmk_512
-  ;CHECK: vpsubsw (%rdi), %zmm0, %zmm1 {%k1} 
+; AVX512BW-LABEL: test_mask_subs_epi16_rmk_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpsubsw (%rdi), %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_subs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
-  ;CHECK-LABEL: test_mask_subs_epi16_rmkz_512
-  ;CHECK: vpsubsw (%rdi), %zmm0, %zmm0 {%k1} {z} 
+; AVX512BW-LABEL: test_mask_subs_epi16_rmkz_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpsubsw (%rdi), %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
   ret <32 x i16> %res
@@ -665,45 +959,63 @@ define <32 x i16> @test_mask_subs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr
 declare <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
 define <32 x i16> @test_mask_adds_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) {
-  ;CHECK-LABEL: test_mask_adds_epu16_rr_512
-  ;CHECK: vpaddusw %zmm1, %zmm0, %zmm0     
+; AVX512BW-LABEL: test_mask_adds_epu16_rr_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpaddusw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_adds_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
-  ;CHECK-LABEL: test_mask_adds_epu16_rrk_512
-  ;CHECK: vpaddusw %zmm1, %zmm0, %zmm2 {%k1} 
+; AVX512BW-LABEL: test_mask_adds_epu16_rrk_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpaddusw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovaps %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_adds_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
-  ;CHECK-LABEL: test_mask_adds_epu16_rrkz_512
-  ;CHECK: vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z} 
+; AVX512BW-LABEL: test_mask_adds_epu16_rrkz_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_adds_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
-  ;CHECK-LABEL: test_mask_adds_epu16_rm_512
-  ;CHECK: vpaddusw (%rdi), %zmm0, %zmm0    
+; AVX512BW-LABEL: test_mask_adds_epu16_rm_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpaddusw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_adds_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
-  ;CHECK-LABEL: test_mask_adds_epu16_rmk_512
-  ;CHECK: vpaddusw (%rdi), %zmm0, %zmm1 {%k1} 
+; AVX512BW-LABEL: test_mask_adds_epu16_rmk_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpaddusw (%rdi), %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_adds_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
-  ;CHECK-LABEL: test_mask_adds_epu16_rmkz_512
-  ;CHECK: vpaddusw (%rdi), %zmm0, %zmm0 {%k1} {z} 
+; AVX512BW-LABEL: test_mask_adds_epu16_rmkz_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpaddusw (%rdi), %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
   ret <32 x i16> %res
@@ -712,45 +1024,63 @@ define <32 x i16> @test_mask_adds_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr
 declare <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
 define <32 x i16> @test_mask_subs_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) {
-  ;CHECK-LABEL: test_mask_subs_epu16_rr_512
-  ;CHECK: vpsubusw %zmm1, %zmm0, %zmm0     
+; AVX512BW-LABEL: test_mask_subs_epu16_rr_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_subs_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
-  ;CHECK-LABEL: test_mask_subs_epu16_rrk_512
-  ;CHECK: vpsubusw %zmm1, %zmm0, %zmm2 {%k1} 
+; AVX512BW-LABEL: test_mask_subs_epu16_rrk_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpsubusw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovaps %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_subs_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
-  ;CHECK-LABEL: test_mask_subs_epu16_rrkz_512
-  ;CHECK: vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z} 
+; AVX512BW-LABEL: test_mask_subs_epu16_rrkz_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_subs_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
-  ;CHECK-LABEL: test_mask_subs_epu16_rm_512
-  ;CHECK: vpsubusw (%rdi), %zmm0, %zmm0    
+; AVX512BW-LABEL: test_mask_subs_epu16_rm_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpsubusw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_subs_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
-  ;CHECK-LABEL: test_mask_subs_epu16_rmk_512
-  ;CHECK: vpsubusw (%rdi), %zmm0, %zmm1 {%k1} 
+; AVX512BW-LABEL: test_mask_subs_epu16_rmk_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpsubusw (%rdi), %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_subs_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
-  ;CHECK-LABEL: test_mask_subs_epu16_rmkz_512
-  ;CHECK: vpsubusw (%rdi), %zmm0, %zmm0 {%k1} {z} 
+; AVX512BW-LABEL: test_mask_subs_epu16_rmkz_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpsubusw (%rdi), %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
   ret <32 x i16> %res
@@ -760,11 +1090,14 @@ declare <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16>, <32 x i16>, <3
 
 declare <64 x i8> @llvm.x86.avx512.mask.pmaxs.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
 
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_b_512
-; CHECK-NOT: call 
-; CHECK: vpmaxsb %zmm
-; CHECK: {%k1} 
 define <64 x i8>@test_int_x86_avx512_mask_pmaxs_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaxs_b_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rdi, %k1
+; AVX512BW-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <64 x i8> @llvm.x86.avx512.mask.pmaxs.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
   %res1 = call <64 x i8> @llvm.x86.avx512.mask.pmaxs.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
   %res2 = add <64 x i8> %res, %res1
@@ -773,11 +1106,14 @@ define <64 x i8>@test_int_x86_avx512_mask_pmaxs_b_512(<64 x i8> %x0, <64 x i8> %
 
 declare <32 x i16> @llvm.x86.avx512.mask.pmaxs.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_w_512
-; CHECK-NOT: call 
-; CHECK: vpmaxsw %zmm
-; CHECK: {%k1} 
 define <32 x i16>@test_int_x86_avx512_mask_pmaxs_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaxs_w_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.pmaxs.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmaxs.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   %res2 = add <32 x i16> %res, %res1
@@ -786,11 +1122,14 @@ define <32 x i16>@test_int_x86_avx512_mask_pmaxs_w_512(<32 x i16> %x0, <32 x i16
 
 declare <64 x i8> @llvm.x86.avx512.mask.pmaxu.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
 
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_b_512
-; CHECK-NOT: call 
-; CHECK: vpmaxub %zmm
-; CHECK: {%k1} 
 define <64 x i8>@test_int_x86_avx512_mask_pmaxu_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaxu_b_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rdi, %k1
+; AVX512BW-NEXT:    vpmaxub %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <64 x i8> @llvm.x86.avx512.mask.pmaxu.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
   %res1 = call <64 x i8> @llvm.x86.avx512.mask.pmaxu.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
   %res2 = add <64 x i8> %res, %res1
@@ -799,11 +1138,14 @@ define <64 x i8>@test_int_x86_avx512_mask_pmaxu_b_512(<64 x i8> %x0, <64 x i8> %
 
 declare <32 x i16> @llvm.x86.avx512.mask.pmaxu.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_w_512
-; CHECK-NOT: call 
-; CHECK: vpmaxuw %zmm
-; CHECK: {%k1} 
 define <32 x i16>@test_int_x86_avx512_mask_pmaxu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaxu_w_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.pmaxu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmaxu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   %res2 = add <32 x i16> %res, %res1
@@ -812,11 +1154,14 @@ define <32 x i16>@test_int_x86_avx512_mask_pmaxu_w_512(<32 x i16> %x0, <32 x i16
 
 declare <64 x i8> @llvm.x86.avx512.mask.pmins.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
 
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_b_512
-; CHECK-NOT: call 
-; CHECK: vpminsb %zmm
-; CHECK: {%k1} 
 define <64 x i8>@test_int_x86_avx512_mask_pmins_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pmins_b_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rdi, %k1
+; AVX512BW-NEXT:    vpminsb %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpminsb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <64 x i8> @llvm.x86.avx512.mask.pmins.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
   %res1 = call <64 x i8> @llvm.x86.avx512.mask.pmins.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
   %res2 = add <64 x i8> %res, %res1
@@ -825,11 +1170,14 @@ define <64 x i8>@test_int_x86_avx512_mask_pmins_b_512(<64 x i8> %x0, <64 x i8> %
 
 declare <32 x i16> @llvm.x86.avx512.mask.pmins.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_w_512
-; CHECK-NOT: call 
-; CHECK: vpminsw %zmm
-; CHECK: {%k1} 
 define <32 x i16>@test_int_x86_avx512_mask_pmins_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pmins_w_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpminsw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpminsw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.pmins.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmins.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   %res2 = add <32 x i16> %res, %res1
@@ -838,11 +1186,14 @@ define <32 x i16>@test_int_x86_avx512_mask_pmins_w_512(<32 x i16> %x0, <32 x i16
 
 declare <64 x i8> @llvm.x86.avx512.mask.pminu.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
 
-; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_b_512
-; CHECK-NOT: call 
-; CHECK: vpminub %zmm
-; CHECK: {%k1} 
 define <64 x i8>@test_int_x86_avx512_mask_pminu_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pminu_b_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rdi, %k1
+; AVX512BW-NEXT:    vpminub %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpminub %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <64 x i8> @llvm.x86.avx512.mask.pminu.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
   %res1 = call <64 x i8> @llvm.x86.avx512.mask.pminu.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
   %res2 = add <64 x i8> %res, %res1
@@ -851,11 +1202,14 @@ define <64 x i8>@test_int_x86_avx512_mask_pminu_b_512(<64 x i8> %x0, <64 x i8> %
 
 declare <32 x i16> @llvm.x86.avx512.mask.pminu.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
-; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_w_512
-; CHECK-NOT: call 
-; CHECK: vpminuw %zmm
-; CHECK: {%k1} 
 define <32 x i16>@test_int_x86_avx512_mask_pminu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pminu_w_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpminuw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpminuw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.pminu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pminu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   %res2 = add <32 x i16> %res, %res1
@@ -864,11 +1218,15 @@ define <32 x i16>@test_int_x86_avx512_mask_pminu_w_512(<32 x i16> %x0, <32 x i16
 
 declare <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_hi_512
-; CHECK-NOT: call 
-; CHECK: kmov 
-; CHECK: vpermt2w %zmm{{.*}}{%k1} 
 define <32 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vmovaps %zmm1, %zmm3
+; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm3 {%k1}
+; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpaddw %zmm1, %zmm3, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   %res2 = add <32 x i16> %res, %res1
@@ -877,11 +1235,15 @@ define <32 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32
 
 declare <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
-; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_hi_512
-; CHECK-NOT: call 
-; CHECK: kmov 
-; CHECK: vpermt2w %zmm{{.*}}{%k1} {z}
 define <32 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vmovaps %zmm1, %zmm3
+; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm3 {%k1} {z}
+; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpaddw %zmm1, %zmm3, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   %res2 = add <32 x i16> %res, %res1
@@ -890,11 +1252,15 @@ define <32 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_512(<32 x i16> %x0, <3
 
 declare <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_hi_512
-; CHECK-NOT: call 
-; CHECK: kmov 
-; CHECK: vpermi2w %zmm{{.*}}{%k1} 
 define <32 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vmovaps %zmm1, %zmm3
+; AVX512BW-NEXT:    vpermi2w %zmm2, %zmm0, %zmm3 {%k1}
+; AVX512BW-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpaddw %zmm1, %zmm3, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   %res2 = add <32 x i16> %res, %res1
@@ -903,11 +1269,14 @@ define <32 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <32
 
 declare <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
 
-; CHECK-LABEL: @test_int_x86_avx512_mask_pavg_b_512
-; CHECK-NOT: call 
-; CHECK: vpavgb %zmm
-; CHECK: {%k1} 
 define <64 x i8>@test_int_x86_avx512_mask_pavg_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pavg_b_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rdi, %k1
+; AVX512BW-NEXT:    vpavgb %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpavgb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
   %res1 = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
   %res2 = add <64 x i8> %res, %res1
@@ -916,11 +1285,14 @@ define <64 x i8>@test_int_x86_avx512_mask_pavg_b_512(<64 x i8> %x0, <64 x i8> %x
 
 declare <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
-; CHECK-LABEL: @test_int_x86_avx512_mask_pavg_w_512
-; CHECK-NOT: call 
-; CHECK: vpavgw %zmm
-; CHECK: {%k1} 
 define <32 x i16>@test_int_x86_avx512_mask_pavg_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pavg_w_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpavgw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpavgw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   %res2 = add <32 x i16> %res, %res1
@@ -929,11 +1301,14 @@ define <32 x i16>@test_int_x86_avx512_mask_pavg_w_512(<32 x i16> %x0, <32 x i16>
 
 declare <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
 
-; CHECK-LABEL: @test_int_x86_avx512_mask_pshuf_b_512
-; CHECK-NOT: call 
-; CHECK: kmov 
-; CHECK: vpshufb %zmm{{.*}}{%k1} 
 define <64 x i8>@test_int_x86_avx512_mask_pshuf_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pshuf_b_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rdi, %k1
+; AVX512BW-NEXT:    vpshufb %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpshufb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
   %res1 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
   %res2 = add <64 x i8> %res, %res1
@@ -942,11 +1317,14 @@ define <64 x i8>@test_int_x86_avx512_mask_pshuf_b_512(<64 x i8> %x0, <64 x i8> %
 
 declare <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16>, <32 x i16>, i32)
 
-; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_w_512
-; CHECK-NOT: call 
-; CHECK: kmov 
-; CHECK: vpabsw{{.*}}{%k1} 
 define <32 x i16>@test_int_x86_avx512_mask_pabs_w_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pabs_w_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpabsw %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vpabsw %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 -1)
   %res2 = add <32 x i16> %res, %res1
@@ -955,11 +1333,14 @@ define <32 x i16>@test_int_x86_avx512_mask_pabs_w_512(<32 x i16> %x0, <32 x i16>
 
 declare <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8>, <64 x i8>, i64)
 
-; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_b_512
-; CHECK-NOT: call 
-; CHECK: kmov 
-; CHECK: vpabsb{{.*}}{%k1} 
 define <64 x i8>@test_int_x86_avx512_mask_pabs_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pabs_b_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rdi, %k1
+; AVX512BW-NEXT:    vpabsb %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vpabsb %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2)
   %res1 = call <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 -1)
   %res2 = add <64 x i8> %res, %res1
@@ -968,12 +1349,14 @@ define <64 x i8>@test_int_x86_avx512_mask_pabs_b_512(<64 x i8> %x0, <64 x i8> %x
 
 declare <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhu_w_512
-; CHECK-NOT: call 
-; CHECK: kmov 
-; CHECK: {%k1} 
-; CHECK: vpmulhuw {{.*}}encoding: [0x62
 define <32 x i16>@test_int_x86_avx512_mask_pmulhu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pmulhu_w_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   %res2 = add <32 x i16> %res, %res1
@@ -982,12 +1365,14 @@ define <32 x i16>@test_int_x86_avx512_mask_pmulhu_w_512(<32 x i16> %x0, <32 x i1
 
 declare <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmulh_w_512
-; CHECK-NOT: call 
-; CHECK: kmov 
-; CHECK: {%k1} 
-; CHECK: vpmulhw {{.*}}encoding: [0x62
 define <32 x i16>@test_int_x86_avx512_mask_pmulh_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pmulh_w_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpmulhw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpmulhw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   %res2 = add <32 x i16> %res, %res1
@@ -996,12 +1381,14 @@ define <32 x i16>@test_int_x86_avx512_mask_pmulh_w_512(<32 x i16> %x0, <32 x i16
 
 declare <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhr_sw_512
-; CHECK-NOT: call 
-; CHECK: kmov 
-; CHECK: {%k1} 
-; CHECK: vpmulhrsw {{.*}}encoding: [0x62
 define <32 x i16>@test_int_x86_avx512_mask_pmulhr_sw_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pmulhr_sw_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpmulhrsw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpmulhrsw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   %res2 = add <32 x i16> %res, %res1
@@ -1011,10 +1398,15 @@ define <32 x i16>@test_int_x86_avx512_mask_pmulhr_sw_512(<32 x i16> %x0, <32 x i
 declare <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16>, <32 x i8>, i32)
 
 define <32 x i8>@test_int_x86_avx512_mask_pmov_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_512:
-; CHECK:       vpmovwb %zmm0, %ymm1 {%k1}
-; CHECK-NEXT:  vpmovwb %zmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT:  vpmovwb %zmm0, %ymm0
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pmov_wb_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm1 {%k1}
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm2 {%k1} {z}
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX512BW-NEXT:    retq
     %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1)
     %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2)
     %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2)
@@ -1026,9 +1418,12 @@ define <32 x i8>@test_int_x86_avx512_mask_pmov_wb_512(<32 x i16> %x0, <32 x i8>
 declare void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16>, i32)
 
 define void @test_int_x86_avx512_mask_pmov_wb_mem_512(i8* %ptr, <32 x i16> %x1, i32 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_512:
-; CHECK:  vpmovwb %zmm0, (%rdi)
-; CHECK:  vpmovwb %zmm0, (%rdi) {%k1}
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpmovwb %zmm0, (%rdi)
+; AVX512BW-NEXT:    vpmovwb %zmm0, (%rdi) {%k1}
+; AVX512BW-NEXT:    retq
     call void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 -1)
     call void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 %x2)
     ret void
@@ -1037,10 +1432,15 @@ define void @test_int_x86_avx512_mask_pmov_wb_mem_512(i8* %ptr, <32 x i16> %x1,
 declare <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16>, <32 x i8>, i32)
 
 define <32 x i8>@test_int_x86_avx512_mask_pmovs_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_512:
-; CHECK:       vpmovswb %zmm0, %ymm1 {%k1}
-; CHECK-NEXT:  vpmovswb %zmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT:  vpmovswb %zmm0, %ymm0
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovs_wb_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpmovswb %zmm0, %ymm1 {%k1}
+; AVX512BW-NEXT:    vpmovswb %zmm0, %ymm2 {%k1} {z}
+; AVX512BW-NEXT:    vpmovswb %zmm0, %ymm0
+; AVX512BW-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX512BW-NEXT:    retq
     %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1)
     %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2)
     %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2)
@@ -1052,9 +1452,12 @@ define <32 x i8>@test_int_x86_avx512_mask_pmovs_wb_512(<32 x i16> %x0, <32 x i8>
 declare void @llvm.x86.avx512.mask.pmovs.wb.mem.512(i8* %ptr, <32 x i16>, i32)
 
 define void @test_int_x86_avx512_mask_pmovs_wb_mem_512(i8* %ptr, <32 x i16> %x1, i32 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_512:
-; CHECK:  vpmovswb %zmm0, (%rdi)
-; CHECK:  vpmovswb %zmm0, (%rdi) {%k1}
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpmovswb %zmm0, (%rdi)
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpmovswb %zmm0, (%rdi) {%k1}
+; AVX512BW-NEXT:    retq
     call void @llvm.x86.avx512.mask.pmovs.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 -1)
     call void @llvm.x86.avx512.mask.pmovs.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 %x2)
     ret void
@@ -1063,10 +1466,15 @@ define void @test_int_x86_avx512_mask_pmovs_wb_mem_512(i8* %ptr, <32 x i16> %x1,
 declare <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16>, <32 x i8>, i32)
 
 define <32 x i8>@test_int_x86_avx512_mask_pmovus_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_512:
-; CHECK:       vpmovuswb %zmm0, %ymm1 {%k1}
-; CHECK-NEXT:  vpmovuswb %zmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT:  vpmovuswb %zmm0, %ymm0
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovus_wb_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpmovuswb %zmm0, %ymm1 {%k1}
+; AVX512BW-NEXT:    vpmovuswb %zmm0, %ymm2 {%k1} {z}
+; AVX512BW-NEXT:    vpmovuswb %zmm0, %ymm0
+; AVX512BW-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX512BW-NEXT:    retq
     %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1)
     %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2)
     %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2)
@@ -1078,9 +1486,12 @@ define <32 x i8>@test_int_x86_avx512_mask_pmovus_wb_512(<32 x i16> %x0, <32 x i8
 declare void @llvm.x86.avx512.mask.pmovus.wb.mem.512(i8* %ptr, <32 x i16>, i32)
 
 define void @test_int_x86_avx512_mask_pmovus_wb_mem_512(i8* %ptr, <32 x i16> %x1, i32 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_512:
-; CHECK:  vpmovuswb %zmm0, (%rdi)
-; CHECK:  vpmovuswb %zmm0, (%rdi) {%k1}
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpmovuswb %zmm0, (%rdi)
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpmovuswb %zmm0, (%rdi) {%k1}
+; AVX512BW-NEXT:    retq
     call void @llvm.x86.avx512.mask.pmovus.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 -1)
     call void @llvm.x86.avx512.mask.pmovus.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 %x2)
     ret void
@@ -1089,13 +1500,13 @@ define void @test_int_x86_avx512_mask_pmovus_wb_mem_512(i8* %ptr, <32 x i16> %x1
 declare <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8>, <64 x i8>, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pmaddubs_w_512:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT:    vpmaddubsw %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vpmaddubsw %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 
-; CHECK-NEXT:    retq
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaddubs_w_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpmaddubsw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpmaddubsw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 -1)
   %res2 = add <32 x i16> %res, %res1
@@ -1105,13 +1516,13 @@ define <32 x i16>@test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x i
 declare <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16>, <32 x i16>, <16 x i32>, i16)
 
 define <16 x i32>@test_int_x86_avx512_mask_pmaddw_d_512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_512:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm2 {%k1} 
-; CHECK-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0 
-; CHECK-NEXT:    retq
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaddw_d_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovw %edi, %k1
+; AVX512BW-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 %x3)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 -1)
   %res2 = add <16 x i32> %res, %res1
@@ -1121,15 +1532,13 @@ define <16 x i32>@test_int_x86_avx512_mask_pmaddw_d_512(<32 x i16> %x0, <32 x i1
 declare <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
 
 define <64 x i8>@test_int_x86_avx512_mask_punpckhb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpckhb_w_512:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovq %rdi, %k1
-; CHECK-NEXT:    vpunpckhbw %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    ## zmm2 = zmm2[8],k1[8],zmm2[9],k1[9],zmm2[10],k1[10],zmm2[11],k1[11],zmm2[12],k1[12],zmm2[13],k1[13],zmm2[14],k1[14],zmm2[15],k1[15],zmm2[24],k1[24],zmm2[25],k1[25],zmm2[26],k1[26],zmm2[27],k1[27],zmm2[28],k1[28],zmm2[29],k1[29],zmm2[30],k1[30],zmm2[31],k1[31],zmm2[40],k1[40],zmm2[41],k1[41],zmm2[42],k1[42],zmm2[43],k1[43],zmm2[44],k1[44],zmm2[45],k1[45],zmm2[46],k1[46],zmm2[47],k1[47],zmm2[56],k1[56],zmm2[57],k1[57],zmm2[58],k1[58],zmm2[59],k1[59],zmm2[60],k1[60],zmm2[61],k1[61],zmm2[62],k1[62],zmm2[63],k1[63]
-; CHECK-NEXT:    vpunpckhbw %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    ## zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
-; CHECK-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; AVX512BW-LABEL: test_int_x86_avx512_mask_punpckhb_w_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rdi, %k1
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm2[8],k1[8],zmm2[9],k1[9],zmm2[10],k1[10],zmm2[11],k1[11],zmm2[12],k1[12],zmm2[13],k1[13],zmm2[14],k1[14],zmm2[15],k1[15],zmm2[24],k1[24],zmm2[25],k1[25],zmm2[26],k1[26],zmm2[27],k1[27],zmm2[28],k1[28],zmm2[29],k1[29],zmm2[30],k1[30],zmm2[31],k1[31],zmm2[40],k1[40],zmm2[41],k1[41],zmm2[42],k1[42],zmm2[43],k1[43],zmm2[44],k1[44],zmm2[45],k1[45],zmm2[46],k1[46],zmm2[47],k1[47],zmm2[56],k1[56],zmm2[57],k1[57],zmm2[58],k1[58],zmm2[59],k1[59],zmm2[60],k1[60],zmm2[61],k1[61],zmm2[62],k1[62],zmm2[63],k1[63]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; AVX512BW-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
   %res1 = call <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
   %res2 = add <64 x i8> %res, %res1
@@ -1139,15 +1548,13 @@ define <64 x i8>@test_int_x86_avx512_mask_punpckhb_w_512(<64 x i8> %x0, <64 x i8
 declare <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
 
 define <64 x i8>@test_int_x86_avx512_mask_punpcklb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpcklb_w_512:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovq %rdi, %k1
-; CHECK-NEXT:    vpunpcklbw %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    ## zmm2 = zmm2[0],k1[0],zmm2[1],k1[1],zmm2[2],k1[2],zmm2[3],k1[3],zmm2[4],k1[4],zmm2[5],k1[5],zmm2[6],k1[6],zmm2[7],k1[7],zmm2[16],k1[16],zmm2[17],k1[17],zmm2[18],k1[18],zmm2[19],k1[19],zmm2[20],k1[20],zmm2[21],k1[21],zmm2[22],k1[22],zmm2[23],k1[23],zmm2[32],k1[32],zmm2[33],k1[33],zmm2[34],k1[34],zmm2[35],k1[35],zmm2[36],k1[36],zmm2[37],k1[37],zmm2[38],k1[38],zmm2[39],k1[39],zmm2[48],k1[48],zmm2[49],k1[49],zmm2[50],k1[50],zmm2[51],k1[51],zmm2[52],k1[52],zmm2[53],k1[53],zmm2[54],k1[54],zmm2[55],k1[55]
-; CHECK-NEXT:    vpunpcklbw %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    ## zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
-; CHECK-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; AVX512BW-LABEL: test_int_x86_avx512_mask_punpcklb_w_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rdi, %k1
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[1],k1[1],zmm2[2],k1[2],zmm2[3],k1[3],zmm2[4],k1[4],zmm2[5],k1[5],zmm2[6],k1[6],zmm2[7],k1[7],zmm2[16],k1[16],zmm2[17],k1[17],zmm2[18],k1[18],zmm2[19],k1[19],zmm2[20],k1[20],zmm2[21],k1[21],zmm2[22],k1[22],zmm2[23],k1[23],zmm2[32],k1[32],zmm2[33],k1[33],zmm2[34],k1[34],zmm2[35],k1[35],zmm2[36],k1[36],zmm2[37],k1[37],zmm2[38],k1[38],zmm2[39],k1[39],zmm2[48],k1[48],zmm2[49],k1[49],zmm2[50],k1[50],zmm2[51],k1[51],zmm2[52],k1[52],zmm2[53],k1[53],zmm2[54],k1[54],zmm2[55],k1[55]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; AVX512BW-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
   %res1 = call <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
   %res2 = add <64 x i8> %res, %res1
@@ -1157,15 +1564,13 @@ define <64 x i8>@test_int_x86_avx512_mask_punpcklb_w_512(<64 x i8> %x0, <64 x i8
 declare <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_punpckhw_d_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpckhw_d_512:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpunpckhwd %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    ## zmm2 = zmm2[4],k1[4],zmm2[5],k1[5],zmm2[6],k1[6],zmm2[7],k1[7],zmm2[12],k1[12],zmm2[13],k1[13],zmm2[14],k1[14],zmm2[15],k1[15],zmm2[20],k1[20],zmm2[21],k1[21],zmm2[22],k1[22],zmm2[23],k1[23],zmm2[28],k1[28],zmm2[29],k1[29],zmm2[30],k1[30],zmm2[31],k1[31]
-; CHECK-NEXT:    vpunpckhwd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    ## zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
-; CHECK-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; AVX512BW-LABEL: test_int_x86_avx512_mask_punpckhw_d_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpunpckhwd {{.*#+}} zmm2 = zmm2[4],k1[4],zmm2[5],k1[5],zmm2[6],k1[6],zmm2[7],k1[7],zmm2[12],k1[12],zmm2[13],k1[13],zmm2[14],k1[14],zmm2[15],k1[15],zmm2[20],k1[20],zmm2[21],k1[21],zmm2[22],k1[22],zmm2[23],k1[23],zmm2[28],k1[28],zmm2[29],k1[29],zmm2[30],k1[30],zmm2[31],k1[31]
+; AVX512BW-NEXT:    vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
+; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   %res2 = add <32 x i16> %res, %res1
@@ -1175,15 +1580,13 @@ define <32 x i16>@test_int_x86_avx512_mask_punpckhw_d_512(<32 x i16> %x0, <32 x
 declare <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_punpcklw_d_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpcklw_d_512:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpunpcklwd %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    ## zmm2 = zmm2[0],k1[0],zmm2[1],k1[1],zmm2[2],k1[2],zmm2[3],k1[3],zmm2[8],k1[8],zmm2[9],k1[9],zmm2[10],k1[10],zmm2[11],k1[11],zmm2[16],k1[16],zmm2[17],k1[17],zmm2[18],k1[18],zmm2[19],k1[19],zmm2[24],k1[24],zmm2[25],k1[25],zmm2[26],k1[26],zmm2[27],k1[27]
-; CHECK-NEXT:    vpunpcklwd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    ## zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
-; CHECK-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; AVX512BW-LABEL: test_int_x86_avx512_mask_punpcklw_d_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[1],k1[1],zmm2[2],k1[2],zmm2[3],k1[3],zmm2[8],k1[8],zmm2[9],k1[9],zmm2[10],k1[10],zmm2[11],k1[11],zmm2[16],k1[16],zmm2[17],k1[17],zmm2[18],k1[18],zmm2[19],k1[19],zmm2[24],k1[24],zmm2[25],k1[25],zmm2[26],k1[26],zmm2[27],k1[27]
+; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
+; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   %res2 = add <32 x i16> %res, %res1
@@ -1193,15 +1596,15 @@ define <32 x i16>@test_int_x86_avx512_mask_punpcklw_d_512(<32 x i16> %x0, <32 x
 declare <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8>, <64 x i8>, i32, <64 x i8>, i64)
 
 define <64 x i8>@test_int_x86_avx512_mask_palignr_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x3, i64 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_palignr_512:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovq %rdi, %k1
-; CHECK-NEXT:    vpalignr $2, %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vpalignr $2, %zmm1, %zmm0, %zmm3 {%k1} {z}
-; CHECK-NEXT:    vpalignr $2, %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    vpaddb %zmm3, %zmm2, %zmm1
-; CHECK-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; AVX512BW-LABEL: test_int_x86_avx512_mask_palignr_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rdi, %k1
+; AVX512BW-NEXT:    vpalignr $2, %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpalignr $2, %zmm1, %zmm0, %zmm3 {%k1} {z}
+; AVX512BW-NEXT:    vpalignr $2, %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddb %zmm3, %zmm2, %zmm1
+; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> %x3, i64 %x4)
   %res1 = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> zeroinitializer, i64 %x4)
   %res2 = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> %x3, i64 -1)
@@ -1213,15 +1616,15 @@ define <64 x i8>@test_int_x86_avx512_mask_palignr_512(<64 x i8> %x0, <64 x i8> %
 declare <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8>, <64 x i8>, i32, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x3, i32 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_dbpsadbw_512:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z}
-; CHECK-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    vpaddw %zmm3, %zmm2, %zmm1
-; CHECK-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; AVX512BW-LABEL: test_int_x86_avx512_mask_dbpsadbw_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z}
+; AVX512BW-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddw %zmm3, %zmm2, %zmm1
+; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> %x3, i32 %x4)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> zeroinitializer, i32 %x4)
   %res2 = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> %x3, i32 -1)
@@ -1232,11 +1635,13 @@ define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8>
 
 declare <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64>, i32)
 
-; CHECK-LABEL: @test_int_x86_avx512_mask_psll_dq_512
-; CHECK-NOT: call 
-; CHECK: vpslldq 
-; CHECK: vpslldq 
 define <8 x i64>@test_int_x86_avx512_mask_psll_dq_512(<8 x i64> %x0) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_psll_dq_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpslldq $8, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpslldq $4, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64> %x0, i32 8)
   %res1 = call <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64> %x0, i32 4)
   %res2 = add <8 x i64> %res, %res1
@@ -1245,11 +1650,13 @@ define <8 x i64>@test_int_x86_avx512_mask_psll_dq_512(<8 x i64> %x0) {
 
 declare <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64>, i32)
 
-; CHECK-LABEL: @test_int_x86_avx512_mask_psrl_dq_512
-; CHECK-NOT: call 
-; CHECK: vpsrldq 
-; CHECK: vpsrldq
 define <8 x i64>@test_int_x86_avx512_mask_psrl_dq_512(<8 x i64> %x0) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_psrl_dq_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpsrldq $8, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpsrldq $4, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64> %x0, i32 8)
   %res1 = call <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64> %x0, i32 4)
   %res2 = add <8 x i64> %res, %res1
@@ -1257,11 +1664,13 @@ define <8 x i64>@test_int_x86_avx512_mask_psrl_dq_512(<8 x i64> %x0) {
 }
 declare  <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8>, <64 x i8>)
 
-; CHECK-LABEL: @test_int_x86_avx512_mask_psadb_w_512
-; CHECK-NOT: call 
-; CHECK: vpsadbw %zmm1
-; CHECK: vpsadbw %zmm2
 define  <8 x i64>@test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2){
+; AVX512BW-LABEL: test_int_x86_avx512_mask_psadb_w_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpsadbw %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpsadbw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
   %res = call  <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x1)
   %res1 = call  <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x2)
   %res2 = add  <8 x i64> %res, %res1

From b294a7b51a8ce78941701fec5e1fdcf806764bc9 Mon Sep 17 00:00:00 2001
From: Zoran Jovanovic <zoran.jovanovic@imgtec.com>
Date: Mon, 30 Nov 2015 12:56:18 +0000
Subject: [PATCH 004/186] [mips][microMIPS] Fix issue with offset operand of
 BALC and BC instructions Value of offset operand for microMIPS BALC and BC
 instructions is currently shifted 2 bits, but it should be 1 bit.
 Differential Revision: http://reviews.llvm.org/D14770

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254296 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../Mips/Disassembler/MipsDisassembler.cpp      | 17 +++++++++++++++++
 .../Mips/MCTargetDesc/MipsMCCodeEmitter.cpp     | 17 +++++++++++++++++
 .../Mips/MCTargetDesc/MipsMCCodeEmitter.h       |  7 +++++++
 lib/Target/Mips/MicroMips32r6InstrInfo.td       | 11 +++++++++--
 .../Disassembler/Mips/micromips32r6/valid.txt   |  4 ++--
 test/MC/Mips/micromips32r6/valid.s              |  4 ++--
 6 files changed, 54 insertions(+), 6 deletions(-)

diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index f9601839b44c..716a96e2c46b 100644
--- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -229,6 +229,13 @@ static DecodeStatus DecodeBranchTargetMM(MCInst &Inst,
                                          uint64_t Address,
                                          const void *Decoder);
 
+// DecodeBranchTarget26MM - Decode microMIPS branch offset, which is
+// shifted left by 1 bit.
+static DecodeStatus DecodeBranchTarget26MM(MCInst &Inst,
+                                           unsigned Offset,
+                                           uint64_t Address,
+                                           const void *Decoder);
+
 // DecodeJumpTargetMM - Decode microMIPS jump target, which is
 // shifted left by 1 bit.
 static DecodeStatus DecodeJumpTargetMM(MCInst &Inst,
@@ -1863,6 +1870,16 @@ static DecodeStatus DecodeBranchTargetMM(MCInst &Inst,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeBranchTarget26MM(MCInst &Inst,
+  unsigned Offset,
+  uint64_t Address,
+  const void *Decoder) {
+  int32_t BranchOffset = SignExtend32<26>(Offset) << 1;
+
+  Inst.addOperand(MCOperand::createImm(BranchOffset));
+  return MCDisassembler::Success;
+}
+
 static DecodeStatus DecodeJumpTargetMM(MCInst &Inst,
                                        unsigned Insn,
                                        uint64_t Address,
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index 86a5d5882184..ed917a4daba3 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -350,6 +350,23 @@ getBranchTarget26OpValue(const MCInst &MI, unsigned OpNo,
   return 0;
 }
 
+/// getBranchTarget26OpValueMM - Return binary encoding of the branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::getBranchTarget26OpValueMM(
+    const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  // If the destination is an immediate, divide by 2.
+  if (MO.isImm())
+    return MO.getImm() >> 1;
+
+  // TODO: Push 26 PC fixup.
+  return 0;
+}
+
 /// getJumpOffset16OpValue - Return binary encoding of the jump
 /// target operand. If the machine operand requires relocation,
 /// record the relocation and return zero.
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
index c2f4b6a72bbf..eb48914b0649 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
@@ -137,6 +137,13 @@ class MipsMCCodeEmitter : public MCCodeEmitter {
                                     SmallVectorImpl<MCFixup> &Fixups,
                                     const MCSubtargetInfo &STI) const;
 
+  // getBranchTarget26OpValueMM - Return binary encoding of the branch
+  // offset operand. If the machine operand requires relocation,
+  // record the relocation and return zero.
+  unsigned getBranchTarget26OpValueMM(const MCInst &MI, unsigned OpNo,
+                                      SmallVectorImpl<MCFixup> &Fixups,
+                                      const MCSubtargetInfo &STI) const;
+
   // getJumpOffset16OpValue - Return binary encoding of the jump
   // offset operand. If the machine operand requires relocation,
   // record the relocation and return zero.
diff --git a/lib/Target/Mips/MicroMips32r6InstrInfo.td b/lib/Target/Mips/MicroMips32r6InstrInfo.td
index cabaa53b2b1b..2dbd20cfad99 100644
--- a/lib/Target/Mips/MicroMips32r6InstrInfo.td
+++ b/lib/Target/Mips/MicroMips32r6InstrInfo.td
@@ -11,6 +11,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+def brtarget26_mm : Operand<OtherVT> {
+  let EncoderMethod = "getBranchTarget26OpValueMM";
+  let OperandType = "OPERAND_PCREL";
+  let DecoderMethod = "DecodeBranchTarget26MM";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
 //===----------------------------------------------------------------------===//
 //
 // Instruction Encodings
@@ -238,11 +245,11 @@ class BC_MMR6_DESC_BASE<string instr_asm, DAGOperand opnd>
   bit isBarrier = 1;
 }
 
-class BALC_MMR6_DESC : BC_MMR6_DESC_BASE<"balc", brtarget26> {
+class BALC_MMR6_DESC : BC_MMR6_DESC_BASE<"balc", brtarget26_mm> {
   bit isCall = 1;
   list<Register> Defs = [RA];
 }
-class BC_MMR6_DESC : BC_MMR6_DESC_BASE<"bc", brtarget26>;
+class BC_MMR6_DESC : BC_MMR6_DESC_BASE<"bc", brtarget26_mm>;
 
 class BC16_MMR6_DESC : MicroMipsInst16<(outs), (ins brtarget10_mm:$offset),
                                        !strconcat("bc16", "\t$offset"), [],
diff --git a/test/MC/Disassembler/Mips/micromips32r6/valid.txt b/test/MC/Disassembler/Mips/micromips32r6/valid.txt
index 619ce3faeda6..82c5d50df92d 100644
--- a/test/MC/Disassembler/Mips/micromips32r6/valid.txt
+++ b/test/MC/Disassembler/Mips/micromips32r6/valid.txt
@@ -44,8 +44,8 @@
 0xe0 0x40 0x02 0x9a # CHECK: bgtzalc $2, 1332
 0xe0 0x42 0x02 0x9a # CHECK: bltzalc $2, 1332
 0xc0 0x40 0x02 0x9a # CHECK: blezalc $2, 1332
-0xb4 0x37 0x96 0xb8 # CHECK: balc 14572256
-0x94 0x37 0x96 0xb8 # CHECK: bc 14572256
+0xb4 0x37 0x96 0xb8 # CHECK: balc 7286128
+0x94 0x37 0x96 0xb8 # CHECK: bc 7286128
 0x00 0x44 0x0b 0x3c # CHECK: bitswap $4, $2
 0x00 0x00 0x00 0x07 # CHECK: break
 0x00 0x07 0x00 0x07 # CHECK: break 7
diff --git a/test/MC/Mips/micromips32r6/valid.s b/test/MC/Mips/micromips32r6/valid.s
index b2bce8407656..81d4b6c6d456 100644
--- a/test/MC/Mips/micromips32r6/valid.s
+++ b/test/MC/Mips/micromips32r6/valid.s
@@ -26,9 +26,9 @@
   bgtzalc $2, 1332         # CHECK: bgtzalc $2, 1332    # encoding: [0xe0,0x40,0x02,0x9a]
   bltzalc $2, 1332         # CHECK: bltzalc $2, 1332    # encoding: [0xe0,0x42,0x02,0x9a]
   blezalc $2, 1332         # CHECK: blezalc $2, 1332    # encoding: [0xc0,0x40,0x02,0x9a]
-  balc 14572256            # CHECK: balc 14572256       # encoding: [0xb4,0x37,0x96,0xb8]
+  balc 7286128             # CHECK: balc 7286128        # encoding: [0xb4,0x37,0x96,0xb8]
   b 132                    # CHECK: bc16 132            # encoding: [0xcc,0x42]
-  bc 14572256              # CHECK: bc 14572256         # encoding: [0x94,0x37,0x96,0xb8]
+  bc 7286128               # CHECK: bc 7286128          # encoding: [0x94,0x37,0x96,0xb8]
   bc16 132                 # CHECK: bc16 132            # encoding: [0xcc,0x42]
   beqzc16 $6, 20           # CHECK: beqzc16 $6, 20      # encoding: [0x8f,0x0a]
   bnezc16 $6, 20           # CHECK: bnezc16 $6, 20      # encoding: [0xaf,0x0a]

From 30709900f394ba33d546b6830eadc7307add3219 Mon Sep 17 00:00:00 2001
From: Hrvoje Varga <Hrvoje.Varga@imgtec.com>
Date: Mon, 30 Nov 2015 12:58:39 +0000
Subject: [PATCH 005/186] [mips][microMIPS] Implement LBUX, LHX, LWX,
 MAQ_S[A].W.PHL, MAQ_S[A].W.PHR, MFHI, MFLO, MTHI and MTLO instructions
 Differential Revision: http://reviews.llvm.org/D14436

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254297 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Mips/MicroMipsDSPInstrFormats.td   | 25 +++++++++++++
 lib/Target/Mips/MicroMipsDSPInstrInfo.td      | 36 +++++++++++++++++++
 lib/Target/Mips/MipsDSPInstrInfo.td           | 25 +++++++------
 .../Disassembler/Mips/micromips-dsp/valid.txt | 11 ++++++
 test/MC/Mips/micromips-dsp/valid.s            | 11 ++++++
 5 files changed, 97 insertions(+), 11 deletions(-)

diff --git a/lib/Target/Mips/MicroMipsDSPInstrFormats.td b/lib/Target/Mips/MicroMipsDSPInstrFormats.td
index d3f9fe31afb7..f231d3a5294d 100644
--- a/lib/Target/Mips/MicroMipsDSPInstrFormats.td
+++ b/lib/Target/Mips/MicroMipsDSPInstrFormats.td
@@ -153,3 +153,28 @@ class POOL32A_2RSA5_FMT<string opstr, bits<11> op> : MMDSPInst<opstr> {
   let Inst{15-11} = sa;
   let Inst{10-0}  = op;
 }
+
+class POOL32A_1RMEMB0_FMT<string opstr, bits<10> funct> : MMDSPInst<opstr> {
+  bits<5> index;
+  bits<5> base;
+  bits<5> rd;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = index;
+  let Inst{20-16} = base;
+  let Inst{15-11} = rd;
+  let Inst{10}    = 0b0;
+  let Inst{9-0}   = funct;
+}
+
+class POOL32A_1RAC_FMT<string instr_asm, bits<8> funct> : MMDSPInst<instr_asm> {
+  bits<5> rs;
+  bits<2> ac;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = 0;
+  let Inst{20-16} = rs;
+  let Inst{15-14} = ac;
+  let Inst{13-6}  = funct;
+  let Inst{5-0}   = 0b111100;
+}
diff --git a/lib/Target/Mips/MicroMipsDSPInstrInfo.td b/lib/Target/Mips/MicroMipsDSPInstrInfo.td
index f515f380f0db..204a4ec60c5a 100644
--- a/lib/Target/Mips/MicroMipsDSPInstrInfo.td
+++ b/lib/Target/Mips/MicroMipsDSPInstrInfo.td
@@ -130,6 +130,17 @@ class PRECRQ_QB_PH_MM_ENC : POOL32A_3RB0_FMT<"precrq.qb.ph", 0b0010101101>;
 class PRECRQU_S_QB_PH_MM_ENC
     : POOL32A_3RB0_FMT<"precrqu_s.qb.ph", 0b0101101101>;
 class PRECRQ_RS_PH_W_MM_ENC : POOL32A_3RB0_FMT<"precrq_rs.ph.w", 0b0100101101>;
+class LBUX_MM_ENC : POOL32A_1RMEMB0_FMT<"lbux", 0b1000100101>;
+class LHX_MM_ENC : POOL32A_1RMEMB0_FMT<"lhx", 0b0101100101>;
+class LWX_MM_ENC : POOL32A_1RMEMB0_FMT<"lwx", 0b0110100101>;
+class MAQ_S_W_PHL_MM_ENC : POOL32A_2RAC_FMT<"maq_s.w.phl", 0b01101001>;
+class MAQ_SA_W_PHL_MM_ENC : POOL32A_2RAC_FMT<"maq_sa.w.phl", 0b11101001>;
+class MAQ_S_W_PHR_MM_ENC : POOL32A_2RAC_FMT<"maq_s.w.phr", 0b00101001>;
+class MAQ_SA_W_PHR_MM_ENC : POOL32A_2RAC_FMT<"maq_sa.w.phr", 0b10101001>;
+class MFHI_MM_ENC : POOL32A_1RAC_FMT<"mfhi", 0b00000001>;
+class MFLO_MM_ENC : POOL32A_1RAC_FMT<"mflo", 0b01000001>;
+class MTHI_MM_ENC : POOL32A_1RAC_FMT<"mthi", 0b10000001>;
+class MTLO_MM_ENC : POOL32A_1RAC_FMT<"mthi", 0b11000001>;
 
 // Instruction desc.
 class ABSQ_S_PH_MM_R2_DESC_BASE<string opstr, SDPatternOperator OpNode,
@@ -288,6 +299,20 @@ class EXTRV_S_H_MM_DESC
     : EXT_MM_2R_DESC_BASE<"extrv_s.h", MipsEXTR_S_H, NoItinerary>,
       Defs<[DSPOutFlag23]>;
 
+class MFHI_MM_DESC_BASE<string instr_asm, RegisterOperand RO, SDNode OpNode,
+                        InstrItinClass itin> {
+  dag OutOperandList = (outs GPR32Opnd:$rs);
+  dag InOperandList = (ins RO:$ac);
+  string AsmString = !strconcat(instr_asm, "\t$rs, $ac");
+  list<dag> Pattern = [(set GPR32Opnd:$rs, (OpNode RO:$ac))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MFHI_MM_DESC : MFHI_MM_DESC_BASE<"mfhi", ACC64DSPOpnd, MipsMFHI,
+                                       NoItinerary>;
+class MFLO_MM_DESC : MFHI_MM_DESC_BASE<"mflo", ACC64DSPOpnd, MipsMFLO,
+                                       NoItinerary>;
+
 // Instruction defs.
 // microMIPS DSP Rev 1
 def ADDQ_PH_MM : DspMMRel, ADDQ_PH_MM_ENC, ADDQ_PH_DESC;
@@ -368,6 +393,17 @@ def PRECRQ_PH_W_MM : DspMMRel, PRECRQ_PH_W_MM_ENC, PRECRQ_PH_W_DESC;
 def PRECRQ_QB_PH_MM : DspMMRel, PRECRQ_QB_PH_MM_ENC, PRECRQ_QB_PH_DESC;
 def PRECRQU_S_QB_PH_MM : DspMMRel, PRECRQU_S_QB_PH_MM_ENC, PRECRQU_S_QB_PH_DESC;
 def PRECRQ_RS_PH_W_MM : DspMMRel, PRECRQ_RS_PH_W_MM_ENC, PRECRQ_RS_PH_W_DESC;
+def LBUX_MM : DspMMRel, LBUX_MM_ENC, LBUX_DESC;
+def LHX_MM : DspMMRel, LHX_MM_ENC, LHX_DESC;
+def LWX_MM : DspMMRel, LWX_MM_ENC, LWX_DESC;
+def MAQ_S_W_PHL_MM : DspMMRel, MAQ_S_W_PHL_MM_ENC, MAQ_S_W_PHL_DESC;
+def MAQ_SA_W_PHL_MM : DspMMRel, MAQ_SA_W_PHL_MM_ENC, MAQ_SA_W_PHL_DESC;
+def MAQ_S_W_PHR_MM : DspMMRel, MAQ_S_W_PHR_MM_ENC, MAQ_S_W_PHR_DESC;
+def MAQ_SA_W_PHR_MM : DspMMRel, MAQ_SA_W_PHR_MM_ENC, MAQ_SA_W_PHR_DESC;
+def MFHI_DSP_MM : DspMMRel, MFHI_MM_ENC, MFHI_MM_DESC;
+def MFLO_DSP_MM : DspMMRel, MFLO_MM_ENC, MFLO_MM_DESC;
+def MTHI_DSP_MM : DspMMRel, MTHI_MM_ENC, MTHI_DESC;
+def MTLO_DSP_MM : DspMMRel, MTLO_MM_ENC, MTLO_DESC;
 // microMIPS DSP Rev 2
 def ABSQ_S_QB_MMR2 : DspMMRel, ABSQ_S_QB_MMR2_ENC, ABSQ_S_QB_MMR2_DESC,
                      ISA_DSPR2;
diff --git a/lib/Target/Mips/MipsDSPInstrInfo.td b/lib/Target/Mips/MipsDSPInstrInfo.td
index 91513f3f2c63..e8cfbcf572d7 100644
--- a/lib/Target/Mips/MipsDSPInstrInfo.td
+++ b/lib/Target/Mips/MipsDSPInstrInfo.td
@@ -360,6 +360,7 @@ class LX_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
   list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode iPTR:$base, iPTR:$index))];
   InstrItinClass Itinerary = itin;
   bit mayLoad = 1;
+  string BaseOpcode = instr_asm;
 }
 
 class ADDUH_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -487,6 +488,7 @@ class MFHI_DESC_BASE<string instr_asm, RegisterOperand RO, SDNode OpNode,
   string AsmString = !strconcat(instr_asm, "\t$rd, $ac");
   list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode RO:$ac))];
   InstrItinClass Itinerary = itin;
+  string BaseOpcode = instr_asm;
 }
 
 class MTHI_DESC_BASE<string instr_asm, RegisterOperand RO, InstrItinClass itin> {
@@ -494,6 +496,7 @@ class MTHI_DESC_BASE<string instr_asm, RegisterOperand RO, InstrItinClass itin>
   dag InOperandList = (ins GPR32Opnd:$rs);
   string AsmString = !strconcat(instr_asm, "\t$rs, $ac");
   InstrItinClass Itinerary = itin;
+  string BaseOpcode = instr_asm;
 }
 
 class BPOSGE32_PSEUDO_DESC_BASE<SDPatternOperator OpNode, InstrItinClass itin> :
@@ -1143,14 +1146,14 @@ def MULEQ_S_W_PHL : DspMMRel, MULEQ_S_W_PHL_ENC, MULEQ_S_W_PHL_DESC;
 def MULEQ_S_W_PHR : DspMMRel, MULEQ_S_W_PHR_ENC, MULEQ_S_W_PHR_DESC;
 def MULQ_RS_PH : DspMMRel, MULQ_RS_PH_ENC, MULQ_RS_PH_DESC;
 def MULSAQ_S_W_PH : MULSAQ_S_W_PH_ENC, MULSAQ_S_W_PH_DESC;
-def MAQ_S_W_PHL : MAQ_S_W_PHL_ENC, MAQ_S_W_PHL_DESC;
-def MAQ_S_W_PHR : MAQ_S_W_PHR_ENC, MAQ_S_W_PHR_DESC;
-def MAQ_SA_W_PHL : MAQ_SA_W_PHL_ENC, MAQ_SA_W_PHL_DESC;
-def MAQ_SA_W_PHR : MAQ_SA_W_PHR_ENC, MAQ_SA_W_PHR_DESC;
-def MFHI_DSP : MFHI_ENC, MFHI_DESC;
-def MFLO_DSP : MFLO_ENC, MFLO_DESC;
-def MTHI_DSP : MTHI_ENC, MTHI_DESC;
-def MTLO_DSP : MTLO_ENC, MTLO_DESC;
+def MAQ_S_W_PHL : DspMMRel, MAQ_S_W_PHL_ENC, MAQ_S_W_PHL_DESC;
+def MAQ_S_W_PHR : DspMMRel, MAQ_S_W_PHR_ENC, MAQ_S_W_PHR_DESC;
+def MAQ_SA_W_PHL : DspMMRel, MAQ_SA_W_PHL_ENC, MAQ_SA_W_PHL_DESC;
+def MAQ_SA_W_PHR : DspMMRel, MAQ_SA_W_PHR_ENC, MAQ_SA_W_PHR_DESC;
+def MFHI_DSP : DspMMRel, MFHI_ENC, MFHI_DESC;
+def MFLO_DSP : DspMMRel, MFLO_ENC, MFLO_DESC;
+def MTHI_DSP : DspMMRel, MTHI_ENC, MTHI_DESC;
+def MTLO_DSP : DspMMRel, MTLO_ENC, MTLO_DESC;
 def DPAU_H_QBL : DspMMRel, DPAU_H_QBL_ENC, DPAU_H_QBL_DESC;
 def DPAU_H_QBR : DspMMRel, DPAU_H_QBR_ENC, DPAU_H_QBR_DESC;
 def DPSU_H_QBL : DspMMRel, DPSU_H_QBL_ENC, DPSU_H_QBL_DESC;
@@ -1182,9 +1185,9 @@ def REPLV_QB : REPLV_QB_ENC, REPLV_QB_DESC;
 def REPLV_PH : REPLV_PH_ENC, REPLV_PH_DESC;
 def PICK_QB : PICK_QB_ENC, PICK_QB_DESC;
 def PICK_PH : PICK_PH_ENC, PICK_PH_DESC;
-def LWX : LWX_ENC, LWX_DESC;
-def LHX : LHX_ENC, LHX_DESC;
-def LBUX : LBUX_ENC, LBUX_DESC;
+def LWX : DspMMRel, LWX_ENC, LWX_DESC;
+def LHX : DspMMRel, LHX_ENC, LHX_DESC;
+def LBUX : DspMMRel, LBUX_ENC, LBUX_DESC;
 def BPOSGE32 : BPOSGE32_ENC, BPOSGE32_DESC;
 def INSV : DspMMRel, INSV_ENC, INSV_DESC;
 def EXTP : DspMMRel, EXTP_ENC, EXTP_DESC;
diff --git a/test/MC/Disassembler/Mips/micromips-dsp/valid.txt b/test/MC/Disassembler/Mips/micromips-dsp/valid.txt
index 433022c1e62c..ceba6a9823b8 100644
--- a/test/MC/Disassembler/Mips/micromips-dsp/valid.txt
+++ b/test/MC/Disassembler/Mips/micromips-dsp/valid.txt
@@ -76,3 +76,14 @@
 0x00 0x62 0x08 0x95 # CHECK: muleu_s.ph.qbl $1, $2, $3
 0x00 0x62 0x08 0xd5 # CHECK: muleu_s.ph.qbr $1, $2, $3
 0x00,0x62,0x09,0x15 # CHECK: mulq_rs.ph $1, $2, $3
+0x00 0x43 0x0a 0x25 # CHECK: lbux $1, $2($3)
+0x00 0x43 0x09 0x65 # CHECK: lhx $1, $2($3)
+0x00 0x43 0x09 0xa5 # CHECK: lwx $1, $2($3)
+0x00 0x62 0x5a 0x7c # CHECK: maq_s.w.phl $ac1, $2, $3
+0x00 0x62 0x7a 0x7c # CHECK: maq_sa.w.phl $ac1, $2, $3
+0x00 0x62 0x4a 0x7c # CHECK: maq_s.w.phr $ac1, $2, $3
+0x00 0x62 0x6a 0x7c # CHECK: maq_sa.w.phr $ac1, $2, $3
+0x00 0x02 0x40 0x7c # CHECK: mfhi $2, $ac1
+0x00 0x01 0x50 0x7c # CHECK: mflo $1, $ac1
+0x00 0x01 0x60 0x7c # CHECK: mthi $1, $ac1
+0x00 0x01 0x70 0x7c # CHECK: mtlo $1, $ac1
diff --git a/test/MC/Mips/micromips-dsp/valid.s b/test/MC/Mips/micromips-dsp/valid.s
index 24e4a07365ab..f2eae8f6cae4 100644
--- a/test/MC/Mips/micromips-dsp/valid.s
+++ b/test/MC/Mips/micromips-dsp/valid.s
@@ -77,3 +77,14 @@
   muleu_s.ph.qbl $1, $2, $3    # CHECK: muleu_s.ph.qbl $1, $2, $3  # encoding: [0x00,0x62,0x08,0x95]
   muleu_s.ph.qbr $1, $2, $3    # CHECK: muleu_s.ph.qbr $1, $2, $3  # encoding: [0x00,0x62,0x08,0xd5]
   mulq_rs.ph $1, $2, $3        # CHECK: mulq_rs.ph $1, $2, $3      # encoding: [0x00,0x62,0x09,0x15]
+  lbux $1, $2($3)              # CHECK: lbux $1, $2($3)              # encoding: [0x00,0x43,0x0a,0x25]
+  lhx $1, $2($3)               # CHECK: lhx $1, $2($3)               # encoding: [0x00,0x43,0x09,0x65]
+  lwx $1, $2($3)               # CHECK: lwx $1, $2($3)               # encoding: [0x00,0x43,0x09,0xa5]
+  maq_s.w.phl $ac1, $2, $3     # CHECK: maq_s.w.phl $ac1, $2, $3     # encoding: [0x00,0x62,0x5a,0x7c]
+  maq_sa.w.phl $ac1, $2, $3    # CHECK: maq_sa.w.phl $ac1, $2, $3    # encoding: [0x00,0x62,0x7a,0x7c]
+  maq_s.w.phr $ac1, $2, $3     # CHECK: maq_s.w.phr $ac1, $2, $3     # encoding: [0x00,0x62,0x4a,0x7c]
+  maq_sa.w.phr $ac1, $2, $3    # CHECK: maq_sa.w.phr $ac1, $2, $3    # encoding: [0x00,0x62,0x6a,0x7c]
+  mfhi $2, $ac1                # CHECK: mfhi $2, $ac1                # encoding: [0x00,0x02,0x40,0x7c]
+  mflo $1, $ac1                # CHECK: mflo $1, $ac1                # encoding: [0x00,0x01,0x50,0x7c]
+  mthi $1, $ac1                # CHECK: mthi $1, $ac1                # encoding: [0x00,0x01,0x60,0x7c]
+  mtlo $1, $ac1                # CHECK: mtlo $1, $ac1                # encoding: [0x00,0x01,0x70,0x7c]

From 9ec9305f0679d052d7311691291e184cde92353c Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron@aaronballman.com>
Date: Mon, 30 Nov 2015 14:52:33 +0000
Subject: [PATCH 006/186] Silencing a 32-bit to 64-bit implicit conversion
 warning; NFC.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254302 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 0174779223ca..9d8895fa7bd1 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -6431,7 +6431,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   SmallVector<SDValue, 8> V(NumElems);
   if (NumElems == 4 && NumZero > 0) {
     for (unsigned i = 0; i < 4; ++i) {
-      bool isZero = !(NonZeros & (1 << i));
+      bool isZero = !(NonZeros & (1ULL << i));
       if (isZero)
         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
       else

From 9d75bc99041c25f82d0cd6cbce0a43913b540c6d Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 30 Nov 2015 15:46:47 +0000
Subject: [PATCH 007/186] AMDGPU: Don't reserve SCRATCH_PTR input register

This hasn't been doing anything since using relocations was added.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254304 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/SIISelLowering.cpp             | 16 ++++------------
 .../si-instr-info-correct-implicit-operands.ll   |  2 +-
 2 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 5c67bf80c175..e2c644451b43 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -633,21 +633,13 @@ SDValue SITargetLowering::LowerFormalArguments(
     unsigned InputPtrRegHi =
         TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1);
 
-    unsigned ScratchPtrReg =
-        TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR);
-    unsigned ScratchPtrRegLo =
-        TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 0);
-    unsigned ScratchPtrRegHi =
-        TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 1);
-
     CCInfo.AllocateReg(InputPtrRegLo);
     CCInfo.AllocateReg(InputPtrRegHi);
-    CCInfo.AllocateReg(ScratchPtrRegLo);
-    CCInfo.AllocateReg(ScratchPtrRegHi);
     MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
-    MF.addLiveIn(ScratchPtrReg, &AMDGPU::SReg_64RegClass);
-    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-    if (Subtarget->isAmdHsaOS() && MFI->hasDispatchPtr()) {
+
+    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+    if (MFI->hasDispatchPtr()) {
       unsigned DispatchPtrReg =
         TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR);
       unsigned DispatchPtrRegLo =
diff --git a/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll b/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll
index 0e15bc878650..27a8e70aae13 100644
--- a/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll
+++ b/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll
@@ -3,7 +3,7 @@
 ; register operands in the correct order when modifying the opcode of an
 ; instruction to V_ADD_I32_e32.
 
-; CHECK: %19 = V_ADD_I32_e32 %13, %12, implicit-def %vcc, implicit %exec
+; CHECK: %{{[0-9]+}} = V_ADD_I32_e32 %{{[0-9]+}}, %{{[0-9]+}}, implicit-def %vcc, implicit %exec
 
 define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:

From d27440bc805cb90ef7f0fe970ee08609a29e4a04 Mon Sep 17 00:00:00 2001
From: Colin LeMahieu <colinl@codeaurora.org>
Date: Mon, 30 Nov 2015 17:32:34 +0000
Subject: [PATCH 008/186] [Hexagon] NFC Reordering headers.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254307 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
index 2f3521bfd717..b73af8249cb5 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
@@ -13,12 +13,12 @@
 #include "MCTargetDesc/HexagonBaseInfo.h"
 #include "MCTargetDesc/HexagonMCInstrInfo.h"
 #include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmLayout.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCELFObjectWriter.h"
 #include "llvm/MC/MCFixupKindInfo.h"
 #include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCAsmLayout.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/TargetRegistry.h"
 

From 351aaa5d3e5d537b9bfa5ca8d6b82250ca85c3b5 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 30 Nov 2015 17:52:02 +0000
Subject: [PATCH 009/186] fix formatting; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254310 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 9d8895fa7bd1..4ec4ec280675 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -25594,8 +25594,8 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
     for (unsigned i = 0; i != NumElems; ++i)
       ShuffleVec[i] = i * SizeRatio;
-    for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
-      ShuffleVec[i] = NumElems*SizeRatio;
+    for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
+      ShuffleVec[i] = NumElems * SizeRatio;
     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
                                    DAG.getConstant(0, dl, WideVecVT),
                                    &ShuffleVec[0]);
@@ -25676,8 +25676,8 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
          "WideVecVT should be legal");
 
   SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
-                                        DAG.getUNDEF(WideVecVT),
-                                        &ShuffleVec[0]);
+                                              DAG.getUNDEF(WideVecVT),
+                                              &ShuffleVec[0]);
 
   SDValue NewMask;
   SDValue Mask = Mst->getMask();
@@ -25709,8 +25709,9 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
   }
 
-  return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(),
-                            NewMask, StVT, Mst->getMemOperand(), false);
+  return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
+                            Mst->getBasePtr(), NewMask, StVT,
+                            Mst->getMemOperand(), false);
 }
 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,

From 8dc6e6c3bb8901096bf31a5aaca5223b5bbf2a4f Mon Sep 17 00:00:00 2001
From: Dan Gohman <dan433584@gmail.com>
Date: Mon, 30 Nov 2015 18:42:08 +0000
Subject: [PATCH 010/186] [WebAssembly] Fix a few minor compiler warnings. NFC.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254311 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
index 728910422166..dbd00bc10b1c 100644
--- a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
@@ -162,9 +162,9 @@ static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI) {
   SmallPtrSet<MachineBasicBlock *, 16> Visited;
   SmallVector<POStackEntry, 16> Stack;
 
-  MachineBasicBlock *Entry = &*MF.begin();
-  Visited.insert(Entry);
-  Stack.push_back(POStackEntry(Entry, MF, MLI));
+  MachineBasicBlock *EntryBlock = &*MF.begin();
+  Visited.insert(EntryBlock);
+  Stack.push_back(POStackEntry(EntryBlock, MF, MLI));
 
   for (;;) {
     POStackEntry &Entry = Stack.back();
@@ -220,7 +220,7 @@ static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI) {
 #endif
 }
 
-static int GetLoopDepth(const MachineLoop *Loop) {
+static unsigned GetLoopDepth(const MachineLoop *Loop) {
   return Loop ? Loop->getLoopDepth() : 0;
 }
 
@@ -249,12 +249,12 @@ static void PlaceBlockMarkers(MachineBasicBlock &MBB,
 
   MachineBasicBlock::iterator InsertPos;
   MachineLoop *HeaderLoop = MLI.getLoopFor(Header);
-  int MBBLoopDepth = GetLoopDepth(MLI.getLoopFor(&MBB));
-  int HeaderLoopDepth = GetLoopDepth(HeaderLoop);
+  unsigned MBBLoopDepth = GetLoopDepth(MLI.getLoopFor(&MBB));
+  unsigned HeaderLoopDepth = GetLoopDepth(HeaderLoop);
   if (HeaderLoopDepth > MBBLoopDepth) {
     // The nearest common dominating point is more deeply nested. Insert the
     // BLOCK just above the LOOP.
-    for (int i = 0; i < HeaderLoopDepth - 1 - MBBLoopDepth; ++i)
+    for (unsigned i = 0; i < HeaderLoopDepth - 1 - MBBLoopDepth; ++i)
       HeaderLoop = HeaderLoop->getParentLoop();
     Header = HeaderLoop->getHeader();
     InsertPos = Header->begin();

From e87f0932d94a1bb29c7d4792adf8742ca165c4ef Mon Sep 17 00:00:00 2001
From: Rafael Espindola <rafael.espindola@gmail.com>
Date: Mon, 30 Nov 2015 18:54:24 +0000
Subject: [PATCH 011/186] Fix another llvm.ctors merging bug.

We were not looking past casts to see if an element should be included
or not.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254313 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Linker/LinkModules.cpp   | 5 +++--
 test/Linker/Inputs/ctors3.ll | 7 +++++++
 test/Linker/ctors3.ll        | 8 ++++++++
 3 files changed, 18 insertions(+), 2 deletions(-)
 create mode 100644 test/Linker/Inputs/ctors3.ll
 create mode 100644 test/Linker/ctors3.ll

diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index 6b60379803e1..cdf1decc8131 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -410,7 +410,7 @@ class ModuleLinker {
   std::vector<AppendingVarInfo> AppendingVars;
 
   // Set of items not to link in from source.
-  SmallPtrSet<const Value *, 16> DoNotLinkFromSource;
+  SmallPtrSet<const GlobalValue *, 16> DoNotLinkFromSource;
 
   DiagnosticHandlerFunction DiagnosticHandler;
 
@@ -1512,7 +1512,8 @@ void ModuleLinker::linkAppendingVarInit(AppendingVarInfo &AVI) {
 
   for (auto *V : SrcElements) {
     if (IsNewStructor) {
-      Constant *Key = V->getAggregateElement(2);
+      auto *Key =
+          dyn_cast<GlobalValue>(V->getAggregateElement(2)->stripPointerCasts());
       if (DoNotLinkFromSource.count(Key))
         continue;
     }
diff --git a/test/Linker/Inputs/ctors3.ll b/test/Linker/Inputs/ctors3.ll
new file mode 100644
index 000000000000..449ccbd90faf
--- /dev/null
+++ b/test/Linker/Inputs/ctors3.ll
@@ -0,0 +1,7 @@
+$foo = comdat any
+%t = type { i8 }
+@foo = global %t zeroinitializer, comdat
+@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @bar, i8* getelementptr (%t, %t* @foo, i32 0, i32 0) }]
+define internal void @bar() comdat($foo) {
+  ret void
+}
diff --git a/test/Linker/ctors3.ll b/test/Linker/ctors3.ll
new file mode 100644
index 000000000000..e62b92dca0b4
--- /dev/null
+++ b/test/Linker/ctors3.ll
@@ -0,0 +1,8 @@
+; RUN: llvm-link -S %s %p/Inputs/ctors3.ll -o - | FileCheck %s
+
+$foo = comdat any
+%t = type { i8 }
+@foo = global %t zeroinitializer, comdat
+
+; CHECK: @foo = global %t zeroinitializer, comdat
+; CHECK: @llvm.global_ctors = appending global [0 x { i32, void ()*, i8* }] zeroinitializer

From 56faeb049c0b274ed12267ef74aed60d290be60f Mon Sep 17 00:00:00 2001
From: Kit Barton <kbarton@ca.ibm.com>
Date: Mon, 30 Nov 2015 18:59:41 +0000
Subject: [PATCH 012/186] Enable shrink wrapping for PPC64

Re-enable shrink wrapping for PPC64 Little Endian.

One minor modification to PPCFrameLowering::findScratchRegister was necessary to handle fall-thru blocks (blocks with no terminator) correctly.

Tested with all LLVM test, clang tests, and the self-hosting build, with no problems found.

PHabricator: http://reviews.llvm.org/D14778

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254314 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/PowerPC/PPCFrameLowering.cpp     | 20 ++++++++++++++------
 test/CodeGen/PowerPC/ppc-shrink-wrapping.ll |  1 -
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index c7a3bbd3762a..5a151eb6ab25 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -573,10 +573,18 @@ bool PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB,
   RS.initRegState();
   RS.enterBasicBlock(MBB);
 
-  // The scratch register will be used at the end of the block, so must consider
-  // all registers used within the block
-  if (UseAtEnd && MBB->begin() != MBB->getFirstTerminator())
-    RS.forward(MBB->getFirstTerminator());
+  if (UseAtEnd && !MBB->empty()) {
+    // The scratch register will be used at the end of the block, so must consider
+    // all registers used within the block
+
+    MachineBasicBlock::iterator MBBI = MBB->getFirstTerminator();
+    // If no terminator, back iterator up to previous instruction.
+    if (MBBI == MBB->end())
+      MBBI = std::prev(MBBI);
+
+    if (MBBI != MBB->begin())
+      RS.forward(MBBI);
+  }
   
   if (!RS.isRegUsed(R0)) 
     return true;
@@ -1768,6 +1776,6 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
 }
 
 bool PPCFrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
-  // FIXME: Enable this for non-Darwin PPC64 once it is confirmed working.
-  return false;
+  return (MF.getSubtarget<PPCSubtarget>().isSVR4ABI() &&
+          MF.getSubtarget<PPCSubtarget>().isPPC64());
 }
diff --git a/test/CodeGen/PowerPC/ppc-shrink-wrapping.ll b/test/CodeGen/PowerPC/ppc-shrink-wrapping.ll
index a23888425bf5..2da8e8d0984c 100644
--- a/test/CodeGen/PowerPC/ppc-shrink-wrapping.ll
+++ b/test/CodeGen/PowerPC/ppc-shrink-wrapping.ll
@@ -1,6 +1,5 @@
 ; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE
 ; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu %s -o - -enable-shrink-wrap=false |  FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE
-; XFAIL: *
 ;
 ; Note: Lots of tests use inline asm instead of regular calls.
 ; This allows to have a better control on what the allocation will do.

From fbb7433e076fcaa297905347a6c516935d2b90bc Mon Sep 17 00:00:00 2001
From: David Majnemer <david.majnemer@gmail.com>
Date: Mon, 30 Nov 2015 19:04:19 +0000
Subject: [PATCH 013/186] [X86] Add RIP to GR64_TCW64

The MachineVerifier wants to check that the register operands of an
instruction belong to the instruction's register class.  RIP-relative
control flow instructions violated this by referencing RIP.  While this
was fixed for SysV, it was never fixed for Win64.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254315 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86RegisterInfo.td       |  2 +-
 test/CodeGen/X86/coalescer-win64.ll     | 16 ++++++++++++++++
 test/CodeGen/X86/x86-shrink-wrapping.ll |  2 +-
 3 files changed, 18 insertions(+), 2 deletions(-)
 create mode 100644 test/CodeGen/X86/coalescer-win64.ll

diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index 12f38c7946a8..ceeb57d0cc4c 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -375,7 +375,7 @@ def GR32_TC   : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)>;
 def GR64_TC   : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI,
                                                      R8, R9, R11, RIP)>;
 def GR64_TCW64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX,
-                                                      R8, R9, R10, R11)>;
+                                                      R8, R9, R10, R11, RIP)>;
 
 // GR8_NOREX - GR8 registers which do not require a REX prefix.
 def GR8_NOREX : RegisterClass<"X86", [i8], 8,
diff --git a/test/CodeGen/X86/coalescer-win64.ll b/test/CodeGen/X86/coalescer-win64.ll
new file mode 100644
index 000000000000..ff084ae5b9e0
--- /dev/null
+++ b/test/CodeGen/X86/coalescer-win64.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -verify-coalescing | FileCheck %s
+target triple = "x86_64-pc-win32"
+
+@fnptr = external global void ()*
+
+define void @test1() {
+entry:
+  %p = load void ()*, void ()** @fnptr
+  tail call void %p()
+  ret void
+}
+
+; CHECK-LABEL: test1{{$}}
+; CHECK: .seh_proc test1{{$}}
+; CHECK: rex64 jmpq *fnptr(%rip)
+; CHECK: .seh_endproc
diff --git a/test/CodeGen/X86/x86-shrink-wrapping.ll b/test/CodeGen/X86/x86-shrink-wrapping.ll
index 52e094b54174..0cab17f9de89 100644
--- a/test/CodeGen/X86/x86-shrink-wrapping.ll
+++ b/test/CodeGen/X86/x86-shrink-wrapping.ll
@@ -445,9 +445,9 @@ if.end:                                           ; preds = %for.body, %if.else
 ; CHECK-NEXT: xorl %eax, %eax
 ; CHECK-NEXT: %esi, %edi
 ; CHECK-NEXT: %esi, %edx
+; CHECK-NEXT: %esi, %ecx
 ; CHECK-NEXT: %esi, %r8d
 ; CHECK-NEXT: %esi, %r9d
-; CHECK-NEXT: %esi, %ecx
 ; CHECK-NEXT: callq _someVariadicFunc
 ; CHECK-NEXT: movl %eax, %esi
 ; CHECK-NEXT: shll $3, %esi

From 24c91af18b7b2da200d91cf5ae6a27f3be47cdc0 Mon Sep 17 00:00:00 2001
From: Davide Italiano <davide@freebsd.org>
Date: Mon, 30 Nov 2015 19:36:35 +0000
Subject: [PATCH 014/186] [SimplifyLibCalls] Transform log(exp2(y)) to y*log(2)
 under fast-math.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254317 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Utils/SimplifyLibCalls.cpp         | 10 +++++++++-
 test/Transforms/InstCombine/log-pow-nofastmath.ll | 13 +++++++++++++
 test/Transforms/InstCombine/log-pow.ll            | 13 +++++++++++++
 3 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 47e587fab7b6..83afb1a65ac0 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -1322,6 +1322,15 @@ Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) {
     return B.CreateFMul(OpC->getArgOperand(1),
       EmitUnaryFloatFnCall(OpC->getOperand(0), Callee->getName(), B,
                            Callee->getAttributes()), "mul");
+
+  // log(exp2(y)) -> y*log(2)
+  if (F && Name == "log" && TLI->getLibFunc(F->getName(), Func) &&
+      TLI->has(Func) && Func == LibFunc::exp2)
+    return B.CreateFMul(
+        OpC->getArgOperand(0),
+        EmitUnaryFloatFnCall(ConstantFP::get(CI->getType(), 2.0),
+                             Callee->getName(), B, Callee->getAttributes()),
+        "logmul");
   return Ret;
 }
 
@@ -2301,7 +2310,6 @@ void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) {
 // log, logf, logl:
 //   * log(exp(x))   -> x
 //   * log(exp(y))   -> y*log(e)
-//   * log(exp2(y))  -> y*log(2)
 //   * log(exp10(y)) -> y*log(10)
 //   * log(sqrt(x))  -> 0.5*log(x)
 //
diff --git a/test/Transforms/InstCombine/log-pow-nofastmath.ll b/test/Transforms/InstCombine/log-pow-nofastmath.ll
index 0811e63cc74a..6c37c5466cea 100644
--- a/test/Transforms/InstCombine/log-pow-nofastmath.ll
+++ b/test/Transforms/InstCombine/log-pow-nofastmath.ll
@@ -13,5 +13,18 @@ entry:
 ; CHECK:   ret double %call
 ; CHECK: }
 
+define double @test3(double %x) #0 {
+  %call2 = call double @exp2(double %x) #0
+  %call3 = call double @log(double %call2) #0
+  ret double %call3
+}
+
+; CHECK-LABEL: @test3
+; CHECK:   %call2 = call double @exp2(double %x)
+; CHECK:   %call3 = call double @log(double %call2)
+; CHECK:   ret double %call3
+; CHECK: }
+
 declare double @log(double) #0
+declare double @exp2(double)
 declare double @llvm.pow.f64(double, double)
diff --git a/test/Transforms/InstCombine/log-pow.ll b/test/Transforms/InstCombine/log-pow.ll
index c98a1a5bc628..1acd0354431e 100644
--- a/test/Transforms/InstCombine/log-pow.ll
+++ b/test/Transforms/InstCombine/log-pow.ll
@@ -22,7 +22,20 @@ define double @test2(double ()* %fptr, double %p1) #0 {
 ; CHECK-LABEL: @test2
 ; CHECK: log
 
+define double @test3(double %x) #0 {
+  %call2 = call double @exp2(double %x) #0
+  %call3 = call double @log(double %call2) #0
+  ret double %call3
+}
+
+; CHECK-LABEL: @test3
+; CHECK:  %call2 = call double @exp2(double %x) #0
+; CHECK:  %logmul = fmul fast double %x, 0x3FE62E42FEFA39EF
+; CHECK:  ret double %logmul
+; CHECK: }
+
 declare double @log(double) #0
+declare double @exp2(double) #0
 declare double @llvm.pow.f64(double, double)
 
 attributes #0 = { "unsafe-fp-math"="true" }

From 76755680f6325f0cad7502a2a832a3376b1e3747 Mon Sep 17 00:00:00 2001
From: Davide Italiano <davide@freebsd.org>
Date: Mon, 30 Nov 2015 19:38:35 +0000
Subject: [PATCH 015/186] [SimplifyLibCalls] Remove useless bits of this tests.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254318 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/InstCombine/log-pow-nofastmath.ll | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/test/Transforms/InstCombine/log-pow-nofastmath.ll b/test/Transforms/InstCombine/log-pow-nofastmath.ll
index 6c37c5466cea..faaef97311ec 100644
--- a/test/Transforms/InstCombine/log-pow-nofastmath.ll
+++ b/test/Transforms/InstCombine/log-pow-nofastmath.ll
@@ -1,9 +1,9 @@
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 
-define double @mylog(double %x, double %y) #0 {
+define double @mylog(double %x, double %y) {
 entry:
   %pow = call double @llvm.pow.f64(double %x, double %y)
-  %call = call double @log(double %pow) #0
+  %call = call double @log(double %pow)
   ret double %call
 }
 
@@ -13,9 +13,9 @@ entry:
 ; CHECK:   ret double %call
 ; CHECK: }
 
-define double @test3(double %x) #0 {
-  %call2 = call double @exp2(double %x) #0
-  %call3 = call double @log(double %call2) #0
+define double @test3(double %x) {
+  %call2 = call double @exp2(double %x)
+  %call3 = call double @log(double %call2)
   ret double %call3
 }
 
@@ -25,6 +25,6 @@ define double @test3(double %x) #0 {
 ; CHECK:   ret double %call3
 ; CHECK: }
 
-declare double @log(double) #0
+declare double @log(double)
 declare double @exp2(double)
 declare double @llvm.pow.f64(double, double)

From 3be8e6a768d24304a43bc915d974c7551ec86248 Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Mon, 30 Nov 2015 20:36:23 +0000
Subject: [PATCH 016/186] Avoid writing to source directory of tests

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254324 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Linker/ctors.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/Linker/ctors.ll b/test/Linker/ctors.ll
index 208fe86402ce..37dba23d4c91 100644
--- a/test/Linker/ctors.ll
+++ b/test/Linker/ctors.ll
@@ -4,7 +4,7 @@
 ; RUN:   FileCheck --check-prefix=ALL --check-prefix=CHECK2 %s
 
 ; Test the bitcode writer too. It used to crash.
-; RUN: llvm-link %s %p/Inputs/ctors.ll -o t.bc
+; RUN: llvm-link %s %p/Inputs/ctors.ll -o %t.bc
 
 @v = weak global i8 0
 ; CHECK1: @v = weak global i8 0

From ddaf09c1921d4306b865fae11bf9cfdca6b62731 Mon Sep 17 00:00:00 2001
From: Quentin Colombet <qcolombet@apple.com>
Date: Mon, 30 Nov 2015 20:37:58 +0000
Subject: [PATCH 017/186] [ARM] For old thumb ISA like v4t, we cannot use PC
 directly in pop.

Fix the epilogue emission to account for that.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254325 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/Thumb1FrameLowering.cpp     | 23 ++++--------------
 test/CodeGen/ARM/thumb1_return_sequence.ll | 27 ++++++++++++++++++----
 2 files changed, 28 insertions(+), 22 deletions(-)

diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp
index f5d4cb8a3ca1..064cff6f5704 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -422,25 +422,12 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
   const ThumbRegisterInfo *RegInfo =
       static_cast<const ThumbRegisterInfo *>(STI.getRegisterInfo());
 
-  // If MBBI is a return instruction, we may be able to directly restore
-  // LR in the PC.
-  // This is possible if we do not need to emit any SP update.
-  // Otherwise, we need a temporary register to pop the value
-  // and copy that value into LR.
+  // When we need a special fix up for POP, this means that
+  // we either cannot use PC in POP or we have to update
+  // SP after poping the return address.
+  // In other words, we cannot use a pop {pc} like construction
+  // here, no matter what.
   auto MBBI = MBB.getFirstTerminator();
-  if (!ArgRegsSaveSize && MBBI != MBB.end() &&
-      MBBI->getOpcode() == ARM::tBX_RET) {
-    if (!DoIt)
-      return true;
-    MachineInstrBuilder MIB =
-        AddDefaultPred(
-            BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP_RET)))
-            .addReg(ARM::PC, RegState::Define);
-    MIB.copyImplicitOps(&*MBBI);
-    // erase the old tBX_RET instruction
-    MBB.erase(MBBI);
-    return true;
-  }
 
   // Look for a temporary register to use.
   // First, compute the liveness information.
diff --git a/test/CodeGen/ARM/thumb1_return_sequence.ll b/test/CodeGen/ARM/thumb1_return_sequence.ll
index 5b9c19ab5eb2..67d1cad2cf68 100644
--- a/test/CodeGen/ARM/thumb1_return_sequence.ll
+++ b/test/CodeGen/ARM/thumb1_return_sequence.ll
@@ -23,9 +23,22 @@ entry:
 ; --------
 ; CHECK-V4T:         add sp,
 ; CHECK-V4T-NEXT:    pop {[[SAVED]]}
-; We do not have any SP update to insert so we can just optimize
-; the pop sequence.
-; CHECK-V4T-NEXT:    pop {pc}
+; The ISA for v4 does not support pop pc, so make sure we do not emit
+; one even when we do not need to update SP.
+; CHECK-V4T-NOT:     pop {pc}
+; We may only use lo register to pop, but in that case, all the scratch
+; ones are used.
+; r12 is the only register we are allowed to clobber for AAPCS.
+; Use it to save a lo register.
+; CHECK-V4T-NEXT:    mov [[TEMP_REG:r12]], [[POP_REG:r[0-7]]]
+; Pop the value of LR.
+; CHECK-V4T-NEXT:    pop {[[POP_REG]]}
+; Copy the value of LR in the right register.
+; CHECK-V4T-NEXT:    mov lr, [[POP_REG]]
+; Restore the value that was in the register we used to pop the value of LR.
+; CHECK-V4T-NEXT:    mov [[POP_REG]], [[TEMP_REG]]
+; Return.
+; CHECK-V4T-NEXT:    bx lr
 ; CHECK-V5T:         pop {[[SAVED]], pc}
 }
 
@@ -93,7 +106,13 @@ entry:
 ; Epilogue
 ; --------
 ; CHECK-V4T:    pop {[[SAVED]]}
-; CHECK-V4T:    pop {pc}
+; The ISA for v4 does not support pop pc, so make sure we do not emit
+; one even when we do not need to update SP.
+; CHECK-V4T-NOT:     pop {pc}
+; Pop the value of LR into a scratch lo register other than r0 (it is
+; used for the return value).
+; CHECK-V4T-NEXT:    pop {[[POP_REG:r[1-3]]]}
+; CHECK-V4T-NEXT:    bx [[POP_REG]]
 ; CHECK-V5T:    pop {[[SAVED]], pc}
 }
 

From 32851c191d611f67e22dea11f0c1a0adee584806 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 30 Nov 2015 21:15:45 +0000
Subject: [PATCH 018/186] AMDGPU: Use assert zext for workgroup sizes

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254328 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/SIISelLowering.cpp        | 31 +++++++----
 lib/Target/AMDGPU/SIISelLowering.h          |  3 ++
 test/CodeGen/AMDGPU/work-item-intrinsics.ll | 60 +++++++++++++++++++++
 3 files changed, 84 insertions(+), 10 deletions(-)

diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index e2c644451b43..94fad32c0070 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1043,6 +1043,18 @@ SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL,
                                                       // a glue result.
 }
 
+SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
+                                                 SDValue Op,
+                                                 MVT VT,
+                                                 unsigned Offset) const {
+  SDLoc SL(Op);
+  SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL,
+                                 DAG.getEntryNode(), Offset, false);
+  // The local size values will have the hi 16-bits as zero.
+  return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
+                     DAG.getValueType(VT));
+}
+
 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                   SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
@@ -1080,19 +1092,18 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                           SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
   case Intrinsic::r600_read_local_size_x:
-    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                          SI::KernelInputOffsets::LOCAL_SIZE_X, false);
+    return lowerImplicitZextParam(DAG, Op, MVT::i16,
+                                  SI::KernelInputOffsets::LOCAL_SIZE_X);
   case Intrinsic::r600_read_local_size_y:
-    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                          SI::KernelInputOffsets::LOCAL_SIZE_Y, false);
+    return lowerImplicitZextParam(DAG, Op, MVT::i16,
+                                  SI::KernelInputOffsets::LOCAL_SIZE_Y);
   case Intrinsic::r600_read_local_size_z:
-    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                          SI::KernelInputOffsets::LOCAL_SIZE_Z, false);
-
+    return lowerImplicitZextParam(DAG, Op, MVT::i16,
+                                  SI::KernelInputOffsets::LOCAL_SIZE_Z);
   case Intrinsic::AMDGPU_read_workdim:
-    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                          getImplicitParameterOffset(MFI, GRID_DIM), false);
-
+    // Really only 2 bits.
+    return lowerImplicitZextParam(DAG, Op, MVT::i8,
+                                  getImplicitParameterOffset(MFI, GRID_DIM));
   case Intrinsic::r600_read_tgid_x:
     return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
       TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT);
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index a8b8ad34ed9d..a358e3fc3c0d 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -28,6 +28,9 @@ class SITargetLowering : public AMDGPUTargetLowering {
   SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
                              SelectionDAG &DAG) const override;
 
+  SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
+                                 MVT VT, unsigned Offset) const;
+
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
diff --git a/test/CodeGen/AMDGPU/work-item-intrinsics.ll b/test/CodeGen/AMDGPU/work-item-intrinsics.ll
index 4328e964c1bf..ebe73429da9c 100644
--- a/test/CodeGen/AMDGPU/work-item-intrinsics.ll
+++ b/test/CodeGen/AMDGPU/work-item-intrinsics.ll
@@ -213,6 +213,66 @@ entry:
   ret void
 }
 
+; FUNC-LABEL: {{^}}local_size_x_known_bits:
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18
+; GCN-NOT: 0xffff
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NEXT: buffer_store_dword [[VVAL]]
+define void @local_size_x_known_bits(i32 addrspace(1)* %out) {
+entry:
+  %size = call i32 @llvm.r600.read.local.size.x() #0
+  %shl = shl i32 %size, 16
+  %shr = lshr i32 %shl, 16
+  store i32 %shr, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_y_known_bits:
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c
+; GCN-NOT: 0xffff
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NEXT: buffer_store_dword [[VVAL]]
+define void @local_size_y_known_bits(i32 addrspace(1)* %out) {
+entry:
+  %size = call i32 @llvm.r600.read.local.size.y() #0
+  %shl = shl i32 %size, 16
+  %shr = lshr i32 %shl, 16
+  store i32 %shr, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_z_known_bits:
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
+; GCN-NOT: 0xffff
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NEXT: buffer_store_dword [[VVAL]]
+define void @local_size_z_known_bits(i32 addrspace(1)* %out) {
+entry:
+  %size = call i32 @llvm.r600.read.local.size.z() #0
+  %shl = shl i32 %size, 16
+  %shr = lshr i32 %shl, 16
+  store i32 %shr, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}get_work_dim_known_bits:
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
+; GCN-NOT: 0xff
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @get_work_dim_known_bits(i32 addrspace(1)* %out) {
+entry:
+  %dim = call i32 @llvm.AMDGPU.read.workdim() #0
+  %shl = shl i32 %dim, 24
+  %shr = lshr i32 %shl, 24
+  store i32 %shr, i32 addrspace(1)* %out
+  ret void
+}
+
 declare i32 @llvm.r600.read.ngroups.x() #0
 declare i32 @llvm.r600.read.ngroups.y() #0
 declare i32 @llvm.r600.read.ngroups.z() #0

From 956f59ab56fcbaf4256b6dc92dc6d8a9ac592365 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 30 Nov 2015 21:15:53 +0000
Subject: [PATCH 019/186] AMDGPU: Remove SIPrepareScratchRegs

It does not work because of emergency stack slots.
This pass was supposed to eliminate dummy registers for the
spill instructions, but the register scavenger can introduce
more during PrologEpilogInserter, so some would end up
left behind if they were needed.

The potential for spilling the scratch resource descriptor
and offset register makes doing something like this
overly complicated. Reserve registers to use for the resource
descriptor and use them directly in eliminateFrameIndex.

Also removes creating another scratch resource descriptor
when directly selecting scratch MUBUF instructions.

The choice of which registers are reserved is temporary.
For now it attempts to pick the next available registers
after the user and system SGPRs.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254329 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/AMDGPU.h                    |   1 -
 lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp      |  32 +-
 lib/Target/AMDGPU/AMDGPUTargetMachine.cpp     |   1 -
 lib/Target/AMDGPU/CMakeLists.txt              |   1 -
 lib/Target/AMDGPU/SIFrameLowering.cpp         |  72 +++
 lib/Target/AMDGPU/SIFrameLowering.h           |   3 +
 lib/Target/AMDGPU/SIISelLowering.cpp          |  17 +-
 lib/Target/AMDGPU/SIISelLowering.h            |   4 -
 lib/Target/AMDGPU/SIInstrInfo.cpp             |  18 +-
 lib/Target/AMDGPU/SIMachineFunctionInfo.cpp   |   8 +
 lib/Target/AMDGPU/SIPrepareScratchRegs.cpp    | 196 ------
 lib/Target/AMDGPU/SIRegisterInfo.cpp          |  19 +
 test/CodeGen/AMDGPU/kernel-args.ll            |  14 +-
 test/CodeGen/AMDGPU/large-alloca-compute.ll   |  42 ++
 test/CodeGen/AMDGPU/large-alloca-graphics.ll  |  47 ++
 test/CodeGen/AMDGPU/large-alloca.ll           |  18 -
 test/CodeGen/AMDGPU/si-sgpr-spill.ll          |  10 +
 ...vgpr-spill-emergency-stack-slot-compute.ll | 583 ++++++++++++++++++
 .../AMDGPU/vgpr-spill-emergency-stack-slot.ll |  22 +-
 19 files changed, 827 insertions(+), 281 deletions(-)
 delete mode 100644 lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
 create mode 100644 test/CodeGen/AMDGPU/large-alloca-compute.ll
 create mode 100644 test/CodeGen/AMDGPU/large-alloca-graphics.ll
 delete mode 100644 test/CodeGen/AMDGPU/large-alloca.ll
 create mode 100644 test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll

diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 80766086e15c..a620e85101e6 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -48,7 +48,6 @@ FunctionPass *createSIFixSGPRCopiesPass();
 FunctionPass *createSIFixSGPRLiveRangesPass();
 FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
 FunctionPass *createSIInsertWaits(TargetMachine &tm);
-FunctionPass *createSIPrepareScratchRegs();
 
 ModulePass *createAMDGPUAnnotateKernelFeaturesPass();
 void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 04a0c1d06aff..85a06882ffe3 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1064,34 +1064,12 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
   MachineFunction &MF = CurDAG->getMachineFunction();
   const SIRegisterInfo *TRI =
       static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-  const SITargetLowering& Lowering =
-    *static_cast<const SITargetLowering*>(getTargetLowering());
-
-  unsigned ScratchOffsetReg =
-      TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
-  Lowering.CreateLiveInRegister(*CurDAG, &AMDGPU::SReg_32RegClass,
-                                ScratchOffsetReg, MVT::i32);
-  SDValue Sym0 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD0", MVT::i32);
-  SDValue ScratchRsrcDword0 =
-      SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym0), 0);
-
-  SDValue Sym1 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD1", MVT::i32);
-  SDValue ScratchRsrcDword1 =
-      SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym1), 0);
+  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
 
-  const SDValue RsrcOps[] = {
-      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
-      ScratchRsrcDword0,
-      CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
-      ScratchRsrcDword1,
-      CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
-  };
-  SDValue ScratchPtr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
-                                              MVT::v2i32, RsrcOps), 0);
-  Rsrc = SDValue(Lowering.buildScratchRSRC(*CurDAG, DL, ScratchPtr), 0);
-  SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
-      MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32);
+  unsigned ScratchOffsetReg
+    = TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
+  Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
+  SOffset = CurDAG->getRegister(ScratchOffsetReg, MVT::i32);
 
   // (add n0, c1)
   if (CurDAG->isBaseWithConstantOffset(Addr)) {
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 4e31c7ab4d4c..7b0445db4df2 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -327,7 +327,6 @@ void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
 }
 
 void GCNPassConfig::addPostRegAlloc() {
-  addPass(createSIPrepareScratchRegs(), false);
   addPass(createSIShrinkInstructionsPass(), false);
 }
 
diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt
index 7a4b5bb6d359..64c9e1882e4f 100644
--- a/lib/Target/AMDGPU/CMakeLists.txt
+++ b/lib/Target/AMDGPU/CMakeLists.txt
@@ -57,7 +57,6 @@ add_llvm_target(AMDGPUCodeGen
   SILowerControlFlow.cpp
   SILowerI1Copies.cpp
   SIMachineFunctionInfo.cpp
-  SIPrepareScratchRegs.cpp
   SIRegisterInfo.cpp
   SIShrinkInstructions.cpp
   SITypeRewriter.cpp
diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp
index a2d8fa1b0a10..6aff4b5700d4 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -8,17 +8,89 @@
 //==-----------------------------------------------------------------------===//
 
 #include "SIFrameLowering.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 
 using namespace llvm;
 
+
+static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo,
+                              const MachineFrameInfo *FrameInfo) {
+  if (!FuncInfo->hasSpilledSGPRs())
+    return false;
+
+  if (FuncInfo->hasSpilledVGPRs())
+    return false;
+
+  for (int I = FrameInfo->getObjectIndexBegin(),
+         E = FrameInfo->getObjectIndexEnd(); I != E; ++I) {
+    if (!FrameInfo->isSpillSlotObjectIndex(I))
+      return false;
+  }
+
+  return true;
+}
+
+void SIFrameLowering::emitPrologue(MachineFunction &MF,
+                                   MachineBasicBlock &MBB) const {
+  if (!MF.getFrameInfo()->hasStackObjects())
+    return;
+
+  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
+
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+  // If we only have SGPR spills, we won't actually be using scratch memory
+  // since these spill to VGPRs.
+  //
+  // FIXME: We should be cleaning up these unused SGPR spill frame indices
+  // somewhere.
+  if (hasOnlySGPRSpills(MFI, MF.getFrameInfo()))
+    return;
+
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+
+  // We need to insert initialization of the scratch resource descriptor.
+  unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
+  assert(ScratchRsrcReg != AMDGPU::NoRegister);
+
+  uint64_t Rsrc23 = TII->getScratchRsrcWords23();
+  MachineBasicBlock::iterator I = MBB.begin();
+  DebugLoc DL;
+
+  unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+  unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
+  unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
+  unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
+
+  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0)
+    .addExternalSymbol("SCRATCH_RSRC_DWORD0");
+
+  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1)
+    .addExternalSymbol("SCRATCH_RSRC_DWORD1");
+
+  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2)
+    .addImm(Rsrc23 & 0xffffffff);
+
+  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3)
+    .addImm(Rsrc23 >> 32);
+}
+
 void SIFrameLowering::processFunctionBeforeFrameFinalized(
   MachineFunction &MF,
   RegScavenger *RS) const {
   MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  if (!MFI->hasStackObjects())
+    return;
+
   bool MayNeedScavengingEmergencySlot = MFI->hasStackObjects();
 
   assert((RS || !MayNeedScavengingEmergencySlot) &&
diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h
index 677128d6ce0a..a9152fd8b2aa 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/lib/Target/AMDGPU/SIFrameLowering.h
@@ -21,6 +21,9 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
     AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
   ~SIFrameLowering() override {}
 
+  void emitPrologue(MachineFunction &MF,
+                    MachineBasicBlock &MBB) const override;
+
   void processFunctionBeforeFrameFinalized(
     MachineFunction &MF,
     RegScavenger *RS = nullptr) const override;
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 94fad32c0070..51cbc95bc07c 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -552,6 +552,7 @@ SDValue SITargetLowering::LowerFormalArguments(
   MachineFunction &MF = DAG.getMachineFunction();
   FunctionType *FType = MF.getFunction()->getFunctionType();
   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
 
   if (Subtarget->isAmdHsaOS() && Info->getShaderType() != ShaderType::COMPUTE) {
     const Function *Fn = MF.getFunction();
@@ -622,9 +623,9 @@ SDValue SITargetLowering::LowerFormalArguments(
   // The pointer to the scratch buffer is stored in SGPR2, SGPR3
   if (Info->getShaderType() == ShaderType::COMPUTE) {
     if (Subtarget->isAmdHsaOS())
-      Info->NumUserSGPRs = 2;  // FIXME: Need to support scratch buffers.
+      Info->NumUserSGPRs += 4;  // FIXME: Need to support scratch buffers.
     else
-      Info->NumUserSGPRs = 4;
+      Info->NumUserSGPRs += 4;
 
     unsigned InputPtrReg =
         TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR);
@@ -750,6 +751,9 @@ SDValue SITargetLowering::LowerFormalArguments(
     Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx);
   }
 
+  if (MF.getFrameInfo()->hasStackObjects() || ST.isVGPRSpillingEnabled(Info))
+    Info->setScratchRSrcReg(TRI);
+
   if (Chains.empty())
     return Chain;
 
@@ -2335,15 +2339,6 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG,
   return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
 }
 
-MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG,
-                                                  SDLoc DL,
-                                                  SDValue Ptr) const {
-  const SIInstrInfo *TII =
-      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
-
-  return buildRSRC(DAG, DL, Ptr, 0, TII->getScratchRsrcWords23());
-}
-
 SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
                                                const TargetRegisterClass *RC,
                                                unsigned Reg, EVT VT) const {
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index a358e3fc3c0d..0659dd7d5d05 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -116,10 +116,6 @@ class SITargetLowering : public AMDGPUTargetLowering {
                            SDValue Ptr,
                            uint32_t RsrcDword1,
                            uint64_t RsrcDword2And3) const;
-  MachineSDNode *buildScratchRSRC(SelectionDAG &DAG,
-                                  SDLoc DL,
-                                  SDValue Ptr) const;
-
   std::pair<unsigned, const TargetRegisterClass *>
   getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                StringRef Constraint, MVT VT) const override;
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 9a85a1d515fe..b7d2a4712759 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -551,15 +551,16 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
 
   assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
 
+  unsigned ScratchOffsetPreloadReg
+    = RI.getPreloadedValue(*MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
+
   unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize());
   MFI->setHasSpilledVGPRs();
   BuildMI(MBB, MI, DL, get(Opcode))
     .addReg(SrcReg)                   // src
     .addFrameIndex(FrameIndex)        // frame_idx
-    // Place-holder registers, these will be filled in by
-    // SIPrepareScratchRegs.
-    .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
-    .addReg(AMDGPU::SGPR0, RegState::Undef)
+    .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+    .addReg(ScratchOffsetPreloadReg)  // scratch_offset
     .addMemOperand(MMO);
 }
 
@@ -637,13 +638,14 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
 
   assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
 
+  unsigned ScratchOffsetPreloadReg
+    = RI.getPreloadedValue(*MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
+
   unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize());
   BuildMI(MBB, MI, DL, get(Opcode), DestReg)
     .addFrameIndex(FrameIndex)        // frame_idx
-    // Place-holder registers, these will be filled in by
-    // SIPrepareScratchRegs.
-    .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
-    .addReg(AMDGPU::SGPR0, RegState::Undef)
+    .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+    .addReg(ScratchOffsetPreloadReg)  // scratch_offset
     .addMemOperand(MMO);
 }
 
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 6269dce553f6..d042844aa138 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -68,6 +68,14 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
     WorkItemIDZ = true;
 }
 
+void SIMachineFunctionInfo::setScratchRSrcReg(const SIRegisterInfo *TRI) {
+  // We need to round up to next multiple of 4.
+  unsigned NextSReg128 = RoundUpToAlignment(NumUserSGPRs + 5, 4);
+  unsigned RegSub0 = AMDGPU::SReg_32RegClass.getRegister(NextSReg128);
+  ScratchRSrcReg = TRI->getMatchingSuperReg(RegSub0, AMDGPU::sub0,
+                                            &AMDGPU::SReg_128RegClass);
+}
+
 SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
                                                        MachineFunction *MF,
                                                        unsigned FrameIndex,
diff --git a/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp b/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
deleted file mode 100644
index a6c22775e098..000000000000
--- a/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
+++ /dev/null
@@ -1,196 +0,0 @@
-//===-- SIPrepareScratchRegs.cpp - Use predicates for control flow --------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-///
-/// This pass loads scratch pointer and scratch offset into a register or a
-/// frame index which can be used anywhere in the program.  These values will
-/// be used for spilling VGPRs.
-///
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "SIDefines.h"
-#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/LLVMContext.h"
-
-using namespace llvm;
-
-namespace {
-
-class SIPrepareScratchRegs : public MachineFunctionPass {
-
-private:
-  static char ID;
-
-public:
-  SIPrepareScratchRegs() : MachineFunctionPass(ID) { }
-
-  bool runOnMachineFunction(MachineFunction &MF) override;
-
-  const char *getPassName() const override {
-    return "SI prepare scratch registers";
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesCFG();
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
-};
-
-} // End anonymous namespace
-
-char SIPrepareScratchRegs::ID = 0;
-
-FunctionPass *llvm::createSIPrepareScratchRegs() {
-  return new SIPrepareScratchRegs();
-}
-
-bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) {
-  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-  const SIInstrInfo *TII =
-      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
-  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-  MachineFrameInfo *FrameInfo = MF.getFrameInfo();
-  MachineBasicBlock *Entry = &MF.front();
-  MachineBasicBlock::iterator I = Entry->begin();
-  DebugLoc DL = I->getDebugLoc();
-
-  // FIXME: If we don't have enough VGPRs for SGPR spilling we will need to
-  // run this pass.
-  if (!MFI->hasSpilledVGPRs())
-    return false;
-
-  unsigned ScratchPtrPreloadReg =
-      TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR);
-  unsigned ScratchOffsetPreloadReg =
-      TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
-
-  if (!Entry->isLiveIn(ScratchPtrPreloadReg))
-    Entry->addLiveIn(ScratchPtrPreloadReg);
-
-  if (!Entry->isLiveIn(ScratchOffsetPreloadReg))
-    Entry->addLiveIn(ScratchOffsetPreloadReg);
-
-  // Load the scratch offset.
-  unsigned ScratchOffsetReg =
-      TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_32RegClass);
-  int ScratchOffsetFI = -1;
-
-  if (ScratchOffsetReg != AMDGPU::NoRegister) {
-    // Found an SGPR to use
-    BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg)
-            .addReg(ScratchOffsetPreloadReg);
-  } else {
-    // No SGPR is available, we must spill.
-    ScratchOffsetFI = FrameInfo->CreateSpillStackObject(4,4);
-    BuildMI(*Entry, I, DL, TII->get(AMDGPU::SI_SPILL_S32_SAVE))
-            .addReg(ScratchOffsetPreloadReg)
-            .addFrameIndex(ScratchOffsetFI)
-            .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
-            .addReg(AMDGPU::SGPR0, RegState::Undef);
-  }
-
-
-  // Now that we have the scratch pointer and offset values, we need to
-  // add them to all the SI_SPILL_V* instructions.
-
-  RegScavenger RS;
-  unsigned ScratchRsrcFI = FrameInfo->CreateSpillStackObject(16, 4);
-  RS.addScavengingFrameIndex(ScratchRsrcFI);
-
-  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
-       BI != BE; ++BI) {
-
-    MachineBasicBlock &MBB = *BI;
-    // Add the scratch offset reg as a live-in so that the register scavenger
-    // doesn't re-use it.
-    if (!MBB.isLiveIn(ScratchOffsetReg) &&
-        ScratchOffsetReg != AMDGPU::NoRegister)
-      MBB.addLiveIn(ScratchOffsetReg);
-    RS.enterBasicBlock(&MBB);
-
-    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
-         I != E; ++I) {
-      MachineInstr &MI = *I;
-      RS.forward(I);
-      DebugLoc DL = MI.getDebugLoc();
-      if (!TII->isVGPRSpill(MI))
-        continue;
-
-      // Scratch resource
-      unsigned ScratchRsrcReg =
-          RS.scavengeRegister(&AMDGPU::SReg_128RegClass, 0);
-
-      uint64_t Rsrc23 = TII->getScratchRsrcWords23();
-
-      unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
-      unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
-      unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
-      unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
-
-      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0)
-              .addExternalSymbol("SCRATCH_RSRC_DWORD0")
-              .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
-
-      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1)
-              .addExternalSymbol("SCRATCH_RSRC_DWORD1")
-              .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
-
-      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2)
-              .addImm(Rsrc23 & 0xffffffff)
-              .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
-
-      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3)
-              .addImm(Rsrc23 >> 32)
-              .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
-
-      // Scratch Offset
-      if (ScratchOffsetReg == AMDGPU::NoRegister) {
-        ScratchOffsetReg = RS.scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
-        BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_SPILL_S32_RESTORE),
-                ScratchOffsetReg)
-                .addFrameIndex(ScratchOffsetFI)
-                .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
-                .addReg(AMDGPU::SGPR0, RegState::Undef);
-      } else if (!MBB.isLiveIn(ScratchOffsetReg)) {
-        MBB.addLiveIn(ScratchOffsetReg);
-      }
-
-      if (ScratchRsrcReg == AMDGPU::NoRegister ||
-          ScratchOffsetReg == AMDGPU::NoRegister) {
-        LLVMContext &Ctx = MF.getFunction()->getContext();
-        Ctx.emitError("ran out of SGPRs for spilling VGPRs");
-        ScratchRsrcReg = AMDGPU::SGPR0;
-        ScratchOffsetReg = AMDGPU::SGPR0;
-      }
-      MI.getOperand(2).setReg(ScratchRsrcReg);
-      MI.getOperand(2).setIsKill(true);
-      MI.getOperand(2).setIsUndef(false);
-      MI.getOperand(3).setReg(ScratchOffsetReg);
-      MI.getOperand(3).setIsUndef(false);
-      MI.getOperand(3).setIsKill(false);
-      MI.addOperand(MachineOperand::CreateReg(Rsrc0, false, true, true));
-      MI.addOperand(MachineOperand::CreateReg(Rsrc1, false, true, true));
-      MI.addOperand(MachineOperand::CreateReg(Rsrc2, false, true, true));
-      MI.addOperand(MachineOperand::CreateReg(Rsrc3, false, true, true));
-    }
-  }
-  return true;
-}
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index ab7539b6fb3a..b392c86fa2e1 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -68,6 +68,22 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
     }
   }
 
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
+  if (ScratchRSrcReg != AMDGPU::NoRegister) {
+    unsigned ScratchOffsetPreloadReg
+      = getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
+    // We will need to use this user SGPR argument for spilling, and thus never
+    // want it to be spilled.
+    reserveRegisterTuples(Reserved, ScratchOffsetPreloadReg);
+
+    // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need
+    // to spill.
+    // TODO: May need to reserve a VGPR if doing LDS spilling.
+    reserveRegisterTuples(Reserved, ScratchRSrcReg);
+    assert(!isSubRegister(ScratchRSrcReg, ScratchOffsetPreloadReg));
+  }
+
   return Reserved;
 }
 
@@ -243,6 +259,9 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                 .addReg(SubReg)
                 .addImm(Spill.Lane);
 
+        // FIXME: Since this spills to another register instead of an actual
+        // frame index, we should delete the frame index when all references to
+        // it are fixed.
       }
       MI->eraseFromParent();
       break;
diff --git a/test/CodeGen/AMDGPU/kernel-args.ll b/test/CodeGen/AMDGPU/kernel-args.ll
index 803b2ecced01..e9d98ac89e72 100644
--- a/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/test/CodeGen/AMDGPU/kernel-args.ll
@@ -294,8 +294,8 @@ entry:
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
-; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
-; VI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x44
+; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
+; VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
 define void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
 entry:
   store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
@@ -311,7 +311,7 @@ entry:
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
-; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
+; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
 define void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
 entry:
   store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
@@ -413,8 +413,8 @@ entry:
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
-; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
-; VI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
+; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
+; VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
 define void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
 entry:
   store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
@@ -438,8 +438,8 @@ entry:
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
-; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
-; VI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
+; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
+; VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
 define void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
 entry:
   store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/large-alloca-compute.ll b/test/CodeGen/AMDGPU/large-alloca-compute.ll
new file mode 100644
index 000000000000..5e8cf5bb3d25
--- /dev/null
+++ b/test/CodeGen/AMDGPU/large-alloca-compute.ll
@@ -0,0 +1,42 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
+; XUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s
+; XUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s
+
+; FIXME: align on alloca seems to be ignored for private_segment_alignment
+
+; ALL-LABEL: {{^}}large_alloca_compute_shader:
+
+; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCN: s_mov_b32 s14, -1
+; CI: s_mov_b32 s15, 0x80f000
+; VI: s_mov_b32 s15, 0x800000
+
+
+; GCNHSA: .amd_kernel_code_t
+; GCNHSA: private_segment_alignment = 4
+; GCNHSA: .end_amd_kernel_code_t
+
+; GCNHSA: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GCNHSA: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GCNHSA: s_mov_b32 s10, -1
+; CIHSA: s_mov_b32 s11, 0x180f000
+; VIHSA: s_mov_b32 s11, 0x11800000
+
+; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s6 offen
+; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s6 offen
+
+; Scratch size = alloca size + emergency stack slot
+; ALL: ; ScratchSize: 32772
+define void @large_alloca_compute_shader(i32 %x, i32 %y) #0 {
+  %large = alloca [8192 x i32], align 4
+  %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191
+  store volatile i32 %x, i32* %gep
+  %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y
+  %val = load volatile i32, i32* %gep1
+  store volatile i32 %val, i32 addrspace(1)* undef
+  ret void
+}
+
+attributes #0 = { nounwind  }
diff --git a/test/CodeGen/AMDGPU/large-alloca-graphics.ll b/test/CodeGen/AMDGPU/large-alloca-graphics.ll
new file mode 100644
index 000000000000..208b9a10050c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/large-alloca-graphics.ll
@@ -0,0 +1,47 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
+
+; ALL-LABEL: {{^}}large_alloca_pixel_shader:
+; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GCN: s_mov_b32 s10, -1
+; CI: s_mov_b32 s11, 0x80f000
+; VI: s_mov_b32 s11, 0x800000
+
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
+
+; ALL: ; ScratchSize: 32772
+define void @large_alloca_pixel_shader(i32 %x, i32 %y) #1 {
+  %large = alloca [8192 x i32], align 4
+  %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191
+  store volatile i32 %x, i32* %gep
+  %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y
+  %val = load volatile i32, i32* %gep1
+  store volatile i32 %val, i32 addrspace(1)* undef
+  ret void
+}
+
+; ALL-LABEL: {{^}}large_alloca_pixel_shader_inreg:
+; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GCN: s_mov_b32 s10, -1
+; CI: s_mov_b32 s11, 0x80f000
+; VI: s_mov_b32 s11, 0x800000
+
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
+
+; ALL: ; ScratchSize: 32772
+define void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #1 {
+  %large = alloca [8192 x i32], align 4
+  %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191
+  store volatile i32 %x, i32* %gep
+  %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y
+  %val = load volatile i32, i32* %gep1
+  store volatile i32 %val, i32 addrspace(1)* undef
+  ret void
+}
+
+attributes #0 = { nounwind  }
+attributes #1 = { nounwind "ShaderType"="0" }
diff --git a/test/CodeGen/AMDGPU/large-alloca.ll b/test/CodeGen/AMDGPU/large-alloca.ll
deleted file mode 100644
index e1122da78ef5..000000000000
--- a/test/CodeGen/AMDGPU/large-alloca.ll
+++ /dev/null
@@ -1,18 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-
-; GCN-LABEL: {{^}}large_alloca:
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
-; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
-; GCN: ScratchSize: 32776
-define void @large_alloca(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
-  %large = alloca [8192 x i32], align 4
-  %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191
-  store i32 %x, i32* %gep
-  %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y
-  %load = load i32, i32* %gep1
-  store i32 %load, i32 addrspace(1)* %out
-  ret void
-}
-
-attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/si-sgpr-spill.ll b/test/CodeGen/AMDGPU/si-sgpr-spill.ll
index 84652701f773..d7b35fc631eb 100644
--- a/test/CodeGen/AMDGPU/si-sgpr-spill.ll
+++ b/test/CodeGen/AMDGPU/si-sgpr-spill.ll
@@ -6,6 +6,16 @@
 
 ; CHECK-LABEL: {{^}}main:
 ; CHECK: s_wqm
+
+; Make sure not emitting unused scratch resource descriptor setup
+; CHECK-NOT: s_mov_b32
+; CHECK-NOT: s_mov_b32
+; CHECK-NOT: s_mov_b32
+; CHECK-NOT: s_mov_b32
+
+; CHECK: s_mov_b32 m0
+
+
 ; Writing to M0 from an SMRD instruction will hang the GPU.
 ; CHECK-NOT: s_buffer_load_dword m0
 ; CHECK: s_endpgm
diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
new file mode 100644
index 000000000000..2cbb67d85fba
--- /dev/null
+++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
@@ -0,0 +1,583 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; XUN: llc -march=amdgcn -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA %s
+; XUN: llc -march=amdgcn -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA %s
+
+; This ends up using all 256 registers and requires register
+; scavenging which will fail to find an unsued register.
+
+; Check the ScratchSize to avoid regressions from spilling
+; intermediate register class copies.
+
+; FIXME: The same register is initialized to 0 for every spill.
+
+declare i32 @llvm.r600.read.tgid.x() #1
+declare i32 @llvm.r600.read.tgid.y() #1
+declare i32 @llvm.r600.read.tgid.z() #1
+
+; GCN-LABEL: {{^}}spill_vgpr_compute:
+
+; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCN-NEXT: s_mov_b32 s14, -1
+; SI-NEXT: s_mov_b32 s15, 0x80f000
+; VI-NEXT: s_mov_b32 s15, 0x800000
+
+; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s8 offset:{{[0-9]+}} ; 4-byte Folded Spill
+
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s8 offen offset:{{[0-9]+}}
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s8 offen offset:{{[0-9]+}}
+
+; GCN: NumVgprs: 256
+; GCN: ScratchSize: 1024
+
+; s[0:3] input user SGPRs. s4,s5,s6 = workgroup IDs. s8 scratch offset.
+define void @spill_vgpr_compute(<4 x float> %arg6, float addrspace(1)* %arg, i32 %arg1, i32 %arg2, float %arg3, float %arg4, float %arg5) #0 {
+bb:
+  %tmp = add i32 %arg1, %arg2
+  %tmp7 = extractelement <4 x float> %arg6, i32 0
+  %tmp8 = extractelement <4 x float> %arg6, i32 1
+  %tmp9 = extractelement <4 x float> %arg6, i32 2
+  %tmp10 = extractelement <4 x float> %arg6, i32 3
+  %tmp11 = bitcast float %arg5 to i32
+  br label %bb12
+
+bb12:                                             ; preds = %bb145, %bb
+  %tmp13 = phi float [ 0.000000e+00, %bb ], [ %tmp338, %bb145 ]
+  %tmp14 = phi float [ 0.000000e+00, %bb ], [ %tmp337, %bb145 ]
+  %tmp15 = phi float [ 0.000000e+00, %bb ], [ %tmp336, %bb145 ]
+  %tmp16 = phi float [ 0.000000e+00, %bb ], [ %tmp339, %bb145 ]
+  %tmp17 = phi float [ 0.000000e+00, %bb ], [ %tmp335, %bb145 ]
+  %tmp18 = phi float [ 0.000000e+00, %bb ], [ %tmp334, %bb145 ]
+  %tmp19 = phi float [ 0.000000e+00, %bb ], [ %tmp333, %bb145 ]
+  %tmp20 = phi float [ 0.000000e+00, %bb ], [ %tmp340, %bb145 ]
+  %tmp21 = phi float [ 0.000000e+00, %bb ], [ %tmp332, %bb145 ]
+  %tmp22 = phi float [ 0.000000e+00, %bb ], [ %tmp331, %bb145 ]
+  %tmp23 = phi float [ 0.000000e+00, %bb ], [ %tmp330, %bb145 ]
+  %tmp24 = phi float [ 0.000000e+00, %bb ], [ %tmp341, %bb145 ]
+  %tmp25 = phi float [ 0.000000e+00, %bb ], [ %tmp329, %bb145 ]
+  %tmp26 = phi float [ 0.000000e+00, %bb ], [ %tmp328, %bb145 ]
+  %tmp27 = phi float [ 0.000000e+00, %bb ], [ %tmp327, %bb145 ]
+  %tmp28 = phi float [ 0.000000e+00, %bb ], [ %tmp342, %bb145 ]
+  %tmp29 = phi float [ 0.000000e+00, %bb ], [ %tmp326, %bb145 ]
+  %tmp30 = phi float [ 0.000000e+00, %bb ], [ %tmp325, %bb145 ]
+  %tmp31 = phi float [ 0.000000e+00, %bb ], [ %tmp324, %bb145 ]
+  %tmp32 = phi float [ 0.000000e+00, %bb ], [ %tmp343, %bb145 ]
+  %tmp33 = phi float [ 0.000000e+00, %bb ], [ %tmp323, %bb145 ]
+  %tmp34 = phi float [ 0.000000e+00, %bb ], [ %tmp322, %bb145 ]
+  %tmp35 = phi float [ 0.000000e+00, %bb ], [ %tmp321, %bb145 ]
+  %tmp36 = phi float [ 0.000000e+00, %bb ], [ %tmp344, %bb145 ]
+  %tmp37 = phi float [ 0.000000e+00, %bb ], [ %tmp320, %bb145 ]
+  %tmp38 = phi float [ 0.000000e+00, %bb ], [ %tmp319, %bb145 ]
+  %tmp39 = phi float [ 0.000000e+00, %bb ], [ %tmp318, %bb145 ]
+  %tmp40 = phi float [ 0.000000e+00, %bb ], [ %tmp345, %bb145 ]
+  %tmp41 = phi float [ 0.000000e+00, %bb ], [ %tmp317, %bb145 ]
+  %tmp42 = phi float [ 0.000000e+00, %bb ], [ %tmp316, %bb145 ]
+  %tmp43 = phi float [ 0.000000e+00, %bb ], [ %tmp315, %bb145 ]
+  %tmp44 = phi float [ 0.000000e+00, %bb ], [ %tmp346, %bb145 ]
+  %tmp45 = phi float [ 0.000000e+00, %bb ], [ %tmp314, %bb145 ]
+  %tmp46 = phi float [ 0.000000e+00, %bb ], [ %tmp313, %bb145 ]
+  %tmp47 = phi float [ 0.000000e+00, %bb ], [ %tmp312, %bb145 ]
+  %tmp48 = phi float [ 0.000000e+00, %bb ], [ %tmp347, %bb145 ]
+  %tmp49 = phi float [ 0.000000e+00, %bb ], [ %tmp311, %bb145 ]
+  %tmp50 = phi float [ 0.000000e+00, %bb ], [ %tmp310, %bb145 ]
+  %tmp51 = phi float [ 0.000000e+00, %bb ], [ %tmp309, %bb145 ]
+  %tmp52 = phi float [ 0.000000e+00, %bb ], [ %tmp348, %bb145 ]
+  %tmp53 = phi float [ 0.000000e+00, %bb ], [ %tmp308, %bb145 ]
+  %tmp54 = phi float [ 0.000000e+00, %bb ], [ %tmp307, %bb145 ]
+  %tmp55 = phi float [ 0.000000e+00, %bb ], [ %tmp306, %bb145 ]
+  %tmp56 = phi float [ 0.000000e+00, %bb ], [ %tmp349, %bb145 ]
+  %tmp57 = phi float [ 0.000000e+00, %bb ], [ %tmp305, %bb145 ]
+  %tmp58 = phi float [ 0.000000e+00, %bb ], [ %tmp304, %bb145 ]
+  %tmp59 = phi float [ 0.000000e+00, %bb ], [ %tmp303, %bb145 ]
+  %tmp60 = phi float [ 0.000000e+00, %bb ], [ %tmp350, %bb145 ]
+  %tmp61 = phi float [ 0.000000e+00, %bb ], [ %tmp302, %bb145 ]
+  %tmp62 = phi float [ 0.000000e+00, %bb ], [ %tmp301, %bb145 ]
+  %tmp63 = phi float [ 0.000000e+00, %bb ], [ %tmp300, %bb145 ]
+  %tmp64 = phi float [ 0.000000e+00, %bb ], [ %tmp351, %bb145 ]
+  %tmp65 = phi float [ 0.000000e+00, %bb ], [ %tmp299, %bb145 ]
+  %tmp66 = phi float [ 0.000000e+00, %bb ], [ %tmp298, %bb145 ]
+  %tmp67 = phi float [ 0.000000e+00, %bb ], [ %tmp297, %bb145 ]
+  %tmp68 = phi float [ 0.000000e+00, %bb ], [ %tmp352, %bb145 ]
+  %tmp69 = phi float [ 0.000000e+00, %bb ], [ %tmp296, %bb145 ]
+  %tmp70 = phi float [ 0.000000e+00, %bb ], [ %tmp295, %bb145 ]
+  %tmp71 = phi float [ 0.000000e+00, %bb ], [ %tmp294, %bb145 ]
+  %tmp72 = phi float [ 0.000000e+00, %bb ], [ %tmp353, %bb145 ]
+  %tmp73 = phi float [ 0.000000e+00, %bb ], [ %tmp293, %bb145 ]
+  %tmp74 = phi float [ 0.000000e+00, %bb ], [ %tmp292, %bb145 ]
+  %tmp75 = phi float [ 0.000000e+00, %bb ], [ %tmp291, %bb145 ]
+  %tmp76 = phi float [ 0.000000e+00, %bb ], [ %tmp354, %bb145 ]
+  %tmp77 = phi float [ 0.000000e+00, %bb ], [ %tmp290, %bb145 ]
+  %tmp78 = phi float [ 0.000000e+00, %bb ], [ %tmp289, %bb145 ]
+  %tmp79 = phi float [ 0.000000e+00, %bb ], [ %tmp288, %bb145 ]
+  %tmp80 = phi float [ 0.000000e+00, %bb ], [ %tmp355, %bb145 ]
+  %tmp81 = phi float [ 0.000000e+00, %bb ], [ %tmp287, %bb145 ]
+  %tmp82 = phi float [ 0.000000e+00, %bb ], [ %tmp286, %bb145 ]
+  %tmp83 = phi float [ 0.000000e+00, %bb ], [ %tmp285, %bb145 ]
+  %tmp84 = phi float [ 0.000000e+00, %bb ], [ %tmp356, %bb145 ]
+  %tmp85 = phi float [ 0.000000e+00, %bb ], [ %tmp284, %bb145 ]
+  %tmp86 = phi float [ 0.000000e+00, %bb ], [ %tmp283, %bb145 ]
+  %tmp87 = phi float [ 0.000000e+00, %bb ], [ %tmp282, %bb145 ]
+  %tmp88 = phi float [ 0.000000e+00, %bb ], [ %tmp357, %bb145 ]
+  %tmp89 = phi float [ 0.000000e+00, %bb ], [ %tmp281, %bb145 ]
+  %tmp90 = phi float [ 0.000000e+00, %bb ], [ %tmp280, %bb145 ]
+  %tmp91 = phi float [ 0.000000e+00, %bb ], [ %tmp279, %bb145 ]
+  %tmp92 = phi float [ 0.000000e+00, %bb ], [ %tmp358, %bb145 ]
+  %tmp93 = phi float [ 0.000000e+00, %bb ], [ %tmp359, %bb145 ]
+  %tmp94 = phi float [ 0.000000e+00, %bb ], [ %tmp360, %bb145 ]
+  %tmp95 = phi float [ 0.000000e+00, %bb ], [ %tmp409, %bb145 ]
+  %tmp96 = phi float [ 0.000000e+00, %bb ], [ %tmp361, %bb145 ]
+  %tmp97 = phi float [ 0.000000e+00, %bb ], [ %tmp362, %bb145 ]
+  %tmp98 = phi float [ 0.000000e+00, %bb ], [ %tmp363, %bb145 ]
+  %tmp99 = phi float [ 0.000000e+00, %bb ], [ %tmp364, %bb145 ]
+  %tmp100 = phi float [ 0.000000e+00, %bb ], [ %tmp365, %bb145 ]
+  %tmp101 = phi float [ 0.000000e+00, %bb ], [ %tmp366, %bb145 ]
+  %tmp102 = phi float [ 0.000000e+00, %bb ], [ %tmp367, %bb145 ]
+  %tmp103 = phi float [ 0.000000e+00, %bb ], [ %tmp368, %bb145 ]
+  %tmp104 = phi float [ 0.000000e+00, %bb ], [ %tmp369, %bb145 ]
+  %tmp105 = phi float [ 0.000000e+00, %bb ], [ %tmp370, %bb145 ]
+  %tmp106 = phi float [ 0.000000e+00, %bb ], [ %tmp371, %bb145 ]
+  %tmp107 = phi float [ 0.000000e+00, %bb ], [ %tmp372, %bb145 ]
+  %tmp108 = phi float [ 0.000000e+00, %bb ], [ %tmp373, %bb145 ]
+  %tmp109 = phi float [ 0.000000e+00, %bb ], [ %tmp374, %bb145 ]
+  %tmp110 = phi float [ 0.000000e+00, %bb ], [ %tmp375, %bb145 ]
+  %tmp111 = phi float [ 0.000000e+00, %bb ], [ %tmp376, %bb145 ]
+  %tmp112 = phi float [ 0.000000e+00, %bb ], [ %tmp377, %bb145 ]
+  %tmp113 = phi float [ 0.000000e+00, %bb ], [ %tmp378, %bb145 ]
+  %tmp114 = phi float [ 0.000000e+00, %bb ], [ %tmp379, %bb145 ]
+  %tmp115 = phi float [ 0.000000e+00, %bb ], [ %tmp380, %bb145 ]
+  %tmp116 = phi float [ 0.000000e+00, %bb ], [ %tmp381, %bb145 ]
+  %tmp117 = phi float [ 0.000000e+00, %bb ], [ %tmp382, %bb145 ]
+  %tmp118 = phi float [ 0.000000e+00, %bb ], [ %tmp383, %bb145 ]
+  %tmp119 = phi float [ 0.000000e+00, %bb ], [ %tmp384, %bb145 ]
+  %tmp120 = phi float [ 0.000000e+00, %bb ], [ %tmp385, %bb145 ]
+  %tmp121 = phi float [ 0.000000e+00, %bb ], [ %tmp386, %bb145 ]
+  %tmp122 = phi float [ 0.000000e+00, %bb ], [ %tmp387, %bb145 ]
+  %tmp123 = phi float [ 0.000000e+00, %bb ], [ %tmp388, %bb145 ]
+  %tmp124 = phi float [ 0.000000e+00, %bb ], [ %tmp389, %bb145 ]
+  %tmp125 = phi float [ 0.000000e+00, %bb ], [ %tmp390, %bb145 ]
+  %tmp126 = phi float [ 0.000000e+00, %bb ], [ %tmp391, %bb145 ]
+  %tmp127 = phi float [ 0.000000e+00, %bb ], [ %tmp392, %bb145 ]
+  %tmp128 = phi float [ 0.000000e+00, %bb ], [ %tmp393, %bb145 ]
+  %tmp129 = phi float [ 0.000000e+00, %bb ], [ %tmp394, %bb145 ]
+  %tmp130 = phi float [ 0.000000e+00, %bb ], [ %tmp395, %bb145 ]
+  %tmp131 = phi float [ 0.000000e+00, %bb ], [ %tmp396, %bb145 ]
+  %tmp132 = phi float [ 0.000000e+00, %bb ], [ %tmp397, %bb145 ]
+  %tmp133 = phi float [ 0.000000e+00, %bb ], [ %tmp398, %bb145 ]
+  %tmp134 = phi float [ 0.000000e+00, %bb ], [ %tmp399, %bb145 ]
+  %tmp135 = phi float [ 0.000000e+00, %bb ], [ %tmp400, %bb145 ]
+  %tmp136 = phi float [ 0.000000e+00, %bb ], [ %tmp401, %bb145 ]
+  %tmp137 = phi float [ 0.000000e+00, %bb ], [ %tmp402, %bb145 ]
+  %tmp138 = phi float [ 0.000000e+00, %bb ], [ %tmp403, %bb145 ]
+  %tmp139 = phi float [ 0.000000e+00, %bb ], [ %tmp404, %bb145 ]
+  %tmp140 = phi float [ 0.000000e+00, %bb ], [ %tmp405, %bb145 ]
+  %tmp141 = phi float [ 0.000000e+00, %bb ], [ %tmp406, %bb145 ]
+  %tmp142 = bitcast float %tmp95 to i32
+  %tmp143 = icmp sgt i32 %tmp142, 125
+  br i1 %tmp143, label %bb144, label %bb145
+
+bb144:                                            ; preds = %bb12
+  store volatile float %arg3, float addrspace(1)* %arg
+  store volatile float %tmp91, float addrspace(1)* %arg
+  store volatile float %tmp90, float addrspace(1)* %arg
+  store volatile float %tmp89, float addrspace(1)* %arg
+  store volatile float %tmp87, float addrspace(1)* %arg
+  store volatile float %tmp86, float addrspace(1)* %arg
+  store volatile float %tmp85, float addrspace(1)* %arg
+  store volatile float %tmp83, float addrspace(1)* %arg
+  store volatile float %tmp82, float addrspace(1)* %arg
+  store volatile float %tmp81, float addrspace(1)* %arg
+  store volatile float %tmp79, float addrspace(1)* %arg
+  store volatile float %tmp78, float addrspace(1)* %arg
+  store volatile float %tmp77, float addrspace(1)* %arg
+  store volatile float %tmp75, float addrspace(1)* %arg
+  store volatile float %tmp74, float addrspace(1)* %arg
+  store volatile float %tmp73, float addrspace(1)* %arg
+  store volatile float %tmp71, float addrspace(1)* %arg
+  store volatile float %tmp70, float addrspace(1)* %arg
+  store volatile float %tmp69, float addrspace(1)* %arg
+  store volatile float %tmp67, float addrspace(1)* %arg
+  store volatile float %tmp66, float addrspace(1)* %arg
+  store volatile float %tmp65, float addrspace(1)* %arg
+  store volatile float %tmp63, float addrspace(1)* %arg
+  store volatile float %tmp62, float addrspace(1)* %arg
+  store volatile float %tmp61, float addrspace(1)* %arg
+  store volatile float %tmp59, float addrspace(1)* %arg
+  store volatile float %tmp58, float addrspace(1)* %arg
+  store volatile float %tmp57, float addrspace(1)* %arg
+  store volatile float %tmp55, float addrspace(1)* %arg
+  store volatile float %tmp54, float addrspace(1)* %arg
+  store volatile float %tmp53, float addrspace(1)* %arg
+  store volatile float %tmp51, float addrspace(1)* %arg
+  store volatile float %tmp50, float addrspace(1)* %arg
+  store volatile float %tmp49, float addrspace(1)* %arg
+  store volatile float %tmp47, float addrspace(1)* %arg
+  store volatile float %tmp46, float addrspace(1)* %arg
+  store volatile float %tmp45, float addrspace(1)* %arg
+  store volatile float %tmp43, float addrspace(1)* %arg
+  store volatile float %tmp42, float addrspace(1)* %arg
+  store volatile float %tmp41, float addrspace(1)* %arg
+  store volatile float %tmp39, float addrspace(1)* %arg
+  store volatile float %tmp38, float addrspace(1)* %arg
+  store volatile float %tmp37, float addrspace(1)* %arg
+  store volatile float %tmp35, float addrspace(1)* %arg
+  store volatile float %tmp34, float addrspace(1)* %arg
+  store volatile float %tmp33, float addrspace(1)* %arg
+  store volatile float %tmp31, float addrspace(1)* %arg
+  store volatile float %tmp30, float addrspace(1)* %arg
+  store volatile float %tmp29, float addrspace(1)* %arg
+  store volatile float %tmp27, float addrspace(1)* %arg
+  store volatile float %tmp26, float addrspace(1)* %arg
+  store volatile float %tmp25, float addrspace(1)* %arg
+  store volatile float %tmp23, float addrspace(1)* %arg
+  store volatile float %tmp22, float addrspace(1)* %arg
+  store volatile float %tmp21, float addrspace(1)* %arg
+  store volatile float %tmp19, float addrspace(1)* %arg
+  store volatile float %tmp18, float addrspace(1)* %arg
+  store volatile float %tmp17, float addrspace(1)* %arg
+  store volatile float %tmp15, float addrspace(1)* %arg
+  store volatile float %tmp14, float addrspace(1)* %arg
+  store volatile float %tmp13, float addrspace(1)* %arg
+  store volatile float %tmp16, float addrspace(1)* %arg
+  store volatile float %tmp20, float addrspace(1)* %arg
+  store volatile float %tmp24, float addrspace(1)* %arg
+  store volatile float %tmp28, float addrspace(1)* %arg
+  store volatile float %tmp32, float addrspace(1)* %arg
+  store volatile float %tmp36, float addrspace(1)* %arg
+  store volatile float %tmp40, float addrspace(1)* %arg
+  store volatile float %tmp44, float addrspace(1)* %arg
+  store volatile float %tmp48, float addrspace(1)* %arg
+  store volatile float %tmp52, float addrspace(1)* %arg
+  store volatile float %tmp56, float addrspace(1)* %arg
+  store volatile float %tmp60, float addrspace(1)* %arg
+  store volatile float %tmp64, float addrspace(1)* %arg
+  store volatile float %tmp68, float addrspace(1)* %arg
+  store volatile float %tmp72, float addrspace(1)* %arg
+  store volatile float %tmp76, float addrspace(1)* %arg
+  store volatile float %tmp80, float addrspace(1)* %arg
+  store volatile float %tmp84, float addrspace(1)* %arg
+  store volatile float %tmp88, float addrspace(1)* %arg
+  store volatile float %tmp92, float addrspace(1)* %arg
+  store volatile float %tmp93, float addrspace(1)* %arg
+  store volatile float %tmp94, float addrspace(1)* %arg
+  store volatile float %tmp96, float addrspace(1)* %arg
+  store volatile float %tmp97, float addrspace(1)* %arg
+  store volatile float %tmp98, float addrspace(1)* %arg
+  store volatile float %tmp99, float addrspace(1)* %arg
+  store volatile float %tmp100, float addrspace(1)* %arg
+  store volatile float %tmp101, float addrspace(1)* %arg
+  store volatile float %tmp102, float addrspace(1)* %arg
+  store volatile float %tmp103, float addrspace(1)* %arg
+  store volatile float %tmp104, float addrspace(1)* %arg
+  store volatile float %tmp105, float addrspace(1)* %arg
+  store volatile float %tmp106, float addrspace(1)* %arg
+  store volatile float %tmp107, float addrspace(1)* %arg
+  store volatile float %tmp108, float addrspace(1)* %arg
+  store volatile float %tmp109, float addrspace(1)* %arg
+  store volatile float %tmp110, float addrspace(1)* %arg
+  store volatile float %tmp111, float addrspace(1)* %arg
+  store volatile float %tmp112, float addrspace(1)* %arg
+  store volatile float %tmp113, float addrspace(1)* %arg
+  store volatile float %tmp114, float addrspace(1)* %arg
+  store volatile float %tmp115, float addrspace(1)* %arg
+  store volatile float %tmp116, float addrspace(1)* %arg
+  store volatile float %tmp117, float addrspace(1)* %arg
+  store volatile float %tmp118, float addrspace(1)* %arg
+  store volatile float %tmp119, float addrspace(1)* %arg
+  store volatile float %tmp120, float addrspace(1)* %arg
+  store volatile float %tmp121, float addrspace(1)* %arg
+  store volatile float %tmp122, float addrspace(1)* %arg
+  store volatile float %tmp123, float addrspace(1)* %arg
+  store volatile float %tmp124, float addrspace(1)* %arg
+  store volatile float %tmp125, float addrspace(1)* %arg
+  store volatile float %tmp126, float addrspace(1)* %arg
+  store volatile float %tmp127, float addrspace(1)* %arg
+  store volatile float %tmp128, float addrspace(1)* %arg
+  store volatile float %tmp129, float addrspace(1)* %arg
+  store volatile float %tmp130, float addrspace(1)* %arg
+  store volatile float %tmp131, float addrspace(1)* %arg
+  store volatile float %tmp132, float addrspace(1)* %arg
+  store volatile float %tmp133, float addrspace(1)* %arg
+  store volatile float %tmp134, float addrspace(1)* %arg
+  store volatile float %tmp135, float addrspace(1)* %arg
+  store volatile float %tmp136, float addrspace(1)* %arg
+  store volatile float %tmp137, float addrspace(1)* %arg
+  store volatile float %tmp138, float addrspace(1)* %arg
+  store volatile float %tmp139, float addrspace(1)* %arg
+  store volatile float %arg4, float addrspace(1)* %arg
+  store volatile float %tmp7, float addrspace(1)* %arg
+  store volatile float %tmp8, float addrspace(1)* %arg
+  store volatile float %tmp9, float addrspace(1)* %arg
+  store volatile float %tmp10, float addrspace(1)* %arg
+  ret void
+
+bb145:                                            ; preds = %bb12
+  %tmp146 = bitcast float %tmp95 to i32
+  %tmp147 = bitcast float %tmp95 to i32
+  %tmp148 = add i32 %tmp11, %tmp147
+  %tmp149 = bitcast i32 %tmp148 to float
+  %tmp150 = insertelement <128 x float> undef, float %tmp91, i32 0
+  %tmp151 = insertelement <128 x float> %tmp150, float %tmp90, i32 1
+  %tmp152 = insertelement <128 x float> %tmp151, float %tmp89, i32 2
+  %tmp153 = insertelement <128 x float> %tmp152, float %tmp87, i32 3
+  %tmp154 = insertelement <128 x float> %tmp153, float %tmp86, i32 4
+  %tmp155 = insertelement <128 x float> %tmp154, float %tmp85, i32 5
+  %tmp156 = insertelement <128 x float> %tmp155, float %tmp83, i32 6
+  %tmp157 = insertelement <128 x float> %tmp156, float %tmp82, i32 7
+  %tmp158 = insertelement <128 x float> %tmp157, float %tmp81, i32 8
+  %tmp159 = insertelement <128 x float> %tmp158, float %tmp79, i32 9
+  %tmp160 = insertelement <128 x float> %tmp159, float %tmp78, i32 10
+  %tmp161 = insertelement <128 x float> %tmp160, float %tmp77, i32 11
+  %tmp162 = insertelement <128 x float> %tmp161, float %tmp75, i32 12
+  %tmp163 = insertelement <128 x float> %tmp162, float %tmp74, i32 13
+  %tmp164 = insertelement <128 x float> %tmp163, float %tmp73, i32 14
+  %tmp165 = insertelement <128 x float> %tmp164, float %tmp71, i32 15
+  %tmp166 = insertelement <128 x float> %tmp165, float %tmp70, i32 16
+  %tmp167 = insertelement <128 x float> %tmp166, float %tmp69, i32 17
+  %tmp168 = insertelement <128 x float> %tmp167, float %tmp67, i32 18
+  %tmp169 = insertelement <128 x float> %tmp168, float %tmp66, i32 19
+  %tmp170 = insertelement <128 x float> %tmp169, float %tmp65, i32 20
+  %tmp171 = insertelement <128 x float> %tmp170, float %tmp63, i32 21
+  %tmp172 = insertelement <128 x float> %tmp171, float %tmp62, i32 22
+  %tmp173 = insertelement <128 x float> %tmp172, float %tmp61, i32 23
+  %tmp174 = insertelement <128 x float> %tmp173, float %tmp59, i32 24
+  %tmp175 = insertelement <128 x float> %tmp174, float %tmp58, i32 25
+  %tmp176 = insertelement <128 x float> %tmp175, float %tmp57, i32 26
+  %tmp177 = insertelement <128 x float> %tmp176, float %tmp55, i32 27
+  %tmp178 = insertelement <128 x float> %tmp177, float %tmp54, i32 28
+  %tmp179 = insertelement <128 x float> %tmp178, float %tmp53, i32 29
+  %tmp180 = insertelement <128 x float> %tmp179, float %tmp51, i32 30
+  %tmp181 = insertelement <128 x float> %tmp180, float %tmp50, i32 31
+  %tmp182 = insertelement <128 x float> %tmp181, float %tmp49, i32 32
+  %tmp183 = insertelement <128 x float> %tmp182, float %tmp47, i32 33
+  %tmp184 = insertelement <128 x float> %tmp183, float %tmp46, i32 34
+  %tmp185 = insertelement <128 x float> %tmp184, float %tmp45, i32 35
+  %tmp186 = insertelement <128 x float> %tmp185, float %tmp43, i32 36
+  %tmp187 = insertelement <128 x float> %tmp186, float %tmp42, i32 37
+  %tmp188 = insertelement <128 x float> %tmp187, float %tmp41, i32 38
+  %tmp189 = insertelement <128 x float> %tmp188, float %tmp39, i32 39
+  %tmp190 = insertelement <128 x float> %tmp189, float %tmp38, i32 40
+  %tmp191 = insertelement <128 x float> %tmp190, float %tmp37, i32 41
+  %tmp192 = insertelement <128 x float> %tmp191, float %tmp35, i32 42
+  %tmp193 = insertelement <128 x float> %tmp192, float %tmp34, i32 43
+  %tmp194 = insertelement <128 x float> %tmp193, float %tmp33, i32 44
+  %tmp195 = insertelement <128 x float> %tmp194, float %tmp31, i32 45
+  %tmp196 = insertelement <128 x float> %tmp195, float %tmp30, i32 46
+  %tmp197 = insertelement <128 x float> %tmp196, float %tmp29, i32 47
+  %tmp198 = insertelement <128 x float> %tmp197, float %tmp27, i32 48
+  %tmp199 = insertelement <128 x float> %tmp198, float %tmp26, i32 49
+  %tmp200 = insertelement <128 x float> %tmp199, float %tmp25, i32 50
+  %tmp201 = insertelement <128 x float> %tmp200, float %tmp23, i32 51
+  %tmp202 = insertelement <128 x float> %tmp201, float %tmp22, i32 52
+  %tmp203 = insertelement <128 x float> %tmp202, float %tmp21, i32 53
+  %tmp204 = insertelement <128 x float> %tmp203, float %tmp19, i32 54
+  %tmp205 = insertelement <128 x float> %tmp204, float %tmp18, i32 55
+  %tmp206 = insertelement <128 x float> %tmp205, float %tmp17, i32 56
+  %tmp207 = insertelement <128 x float> %tmp206, float %tmp15, i32 57
+  %tmp208 = insertelement <128 x float> %tmp207, float %tmp14, i32 58
+  %tmp209 = insertelement <128 x float> %tmp208, float %tmp13, i32 59
+  %tmp210 = insertelement <128 x float> %tmp209, float %tmp16, i32 60
+  %tmp211 = insertelement <128 x float> %tmp210, float %tmp20, i32 61
+  %tmp212 = insertelement <128 x float> %tmp211, float %tmp24, i32 62
+  %tmp213 = insertelement <128 x float> %tmp212, float %tmp28, i32 63
+  %tmp214 = insertelement <128 x float> %tmp213, float %tmp32, i32 64
+  %tmp215 = insertelement <128 x float> %tmp214, float %tmp36, i32 65
+  %tmp216 = insertelement <128 x float> %tmp215, float %tmp40, i32 66
+  %tmp217 = insertelement <128 x float> %tmp216, float %tmp44, i32 67
+  %tmp218 = insertelement <128 x float> %tmp217, float %tmp48, i32 68
+  %tmp219 = insertelement <128 x float> %tmp218, float %tmp52, i32 69
+  %tmp220 = insertelement <128 x float> %tmp219, float %tmp56, i32 70
+  %tmp221 = insertelement <128 x float> %tmp220, float %tmp60, i32 71
+  %tmp222 = insertelement <128 x float> %tmp221, float %tmp64, i32 72
+  %tmp223 = insertelement <128 x float> %tmp222, float %tmp68, i32 73
+  %tmp224 = insertelement <128 x float> %tmp223, float %tmp72, i32 74
+  %tmp225 = insertelement <128 x float> %tmp224, float %tmp76, i32 75
+  %tmp226 = insertelement <128 x float> %tmp225, float %tmp80, i32 76
+  %tmp227 = insertelement <128 x float> %tmp226, float %tmp84, i32 77
+  %tmp228 = insertelement <128 x float> %tmp227, float %tmp88, i32 78
+  %tmp229 = insertelement <128 x float> %tmp228, float %tmp92, i32 79
+  %tmp230 = insertelement <128 x float> %tmp229, float %tmp93, i32 80
+  %tmp231 = insertelement <128 x float> %tmp230, float %tmp94, i32 81
+  %tmp232 = insertelement <128 x float> %tmp231, float %tmp96, i32 82
+  %tmp233 = insertelement <128 x float> %tmp232, float %tmp97, i32 83
+  %tmp234 = insertelement <128 x float> %tmp233, float %tmp98, i32 84
+  %tmp235 = insertelement <128 x float> %tmp234, float %tmp99, i32 85
+  %tmp236 = insertelement <128 x float> %tmp235, float %tmp100, i32 86
+  %tmp237 = insertelement <128 x float> %tmp236, float %tmp101, i32 87
+  %tmp238 = insertelement <128 x float> %tmp237, float %tmp102, i32 88
+  %tmp239 = insertelement <128 x float> %tmp238, float %tmp103, i32 89
+  %tmp240 = insertelement <128 x float> %tmp239, float %tmp104, i32 90
+  %tmp241 = insertelement <128 x float> %tmp240, float %tmp105, i32 91
+  %tmp242 = insertelement <128 x float> %tmp241, float %tmp106, i32 92
+  %tmp243 = insertelement <128 x float> %tmp242, float %tmp107, i32 93
+  %tmp244 = insertelement <128 x float> %tmp243, float %tmp108, i32 94
+  %tmp245 = insertelement <128 x float> %tmp244, float %tmp109, i32 95
+  %tmp246 = insertelement <128 x float> %tmp245, float %tmp110, i32 96
+  %tmp247 = insertelement <128 x float> %tmp246, float %tmp111, i32 97
+  %tmp248 = insertelement <128 x float> %tmp247, float %tmp112, i32 98
+  %tmp249 = insertelement <128 x float> %tmp248, float %tmp113, i32 99
+  %tmp250 = insertelement <128 x float> %tmp249, float %tmp114, i32 100
+  %tmp251 = insertelement <128 x float> %tmp250, float %tmp115, i32 101
+  %tmp252 = insertelement <128 x float> %tmp251, float %tmp116, i32 102
+  %tmp253 = insertelement <128 x float> %tmp252, float %tmp117, i32 103
+  %tmp254 = insertelement <128 x float> %tmp253, float %tmp118, i32 104
+  %tmp255 = insertelement <128 x float> %tmp254, float %tmp119, i32 105
+  %tmp256 = insertelement <128 x float> %tmp255, float %tmp120, i32 106
+  %tmp257 = insertelement <128 x float> %tmp256, float %tmp121, i32 107
+  %tmp258 = insertelement <128 x float> %tmp257, float %tmp122, i32 108
+  %tmp259 = insertelement <128 x float> %tmp258, float %tmp123, i32 109
+  %tmp260 = insertelement <128 x float> %tmp259, float %tmp124, i32 110
+  %tmp261 = insertelement <128 x float> %tmp260, float %tmp125, i32 111
+  %tmp262 = insertelement <128 x float> %tmp261, float %tmp126, i32 112
+  %tmp263 = insertelement <128 x float> %tmp262, float %tmp127, i32 113
+  %tmp264 = insertelement <128 x float> %tmp263, float %tmp128, i32 114
+  %tmp265 = insertelement <128 x float> %tmp264, float %tmp129, i32 115
+  %tmp266 = insertelement <128 x float> %tmp265, float %tmp130, i32 116
+  %tmp267 = insertelement <128 x float> %tmp266, float %tmp131, i32 117
+  %tmp268 = insertelement <128 x float> %tmp267, float %tmp132, i32 118
+  %tmp269 = insertelement <128 x float> %tmp268, float %tmp133, i32 119
+  %tmp270 = insertelement <128 x float> %tmp269, float %tmp134, i32 120
+  %tmp271 = insertelement <128 x float> %tmp270, float %tmp135, i32 121
+  %tmp272 = insertelement <128 x float> %tmp271, float %tmp136, i32 122
+  %tmp273 = insertelement <128 x float> %tmp272, float %tmp137, i32 123
+  %tmp274 = insertelement <128 x float> %tmp273, float %tmp138, i32 124
+  %tmp275 = insertelement <128 x float> %tmp274, float %tmp139, i32 125
+  %tmp276 = insertelement <128 x float> %tmp275, float %tmp140, i32 126
+  %tmp277 = insertelement <128 x float> %tmp276, float %tmp141, i32 127
+  %tmp278 = insertelement <128 x float> %tmp277, float %tmp149, i32 %tmp146
+  %tmp279 = extractelement <128 x float> %tmp278, i32 0
+  %tmp280 = extractelement <128 x float> %tmp278, i32 1
+  %tmp281 = extractelement <128 x float> %tmp278, i32 2
+  %tmp282 = extractelement <128 x float> %tmp278, i32 3
+  %tmp283 = extractelement <128 x float> %tmp278, i32 4
+  %tmp284 = extractelement <128 x float> %tmp278, i32 5
+  %tmp285 = extractelement <128 x float> %tmp278, i32 6
+  %tmp286 = extractelement <128 x float> %tmp278, i32 7
+  %tmp287 = extractelement <128 x float> %tmp278, i32 8
+  %tmp288 = extractelement <128 x float> %tmp278, i32 9
+  %tmp289 = extractelement <128 x float> %tmp278, i32 10
+  %tmp290 = extractelement <128 x float> %tmp278, i32 11
+  %tmp291 = extractelement <128 x float> %tmp278, i32 12
+  %tmp292 = extractelement <128 x float> %tmp278, i32 13
+  %tmp293 = extractelement <128 x float> %tmp278, i32 14
+  %tmp294 = extractelement <128 x float> %tmp278, i32 15
+  %tmp295 = extractelement <128 x float> %tmp278, i32 16
+  %tmp296 = extractelement <128 x float> %tmp278, i32 17
+  %tmp297 = extractelement <128 x float> %tmp278, i32 18
+  %tmp298 = extractelement <128 x float> %tmp278, i32 19
+  %tmp299 = extractelement <128 x float> %tmp278, i32 20
+  %tmp300 = extractelement <128 x float> %tmp278, i32 21
+  %tmp301 = extractelement <128 x float> %tmp278, i32 22
+  %tmp302 = extractelement <128 x float> %tmp278, i32 23
+  %tmp303 = extractelement <128 x float> %tmp278, i32 24
+  %tmp304 = extractelement <128 x float> %tmp278, i32 25
+  %tmp305 = extractelement <128 x float> %tmp278, i32 26
+  %tmp306 = extractelement <128 x float> %tmp278, i32 27
+  %tmp307 = extractelement <128 x float> %tmp278, i32 28
+  %tmp308 = extractelement <128 x float> %tmp278, i32 29
+  %tmp309 = extractelement <128 x float> %tmp278, i32 30
+  %tmp310 = extractelement <128 x float> %tmp278, i32 31
+  %tmp311 = extractelement <128 x float> %tmp278, i32 32
+  %tmp312 = extractelement <128 x float> %tmp278, i32 33
+  %tmp313 = extractelement <128 x float> %tmp278, i32 34
+  %tmp314 = extractelement <128 x float> %tmp278, i32 35
+  %tmp315 = extractelement <128 x float> %tmp278, i32 36
+  %tmp316 = extractelement <128 x float> %tmp278, i32 37
+  %tmp317 = extractelement <128 x float> %tmp278, i32 38
+  %tmp318 = extractelement <128 x float> %tmp278, i32 39
+  %tmp319 = extractelement <128 x float> %tmp278, i32 40
+  %tmp320 = extractelement <128 x float> %tmp278, i32 41
+  %tmp321 = extractelement <128 x float> %tmp278, i32 42
+  %tmp322 = extractelement <128 x float> %tmp278, i32 43
+  %tmp323 = extractelement <128 x float> %tmp278, i32 44
+  %tmp324 = extractelement <128 x float> %tmp278, i32 45
+  %tmp325 = extractelement <128 x float> %tmp278, i32 46
+  %tmp326 = extractelement <128 x float> %tmp278, i32 47
+  %tmp327 = extractelement <128 x float> %tmp278, i32 48
+  %tmp328 = extractelement <128 x float> %tmp278, i32 49
+  %tmp329 = extractelement <128 x float> %tmp278, i32 50
+  %tmp330 = extractelement <128 x float> %tmp278, i32 51
+  %tmp331 = extractelement <128 x float> %tmp278, i32 52
+  %tmp332 = extractelement <128 x float> %tmp278, i32 53
+  %tmp333 = extractelement <128 x float> %tmp278, i32 54
+  %tmp334 = extractelement <128 x float> %tmp278, i32 55
+  %tmp335 = extractelement <128 x float> %tmp278, i32 56
+  %tmp336 = extractelement <128 x float> %tmp278, i32 57
+  %tmp337 = extractelement <128 x float> %tmp278, i32 58
+  %tmp338 = extractelement <128 x float> %tmp278, i32 59
+  %tmp339 = extractelement <128 x float> %tmp278, i32 60
+  %tmp340 = extractelement <128 x float> %tmp278, i32 61
+  %tmp341 = extractelement <128 x float> %tmp278, i32 62
+  %tmp342 = extractelement <128 x float> %tmp278, i32 63
+  %tmp343 = extractelement <128 x float> %tmp278, i32 64
+  %tmp344 = extractelement <128 x float> %tmp278, i32 65
+  %tmp345 = extractelement <128 x float> %tmp278, i32 66
+  %tmp346 = extractelement <128 x float> %tmp278, i32 67
+  %tmp347 = extractelement <128 x float> %tmp278, i32 68
+  %tmp348 = extractelement <128 x float> %tmp278, i32 69
+  %tmp349 = extractelement <128 x float> %tmp278, i32 70
+  %tmp350 = extractelement <128 x float> %tmp278, i32 71
+  %tmp351 = extractelement <128 x float> %tmp278, i32 72
+  %tmp352 = extractelement <128 x float> %tmp278, i32 73
+  %tmp353 = extractelement <128 x float> %tmp278, i32 74
+  %tmp354 = extractelement <128 x float> %tmp278, i32 75
+  %tmp355 = extractelement <128 x float> %tmp278, i32 76
+  %tmp356 = extractelement <128 x float> %tmp278, i32 77
+  %tmp357 = extractelement <128 x float> %tmp278, i32 78
+  %tmp358 = extractelement <128 x float> %tmp278, i32 79
+  %tmp359 = extractelement <128 x float> %tmp278, i32 80
+  %tmp360 = extractelement <128 x float> %tmp278, i32 81
+  %tmp361 = extractelement <128 x float> %tmp278, i32 82
+  %tmp362 = extractelement <128 x float> %tmp278, i32 83
+  %tmp363 = extractelement <128 x float> %tmp278, i32 84
+  %tmp364 = extractelement <128 x float> %tmp278, i32 85
+  %tmp365 = extractelement <128 x float> %tmp278, i32 86
+  %tmp366 = extractelement <128 x float> %tmp278, i32 87
+  %tmp367 = extractelement <128 x float> %tmp278, i32 88
+  %tmp368 = extractelement <128 x float> %tmp278, i32 89
+  %tmp369 = extractelement <128 x float> %tmp278, i32 90
+  %tmp370 = extractelement <128 x float> %tmp278, i32 91
+  %tmp371 = extractelement <128 x float> %tmp278, i32 92
+  %tmp372 = extractelement <128 x float> %tmp278, i32 93
+  %tmp373 = extractelement <128 x float> %tmp278, i32 94
+  %tmp374 = extractelement <128 x float> %tmp278, i32 95
+  %tmp375 = extractelement <128 x float> %tmp278, i32 96
+  %tmp376 = extractelement <128 x float> %tmp278, i32 97
+  %tmp377 = extractelement <128 x float> %tmp278, i32 98
+  %tmp378 = extractelement <128 x float> %tmp278, i32 99
+  %tmp379 = extractelement <128 x float> %tmp278, i32 100
+  %tmp380 = extractelement <128 x float> %tmp278, i32 101
+  %tmp381 = extractelement <128 x float> %tmp278, i32 102
+  %tmp382 = extractelement <128 x float> %tmp278, i32 103
+  %tmp383 = extractelement <128 x float> %tmp278, i32 104
+  %tmp384 = extractelement <128 x float> %tmp278, i32 105
+  %tmp385 = extractelement <128 x float> %tmp278, i32 106
+  %tmp386 = extractelement <128 x float> %tmp278, i32 107
+  %tmp387 = extractelement <128 x float> %tmp278, i32 108
+  %tmp388 = extractelement <128 x float> %tmp278, i32 109
+  %tmp389 = extractelement <128 x float> %tmp278, i32 110
+  %tmp390 = extractelement <128 x float> %tmp278, i32 111
+  %tmp391 = extractelement <128 x float> %tmp278, i32 112
+  %tmp392 = extractelement <128 x float> %tmp278, i32 113
+  %tmp393 = extractelement <128 x float> %tmp278, i32 114
+  %tmp394 = extractelement <128 x float> %tmp278, i32 115
+  %tmp395 = extractelement <128 x float> %tmp278, i32 116
+  %tmp396 = extractelement <128 x float> %tmp278, i32 117
+  %tmp397 = extractelement <128 x float> %tmp278, i32 118
+  %tmp398 = extractelement <128 x float> %tmp278, i32 119
+  %tmp399 = extractelement <128 x float> %tmp278, i32 120
+  %tmp400 = extractelement <128 x float> %tmp278, i32 121
+  %tmp401 = extractelement <128 x float> %tmp278, i32 122
+  %tmp402 = extractelement <128 x float> %tmp278, i32 123
+  %tmp403 = extractelement <128 x float> %tmp278, i32 124
+  %tmp404 = extractelement <128 x float> %tmp278, i32 125
+  %tmp405 = extractelement <128 x float> %tmp278, i32 126
+  %tmp406 = extractelement <128 x float> %tmp278, i32 127
+  %tmp407 = bitcast float %tmp95 to i32
+  %tmp408 = add i32 %tmp407, 1
+  %tmp409 = bitcast i32 %tmp408 to float
+  br label %bb12
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
index 5ce65371b01c..ef492cef5767 100644
--- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -1,7 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling < %s | FileCheck %s
-
-; FIXME: Enable -verify-instructions
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; This ends up using all 255 registers and requires register
 ; scavenging which will fail to find an unsued register.
@@ -11,9 +9,19 @@
 
 ; FIXME: The same register is initialized to 0 for every spill.
 
-; CHECK-LABEL: {{^}}main:
-; CHECK: NumVgprs: 256
-; CHECK: ScratchSize: 1024
+; GCN-LABEL: {{^}}main:
+
+; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GCN-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GCN-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s11, 0x80f000
+; VI-NEXT: s_mov_b32 s11, 0x800000
+
+; s12 is offset user SGPR
+; GCN: buffer_store_dword {{v[0-9]+}}, s[8:11], s12 offset:{{[0-9]+}} ; 4-byte Folded Spill
+
+; GCN: NumVgprs: 256
+; GCN: ScratchSize: 1024
 
 define void @main([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <16 x i8>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 {
 bb:

From d4a0a430cc9eca9157aed5751616515db30b8b39 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 30 Nov 2015 21:15:57 +0000
Subject: [PATCH 020/186] AMDGPU: Rename enums to be consistent with HSA code
 object terminology

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254330 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp |  4 ++--
 lib/Target/AMDGPU/SIISelLowering.cpp     | 26 +++++++++-------------
 lib/Target/AMDGPU/SIInstrInfo.cpp        | 19 +++++++++-------
 lib/Target/AMDGPU/SIRegisterInfo.cpp     | 28 ++++++++++++------------
 lib/Target/AMDGPU/SIRegisterInfo.h       | 22 ++++++++++---------
 5 files changed, 49 insertions(+), 50 deletions(-)

diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 85a06882ffe3..aa3909060e52 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1066,8 +1066,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
       static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
 
-  unsigned ScratchOffsetReg
-    = TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
+  unsigned ScratchOffsetReg = TRI->getPreloadedValue(
+    MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
   Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
   SOffset = CurDAG->getRegister(ScratchOffsetReg, MVT::i32);
 
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 51cbc95bc07c..de52b483bbbc 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -514,7 +514,7 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
   MachineFunction &MF = DAG.getMachineFunction();
   const SIRegisterInfo *TRI =
       static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
-  unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR);
+  unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
 
   Type *Ty = VT.getTypeForEVT(*DAG.getContext());
 
@@ -628,7 +628,7 @@ SDValue SITargetLowering::LowerFormalArguments(
       Info->NumUserSGPRs += 4;
 
     unsigned InputPtrReg =
-        TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR);
+        TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
     unsigned InputPtrRegLo =
         TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0);
     unsigned InputPtrRegHi =
@@ -641,14 +641,8 @@ SDValue SITargetLowering::LowerFormalArguments(
     const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
     if (MFI->hasDispatchPtr()) {
-      unsigned DispatchPtrReg =
-        TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR);
-      unsigned DispatchPtrRegLo =
-        TRI->getPhysRegSubReg(DispatchPtrReg, &AMDGPU::SReg_32RegClass, 0);
-      unsigned DispatchPtrRegHi =
-        TRI->getPhysRegSubReg(DispatchPtrReg, &AMDGPU::SReg_32RegClass, 1);
-      CCInfo.AllocateReg(DispatchPtrRegLo);
-      CCInfo.AllocateReg(DispatchPtrRegHi);
+      unsigned DispatchPtrReg
+        = TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR);
       MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass);
     }
   }
@@ -1110,22 +1104,22 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                   getImplicitParameterOffset(MFI, GRID_DIM));
   case Intrinsic::r600_read_tgid_x:
     return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
-      TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT);
+      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT);
   case Intrinsic::r600_read_tgid_y:
     return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
-      TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Y), VT);
+      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT);
   case Intrinsic::r600_read_tgid_z:
     return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
-      TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Z), VT);
+      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT);
   case Intrinsic::r600_read_tidig_x:
     return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
-      TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT);
+      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT);
   case Intrinsic::r600_read_tidig_y:
     return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
-      TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT);
+      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT);
   case Intrinsic::r600_read_tidig_z:
     return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
-      TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT);
+      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT);
   case AMDGPUIntrinsic::SI_load_const: {
     SDValue Ops[] = {
       Op.getOperand(1),
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index b7d2a4712759..5350edbd74e2 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -551,8 +551,8 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
 
   assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
 
-  unsigned ScratchOffsetPreloadReg
-    = RI.getPreloadedValue(*MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
+  unsigned ScratchOffsetPreloadReg = RI.getPreloadedValue(
+    *MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
 
   unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize());
   MFI->setHasSpilledVGPRs();
@@ -638,8 +638,8 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
 
   assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
 
-  unsigned ScratchOffsetPreloadReg
-    = RI.getPreloadedValue(*MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
+  unsigned ScratchOffsetPreloadReg = RI.getPreloadedValue(
+    *MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
 
   unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize());
   BuildMI(MBB, MI, DL, get(Opcode), DestReg)
@@ -678,11 +678,14 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
     if (MFI->getShaderType() == ShaderType::COMPUTE &&
         WorkGroupSize > WavefrontSize) {
 
-      unsigned TIDIGXReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_X);
-      unsigned TIDIGYReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Y);
-      unsigned TIDIGZReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Z);
+      unsigned TIDIGXReg
+        = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X);
+      unsigned TIDIGYReg
+        = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y);
+      unsigned TIDIGZReg
+        = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z);
       unsigned InputPtrReg =
-          TRI->getPreloadedValue(*MF, SIRegisterInfo::INPUT_PTR);
+          TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
       for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
         if (!Entry.isLiveIn(Reg))
           Entry.addLiveIn(Reg);
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index b392c86fa2e1..68629cd50995 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -72,7 +72,7 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
   if (ScratchRSrcReg != AMDGPU::NoRegister) {
     unsigned ScratchOffsetPreloadReg
-      = getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
+      = getPreloadedValue(MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
     // We will need to use this user SGPR argument for spilling, and thus never
     // want it to be spilled.
     reserveRegisterTuples(Reserved, ScratchOffsetPreloadReg);
@@ -532,30 +532,30 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
   const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   switch (Value) {
-  case SIRegisterInfo::TGID_X:
+  case SIRegisterInfo::WORKGROUP_ID_X:
     return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0);
-  case SIRegisterInfo::TGID_Y:
+  case SIRegisterInfo::WORKGROUP_ID_Y:
     return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1);
-  case SIRegisterInfo::TGID_Z:
+  case SIRegisterInfo::WORKGROUP_ID_Z:
     return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2);
-  case SIRegisterInfo::SCRATCH_WAVE_OFFSET:
+  case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
     if (MFI->getShaderType() != ShaderType::COMPUTE)
       return MFI->ScratchOffsetReg;
     return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4);
-  case SIRegisterInfo::SCRATCH_PTR:
-    return AMDGPU::SGPR2_SGPR3;
-  case SIRegisterInfo::INPUT_PTR:
-    if (ST.isAmdHsaOS())
-      return MFI->hasDispatchPtr() ? AMDGPU::SGPR2_SGPR3 : AMDGPU::SGPR0_SGPR1;
-    return AMDGPU::SGPR0_SGPR1;
+  case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER:
+    llvm_unreachable("currently unused");
+  case SIRegisterInfo::KERNARG_SEGMENT_PTR:
+    return ST.isAmdHsaOS() ? AMDGPU::SGPR2_SGPR3 : AMDGPU::SGPR0_SGPR1;
   case SIRegisterInfo::DISPATCH_PTR:
     assert(MFI->hasDispatchPtr());
     return AMDGPU::SGPR0_SGPR1;
-  case SIRegisterInfo::TIDIG_X:
+  case SIRegisterInfo::QUEUE_PTR:
+    llvm_unreachable("not implemented");
+  case SIRegisterInfo::WORKITEM_ID_X:
     return AMDGPU::VGPR0;
-  case SIRegisterInfo::TIDIG_Y:
+  case SIRegisterInfo::WORKITEM_ID_Y:
     return AMDGPU::VGPR1;
-  case SIRegisterInfo::TIDIG_Z:
+  case SIRegisterInfo::WORKITEM_ID_Z:
     return AMDGPU::VGPR2;
   }
   llvm_unreachable("unexpected preloaded value type");
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h
index 36f6d1c7a261..bb93242b0f88 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -93,23 +93,25 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
 
   /// \returns True if operands defined with this operand type can accept
   /// an inline constant. i.e. An integer value in the range (-16, 64) or
-  /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. 
+  /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f.
   bool opCanUseInlineConstant(unsigned OpType) const;
 
   enum PreloadedValue {
     // SGPRS:
-    SCRATCH_PTR         =  0,
+    PRIVATE_SEGMENT_BUFFER =  0,
     DISPATCH_PTR        =  1,
-    INPUT_PTR           =  3,
-    TGID_X              = 10,
-    TGID_Y              = 11,
-    TGID_Z              = 12,
-    SCRATCH_WAVE_OFFSET = 14,
+    QUEUE_PTR           =  2,
+    KERNARG_SEGMENT_PTR =  3,
+    WORKGROUP_ID_X      = 10,
+    WORKGROUP_ID_Y      = 11,
+    WORKGROUP_ID_Z      = 12,
+    PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
+
     // VGPRS:
     FIRST_VGPR_VALUE    = 15,
-    TIDIG_X             = FIRST_VGPR_VALUE,
-    TIDIG_Y             = 16,
-    TIDIG_Z             = 17,
+    WORKITEM_ID_X       = FIRST_VGPR_VALUE,
+    WORKITEM_ID_Y       = 16,
+    WORKITEM_ID_Z       = 17
   };
 
   /// \brief Returns the physical register that \p Value is stored in.

From 0f1b95f8188f2c2ce4a93394ca185c707463c75e Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 30 Nov 2015 21:16:03 +0000
Subject: [PATCH 021/186] AMDGPU: Rework how private buffer passed for HSA

If we know we have stack objects, we reserve the registers
that the private buffer resource and wave offset are passed
and use them directly.

If not, reserve the last 5 SGPRs just in case we need to spill.
After register allocation, try to pick the next available registers
instead of the last SGPRs, and then insert copies from the inputs
to the reserved registers in the progloue.

This also only selectively enables all of the input registers
which are really required instead of always enabling them.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254331 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp        |  66 +++-
 lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp      |   6 +-
 lib/Target/AMDGPU/SIFrameLowering.cpp         | 168 +++++++++-
 lib/Target/AMDGPU/SIISelLowering.cpp          | 160 ++++++++--
 lib/Target/AMDGPU/SIInstrInfo.cpp             |  14 +-
 lib/Target/AMDGPU/SIMachineFunctionInfo.cpp   |  80 ++++-
 lib/Target/AMDGPU/SIMachineFunctionInfo.h     | 113 ++++++-
 lib/Target/AMDGPU/SIRegisterInfo.cpp          |  80 +++--
 lib/Target/AMDGPU/SIRegisterInfo.h            |   9 +
 test/CodeGen/AMDGPU/hsa.ll                    |   4 +-
 test/CodeGen/AMDGPU/large-alloca-compute.ll   |  43 ++-
 test/CodeGen/AMDGPU/large-alloca-graphics.ll  |   8 +-
 .../AMDGPU/llvm.AMDGPU.read.workdim.ll        |  37 +++
 .../AMDGPU/llvm.amdgcn.dispatch.ptr.ll        |   2 +-
 test/CodeGen/AMDGPU/llvm.dbg.value.ll         |   4 +-
 .../AMDGPU/llvm.r600.read.local.size.ll       | 184 +++++++++++
 .../AMDGPU/local-memory-two-objects.ll        |   2 +-
 test/CodeGen/AMDGPU/local-memory.ll           |   4 +-
 ...vgpr-spill-emergency-stack-slot-compute.ll |   8 +-
 .../AMDGPU/vgpr-spill-emergency-stack-slot.ll |  12 +-
 test/CodeGen/AMDGPU/work-item-intrinsics.ll   | 293 ++++++++----------
 21 files changed, 1000 insertions(+), 297 deletions(-)
 create mode 100644 test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll
 create mode 100644 test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll

diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 314ef721c1fc..b96c48f0561d 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -452,18 +452,27 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
       S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
       S_00B848_PRIV(ProgInfo.Priv) |
       S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
-      S_00B848_IEEE_MODE(ProgInfo.DebugMode) |
+      S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
       S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
 
+  // 0 = X, 1 = XY, 2 = XYZ
+  unsigned TIDIGCompCnt = 0;
+  if (MFI->hasWorkItemIDZ())
+    TIDIGCompCnt = 2;
+  else if (MFI->hasWorkItemIDY())
+    TIDIGCompCnt = 1;
+
   ProgInfo.ComputePGMRSrc2 =
       S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
-      S_00B84C_USER_SGPR(MFI->NumUserSGPRs) |
-      S_00B84C_TGID_X_EN(1) |
-      S_00B84C_TGID_Y_EN(1) |
-      S_00B84C_TGID_Z_EN(1) |
-      S_00B84C_TG_SIZE_EN(1) |
-      S_00B84C_TIDIG_COMP_CNT(2) |
-      S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks);
+      S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
+      S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
+      S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
+      S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
+      S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
+      S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
+      S_00B84C_EXCP_EN_MSB(0) |
+      S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks) |
+      S_00B84C_EXCP_EN(0);
 }
 
 static unsigned getRsrcReg(unsigned ShaderType) {
@@ -524,9 +533,44 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
   header.compute_pgm_resource_registers =
       KernelInfo.ComputePGMRSrc1 |
       (KernelInfo.ComputePGMRSrc2 << 32);
-  header.code_properties =
-      AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR |
-      AMD_CODE_PROPERTY_IS_PTR64;
+  header.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
+
+  if (MFI->hasPrivateSegmentBuffer()) {
+    header.code_properties |=
+      AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
+  }
+
+  if (MFI->hasDispatchPtr())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
+
+  if (MFI->hasQueuePtr())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
+
+  if (MFI->hasKernargSegmentPtr())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
+
+  if (MFI->hasDispatchID())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
+
+  if (MFI->hasFlatScratchInit())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
+
+  // TODO: Private segment size
+
+  if (MFI->hasGridWorkgroupCountX()) {
+    header.code_properties |=
+      AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X;
+  }
+
+  if (MFI->hasGridWorkgroupCountY()) {
+    header.code_properties |=
+      AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y;
+  }
+
+  if (MFI->hasGridWorkgroupCountZ()) {
+    header.code_properties |=
+      AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z;
+  }
 
   if (MFI->hasDispatchPtr())
     header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index aa3909060e52..6aa4fddd3ec4 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1062,14 +1062,10 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
 
   SDLoc DL(Addr);
   MachineFunction &MF = CurDAG->getMachineFunction();
-  const SIRegisterInfo *TRI =
-      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
 
-  unsigned ScratchOffsetReg = TRI->getPreloadedValue(
-    MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
   Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
-  SOffset = CurDAG->getRegister(ScratchOffsetReg, MVT::i32);
+  SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32);
 
   // (add n0, c1)
   if (CurDAG->isBaseWithConstantOffset(Addr)) {
diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp
index 6aff4b5700d4..6b3c81c3af74 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -36,6 +36,16 @@ static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo,
   return true;
 }
 
+static ArrayRef<MCPhysReg> getAllSGPR128() {
+  return makeArrayRef(AMDGPU::SReg_128RegClass.begin(),
+                      AMDGPU::SReg_128RegClass.getNumRegs());
+}
+
+static ArrayRef<MCPhysReg> getAllSGPRs() {
+  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
+                      AMDGPU::SGPR_32RegClass.getNumRegs());
+}
+
 void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                    MachineBasicBlock &MBB) const {
   if (!MF.getFrameInfo()->hasStackObjects())
@@ -43,7 +53,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
 
   assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
 
-  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
   // If we only have SGPR spills, we won't actually be using scratch memory
   // since these spill to VGPRs.
@@ -56,31 +66,159 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
   const SIInstrInfo *TII =
       static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
 
   // We need to insert initialization of the scratch resource descriptor.
   unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
   assert(ScratchRsrcReg != AMDGPU::NoRegister);
 
-  uint64_t Rsrc23 = TII->getScratchRsrcWords23();
+  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
+  assert(ScratchWaveOffsetReg != AMDGPU::NoRegister);
+
+  unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
+    MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+
+  unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
+  if (ST.isAmdHsaOS()) {
+    PreloadedPrivateBufferReg = TRI->getPreloadedValue(
+      MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+  }
+
+  // If we reserved the original input registers, we don't need to copy to the
+  // reserved registers.
+  if (ScratchRsrcReg == PreloadedPrivateBufferReg) {
+    // We should always reserve these 5 registers at the same time.
+    assert(ScratchWaveOffsetReg == PreloadedScratchWaveOffsetReg &&
+           "scratch wave offset and private segment buffer inconsistent");
+    return;
+  }
+
+
+  // We added live-ins during argument lowering, but since they were not used
+  // they were deleted. We're adding the uses now, so add them back.
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
+  MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
+
+  if (ST.isAmdHsaOS()) {
+    MRI.addLiveIn(PreloadedPrivateBufferReg);
+    MBB.addLiveIn(PreloadedPrivateBufferReg);
+  }
+
+  // We reserved the last registers for this. Shift it down to the end of those
+  // which were actually used.
+  //
+  // FIXME: It might be safer to use a pseudoregister before replacement.
+
+  // FIXME: We should be able to eliminate unused input registers. We only
+  // cannot do this for the resources required for scratch access. For now we
+  // skip over user SGPRs and may leave unused holes.
+
+  // We find the resource first because it has an alignment requirement.
+  if (ScratchRsrcReg == TRI->reservedPrivateSegmentBufferReg(MF)) {
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+
+    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4;
+    // Skip the last 2 elements because the last one is reserved for VCC, and
+    // this is the 2nd to last element already.
+    for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) {
+      // Pick the first unallocated one. Make sure we don't clobber the other
+      // reserved input we needed.
+      if (!MRI.isPhysRegUsed(Reg)) {
+        assert(MRI.isAllocatable(Reg));
+        MRI.replaceRegWith(ScratchRsrcReg, Reg);
+        ScratchRsrcReg = Reg;
+        MFI->setScratchRSrcReg(ScratchRsrcReg);
+        break;
+      }
+    }
+  }
+
+  if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) {
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+    // Skip the last 2 elements because the last one is reserved for VCC, and
+    // this is the 2nd to last element already.
+    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
+    for (MCPhysReg Reg : getAllSGPRs().drop_back(6).slice(NumPreloaded)) {
+      // Pick the first unallocated SGPR. Be careful not to pick an alias of the
+      // scratch descriptor, since we haven’t added its uses yet.
+      if (!MRI.isPhysRegUsed(Reg)) {
+        assert(MRI.isAllocatable(Reg) &&
+               !TRI->isSubRegisterEq(ScratchRsrcReg, Reg));
+
+        MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
+        ScratchWaveOffsetReg = Reg;
+        MFI->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
+        break;
+      }
+    }
+  }
+
+
+  assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg));
+
+  const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
   MachineBasicBlock::iterator I = MBB.begin();
   DebugLoc DL;
 
-  unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
-  unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
-  unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
-  unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
-
-  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0)
-    .addExternalSymbol("SCRATCH_RSRC_DWORD0");
+  if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
+    // Make sure we emit the copy for the offset first. We may have chosen to copy
+    // the buffer resource into a register that aliases the input offset register.
+    BuildMI(MBB, I, DL, SMovB32, ScratchWaveOffsetReg)
+      .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
+  }
 
-  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1)
-    .addExternalSymbol("SCRATCH_RSRC_DWORD1");
+  if (ST.isAmdHsaOS()) {
+    // Insert copies from argument register.
+    assert(
+      !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchRsrcReg) &&
+      !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchWaveOffsetReg));
+
+    unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
+    unsigned Rsrc23 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3);
+
+    unsigned Lo = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub0_sub1);
+    unsigned Hi = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub2_sub3);
+
+    const MCInstrDesc &SMovB64 = TII->get(AMDGPU::S_MOV_B64);
+
+    BuildMI(MBB, I, DL, SMovB64, Rsrc01)
+      .addReg(Lo, RegState::Kill);
+    BuildMI(MBB, I, DL, SMovB64, Rsrc23)
+      .addReg(Hi, RegState::Kill);
+  } else {
+    unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+    unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
+    unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
+    unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
+
+    // Use relocations to get the pointer, and setup the other bits manually.
+    uint64_t Rsrc23 = TII->getScratchRsrcWords23();
+    BuildMI(MBB, I, DL, SMovB32, Rsrc0)
+      .addExternalSymbol("SCRATCH_RSRC_DWORD0")
+      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+    BuildMI(MBB, I, DL, SMovB32, Rsrc1)
+      .addExternalSymbol("SCRATCH_RSRC_DWORD1")
+      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
+      .addImm(Rsrc23 & 0xffffffff)
+      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
+      .addImm(Rsrc23 >> 32)
+      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+  }
 
-  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2)
-    .addImm(Rsrc23 & 0xffffffff);
+  // Make the register selected live throughout the function.
+  for (MachineBasicBlock &OtherBB : MF) {
+    if (&OtherBB == &MBB)
+      continue;
 
-  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3)
-    .addImm(Rsrc23 >> 32);
+    OtherBB.addLiveIn(ScratchRsrcReg);
+    OtherBB.addLiveIn(ScratchWaveOffsetReg);
+  }
 }
 
 void SIFrameLowering::processFunctionBeforeFrameFinalized(
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index de52b483bbbc..05b54d0bae51 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -542,6 +542,11 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
                      Align); // Alignment
 }
 
+static ArrayRef<MCPhysReg> getAllSGPRs() {
+  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
+                      AMDGPU::SGPR_32RegClass.getNumRegs());
+}
+
 SDValue SITargetLowering::LowerFormalArguments(
     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
     const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
@@ -619,37 +624,28 @@ SDValue SITargetLowering::LowerFormalArguments(
     CCInfo.AllocateReg(AMDGPU::VGPR1);
   }
 
-  // The pointer to the list of arguments is stored in SGPR0, SGPR1
-  // The pointer to the scratch buffer is stored in SGPR2, SGPR3
   if (Info->getShaderType() == ShaderType::COMPUTE) {
-    if (Subtarget->isAmdHsaOS())
-      Info->NumUserSGPRs += 4;  // FIXME: Need to support scratch buffers.
-    else
-      Info->NumUserSGPRs += 4;
-
-    unsigned InputPtrReg =
-        TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
-    unsigned InputPtrRegLo =
-        TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0);
-    unsigned InputPtrRegHi =
-        TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1);
-
-    CCInfo.AllocateReg(InputPtrRegLo);
-    CCInfo.AllocateReg(InputPtrRegHi);
-    MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
+    getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
+                            Splits);
+  }
 
-    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
+  if (Info->hasPrivateSegmentBuffer()) {
+    unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
+    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass);
+    CCInfo.AllocateReg(PrivateSegmentBufferReg);
+  }
 
-    if (MFI->hasDispatchPtr()) {
-      unsigned DispatchPtrReg
-        = TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR);
-      MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass);
-    }
+  if (Info->hasDispatchPtr()) {
+    unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI);
+    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass);
+    CCInfo.AllocateReg(DispatchPtrReg);
   }
 
-  if (Info->getShaderType() == ShaderType::COMPUTE) {
-    getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
-                            Splits);
+  if (Info->hasKernargSegmentPtr()) {
+    unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
+    MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
+    CCInfo.AllocateReg(InputPtrReg);
   }
 
   AnalyzeFormalArguments(CCInfo, Splits);
@@ -739,14 +735,114 @@ SDValue SITargetLowering::LowerFormalArguments(
     InVals.push_back(Val);
   }
 
-  if (Info->getShaderType() != ShaderType::COMPUTE) {
-    unsigned ScratchIdx = CCInfo.getFirstUnallocated(makeArrayRef(
-        AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs()));
-    Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx);
+  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
+  // these from the dispatch pointer.
+
+  // Start adding system SGPRs.
+  if (Info->hasWorkGroupIDX()) {
+    unsigned Reg = Info->addWorkGroupIDX();
+    MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+    CCInfo.AllocateReg(Reg);
+  } else
+    llvm_unreachable("work group id x is always enabled");
+
+  if (Info->hasWorkGroupIDY()) {
+    unsigned Reg = Info->addWorkGroupIDY();
+    MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+    CCInfo.AllocateReg(Reg);
+  }
+
+  if (Info->hasWorkGroupIDZ()) {
+    unsigned Reg = Info->addWorkGroupIDZ();
+    MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+    CCInfo.AllocateReg(Reg);
   }
 
-  if (MF.getFrameInfo()->hasStackObjects() || ST.isVGPRSpillingEnabled(Info))
-    Info->setScratchRSrcReg(TRI);
+  if (Info->hasWorkGroupInfo()) {
+    unsigned Reg = Info->addWorkGroupInfo();
+    MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+    CCInfo.AllocateReg(Reg);
+  }
+
+  if (Info->hasPrivateSegmentWaveByteOffset()) {
+    // Scratch wave offset passed in system SGPR.
+    unsigned PrivateSegmentWaveByteOffsetReg
+      = Info->addPrivateSegmentWaveByteOffset();
+
+    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
+    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
+  }
+
+  // Now that we've figured out where the scratch register inputs are, see if
+  // should reserve the arguments and use them directly.
+
+  bool HasStackObjects = MF.getFrameInfo()->hasStackObjects();
+
+  if (ST.isAmdHsaOS()) {
+    // TODO: Assume we will spill without optimizations.
+    if (HasStackObjects) {
+      // If we have stack objects, we unquestionably need the private buffer
+      // resource. For the HSA ABI, this will be the first 4 user SGPR
+      // inputs. We can reserve those and use them directly.
+
+      unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue(
+        MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+      Info->setScratchRSrcReg(PrivateSegmentBufferReg);
+
+      unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue(
+        MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+      Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
+    } else {
+      unsigned ReservedBufferReg
+        = TRI->reservedPrivateSegmentBufferReg(MF);
+      unsigned ReservedOffsetReg
+        = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
+
+      // We tentatively reserve the last registers (skipping the last two
+      // which may contain VCC). After register allocation, we'll replace
+      // these with the ones immediately after those which were really
+      // allocated. In the prologue copies will be inserted from the argument
+      // to these reserved registers.
+      Info->setScratchRSrcReg(ReservedBufferReg);
+      Info->setScratchWaveOffsetReg(ReservedOffsetReg);
+    }
+  } else {
+    unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF);
+
+    // Without HSA, relocations are used for the scratch pointer and the
+    // buffer resource setup is always inserted in the prologue. Scratch wave
+    // offset is still in an input SGPR.
+    Info->setScratchRSrcReg(ReservedBufferReg);
+
+    if (HasStackObjects) {
+      unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue(
+        MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+      Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
+    } else {
+      unsigned ReservedOffsetReg
+        = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
+      Info->setScratchWaveOffsetReg(ReservedOffsetReg);
+    }
+  }
+
+  if (Info->hasWorkItemIDX()) {
+    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
+    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+    CCInfo.AllocateReg(Reg);
+  } else
+    llvm_unreachable("workitem id x should always be enabled");
+
+  if (Info->hasWorkItemIDY()) {
+    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
+    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+    CCInfo.AllocateReg(Reg);
+  }
+
+  if (Info->hasWorkItemIDZ()) {
+    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
+    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+    CCInfo.AllocateReg(Reg);
+  }
 
   if (Chains.empty())
     return Chain;
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 5350edbd74e2..e1668649139d 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -551,16 +551,13 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
 
   assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
 
-  unsigned ScratchOffsetPreloadReg = RI.getPreloadedValue(
-    *MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
-
   unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize());
   MFI->setHasSpilledVGPRs();
   BuildMI(MBB, MI, DL, get(Opcode))
     .addReg(SrcReg)                   // src
     .addFrameIndex(FrameIndex)        // frame_idx
-    .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
-    .addReg(ScratchOffsetPreloadReg)  // scratch_offset
+    .addReg(MFI->getScratchRSrcReg())       // scratch_rsrc
+    .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
     .addMemOperand(MMO);
 }
 
@@ -638,14 +635,11 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
 
   assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
 
-  unsigned ScratchOffsetPreloadReg = RI.getPreloadedValue(
-    *MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
-
   unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize());
   BuildMI(MBB, MI, DL, get(Opcode), DestReg)
     .addFrameIndex(FrameIndex)        // frame_idx
-    .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
-    .addReg(ScratchOffsetPreloadReg)  // scratch_offset
+    .addReg(MFI->getScratchRSrcReg())       // scratch_rsrc
+    .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
     .addMemOperand(MMO);
 }
 
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index d042844aa138..935aad427198 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -30,15 +30,33 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   : AMDGPUMachineFunction(MF),
     TIDReg(AMDGPU::NoRegister),
     ScratchRSrcReg(AMDGPU::NoRegister),
+    ScratchWaveOffsetReg(AMDGPU::NoRegister),
+    PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister),
+    DispatchPtrUserSGPR(AMDGPU::NoRegister),
+    QueuePtrUserSGPR(AMDGPU::NoRegister),
+    KernargSegmentPtrUserSGPR(AMDGPU::NoRegister),
+    DispatchIDUserSGPR(AMDGPU::NoRegister),
+    FlatScratchInitUserSGPR(AMDGPU::NoRegister),
+    PrivateSegmentSizeUserSGPR(AMDGPU::NoRegister),
+    GridWorkGroupCountXUserSGPR(AMDGPU::NoRegister),
+    GridWorkGroupCountYUserSGPR(AMDGPU::NoRegister),
+    GridWorkGroupCountZUserSGPR(AMDGPU::NoRegister),
+    WorkGroupIDXSystemSGPR(AMDGPU::NoRegister),
+    WorkGroupIDYSystemSGPR(AMDGPU::NoRegister),
+    WorkGroupIDZSystemSGPR(AMDGPU::NoRegister),
+    WorkGroupInfoSystemSGPR(AMDGPU::NoRegister),
+    PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
     LDSWaveSpillSize(0),
     PSInputAddr(0),
     NumUserSGPRs(0),
+    NumSystemSGPRs(0),
     HasSpilledSGPRs(false),
     HasSpilledVGPRs(false),
+    PrivateSegmentBuffer(false),
     DispatchPtr(false),
     QueuePtr(false),
     DispatchID(false),
-    KernargSegmentPtr(true),
+    KernargSegmentPtr(false),
     FlatScratchInit(false),
     GridWorkgroupCountX(false),
     GridWorkgroupCountY(false),
@@ -47,13 +65,17 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
     WorkGroupIDY(false),
     WorkGroupIDZ(false),
     WorkGroupInfo(false),
+    PrivateSegmentWaveByteOffset(false),
     WorkItemIDX(true),
     WorkItemIDY(false),
     WorkItemIDZ(false) {
+  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
   const Function *F = MF.getFunction();
 
-  if (F->hasFnAttribute("amdgpu-dispatch-ptr"))
-    DispatchPtr = true;
+  const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
+
+  if (getShaderType() == ShaderType::COMPUTE)
+    KernargSegmentPtr = true;
 
   if (F->hasFnAttribute("amdgpu-work-group-id-y"))
     WorkGroupIDY = true;
@@ -66,14 +88,54 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
 
   if (F->hasFnAttribute("amdgpu-work-item-id-z"))
     WorkItemIDZ = true;
+
+  bool MaySpill = ST.isVGPRSpillingEnabled(this);
+  bool HasStackObjects = FrameInfo->hasStackObjects();
+
+  if (HasStackObjects || MaySpill)
+    PrivateSegmentWaveByteOffset = true;
+
+  if (ST.isAmdHsaOS()) {
+    if (HasStackObjects || MaySpill)
+      PrivateSegmentBuffer = true;
+
+    if (F->hasFnAttribute("amdgpu-dispatch-ptr"))
+      DispatchPtr = true;
+  }
+
+  // X, XY, and XYZ are the only supported combinations, so make sure Y is
+  // enabled if Z is.
+  if (WorkItemIDZ)
+    WorkItemIDY = true;
+}
+
+unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
+  const SIRegisterInfo &TRI) {
+  PrivateSegmentBufferUserSGPR = TRI.getMatchingSuperReg(
+    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
+  NumUserSGPRs += 4;
+  return PrivateSegmentBufferUserSGPR;
+}
+
+unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
+  DispatchPtrUserSGPR = TRI.getMatchingSuperReg(
+    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+  NumUserSGPRs += 2;
+  return DispatchPtrUserSGPR;
+}
+
+unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
+  QueuePtrUserSGPR = TRI.getMatchingSuperReg(
+    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+  NumUserSGPRs += 2;
+  return QueuePtrUserSGPR;
 }
 
-void SIMachineFunctionInfo::setScratchRSrcReg(const SIRegisterInfo *TRI) {
-  // We need to round up to next multiple of 4.
-  unsigned NextSReg128 = RoundUpToAlignment(NumUserSGPRs + 5, 4);
-  unsigned RegSub0 = AMDGPU::SReg_32RegClass.getRegister(NextSReg128);
-  ScratchRSrcReg = TRI->getMatchingSuperReg(RegSub0, AMDGPU::sub0,
-                                            &AMDGPU::SReg_128RegClass);
+unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) {
+  KernargSegmentPtrUserSGPR = TRI.getMatchingSuperReg(
+    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+  NumUserSGPRs += 2;
+  return KernargSegmentPtrUserSGPR;
 }
 
 SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 377c5ce94846..9c528d63bd0e 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -26,10 +26,36 @@ class MachineRegisterInfo;
 /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
 /// tells the hardware which interpolation parameters to load.
 class SIMachineFunctionInfo : public AMDGPUMachineFunction {
+  // FIXME: This should be removed and getPreloadedValue moved here.
+  friend struct SIRegisterInfo;
   void anchor() override;
 
   unsigned TIDReg;
+
+  // Registers that may be reserved for spilling purposes. These may be the same
+  // as the input registers.
   unsigned ScratchRSrcReg;
+  unsigned ScratchWaveOffsetReg;
+
+  // Input registers setup for the HSA ABI.
+  // User SGPRs in allocation order.
+  unsigned PrivateSegmentBufferUserSGPR;
+  unsigned DispatchPtrUserSGPR;
+  unsigned QueuePtrUserSGPR;
+  unsigned KernargSegmentPtrUserSGPR;
+  unsigned DispatchIDUserSGPR;
+  unsigned FlatScratchInitUserSGPR;
+  unsigned PrivateSegmentSizeUserSGPR;
+  unsigned GridWorkGroupCountXUserSGPR;
+  unsigned GridWorkGroupCountYUserSGPR;
+  unsigned GridWorkGroupCountZUserSGPR;
+
+  // System SGPRs in allocation order.
+  unsigned WorkGroupIDXSystemSGPR;
+  unsigned WorkGroupIDYSystemSGPR;
+  unsigned WorkGroupIDZSystemSGPR;
+  unsigned WorkGroupInfoSystemSGPR;
+  unsigned PrivateSegmentWaveByteOffsetSystemSGPR;
 
 public:
   // FIXME: Make private
@@ -38,12 +64,14 @@ class SIMachineFunctionInfo : public AMDGPUMachineFunction {
   std::map<unsigned, unsigned> LaneVGPRs;
   unsigned ScratchOffsetReg;
   unsigned NumUserSGPRs;
+  unsigned NumSystemSGPRs;
 
 private:
   bool HasSpilledSGPRs;
   bool HasSpilledVGPRs;
 
-  // Feature bits required for inputs passed in user / system SGPRs.
+  // Feature bits required for inputs passed in user SGPRs.
+  bool PrivateSegmentBuffer : 1;
   bool DispatchPtr : 1;
   bool QueuePtr : 1;
   bool DispatchID : 1;
@@ -53,15 +81,27 @@ class SIMachineFunctionInfo : public AMDGPUMachineFunction {
   bool GridWorkgroupCountY : 1;
   bool GridWorkgroupCountZ : 1;
 
+  // Feature bits required for inputs passed in system SGPRs.
   bool WorkGroupIDX : 1; // Always initialized.
   bool WorkGroupIDY : 1;
   bool WorkGroupIDZ : 1;
   bool WorkGroupInfo : 1;
+  bool PrivateSegmentWaveByteOffset : 1;
 
   bool WorkItemIDX : 1; // Always initialized.
   bool WorkItemIDY : 1;
   bool WorkItemIDZ : 1;
 
+
+  MCPhysReg getNextUserSGPR() const {
+    assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
+    return AMDGPU::SGPR0 + NumUserSGPRs;
+  }
+
+  MCPhysReg getNextSystemSGPR() const {
+    return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
+  }
+
 public:
   struct SpilledReg {
     unsigned VGPR;
@@ -80,6 +120,47 @@ class SIMachineFunctionInfo : public AMDGPUMachineFunction {
   unsigned getTIDReg() const { return TIDReg; };
   void setTIDReg(unsigned Reg) { TIDReg = Reg; }
 
+  // Add user SGPRs.
+  unsigned addPrivateSegmentBuffer(const SIRegisterInfo &TRI);
+  unsigned addDispatchPtr(const SIRegisterInfo &TRI);
+  unsigned addQueuePtr(const SIRegisterInfo &TRI);
+  unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI);
+
+  // Add system SGPRs.
+  unsigned addWorkGroupIDX() {
+    WorkGroupIDXSystemSGPR = getNextSystemSGPR();
+    NumSystemSGPRs += 1;
+    return WorkGroupIDXSystemSGPR;
+  }
+
+  unsigned addWorkGroupIDY() {
+    WorkGroupIDYSystemSGPR = getNextSystemSGPR();
+    NumSystemSGPRs += 1;
+    return WorkGroupIDYSystemSGPR;
+  }
+
+  unsigned addWorkGroupIDZ() {
+    WorkGroupIDZSystemSGPR = getNextSystemSGPR();
+    NumSystemSGPRs += 1;
+    return WorkGroupIDZSystemSGPR;
+  }
+
+  unsigned addWorkGroupInfo() {
+    WorkGroupInfoSystemSGPR = getNextSystemSGPR();
+    NumSystemSGPRs += 1;
+    return WorkGroupInfoSystemSGPR;
+  }
+
+  unsigned addPrivateSegmentWaveByteOffset() {
+    PrivateSegmentWaveByteOffsetSystemSGPR = getNextSystemSGPR();
+    NumSystemSGPRs += 1;
+    return PrivateSegmentWaveByteOffsetSystemSGPR;
+  }
+
+  bool hasPrivateSegmentBuffer() const {
+    return PrivateSegmentBuffer;
+  }
+
   bool hasDispatchPtr() const {
     return DispatchPtr;
   }
@@ -128,6 +209,10 @@ class SIMachineFunctionInfo : public AMDGPUMachineFunction {
     return WorkGroupInfo;
   }
 
+  bool hasPrivateSegmentWaveByteOffset() const {
+    return PrivateSegmentWaveByteOffset;
+  }
+
   bool hasWorkItemIDX() const {
     return WorkItemIDX;
   }
@@ -140,13 +225,37 @@ class SIMachineFunctionInfo : public AMDGPUMachineFunction {
     return WorkItemIDZ;
   }
 
+  unsigned getNumUserSGPRs() const {
+    return NumUserSGPRs;
+  }
+
+  unsigned getNumPreloadedSGPRs() const {
+    return NumUserSGPRs + NumSystemSGPRs;
+  }
+
+  unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const {
+    return PrivateSegmentWaveByteOffsetSystemSGPR;
+  }
+
   /// \brief Returns the physical register reserved for use as the resource
   /// descriptor for scratch accesses.
   unsigned getScratchRSrcReg() const {
     return ScratchRSrcReg;
   }
 
-  void setScratchRSrcReg(const SIRegisterInfo *TRI);
+  void setScratchRSrcReg(unsigned Reg) {
+    assert(Reg != AMDGPU::NoRegister && "Should never be unset");
+    ScratchRSrcReg = Reg;
+  }
+
+  unsigned getScratchWaveOffsetReg() const {
+    return ScratchWaveOffsetReg;
+  }
+
+  void setScratchWaveOffsetReg(unsigned Reg) {
+    assert(Reg != AMDGPU::NoRegister && "Should never be unset");
+    ScratchWaveOffsetReg = Reg;
+  }
 
   bool hasSpilledSGPRs() const {
     return HasSpilledSGPRs;
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 68629cd50995..d9799812d453 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -32,6 +32,40 @@ void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) co
     Reserved.set(*R);
 }
 
+unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
+  const MachineFunction &MF) const {
+  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+  if (ST.hasSGPRInitBug()) {
+    unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 4;
+    unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
+    return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
+  }
+
+  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+    // 98/99 need to be reserved for flat_scr, and 100/101 for vcc. This is the
+    // next sgpr128 down.
+    return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95;
+  }
+
+  return AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99;
+}
+
+unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
+  const MachineFunction &MF) const {
+  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+  if (ST.hasSGPRInitBug()) {
+    unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 5;
+    return AMDGPU::SGPR_32RegClass.getRegister(Idx);
+  }
+
+  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+    // Next register before reservations for flat_scr and vcc.
+    return AMDGPU::SGPR97;
+  }
+
+  return AMDGPU::SGPR95;
+}
+
 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
   Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
@@ -69,19 +103,20 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   }
 
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
+  if (ScratchWaveOffsetReg != AMDGPU::NoRegister) {
+    // Reserve 1 SGPR for scratch wave offset in case we need to spill.
+    reserveRegisterTuples(Reserved, ScratchWaveOffsetReg);
+  }
+
   unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
   if (ScratchRSrcReg != AMDGPU::NoRegister) {
-    unsigned ScratchOffsetPreloadReg
-      = getPreloadedValue(MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
-    // We will need to use this user SGPR argument for spilling, and thus never
-    // want it to be spilled.
-    reserveRegisterTuples(Reserved, ScratchOffsetPreloadReg);
-
     // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need
     // to spill.
     // TODO: May need to reserve a VGPR if doing LDS spilling.
     reserveRegisterTuples(Reserved, ScratchRSrcReg);
-    assert(!isSubRegister(ScratchRSrcReg, ScratchOffsetPreloadReg));
+    assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
   }
 
   return Reserved;
@@ -204,11 +239,10 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
     unsigned SubReg = NumSubRegs > 1 ?
         getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) :
         Value;
-    bool IsKill = (i == e - 1);
 
     BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
       .addReg(SubReg, getDefRegState(IsLoad))
-      .addReg(ScratchRsrcReg, getKillRegState(IsKill))
+      .addReg(ScratchRsrcReg)
       .addReg(SOffset)
       .addImm(Offset)
       .addImm(0) // glc
@@ -526,6 +560,9 @@ bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
   return OpType == AMDGPU::OPERAND_REG_INLINE_C;
 }
 
+// FIXME: Most of these are flexible with HSA and we don't need to reserve them
+// as input registers if unused. Whether the dispatch ptr is necessary should be
+// easy to detect from used intrinsics. Scratch setup is harder to know.
 unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
                                            enum PreloadedValue Value) const {
 
@@ -533,29 +570,36 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   switch (Value) {
   case SIRegisterInfo::WORKGROUP_ID_X:
-    return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0);
+    assert(MFI->hasWorkGroupIDX());
+    return MFI->WorkGroupIDXSystemSGPR;
   case SIRegisterInfo::WORKGROUP_ID_Y:
-    return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1);
+    assert(MFI->hasWorkGroupIDY());
+    return MFI->WorkGroupIDYSystemSGPR;
   case SIRegisterInfo::WORKGROUP_ID_Z:
-    return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2);
+    assert(MFI->hasWorkGroupIDZ());
+    return MFI->WorkGroupIDZSystemSGPR;
   case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
-    if (MFI->getShaderType() != ShaderType::COMPUTE)
-      return MFI->ScratchOffsetReg;
-    return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4);
+    return MFI->PrivateSegmentWaveByteOffsetSystemSGPR;
   case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER:
-    llvm_unreachable("currently unused");
+    assert(ST.isAmdHsaOS() && "Non-HSA ABI currently uses relocations");
+    assert(MFI->hasPrivateSegmentBuffer());
+    return MFI->PrivateSegmentBufferUserSGPR;
   case SIRegisterInfo::KERNARG_SEGMENT_PTR:
-    return ST.isAmdHsaOS() ? AMDGPU::SGPR2_SGPR3 : AMDGPU::SGPR0_SGPR1;
+    assert(MFI->hasKernargSegmentPtr());
+    return MFI->KernargSegmentPtrUserSGPR;
   case SIRegisterInfo::DISPATCH_PTR:
     assert(MFI->hasDispatchPtr());
-    return AMDGPU::SGPR0_SGPR1;
+    return MFI->DispatchPtrUserSGPR;
   case SIRegisterInfo::QUEUE_PTR:
     llvm_unreachable("not implemented");
   case SIRegisterInfo::WORKITEM_ID_X:
+    assert(MFI->hasWorkItemIDX());
     return AMDGPU::VGPR0;
   case SIRegisterInfo::WORKITEM_ID_Y:
+    assert(MFI->hasWorkItemIDY());
     return AMDGPU::VGPR1;
   case SIRegisterInfo::WORKITEM_ID_Z:
+    assert(MFI->hasWorkItemIDZ());
     return AMDGPU::VGPR2;
   }
   llvm_unreachable("unexpected preloaded value type");
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h
index bb93242b0f88..eafe4053e878 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -29,6 +29,15 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
 public:
   SIRegisterInfo();
 
+  /// Return the end register initially reserved for the scratch buffer in case
+  /// spilling is needed.
+  unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const;
+
+  /// Return the end register initially reserved for the scratch wave offset in
+  /// case spilling is needed.
+  unsigned reservedPrivateSegmentWaveByteOffsetReg(
+    const MachineFunction &MF) const;
+
   BitVector getReservedRegs(const MachineFunction &MF) const override;
 
   unsigned getRegPressureSetLimit(const MachineFunction &MF,
diff --git a/test/CodeGen/AMDGPU/hsa.ll b/test/CodeGen/AMDGPU/hsa.ll
index ab87fdbc00da..d9bb586163dc 100644
--- a/test/CodeGen/AMDGPU/hsa.ll
+++ b/test/CodeGen/AMDGPU/hsa.ll
@@ -38,8 +38,10 @@
 ; HSA: .amdgpu_hsa_kernel simple
 ; HSA: {{^}}simple:
 ; HSA: .amd_kernel_code_t
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_kernarg_segment_ptr = 1
 ; HSA: .end_amd_kernel_code_t
-; HSA: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[0:1], 0x0
+; HSA: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
 
 ; Make sure we are setting the ATC bit:
 ; HSA-CI: s_mov_b32 s[[HI:[0-9]]], 0x100f000
diff --git a/test/CodeGen/AMDGPU/large-alloca-compute.ll b/test/CodeGen/AMDGPU/large-alloca-compute.ll
index 5e8cf5bb3d25..c348a2e7980f 100644
--- a/test/CodeGen/AMDGPU/large-alloca-compute.ll
+++ b/test/CodeGen/AMDGPU/large-alloca-compute.ll
@@ -1,31 +1,46 @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s
 ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
-; XUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s
-; XUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s
 
 ; FIXME: align on alloca seems to be ignored for private_segment_alignment
 
 ; ALL-LABEL: {{^}}large_alloca_compute_shader:
 
-; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GCN: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GCN: s_mov_b32 s14, -1
-; CI: s_mov_b32 s15, 0x80f000
-; VI: s_mov_b32 s15, 0x800000
+; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GCN: s_mov_b32 s10, -1
+; CI: s_mov_b32 s11, 0x80f000
+; VI: s_mov_b32 s11, 0x800000
 
 
 ; GCNHSA: .amd_kernel_code_t
+
+; GCNHSA: compute_pgm_rsrc2_scratch_en = 1
+; GCNHSA: compute_pgm_rsrc2_user_sgpr = 6
+; GCNHSA: compute_pgm_rsrc2_tgid_x_en = 1
+; GCNHSA: compute_pgm_rsrc2_tgid_y_en = 0
+; GCNHSA: compute_pgm_rsrc2_tgid_z_en = 0
+; GCNHSA: compute_pgm_rsrc2_tg_size_en = 0
+; GCNHSA: compute_pgm_rsrc2_tidig_comp_cnt = 0
+
+; GCNHSA: enable_sgpr_private_segment_buffer = 1
+; GCNHSA: enable_sgpr_dispatch_ptr = 0
+; GCNHSA: enable_sgpr_queue_ptr = 0
+; GCNHSA: enable_sgpr_kernarg_segment_ptr = 1
+; GCNHSA: enable_sgpr_dispatch_id = 0
+; GCNHSA: enable_sgpr_flat_scratch_init = 0
+; GCNHSA: enable_sgpr_private_segment_size = 0
+; GCNHSA: enable_sgpr_grid_workgroup_count_x = 0
+; GCNHSA: enable_sgpr_grid_workgroup_count_y = 0
+; GCNHSA: enable_sgpr_grid_workgroup_count_z = 0
+; GCNHSA: workitem_private_segment_byte_size = 0
 ; GCNHSA: private_segment_alignment = 4
 ; GCNHSA: .end_amd_kernel_code_t
 
-; GCNHSA: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GCNHSA: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GCNHSA: s_mov_b32 s10, -1
-; CIHSA: s_mov_b32 s11, 0x180f000
-; VIHSA: s_mov_b32 s11, 0x11800000
 
-; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s6 offen
-; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s6 offen
+; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s7 offen
+; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s7 offen
 
 ; Scratch size = alloca size + emergency stack slot
 ; ALL: ; ScratchSize: 32772
diff --git a/test/CodeGen/AMDGPU/large-alloca-graphics.ll b/test/CodeGen/AMDGPU/large-alloca-graphics.ll
index 208b9a10050c..141ee2560152 100644
--- a/test/CodeGen/AMDGPU/large-alloca-graphics.ll
+++ b/test/CodeGen/AMDGPU/large-alloca-graphics.ll
@@ -8,8 +8,8 @@
 ; CI: s_mov_b32 s11, 0x80f000
 ; VI: s_mov_b32 s11, 0x800000
 
-; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
 
 ; ALL: ; ScratchSize: 32772
 define void @large_alloca_pixel_shader(i32 %x, i32 %y) #1 {
@@ -29,8 +29,8 @@ define void @large_alloca_pixel_shader(i32 %x, i32 %y) #1 {
 ; CI: s_mov_b32 s11, 0x80f000
 ; VI: s_mov_b32 s11, 0x800000
 
-; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
 
 ; ALL: ; ScratchSize: 32772
 define void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #1 {
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll
new file mode 100644
index 000000000000..6dc9d050eee6
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll
@@ -0,0 +1,37 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI  -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}read_workdim:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[2].Z
+
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
+define void @read_workdim(i32 addrspace(1)* %out) {
+entry:
+  %0 = call i32 @llvm.AMDGPU.read.workdim() #0
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}read_workdim_known_bits:
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
+; GCN-NOT: 0xff
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @read_workdim_known_bits(i32 addrspace(1)* %out) {
+entry:
+  %dim = call i32 @llvm.AMDGPU.read.workdim() #0
+  %shl = shl i32 %dim, 24
+  %shr = lshr i32 %shl, 24
+  store i32 %shr, i32 addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.AMDGPU.read.workdim() #0
+
+attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
index 719f7ffe0f1c..dc95cd1ee012 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
@@ -2,7 +2,7 @@
 
 ; GCN-LABEL: {{^}}test:
 ; GCN: enable_sgpr_dispatch_ptr = 1
-; GCN: s_load_dword s{{[0-9]+}}, s[0:1], 0x0
+; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
 define void @test(i32 addrspace(1)* %out) {
   %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
   %header_ptr = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
diff --git a/test/CodeGen/AMDGPU/llvm.dbg.value.ll b/test/CodeGen/AMDGPU/llvm.dbg.value.ll
index c5aba2b76b89..cc109327d929 100644
--- a/test/CodeGen/AMDGPU/llvm.dbg.value.ll
+++ b/test/CodeGen/AMDGPU/llvm.dbg.value.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -O0 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck %s
 
 ; CHECK-LABEL: {{^}}test_debug_value:
-; CHECK: s_load_dwordx2
-; CHECK: DEBUG_VALUE: test_debug_value:globalptr_arg <- %SGPR0_SGPR1
+; CHECK: s_load_dwordx2 s[4:5]
+; CHECK: DEBUG_VALUE: test_debug_value:globalptr_arg <- %SGPR4_SGPR5
 ; CHECK: buffer_store_dword
 ; CHECK: s_endpgm
 define void @test_debug_value(i32 addrspace(1)* nocapture %globalptr_arg) #0 !dbg !4 {
diff --git a/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
new file mode 100644
index 000000000000..f2a7256e812d
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
@@ -0,0 +1,184 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI  -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: {{^}}local_size_x:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[1].Z
+
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18
+; CI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x1
+; VI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x4
+
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @local_size_x(i32 addrspace(1)* %out) {
+entry:
+  %0 = call i32 @llvm.r600.read.local.size.x() #0
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_y:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[1].W
+
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @local_size_y(i32 addrspace(1)* %out) {
+entry:
+  %0 = call i32 @llvm.r600.read.local.size.y() #0
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_z:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[2].X
+
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @local_size_z(i32 addrspace(1)* %out) {
+entry:
+  %0 = call i32 @llvm.r600.read.local.size.z() #0
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_xy:
+; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6
+; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7
+; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18
+; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c
+; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]]
+; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VY]]
+; GCN: buffer_store_dword [[VAL]]
+define void @local_size_xy(i32 addrspace(1)* %out) {
+entry:
+  %x = call i32 @llvm.r600.read.local.size.x() #0
+  %y = call i32 @llvm.r600.read.local.size.y() #0
+  %val = mul i32 %x, %y
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_xz:
+
+; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6
+; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8
+; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18
+; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20
+; HSA-DAG: s_and_b32 [[X:s[0-9]+]], [[XY]], 0xffff
+; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
+; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VZ]]
+; GCN: buffer_store_dword [[VAL]]
+define void @local_size_xz(i32 addrspace(1)* %out) {
+entry:
+  %x = call i32 @llvm.r600.read.local.size.x() #0
+  %z = call i32 @llvm.r600.read.local.size.z() #0
+  %val = mul i32 %x, %z
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_yz:
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_dispatch_ptr = 1
+
+; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7
+; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8
+; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c
+; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20
+; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
+; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[Y]], [[VZ]]
+; GCN: buffer_store_dword [[VAL]]
+define void @local_size_yz(i32 addrspace(1)* %out) {
+entry:
+  %y = call i32 @llvm.r600.read.local.size.y() #0
+  %z = call i32 @llvm.r600.read.local.size.z() #0
+  %val = mul i32 %y, %z
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_xyz:
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_dispatch_ptr = 1
+
+; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6
+; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7
+; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8
+; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18
+; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c
+; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20
+; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]]
+; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
+; GCN: v_mad_u32_u24 [[VAL:v[0-9]+]], [[X]], [[VY]], [[VZ]]
+; GCN: buffer_store_dword [[VAL]]
+define void @local_size_xyz(i32 addrspace(1)* %out) {
+entry:
+  %x = call i32 @llvm.r600.read.local.size.x() #0
+  %y = call i32 @llvm.r600.read.local.size.y() #0
+  %z = call i32 @llvm.r600.read.local.size.z() #0
+  %xy = mul i32 %x, %y
+  %xyz = add i32 %xy, %z
+  store i32 %xyz, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_x_known_bits:
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18
+; GCN-NOT: 0xffff
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NEXT: buffer_store_dword [[VVAL]]
+define void @local_size_x_known_bits(i32 addrspace(1)* %out) {
+entry:
+  %size = call i32 @llvm.r600.read.local.size.x() #0
+  %shl = shl i32 %size, 16
+  %shr = lshr i32 %shl, 16
+  store i32 %shr, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_y_known_bits:
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c
+; GCN-NOT: 0xffff
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NEXT: buffer_store_dword [[VVAL]]
+define void @local_size_y_known_bits(i32 addrspace(1)* %out) {
+entry:
+  %size = call i32 @llvm.r600.read.local.size.y() #0
+  %shl = shl i32 %size, 16
+  %shr = lshr i32 %shl, 16
+  store i32 %shr, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_z_known_bits:
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
+; GCN-NOT: 0xffff
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NEXT: buffer_store_dword [[VVAL]]
+define void @local_size_z_known_bits(i32 addrspace(1)* %out) {
+entry:
+  %size = call i32 @llvm.r600.read.local.size.z() #0
+  %shl = shl i32 %size, 16
+  %shr = lshr i32 %shl, 16
+  store i32 %shr, i32 addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.r600.read.local.size.x() #0
+declare i32 @llvm.r600.read.local.size.y() #0
+declare i32 @llvm.r600.read.local.size.z() #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/local-memory-two-objects.ll b/test/CodeGen/AMDGPU/local-memory-two-objects.ll
index 7f31ef45b628..6b52b80ba082 100644
--- a/test/CodeGen/AMDGPU/local-memory-two-objects.ll
+++ b/test/CodeGen/AMDGPU/local-memory-two-objects.ll
@@ -10,7 +10,7 @@
 ; EG: .long 166120
 ; EG-NEXT: .long 8
 ; GCN: .long 47180
-; GCN-NEXT: .long 38792
+; GCN-NEXT: .long 32900
 
 ; EG: {{^}}local_memory_two_objects:
 
diff --git a/test/CodeGen/AMDGPU/local-memory.ll b/test/CodeGen/AMDGPU/local-memory.ll
index 9494ed75bd0c..9ffb59e70920 100644
--- a/test/CodeGen/AMDGPU/local-memory.ll
+++ b/test/CodeGen/AMDGPU/local-memory.ll
@@ -9,9 +9,9 @@
 ; EG: .long 166120
 ; EG-NEXT: .long 128
 ; SI: .long 47180
-; SI-NEXT: .long 71560
+; SI-NEXT: .long 65668
 ; CI: .long 47180
-; CI-NEXT: .long 38792
+; CI-NEXT: .long 32900
 
 ; FUNC-LABEL: {{^}}local_memory:
 
diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
index 2cbb67d85fba..cd7c78f408dd 100644
--- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
+++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
@@ -17,16 +17,18 @@ declare i32 @llvm.r600.read.tgid.z() #1
 
 ; GCN-LABEL: {{^}}spill_vgpr_compute:
 
+; GCN: s_mov_b32 s16, s3
 ; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GCN-NEXT: s_mov_b32 s14, -1
 ; SI-NEXT: s_mov_b32 s15, 0x80f000
 ; VI-NEXT: s_mov_b32 s15, 0x800000
 
-; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s8 offset:{{[0-9]+}} ; 4-byte Folded Spill
 
-; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s8 offen offset:{{[0-9]+}}
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s8 offen offset:{{[0-9]+}}
+; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill
+
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
+; GCN: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
 
 ; GCN: NumVgprs: 256
 ; GCN: ScratchSize: 1024
diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
index ef492cef5767..16abb89bb0b8 100644
--- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -11,14 +11,14 @@
 
 ; GCN-LABEL: {{^}}main:
 
-; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GCN-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GCN-NEXT: s_mov_b32 s10, -1
-; SI-NEXT: s_mov_b32 s11, 0x80f000
-; VI-NEXT: s_mov_b32 s11, 0x800000
+; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCN-NEXT: s_mov_b32 s14, -1
+; SI-NEXT: s_mov_b32 s15, 0x80f000
+; VI-NEXT: s_mov_b32 s15, 0x800000
 
 ; s12 is offset user SGPR
-; GCN: buffer_store_dword {{v[0-9]+}}, s[8:11], s12 offset:{{[0-9]+}} ; 4-byte Folded Spill
+; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Spill
 
 ; GCN: NumVgprs: 256
 ; GCN: ScratchSize: 1024
diff --git a/test/CodeGen/AMDGPU/work-item-intrinsics.ll b/test/CodeGen/AMDGPU/work-item-intrinsics.ll
index ebe73429da9c..a704a23b0f92 100644
--- a/test/CodeGen/AMDGPU/work-item-intrinsics.ll
+++ b/test/CodeGen/AMDGPU/work-item-intrinsics.ll
@@ -1,5 +1,7 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI  -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=HSA -check-prefix=CI-HSA -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=HSA -check-prefix=VI-HSA -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 
@@ -7,9 +9,26 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV [[VAL]], KC0[0].X
 
-; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
+; HSA: .amd_kernel_code_t
+
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_dispatch_ptr = 0
+; HSA: enable_sgpr_queue_ptr = 0
+; HSA: enable_sgpr_kernarg_segment_ptr = 1
+; HSA: enable_sgpr_dispatch_id = 0
+; HSA: enable_sgpr_flat_scratch_init = 0
+; HSA: enable_sgpr_private_segment_size = 0
+; HSA: enable_sgpr_grid_workgroup_count_x = 0
+; HSA: enable_sgpr_grid_workgroup_count_y = 0
+; HSA: enable_sgpr_grid_workgroup_count_z = 0
+
+; HSA: .end_amd_kernel_code_t
+
+
+; GCN-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
+
 define void @ngroups_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.ngroups.x() #0
@@ -21,10 +40,10 @@ entry:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV [[VAL]], KC0[0].Y
 
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
 define void @ngroups_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.ngroups.y() #0
@@ -36,10 +55,10 @@ entry:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV [[VAL]], KC0[0].Z
 
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
 define void @ngroups_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.ngroups.z() #0
@@ -51,10 +70,10 @@ entry:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV [[VAL]], KC0[0].W
 
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
 define void @global_size_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.global.size.x() #0
@@ -66,10 +85,10 @@ entry:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV [[VAL]], KC0[1].X
 
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
 define void @global_size_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.global.size.y() #0
@@ -81,10 +100,10 @@ entry:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV [[VAL]], KC0[1].Y
 
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
 define void @global_size_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.global.size.z() #0
@@ -92,74 +111,33 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_size_x:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV [[VAL]], KC0[1].Z
-
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
-define void @local_size_x (i32 addrspace(1)* %out) {
-entry:
-  %0 = call i32 @llvm.r600.read.local.size.x() #0
-  store i32 %0, i32 addrspace(1)* %out
-  ret void
-}
-
-; FUNC-LABEL: {{^}}local_size_y:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV [[VAL]], KC0[1].W
-
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
-define void @local_size_y (i32 addrspace(1)* %out) {
-entry:
-  %0 = call i32 @llvm.r600.read.local.size.y() #0
-  store i32 %0, i32 addrspace(1)* %out
-  ret void
-}
-
-; FUNC-LABEL: {{^}}local_size_z:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV [[VAL]], KC0[2].X
-
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
-define void @local_size_z (i32 addrspace(1)* %out) {
-entry:
-  %0 = call i32 @llvm.r600.read.local.size.z() #0
-  store i32 %0, i32 addrspace(1)* %out
-  ret void
-}
-
-; FUNC-LABEL: {{^}}get_work_dim:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV [[VAL]], KC0[2].Z
-
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
-define void @get_work_dim (i32 addrspace(1)* %out) {
-entry:
-  %0 = call i32 @llvm.AMDGPU.read.workdim() #0
-  store i32 %0, i32 addrspace(1)* %out
-  ret void
-}
-
-; The tgid values are stored in sgprs offset by the number of user sgprs.
-; Currently we always use exactly 2 user sgprs for the pointer to the
-; kernel arguments, but this may change in the future.
+; The tgid values are stored in sgprs offset by the number of user
+; sgprs.
 
 ; FUNC-LABEL: {{^}}tgid_x:
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s4
+; HSA: .amd_kernel_code_t
+; HSA: compute_pgm_rsrc2_user_sgpr = 6
+; HSA: compute_pgm_rsrc2_tgid_x_en = 1
+; HSA: compute_pgm_rsrc2_tgid_y_en = 0
+; HSA: compute_pgm_rsrc2_tgid_z_en = 0
+; HSA: compute_pgm_rsrc2_tg_size_en = 0
+; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0
+; HSA: enable_sgpr_grid_workgroup_count_x = 0
+; HSA: enable_sgpr_grid_workgroup_count_y = 0
+; HSA: enable_sgpr_grid_workgroup_count_z = 0
+; HSA: .end_amd_kernel_code_t
+
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s2{{$}}
+; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6{{$}}
 ; GCN: buffer_store_dword [[VVAL]]
-define void @tgid_x (i32 addrspace(1)* %out) {
+
+; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6
+; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
+; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
+; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
+; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
+define void @tgid_x(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.x() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -167,9 +145,25 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}tgid_y:
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s5
+; HSA: compute_pgm_rsrc2_user_sgpr = 6
+; HSA: compute_pgm_rsrc2_tgid_x_en = 1
+; HSA: compute_pgm_rsrc2_tgid_y_en = 1
+; HSA: compute_pgm_rsrc2_tgid_z_en = 0
+; HSA: compute_pgm_rsrc2_tg_size_en = 0
+; HSA: enable_sgpr_grid_workgroup_count_x = 0
+; HSA: enable_sgpr_grid_workgroup_count_y = 0
+; HSA: enable_sgpr_grid_workgroup_count_z = 0
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3
+; GCN-HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s7
 ; GCN: buffer_store_dword [[VVAL]]
-define void @tgid_y (i32 addrspace(1)* %out) {
+
+; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6
+; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
+; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
+; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
+; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
+define void @tgid_y(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.y() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -177,102 +171,83 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}tgid_z:
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6
+; HSA: compute_pgm_rsrc2_user_sgpr = 6
+; HSA: compute_pgm_rsrc2_tgid_x_en = 1
+; HSA: compute_pgm_rsrc2_tgid_y_en = 0
+; HSA: compute_pgm_rsrc2_tgid_z_en = 1
+; HSA: compute_pgm_rsrc2_tg_size_en = 0
+; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_dispatch_ptr = 0
+; HSA: enable_sgpr_queue_ptr = 0
+; HSA: enable_sgpr_kernarg_segment_ptr = 1
+; HSA: enable_sgpr_dispatch_id = 0
+; HSA: enable_sgpr_flat_scratch_init = 0
+; HSA: enable_sgpr_private_segment_size = 0
+; HSA: enable_sgpr_grid_workgroup_count_x = 0
+; HSA: enable_sgpr_grid_workgroup_count_y = 0
+; HSA: enable_sgpr_grid_workgroup_count_z = 0
+
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3{{$}}
+; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s7{{$}}
 ; GCN: buffer_store_dword [[VVAL]]
-define void @tgid_z (i32 addrspace(1)* %out) {
+
+; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6
+; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
+; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
+; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
+; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
+define void @tgid_z(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.z() #0
   store i32 %0, i32 addrspace(1)* %out
   ret void
 }
 
+; GCN-NOHSA: .section .AMDGPU.config
+; GCN-NOHSA: .long 47180
+; GCN-NOHSA-NEXT: .long 132{{$}}
+
 ; FUNC-LABEL: {{^}}tidig_x:
+; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0
 ; GCN: buffer_store_dword v0
-define void @tidig_x (i32 addrspace(1)* %out) {
+define void @tidig_x(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.x() #0
   store i32 %0, i32 addrspace(1)* %out
   ret void
 }
 
+; GCN-NOHSA: .section .AMDGPU.config
+; GCN-NOHSA: .long 47180
+; GCN-NOHSA-NEXT: .long 2180{{$}}
+
 ; FUNC-LABEL: {{^}}tidig_y:
+
+; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 1
 ; GCN: buffer_store_dword v1
-define void @tidig_y (i32 addrspace(1)* %out) {
+define void @tidig_y(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.y() #0
   store i32 %0, i32 addrspace(1)* %out
   ret void
 }
 
+; GCN-NOHSA: .section .AMDGPU.config
+; GCN-NOHSA: .long 47180
+; GCN-NOHSA-NEXT: .long 4228{{$}}
+
 ; FUNC-LABEL: {{^}}tidig_z:
+; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 2
 ; GCN: buffer_store_dword v2
-define void @tidig_z (i32 addrspace(1)* %out) {
+define void @tidig_z(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.z() #0
   store i32 %0, i32 addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_size_x_known_bits:
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18
-; GCN-NOT: 0xffff
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN-NEXT: buffer_store_dword [[VVAL]]
-define void @local_size_x_known_bits(i32 addrspace(1)* %out) {
-entry:
-  %size = call i32 @llvm.r600.read.local.size.x() #0
-  %shl = shl i32 %size, 16
-  %shr = lshr i32 %shl, 16
-  store i32 %shr, i32 addrspace(1)* %out
-  ret void
-}
-
-; FUNC-LABEL: {{^}}local_size_y_known_bits:
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c
-; GCN-NOT: 0xffff
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN-NEXT: buffer_store_dword [[VVAL]]
-define void @local_size_y_known_bits(i32 addrspace(1)* %out) {
-entry:
-  %size = call i32 @llvm.r600.read.local.size.y() #0
-  %shl = shl i32 %size, 16
-  %shr = lshr i32 %shl, 16
-  store i32 %shr, i32 addrspace(1)* %out
-  ret void
-}
-
-; FUNC-LABEL: {{^}}local_size_z_known_bits:
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
-; GCN-NOT: 0xffff
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN-NEXT: buffer_store_dword [[VVAL]]
-define void @local_size_z_known_bits(i32 addrspace(1)* %out) {
-entry:
-  %size = call i32 @llvm.r600.read.local.size.z() #0
-  %shl = shl i32 %size, 16
-  %shr = lshr i32 %shl, 16
-  store i32 %shr, i32 addrspace(1)* %out
-  ret void
-}
-
-; FUNC-LABEL: {{^}}get_work_dim_known_bits:
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
-; GCN-NOT: 0xff
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
-define void @get_work_dim_known_bits(i32 addrspace(1)* %out) {
-entry:
-  %dim = call i32 @llvm.AMDGPU.read.workdim() #0
-  %shl = shl i32 %dim, 24
-  %shr = lshr i32 %shl, 24
-  store i32 %shr, i32 addrspace(1)* %out
-  ret void
-}
-
 declare i32 @llvm.r600.read.ngroups.x() #0
 declare i32 @llvm.r600.read.ngroups.y() #0
 declare i32 @llvm.r600.read.ngroups.z() #0
@@ -281,10 +256,6 @@ declare i32 @llvm.r600.read.global.size.x() #0
 declare i32 @llvm.r600.read.global.size.y() #0
 declare i32 @llvm.r600.read.global.size.z() #0
 
-declare i32 @llvm.r600.read.local.size.x() #0
-declare i32 @llvm.r600.read.local.size.y() #0
-declare i32 @llvm.r600.read.local.size.z() #0
-
 declare i32 @llvm.r600.read.tgid.x() #0
 declare i32 @llvm.r600.read.tgid.y() #0
 declare i32 @llvm.r600.read.tgid.z() #0

From 7a47a7be128018d0b3353d8213068cc57e022d87 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 30 Nov 2015 21:16:07 +0000
Subject: [PATCH 022/186] AMDGPU: Error if too many user SGPRs used

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254332 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 5 +++++
 lib/Target/AMDGPU/AMDGPUSubtarget.h    | 3 +++
 2 files changed, 8 insertions(+)

diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index b96c48f0561d..87a0cf20d176 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -401,6 +401,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
     ProgInfo.NumSGPR = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
   }
 
+  if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) {
+    LLVMContext &Ctx = MF.getFunction()->getContext();
+    Ctx.emitError("too many user SGPRs used");
+  }
+
   ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4;
   ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8;
   // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 9921630326b4..971b5179b13c 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -303,6 +303,9 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
     return isAmdHsaOS() ? 0 : 36;
   }
 
+  unsigned getMaxNumUserSGPRs() const {
+    return 16;
+  }
 };
 
 } // End namespace llvm

From 504d89597a278daef61805ae47c96cadb290e9ae Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 30 Nov 2015 21:32:10 +0000
Subject: [PATCH 023/186] AMDGPU: Fix unused function

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254333 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/SIISelLowering.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 05b54d0bae51..1b0cc87206f4 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -542,11 +542,6 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
                      Align); // Alignment
 }
 
-static ArrayRef<MCPhysReg> getAllSGPRs() {
-  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
-                      AMDGPU::SGPR_32RegClass.getNumRegs());
-}
-
 SDValue SITargetLowering::LowerFormalArguments(
     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
     const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,

From f44d69a9c3a9c295184011ba37ad36321ced7531 Mon Sep 17 00:00:00 2001
From: Cong Hou <congh@google.com>
Date: Mon, 30 Nov 2015 21:46:08 +0000
Subject: [PATCH 024/186] [X86] Update test/CodeGen/X86/avg.ll with the help of
 update_llc_test_checks.py. NFC.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254334 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/avg.ll | 597 +++++++++++++++++++++++-----------------
 1 file changed, 347 insertions(+), 250 deletions(-)

diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll
index ce2bf0fdad19..f1c636a73305 100644
--- a/test/CodeGen/X86/avg.ll
+++ b/test/CodeGen/X86/avg.ll
@@ -1,24 +1,31 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512bw | FileCheck %s --check-prefix=AVX2 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512BW
 
 define void @avg_v4i8(<4 x i8>* %a, <4 x i8>* %b) {
-; SSE2-LABEL: avg_v4i8
-; SSE2:      # BB#0:
-; SSE2-NEXT: movd (%rdi), %xmm0           # xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd (%rsi), %xmm1           # xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: pavgb %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, (%rax)
-; SSE2-NEXT: retq
+; SSE2-LABEL: avg_v4i8:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    pavgb %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, (%rax)
+; SSE2-NEXT:    retq
 ;
-; AVX2-LABEL: avg_v4i8
+; AVX2-LABEL: avg_v4i8:
 ; AVX2:       # BB#0:
-; AVX2-NEXT: vmovd (%rdi), %xmm0
-; AVX2-NEXT: vmovd (%rsi), %xmm1
-; AVX2-NEXT: vpavgb %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovd %xmm0, (%rax)
-; AVX2-NEXT: retq
+; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, (%rax)
+; AVX2-NEXT:    retq
 ;
+; AVX512BW-LABEL: avg_v4i8:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vmovd (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovd (%rsi), %xmm1
+; AVX512BW-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT:    vmovd %xmm0, (%rax)
+; AVX512BW-NEXT:    retq
   %1 = load <4 x i8>, <4 x i8>* %a
   %2 = load <4 x i8>, <4 x i8>* %b
   %3 = zext <4 x i8> %1 to <4 x i32>
@@ -32,22 +39,29 @@ define void @avg_v4i8(<4 x i8>* %a, <4 x i8>* %b) {
 }
 
 define void @avg_v8i8(<8 x i8>* %a, <8 x i8>* %b) {
-; SSE2-LABEL: avg_v8i8
+; SSE2-LABEL: avg_v8i8:
 ; SSE2:       # BB#0:
-; SSE2-NEXT: movq (%rdi), %xmm0           # xmm0 = mem[0],zero
-; SSE2-NEXT: movq (%rsi), %xmm1           # xmm1 = mem[0],zero
-; SSE2-NEXT: pavgb %xmm0, %xmm1
-; SSE2-NEXT: movq %xmm1, (%rax)
-; SSE2-NEXT: retq
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    pavgb %xmm0, %xmm1
+; SSE2-NEXT:    movq %xmm1, (%rax)
+; SSE2-NEXT:    retq
 ;
-; AVX2-LABEL: avg_v8i8
+; AVX2-LABEL: avg_v8i8:
 ; AVX2:       # BB#0:
-; AVX2-NEXT: vmovq (%rdi), %xmm0
-; AVX2-NEXT: vmovq (%rsi), %xmm1
-; AVX2-NEXT: vpavgb %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovq %xmm0, (%rax)
-; AVX2-NEXT: retq
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, (%rax)
+; AVX2-NEXT:    retq
 ;
+; AVX512BW-LABEL: avg_v8i8:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vmovq (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovq (%rsi), %xmm1
+; AVX512BW-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
+; AVX512BW-NEXT:    retq
   %1 = load <8 x i8>, <8 x i8>* %a
   %2 = load <8 x i8>, <8 x i8>* %b
   %3 = zext <8 x i8> %1 to <8 x i32>
@@ -61,20 +75,19 @@ define void @avg_v8i8(<8 x i8>* %a, <8 x i8>* %b) {
 }
 
 define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) {
-; SSE2-LABEL: avg_v16i8
+; SSE2-LABEL: avg_v16i8:
 ; SSE2:       # BB#0:
-; SSE2-NEXT: movdqa (%rsi), %xmm0
-; SSE2-NEXT: pavgb (%rdi), %xmm0
-; SSE2-NEXT: movdqu %xmm0, (%rax)
-; SSE2-NEXT: retq
-;
-; AVX2-LABEL: avg_v16i8
-; AVX2:       # BB#0:
-; AVX2-NEXT: vmovdqa (%rsi), %xmm0
-; AVX2-NEXT: vpavgb (%rdi), %xmm0, %xmm0
-; AVX2-NEXT: vmovdqu %xmm0, (%rax)
-; AVX2-NEXT: retq
-;
+; SSE2-NEXT:    movdqa (%rsi), %xmm0
+; SSE2-NEXT:    pavgb (%rdi), %xmm0
+; SSE2-NEXT:    movdqu %xmm0, (%rax)
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: avg_v16i8:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovdqa (%rsi), %xmm0
+; AVX-NEXT:    vpavgb (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vmovdqu %xmm0, (%rax)
+; AVX-NEXT:    retq
   %1 = load <16 x i8>, <16 x i8>* %a
   %2 = load <16 x i8>, <16 x i8>* %b
   %3 = zext <16 x i8> %1 to <16 x i32>
@@ -88,12 +101,20 @@ define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) {
 }
 
 define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) {
-; AVX2-LABEL: avg_v32i8
+; AVX2-LABEL: avg_v32i8:
 ; AVX2:       # BB#0:
-; AVX2-NEXT: vmovdqa (%rsi), %ymm0
-; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+; AVX2-NEXT:    vmovdqa (%rsi), %ymm0
+; AVX2-NEXT:    vpavgb (%rdi), %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
+; AVX512BW-LABEL: avg_v32i8:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vmovdqa (%rsi), %ymm0
+; AVX512BW-NEXT:    vpavgb (%rdi), %ymm0, %ymm0
+; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX512BW-NEXT:    retq
   %1 = load <32 x i8>, <32 x i8>* %a
   %2 = load <32 x i8>, <32 x i8>* %b
   %3 = zext <32 x i8> %1 to <32 x i32>
@@ -107,13 +128,12 @@ define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) {
 }
 
 define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
-; AVX512BW-LABEL: avg_v64i8
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0
-; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
-; AVX512BW-NEXT: retq
-;
+; AVX512BW-LABEL: avg_v64i8:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vmovdqu8 (%rsi), %zmm0
+; AVX512BW-NEXT:    vpavgb (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqu8 %zmm0, (%rax)
+; AVX512BW-NEXT:    retq
   %1 = load <64 x i8>, <64 x i8>* %a
   %2 = load <64 x i8>, <64 x i8>* %b
   %3 = zext <64 x i8> %1 to <64 x i32>
@@ -127,22 +147,29 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
 }
 
 define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) {
-; SSE2-LABEL: avg_v4i16
+; SSE2-LABEL: avg_v4i16:
 ; SSE2:       # BB#0:
-; SSE2-NEXT: movq (%rdi), %xmm0           # xmm0 = mem[0],zero
-; SSE2-NEXT: movq (%rsi), %xmm1           # xmm1 = mem[0],zero
-; SSE2-NEXT: pavgw %xmm0, %xmm1
-; SSE2-NEXT: movq %xmm1, (%rax)
-; SSE2-NEXT: retq
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    pavgw %xmm0, %xmm1
+; SSE2-NEXT:    movq %xmm1, (%rax)
+; SSE2-NEXT:    retq
 ;
-; AVX2-LABEL: avg_v4i16
-; AVX2:      # BB#0:
-; AVX2-NEXT: vmovq (%rdi), %xmm0
-; AVX2-NEXT: vmovq (%rsi), %xmm1
-; AVX2-NEXT: vpavgw %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovq %xmm0, (%rax)
-; AVX2-NEXT: retq
+; AVX2-LABEL: avg_v4i16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpavgw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, (%rax)
+; AVX2-NEXT:    retq
 ;
+; AVX512BW-LABEL: avg_v4i16:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vmovq (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovq (%rsi), %xmm1
+; AVX512BW-NEXT:    vpavgw %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
+; AVX512BW-NEXT:    retq
   %1 = load <4 x i16>, <4 x i16>* %a
   %2 = load <4 x i16>, <4 x i16>* %b
   %3 = zext <4 x i16> %1 to <4 x i32>
@@ -156,20 +183,19 @@ define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) {
 }
 
 define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) {
-; SSE2-LABEL: avg_v8i16
+; SSE2-LABEL: avg_v8i16:
 ; SSE2:       # BB#0:
-; SSE2-NEXT: movdqa (%rsi), %xmm0
-; SSE2-NEXT: pavgw (%rdi), %xmm0
-; SSE2-NEXT: movdqu %xmm0, (%rax)
-; SSE2-NEXT: retq
-;
-; AVX2-LABEL: avg_v8i16
-; AVX2:       # BB#0:
-; AVX2-NEXT: vmovdqa (%rsi), %xmm0
-; AVX2-NEXT: vpavgw (%rdi), %xmm0, %xmm0
-; AVX2-NEXT: vmovdqu %xmm0, (%rax)
-; AVX2-NEXT: retq
-;
+; SSE2-NEXT:    movdqa (%rsi), %xmm0
+; SSE2-NEXT:    pavgw (%rdi), %xmm0
+; SSE2-NEXT:    movdqu %xmm0, (%rax)
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: avg_v8i16:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovdqa (%rsi), %xmm0
+; AVX-NEXT:    vpavgw (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vmovdqu %xmm0, (%rax)
+; AVX-NEXT:    retq
   %1 = load <8 x i16>, <8 x i16>* %a
   %2 = load <8 x i16>, <8 x i16>* %b
   %3 = zext <8 x i16> %1 to <8 x i32>
@@ -183,12 +209,20 @@ define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) {
 }
 
 define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) {
-; AVX2-LABEL: avg_v16i16
+; AVX2-LABEL: avg_v16i16:
 ; AVX2:       # BB#0:
-; AVX2-NEXT: vmovdqa (%rsi), %ymm0
-; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+; AVX2-NEXT:    vmovdqa (%rsi), %ymm0
+; AVX2-NEXT:    vpavgw (%rdi), %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
+; AVX512BW-LABEL: avg_v16i16:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vmovdqa (%rsi), %ymm0
+; AVX512BW-NEXT:    vpavgw (%rdi), %ymm0, %ymm0
+; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX512BW-NEXT:    retq
   %1 = load <16 x i16>, <16 x i16>* %a
   %2 = load <16 x i16>, <16 x i16>* %b
   %3 = zext <16 x i16> %1 to <16 x i32>
@@ -202,13 +236,12 @@ define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) {
 }
 
 define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
-; AVX512BW-LABEL: avg_v32i16
-; AVX512BW:      # BB#0:
-; AVX512BW-NEXT: vmovdqu16 (%rsi), %zmm0
-; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
-; AVX512BW-NEXT: retq
-;
+; AVX512BW-LABEL: avg_v32i16:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vmovdqu16 (%rsi), %zmm0
+; AVX512BW-NEXT:    vpavgw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqu16 %zmm0, (%rax)
+; AVX512BW-NEXT:    retq
   %1 = load <32 x i16>, <32 x i16>* %a
   %2 = load <32 x i16>, <32 x i16>* %b
   %3 = zext <32 x i16> %1 to <32 x i32>
@@ -222,22 +255,29 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
 }
 
 define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) {
-; SSE2-LABEL: avg_v4i8_2
+; SSE2-LABEL: avg_v4i8_2:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:  movd (%rdi), %xmm0
-; SSE2-NEXT:  movd (%rsi), %xmm1
-; SSE2-NEXT:  pavgb %xmm0, %xmm1
-; SSE2-NEXT:  movd %xmm1, (%rax)
-; SSE2-NEXT:  retq
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    pavgb %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, (%rax)
+; SSE2-NEXT:    retq
 ;
-; AVX2-LABEL: avg_v4i8_2
-; AVX2:      # BB#0:
-; AVX2-NEXT: vmovd (%rdi), %xmm0
-; AVX2-NEXT: vmovd (%rsi), %xmm1
-; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, (%rax)
-; AVX2-NEXT: retq
+; AVX2-LABEL: avg_v4i8_2:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, (%rax)
+; AVX2-NEXT:    retq
 ;
+; AVX512BW-LABEL: avg_v4i8_2:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vmovd (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovd (%rsi), %xmm1
+; AVX512BW-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vmovd %xmm0, (%rax)
+; AVX512BW-NEXT:    retq
   %1 = load <4 x i8>, <4 x i8>* %a
   %2 = load <4 x i8>, <4 x i8>* %b
   %3 = zext <4 x i8> %1 to <4 x i32>
@@ -251,22 +291,29 @@ define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) {
 }
 
 define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) {
-; SSE2-LABEL: avg_v8i8_2
+; SSE2-LABEL: avg_v8i8_2:
 ; SSE2:       # BB#0:
-; SSE2-NEXT: movq (%rdi), %xmm0           # xmm0 = mem[0],zero
-; SSE2-NEXT: movq (%rsi), %xmm1           # xmm1 = mem[0],zero
-; SSE2-NEXT: pavgb %xmm0, %xmm1
-; SSE2-NEXT: movq %xmm1, (%rax)
-; SSE2-NEXT: retq
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    pavgb %xmm0, %xmm1
+; SSE2-NEXT:    movq %xmm1, (%rax)
+; SSE2-NEXT:    retq
 ;
-; AVX2-LABEL: avg_v8i8_2
+; AVX2-LABEL: avg_v8i8_2:
 ; AVX2:       # BB#0:
-; AVX2-NEXT: vmovq (%rdi), %xmm0
-; AVX2-NEXT: vmovq (%rsi), %xmm1
-; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovq %xmm0, (%rax)
-; AVX2-NEXT: retq
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, (%rax)
+; AVX2-NEXT:    retq
 ;
+; AVX512BW-LABEL: avg_v8i8_2:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vmovq (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovq (%rsi), %xmm1
+; AVX512BW-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
+; AVX512BW-NEXT:    retq
   %1 = load <8 x i8>, <8 x i8>* %a
   %2 = load <8 x i8>, <8 x i8>* %b
   %3 = zext <8 x i8> %1 to <8 x i32>
@@ -280,20 +327,19 @@ define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) {
 }
 
 define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) {
-; SSE2-LABEL: avg_v16i8_2
+; SSE2-LABEL: avg_v16i8_2:
 ; SSE2:       # BB#0:
-; SSE2-NEXT: movdqa (%rdi), %xmm0
-; SSE2-NEXT: pavgb (%rsi), %xmm0
-; SSE2-NEXT: movdqu %xmm0, (%rax)
-; SSE2-NEXT: retq
-;
-; AVX2-LABEL: avg_v16i8_2
-; AVX2:       # BB#0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vpavgb (%rsi), %xmm0, %xmm0
-; AVX2-NEXT: vmovdqu %xmm0, (%rax)
-; AVX2-NEXT: retq
-;
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    pavgb (%rsi), %xmm0
+; SSE2-NEXT:    movdqu %xmm0, (%rax)
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: avg_v16i8_2:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vpavgb (%rsi), %xmm0, %xmm0
+; AVX-NEXT:    vmovdqu %xmm0, (%rax)
+; AVX-NEXT:    retq
   %1 = load <16 x i8>, <16 x i8>* %a
   %2 = load <16 x i8>, <16 x i8>* %b
   %3 = zext <16 x i8> %1 to <16 x i32>
@@ -307,12 +353,20 @@ define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) {
 }
 
 define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) {
-; AVX2-LABEL: avg_v32i8_2
+; AVX2-LABEL: avg_v32i8_2:
 ; AVX2:       # BB#0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX2-NEXT:    vpavgb (%rsi), %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
+; AVX512BW-LABEL: avg_v32i8_2:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT:    vpavgb (%rsi), %ymm0, %ymm0
+; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX512BW-NEXT:    retq
   %1 = load <32 x i8>, <32 x i8>* %a
   %2 = load <32 x i8>, <32 x i8>* %b
   %3 = zext <32 x i8> %1 to <32 x i32>
@@ -326,13 +380,12 @@ define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) {
 }
 
 define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) {
-; AVX512BW-LABEL: avg_v64i8_2
+; AVX512BW-LABEL: avg_v64i8_2:
 ; AVX512BW:       # BB#0:
-; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0
-; AVX512BW-NEXT: vpavgb %zmm0, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
-; AVX512BW-NEXT: retq
-;
+; AVX512BW-NEXT:    vmovdqu8 (%rsi), %zmm0
+; AVX512BW-NEXT:    vpavgb %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqu8 %zmm0, (%rax)
+; AVX512BW-NEXT:    retq
   %1 = load <64 x i8>, <64 x i8>* %a
   %2 = load <64 x i8>, <64 x i8>* %b
   %3 = zext <64 x i8> %1 to <64 x i32>
@@ -347,22 +400,29 @@ define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) {
 
 
 define void @avg_v4i16_2(<4 x i16>* %a, <4 x i16>* %b) {
-; SSE2-LABEL: avg_v4i16_2
+; SSE2-LABEL: avg_v4i16_2:
 ; SSE2:       # BB#0:
-; SSE2-NEXT: movq (%rdi), %xmm0           # xmm0 = mem[0],zero
-; SSE2-NEXT: movq (%rsi), %xmm1           # xmm1 = mem[0],zero
-; SSE2-NEXT: pavgw %xmm0, %xmm1
-; SSE2-NEXT: movq %xmm1, (%rax)
-; SSE2-NEXT: retq
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    pavgw %xmm0, %xmm1
+; SSE2-NEXT:    movq %xmm1, (%rax)
+; SSE2-NEXT:    retq
 ;
-; AVX2-LABEL: avg_v4i16_2
+; AVX2-LABEL: avg_v4i16_2:
 ; AVX2:       # BB#0:
-; AVX2-NEXT: vmovq (%rdi), %xmm0
-; AVX2-NEXT: vmovq (%rsi), %xmm1
-; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovq %xmm0, (%rax)
-; AVX2-NEXT: retq
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, (%rax)
+; AVX2-NEXT:    retq
 ;
+; AVX512BW-LABEL: avg_v4i16_2:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vmovq (%rdi), %xmm0
+; AVX512BW-NEXT:    vmovq (%rsi), %xmm1
+; AVX512BW-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
+; AVX512BW-NEXT:    retq
   %1 = load <4 x i16>, <4 x i16>* %a
   %2 = load <4 x i16>, <4 x i16>* %b
   %3 = zext <4 x i16> %1 to <4 x i32>
@@ -376,20 +436,19 @@ define void @avg_v4i16_2(<4 x i16>* %a, <4 x i16>* %b) {
 }
 
 define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) {
-; SSE2-LABEL: avg_v8i16_2
+; SSE2-LABEL: avg_v8i16_2:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:  movdqa (%rdi), %xmm0
-; SSE2-NEXT:  pavgw (%rsi), %xmm0
-; SSE2-NEXT:  movdqu %xmm0, (%rax)
-; SSE2-NEXT:  retq
-;
-; AVX2-LABEL: avg_v8i16_2
-; AVX2:      # BB#0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vpavgw (%rsi), %xmm0, %xmm0
-; AVX2-NEXT: vmovdqu %xmm0, (%rax)
-; AVX2-NEXT: retq
-;
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    pavgw (%rsi), %xmm0
+; SSE2-NEXT:    movdqu %xmm0, (%rax)
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: avg_v8i16_2:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vpavgw (%rsi), %xmm0, %xmm0
+; AVX-NEXT:    vmovdqu %xmm0, (%rax)
+; AVX-NEXT:    retq
   %1 = load <8 x i16>, <8 x i16>* %a
   %2 = load <8 x i16>, <8 x i16>* %b
   %3 = zext <8 x i16> %1 to <8 x i32>
@@ -403,12 +462,20 @@ define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) {
 }
 
 define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) {
-; AVX2-LABEL: avg_v16i16_2
+; AVX2-LABEL: avg_v16i16_2:
 ; AVX2:       # BB#0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX2-NEXT:    vpavgw (%rsi), %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
+; AVX512BW-LABEL: avg_v16i16_2:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT:    vpavgw (%rsi), %ymm0, %ymm0
+; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX512BW-NEXT:    retq
   %1 = load <16 x i16>, <16 x i16>* %a
   %2 = load <16 x i16>, <16 x i16>* %b
   %3 = zext <16 x i16> %1 to <16 x i32>
@@ -422,13 +489,12 @@ define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) {
 }
 
 define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
-; AVX512BW-LABEL: avg_v32i16_2
+; AVX512BW-LABEL: avg_v32i16_2:
 ; AVX512BW:       # BB#0:
-; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
-; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
-; AVX512BW-NEXT: retq
-;
+; AVX512BW-NEXT:    vmovdqu16 (%rdi), %zmm0
+; AVX512BW-NEXT:    vpavgw (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqu16 %zmm0, (%rax)
+; AVX512BW-NEXT:    retq
   %1 = load <32 x i16>, <32 x i16>* %a
   %2 = load <32 x i16>, <32 x i16>* %b
   %3 = zext <32 x i16> %1 to <32 x i32>
@@ -442,20 +508,26 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
 }
 
 define void @avg_v4i8_const(<4 x i8>* %a) {
-; SSE2-LABEL: avg_v4i8_const
+; SSE2-LABEL: avg_v4i8_const:
 ; SSE2:       # BB#0:
-; SSE2-NEXT: movd (%rdi), %xmm0           # xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: pavgb {{.*}}, %xmm0
-; SSE2-NEXT: movd %xmm0, (%rax)
-; SSE2-NEXT: retq
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    pavgb {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    movd %xmm0, (%rax)
+; SSE2-NEXT:    retq
 ;
-; AVX2-LABEL: avg_v4i8_const
+; AVX2-LABEL: avg_v4i8_const:
 ; AVX2:       # BB#0:
-; AVX2-NEXT: vmovd (%rdi), %xmm0
-; AVX2-NEXT: vpavgb {{.*}}, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, (%rax)
-; AVX2-NEXT: retq
+; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vpavgb {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, (%rax)
+; AVX2-NEXT:    retq
 ;
+; AVX512BW-LABEL: avg_v4i8_const:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vmovd (%rdi), %xmm0
+; AVX512BW-NEXT:    vpavgb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT:    vmovd %xmm0, (%rax)
+; AVX512BW-NEXT:    retq
   %1 = load <4 x i8>, <4 x i8>* %a
   %2 = zext <4 x i8> %1 to <4 x i32>
   %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
@@ -466,20 +538,26 @@ define void @avg_v4i8_const(<4 x i8>* %a) {
 }
 
 define void @avg_v8i8_const(<8 x i8>* %a) {
-; SSE2-LABEL: avg_v8i8_const
+; SSE2-LABEL: avg_v8i8_const:
 ; SSE2:       # BB#0:
-; SSE2-NEXT: movq (%rdi), %xmm0           # xmm0 = mem[0],zero
-; SSE2-NEXT: pavgb {{.*}}, %xmm0
-; SSE2-NEXT: movq %xmm0, (%rax)
-; SSE2-NEXT: retq
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    pavgb {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    movq %xmm0, (%rax)
+; SSE2-NEXT:    retq
 ;
-; AVX2-LABEL: avg_v8i8_const
+; AVX2-LABEL: avg_v8i8_const:
 ; AVX2:       # BB#0:
-; AVX2-NEXT: vmovq (%rdi), %xmm0
-; AVX2-NEXT: vpavgb {{.*}}, %xmm0, %xmm0
-; AVX2-NEXT: vmovq %xmm0, (%rax)
-; AVX2-NEXT: retq
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vpavgb {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, (%rax)
+; AVX2-NEXT:    retq
 ;
+; AVX512BW-LABEL: avg_v8i8_const:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vmovq (%rdi), %xmm0
+; AVX512BW-NEXT:    vpavgb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
+; AVX512BW-NEXT:    retq
   %1 = load <8 x i8>, <8 x i8>* %a
   %2 = zext <8 x i8> %1 to <8 x i32>
   %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
@@ -490,20 +568,19 @@ define void @avg_v8i8_const(<8 x i8>* %a) {
 }
 
 define void @avg_v16i8_const(<16 x i8>* %a) {
-; SSE2-LABEL: avg_v16i8_const
+; SSE2-LABEL: avg_v16i8_const:
 ; SSE2:       # BB#0:
-; SSE2-NEXT: movdqa (%rdi), %xmm0
-; SSE2-NEXT: pavgb {{.*}}, %xmm0
-; SSE2-NEXT: movdqu %xmm0, (%rax)
-; SSE2-NEXT: retq
-;
-; AVX2-LABEL: avg_v16i8_const
-; AVX2:       # BB#0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vpavgb {{.*}}, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqu %xmm0, (%rax)
-; AVX2-NEXT: retq
-;
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    pavgb {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    movdqu %xmm0, (%rax)
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: avg_v16i8_const:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vpavgb {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vmovdqu %xmm0, (%rax)
+; AVX-NEXT:    retq
   %1 = load <16 x i8>, <16 x i8>* %a
   %2 = zext <16 x i8> %1 to <16 x i32>
   %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
@@ -514,53 +591,66 @@ define void @avg_v16i8_const(<16 x i8>* %a) {
 }
 
 define void @avg_v32i8_const(<32 x i8>* %a) {
-; AVX2-LABEL: avg_v32i8_const
+; AVX2-LABEL: avg_v32i8_const:
 ; AVX2:       # BB#0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vpavgb {{.*}}, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX2-NEXT:    vpavgb {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
+; AVX512BW-LABEL: avg_v32i8_const:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT:    vpavgb {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX512BW-NEXT:    retq
   %1 = load <32 x i8>, <32 x i8>* %a
   %2 = zext <32 x i8> %1 to <32 x i32>
   %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
-  %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 
+  %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   %5 = trunc <32 x i32> %4 to <32 x i8>
   store <32 x i8> %5, <32 x i8>* undef, align 4
   ret void
 }
 
 define void @avg_v64i8_const(<64 x i8>* %a) {
-; AVX512BW-LABEL: avg_v64i8_const
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0
-; AVX512BW-NEXT: vpavgb {{.*}}, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
-; AVX512BW-NEXT: retq
-;
+; AVX512BW-LABEL: avg_v64i8_const:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vmovdqu8 (%rdi), %zmm0
+; AVX512BW-NEXT:    vpavgb {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqu8 %zmm0, (%rax)
+; AVX512BW-NEXT:    retq
   %1 = load <64 x i8>, <64 x i8>* %a
   %2 = zext <64 x i8> %1 to <64 x i32>
   %3 = add nuw nsw <64 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
-  %4 = lshr <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 
+  %4 = lshr <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   %5 = trunc <64 x i32> %4 to <64 x i8>
   store <64 x i8> %5, <64 x i8>* undef, align 4
   ret void
 }
 
 define void @avg_v4i16_const(<4 x i16>* %a) {
-; SSE2-LABEL: avg_v4i16_const
+; SSE2-LABEL: avg_v4i16_const:
 ; SSE2:       # BB#0:
-; SSE2-NEXT: movq (%rdi), %xmm0
-; SSE2-NEXT: pavgw {{.*}}, %xmm0
-; SSE2-NEXT: movq %xmm0, (%rax)
-; SSE2-NEXT: retq
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    pavgw {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    movq %xmm0, (%rax)
+; SSE2-NEXT:    retq
 ;
-; AVX2-LABEL: avg_v4i16_const
+; AVX2-LABEL: avg_v4i16_const:
 ; AVX2:       # BB#0:
-; AVX2-NEXT: vmovq (%rdi), %xmm0
-; AVX2-NEXT: vpavgw {{.*}}, %xmm0, %xmm0
-; AVX2-NEXT: vmovq %xmm0, (%rax)
-; AVX2-NEXT: retq
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vpavgw {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, (%rax)
+; AVX2-NEXT:    retq
 ;
+; AVX512BW-LABEL: avg_v4i16_const:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vmovq (%rdi), %xmm0
+; AVX512BW-NEXT:    vpavgw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
+; AVX512BW-NEXT:    retq
   %1 = load <4 x i16>, <4 x i16>* %a
   %2 = zext <4 x i16> %1 to <4 x i32>
   %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
@@ -571,20 +661,19 @@ define void @avg_v4i16_const(<4 x i16>* %a) {
 }
 
 define void @avg_v8i16_const(<8 x i16>* %a) {
-; SSE2-LABEL: avg_v8i16_const
+; SSE2-LABEL: avg_v8i16_const:
 ; SSE2:       # BB#0:
-; SSE2-NEXT: movdqa (%rdi), %xmm0
-; SSE2-NEXT: pavgw {{.*}}, %xmm0
-; SSE2-NEXT: movdqu %xmm0, (%rax)
-; SSE2-NEXT: retq
-;
-; AVX2-LABEL: avg_v8i16_const
-; AVX2:       # BB#0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vpavgw {{.*}}, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqu %xmm0, (%rax)
-; AVX2-NEXT: retq
-;
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    pavgw {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    movdqu %xmm0, (%rax)
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: avg_v8i16_const:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX-NEXT:    vpavgw {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vmovdqu %xmm0, (%rax)
+; AVX-NEXT:    retq
   %1 = load <8 x i16>, <8 x i16>* %a
   %2 = zext <8 x i16> %1 to <8 x i32>
   %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
@@ -595,12 +684,20 @@ define void @avg_v8i16_const(<8 x i16>* %a) {
 }
 
 define void @avg_v16i16_const(<16 x i16>* %a) {
-; AVX2-LABEL: avg_v16i16_const
+; AVX2-LABEL: avg_v16i16_const:
 ; AVX2:       # BB#0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vpavgw {{.*}}, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX2-NEXT:    vpavgw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
+; AVX512BW-LABEL: avg_v16i16_const:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT:    vpavgw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX512BW-NEXT:    retq
   %1 = load <16 x i16>, <16 x i16>* %a
   %2 = zext <16 x i16> %1 to <16 x i32>
   %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
@@ -611,16 +708,16 @@ define void @avg_v16i16_const(<16 x i16>* %a) {
 }
 
 define void @avg_v32i16_const(<32 x i16>* %a) {
-; AVX512BW-LABEL: avg_v32i16_const
+; AVX512BW-LABEL: avg_v32i16_const:
 ; AVX512BW:       # BB#0:
-; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
-; AVX512BW-NEXT: vpavgw {{.*}}, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
-;
+; AVX512BW-NEXT:    vmovdqu16 (%rdi), %zmm0
+; AVX512BW-NEXT:    vpavgw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqu16 %zmm0, (%rax)
+; AVX512BW-NEXT:    retq
   %1 = load <32 x i16>, <32 x i16>* %a
   %2 = zext <32 x i16> %1 to <32 x i32>
   %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
-  %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 
+  %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   %5 = trunc <32 x i32> %4 to <32 x i16>
   store <32 x i16> %5, <32 x i16>* undef, align 4
   ret void

From ae5d4bfbbefb29e8759287f13bc1d3bb44474044 Mon Sep 17 00:00:00 2001
From: Paul Robinson <paul_robinson@playstation.sony.com>
Date: Mon, 30 Nov 2015 21:56:16 +0000
Subject: [PATCH 025/186] Have 'optnone' respect the -fast-isel=false option.

This is primarily useful for debugging optnone v. ISel issues.

Differential Revision: http://reviews.llvm.org/D14792


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254335 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/LangRef.rst                                  |  6 +++---
 include/llvm/Target/TargetMachine.h               |  3 +++
 lib/CodeGen/LLVMTargetMachine.cpp                 |  3 ++-
 lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp     | 10 +++++++---
 test/CodeGen/Mips/emergency-spill-slot-near-fp.ll |  4 ++--
 test/Feature/optnone-llc.ll                       |  8 +++++++-
 6 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index c7ea1c1bf236..be8e63bf071f 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -1289,9 +1289,9 @@ example:
     that are recognized by LLVM to handle asynchronous exceptions, such
     as SEH, will still provide their implementation defined semantics.
 ``optnone``
-    This function attribute indicates that the function is not optimized
-    by any optimization or code generator passes with the
-    exception of interprocedural optimization passes.
+    This function attribute indicates that most optimization passes will skip
+    this function, with the exception of interprocedural optimization passes.
+    Code generation defaults to the "fast" instruction selector.
     This attribute cannot be used together with the ``alwaysinline``
     attribute; this attribute is also incompatible
     with the ``minsize`` attribute and the ``optsize`` attribute.
diff --git a/include/llvm/Target/TargetMachine.h b/include/llvm/Target/TargetMachine.h
index b40e4a69a4d2..b7760a61806f 100644
--- a/include/llvm/Target/TargetMachine.h
+++ b/include/llvm/Target/TargetMachine.h
@@ -102,6 +102,7 @@ class TargetMachine {
   const MCSubtargetInfo *STI;
 
   unsigned RequireStructuredCFG : 1;
+  unsigned O0WantsFastISel : 1;
 
   /// This API is here to support the C API, deprecated in 3.7 release.
   /// This should never be used outside of legacy existing client.
@@ -190,6 +191,8 @@ class TargetMachine {
   void setOptLevel(CodeGenOpt::Level Level) const;
 
   void setFastISel(bool Enable) { Options.EnableFastISel = Enable; }
+  bool getO0WantsFastISel() { return O0WantsFastISel; }
+  void setO0WantsFastISel(bool Enable) { O0WantsFastISel = Enable; }
 
   bool shouldPrintMachineCode() const { return Options.PrintMachineCode; }
 
diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp
index 5b8c8258b285..da24cb17918b 100644
--- a/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/lib/CodeGen/LLVMTargetMachine.cpp
@@ -125,9 +125,10 @@ addPassesToGenerateCode(LLVMTargetMachine *TM, PassManagerBase &PM,
   PM.add(new MachineFunctionAnalysis(*TM, MFInitializer));
 
   // Enable FastISel with -fast, but allow that to be overridden.
+  TM->setO0WantsFastISel(EnableFastISelOption != cl::BOU_FALSE);
   if (EnableFastISelOption == cl::BOU_TRUE ||
       (TM->getOptLevel() == CodeGenOpt::None &&
-       EnableFastISelOption != cl::BOU_FALSE))
+       TM->getO0WantsFastISel()))
     TM->setFastISel(true);
 
   // Ask the target for an isel.
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index ebf071cb9946..3bbe5d4203bb 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -264,13 +264,17 @@ namespace llvm {
         return;
       IS.OptLevel = NewOptLevel;
       IS.TM.setOptLevel(NewOptLevel);
-      SavedFastISel = IS.TM.Options.EnableFastISel;
-      if (NewOptLevel == CodeGenOpt::None)
-        IS.TM.setFastISel(true);
       DEBUG(dbgs() << "\nChanging optimization level for Function "
             << IS.MF->getFunction()->getName() << "\n");
       DEBUG(dbgs() << "\tBefore: -O" << SavedOptLevel
             << " ; After: -O" << NewOptLevel << "\n");
+      SavedFastISel = IS.TM.Options.EnableFastISel;
+      if (NewOptLevel == CodeGenOpt::None) {
+        IS.TM.setFastISel(IS.TM.getO0WantsFastISel());
+        DEBUG(dbgs() << "\tFastISel is "
+              << (IS.TM.Options.EnableFastISel ? "enabled" : "disabled")
+              << "\n");
+      }
     }
 
     ~OptLevelChanger() {
diff --git a/test/CodeGen/Mips/emergency-spill-slot-near-fp.ll b/test/CodeGen/Mips/emergency-spill-slot-near-fp.ll
index 58dd16c9f9c8..54092b4e3ebe 100644
--- a/test/CodeGen/Mips/emergency-spill-slot-near-fp.ll
+++ b/test/CodeGen/Mips/emergency-spill-slot-near-fp.ll
@@ -1,5 +1,5 @@
 ; Check that register scavenging spill slot is close to $fp.
-; RUN: llc -march=mipsel -O0 -fast-isel=false < %s | FileCheck %s
+; RUN: llc -march=mipsel -O0 < %s | FileCheck %s
 
 ; CHECK: sw ${{.*}}, 8($sp)
 ; CHECK: lw ${{.*}}, 8($sp)
@@ -31,4 +31,4 @@ entry:
   ret i32 0
 }
 
-attributes #0 = { noinline optnone "no-frame-pointer-elim"="true" }
+attributes #0 = { noinline "no-frame-pointer-elim"="true" }
diff --git a/test/Feature/optnone-llc.ll b/test/Feature/optnone-llc.ll
index 015cc842d9f8..94f61efea4aa 100644
--- a/test/Feature/optnone-llc.ll
+++ b/test/Feature/optnone-llc.ll
@@ -3,11 +3,13 @@
 ; RUN: llc -O2 -debug %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=LLC-Ox
 ; RUN: llc -O3 -debug %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=LLC-Ox
 ; RUN: llc -misched-postra -debug %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=LLC-MORE
+; RUN: llc -O1 -debug-only=isel %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=FAST
+; RUN: llc -O1 -debug-only=isel -fast-isel=false %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=NOFAST
 
 ; REQUIRES: asserts, default_triple
 
 ; This test verifies that we don't run Machine Function optimizations
-; on optnone functions.
+; on optnone functions, and that we can turn off FastISel.
 
 ; Function Attrs: noinline optnone
 define i32 @_Z3fooi(i32 %x) #0 {
@@ -52,3 +54,7 @@ attributes #0 = { optnone noinline }
 
 ; Alternate post-RA scheduler.
 ; LLC-MORE: Skipping pass 'PostRA Machine Instruction Scheduler'
+
+; Selectively disable FastISel for optnone functions.
+; FAST:   FastISel is enabled
+; NOFAST: FastISel is disabled

From a586fd2c5665c75c1b2870b1361d794ac25caf9e Mon Sep 17 00:00:00 2001
From: Rafael Espindola <rafael.espindola@gmail.com>
Date: Mon, 30 Nov 2015 22:01:43 +0000
Subject: [PATCH 026/186] Start deciding earlier what to link.

A traditional linker is roughly split in symbol resolution and "copying
stuff".

The two tasks are badly mixed in lib/Linker.

This starts splitting them apart.

With this patch there are no direct call to linkGlobalValueBody or
linkGlobalValueProto. Everything is linked via WapValue.

This also includes a few fixes:
* A GV goes undefined if the comdat is dropped (comdat11.ll).
* We error if an internal GV goes undefined (comdat13.ll).
* We don't link an unused comdat.

The first two match the behavior of an ELF linker. The second one is
equivalent to running globaldce on the input.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254336 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Linker/LinkModules.cpp           | 157 ++++++++++++++-------------
 lib/Transforms/Utils/ValueMapper.cpp |   6 +-
 test/Linker/Inputs/comdat11.ll       |   9 ++
 test/Linker/Inputs/comdat13.ll       |   9 ++
 test/Linker/comdat11.ll              |  13 +++
 test/Linker/comdat12.ll              |   8 ++
 test/Linker/comdat13.ll              |  13 +++
 test/Linker/comdat9.ll               |   3 +
 8 files changed, 138 insertions(+), 80 deletions(-)
 create mode 100644 test/Linker/Inputs/comdat11.ll
 create mode 100644 test/Linker/Inputs/comdat13.ll
 create mode 100644 test/Linker/comdat11.ll
 create mode 100644 test/Linker/comdat12.ll
 create mode 100644 test/Linker/comdat13.ll

diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index cdf1decc8131..aeaa7eb90903 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -436,6 +436,8 @@ class ModuleLinker {
   /// references.
   bool DoneLinkingBodies;
 
+  bool HasError = false;
+
 public:
   ModuleLinker(Module *dstM, Linker::IdentifiedStructTypeSet &Set, Module *srcM,
                DiagnosticHandlerFunction DiagnosticHandler, unsigned Flags,
@@ -483,6 +485,7 @@ class ModuleLinker {
   /// Helper method for setting a message and returning an error code.
   bool emitError(const Twine &Message) {
     DiagnosticHandler(LinkDiagnosticInfo(DS_Error, Message));
+    HasError = true;
     return true;
   }
 
@@ -531,6 +534,7 @@ class ModuleLinker {
   void upgradeMismatchedGlobalArray(StringRef Name);
   void upgradeMismatchedGlobals();
 
+  bool linkIfNeeded(GlobalValue &GV);
   bool linkAppendingVarProto(GlobalVariable *DstGV,
                              const GlobalVariable *SrcGV);
 
@@ -904,16 +908,12 @@ Value *ModuleLinker::materializeDeclFor(Value *V) {
   if (doneLinkingBodies())
     return nullptr;
 
-  GlobalValue *DGV = copyGlobalValueProto(TypeMap, SGV);
-
-  if (Comdat *SC = SGV->getComdat()) {
-    if (auto *DGO = dyn_cast<GlobalObject>(DGV)) {
-      Comdat *DC = DstM->getOrInsertComdat(SC->getName());
-      DGO->setComdat(DC);
-    }
-  }
-
-  return DGV;
+  linkGlobalValueProto(SGV);
+  if (HasError)
+    return nullptr;
+  Value *Ret = ValueMap[SGV];
+  assert(Ret);
+  return Ret;
 }
 
 void ValueMaterializerTy::materializeInitFor(GlobalValue *New,
@@ -922,15 +922,31 @@ void ValueMaterializerTy::materializeInitFor(GlobalValue *New,
 }
 
 void ModuleLinker::materializeInitFor(GlobalValue *New, GlobalValue *Old) {
+  if (auto *F = dyn_cast<Function>(New)) {
+    if (!F->isDeclaration())
+      return;
+  } else if (auto *V = dyn_cast<GlobalVariable>(New)) {
+    if (V->hasInitializer())
+      return;
+  } else {
+    auto *A = cast<GlobalAlias>(New);
+    if (A->getAliasee())
+      return;
+  }
+
+  if (Old->isDeclaration())
+    return;
+
   if (isPerformingImport() && !doImportAsDefinition(Old))
     return;
 
-  // Skip declarations that ValueMaterializer may have created in
-  // case we link in only some of SrcM.
-  if (shouldLinkOnlyNeeded() && Old->isDeclaration())
+  if (DoNotLinkFromSource.count(Old)) {
+    if (!New->hasExternalLinkage() && !New->hasExternalWeakLinkage() &&
+        !New->hasAppendingLinkage())
+      emitError("Declaration points to discarded value");
     return;
+  }
 
-  assert(!Old->isDeclaration() && "users should not pass down decls");
   linkGlobalValueBody(*Old);
 }
 
@@ -1405,7 +1421,6 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) {
     std::tie(SK, LinkFromSrc) = ComdatsChosen[SC];
     C = DstM->getOrInsertComdat(SC->getName());
     C->setSelectionKind(SK);
-    ComdatMembers[SC].push_back(SGV);
   } else if (DGV) {
     if (shouldLinkFromSource(LinkFromSrc, *DGV, *SGV))
       return true;
@@ -1425,31 +1440,12 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) {
   if (DGV)
     HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr();
 
-  if (!LinkFromSrc && !DGV)
-    return false;
-
   GlobalValue *NewGV;
-  if (!LinkFromSrc) {
+  if (!LinkFromSrc && DGV) {
     NewGV = DGV;
     // When linking from source we setVisibility from copyGlobalValueProto.
     setVisibility(NewGV, SGV, DGV);
   } else {
-    // If the GV is to be lazily linked, don't create it just yet.
-    // The ValueMaterializerTy will deal with creating it if it's used.
-    if (!DGV && !shouldOverrideFromSrc() && SGV != ImportFunction &&
-        (SGV->hasLocalLinkage() || SGV->hasLinkOnceLinkage() ||
-         SGV->hasAvailableExternallyLinkage())) {
-      DoNotLinkFromSource.insert(SGV);
-      return false;
-    }
-
-    // When we only want to link in unresolved dependencies, blacklist
-    // the symbol unless unless DestM has a matching declaration (DGV).
-    if (shouldLinkOnlyNeeded() && !(DGV && DGV->isDeclaration())) {
-      DoNotLinkFromSource.insert(SGV);
-      return false;
-    }
-
     NewGV = copyGlobalValueProto(TypeMap, SGV, DGV);
 
     if (isPerformingImport() && !doImportAsDefinition(SGV))
@@ -1459,7 +1455,7 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) {
   NewGV->setUnnamedAddr(HasUnnamedAddr);
 
   if (auto *NewGO = dyn_cast<GlobalObject>(NewGV)) {
-    if (C)
+    if (C && LinkFromSrc)
       NewGO->setComdat(C);
 
     if (DGV && DGV->hasCommonLinkage() && SGV->hasCommonLinkage())
@@ -1842,6 +1838,38 @@ static std::string mergeTriples(const Triple &SrcTriple, const Triple &DstTriple
   return DstTriple.str();
 }
 
+bool ModuleLinker::linkIfNeeded(GlobalValue &GV) {
+  GlobalValue *DGV = getLinkedToGlobal(&GV);
+
+  if (shouldLinkOnlyNeeded() && !(DGV && DGV->isDeclaration()))
+    return false;
+
+  if (DGV && !GV.hasLocalLinkage()) {
+    GlobalValue::VisibilityTypes Visibility =
+        getMinVisibility(DGV->getVisibility(), GV.getVisibility());
+    DGV->setVisibility(Visibility);
+    GV.setVisibility(Visibility);
+  }
+
+  if (const Comdat *SC = GV.getComdat()) {
+    bool LinkFromSrc;
+    Comdat::SelectionKind SK;
+    std::tie(SK, LinkFromSrc) = ComdatsChosen[SC];
+    if (!LinkFromSrc) {
+      DoNotLinkFromSource.insert(&GV);
+      return false;
+    }
+  }
+
+  if (!DGV && !shouldOverrideFromSrc() &&
+      (GV.hasLocalLinkage() || GV.hasLinkOnceLinkage() ||
+       GV.hasAvailableExternallyLinkage())) {
+    return false;
+  }
+  MapValue(&GV, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer);
+  return HasError;
+}
+
 bool ModuleLinker::run() {
   assert(DstM && "Null destination module");
   assert(SrcM && "Null source module");
@@ -1901,24 +1929,30 @@ bool ModuleLinker::run() {
   // Upgrade mismatched global arrays.
   upgradeMismatchedGlobals();
 
+  for (GlobalVariable &GV : SrcM->globals())
+    if (const Comdat *SC = GV.getComdat())
+      ComdatMembers[SC].push_back(&GV);
+
+  for (Function &SF : *SrcM)
+    if (const Comdat *SC = SF.getComdat())
+      ComdatMembers[SC].push_back(&SF);
+
+  for (GlobalAlias &GA : SrcM->aliases())
+    if (const Comdat *SC = GA.getComdat())
+      ComdatMembers[SC].push_back(&GA);
+
   // Insert all of the globals in src into the DstM module... without linking
   // initializers (which could refer to functions not yet mapped over).
   for (GlobalVariable &GV : SrcM->globals())
-    if (linkGlobalValueProto(&GV))
+    if (linkIfNeeded(GV))
       return true;
 
-  // Link the functions together between the two modules, without doing function
-  // bodies... this just adds external function prototypes to the DstM
-  // function...  We do this so that when we begin processing function bodies,
-  // all of the global values that may be referenced are available in our
-  // ValueMap.
-  for (Function &F :*SrcM)
-    if (linkGlobalValueProto(&F))
+  for (Function &SF : *SrcM)
+    if (linkIfNeeded(SF))
       return true;
 
-  // If there were any aliases, link them now.
   for (GlobalAlias &GA : SrcM->aliases())
-    if (linkGlobalValueProto(&GA))
+    if (linkIfNeeded(GA))
       return true;
 
   for (AppendingVarInfo &AppendingVar : AppendingVars)
@@ -1933,37 +1967,6 @@ bool ModuleLinker::run() {
       MapValue(GV, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer);
   }
 
-  // Link in the function bodies that are defined in the source module into
-  // DstM.
-  for (Function &SF : *SrcM) {
-    // Skip if no body (function is external).
-    if (SF.isDeclaration())
-      continue;
-
-    // Skip if not linking from source.
-    if (DoNotLinkFromSource.count(&SF))
-      continue;
-
-    if (linkGlobalValueBody(SF))
-      return true;
-  }
-
-  // Resolve all uses of aliases with aliasees.
-  for (GlobalAlias &Src : SrcM->aliases()) {
-    if (DoNotLinkFromSource.count(&Src))
-      continue;
-    linkGlobalValueBody(Src);
-  }
-
-  // Update the initializers in the DstM module now that all globals that may
-  // be referenced are in DstM.
-  for (GlobalVariable &Src : SrcM->globals()) {
-    // Only process initialized GV's or ones not already in dest.
-    if (!Src.hasInitializer() || DoNotLinkFromSource.count(&Src))
-      continue;
-    linkGlobalValueBody(Src);
-  }
-
   // Note that we are done linking global value bodies. This prevents
   // metadata linking from creating new references.
   DoneLinkingBodies = true;
diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp
index 0a63c1d5153c..00a8984845dd 100644
--- a/lib/Transforms/Utils/ValueMapper.cpp
+++ b/lib/Transforms/Utils/ValueMapper.cpp
@@ -41,9 +41,9 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,
     if (Value *NewV =
             Materializer->materializeDeclFor(const_cast<Value *>(V))) {
       VM[V] = NewV;
-      if (auto *GV = dyn_cast<GlobalValue>(V))
-        Materializer->materializeInitFor(cast<GlobalValue>(NewV),
-                                         const_cast<GlobalValue *>(GV));
+      if (auto *NewGV = dyn_cast<GlobalValue>(NewV))
+        Materializer->materializeInitFor(
+            NewGV, const_cast<GlobalValue *>(cast<GlobalValue>(V)));
       return NewV;
     }
   }
diff --git a/test/Linker/Inputs/comdat11.ll b/test/Linker/Inputs/comdat11.ll
new file mode 100644
index 000000000000..5b7f74cf0b24
--- /dev/null
+++ b/test/Linker/Inputs/comdat11.ll
@@ -0,0 +1,9 @@
+$foo = comdat any
+@foo = global i8 1, comdat
+define void @zed() {
+  call void @bar()
+  ret void
+}
+define void @bar() comdat($foo) {
+  ret void
+}
diff --git a/test/Linker/Inputs/comdat13.ll b/test/Linker/Inputs/comdat13.ll
new file mode 100644
index 000000000000..a2d16bd261b5
--- /dev/null
+++ b/test/Linker/Inputs/comdat13.ll
@@ -0,0 +1,9 @@
+$foo = comdat any
+@foo = global i8 1, comdat
+define void @zed() {
+  call void @bar()
+  ret void
+}
+define internal void @bar() comdat($foo) {
+  ret void
+}
diff --git a/test/Linker/comdat11.ll b/test/Linker/comdat11.ll
new file mode 100644
index 000000000000..dbade4104fe3
--- /dev/null
+++ b/test/Linker/comdat11.ll
@@ -0,0 +1,13 @@
+; RUN: llvm-link -S %s %p/Inputs/comdat11.ll -o - | FileCheck %s
+
+$foo = comdat any
+@foo = global i8 0, comdat
+
+; CHECK: @foo = global i8 0, comdat
+
+; CHECK: define void @zed() {
+; CHECK:   call void @bar()
+; CHECK:   ret void
+; CHECK: }
+
+; CHECK: declare void @bar()
diff --git a/test/Linker/comdat12.ll b/test/Linker/comdat12.ll
new file mode 100644
index 000000000000..d06e222b63ac
--- /dev/null
+++ b/test/Linker/comdat12.ll
@@ -0,0 +1,8 @@
+; RUN: llvm-link %s -S -o - | FileCheck %s
+
+$foo = comdat largest
+define internal void @foo() comdat($foo) {
+  ret void
+}
+
+; CHECK-NOT: foo
diff --git a/test/Linker/comdat13.ll b/test/Linker/comdat13.ll
new file mode 100644
index 000000000000..a8e51f04ae11
--- /dev/null
+++ b/test/Linker/comdat13.ll
@@ -0,0 +1,13 @@
+; RUN: not llvm-link -S %s %p/Inputs/comdat13.ll -o %t.ll 2>&1 | FileCheck %s
+
+; In Inputs/comdat13.ll a function not in the $foo comdat (zed) references an
+; internal function in the comdat $foo.
+; We might want to have the verifier reject that, but for now we at least check
+; that the linker produces an error.
+; This is the IR equivalent of the "relocation refers to discarded section" in
+; an ELF linker.
+
+; CHECK: Declaration points to discarded value
+
+$foo = comdat any
+@foo = global i8 0, comdat
diff --git a/test/Linker/comdat9.ll b/test/Linker/comdat9.ll
index 274957401aac..4f6f2cfb845d 100644
--- a/test/Linker/comdat9.ll
+++ b/test/Linker/comdat9.ll
@@ -14,6 +14,9 @@ $f2 = comdat largest
 define internal void @f2() comdat($f2) {
   ret void
 }
+define void @f3() comdat($f2) {
+  ret void
+}
 
 ; CHECK-DAG: $f2 = comdat largest
 ; CHECK-DAG: define internal void @f2() comdat {

From 0f73ee481b5c64201c9be66a309d353ee0a677c8 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 30 Nov 2015 22:22:06 +0000
Subject: [PATCH 027/186] [X86][FMA4] Prefer FMA4 to FMA

We currently output FMA instructions on targets which support both FMA4 + FMA (i.e. later Bulldozer CPUS bdver2/bdver3/bdver4).

This patch flips this so FMA4 is preferred; this is for several reasons:

1 - FMA4 is non-destructive reducing the need for mov instructions.
2 - Its more straighforward to commute and fold inputs (although the recent work on FMA has reduced this difference).
3 - All supported targets have FMA4 performance equal or better to FMA - Piledriver (bdver2) in particular has half the throughput when executing FMA instructions.

Its looks like no future AMD processor lines will support FMA4 after the Bulldozer series so we're not causing problems for later CPUs.

Differential Revision: http://reviews.llvm.org/D14997

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254339 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86Subtarget.h         | 7 ++++---
 test/CodeGen/X86/fma-commute-x86.ll   | 2 +-
 test/CodeGen/X86/fma_patterns.ll      | 2 +-
 test/CodeGen/X86/fma_patterns_wide.ll | 2 +-
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 425bc2482e9b..eb0199aecbeb 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -354,9 +354,10 @@ class X86Subtarget final : public X86GenSubtargetInfo {
   bool hasXSAVEC() const { return HasXSAVEC; }
   bool hasXSAVES() const { return HasXSAVES; }
   bool hasPCLMUL() const { return HasPCLMUL; }
-  bool hasFMA() const { return HasFMA; }
-  // FIXME: Favor FMA when both are enabled. Is this the right thing to do?
-  bool hasFMA4() const { return HasFMA4 && !HasFMA; }
+  // Prefer FMA4 to FMA - its better for commutation/memory folding and
+  // has equal or better performance on all supported targets.
+  bool hasFMA() const { return HasFMA && !HasFMA4; }
+  bool hasFMA4() const { return HasFMA4; }
   bool hasXOP() const { return HasXOP; }
   bool hasTBM() const { return HasTBM; }
   bool hasMOVBE() const { return HasMOVBE; }
diff --git a/test/CodeGen/X86/fma-commute-x86.ll b/test/CodeGen/X86/fma-commute-x86.ll
index 9e57b00bc0b4..9a368792133b 100644
--- a/test/CodeGen/X86/fma-commute-x86.ll
+++ b/test/CodeGen/X86/fma-commute-x86.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-pc-win32 -mattr=+fma,+fma4 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-win32 -mattr=+fma | FileCheck %s
 ; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 -mattr=-fma4 | FileCheck %s
 
 attributes #0 = { nounwind }
diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll
index 942d799d9761..e3295e45823c 100644
--- a/test/CodeGen/X86/fma_patterns.ll
+++ b/test/CodeGen/X86/fma_patterns.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=FMA
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=FMA
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=FMA4
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=FMA4
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=AVX512
 
diff --git a/test/CodeGen/X86/fma_patterns_wide.ll b/test/CodeGen/X86/fma_patterns_wide.ll
index de77a27ad2b6..f412c174fe37 100644
--- a/test/CodeGen/X86/fma_patterns_wide.ll
+++ b/test/CodeGen/X86/fma_patterns_wide.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -fp-contract=fast | FileCheck %s --check-prefix=FMA
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast | FileCheck %s --check-prefix=FMA
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast | FileCheck %s --check-prefix=FMA4
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -fp-contract=fast | FileCheck %s --check-prefix=FMA4
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -fp-contract=fast | FileCheck %s --check-prefix=AVX512
 

From 7cb4a767f86b71670480bdfb99710371d6c93d1c Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 30 Nov 2015 22:39:36 +0000
Subject: [PATCH 028/186] [InstCombine] add tests to show potential vector IR
 shuffle transforms

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254342 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/insert-extract-shuffle.ll     | 53 +++++++++++++++++--
 1 file changed, 48 insertions(+), 5 deletions(-)

diff --git a/test/Transforms/InstCombine/insert-extract-shuffle.ll b/test/Transforms/InstCombine/insert-extract-shuffle.ll
index 8929c82def7b..6841cd64abe3 100644
--- a/test/Transforms/InstCombine/insert-extract-shuffle.ll
+++ b/test/Transforms/InstCombine/insert-extract-shuffle.ll
@@ -24,14 +24,57 @@ define <4 x i16> @test2(<8 x i16> %in, <8 x i16> %in2) {
   ret <4 x i16> %vec.3
 }
 
-define <2 x i64> @test_vcopyq_lane_p64(<2 x i64> %a, <1 x i64> %b) #0 {
+define <2 x i64> @test_vcopyq_lane_p64(<2 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: @test_vcopyq_lane_p64
-; CHECK: extractelement
-; CHECK: insertelement
-; CHECK-NOT: shufflevector
-entry:
+; CHECK-NEXT: extractelement
+; CHECK-NEXT: insertelement
+; CHECK-NEXT: ret <2 x i64> %res
   %elt = extractelement <1 x i64> %b, i32 0
   %res = insertelement <2 x i64> %a, i64 %elt, i32 1
   ret <2 x i64> %res
 }
 
+; PR2109: https://llvm.org/bugs/show_bug.cgi?id=2109
+
+define <4 x float> @widen_extract2(<4 x float> %ins, <2 x float> %ext) {
+; CHECK-LABEL: @widen_extract2(
+; CHECK-NEXT: extractelement
+; CHECK-NEXT: extractelement
+; CHECK-NEXT: insertelement
+; CHECK-NEXT: insertelement
+; CHECK-NEXT: ret <4 x float> %i2
+  %e1 = extractelement <2 x float> %ext, i32 0
+  %e2 = extractelement <2 x float> %ext, i32 1
+  %i1 = insertelement <4 x float> %ins, float %e1, i32 1
+  %i2 = insertelement <4 x float> %i1, float %e2, i32 3
+  ret <4 x float> %i2
+}
+
+define <4 x float> @widen_extract3(<4 x float> %ins, <3 x float> %ext) {
+; CHECK-LABEL: @widen_extract3(
+; CHECK-NEXT: extractelement
+; CHECK-NEXT: extractelement
+; CHECK-NEXT: extractelement
+; CHECK-NEXT: insertelement
+; CHECK-NEXT: insertelement
+; CHECK-NEXT: insertelement
+; CHECK-NEXT: ret <4 x float> %i3
+  %e1 = extractelement <3 x float> %ext, i32 0
+  %e2 = extractelement <3 x float> %ext, i32 1
+  %e3 = extractelement <3 x float> %ext, i32 2
+  %i1 = insertelement <4 x float> %ins, float %e1, i32 2
+  %i2 = insertelement <4 x float> %i1, float %e2, i32 1
+  %i3 = insertelement <4 x float> %i2, float %e3, i32 0
+  ret <4 x float> %i3
+}
+
+define <8 x float> @too_wide(<8 x float> %ins, <2 x float> %ext) {
+; CHECK-LABEL: @too_wide(
+; CHECK-NEXT: extractelement
+; CHECK-NEXT: insertelement
+; CHECK-NEXT: ret <8 x float> %i1
+  %e1 = extractelement <2 x float> %ext, i32 0
+  %i1 = insertelement <8 x float> %ins, float %e1, i32 2
+  ret <8 x float> %i1
+}
+

From 36d161829490a6e8655db29b13bb247ea4717800 Mon Sep 17 00:00:00 2001
From: Rafael Espindola <rafael.espindola@gmail.com>
Date: Mon, 30 Nov 2015 23:05:25 +0000
Subject: [PATCH 029/186] Disable a consistency check.

Trying to figure out why it fails on a bot but passes locally.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254344 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Linker/LinkModules.cpp     |  6 +-----
 test/Linker/Inputs/comdat13.ll |  9 ---------
 test/Linker/comdat13.ll        | 13 -------------
 3 files changed, 1 insertion(+), 27 deletions(-)
 delete mode 100644 test/Linker/Inputs/comdat13.ll
 delete mode 100644 test/Linker/comdat13.ll

diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index aeaa7eb90903..edee55a1f9f6 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -940,12 +940,8 @@ void ModuleLinker::materializeInitFor(GlobalValue *New, GlobalValue *Old) {
   if (isPerformingImport() && !doImportAsDefinition(Old))
     return;
 
-  if (DoNotLinkFromSource.count(Old)) {
-    if (!New->hasExternalLinkage() && !New->hasExternalWeakLinkage() &&
-        !New->hasAppendingLinkage())
-      emitError("Declaration points to discarded value");
+  if (DoNotLinkFromSource.count(Old))
     return;
-  }
 
   linkGlobalValueBody(*Old);
 }
diff --git a/test/Linker/Inputs/comdat13.ll b/test/Linker/Inputs/comdat13.ll
deleted file mode 100644
index a2d16bd261b5..000000000000
--- a/test/Linker/Inputs/comdat13.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-$foo = comdat any
-@foo = global i8 1, comdat
-define void @zed() {
-  call void @bar()
-  ret void
-}
-define internal void @bar() comdat($foo) {
-  ret void
-}
diff --git a/test/Linker/comdat13.ll b/test/Linker/comdat13.ll
deleted file mode 100644
index a8e51f04ae11..000000000000
--- a/test/Linker/comdat13.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: not llvm-link -S %s %p/Inputs/comdat13.ll -o %t.ll 2>&1 | FileCheck %s
-
-; In Inputs/comdat13.ll a function not in the $foo comdat (zed) references an
-; internal function in the comdat $foo.
-; We might want to have the verifier reject that, but for now we at least check
-; that the linker produces an error.
-; This is the IR equivalent of the "relocation refers to discarded section" in
-; an ELF linker.
-
-; CHECK: Declaration points to discarded value
-
-$foo = comdat any
-@foo = global i8 0, comdat

From 7c4c88cf5f63239ee60f335af4d66ed363c92310 Mon Sep 17 00:00:00 2001
From: Rafael Espindola <rafael.espindola@gmail.com>
Date: Mon, 30 Nov 2015 23:54:19 +0000
Subject: [PATCH 030/186] This reverts commit r254336 and r254344.

They broke a bot and I am debugging why.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254347 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Linker/LinkModules.cpp           | 153 ++++++++++++++-------------
 lib/Transforms/Utils/ValueMapper.cpp |   6 +-
 test/Linker/Inputs/comdat11.ll       |   9 --
 test/Linker/comdat11.ll              |  13 ---
 test/Linker/comdat12.ll              |   8 --
 test/Linker/comdat9.ll               |   3 -
 6 files changed, 80 insertions(+), 112 deletions(-)
 delete mode 100644 test/Linker/Inputs/comdat11.ll
 delete mode 100644 test/Linker/comdat11.ll
 delete mode 100644 test/Linker/comdat12.ll

diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index edee55a1f9f6..cdf1decc8131 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -436,8 +436,6 @@ class ModuleLinker {
   /// references.
   bool DoneLinkingBodies;
 
-  bool HasError = false;
-
 public:
   ModuleLinker(Module *dstM, Linker::IdentifiedStructTypeSet &Set, Module *srcM,
                DiagnosticHandlerFunction DiagnosticHandler, unsigned Flags,
@@ -485,7 +483,6 @@ class ModuleLinker {
   /// Helper method for setting a message and returning an error code.
   bool emitError(const Twine &Message) {
     DiagnosticHandler(LinkDiagnosticInfo(DS_Error, Message));
-    HasError = true;
     return true;
   }
 
@@ -534,7 +531,6 @@ class ModuleLinker {
   void upgradeMismatchedGlobalArray(StringRef Name);
   void upgradeMismatchedGlobals();
 
-  bool linkIfNeeded(GlobalValue &GV);
   bool linkAppendingVarProto(GlobalVariable *DstGV,
                              const GlobalVariable *SrcGV);
 
@@ -908,12 +904,16 @@ Value *ModuleLinker::materializeDeclFor(Value *V) {
   if (doneLinkingBodies())
     return nullptr;
 
-  linkGlobalValueProto(SGV);
-  if (HasError)
-    return nullptr;
-  Value *Ret = ValueMap[SGV];
-  assert(Ret);
-  return Ret;
+  GlobalValue *DGV = copyGlobalValueProto(TypeMap, SGV);
+
+  if (Comdat *SC = SGV->getComdat()) {
+    if (auto *DGO = dyn_cast<GlobalObject>(DGV)) {
+      Comdat *DC = DstM->getOrInsertComdat(SC->getName());
+      DGO->setComdat(DC);
+    }
+  }
+
+  return DGV;
 }
 
 void ValueMaterializerTy::materializeInitFor(GlobalValue *New,
@@ -922,27 +922,15 @@ void ValueMaterializerTy::materializeInitFor(GlobalValue *New,
 }
 
 void ModuleLinker::materializeInitFor(GlobalValue *New, GlobalValue *Old) {
-  if (auto *F = dyn_cast<Function>(New)) {
-    if (!F->isDeclaration())
-      return;
-  } else if (auto *V = dyn_cast<GlobalVariable>(New)) {
-    if (V->hasInitializer())
-      return;
-  } else {
-    auto *A = cast<GlobalAlias>(New);
-    if (A->getAliasee())
-      return;
-  }
-
-  if (Old->isDeclaration())
-    return;
-
   if (isPerformingImport() && !doImportAsDefinition(Old))
     return;
 
-  if (DoNotLinkFromSource.count(Old))
+  // Skip declarations that ValueMaterializer may have created in
+  // case we link in only some of SrcM.
+  if (shouldLinkOnlyNeeded() && Old->isDeclaration())
     return;
 
+  assert(!Old->isDeclaration() && "users should not pass down decls");
   linkGlobalValueBody(*Old);
 }
 
@@ -1417,6 +1405,7 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) {
     std::tie(SK, LinkFromSrc) = ComdatsChosen[SC];
     C = DstM->getOrInsertComdat(SC->getName());
     C->setSelectionKind(SK);
+    ComdatMembers[SC].push_back(SGV);
   } else if (DGV) {
     if (shouldLinkFromSource(LinkFromSrc, *DGV, *SGV))
       return true;
@@ -1436,12 +1425,31 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) {
   if (DGV)
     HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr();
 
+  if (!LinkFromSrc && !DGV)
+    return false;
+
   GlobalValue *NewGV;
-  if (!LinkFromSrc && DGV) {
+  if (!LinkFromSrc) {
     NewGV = DGV;
     // When linking from source we setVisibility from copyGlobalValueProto.
     setVisibility(NewGV, SGV, DGV);
   } else {
+    // If the GV is to be lazily linked, don't create it just yet.
+    // The ValueMaterializerTy will deal with creating it if it's used.
+    if (!DGV && !shouldOverrideFromSrc() && SGV != ImportFunction &&
+        (SGV->hasLocalLinkage() || SGV->hasLinkOnceLinkage() ||
+         SGV->hasAvailableExternallyLinkage())) {
+      DoNotLinkFromSource.insert(SGV);
+      return false;
+    }
+
+    // When we only want to link in unresolved dependencies, blacklist
+    // the symbol unless unless DestM has a matching declaration (DGV).
+    if (shouldLinkOnlyNeeded() && !(DGV && DGV->isDeclaration())) {
+      DoNotLinkFromSource.insert(SGV);
+      return false;
+    }
+
     NewGV = copyGlobalValueProto(TypeMap, SGV, DGV);
 
     if (isPerformingImport() && !doImportAsDefinition(SGV))
@@ -1451,7 +1459,7 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) {
   NewGV->setUnnamedAddr(HasUnnamedAddr);
 
   if (auto *NewGO = dyn_cast<GlobalObject>(NewGV)) {
-    if (C && LinkFromSrc)
+    if (C)
       NewGO->setComdat(C);
 
     if (DGV && DGV->hasCommonLinkage() && SGV->hasCommonLinkage())
@@ -1834,38 +1842,6 @@ static std::string mergeTriples(const Triple &SrcTriple, const Triple &DstTriple
   return DstTriple.str();
 }
 
-bool ModuleLinker::linkIfNeeded(GlobalValue &GV) {
-  GlobalValue *DGV = getLinkedToGlobal(&GV);
-
-  if (shouldLinkOnlyNeeded() && !(DGV && DGV->isDeclaration()))
-    return false;
-
-  if (DGV && !GV.hasLocalLinkage()) {
-    GlobalValue::VisibilityTypes Visibility =
-        getMinVisibility(DGV->getVisibility(), GV.getVisibility());
-    DGV->setVisibility(Visibility);
-    GV.setVisibility(Visibility);
-  }
-
-  if (const Comdat *SC = GV.getComdat()) {
-    bool LinkFromSrc;
-    Comdat::SelectionKind SK;
-    std::tie(SK, LinkFromSrc) = ComdatsChosen[SC];
-    if (!LinkFromSrc) {
-      DoNotLinkFromSource.insert(&GV);
-      return false;
-    }
-  }
-
-  if (!DGV && !shouldOverrideFromSrc() &&
-      (GV.hasLocalLinkage() || GV.hasLinkOnceLinkage() ||
-       GV.hasAvailableExternallyLinkage())) {
-    return false;
-  }
-  MapValue(&GV, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer);
-  return HasError;
-}
-
 bool ModuleLinker::run() {
   assert(DstM && "Null destination module");
   assert(SrcM && "Null source module");
@@ -1925,30 +1901,24 @@ bool ModuleLinker::run() {
   // Upgrade mismatched global arrays.
   upgradeMismatchedGlobals();
 
-  for (GlobalVariable &GV : SrcM->globals())
-    if (const Comdat *SC = GV.getComdat())
-      ComdatMembers[SC].push_back(&GV);
-
-  for (Function &SF : *SrcM)
-    if (const Comdat *SC = SF.getComdat())
-      ComdatMembers[SC].push_back(&SF);
-
-  for (GlobalAlias &GA : SrcM->aliases())
-    if (const Comdat *SC = GA.getComdat())
-      ComdatMembers[SC].push_back(&GA);
-
   // Insert all of the globals in src into the DstM module... without linking
   // initializers (which could refer to functions not yet mapped over).
   for (GlobalVariable &GV : SrcM->globals())
-    if (linkIfNeeded(GV))
+    if (linkGlobalValueProto(&GV))
       return true;
 
-  for (Function &SF : *SrcM)
-    if (linkIfNeeded(SF))
+  // Link the functions together between the two modules, without doing function
+  // bodies... this just adds external function prototypes to the DstM
+  // function...  We do this so that when we begin processing function bodies,
+  // all of the global values that may be referenced are available in our
+  // ValueMap.
+  for (Function &F :*SrcM)
+    if (linkGlobalValueProto(&F))
       return true;
 
+  // If there were any aliases, link them now.
   for (GlobalAlias &GA : SrcM->aliases())
-    if (linkIfNeeded(GA))
+    if (linkGlobalValueProto(&GA))
       return true;
 
   for (AppendingVarInfo &AppendingVar : AppendingVars)
@@ -1963,6 +1933,37 @@ bool ModuleLinker::run() {
       MapValue(GV, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer);
   }
 
+  // Link in the function bodies that are defined in the source module into
+  // DstM.
+  for (Function &SF : *SrcM) {
+    // Skip if no body (function is external).
+    if (SF.isDeclaration())
+      continue;
+
+    // Skip if not linking from source.
+    if (DoNotLinkFromSource.count(&SF))
+      continue;
+
+    if (linkGlobalValueBody(SF))
+      return true;
+  }
+
+  // Resolve all uses of aliases with aliasees.
+  for (GlobalAlias &Src : SrcM->aliases()) {
+    if (DoNotLinkFromSource.count(&Src))
+      continue;
+    linkGlobalValueBody(Src);
+  }
+
+  // Update the initializers in the DstM module now that all globals that may
+  // be referenced are in DstM.
+  for (GlobalVariable &Src : SrcM->globals()) {
+    // Only process initialized GV's or ones not already in dest.
+    if (!Src.hasInitializer() || DoNotLinkFromSource.count(&Src))
+      continue;
+    linkGlobalValueBody(Src);
+  }
+
   // Note that we are done linking global value bodies. This prevents
   // metadata linking from creating new references.
   DoneLinkingBodies = true;
diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp
index 00a8984845dd..0a63c1d5153c 100644
--- a/lib/Transforms/Utils/ValueMapper.cpp
+++ b/lib/Transforms/Utils/ValueMapper.cpp
@@ -41,9 +41,9 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,
     if (Value *NewV =
             Materializer->materializeDeclFor(const_cast<Value *>(V))) {
       VM[V] = NewV;
-      if (auto *NewGV = dyn_cast<GlobalValue>(NewV))
-        Materializer->materializeInitFor(
-            NewGV, const_cast<GlobalValue *>(cast<GlobalValue>(V)));
+      if (auto *GV = dyn_cast<GlobalValue>(V))
+        Materializer->materializeInitFor(cast<GlobalValue>(NewV),
+                                         const_cast<GlobalValue *>(GV));
       return NewV;
     }
   }
diff --git a/test/Linker/Inputs/comdat11.ll b/test/Linker/Inputs/comdat11.ll
deleted file mode 100644
index 5b7f74cf0b24..000000000000
--- a/test/Linker/Inputs/comdat11.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-$foo = comdat any
-@foo = global i8 1, comdat
-define void @zed() {
-  call void @bar()
-  ret void
-}
-define void @bar() comdat($foo) {
-  ret void
-}
diff --git a/test/Linker/comdat11.ll b/test/Linker/comdat11.ll
deleted file mode 100644
index dbade4104fe3..000000000000
--- a/test/Linker/comdat11.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: llvm-link -S %s %p/Inputs/comdat11.ll -o - | FileCheck %s
-
-$foo = comdat any
-@foo = global i8 0, comdat
-
-; CHECK: @foo = global i8 0, comdat
-
-; CHECK: define void @zed() {
-; CHECK:   call void @bar()
-; CHECK:   ret void
-; CHECK: }
-
-; CHECK: declare void @bar()
diff --git a/test/Linker/comdat12.ll b/test/Linker/comdat12.ll
deleted file mode 100644
index d06e222b63ac..000000000000
--- a/test/Linker/comdat12.ll
+++ /dev/null
@@ -1,8 +0,0 @@
-; RUN: llvm-link %s -S -o - | FileCheck %s
-
-$foo = comdat largest
-define internal void @foo() comdat($foo) {
-  ret void
-}
-
-; CHECK-NOT: foo
diff --git a/test/Linker/comdat9.ll b/test/Linker/comdat9.ll
index 4f6f2cfb845d..274957401aac 100644
--- a/test/Linker/comdat9.ll
+++ b/test/Linker/comdat9.ll
@@ -14,9 +14,6 @@ $f2 = comdat largest
 define internal void @f2() comdat($f2) {
   ret void
 }
-define void @f3() comdat($f2) {
-  ret void
-}
 
 ; CHECK-DAG: $f2 = comdat largest
 ; CHECK-DAG: define internal void @f2() comdat {

From 92989cbe8452a9406f6e0d3c5709b5745f27158a Mon Sep 17 00:00:00 2001
From: Cong Hou <congh@google.com>
Date: Tue, 1 Dec 2015 00:02:51 +0000
Subject: [PATCH 031/186] Replace all weight-based interfaces in MBB with
 probability-based interfaces, and update all uses of old interfaces.

The patch in http://reviews.llvm.org/D13745 is broken into four parts:

1. New interfaces without functional changes (http://reviews.llvm.org/D13908).
2. Use new interfaces in SelectionDAG, while in other passes treat probabilities
as weights (http://reviews.llvm.org/D14361).
3. Use new interfaces in all other passes.
4. Remove old interfaces.

This patch is 3+4 above. In this patch, MBB won't provide weight-based
interfaces any more, which are totally replaced by probability-based ones.
The interface addSuccessor() is redesigned so that the default probability is
unknown. We allow unknown probabilities but don't allow using it together
with known probabilities in successor list. That is to say, we either have a
list of successors with all known probabilities, or all unknown
probabilities. In the latter case, we assume each successor has 1/N
probability where N is the number of successors. An assertion checks if the
user is attempting to add a successor with the disallowed mixed use as stated
above. This can help us catch many misuses.

All uses of weight-based interfaces are now updated to use probability-based
ones.


Differential revision: http://reviews.llvm.org/D14973




git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254348 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/BranchProbabilityInfo.h |   3 +
 include/llvm/CodeGen/MachineBasicBlock.h      |  46 +-----
 .../CodeGen/MachineBranchProbabilityInfo.h    |  22 +--
 include/llvm/Support/BranchProbability.h      |  31 +++-
 lib/Analysis/BranchProbabilityInfo.cpp        |   6 +
 lib/CodeGen/BranchFolding.cpp                 |   9 +-
 lib/CodeGen/IfConversion.cpp                  | 155 ++++++++----------
 lib/CodeGen/MIRParser/MIParser.cpp            |   3 +-
 lib/CodeGen/MIRPrinter.cpp                    |   4 +-
 lib/CodeGen/MachineBasicBlock.cpp             | 142 ++++------------
 lib/CodeGen/MachineBlockPlacement.cpp         |  67 +++-----
 lib/CodeGen/MachineBranchProbabilityInfo.cpp  |  82 +++------
 lib/CodeGen/TailDuplication.cpp               |   6 +-
 lib/Support/BranchProbability.cpp             |  20 ++-
 lib/Target/AMDGPU/AMDILCFGStructurizer.cpp    |   6 +-
 lib/Target/ARM/ARMConstantIslandPass.cpp      |   3 +-
 lib/Target/ARM/ARMISelLowering.cpp            |   2 +-
 lib/Target/Hexagon/HexagonCFGOptimizer.cpp    |   6 +-
 lib/Target/Mips/MipsLongBranch.cpp            |   3 +-
 test/CodeGen/ARM/ifcvt-branch-weight-bug.ll   |  14 +-
 test/CodeGen/ARM/ifcvt-branch-weight.ll       |   2 +-
 test/CodeGen/ARM/ifcvt-iter-indbr.ll          |  10 +-
 test/CodeGen/ARM/tail-merge-branch-weight.ll  |   2 +-
 test/CodeGen/ARM/taildup-branch-weight.ll     |   4 +-
 test/CodeGen/Generic/MachineBranchProb.ll     |   8 +-
 test/CodeGen/Hexagon/ifcvt-edge-weight.ll     |   2 +-
 test/CodeGen/MIR/X86/newline-handling.mir     |   4 +-
 .../X86/successor-basic-blocks-weights.mir    |   6 +-
 .../MIR/X86/successor-basic-blocks.mir        |   4 +-
 test/CodeGen/X86/MachineBranchProb.ll         |   4 +-
 test/CodeGen/X86/catchpad-weight.ll           |   2 +-
 test/CodeGen/X86/stack-protector-weight.ll    |   4 +-
 test/CodeGen/X86/switch-edge-weight.ll        |  22 +--
 test/CodeGen/X86/switch-jump-table.ll         |   8 +-
 34 files changed, 292 insertions(+), 420 deletions(-)

diff --git a/include/llvm/Analysis/BranchProbabilityInfo.h b/include/llvm/Analysis/BranchProbabilityInfo.h
index 89dec14b2b3e..69dae5e90785 100644
--- a/include/llvm/Analysis/BranchProbabilityInfo.h
+++ b/include/llvm/Analysis/BranchProbabilityInfo.h
@@ -61,6 +61,9 @@ class BranchProbabilityInfo {
   BranchProbability getEdgeProbability(const BasicBlock *Src,
                                        const BasicBlock *Dst) const;
 
+  BranchProbability getEdgeProbability(const BasicBlock *Src,
+                                       succ_const_iterator Dst) const;
+
   /// \brief Test if an edge is hot relative to other out-edges of the Src.
   ///
   /// Check whether this edge out of the source block is 'hot'. We define hot
diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h
index a2b1a850ec76..ac87f4f901f5 100644
--- a/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/include/llvm/CodeGen/MachineBasicBlock.h
@@ -91,13 +91,6 @@ class MachineBasicBlock
   std::vector<MachineBasicBlock *> Predecessors;
   std::vector<MachineBasicBlock *> Successors;
 
-  /// Keep track of the weights to the successors. This vector has the same
-  /// order as Successors, or it is empty if we don't use it (disable
-  /// optimization).
-  std::vector<uint32_t> Weights;
-  typedef std::vector<uint32_t>::iterator weight_iterator;
-  typedef std::vector<uint32_t>::const_iterator const_weight_iterator;
-
   /// Keep track of the probabilities to the successors. This vector has the
   /// same order as Successors, or it is empty if we don't use it (disable
   /// optimization).
@@ -440,26 +433,16 @@ class MachineBasicBlock
 
   // Machine-CFG mutators
 
-  /// Add Succ as a successor of this MachineBasicBlock.  The Predecessors list
-  /// of Succ is automatically updated. WEIGHT parameter is stored in Weights
-  /// list and it may be used by MachineBranchProbabilityInfo analysis to
-  /// calculate branch probability.
-  ///
-  /// Note that duplicate Machine CFG edges are not allowed.
-  void addSuccessor(MachineBasicBlock *Succ, uint32_t Weight = 0);
-
-  /// Add Succ as a successor of this MachineBasicBlock.  The Predecessors list
-  /// of Succ is automatically updated. The weight is not provided because BPI
-  /// is not available (e.g. -O0 is used), in which case edge weights won't be
-  /// used. Using this interface can save some space.
-  void addSuccessorWithoutWeight(MachineBasicBlock *Succ);
-
   /// Add Succ as a successor of this MachineBasicBlock.  The Predecessors list
   /// of Succ is automatically updated. PROB parameter is stored in
-  /// Probabilities list.
+  /// Probabilities list. The default probability is set as unknown. Mixing
+  /// known and unknown probabilities in successor list is not allowed. When all
+  /// successors have unknown probabilities, 1 / N is returned as the
+  /// probability for each successor, where N is the number of successors.
   ///
   /// Note that duplicate Machine CFG edges are not allowed.
-  void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob);
+  void addSuccessor(MachineBasicBlock *Succ,
+                    BranchProbability Prob = BranchProbability::getUnknown());
 
   /// Add Succ as a successor of this MachineBasicBlock.  The Predecessors list
   /// of Succ is automatically updated. The probability is not provided because
@@ -467,9 +450,6 @@ class MachineBasicBlock
   /// won't be used. Using this interface can save some space.
   void addSuccessorWithoutProb(MachineBasicBlock *Succ);
 
-  /// Set successor weight of a given iterator.
-  void setSuccWeight(succ_iterator I, uint32_t Weight);
-
   /// Set successor probability of a given iterator.
   void setSuccProbability(succ_iterator I, BranchProbability Prob);
 
@@ -488,7 +468,7 @@ class MachineBasicBlock
   /// Return the iterator to the element after the one removed.
   succ_iterator removeSuccessor(succ_iterator I);
 
-  /// Replace successor OLD with NEW and update weight info.
+  /// Replace successor OLD with NEW and update probability info.
   void replaceSuccessor(MachineBasicBlock *Old, MachineBasicBlock *New);
 
   /// Transfers all the successors from MBB to this machine basic block (i.e.,
@@ -500,9 +480,6 @@ class MachineBasicBlock
   /// operands in the successor blocks which refer to FromMBB to refer to this.
   void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB);
 
-  /// Return true if any of the successors have weights attached to them.
-  bool hasSuccessorWeights() const { return !Weights.empty(); }
-
   /// Return true if any of the successors have probabilities attached to them.
   bool hasSuccessorProbabilities() const { return !Probs.empty(); }
 
@@ -759,10 +736,6 @@ class MachineBasicBlock
 
 
 private:
-  /// Return weight iterator corresponding to the I successor iterator.
-  weight_iterator getWeightIterator(succ_iterator I);
-  const_weight_iterator getWeightIterator(const_succ_iterator I) const;
-
   /// Return probability iterator corresponding to the I successor iterator.
   probability_iterator getProbabilityIterator(succ_iterator I);
   const_probability_iterator
@@ -771,11 +744,6 @@ class MachineBasicBlock
   friend class MachineBranchProbabilityInfo;
   friend class MIPrinter;
 
-  /// Return weight of the edge from this block to MBB. This method should NOT
-  /// be called directly, but by using getEdgeWeight method from
-  /// MachineBranchProbabilityInfo class.
-  uint32_t getSuccWeight(const_succ_iterator Succ) const;
-
   /// Return probability of the edge from this block to MBB. This method should
   /// NOT be called directly, but by using getEdgeProbability method from
   /// MachineBranchProbabilityInfo class.
diff --git a/include/llvm/CodeGen/MachineBranchProbabilityInfo.h b/include/llvm/CodeGen/MachineBranchProbabilityInfo.h
index 058ab32f3aa9..608e8d257874 100644
--- a/include/llvm/CodeGen/MachineBranchProbabilityInfo.h
+++ b/include/llvm/CodeGen/MachineBranchProbabilityInfo.h
@@ -55,10 +55,15 @@ class MachineBranchProbabilityInfo : public ImmutablePass {
   uint32_t getEdgeWeight(const MachineBasicBlock *Src,
                          MachineBasicBlock::const_succ_iterator Dst) const;
 
-  // Get sum of the block successors' weights, potentially scaling them to fit
-  // within 32-bits. If scaling is required, sets Scale based on the necessary
-  // adjustment. Any edge weights used with the sum should be divided by Scale.
-  uint32_t getSumForBlock(const MachineBasicBlock *MBB, uint32_t &Scale) const;
+  // Return edge probability.
+  BranchProbability getEdgeProbability(const MachineBasicBlock *Src,
+                                       const MachineBasicBlock *Dst) const;
+
+  // Same as above, but using a const_succ_iterator from Src. This is faster
+  // when the iterator is already available.
+  BranchProbability
+  getEdgeProbability(const MachineBasicBlock *Src,
+                     MachineBasicBlock::const_succ_iterator Dst) const;
 
   // A 'Hot' edge is an edge which probability is >= 80%.
   bool isEdgeHot(const MachineBasicBlock *Src,
@@ -68,15 +73,6 @@ class MachineBranchProbabilityInfo : public ImmutablePass {
   // NB: This routine's complexity is linear on the number of successors.
   MachineBasicBlock *getHotSucc(MachineBasicBlock *MBB) const;
 
-  // Return a probability as a fraction between 0 (0% probability) and
-  // 1 (100% probability), however the value is never equal to 0, and can be 1
-  // only iff SRC block has only one successor.
-  // NB: This routine's complexity is linear on the number of successors of
-  // Src. Querying sequentially for each successor's probability is a quadratic
-  // query pattern.
-  BranchProbability getEdgeProbability(const MachineBasicBlock *Src,
-                                       const MachineBasicBlock *Dst) const;
-
   // Print value between 0 (0% probability) and 1 (100% probability),
   // however the value is never equal to 0, and can be 1 only iff SRC block
   // has only one successor.
diff --git a/include/llvm/Support/BranchProbability.h b/include/llvm/Support/BranchProbability.h
index 3620d4d5d772..2548384f346b 100644
--- a/include/llvm/Support/BranchProbability.h
+++ b/include/llvm/Support/BranchProbability.h
@@ -53,6 +53,9 @@ class BranchProbability {
   // Create a BranchProbability object with the given numerator and 1<<31
   // as denominator.
   static BranchProbability getRaw(uint32_t N) { return BranchProbability(N); }
+  // Create a BranchProbability object from 64-bit integers.
+  static BranchProbability getBranchProbability(uint64_t Numerator,
+                                                uint64_t Denominator);
 
   // Normalize given probabilties so that the sum of them becomes approximate
   // one.
@@ -131,10 +134,30 @@ class BranchProbability {
 
   bool operator==(BranchProbability RHS) const { return N == RHS.N; }
   bool operator!=(BranchProbability RHS) const { return !(*this == RHS); }
-  bool operator<(BranchProbability RHS) const { return N < RHS.N; }
-  bool operator>(BranchProbability RHS) const { return RHS < *this; }
-  bool operator<=(BranchProbability RHS) const { return !(RHS < *this); }
-  bool operator>=(BranchProbability RHS) const { return !(*this < RHS); }
+
+  bool operator<(BranchProbability RHS) const {
+    assert(N != UnknownN && RHS.N != UnknownN &&
+           "Unknown probability cannot participate in comparisons.");
+    return N < RHS.N;
+  }
+
+  bool operator>(BranchProbability RHS) const {
+    assert(N != UnknownN && RHS.N != UnknownN &&
+           "Unknown probability cannot participate in comparisons.");
+    return RHS < *this;
+  }
+
+  bool operator<=(BranchProbability RHS) const {
+    assert(N != UnknownN && RHS.N != UnknownN &&
+           "Unknown probability cannot participate in comparisons.");
+    return !(RHS < *this);
+  }
+
+  bool operator>=(BranchProbability RHS) const {
+    assert(N != UnknownN && RHS.N != UnknownN &&
+           "Unknown probability cannot participate in comparisons.");
+    return !(*this < RHS);
+  }
 };
 
 inline raw_ostream &operator<<(raw_ostream &OS, BranchProbability Prob) {
diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp
index f48394698699..6cdf43a06a9f 100644
--- a/lib/Analysis/BranchProbabilityInfo.cpp
+++ b/lib/Analysis/BranchProbabilityInfo.cpp
@@ -647,6 +647,12 @@ getEdgeProbability(const BasicBlock *Src, const BasicBlock *Dst) const {
   return BranchProbability(N, D);
 }
 
+BranchProbability
+BranchProbabilityInfo::getEdgeProbability(const BasicBlock *Src,
+                                          succ_const_iterator Dst) const {
+  return getEdgeProbability(Src, Dst.getSuccessorIndex());
+}
+
 raw_ostream &
 BranchProbabilityInfo::printEdgeProbability(raw_ostream &OS,
                                             const BasicBlock *Src,
diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp
index 0b2495cc996e..ba21d9cc9d55 100644
--- a/lib/CodeGen/BranchFolding.cpp
+++ b/lib/CodeGen/BranchFolding.cpp
@@ -1099,13 +1099,16 @@ void BranchFolder::setCommonTailEdgeWeights(MachineBasicBlock &TailMBB) {
   if (TailMBB.succ_size() <= 1)
     return;
 
-  auto MaxEdgeFreq = *std::max_element(EdgeFreqLs.begin(), EdgeFreqLs.end());
-  uint64_t Scale = MaxEdgeFreq.getFrequency() / UINT32_MAX + 1;
+  auto SumEdgeFreq =
+      std::accumulate(EdgeFreqLs.begin(), EdgeFreqLs.end(), BlockFrequency(0))
+          .getFrequency();
   auto EdgeFreq = EdgeFreqLs.begin();
 
   for (auto SuccI = TailMBB.succ_begin(), SuccE = TailMBB.succ_end();
        SuccI != SuccE; ++SuccI, ++EdgeFreq)
-    TailMBB.setSuccWeight(SuccI, EdgeFreq->getFrequency() / Scale);
+    TailMBB.setSuccProbability(
+        SuccI, BranchProbability::getBranchProbability(EdgeFreq->getFrequency(),
+                                                       SumEdgeFreq));
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp
index 0b2f3ea165f8..ff28f95cc33d 100644
--- a/lib/CodeGen/IfConversion.cpp
+++ b/lib/CodeGen/IfConversion.cpp
@@ -32,6 +32,7 @@
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
 
 using namespace llvm;
 
@@ -1151,28 +1152,6 @@ bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) {
   return true;
 }
 
-/// Scale down weights to fit into uint32_t. NewTrue is the new weight
-/// for successor TrueBB, and NewFalse is the new weight for successor
-/// FalseBB.
-static void ScaleWeights(uint64_t NewTrue, uint64_t NewFalse,
-                         MachineBasicBlock *MBB,
-                         const MachineBasicBlock *TrueBB,
-                         const MachineBasicBlock *FalseBB,
-                         const MachineBranchProbabilityInfo *MBPI) {
-  uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse;
-  uint32_t Scale = (NewMax / UINT32_MAX) + 1;
-  for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
-                                        SE = MBB->succ_end();
-       SI != SE; ++SI) {
-    if (*SI == TrueBB)
-      MBB->setSuccWeight(SI, (uint32_t)(NewTrue / Scale));
-    else if (*SI == FalseBB)
-      MBB->setSuccWeight(SI, (uint32_t)(NewFalse / Scale));
-    else
-      MBB->setSuccWeight(SI, MBPI->getEdgeWeight(MBB, SI) / Scale);
-  }
-}
-
 /// IfConvertTriangle - If convert a triangle sub-CFG.
 ///
 bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
@@ -1229,16 +1208,14 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
   DontKill.clear();
 
   bool HasEarlyExit = CvtBBI->FalseBB != nullptr;
-  uint64_t CvtNext = 0, CvtFalse = 0, BBNext = 0, BBCvt = 0, SumWeight = 0;
-  uint32_t WeightScale = 0;
+  BranchProbability CvtNext, CvtFalse, BBNext, BBCvt;
 
   if (HasEarlyExit) {
-    // Get weights before modifying CvtBBI->BB and BBI.BB.
-    CvtNext = MBPI->getEdgeWeight(CvtBBI->BB, NextBBI->BB);
-    CvtFalse = MBPI->getEdgeWeight(CvtBBI->BB, CvtBBI->FalseBB);
-    BBNext = MBPI->getEdgeWeight(BBI.BB, NextBBI->BB);
-    BBCvt = MBPI->getEdgeWeight(BBI.BB, CvtBBI->BB);
-    SumWeight = MBPI->getSumForBlock(CvtBBI->BB, WeightScale);
+    // Get probabilities before modifying CvtBBI->BB and BBI.BB.
+    CvtNext = MBPI->getEdgeProbability(CvtBBI->BB, NextBBI->BB);
+    CvtFalse = MBPI->getEdgeProbability(CvtBBI->BB, CvtBBI->FalseBB);
+    BBNext = MBPI->getEdgeProbability(BBI.BB, NextBBI->BB);
+    BBCvt = MBPI->getEdgeProbability(BBI.BB, CvtBBI->BB);
   }
 
   if (CvtBBI->BB->pred_size() > 1) {
@@ -1266,22 +1243,24 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
                                            CvtBBI->BrCond.end());
     if (TII->ReverseBranchCondition(RevCond))
       llvm_unreachable("Unable to reverse branch condition!");
+
+    // Update the edge probability for both CvtBBI->FalseBB and NextBBI.
+    // NewNext = New_Prob(BBI.BB, NextBBI->BB) =
+    //   Prob(BBI.BB, NextBBI->BB) +
+    //   Prob(BBI.BB, CvtBBI->BB) * Prob(CvtBBI->BB, NextBBI->BB)
+    // NewFalse = New_Prob(BBI.BB, CvtBBI->FalseBB) =
+    //   Prob(BBI.BB, CvtBBI->BB) * Prob(CvtBBI->BB, CvtBBI->FalseBB)
+    auto NewTrueBB = getNextBlock(BBI.BB);
+    auto NewNext = BBNext + BBCvt * CvtNext;
+    auto NewTrueBBIter =
+        std::find(BBI.BB->succ_begin(), BBI.BB->succ_end(), NewTrueBB);
+    assert(NewTrueBBIter != BBI.BB->succ_end() &&
+           "NewTrueBB is not a successor of BBI.BB.");
+    BBI.BB->setSuccProbability(NewTrueBBIter, NewNext);
+
+    auto NewFalse = BBCvt * CvtFalse;
     TII->InsertBranch(*BBI.BB, CvtBBI->FalseBB, nullptr, RevCond, dl);
-    BBI.BB->addSuccessor(CvtBBI->FalseBB);
-    // Update the edge weight for both CvtBBI->FalseBB and NextBBI.
-    // New_Weight(BBI.BB, NextBBI->BB) =
-    //   Weight(BBI.BB, NextBBI->BB) * getSumForBlock(CvtBBI->BB) +
-    //   Weight(BBI.BB, CvtBBI->BB) * Weight(CvtBBI->BB, NextBBI->BB)
-    // New_Weight(BBI.BB, CvtBBI->FalseBB) =
-    //   Weight(BBI.BB, CvtBBI->BB) * Weight(CvtBBI->BB, CvtBBI->FalseBB)
-
-    uint64_t NewNext = BBNext * SumWeight + (BBCvt * CvtNext) / WeightScale;
-    uint64_t NewFalse = (BBCvt * CvtFalse) / WeightScale;
-    // We need to scale down all weights of BBI.BB to fit uint32_t.
-    // Here BBI.BB is connected to CvtBBI->FalseBB and will fall through to
-    // the next block.
-    ScaleWeights(NewNext, NewFalse, BBI.BB, getNextBlock(BBI.BB),
-                 CvtBBI->FalseBB, MBPI);
+    BBI.BB->addSuccessor(CvtBBI->FalseBB, NewFalse);
   }
 
   // Merge in the 'false' block if the 'false' block has no other
@@ -1524,7 +1503,7 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
       MergeBlocks(BBI, TailBBI);
       TailBBI.IsDone = true;
     } else {
-      BBI.BB->addSuccessor(TailBB);
+      BBI.BB->addSuccessor(TailBB, BranchProbability::getOne());
       InsertUncondBranch(BBI.BB, TailBB, TII);
       BBI.HasFallThrough = false;
     }
@@ -1688,21 +1667,26 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) {
                                                 FromBBI.BB->succ_end());
   MachineBasicBlock *NBB = getNextBlock(FromBBI.BB);
   MachineBasicBlock *FallThrough = FromBBI.HasFallThrough ? NBB : nullptr;
-
-  // The edge weight from ToBBI.BB to FromBBI.BB, which is only needed when
+  // The edge probability from ToBBI.BB to FromBBI.BB, which is only needed when
   // AddEdges is true and FromBBI.BB is a successor of ToBBI.BB.
-  uint32_t To2FromWeight = 0;
-  // WeightScale and SumWeight are for calculating successor probabilities of
-  // FromBBI.BB.
-  uint32_t WeightScale = 0;
-  uint32_t SumWeight = 0;
+  auto To2FromProb = BranchProbability::getZero();
   if (AddEdges && ToBBI.BB->isSuccessor(FromBBI.BB)) {
-    To2FromWeight = MBPI->getEdgeWeight(ToBBI.BB, FromBBI.BB);
-    // Set the edge weight from ToBBI.BB to FromBBI.BB to zero to avoid the edge
-    // weight being merged to other edges when this edge is removed later.
-    ToBBI.BB->setSuccWeight(
-        std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), FromBBI.BB), 0);
-    SumWeight = MBPI->getSumForBlock(FromBBI.BB, WeightScale);
+    To2FromProb = MBPI->getEdgeProbability(ToBBI.BB, FromBBI.BB);
+    // Set the edge probability from ToBBI.BB to FromBBI.BB to zero to avoid the
+    // edge probability being merged to other edges when this edge is removed
+    // later.
+    ToBBI.BB->setSuccProbability(
+        std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), FromBBI.BB),
+        BranchProbability::getZero());
+  }
+
+  if (AddEdges && ToBBI.BB->isSuccessor(FromBBI.BB)) {
+    // Set the edge probability from ToBBI.BB to FromBBI.BB to zero to avoid the
+    // edge probability being merged to other edges when this edge is removed
+    // later.
+    ToBBI.BB->setSuccProbability(
+        std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), FromBBI.BB),
+        BranchProbability::getZero());
   }
 
   for (unsigned i = 0, e = FromSuccs.size(); i != e; ++i) {
@@ -1711,39 +1695,38 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) {
     if (Succ == FallThrough)
       continue;
 
-    uint32_t NewWeight = 0;
+    auto NewProb = BranchProbability::getZero();
     if (AddEdges) {
-      // Calculate the edge weight for the edge from ToBBI.BB to Succ, which is
-      // a portion of the edge weight from FromBBI.BB to Succ. The portion ratio
-      // is the edge probability from ToBBI.BB to FromBBI.BB (if FromBBI is a
-      // successor of ToBBI.BB. See comment below for excepion).
-      NewWeight = MBPI->getEdgeWeight(FromBBI.BB, Succ);
+      // Calculate the edge probability for the edge from ToBBI.BB to Succ,
+      // which is a portion of the edge probability from FromBBI.BB to Succ. The
+      // portion ratio is the edge probability from ToBBI.BB to FromBBI.BB (if
+      // FromBBI is a successor of ToBBI.BB. See comment below for excepion).
+      NewProb = MBPI->getEdgeProbability(FromBBI.BB, Succ);
 
-      // To2FromWeight is 0 when FromBBI.BB is not a successor of ToBBI.BB. This
+      // To2FromProb is 0 when FromBBI.BB is not a successor of ToBBI.BB. This
       // only happens when if-converting a diamond CFG and FromBBI.BB is the
       // tail BB.  In this case FromBBI.BB post-dominates ToBBI.BB and hence we
-      // could just use the weights on FromBBI.BB's out-edges when adding new
-      // successors.
-      if (To2FromWeight > 0) {
-        BranchProbability Prob(NewWeight / WeightScale, SumWeight);
-        NewWeight = Prob.scale(To2FromWeight);
-      }
+      // could just use the probabilities on FromBBI.BB's out-edges when adding
+      // new successors.
+      if (!To2FromProb.isZero())
+        NewProb *= To2FromProb;
     }
 
     FromBBI.BB->removeSuccessor(Succ);
 
     if (AddEdges) {
-      // If the edge from ToBBI.BB to Succ already exists, update the weight of
-      // this edge by adding NewWeight to it. An example is shown below, in
-      // which A is ToBBI.BB and B is FromBBI.BB. In this case we don't have to
-      // set C as A's successor as it already is. We only need to update the
-      // edge weight on A->C. Note that B will not be immediately removed from
-      // A's successors. It is possible that B->D is not removed either if D is
-      // a fallthrough of B. Later the edge A->D (generated here) and B->D will
-      // be combined into one edge. To maintain correct edge weight of this
-      // combined edge, we need to set the edge weight of A->B to zero, which is
-      // already done above. The edge weight on A->D is calculated by scaling
-      // the original weight on A->B by the probability of B->D.
+      // If the edge from ToBBI.BB to Succ already exists, update the
+      // probability of this edge by adding NewWeight to it. An example is shown
+      // below, in which A is ToBBI.BB and B is FromBBI.BB. In this case we
+      // don't have to set C as A's successor as it already is. We only need to
+      // update the edge probability on A->C. Note that B will not be
+      // immediately removed from A's successors. It is possible that B->D is
+      // not removed either if D is a fallthrough of B. Later the edge A->D
+      // (generated here) and B->D will be combined into one edge. To maintain
+      // correct edge probability of this combined edge, we need to set the edge
+      // probability of A->B to zero, which is already done above. The edge
+      // probability on A->D is calculated by scaling the original probability
+      // on A->B by the probability of B->D.
       //
       // Before ifcvt:      After ifcvt (assume B->D is kept):
       //
@@ -1755,11 +1738,11 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) {
       //    C  D             C  D
       //
       if (ToBBI.BB->isSuccessor(Succ))
-        ToBBI.BB->setSuccWeight(
+        ToBBI.BB->setSuccProbability(
             std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), Succ),
-            MBPI->getEdgeWeight(ToBBI.BB, Succ) + NewWeight);
+            MBPI->getEdgeProbability(ToBBI.BB, Succ) + NewProb);
       else
-        ToBBI.BB->addSuccessor(Succ, NewWeight);
+        ToBBI.BB->addSuccessor(Succ, NewProb);
     }
   }
 
diff --git a/lib/CodeGen/MIRParser/MIParser.cpp b/lib/CodeGen/MIRParser/MIParser.cpp
index 5a8e96df7603..c9c2d62cec30 100644
--- a/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/lib/CodeGen/MIRParser/MIParser.cpp
@@ -459,8 +459,9 @@ bool MIParser::parseBasicBlockSuccessors(MachineBasicBlock &MBB) {
       if (expectAndConsume(MIToken::rparen))
         return true;
     }
-    MBB.addSuccessor(SuccMBB, Weight);
+    MBB.addSuccessor(SuccMBB, BranchProbability::getRaw(Weight));
   } while (consumeIfPresent(MIToken::comma));
+  MBB.normalizeSuccProbs();
   return false;
 }
 
diff --git a/lib/CodeGen/MIRPrinter.cpp b/lib/CodeGen/MIRPrinter.cpp
index 0be7807064fb..175cb0d51437 100644
--- a/lib/CodeGen/MIRPrinter.cpp
+++ b/lib/CodeGen/MIRPrinter.cpp
@@ -461,8 +461,8 @@ void MIPrinter::print(const MachineBasicBlock &MBB) {
       if (I != MBB.succ_begin())
         OS << ", ";
       printMBBReference(**I);
-      if (MBB.hasSuccessorWeights())
-        OS << '(' << MBB.getSuccWeight(I) << ')';
+      if (MBB.hasSuccessorProbabilities())
+        OS << '(' << MBB.getSuccProbability(I) << ')';
     }
     OS << "\n";
     HasLineAttributes = true;
diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp
index 602b75182fca..c9c6a9d62462 100644
--- a/lib/CodeGen/MachineBasicBlock.cpp
+++ b/lib/CodeGen/MachineBasicBlock.cpp
@@ -319,8 +319,8 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST,
     OS << "    Successors according to CFG:";
     for (const_succ_iterator SI = succ_begin(), E = succ_end(); SI != E; ++SI) {
       OS << " BB#" << (*SI)->getNumber();
-      if (!Weights.empty())
-        OS << '(' << *getWeightIterator(SI) << ')';
+      if (!Probs.empty())
+        OS << '(' << *getProbabilityIterator(SI) << ')';
     }
     OS << '\n';
   }
@@ -506,34 +506,16 @@ void MachineBasicBlock::updateTerminator() {
   }
 }
 
-void MachineBasicBlock::addSuccessor(MachineBasicBlock *Succ, uint32_t Weight) {
-  // Weight list is either empty (if successor list isn't empty, this means
-  // disabled optimization) or has the same size as successor list.
-  if (!(Weights.empty() && !Successors.empty()))
-    Weights.push_back(Weight);
-  Successors.push_back(Succ);
-  Succ->addPredecessor(this);
-}
-
-void MachineBasicBlock::addSuccessorWithoutWeight(MachineBasicBlock *Succ) {
-  // We need to make sure weight list is either empty or has the same size of
-  // successor list. When this function is called, we can safely delete all
-  // weight in the list.
-  Weights.clear();
-  Successors.push_back(Succ);
-  Succ->addPredecessor(this);
-}
-
 void MachineBasicBlock::addSuccessor(MachineBasicBlock *Succ,
                                      BranchProbability Prob) {
   // Probability list is either empty (if successor list isn't empty, this means
   // disabled optimization) or has the same size as successor list.
   if (!(Probs.empty() && !Successors.empty())) {
+    assert((Probs.empty() || (Prob.isUnknown() && Probs.back().isUnknown()) ||
+            (!Prob.isUnknown() && !Probs.back().isUnknown())) &&
+           "Successors with both known and unknwon probabilities are not "
+           "allowed.");
     Probs.push_back(Prob);
-    // FIXME: Temporarily use the numerator of the probability to represent edge
-    // weight. This will be removed once all weight-version interfaces in MBB
-    // are replaced with probability-version interfaces.
-    Weights.push_back(Prob.getNumerator());
   }
   Successors.push_back(Succ);
   Succ->addPredecessor(this);
@@ -544,7 +526,6 @@ void MachineBasicBlock::addSuccessorWithoutProb(MachineBasicBlock *Succ) {
   // of successor list. When this function is called, we can safely delete all
   // probability in the list.
   Probs.clear();
-  Weights.clear();
   Successors.push_back(Succ);
   Succ->addPredecessor(this);
 }
@@ -558,23 +539,12 @@ MachineBasicBlock::succ_iterator
 MachineBasicBlock::removeSuccessor(succ_iterator I) {
   assert(I != Successors.end() && "Not a current successor!");
 
-  // If Weight list is empty it means we don't use it (disabled optimization).
-  if (!Weights.empty()) {
-    weight_iterator WI = getWeightIterator(I);
-    Weights.erase(WI);
-  }
-
-  // FIXME: Temporarily comment the following code as probabilities are now only
-  // used during instruction lowering, but this interface is called in later
-  // passes. Uncomment it once all edge weights are replaced with probabilities.
-#if 0
   // If probability list is empty it means we don't use it (disabled
   // optimization).
   if (!Probs.empty()) {
     probability_iterator WI = getProbabilityIterator(I);
     Probs.erase(WI);
   }
-#endif
 
   (*I)->removePredecessor(this);
   return Successors.erase(I);
@@ -611,17 +581,12 @@ void MachineBasicBlock::replaceSuccessor(MachineBasicBlock *Old,
   }
 
   // New is already a successor.
-  // Update its weight instead of adding a duplicate edge.
-  if (!Weights.empty())
-    *getWeightIterator(NewI) += *getWeightIterator(OldI);
-  // FIXME: Temporarily comment the following code as probabilities are now only
-  // used during instruction lowering, but this interface is called in later
-  // passes. Uncomment it once all edge weights are replaced with probabilities.
-#if 0
   // Update its probability instead of adding a duplicate edge.
-  if (!Probs.empty())
-    *getProbabilityIterator(NewI) += *getProbabilityIterator(OldI);
-#endif
+  if (!Probs.empty()) {
+    auto ProbIter = getProbabilityIterator(NewI);
+    if (!ProbIter->isUnknown())
+      *ProbIter += *getProbabilityIterator(OldI);
+  }
   removeSuccessor(OldI);
 }
 
@@ -641,13 +606,14 @@ void MachineBasicBlock::transferSuccessors(MachineBasicBlock *FromMBB) {
 
   while (!FromMBB->succ_empty()) {
     MachineBasicBlock *Succ = *FromMBB->succ_begin();
-    uint32_t Weight = 0;
 
-    // If Weight list is empty it means we don't use it (disabled optimization).
-    if (!FromMBB->Weights.empty())
-      Weight = *FromMBB->Weights.begin();
+    // If probability list is empty it means we don't use it (disabled optimization).
+    if (!FromMBB->Probs.empty()) {
+      auto Prob = *FromMBB->Probs.begin();
+      addSuccessor(Succ, Prob);
+    } else
+      addSuccessorWithoutProb(Succ);
 
-    addSuccessor(Succ, Weight);
     FromMBB->removeSuccessor(Succ);
   }
 }
@@ -659,10 +625,11 @@ MachineBasicBlock::transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB) {
 
   while (!FromMBB->succ_empty()) {
     MachineBasicBlock *Succ = *FromMBB->succ_begin();
-    uint32_t Weight = 0;
-    if (!FromMBB->Weights.empty())
-      Weight = *FromMBB->Weights.begin();
-    addSuccessor(Succ, Weight);
+    if (!FromMBB->Probs.empty()) {
+      auto Prob = *FromMBB->Probs.begin();
+      addSuccessor(Succ, Prob);
+    } else
+      addSuccessorWithoutProb(Succ);
     FromMBB->removeSuccessor(Succ);
 
     // Fix up any PHI nodes in the successor.
@@ -1146,80 +1113,37 @@ MachineBasicBlock::findDebugLoc(instr_iterator MBBI) {
   return DL;
 }
 
-/// Return weight of the edge from this block to MBB.
-uint32_t MachineBasicBlock::getSuccWeight(const_succ_iterator Succ) const {
-  if (Weights.empty())
-    return 0;
-
-  return *getWeightIterator(Succ);
-}
-
-/// Return probability of the edge from this block to MBB. If probability list
-/// is empty, return a default probability which is 1/N, where N is the number
-/// of successors. If the probability of the given successor is unknown, then
-/// sum up all known probabilities and return the complement of the sum divided
-/// by the number of unknown probabilities.
+/// Return probability of the edge from this block to MBB.
 BranchProbability
 MachineBasicBlock::getSuccProbability(const_succ_iterator Succ) const {
-  if (Probs.empty())
+  if (Probs.empty() || Probs.back().isUnknown())
     return BranchProbability(1, succ_size());
 
-  auto Prob = *getProbabilityIterator(Succ);
-  assert(!Prob.isUnknown());
-  return Prob;
-}
-
-/// Set successor weight of a given iterator.
-void MachineBasicBlock::setSuccWeight(succ_iterator I, uint32_t Weight) {
-  if (Weights.empty())
-    return;
-  *getWeightIterator(I) = Weight;
+  return *getProbabilityIterator(Succ);
 }
 
 /// Set successor probability of a given iterator.
 void MachineBasicBlock::setSuccProbability(succ_iterator I,
                                            BranchProbability Prob) {
   assert(!Prob.isUnknown());
-  if (Probs.empty() || Weights.empty())
+  if (Probs.empty())
     return;
   *getProbabilityIterator(I) = Prob;
-  // FIXME: Temporarily use the numerator of the probability to represent edge
-  // weight. This will be removed once all weight-version interfaces in MBB
-  // are replaces with probability-version interfaces.
-  *getWeightIterator(I) = Prob.getNumerator();
-}
-
-/// Return wight iterator corresonding to the I successor iterator.
-MachineBasicBlock::weight_iterator MachineBasicBlock::
-getWeightIterator(MachineBasicBlock::succ_iterator I) {
-  assert(Weights.size() == Successors.size() && "Async weight list!");
-  size_t index = std::distance(Successors.begin(), I);
-  assert(index < Weights.size() && "Not a current successor!");
-  return Weights.begin() + index;
 }
 
-/// Return wight iterator corresonding to the I successor iterator.
-MachineBasicBlock::const_weight_iterator MachineBasicBlock::
-getWeightIterator(MachineBasicBlock::const_succ_iterator I) const {
-  assert(Weights.size() == Successors.size() && "Async weight list!");
-  const size_t index = std::distance(Successors.begin(), I);
-  assert(index < Weights.size() && "Not a current successor!");
-  return Weights.begin() + index;
-}
-
-/// Return probability iterator corresonding to the I successor iterator.
-MachineBasicBlock::probability_iterator
-MachineBasicBlock::getProbabilityIterator(MachineBasicBlock::succ_iterator I) {
+/// Return probability iterator corresonding to the I successor iterator
+MachineBasicBlock::const_probability_iterator
+MachineBasicBlock::getProbabilityIterator(
+    MachineBasicBlock::const_succ_iterator I) const {
   assert(Probs.size() == Successors.size() && "Async probability list!");
   const size_t index = std::distance(Successors.begin(), I);
   assert(index < Probs.size() && "Not a current successor!");
   return Probs.begin() + index;
 }
 
-/// Return probability iterator corresonding to the I successor iterator
-MachineBasicBlock::const_probability_iterator
-MachineBasicBlock::getProbabilityIterator(
-    MachineBasicBlock::const_succ_iterator I) const {
+/// Return probability iterator corresonding to the I successor iterator.
+MachineBasicBlock::probability_iterator
+MachineBasicBlock::getProbabilityIterator(MachineBasicBlock::succ_iterator I) {
   assert(Probs.size() == Successors.size() && "Async probability list!");
   const size_t index = std::distance(Successors.begin(), I);
   assert(index < Probs.size() && "Not a current successor!");
diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp
index fba33eb93d5f..ddddd483e801 100644
--- a/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/lib/CodeGen/MachineBlockPlacement.cpp
@@ -380,19 +380,11 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
   const BranchProbability HotProb(4, 5); // 80%
 
   MachineBasicBlock *BestSucc = nullptr;
-  // FIXME: Due to the performance of the probability and weight routines in
-  // the MBPI analysis, we manually compute probabilities using the edge
-  // weights. This is suboptimal as it means that the somewhat subtle
-  // definition of edge weight semantics is encoded here as well. We should
-  // improve the MBPI interface to efficiently support query patterns such as
-  // this.
-  uint32_t BestWeight = 0;
-  uint32_t WeightScale = 0;
-  uint32_t SumWeight = MBPI->getSumForBlock(BB, WeightScale);
-
-  // Adjust sum of weights by excluding weights on edges pointing to blocks that
-  // is either not in BlockFilter or is already in the current chain. Consider
-  // the following CFG:
+  auto BestProb = BranchProbability::getZero();
+
+  // Adjust edge probabilities by excluding edges pointing to blocks that is
+  // either not in BlockFilter or is already in the current chain. Consider the
+  // following CFG:
   //
   //     --->A
   //     |  / \
@@ -406,7 +398,7 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
   // HotProb). If we exclude E that is not in BlockFilter when calculating the
   // probability of C->D, D will be selected and we will get A C D B as the
   // layout of this loop.
-  uint32_t AdjustedSumWeight = SumWeight;
+  auto AdjustedSumProb = BranchProbability::getOne();
   SmallVector<MachineBasicBlock *, 4> Successors;
   for (MachineBasicBlock *Succ : BB->successors()) {
     bool SkipSucc = false;
@@ -424,15 +416,16 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
       }
     }
     if (SkipSucc)
-      AdjustedSumWeight -= MBPI->getEdgeWeight(BB, Succ) / WeightScale;
+      AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ);
     else
       Successors.push_back(Succ);
   }
 
   DEBUG(dbgs() << "Attempting merge from: " << getBlockName(BB) << "\n");
   for (MachineBasicBlock *Succ : Successors) {
-    uint32_t SuccWeight = MBPI->getEdgeWeight(BB, Succ);
-    BranchProbability SuccProb(SuccWeight / WeightScale, AdjustedSumWeight);
+    BranchProbability SuccProb(
+        MBPI->getEdgeProbability(BB, Succ).getNumerator(),
+        AdjustedSumProb.getNumerator());
 
     // If we outline optional branches, look whether Succ is unavoidable, i.e.
     // dominates all terminators of the MachineFunction. If it does, other
@@ -470,7 +463,7 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
 
       // Make sure that a hot successor doesn't have a globally more
       // important predecessor.
-      BranchProbability RealSuccProb(SuccWeight / WeightScale, SumWeight);
+      auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
       BlockFrequency CandidateEdgeFreq =
           MBFI->getBlockFreq(BB) * RealSuccProb * HotProb.getCompl();
       bool BadCFGConflict = false;
@@ -496,10 +489,10 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
                  << " (prob)"
                  << (SuccChain.LoopPredecessors != 0 ? " (CFG break)" : "")
                  << "\n");
-    if (BestSucc && BestWeight >= SuccWeight)
+    if (BestSucc && BestProb >= SuccProb)
       continue;
     BestSucc = Succ;
-    BestWeight = SuccWeight;
+    BestProb = SuccProb;
   }
   return BestSucc;
 }
@@ -728,11 +721,6 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L,
     MachineBasicBlock *OldExitingBB = ExitingBB;
     BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
     bool HasLoopingSucc = false;
-    // FIXME: Due to the performance of the probability and weight routines in
-    // the MBPI analysis, we use the internal weights and manually compute the
-    // probabilities to avoid quadratic behavior.
-    uint32_t WeightScale = 0;
-    uint32_t SumWeight = MBPI->getSumForBlock(MBB, WeightScale);
     for (MachineBasicBlock *Succ : MBB->successors()) {
       if (Succ->isEHPad())
         continue;
@@ -746,10 +734,10 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L,
         continue;
       }
 
-      uint32_t SuccWeight = MBPI->getEdgeWeight(MBB, Succ);
+      auto SuccProb = MBPI->getEdgeProbability(MBB, Succ);
       if (LoopBlockSet.count(Succ)) {
         DEBUG(dbgs() << "    looping: " << getBlockName(MBB) << " -> "
-                     << getBlockName(Succ) << " (" << SuccWeight << ")\n");
+                     << getBlockName(Succ) << " (" << SuccProb << ")\n");
         HasLoopingSucc = true;
         continue;
       }
@@ -761,7 +749,6 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L,
           BlocksExitingToOuterLoop.insert(MBB);
       }
 
-      BranchProbability SuccProb(SuccWeight / WeightScale, SumWeight);
       BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb;
       DEBUG(dbgs() << "    exiting: " << getBlockName(MBB) << " -> "
                    << getBlockName(Succ) << " [L:" << SuccLoopDepth << "] (";
@@ -904,21 +891,17 @@ void MachineBlockPlacement::rotateLoopWithProfile(
   // edge from the tail of the loop chain.
   SmallVector<std::pair<MachineBasicBlock *, BlockFrequency>, 4> ExitsWithFreq;
   for (auto BB : LoopChain) {
-    uint32_t LargestExitEdgeWeight = 0;
+    auto LargestExitEdgeProb = BranchProbability::getZero();
     for (auto *Succ : BB->successors()) {
       BlockChain *SuccChain = BlockToChain[Succ];
       if (!LoopBlockSet.count(Succ) &&
           (!SuccChain || Succ == *SuccChain->begin())) {
-        uint32_t SuccWeight = MBPI->getEdgeWeight(BB, Succ);
-        LargestExitEdgeWeight = std::max(LargestExitEdgeWeight, SuccWeight);
+        auto SuccProb = MBPI->getEdgeProbability(BB, Succ);
+        LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb);
       }
     }
-    if (LargestExitEdgeWeight > 0) {
-      uint32_t WeightScale = 0;
-      uint32_t SumWeight = MBPI->getSumForBlock(BB, WeightScale);
-      auto ExitFreq =
-          MBFI->getBlockFreq(BB) *
-          BranchProbability(LargestExitEdgeWeight / WeightScale, SumWeight);
+    if (LargestExitEdgeProb > BranchProbability::getZero()) {
+      auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb;
       ExitsWithFreq.emplace_back(BB, ExitFreq);
     }
   }
@@ -1290,14 +1273,16 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) {
       }
 
       // If PrevBB has a two-way branch, try to re-order the branches
-      // such that we branch to the successor with higher weight first.
+      // such that we branch to the successor with higher probability first.
       if (TBB && !Cond.empty() && FBB &&
-          MBPI->getEdgeWeight(PrevBB, FBB) > MBPI->getEdgeWeight(PrevBB, TBB) &&
+          MBPI->getEdgeProbability(PrevBB, FBB) >
+              MBPI->getEdgeProbability(PrevBB, TBB) &&
           !TII->ReverseBranchCondition(Cond)) {
         DEBUG(dbgs() << "Reverse order of the two branches: "
                      << getBlockName(PrevBB) << "\n");
-        DEBUG(dbgs() << "    Edge weight: " << MBPI->getEdgeWeight(PrevBB, FBB)
-                     << " vs " << MBPI->getEdgeWeight(PrevBB, TBB) << "\n");
+        DEBUG(dbgs() << "    Edge probability: "
+                     << MBPI->getEdgeProbability(PrevBB, FBB) << " vs "
+                     << MBPI->getEdgeProbability(PrevBB, TBB) << "\n");
         DebugLoc dl; // FIXME: this is nowhere
         TII->RemoveBranch(*PrevBB);
         TII->InsertBranch(*PrevBB, FBB, TBB, Cond, dl);
diff --git a/lib/CodeGen/MachineBranchProbabilityInfo.cpp b/lib/CodeGen/MachineBranchProbabilityInfo.cpp
index 6fbc2be70486..5478dcba261a 100644
--- a/lib/CodeGen/MachineBranchProbabilityInfo.cpp
+++ b/lib/CodeGen/MachineBranchProbabilityInfo.cpp
@@ -28,91 +28,61 @@ char MachineBranchProbabilityInfo::ID = 0;
 
 void MachineBranchProbabilityInfo::anchor() { }
 
-uint32_t MachineBranchProbabilityInfo::
-getSumForBlock(const MachineBasicBlock *MBB, uint32_t &Scale) const {
-  // First we compute the sum with 64-bits of precision, ensuring that cannot
-  // overflow by bounding the number of weights considered. Hopefully no one
-  // actually needs 2^32 successors.
-  assert(MBB->succ_size() < UINT32_MAX);
-  uint64_t Sum = 0;
-  Scale = 1;
-  for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(),
-       E = MBB->succ_end(); I != E; ++I) {
-    uint32_t Weight = getEdgeWeight(MBB, I);
-    Sum += Weight;
-  }
-
-  // If the computed sum fits in 32-bits, we're done.
-  if (Sum <= UINT32_MAX)
-    return Sum;
+uint32_t MachineBranchProbabilityInfo::getEdgeWeight(
+    const MachineBasicBlock *Src,
+    MachineBasicBlock::const_succ_iterator Dst) const {
+  return Src->getSuccProbability(Dst).getNumerator();
+}
 
-  // Otherwise, compute the scale necessary to cause the weights to fit, and
-  // re-sum with that scale applied.
-  assert((Sum / UINT32_MAX) < UINT32_MAX);
-  Scale = (Sum / UINT32_MAX) + 1;
-  Sum = 0;
-  for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(),
-       E = MBB->succ_end(); I != E; ++I) {
-    uint32_t Weight = getEdgeWeight(MBB, I);
-    Sum += Weight / Scale;
-  }
-  assert(Sum <= UINT32_MAX);
-  return Sum;
+uint32_t MachineBranchProbabilityInfo::getEdgeWeight(
+    const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const {
+  // This is a linear search. Try to use the const_succ_iterator version when
+  // possible.
+  return getEdgeWeight(Src, std::find(Src->succ_begin(), Src->succ_end(), Dst));
 }
 
-uint32_t MachineBranchProbabilityInfo::
-getEdgeWeight(const MachineBasicBlock *Src,
-              MachineBasicBlock::const_succ_iterator Dst) const {
-  uint32_t Weight = Src->getSuccWeight(Dst);
-  if (!Weight)
-    return DEFAULT_WEIGHT;
-  return Weight;
+BranchProbability MachineBranchProbabilityInfo::getEdgeProbability(
+    const MachineBasicBlock *Src,
+    MachineBasicBlock::const_succ_iterator Dst) const {
+  return Src->getSuccProbability(Dst);
 }
 
-uint32_t MachineBranchProbabilityInfo::
-getEdgeWeight(const MachineBasicBlock *Src,
-              const MachineBasicBlock *Dst) const {
+BranchProbability MachineBranchProbabilityInfo::getEdgeProbability(
+    const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const {
   // This is a linear search. Try to use the const_succ_iterator version when
   // possible.
-  return getEdgeWeight(Src, std::find(Src->succ_begin(), Src->succ_end(), Dst));
+  return getEdgeProbability(Src,
+                            std::find(Src->succ_begin(), Src->succ_end(), Dst));
 }
 
 bool
 MachineBranchProbabilityInfo::isEdgeHot(const MachineBasicBlock *Src,
                                         const MachineBasicBlock *Dst) const {
   // Hot probability is at least 4/5 = 80%
-  // FIXME: Compare against a static "hot" BranchProbability.
-  return getEdgeProbability(Src, Dst) > BranchProbability(4, 5);
+  static BranchProbability HotProb(4, 5);
+  return getEdgeProbability(Src, Dst) > HotProb;
 }
 
 MachineBasicBlock *
 MachineBranchProbabilityInfo::getHotSucc(MachineBasicBlock *MBB) const {
-  uint32_t MaxWeight = 0;
+  auto MaxProb = BranchProbability::getZero();
   MachineBasicBlock *MaxSucc = nullptr;
   for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(),
        E = MBB->succ_end(); I != E; ++I) {
-    uint32_t Weight = getEdgeWeight(MBB, I);
-    if (Weight > MaxWeight) {
-      MaxWeight = Weight;
+    auto Prob = getEdgeProbability(MBB, I);
+    if (Prob > MaxProb) {
+      MaxProb = Prob;
       MaxSucc = *I;
     }
   }
 
-  if (getEdgeProbability(MBB, MaxSucc) >= BranchProbability(4, 5))
+  static BranchProbability HotProb(4, 5);
+  if (getEdgeProbability(MBB, MaxSucc) >= HotProb)
     return MaxSucc;
 
   return nullptr;
 }
 
-BranchProbability MachineBranchProbabilityInfo::getEdgeProbability(
-    const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const {
-  uint32_t Scale = 1;
-  uint32_t D = getSumForBlock(Src, Scale);
-  uint32_t N = getEdgeWeight(Src, Dst) / Scale;
-
-  return BranchProbability(N, D);
-}
-
 raw_ostream &MachineBranchProbabilityInfo::printEdgeProbability(
     raw_ostream &OS, const MachineBasicBlock *Src,
     const MachineBasicBlock *Dst) const {
diff --git a/lib/CodeGen/TailDuplication.cpp b/lib/CodeGen/TailDuplication.cpp
index ff86dabfac59..1f5b54866ac6 100644
--- a/lib/CodeGen/TailDuplication.cpp
+++ b/lib/CodeGen/TailDuplication.cpp
@@ -745,12 +745,12 @@ TailDuplicatePass::duplicateSimpleBB(MachineBasicBlock *TailBB,
     if (PredTBB)
       TII->InsertBranch(*PredBB, PredTBB, PredFBB, PredCond, DebugLoc());
 
-    uint32_t Weight = MBPI->getEdgeWeight(PredBB, TailBB);
+    auto Prob = MBPI->getEdgeProbability(PredBB, TailBB);
     PredBB->removeSuccessor(TailBB);
     unsigned NumSuccessors = PredBB->succ_size();
     assert(NumSuccessors <= 1);
     if (NumSuccessors == 0 || *PredBB->succ_begin() != NewTarget)
-      PredBB->addSuccessor(NewTarget, Weight);
+      PredBB->addSuccessor(NewTarget, Prob);
 
     TDBBs.push_back(PredBB);
   }
@@ -858,7 +858,7 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB,
            "TailDuplicate called on block with multiple successors!");
     for (MachineBasicBlock::succ_iterator I = TailBB->succ_begin(),
            E = TailBB->succ_end(); I != E; ++I)
-      PredBB->addSuccessor(*I, MBPI->getEdgeWeight(TailBB, I));
+      PredBB->addSuccessor(*I, MBPI->getEdgeProbability(TailBB, I));
 
     Changed = true;
     ++NumTailDups;
diff --git a/lib/Support/BranchProbability.cpp b/lib/Support/BranchProbability.cpp
index 3b0f6e6f06e4..771d02c0aa3c 100644
--- a/lib/Support/BranchProbability.cpp
+++ b/lib/Support/BranchProbability.cpp
@@ -22,11 +22,14 @@ using namespace llvm;
 const uint32_t BranchProbability::D;
 
 raw_ostream &BranchProbability::print(raw_ostream &OS) const {
+  if (isUnknown())
+    return OS << "?%";
+
   // Get a percentage rounded to two decimal digits. This avoids
   // implementation-defined rounding inside printf.
   double Percent = rint(((double)N / D) * 100.0 * 100.0) / 100.0;
-  OS << format("0x%08" PRIx32 " / 0x%08" PRIx32 " = %.2f%%", N, D, Percent);
-  return OS;
+  return OS << format("0x%08" PRIx32 " / 0x%08" PRIx32 " = %.2f%%", N, D,
+                      Percent);
 }
 
 void BranchProbability::dump() const { print(dbgs()) << '\n'; }
@@ -43,6 +46,19 @@ BranchProbability::BranchProbability(uint32_t Numerator, uint32_t Denominator) {
   }
 }
 
+BranchProbability
+BranchProbability::getBranchProbability(uint64_t Numerator,
+                                        uint64_t Denominator) {
+  assert(Numerator <= Denominator && "Probability cannot be bigger than 1!");
+  // Scale down Denominator to fit in a 32-bit integer.
+  int Scale = 0;
+  while (Denominator > UINT32_MAX) {
+    Denominator >>= 1;
+    Scale++;
+  }
+  return BranchProbability(Numerator >> Scale, Denominator);
+}
+
 // If ConstD is not zero, then replace D by ConstD so that division and modulo
 // operations by D can be optimized, in case this function is not inlined by the
 // compiler.
diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
index f1b383017901..cdbd12092150 100644
--- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
+++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
@@ -1570,8 +1570,7 @@ void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk,
 
   insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc());
   insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc());
-  DstBlk->addSuccessor(LandMBB);
-  DstBlk->removeSuccessor(DstBlk);
+  DstBlk->replaceSuccessor(DstBlk, LandMBB);
 }
 
 
@@ -1666,8 +1665,7 @@ AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB,
   replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB);
   //srcBlk, oldBlk, newBlk
 
-  PredMBB->removeSuccessor(MBB);
-  PredMBB->addSuccessor(CloneMBB);
+  PredMBB->replaceSuccessor(MBB, CloneMBB);
 
   // add all successor to cloneBlk
   cloneSuccessorList(CloneMBB, MBB);
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
index 0bf2d374df6a..e89757c19ecc 100644
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -2274,8 +2274,7 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) {
 
   // Update the CFG.
   NewBB->addSuccessor(BB);
-  JTBB->removeSuccessor(BB);
-  JTBB->addSuccessor(NewBB);
+  JTBB->replaceSuccessor(BB, NewBB);
 
   ++NumJTInserted;
   return NewBB;
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 0cc41812d71c..e8f3ab65bdbe 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -7346,7 +7346,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI,
       }
     }
 
-    BB->addSuccessor(DispatchBB);
+    BB->addSuccessor(DispatchBB, BranchProbability::getZero());
 
     // Find the invoke call and mark all of the callee-saved registers as
     // 'implicit defined' so that they're spilled. This prevents code from
diff --git a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
index 96bb61750805..efafdd007289 100644
--- a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
+++ b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
@@ -186,13 +186,11 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) {
 
             if (case1 || case2) {
               InvertAndChangeJumpTarget(MI, UncondTarget);
-              MBB->removeSuccessor(JumpAroundTarget);
-              MBB->addSuccessor(UncondTarget);
+              MBB->replaceSuccessor(JumpAroundTarget, UncondTarget);
 
               // Remove the unconditional branch in LayoutSucc.
               LayoutSucc->erase(LayoutSucc->begin());
-              LayoutSucc->removeSuccessor(UncondTarget);
-              LayoutSucc->addSuccessor(JumpAroundTarget);
+              LayoutSucc->replaceSuccessor(UncondTarget, JumpAroundTarget);
 
               // This code performs the conversion for case 2, which moves
               // the block to the fall-thru case (BB3 in the code above).
diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp
index d09843ed0e53..e75858a181e5 100644
--- a/lib/Target/Mips/MipsLongBranch.cpp
+++ b/lib/Target/Mips/MipsLongBranch.cpp
@@ -262,8 +262,7 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
       static_cast<const MipsInstrInfo *>(Subtarget.getInstrInfo());
 
   MF->insert(FallThroughMBB, LongBrMBB);
-  MBB->removeSuccessor(TgtMBB);
-  MBB->addSuccessor(LongBrMBB);
+  MBB->replaceSuccessor(TgtMBB, LongBrMBB);
 
   if (IsPIC) {
     MachineBasicBlock *BalTgtMBB = MF->CreateMachineBasicBlock(BB);
diff --git a/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll b/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll
index e17da7a97205..a44c9721d6c1 100644
--- a/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll
+++ b/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll
@@ -14,15 +14,15 @@ entry:
   br i1 undef, label %for.end, label %for.body
 
 ; Before if conversion, we have
-; for.body -> lor.lhs.false.i (62)
-;          -> for.cond.backedge (62)
-; lor.lhs.false.i -> for.cond.backedge (1048575)
-;                 -> cond.false.i (1)
+; for.body -> lor.lhs.false.i (50%)
+;          -> for.cond.backedge (50%)
+; lor.lhs.false.i -> for.cond.backedge (100%)
+;                 -> cond.false.i (0%)
 ; Afer if conversion, we have
-; for.body -> for.cond.backedge (130023362)
-;          -> cond.false.i (62)
+; for.body -> for.cond.backedge (100%)
+;          -> cond.false.i (0%)
 ; CHECK: BB#1: derived from LLVM BB %for.body
-; CHECK: Successors according to CFG: BB#2(4294967291) BB#4(2048)
+; CHECK: Successors according to CFG: BB#2(0x7ffffc00 / 0x80000000 = 100.00%) BB#4(0x00000400 / 0x80000000 = 0.00%)
 for.body:
   br i1 undef, label %for.cond.backedge, label %lor.lhs.false.i, !prof !1
 
diff --git a/test/CodeGen/ARM/ifcvt-branch-weight.ll b/test/CodeGen/ARM/ifcvt-branch-weight.ll
index f2a1229d0d8a..0de039cde23c 100644
--- a/test/CodeGen/ARM/ifcvt-branch-weight.ll
+++ b/test/CodeGen/ARM/ifcvt-branch-weight.ll
@@ -19,7 +19,7 @@ bb:
   br i1 %9, label %return, label %bb2
 
 ; CHECK: BB#2: derived from LLVM BB %bb2
-; CHECK: Successors according to CFG: BB#3(4294967289) BB#4(4294967287)
+; CHECK: Successors according to CFG: BB#3({{[0-9a-fx/= ]+}}50.00%) BB#4({{[0-9a-fx/= ]+}}50.00%)
 
 bb2:
   %v10 = icmp eq i32 %3, 16
diff --git a/test/CodeGen/ARM/ifcvt-iter-indbr.ll b/test/CodeGen/ARM/ifcvt-iter-indbr.ll
index 6ce9bcb56ef4..a96b6e8a1e83 100644
--- a/test/CodeGen/ARM/ifcvt-iter-indbr.ll
+++ b/test/CodeGen/ARM/ifcvt-iter-indbr.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple thumbv7s-apple-darwin  -asm-verbose=false | FileCheck %s
-; RUN: llc < %s -mtriple thumbv7s-apple-darwin  -asm-verbose=false -print-machineinstrs=if-converter 2>&1 | FileCheck --check-prefix=CHECK-WEIGHT %s
+; RUN: llc < %s -mtriple thumbv7s-apple-darwin  -asm-verbose=false -print-machineinstrs=if-converter 2>&1 | FileCheck --check-prefix=CHECK-PROB %s
 
 declare i32 @foo(i32)
 declare i8* @bar(i32, i8*, i8*)
@@ -29,10 +29,10 @@ declare i8* @bar(i32, i8*, i8*)
 ; CHECK-NEXT: [[FOOCALL]]:
 ; CHECK-NEXT:  blx _foo
 ;
-; CHECK-WEIGHT: BB#0:
-; CHECK-WEIGHT: Successors according to CFG: BB#1(1073741824) BB#2(536870912) BB#4(536870912)
-; CHECK-WEIGHT: BB#1:
-; CHECK-WEIGHT: Successors according to CFG: BB#2(1610612736) BB#4(536870912)
+; CHECK-PROB: BB#0:
+; CHECK-PROB: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}50.00%) BB#2({{[0-9a-fx/= ]+}}25.00%) BB#4({{[0-9a-fx/= ]+}}25.00%)
+; CHECK-PROB: BB#1:
+; CHECK-PROB: Successors according to CFG: BB#2({{[0-9a-fx/= ]+}}75.00%) BB#4({{[0-9a-fx/= ]+}}25.00%)
 
 define i32 @test(i32 %a, i32 %a2, i32* %p, i32* %p2) {
 entry:
diff --git a/test/CodeGen/ARM/tail-merge-branch-weight.ll b/test/CodeGen/ARM/tail-merge-branch-weight.ll
index 95b0a202e7ff..f83f28815793 100644
--- a/test/CodeGen/ARM/tail-merge-branch-weight.ll
+++ b/test/CodeGen/ARM/tail-merge-branch-weight.ll
@@ -9,7 +9,7 @@
 ;                = 0.2 * 0.4 + 0.8 * 0.7 = 0.64
 
 ; CHECK: # Machine code for function test0:
-; CHECK: Successors according to CFG: BB#{{[0-9]+}}(13) BB#{{[0-9]+}}(24)
+; CHECK: Successors according to CFG: BB#{{[0-9]+}}({{[0-9a-fx/= ]+}}20.00%) BB#{{[0-9]+}}({{[0-9a-fx/= ]+}}80.00%)
 ; CHECK: BB#{{[0-9]+}}:
 ; CHECK: BB#{{[0-9]+}}:
 ; CHECK: # End machine code for function test0.
diff --git a/test/CodeGen/ARM/taildup-branch-weight.ll b/test/CodeGen/ARM/taildup-branch-weight.ll
index 576c120b444e..799ef62416e6 100644
--- a/test/CodeGen/ARM/taildup-branch-weight.ll
+++ b/test/CodeGen/ARM/taildup-branch-weight.ll
@@ -3,7 +3,7 @@
 ; RUN:	| FileCheck %s
 
 ; CHECK: Machine code for function test0:
-; CHECK: Successors according to CFG: BB#1(67108864) BB#2(2080374784)
+; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}3.12%) BB#2({{[0-9a-fx/= ]+}}96.88%)
 
 define void @test0(i32 %a, i32 %b, i32* %c, i32* %d) {
 entry:
@@ -30,7 +30,7 @@ B4:
 !0 = !{!"branch_weights", i32 4, i32 124}
 
 ; CHECK: Machine code for function test1:
-; CHECK: Successors according to CFG: BB#1(67108864) BB#2(2080374784)
+; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}3.12%) BB#2({{[0-9a-fx/= ]+}}96.88%)
 
 @g0 = common global i32 0, align 4
 
diff --git a/test/CodeGen/Generic/MachineBranchProb.ll b/test/CodeGen/Generic/MachineBranchProb.ll
index 5a4a4672f7eb..ae3c8da21471 100644
--- a/test/CodeGen/Generic/MachineBranchProb.ll
+++ b/test/CodeGen/Generic/MachineBranchProb.ll
@@ -16,11 +16,11 @@ entry:
     i64 5, label %sw.bb1
   ], !prof !0
 ; CHECK: BB#0: derived from LLVM BB %entry
-; CHECK: Successors according to CFG: BB#2(1616928864) BB#4(530554784)
+; CHECK: Successors according to CFG: BB#2({{[0-9a-fx/= ]+}}75.29%) BB#4({{[0-9a-fx/= ]+}}24.71%)
 ; CHECK: BB#4: derived from LLVM BB %entry
-; CHECK: Successors according to CFG: BB#1(252645135) BB#5(277909649)
+; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}47.62%) BB#5({{[0-9a-fx/= ]+}}52.38%)
 ; CHECK: BB#5: derived from LLVM BB %entry
-; CHECK: Successors according to CFG: BB#1(101058054) BB#3(176851595)
+; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}36.36%) BB#3({{[0-9a-fx/= ]+}}63.64%)
 
 sw.bb:
   br label %return
@@ -62,7 +62,7 @@ return: ret void
 ; CHECK-LABEL: Machine code for function left_leaning_weight_balanced_tree:
 ; CHECK: BB#0: derived from LLVM BB %entry
 ; CHECK-NOT: Successors
-; CHECK: Successors according to CFG: BB#8(852677332) BB#9(1294806318)
+; CHECK: Successors according to CFG: BB#8({{[0-9a-fx/= ]+}}39.71%) BB#9({{[0-9a-fx/= ]+}}60.29%)
 }
 
 !1 = !{!"branch_weights",
diff --git a/test/CodeGen/Hexagon/ifcvt-edge-weight.ll b/test/CodeGen/Hexagon/ifcvt-edge-weight.ll
index f84fd95e4fbd..341567e1d02f 100644
--- a/test/CodeGen/Hexagon/ifcvt-edge-weight.ll
+++ b/test/CodeGen/Hexagon/ifcvt-edge-weight.ll
@@ -2,7 +2,7 @@
 ; Check that the edge weights are updated correctly after if-conversion.
 
 ; CHECK: BB#3:
-; CHECK: Successors according to CFG: BB#2(214748365) BB#1(1932735283)
+; CHECK: Successors according to CFG: BB#2({{[0-9a-fx/= ]+}}10.00%) BB#1({{[0-9a-fx/= ]+}}90.00%)
 @a = external global i32
 @d = external global i32
 
diff --git a/test/CodeGen/MIR/X86/newline-handling.mir b/test/CodeGen/MIR/X86/newline-handling.mir
index b5ed3b7f27e1..bce06d540114 100644
--- a/test/CodeGen/MIR/X86/newline-handling.mir
+++ b/test/CodeGen/MIR/X86/newline-handling.mir
@@ -35,7 +35,7 @@ liveins:
 # CHECK-LABEL: name: foo
 # CHECK: body: |
 # CHECK-NEXT: bb.0.entry:
-# CHECK-NEXT: successors: %bb.1.less(0), %bb.2.exit(0)
+# CHECK-NEXT: successors: %bb.1.less(0x40000000 / 0x80000000 = 50.00%), %bb.2.exit(0x40000000 / 0x80000000 = 50.00%)
 # CHECK-NEXT: liveins: %edi
 # CHECK:      CMP32ri8 %edi, 10, implicit-def %eflags
 # CHECK-NEXT: JG_1 %bb.2.exit, implicit killed %eflags
@@ -79,7 +79,7 @@ liveins:
 # CHECK-LABEL: name: bar
 # CHECK: body: |
 # CHECK-NEXT: bb.0.entry:
-# CHECK-NEXT: successors: %bb.1.less(0), %bb.2.exit(0)
+# CHECK-NEXT: successors: %bb.1.less(0x40000000 / 0x80000000 = 50.00%), %bb.2.exit(0x40000000 / 0x80000000 = 50.00%)
 # CHECK-NEXT: liveins: %edi
 # CHECK:      CMP32ri8 %edi, 10, implicit-def %eflags
 # CHECK-NEXT: JG_1 %bb.2.exit, implicit killed %eflags
diff --git a/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir b/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir
index fc5e5d640f7f..64af6121189a 100644
--- a/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir
+++ b/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir
@@ -1,6 +1,6 @@
 # RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
 # This test ensures that the MIR parser parses basic block successors and
-# weights correctly.
+# probabilities correctly.
 
 --- |
 
@@ -21,10 +21,10 @@
 name:            foo
 body: |
   ; CHECK-LABEL: bb.0.entry:
-  ; CHECK:         successors: %bb.1.less(16), %bb.2.exit(32)
+  ; CHECK:         successors: %bb.1.less({{[0-9a-fx/= ]+}}33.00%), %bb.2.exit({{[0-9a-fx/= ]+}}67.00%)
   ; CHECK-LABEL: bb.1.less:
   bb.0.entry:
-    successors: %bb.1.less (16), %bb.2.exit(32)
+    successors: %bb.1.less (33), %bb.2.exit(67)
     liveins: %edi
 
     CMP32ri8 %edi, 10, implicit-def %eflags
diff --git a/test/CodeGen/MIR/X86/successor-basic-blocks.mir b/test/CodeGen/MIR/X86/successor-basic-blocks.mir
index aa80fe9fbeef..a6c14f70bc7c 100644
--- a/test/CodeGen/MIR/X86/successor-basic-blocks.mir
+++ b/test/CodeGen/MIR/X86/successor-basic-blocks.mir
@@ -32,7 +32,7 @@
 name:            foo
 body: |
   ; CHECK-LABEL: bb.0.entry:
-  ; CHECK:         successors: %bb.1.less(0), %bb.2.exit(0)
+  ; CHECK:         successors: %bb.1.less(0x40000000 / 0x80000000 = 50.00%), %bb.2.exit(0x40000000 / 0x80000000 = 50.00%)
   ; CHECK-LABEL: bb.1.less:
   bb.0.entry:
     successors: %bb.1.less, %bb.2.exit
@@ -58,7 +58,7 @@ body: |
   ; Verify that we can have multiple lists of successors that will be merged
   ; into one.
   ; CHECK-LABEL: bb.0.entry:
-  ; CHECK:         successors: %bb.1(0), %bb.2(0)
+  ; CHECK:         successors: %bb.1(0x80000000 / 0x80000000 = 100.00%), %bb.2(0x00000000 / 0x80000000 = 0.00%)
   bb.0.entry:
     liveins: %edi
     successors: %bb.1
diff --git a/test/CodeGen/X86/MachineBranchProb.ll b/test/CodeGen/X86/MachineBranchProb.ll
index da0bf517ecfa..ee1c658d4c55 100644
--- a/test/CodeGen/X86/MachineBranchProb.ll
+++ b/test/CodeGen/X86/MachineBranchProb.ll
@@ -18,9 +18,9 @@ for.cond2:                                        ; preds = %for.inc, %for.cond
   %or.cond = or i1 %tobool, %cmp4
   br i1 %or.cond, label %for.inc20, label %for.inc, !prof !0
 ; CHECK: BB#1: derived from LLVM BB %for.cond2
-; CHECK: Successors according to CFG: BB#3(32756933) BB#4(2114726715)
+; CHECK: Successors according to CFG: BB#3({{[0-9a-fx/= ]+}}1.53%) BB#4({{[0-9a-fx/= ]+}}98.47%)
 ; CHECK: BB#4: derived from LLVM BB %for.cond2
-; CHECK: Successors according to CFG: BB#3(33264335) BB#2(2114219313)
+; CHECK: Successors according to CFG: BB#3({{[0-9a-fx/= ]+}}1.55%) BB#2({{[0-9a-fx/= ]+}}98.45%)
 
 for.inc:                                          ; preds = %for.cond2
   %shl = shl i32 %bit.0, 1
diff --git a/test/CodeGen/X86/catchpad-weight.ll b/test/CodeGen/X86/catchpad-weight.ll
index 9b06f2abc81c..e8b416845ec1 100644
--- a/test/CodeGen/X86/catchpad-weight.ll
+++ b/test/CodeGen/X86/catchpad-weight.ll
@@ -2,7 +2,7 @@
 
 ; Check if the edge weight to the catchpad is calculated correctly.
 
-; CHECK: Successors according to CFG: BB#3(2147481600) BB#1(2048) BB#4(1024) BB#6(512) BB#8(256)
+; CHECK: Successors according to CFG: BB#3(0x7ffff100 / 0x80000000 = 100.00%) BB#1(0x00000800 / 0x80000000 = 0.00%) BB#4(0x00000400 / 0x80000000 = 0.00%) BB#6(0x00000200 / 0x80000000 = 0.00%) BB#8(0x00000100 / 0x80000000 = 0.00%)
 
 target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64--windows-msvc18.0.0"
diff --git a/test/CodeGen/X86/stack-protector-weight.ll b/test/CodeGen/X86/stack-protector-weight.ll
index 16877ef70a31..dea66d28e3dd 100644
--- a/test/CodeGen/X86/stack-protector-weight.ll
+++ b/test/CodeGen/X86/stack-protector-weight.ll
@@ -2,13 +2,13 @@
 ; RUN: llc -mtriple=x86_64-apple-darwin -print-machineinstrs=expand-isel-pseudos -enable-selectiondag-sp=false %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=IR
 
 ; SELDAG: # Machine code for function test_branch_weights:
-; SELDAG: Successors according to CFG: BB#[[SUCCESS:[0-9]+]](2147481600) BB#[[FAILURE:[0-9]+]](2048)
+; SELDAG: Successors according to CFG: BB#[[SUCCESS:[0-9]+]]({{[0-9a-fx/= ]+}}100.00%) BB#[[FAILURE:[0-9]+]]
 ; SELDAG: BB#[[FAILURE]]:
 ; SELDAG: CALL64pcrel32 <es:__stack_chk_fail>
 ; SELDAG: BB#[[SUCCESS]]:
 
 ; IR: # Machine code for function test_branch_weights:
-; IR: Successors according to CFG: BB#[[SUCCESS:[0-9]+]](2147481600) BB#[[FAILURE:[0-9]+]](2048)
+; IR: Successors according to CFG: BB#[[SUCCESS:[0-9]+]]({{[0-9a-fx/= ]+}}100.00%) BB#[[FAILURE:[0-9]+]]
 ; IR: BB#[[SUCCESS]]:
 ; IR: BB#[[FAILURE]]:
 ; IR: CALL64pcrel32 <ga:@__stack_chk_fail>
diff --git a/test/CodeGen/X86/switch-edge-weight.ll b/test/CodeGen/X86/switch-edge-weight.ll
index 9026f6f05f0e..6f594868c7ad 100644
--- a/test/CodeGen/X86/switch-edge-weight.ll
+++ b/test/CodeGen/X86/switch-edge-weight.ll
@@ -34,22 +34,22 @@ sw.epilog:
 ; CHECK: BB#0:
 ; BB#0 to BB#4: [0, 1133] (65 = 60 + 5)
 ; BB#0 to BB#5: [1134, UINT32_MAX] (25 = 20 + 5)
-; CHECK: Successors according to CFG: BB#4(1550960411) BB#5(596523235)
+; CHECK: Successors according to CFG: BB#4({{[0-9a-fx/= ]+}}72.22%) BB#5({{[0-9a-fx/= ]+}}27.78%)
 ;
 ; CHECK: BB#4:
 ; BB#4 to BB#1: [155, 159] (50)
 ; BB#4 to BB#5: [0, 1133] - [155, 159] (15 = 10 + 5)
-; CHECK: Successors according to CFG: BB#1(1193046470) BB#7(357913941)
+; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}76.92%) BB#7({{[0-9a-fx/= ]+}}23.08%)
 ;
 ; CHECK: BB#5:
 ; BB#5 to BB#1: {1140} (10)
 ; BB#5 to BB#6: [1134, UINT32_MAX] - {1140} (15 = 10 + 5)
-; CHECK: Successors according to CFG: BB#1(238609294) BB#6(357913941)
+; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}40.00%) BB#6({{[0-9a-fx/= ]+}}60.00%)
 ;
 ; CHECK: BB#6:
 ; BB#6 to BB#1: {1134} (10)
 ; BB#6 to BB#2: [1134, UINT32_MAX] - {1134, 1140} (5)
-; CHECK: Successors according to CFG: BB#1(238609294) BB#2(119304647)
+; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}66.67%) BB#2({{[0-9a-fx/= ]+}}33.33%)
 }
 
 ; CHECK-LABEL: test2
@@ -102,7 +102,7 @@ sw.epilog:
 ; CHECK: BB#0:
 ; BB#0 to BB#6: {0} + [15, UINT32_MAX] (5)
 ; BB#0 to BB#8: [1, 14] (jump table) (65 = 60 + 5)
-; CHECK: Successors according to CFG: BB#6(153391689) BB#8(1994091957)
+; CHECK: Successors according to CFG: BB#6({{[0-9a-fx/= ]+}}7.14%) BB#8({{[0-9a-fx/= ]+}}92.86%
 ;
 ; CHECK: BB#8:
 ; BB#8 to BB#1: {1} (10)
@@ -111,7 +111,7 @@ sw.epilog:
 ; BB#8 to BB#3: {11} (10)
 ; BB#8 to BB#4: {12} (10)
 ; BB#8 to BB#5: {13, 14} (20)
-; CHECK: Successors according to CFG: BB#1(306783378) BB#6(153391689) BB#2(306783378) BB#3(306783378) BB#4(306783378) BB#5(613566756)
+; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}14.29%) BB#6({{[0-9a-fx/= ]+}}7.14%) BB#2({{[0-9a-fx/= ]+}}14.29%) BB#3({{[0-9a-fx/= ]+}}14.29%) BB#4({{[0-9a-fx/= ]+}}14.29%) BB#5({{[0-9a-fx/= ]+}}28.57%)
 }
 
 ; CHECK-LABEL: test3
@@ -163,7 +163,7 @@ sw.epilog:
 ; CHECK: BB#0:
 ; BB#0 to BB#6: [0, 9] + [15, UINT32_MAX] {10}
 ; BB#0 to BB#8: [10, 14] (jump table) (50)
-; CHECK: Successors according to CFG: BB#6(357913941) BB#8(1789569705)
+; CHECK: Successors according to CFG: BB#6({{[0-9a-fx/= ]+}}16.67%) BB#8({{[0-9a-fx/= ]+}}83.33%)
 ;
 ; CHECK: BB#8:
 ; BB#8 to BB#1: {10} (10)
@@ -171,7 +171,7 @@ sw.epilog:
 ; BB#8 to BB#3: {12} (10)
 ; BB#8 to BB#4: {13} (10)
 ; BB#8 to BB#5: {14} (10)
-; CHECK: Successors according to CFG: BB#1(357913941) BB#2(357913941) BB#3(357913941) BB#4(357913941) BB#5(357913941)
+; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}20.00%) BB#2({{[0-9a-fx/= ]+}}20.00%) BB#3({{[0-9a-fx/= ]+}}20.00%) BB#4({{[0-9a-fx/= ]+}}20.00%) BB#5({{[0-9a-fx/= ]+}}20.00%)
 }
 
 ; CHECK-LABEL: test4
@@ -216,12 +216,12 @@ sw.epilog:
 ; CHECK: BB#0:
 ; BB#0 to BB#6: [0, 110] + [116, UINT32_MAX] (20)
 ; BB#0 to BB#7: [111, 115] (bit test) (50)
-; CHECK: Successors according to CFG: BB#6(613566756) BB#7(1533916890)
+; CHECK: Successors according to CFG: BB#6({{[0-9a-fx/= ]+}}28.57%) BB#7({{[0-9a-fx/= ]+}}71.43%)
 ;
 ; CHECK: BB#7:
 ; BB#7 to BB#2: {111, 114, 115} (30)
 ; BB#7 to BB#3: {112, 113} (20)
-; CHECK: Successors according to CFG: BB#2(920350134) BB#3(613566756)
+; CHECK: Successors according to CFG: BB#2({{[0-9a-fx/= ]+}}60.00%) BB#3({{[0-9a-fx/= ]+}}40.00%)
 }
 
 ; CHECK-LABEL: test5
@@ -273,7 +273,7 @@ sw.epilog:
 ; CHECK: BB#0:
 ; BB#0 to BB#6: [10, UINT32_MAX] (15)
 ; BB#0 to BB#8: [1, 5, 7, 9] (jump table) (45)
-; CHECK: Successors according to CFG: BB#8(536870912) BB#9(1610612734)
+; CHECK: Successors according to CFG: BB#8({{[0-9a-fx/= ]+}}25.00%) BB#9({{[0-9a-fx/= ]+}}75.00%)
 }
 
 !1 = !{!"branch_weights", i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10} 
diff --git a/test/CodeGen/X86/switch-jump-table.ll b/test/CodeGen/X86/switch-jump-table.ll
index 3cfee1cd80e6..896a067da230 100644
--- a/test/CodeGen/X86/switch-jump-table.ll
+++ b/test/CodeGen/X86/switch-jump-table.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -mtriple=i686-pc-gnu-linux < %s | FileCheck %s -check-prefix=CHECK
-; RUN: llc -mtriple=i686-pc-gnu-linux -print-machineinstrs=expand-isel-pseudos %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=CHECK-JT-WEIGHT
+; RUN: llc -mtriple=i686-pc-gnu-linux -print-machineinstrs=expand-isel-pseudos %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=CHECK-JT-PROB
 
 
 ; An unreachable default destination is replaced with the most popular case label.
@@ -54,9 +54,9 @@ default:
 ; Check if branch probabilities are correctly assigned to the jump table.
 
 define void @bar(i32 %x, i32* %to) {
-; CHECK-JT-WEIGHT-LABEL: bar:
-; CHECK-JT-WEIGHT: Successors according to CFG: BB#6(306783378) BB#8(1840700268)
-; CHECK-JT-WEIGHT: Successors according to CFG: BB#1(306783378) BB#2(306783378) BB#3(306783378) BB#4(306783378) BB#5(613566756)
+; CHECK-JT-PROB-LABEL: bar:
+; CHECK-JT-PROB: Successors according to CFG: BB#6({{[0-9a-fx/= ]+}}14.29%) BB#8({{[0-9a-fx/= ]+}}85.71%)
+; CHECK-JT-PROB: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}16.67%) BB#2({{[0-9a-fx/= ]+}}16.67%) BB#3({{[0-9a-fx/= ]+}}16.67%) BB#4({{[0-9a-fx/= ]+}}16.67%) BB#5({{[0-9a-fx/= ]+}}33.33%)
 
 entry:
   switch i32 %x, label %default [

From 531c21057053b9dd7d77ef59ddd92a1d99f0e1bc Mon Sep 17 00:00:00 2001
From: Evgeniy Stepanov <eugeni.stepanov@gmail.com>
Date: Tue, 1 Dec 2015 00:06:13 +0000
Subject: [PATCH 032/186] [safestack] Fix handling of array allocas.

The current code does not take alloca array size into account and,
as a result, considers any access past the first array element to be
unsafe.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254350 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Instrumentation/SafeStack.cpp | 22 +++++++--
 test/Transforms/SafeStack/array.ll           | 48 ++++++++++++++++++++
 2 files changed, 65 insertions(+), 5 deletions(-)

diff --git a/lib/Transforms/Instrumentation/SafeStack.cpp b/lib/Transforms/Instrumentation/SafeStack.cpp
index 6071ca5a8754..f8c4058ae22a 100644
--- a/lib/Transforms/Instrumentation/SafeStack.cpp
+++ b/lib/Transforms/Instrumentation/SafeStack.cpp
@@ -118,6 +118,10 @@ class SafeStack : public FunctionPass {
                  SmallVectorImpl<ReturnInst *> &Returns,
                  SmallVectorImpl<Instruction *> &StackRestorePoints);
 
+  /// \brief Calculate the allocation size of a given alloca. Returns 0 if the
+  /// size can not be statically determined.
+  uint64_t getStaticAllocaAllocationSize(const AllocaInst* AI);
+
   /// \brief Allocate space for all static allocas in \p StaticAllocas,
   /// replace allocas with pointers into the unsafe stack and generate code to
   /// restore the stack pointer before all return instructions in \p Returns.
@@ -177,6 +181,17 @@ class SafeStack : public FunctionPass {
   bool runOnFunction(Function &F) override;
 }; // class SafeStack
 
+uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) {
+  uint64_t Size = DL->getTypeAllocSize(AI->getAllocatedType());
+  if (AI->isArrayAllocation()) {
+    auto C = dyn_cast<ConstantInt>(AI->getArraySize());
+    if (!C)
+      return 0;
+    Size *= C->getZExtValue();
+  }
+  return Size;
+}
+
 bool SafeStack::IsAccessSafe(Value *Addr, uint64_t Size, const AllocaInst *AI) {
   AllocaOffsetRewriter Rewriter(*SE, AI);
   const SCEV *Expr = Rewriter.visit(SE->getSCEV(Addr));
@@ -187,8 +202,7 @@ bool SafeStack::IsAccessSafe(Value *Addr, uint64_t Size, const AllocaInst *AI) {
       ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, Size));
   ConstantRange AccessRange = AccessStartRange.add(SizeRange);
   ConstantRange AllocaRange = ConstantRange(
-      APInt(BitWidth, 0),
-      APInt(BitWidth, DL->getTypeStoreSize(AI->getAllocatedType())));
+      APInt(BitWidth, 0), APInt(BitWidth, getStaticAllocaAllocationSize(AI)));
   bool Safe = AllocaRange.contains(AccessRange);
 
   DEBUG(dbgs() << "[SafeStack] Alloca " << *AI << "\n"
@@ -463,10 +477,8 @@ SafeStack::moveStaticAllocasToUnsafeStack(IRBuilder<> &IRB, Function &F,
   for (AllocaInst *AI : StaticAllocas) {
     IRB.SetInsertPoint(AI);
 
-    auto CArraySize = cast<ConstantInt>(AI->getArraySize());
     Type *Ty = AI->getAllocatedType();
-
-    uint64_t Size = DL->getTypeAllocSize(Ty) * CArraySize->getZExtValue();
+    uint64_t Size = getStaticAllocaAllocationSize(AI);
     if (Size == 0)
       Size = 1; // Don't create zero-sized stack objects.
 
diff --git a/test/Transforms/SafeStack/array.ll b/test/Transforms/SafeStack/array.ll
index 6036bfc2c9c5..b2454dc2bb9e 100644
--- a/test/Transforms/SafeStack/array.ll
+++ b/test/Transforms/SafeStack/array.ll
@@ -35,4 +35,52 @@ entry:
   ret void
 }
 
+; Load from an array at a fixed offset, no overflow.
+define i8 @StaticArrayFixedSafe() nounwind uwtable safestack {
+entry:
+  ; CHECK-LABEL: define i8 @StaticArrayFixedSafe(
+  ; CHECK-NOT: __safestack_unsafe_stack_ptr
+  ; CHECK: ret i8
+  %buf = alloca i8, i32 4, align 1
+  %gep = getelementptr inbounds i8, i8* %buf, i32 2
+  %x = load i8, i8* %gep, align 1
+  ret i8 %x
+}
+
+; Load from an array at a fixed offset with overflow.
+define i8 @StaticArrayFixedUnsafe() nounwind uwtable safestack {
+entry:
+  ; CHECK-LABEL: define i8 @StaticArrayFixedUnsafe(
+  ; CHECK: __safestack_unsafe_stack_ptr
+  ; CHECK: ret i8
+  %buf = alloca i8, i32 4, align 1
+  %gep = getelementptr inbounds i8, i8* %buf, i32 5
+  %x = load i8, i8* %gep, align 1
+  ret i8 %x
+}
+
+; Load from an array at an unknown offset.
+define i8 @StaticArrayVariableUnsafe(i32 %ofs) nounwind uwtable safestack {
+entry:
+  ; CHECK-LABEL: define i8 @StaticArrayVariableUnsafe(
+  ; CHECK: __safestack_unsafe_stack_ptr
+  ; CHECK: ret i8
+  %buf = alloca i8, i32 4, align 1
+  %gep = getelementptr inbounds i8, i8* %buf, i32 %ofs
+  %x = load i8, i8* %gep, align 1
+  ret i8 %x
+}
+
+; Load from an array of an unknown size.
+define i8 @DynamicArrayUnsafe(i32 %sz) nounwind uwtable safestack {
+entry:
+  ; CHECK-LABEL: define i8 @DynamicArrayUnsafe(
+  ; CHECK: __safestack_unsafe_stack_ptr
+  ; CHECK: ret i8
+  %buf = alloca i8, i32 %sz, align 1
+  %gep = getelementptr inbounds i8, i8* %buf, i32 2
+  %x = load i8, i8* %gep, align 1
+  ret i8 %x
+}
+
 declare i8* @strcpy(i8*, i8*)

From 390ced11080a9faa958b4b6e08e8d7fd40347ac2 Mon Sep 17 00:00:00 2001
From: Evgeniy Stepanov <eugeni.stepanov@gmail.com>
Date: Tue, 1 Dec 2015 00:34:30 +0000
Subject: [PATCH 033/186] Extend debug info for function parameters in SDAG.

SDAG currently can emit debug location for function parameters when
an llvm.dbg.declare points to either a function argument SSA temp,
or to an AllocaInst. This change extends this logic by adding a
fallback case when neither of the above is true.

This is required for SafeStack, which may copy the contents of a
byval function argument into something that is not an alloca, and
then describe the target as the new location of the said argument.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254352 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../SelectionDAG/SelectionDAGBuilder.cpp      | 27 +++---
 test/DebugInfo/Generic/safestack-byval.ll     | 91 +++++++++++++++++++
 2 files changed, 102 insertions(+), 16 deletions(-)
 create mode 100644 test/DebugInfo/Generic/safestack-byval.ll

diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index f754e24e3231..d880bcfbdf64 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4463,22 +4463,17 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
         Address = BCI->getOperand(0);
       // Parameters are handled specially.
       bool isParameter = Variable->isParameter() || isa<Argument>(Address);
-
-      const AllocaInst *AI = dyn_cast<AllocaInst>(Address);
-
-      if (isParameter && !AI) {
-        FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(N.getNode());
-        if (FINode)
-          // Byval parameter.  We have a frame index at this point.
-          SDV = DAG.getFrameIndexDbgValue(
-              Variable, Expression, FINode->getIndex(), 0, dl, SDNodeOrder);
-        else {
-          // Address is an argument, so try to emit its dbg value using
-          // virtual register info from the FuncInfo.ValueMap.
-          EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, false,
-                                   N);
-          return nullptr;
-        }
+      auto FINode = dyn_cast<FrameIndexSDNode>(N.getNode());
+      if (isParameter && FINode) {
+        // Byval parameter. We have a frame index at this point.
+        SDV = DAG.getFrameIndexDbgValue(Variable, Expression,
+                                        FINode->getIndex(), 0, dl, SDNodeOrder);
+      } else if (isa<Argument>(Address)) {
+        // Address is an argument, so try to emit its dbg value using
+        // virtual register info from the FuncInfo.ValueMap.
+        EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, false,
+                                 N);
+        return nullptr;
       } else {
         SDV = DAG.getDbgValue(Variable, Expression, N.getNode(), N.getResNo(),
                               true, 0, dl, SDNodeOrder);
diff --git a/test/DebugInfo/Generic/safestack-byval.ll b/test/DebugInfo/Generic/safestack-byval.ll
new file mode 100644
index 000000000000..01ef064a78b6
--- /dev/null
+++ b/test/DebugInfo/Generic/safestack-byval.ll
@@ -0,0 +1,91 @@
+; Test dwarf codegen for DILocalVariable of a byval function argument that
+; points to neither an argument nor an alloca. This kind of IR is generated by
+; SafeStack for unsafe byval arguments.
+; RUN: llc -stop-after expand-isel-pseudos %s -o /dev/null | FileCheck %s
+
+; This was built by compiling the following source with SafeStack and
+; simplifying the result a little.
+; struct S {
+;   int a[100];
+; };
+;
+; int f(S zzz, unsigned long len) {
+;   return zzz.a[len];
+; }
+
+; CHECK: ![[ZZZ:.*]] = !DILocalVariable(name: "zzz",
+; CHECK: ![[ZZZ_EXPR:.*]] = !DIExpression(DW_OP_deref, DW_OP_minus, 400)
+; CHECK: DBG_VALUE {{.*}} ![[ZZZ]], ![[ZZZ_EXPR]]
+
+%struct.S = type { [100 x i32] }
+
+@__safestack_unsafe_stack_ptr = external thread_local(initialexec) global i8*
+
+; Function Attrs: norecurse nounwind readonly safestack uwtable
+define i32 @_Z1f1Sm(%struct.S* byval nocapture readonly align 8 %zzz, i64 %len) #0 !dbg !12 {
+entry:
+  %unsafe_stack_ptr = load i8*, i8** @__safestack_unsafe_stack_ptr, !dbg !22
+  %unsafe_stack_static_top = getelementptr i8, i8* %unsafe_stack_ptr, i32 -400, !dbg !22
+  store i8* %unsafe_stack_static_top, i8** @__safestack_unsafe_stack_ptr, !dbg !22
+; !17 describes "zzz"
+  call void @llvm.dbg.declare(metadata i8* %unsafe_stack_ptr, metadata !17, metadata !23), !dbg !22
+  %0 = getelementptr i8, i8* %unsafe_stack_ptr, i32 -400, !dbg !22
+  %zzz.unsafe-byval = bitcast i8* %0 to %struct.S*, !dbg !22
+  %1 = bitcast %struct.S* %zzz to i8*, !dbg !24
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 400, i32 8, i1 false), !dbg !24
+  tail call void @llvm.dbg.value(metadata i64 %len, i64 0, metadata !18, metadata !25), !dbg !24
+  %arrayidx = getelementptr inbounds %struct.S, %struct.S* %zzz.unsafe-byval, i64 0, i32 0, i64 %len, !dbg !26
+  %2 = load i32, i32* %arrayidx, align 4, !dbg !26, !tbaa !27
+  store i8* %unsafe_stack_ptr, i8** @__safestack_unsafe_stack_ptr, !dbg !31
+  ret i32 %2, !dbg !31
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #2
+
+attributes #0 = { norecurse nounwind readonly safestack uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { argmemonly nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!19, !20}
+!llvm.ident = !{!21}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.8.0 (trunk 254107) (llvm/trunk 254109)", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, retainedTypes: !3, subprograms: !11)
+!1 = !DIFile(filename: "../llvm/1.cc", directory: "/tmp/build")
+!2 = !{}
+!3 = !{!4}
+!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "S", file: !1, line: 4, size: 3200, align: 32, elements: !5, identifier: "_ZTS1S")
+!5 = !{!6}
+!6 = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !"_ZTS1S", file: !1, line: 5, baseType: !7, size: 3200, align: 32)
+!7 = !DICompositeType(tag: DW_TAG_array_type, baseType: !8, size: 3200, align: 32, elements: !9)
+!8 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!9 = !{!10}
+!10 = !DISubrange(count: 100)
+!11 = !{!12}
+!12 = distinct !DISubprogram(name: "f", linkageName: "_Z1f1Sm", scope: !1, file: !1, line: 8, type: !13, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, variables: !16)
+!13 = !DISubroutineType(types: !14)
+!14 = !{!8, !"_ZTS1S", !15}
+!15 = !DIBasicType(name: "long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned)
+!16 = !{!17, !18}
+!17 = !DILocalVariable(name: "zzz", arg: 1, scope: !12, file: !1, line: 8, type: !"_ZTS1S")
+!18 = !DILocalVariable(name: "len", arg: 2, scope: !12, file: !1, line: 8, type: !15)
+!19 = !{i32 2, !"Dwarf Version", i32 4}
+!20 = !{i32 2, !"Debug Info Version", i32 3}
+!21 = !{!"clang version 3.8.0 (trunk 254107) (llvm/trunk 254109)"}
+!22 = !DILocation(line: 8, column: 9, scope: !12)
+!23 = !DIExpression(DW_OP_deref, DW_OP_minus, 400)
+!24 = !DILocation(line: 8, column: 28, scope: !12)
+!25 = !DIExpression()
+!26 = !DILocation(line: 9, column: 10, scope: !12)
+!27 = !{!28, !28, i64 0}
+!28 = !{!"int", !29, i64 0}
+!29 = !{!"omnipotent char", !30, i64 0}
+!30 = !{!"Simple C/C++ TBAA"}
+!31 = !DILocation(line: 9, column: 3, scope: !12)

From dd121fc8471347915e14b54c50409b27813e990b Mon Sep 17 00:00:00 2001
From: Evgeniy Stepanov <eugeni.stepanov@gmail.com>
Date: Tue, 1 Dec 2015 00:40:05 +0000
Subject: [PATCH 034/186] [safestack] Protect byval function arguments.

Detect unsafe byval function arguments and move them to the unsafe
stack.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254353 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Transforms/Utils/Local.h        |  18 ++-
 lib/Transforms/Instrumentation/SafeStack.cpp | 149 +++++++++++++------
 lib/Transforms/Utils/Local.cpp               |  16 +-
 test/Transforms/SafeStack/byval.ll           |  51 +++++++
 test/Transforms/SafeStack/debug-loc.ll       | 118 +++++++--------
 5 files changed, 241 insertions(+), 111 deletions(-)
 create mode 100644 test/Transforms/SafeStack/byval.ll

diff --git a/include/llvm/Transforms/Utils/Local.h b/include/llvm/Transforms/Utils/Local.h
index b7d67eaea3a0..1d707a1e5307 100644
--- a/include/llvm/Transforms/Utils/Local.h
+++ b/include/llvm/Transforms/Utils/Local.h
@@ -271,10 +271,20 @@ bool LowerDbgDeclare(Function &F);
 /// an alloca, if any.
 DbgDeclareInst *FindAllocaDbgDeclare(Value *V);
 
-/// \brief Replaces llvm.dbg.declare instruction when an alloca is replaced with
-/// a new value. If Deref is true, an additional DW_OP_deref is prepended to the
-/// expression. If Offset is non-zero, a constant displacement is added to the
-/// expression (after the optional Deref). Offset can be negative.
+/// \brief Replaces llvm.dbg.declare instruction when the address it describes
+/// is replaced with a new value. If Deref is true, an additional DW_OP_deref is
+/// prepended to the expression. If Offset is non-zero, a constant displacement
+/// is added to the expression (after the optional Deref). Offset can be
+/// negative.
+bool replaceDbgDeclare(Value *Address, Value *NewAddress,
+                       Instruction *InsertBefore, DIBuilder &Builder,
+                       bool Deref, int Offset);
+
+/// \brief Replaces llvm.dbg.declare instruction when the alloca it describes
+/// is replaced with a new value. If Deref is true, an additional DW_OP_deref is
+/// prepended to the expression. If Offset is non-zero, a constant displacement
+/// is added to the expression (after the optional Deref). Offset can be
+/// negative. New llvm.dbg.declare is inserted immediately before AI.
 bool replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
                                 DIBuilder &Builder, bool Deref, int Offset = 0);
 
diff --git a/lib/Transforms/Instrumentation/SafeStack.cpp b/lib/Transforms/Instrumentation/SafeStack.cpp
index f8c4058ae22a..4441663fc6de 100644
--- a/lib/Transforms/Instrumentation/SafeStack.cpp
+++ b/lib/Transforms/Instrumentation/SafeStack.cpp
@@ -57,6 +57,7 @@ STATISTIC(NumUnsafeStackRestorePointsFunctions,
 STATISTIC(NumAllocas, "Total number of allocas");
 STATISTIC(NumUnsafeStaticAllocas, "Number of unsafe static allocas");
 STATISTIC(NumUnsafeDynamicAllocas, "Number of unsafe dynamic allocas");
+STATISTIC(NumUnsafeByValArguments, "Number of unsafe byval arguments");
 STATISTIC(NumUnsafeStackRestorePoints, "Number of setjmps and landingpads");
 
 } // namespace llvm
@@ -68,14 +69,14 @@ namespace {
 ///
 /// The implementation simply replaces all mentions of the alloca with zero.
 class AllocaOffsetRewriter : public SCEVRewriteVisitor<AllocaOffsetRewriter> {
-  const AllocaInst *AI;
+  const Value *AllocaPtr;
 
 public:
-  AllocaOffsetRewriter(ScalarEvolution &SE, const AllocaInst *AI)
-      : SCEVRewriteVisitor(SE), AI(AI) {}
+  AllocaOffsetRewriter(ScalarEvolution &SE, const Value *AllocaPtr)
+      : SCEVRewriteVisitor(SE), AllocaPtr(AllocaPtr) {}
 
   const SCEV *visitUnknown(const SCEVUnknown *Expr) {
-    if (Expr->getValue() == AI)
+    if (Expr->getValue() == AllocaPtr)
       return SE.getZero(Expr->getType());
     return Expr;
   }
@@ -115,6 +116,7 @@ class SafeStack : public FunctionPass {
   /// given function and append them to the respective vectors.
   void findInsts(Function &F, SmallVectorImpl<AllocaInst *> &StaticAllocas,
                  SmallVectorImpl<AllocaInst *> &DynamicAllocas,
+                 SmallVectorImpl<Argument *> &ByValArguments,
                  SmallVectorImpl<ReturnInst *> &Returns,
                  SmallVectorImpl<Instruction *> &StackRestorePoints);
 
@@ -130,6 +132,7 @@ class SafeStack : public FunctionPass {
   /// allocas are allocated.
   Value *moveStaticAllocasToUnsafeStack(IRBuilder<> &IRB, Function &F,
                                         ArrayRef<AllocaInst *> StaticAllocas,
+                                        ArrayRef<Argument *> ByValArguments,
                                         ArrayRef<ReturnInst *> Returns);
 
   /// \brief Generate code to restore the stack after all stack restore points
@@ -149,11 +152,12 @@ class SafeStack : public FunctionPass {
                                        AllocaInst *DynamicTop,
                                        ArrayRef<AllocaInst *> DynamicAllocas);
 
-  bool IsSafeStackAlloca(const AllocaInst *AI);
+  bool IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize);
 
   bool IsMemIntrinsicSafe(const MemIntrinsic *MI, const Use &U,
-                          const AllocaInst *AI);
-  bool IsAccessSafe(Value *Addr, uint64_t Size, const AllocaInst *AI);
+                          const Value *AllocaPtr, uint64_t AllocaSize);
+  bool IsAccessSafe(Value *Addr, uint64_t Size, const Value *AllocaPtr,
+                    uint64_t AllocaSize);
 
 public:
   static char ID; // Pass identification, replacement for typeid.
@@ -192,20 +196,23 @@ uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) {
   return Size;
 }
 
-bool SafeStack::IsAccessSafe(Value *Addr, uint64_t Size, const AllocaInst *AI) {
-  AllocaOffsetRewriter Rewriter(*SE, AI);
+bool SafeStack::IsAccessSafe(Value *Addr, uint64_t AccessSize,
+                             const Value *AllocaPtr, uint64_t AllocaSize) {
+  AllocaOffsetRewriter Rewriter(*SE, AllocaPtr);
   const SCEV *Expr = Rewriter.visit(SE->getSCEV(Addr));
 
   uint64_t BitWidth = SE->getTypeSizeInBits(Expr->getType());
   ConstantRange AccessStartRange = SE->getUnsignedRange(Expr);
   ConstantRange SizeRange =
-      ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, Size));
+      ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, AccessSize));
   ConstantRange AccessRange = AccessStartRange.add(SizeRange);
-  ConstantRange AllocaRange = ConstantRange(
-      APInt(BitWidth, 0), APInt(BitWidth, getStaticAllocaAllocationSize(AI)));
+  ConstantRange AllocaRange =
+      ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, AllocaSize));
   bool Safe = AllocaRange.contains(AccessRange);
 
-  DEBUG(dbgs() << "[SafeStack] Alloca " << *AI << "\n"
+  DEBUG(dbgs() << "[SafeStack] "
+               << (isa<AllocaInst>(AllocaPtr) ? "Alloca " : "ByValArgument ")
+               << *AllocaPtr << "\n"
                << "            Access " << *Addr << "\n"
                << "            SCEV " << *Expr
                << " U: " << SE->getUnsignedRange(Expr)
@@ -218,36 +225,38 @@ bool SafeStack::IsAccessSafe(Value *Addr, uint64_t Size, const AllocaInst *AI) {
 }
 
 bool SafeStack::IsMemIntrinsicSafe(const MemIntrinsic *MI, const Use &U,
-                                   const AllocaInst *AI) {
+                                   const Value *AllocaPtr,
+                                   uint64_t AllocaSize) {
   // All MemIntrinsics have destination address in Arg0 and size in Arg2.
   if (MI->getRawDest() != U) return true;
   const auto *Len = dyn_cast<ConstantInt>(MI->getLength());
   // Non-constant size => unsafe. FIXME: try SCEV getRange.
   if (!Len) return false;
-  return IsAccessSafe(U, Len->getZExtValue(), AI);
+  return IsAccessSafe(U, Len->getZExtValue(), AllocaPtr, AllocaSize);
 }
 
-/// Check whether a given alloca instruction (AI) should be put on the safe
+/// Check whether a given allocation must be put on the safe
 /// stack or not. The function analyzes all uses of AI and checks whether it is
 /// only accessed in a memory safe way (as decided statically).
-bool SafeStack::IsSafeStackAlloca(const AllocaInst *AI) {
+bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) {
   // Go through all uses of this alloca and check whether all accesses to the
   // allocated object are statically known to be memory safe and, hence, the
   // object can be placed on the safe stack.
   SmallPtrSet<const Value *, 16> Visited;
-  SmallVector<const Instruction *, 8> WorkList;
-  WorkList.push_back(AI);
+  SmallVector<const Value *, 8> WorkList;
+  WorkList.push_back(AllocaPtr);
 
   // A DFS search through all uses of the alloca in bitcasts/PHI/GEPs/etc.
   while (!WorkList.empty()) {
-    const Instruction *V = WorkList.pop_back_val();
+    const Value *V = WorkList.pop_back_val();
     for (const Use &UI : V->uses()) {
       auto I = cast<const Instruction>(UI.getUser());
       assert(V == UI.get());
 
       switch (I->getOpcode()) {
       case Instruction::Load: {
-        if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getType()), AI))
+        if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getType()), AllocaPtr,
+                          AllocaSize))
           return false;
         break;
       }
@@ -257,13 +266,13 @@ bool SafeStack::IsSafeStackAlloca(const AllocaInst *AI) {
       case Instruction::Store: {
         if (V == I->getOperand(0)) {
           // Stored the pointer - conservatively assume it may be unsafe.
-          DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AI
+          DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AllocaPtr
                        << "\n            store of address: " << *I << "\n");
           return false;
         }
 
-        if (!IsAccessSafe(
-                UI, DL->getTypeStoreSize(I->getOperand(0)->getType()), AI))
+        if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getOperand(0)->getType()),
+                          AllocaPtr, AllocaSize))
           return false;
         break;
       }
@@ -283,8 +292,8 @@ bool SafeStack::IsSafeStackAlloca(const AllocaInst *AI) {
         }
 
         if (const MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) {
-          if (!IsMemIntrinsicSafe(MI, UI, AI)) {
-            DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AI
+          if (!IsMemIntrinsicSafe(MI, UI, AllocaPtr, AllocaSize)) {
+            DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AllocaPtr
                          << "\n            unsafe memintrinsic: " << *I
                          << "\n");
             return false;
@@ -302,9 +311,9 @@ bool SafeStack::IsSafeStackAlloca(const AllocaInst *AI) {
         ImmutableCallSite::arg_iterator B = CS.arg_begin(), E = CS.arg_end();
         for (ImmutableCallSite::arg_iterator A = B; A != E; ++A)
           if (A->get() == V)
-            if (!(CS.doesNotCapture(A - B) &&
-                  (CS.doesNotAccessMemory(A - B) || CS.doesNotAccessMemory()))) {
-              DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AI
+            if (!(CS.doesNotCapture(A - B) && (CS.doesNotAccessMemory(A - B) ||
+                                               CS.doesNotAccessMemory()))) {
+              DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AllocaPtr
                            << "\n            unsafe call: " << *I << "\n");
               return false;
             }
@@ -355,13 +364,15 @@ Value *SafeStack::getOrCreateUnsafeStackPtr(IRBuilder<> &IRB, Function &F) {
 void SafeStack::findInsts(Function &F,
                           SmallVectorImpl<AllocaInst *> &StaticAllocas,
                           SmallVectorImpl<AllocaInst *> &DynamicAllocas,
+                          SmallVectorImpl<Argument *> &ByValArguments,
                           SmallVectorImpl<ReturnInst *> &Returns,
                           SmallVectorImpl<Instruction *> &StackRestorePoints) {
   for (Instruction &I : instructions(&F)) {
     if (auto AI = dyn_cast<AllocaInst>(&I)) {
       ++NumAllocas;
 
-      if (IsSafeStackAlloca(AI))
+      uint64_t Size = getStaticAllocaAllocationSize(AI);
+      if (IsSafeStackAlloca(AI, Size))
         continue;
 
       if (AI->isStaticAlloca()) {
@@ -386,6 +397,17 @@ void SafeStack::findInsts(Function &F,
             "gcroot intrinsic not compatible with safestack attribute");
     }
   }
+  for (Argument &Arg : F.args()) {
+    if (!Arg.hasByValAttr())
+      continue;
+    uint64_t Size =
+        DL->getTypeStoreSize(Arg.getType()->getPointerElementType());
+    if (IsSafeStackAlloca(&Arg, Size))
+      continue;
+
+    ++NumUnsafeByValArguments;
+    ByValArguments.push_back(&Arg);
+  }
 }
 
 AllocaInst *
@@ -420,7 +442,7 @@ SafeStack::createStackRestorePoints(IRBuilder<> &IRB, Function &F,
   for (Instruction *I : StackRestorePoints) {
     ++NumUnsafeStackRestorePoints;
 
-    IRB.SetInsertPoint(cast<Instruction>(I->getNextNode()));
+    IRB.SetInsertPoint(I->getNextNode());
     Value *CurrentTop = DynamicTop ? IRB.CreateLoad(DynamicTop) : StaticTop;
     IRB.CreateStore(CurrentTop, UnsafeStackPtr);
   }
@@ -428,11 +450,10 @@ SafeStack::createStackRestorePoints(IRBuilder<> &IRB, Function &F,
   return DynamicTop;
 }
 
-Value *
-SafeStack::moveStaticAllocasToUnsafeStack(IRBuilder<> &IRB, Function &F,
-                                          ArrayRef<AllocaInst *> StaticAllocas,
-                                          ArrayRef<ReturnInst *> Returns) {
-  if (StaticAllocas.empty())
+Value *SafeStack::moveStaticAllocasToUnsafeStack(
+    IRBuilder<> &IRB, Function &F, ArrayRef<AllocaInst *> StaticAllocas,
+    ArrayRef<Argument *> ByValArguments, ArrayRef<ReturnInst *> Returns) {
+  if (StaticAllocas.empty() && ByValArguments.empty())
     return nullptr;
 
   DIBuilder DIB(*F.getParent());
@@ -454,6 +475,13 @@ SafeStack::moveStaticAllocasToUnsafeStack(IRBuilder<> &IRB, Function &F,
 
   // Compute maximum alignment among static objects on the unsafe stack.
   unsigned MaxAlignment = 0;
+  for (Argument *Arg : ByValArguments) {
+    Type *Ty = Arg->getType()->getPointerElementType();
+    unsigned Align = std::max((unsigned)DL->getPrefTypeAlignment(Ty),
+                              Arg->getParamAlignment());
+    if (Align > MaxAlignment)
+      MaxAlignment = Align;
+  }
   for (AllocaInst *AI : StaticAllocas) {
     Type *Ty = AI->getAllocatedType();
     unsigned Align =
@@ -465,15 +493,46 @@ SafeStack::moveStaticAllocasToUnsafeStack(IRBuilder<> &IRB, Function &F,
   if (MaxAlignment > StackAlignment) {
     // Re-align the base pointer according to the max requested alignment.
     assert(isPowerOf2_32(MaxAlignment));
-    IRB.SetInsertPoint(cast<Instruction>(BasePointer->getNextNode()));
+    IRB.SetInsertPoint(BasePointer->getNextNode());
     BasePointer = cast<Instruction>(IRB.CreateIntToPtr(
         IRB.CreateAnd(IRB.CreatePtrToInt(BasePointer, IntPtrTy),
                       ConstantInt::get(IntPtrTy, ~uint64_t(MaxAlignment - 1))),
         StackPtrTy));
   }
 
-  // Allocate space for every unsafe static AllocaInst on the unsafe stack.
   int64_t StaticOffset = 0; // Current stack top.
+  IRB.SetInsertPoint(BasePointer->getNextNode());
+
+  for (Argument *Arg : ByValArguments) {
+    Type *Ty = Arg->getType()->getPointerElementType();
+
+    uint64_t Size = DL->getTypeStoreSize(Ty);
+    if (Size == 0)
+      Size = 1; // Don't create zero-sized stack objects.
+
+    // Ensure the object is properly aligned.
+    unsigned Align = std::max((unsigned)DL->getPrefTypeAlignment(Ty),
+                              Arg->getParamAlignment());
+
+    // Add alignment.
+    // NOTE: we ensure that BasePointer itself is aligned to >= Align.
+    StaticOffset += Size;
+    StaticOffset = RoundUpToAlignment(StaticOffset, Align);
+
+    Value *Off = IRB.CreateGEP(BasePointer, // BasePointer is i8*
+                               ConstantInt::get(Int32Ty, -StaticOffset));
+    Value *NewArg = IRB.CreateBitCast(Off, Arg->getType(),
+                                     Arg->getName() + ".unsafe-byval");
+
+    // Replace alloc with the new location.
+    replaceDbgDeclare(Arg, BasePointer, BasePointer->getNextNode(), DIB,
+                      /*Deref=*/true, -StaticOffset);
+    Arg->replaceAllUsesWith(NewArg);
+    IRB.SetInsertPoint(cast<Instruction>(NewArg)->getNextNode());
+    IRB.CreateMemCpy(Off, Arg, Size, Arg->getParamAlignment());
+  }
+
+  // Allocate space for every unsafe static AllocaInst on the unsafe stack.
   for (AllocaInst *AI : StaticAllocas) {
     IRB.SetInsertPoint(AI);
 
@@ -509,7 +568,7 @@ SafeStack::moveStaticAllocasToUnsafeStack(IRBuilder<> &IRB, Function &F,
   StaticOffset = RoundUpToAlignment(StaticOffset, StackAlignment);
 
   // Update shadow stack pointer in the function epilogue.
-  IRB.SetInsertPoint(cast<Instruction>(BasePointer->getNextNode()));
+  IRB.SetInsertPoint(BasePointer->getNextNode());
 
   Value *StaticTop =
       IRB.CreateGEP(BasePointer, ConstantInt::get(Int32Ty, -StaticOffset),
@@ -621,6 +680,7 @@ bool SafeStack::runOnFunction(Function &F) {
 
   SmallVector<AllocaInst *, 16> StaticAllocas;
   SmallVector<AllocaInst *, 4> DynamicAllocas;
+  SmallVector<Argument *, 4> ByValArguments;
   SmallVector<ReturnInst *, 4> Returns;
 
   // Collect all points where stack gets unwound and needs to be restored
@@ -632,13 +692,15 @@ bool SafeStack::runOnFunction(Function &F) {
 
   // Find all static and dynamic alloca instructions that must be moved to the
   // unsafe stack, all return instructions and stack restore points.
-  findInsts(F, StaticAllocas, DynamicAllocas, Returns, StackRestorePoints);
+  findInsts(F, StaticAllocas, DynamicAllocas, ByValArguments, Returns,
+            StackRestorePoints);
 
   if (StaticAllocas.empty() && DynamicAllocas.empty() &&
-      StackRestorePoints.empty())
+      ByValArguments.empty() && StackRestorePoints.empty())
     return false; // Nothing to do in this function.
 
-  if (!StaticAllocas.empty() || !DynamicAllocas.empty())
+  if (!StaticAllocas.empty() || !DynamicAllocas.empty() ||
+      !ByValArguments.empty())
     ++NumUnsafeStackFunctions; // This function has the unsafe stack.
 
   if (!StackRestorePoints.empty())
@@ -648,7 +710,8 @@ bool SafeStack::runOnFunction(Function &F) {
   UnsafeStackPtr = getOrCreateUnsafeStackPtr(IRB, F);
 
   // The top of the unsafe stack after all unsafe static allocas are allocated.
-  Value *StaticTop = moveStaticAllocasToUnsafeStack(IRB, F, StaticAllocas, Returns);
+  Value *StaticTop = moveStaticAllocasToUnsafeStack(IRB, F, StaticAllocas,
+                                                    ByValArguments, Returns);
 
   // Safe stack object that stores the current unsafe stack top. It is updated
   // as unsafe dynamic (non-constant-sized) allocas are allocated and freed.
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index 0bd5fa9f8777..623da675e05b 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -1136,9 +1136,10 @@ DbgDeclareInst *llvm::FindAllocaDbgDeclare(Value *V) {
   return nullptr;
 }
 
-bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
-                                      DIBuilder &Builder, bool Deref, int Offset) {
-  DbgDeclareInst *DDI = FindAllocaDbgDeclare(AI);
+bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress,
+                             Instruction *InsertBefore, DIBuilder &Builder,
+                             bool Deref, int Offset) {
+  DbgDeclareInst *DDI = FindAllocaDbgDeclare(Address);
   if (!DDI)
     return false;
   DebugLoc Loc = DDI->getDebugLoc();
@@ -1168,12 +1169,17 @@ bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
 
   // Insert llvm.dbg.declare immediately after the original alloca, and remove
   // old llvm.dbg.declare.
-  Builder.insertDeclare(NewAllocaAddress, DIVar, DIExpr, Loc,
-                        AI->getNextNode());
+  Builder.insertDeclare(NewAddress, DIVar, DIExpr, Loc, InsertBefore);
   DDI->eraseFromParent();
   return true;
 }
 
+bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
+                                      DIBuilder &Builder, bool Deref, int Offset) {
+  return replaceDbgDeclare(AI, NewAllocaAddress, AI->getNextNode(), Builder,
+                           Deref, Offset);
+}
+
 /// changeToUnreachable - Insert an unreachable instruction before the specified
 /// instruction, making it and the rest of the code in the block dead.
 static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) {
diff --git a/test/Transforms/SafeStack/byval.ll b/test/Transforms/SafeStack/byval.ll
new file mode 100644
index 000000000000..f9a06e54d2df
--- /dev/null
+++ b/test/Transforms/SafeStack/byval.ll
@@ -0,0 +1,51 @@
+; RUN: opt -safe-stack -S -mtriple=i386-pc-linux-gnu < %s -o - | FileCheck %s
+; RUN: opt -safe-stack -S -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.S = type { [100 x i32] }
+
+; Safe access to a byval argument.
+define i32 @ByValSafe(%struct.S* byval nocapture readonly align 8 %zzz) norecurse nounwind readonly safestack uwtable {
+entry:
+  ; CHECK-LABEL: @ByValSafe
+  ; CHECK-NOT: __safestack_unsafe_stack_ptr
+  ; CHECK: ret i32
+  %arrayidx = getelementptr inbounds %struct.S, %struct.S* %zzz, i64 0, i32 0, i64 3
+  %0 = load i32, i32* %arrayidx, align 4
+  ret i32 %0
+}
+
+; Unsafe access to a byval argument.
+; Argument is copied to the unsafe stack.
+define i32 @ByValUnsafe(%struct.S* byval nocapture readonly align 8 %zzz, i64 %idx) norecurse nounwind readonly safestack uwtable {
+entry:
+  ; CHECK-LABEL: @ByValUnsafe
+  ; CHECK: %[[A:.*]] = load {{.*}} @__safestack_unsafe_stack_ptr
+  ; CHECK: store {{.*}} @__safestack_unsafe_stack_ptr
+  ; CHECK: %[[B:.*]] = getelementptr i8, i8* %[[A]], i32 -400
+  ; CHECK: %[[C:.*]] = bitcast %struct.S* %zzz to i8*
+  ; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[B]], i8* %[[C]], i64 400, i32 8, i1 false)
+  ; CHECK: ret i32
+  %arrayidx = getelementptr inbounds %struct.S, %struct.S* %zzz, i64 0, i32 0, i64 %idx
+  %0 = load i32, i32* %arrayidx, align 4
+  ret i32 %0
+}
+
+; Highly aligned byval argument.
+define i32 @ByValUnsafeAligned(%struct.S* byval nocapture readonly align 64 %zzz, i64 %idx) norecurse nounwind readonly safestack uwtable {
+entry:
+  ; CHECK-LABEL: @ByValUnsafeAligned
+  ; CHECK: %[[A:.*]] = load {{.*}} @__safestack_unsafe_stack_ptr
+  ; CHECK: %[[B:.*]] = ptrtoint i8* %[[A]] to i64
+  ; CHECK: and i64 %[[B]], -64
+  ; CHECK: ret i32
+  %arrayidx = getelementptr inbounds %struct.S, %struct.S* %zzz, i64 0, i32 0, i64 0
+  %0 = load i32, i32* %arrayidx, align 64
+  %arrayidx2 = getelementptr inbounds %struct.S, %struct.S* %zzz, i64 0, i32 0, i64 %idx
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  ret i32 %add
+}
+
diff --git a/test/Transforms/SafeStack/debug-loc.ll b/test/Transforms/SafeStack/debug-loc.ll
index 57d565f8cfd0..e72d0e9d2ff2 100644
--- a/test/Transforms/SafeStack/debug-loc.ll
+++ b/test/Transforms/SafeStack/debug-loc.ll
@@ -1,83 +1,83 @@
 ; RUN: opt -safe-stack -S -mtriple=i386-pc-linux-gnu < %s -o - | FileCheck %s
 
 ; Test debug location for the local variables moved onto the unsafe stack.
-; CHECK: define void @f
-; CHECK: %[[USP:.*]] = load i8*, i8** @__safestack_unsafe_stack_ptr
 
-; dbg.declare for %buf is gone; replaced with dbg.declare based off the unsafe stack pointer
-; CHECK-NOT: @llvm.dbg.declare.*%buf
-; CHECK: call void @llvm.dbg.declare(metadata i8* %[[USP]], metadata ![[VAR:.*]], metadata ![[EXPR:.*]])
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
 
-; dbg.declare appears before the first use of %buf
-; CHECK: getelementptr{{.*}}%buf
-; CHECK: call{{.*}}@Capture
-; CHECK: ret void
+%struct.S = type { [100 x i8] }
 
-; dbg.declare describes "buf"...
-; CHECK: ![[VAR]] = !DILocalVariable(name: "buf"
+; Function Attrs: safestack uwtable
+define void @f(%struct.S* byval align 8 %zzz) #0 !dbg !12 {
+; CHECK: define void @f
 
-; ... as an offset from the unsafe stack pointer
-; CHECK: ![[EXPR]] = !DIExpression(DW_OP_deref, DW_OP_minus, 400)
+entry:
+; CHECK: %[[USP:.*]] = load i8*, i8** @__safestack_unsafe_stack_ptr
 
+  %xxx = alloca %struct.S, align 1
+  call void @llvm.dbg.declare(metadata %struct.S* %zzz, metadata !18, metadata !19), !dbg !20
+  call void @llvm.dbg.declare(metadata %struct.S* %xxx, metadata !21, metadata !19), !dbg !22
 
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+; dbg.declare for %zzz and %xxx are gone; replaced with dbg.declare based off the unsafe stack pointer
+; CHECK-NOT: call void @llvm.dbg.declare
+; CHECK: call void @llvm.dbg.declare(metadata i8* %[[USP]], metadata ![[VAR_ARG:.*]], metadata ![[EXPR_ARG:.*]])
+; CHECK-NOT: call void @llvm.dbg.declare
+; CHECK: call void @llvm.dbg.declare(metadata i8* %[[USP]], metadata ![[VAR_LOCAL:.*]], metadata ![[EXPR_LOCAL:.*]])
+; CHECK-NOT: call void @llvm.dbg.declare
 
-; Function Attrs: safestack uwtable
-define void @f() #0 !dbg !4 {
-entry:
-  %buf = alloca [100 x i32], align 16
-  %0 = bitcast [100 x i32]* %buf to i8*, !dbg !16
-  call void @llvm.lifetime.start(i64 400, i8* %0) #4, !dbg !16
-  tail call void @llvm.dbg.declare(metadata [100 x i32]* %buf, metadata !8, metadata !17), !dbg !18
+  call void @Capture(%struct.S* %zzz), !dbg !23
+  call void @Capture(%struct.S* %xxx), !dbg !24
 
+; dbg.declare appears before the first use
+; CHECK:   call void @Capture
+; CHECK:   call void @Capture
 
-  %arraydecay = getelementptr inbounds [100 x i32], [100 x i32]* %buf, i64 0, i64 0, !dbg !19
-  call void @Capture(i32* %arraydecay), !dbg !20
-  call void @llvm.lifetime.end(i64 400, i8* %0) #4, !dbg !21
-  ret void, !dbg !21
+  ret void, !dbg !25
 }
 
-; Function Attrs: nounwind argmemonly
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+; CHECK-DAG: ![[VAR_ARG]] = !DILocalVariable(name: "zzz"
+; 100 aligned up to 8
+; CHECK-DAG: ![[EXPR_ARG]] = !DIExpression(DW_OP_deref, DW_OP_minus, 104
 
-; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #2
+; CHECK-DAG: ![[VAR_LOCAL]] = !DILocalVariable(name: "xxx"
+; CHECK-DAG: ![[EXPR_LOCAL]] = !DIExpression(DW_OP_deref, DW_OP_minus, 208
 
-declare void @Capture(i32*) #3
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
-; Function Attrs: nounwind argmemonly
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+declare void @Capture(%struct.S*) #2
 
-attributes #0 = { safestack uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind argmemonly }
-attributes #2 = { nounwind readnone }
-attributes #3 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind }
+attributes #0 = { safestack uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
 !llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!13, !14}
-!llvm.ident = !{!15}
+!llvm.module.flags = !{!15, !16}
+!llvm.ident = !{!17}
 
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.8.0 (trunk 248518) (llvm/trunk 248512)", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, subprograms: !3)
-!1 = !DIFile(filename: "1.cc", directory: "/tmp")
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.8.0 (trunk 254019) (llvm/trunk 254036)", isOptimized: false, runtimeVersion: 0, emissionKind: 1, enums: !2, retainedTypes: !3, subprograms: !11)
+!1 = !DIFile(filename: "../llvm/2.cc", directory: "/code/build-llvm")
 !2 = !{}
 !3 = !{!4}
-!4 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 4, type: !5, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: true, variables: !7)
-!5 = !DISubroutineType(types: !6)
-!6 = !{null}
-!7 = !{!8}
-!8 = !DILocalVariable(name: "buf", scope: !4, file: !1, line: 5, type: !9)
-!9 = !DICompositeType(tag: DW_TAG_array_type, baseType: !10, size: 3200, align: 32, elements: !11)
-!10 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "S", file: !1, line: 4, size: 800, align: 8, elements: !5, identifier: "_ZTS1S")
+!5 = !{!6}
+!6 = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !"_ZTS1S", file: !1, line: 5, baseType: !7, size: 800, align: 8)
+!7 = !DICompositeType(tag: DW_TAG_array_type, baseType: !8, size: 800, align: 8, elements: !9)
+!8 = !DIBasicType(name: "char", size: 8, align: 8, encoding: DW_ATE_signed_char)
+!9 = !{!10}
+!10 = !DISubrange(count: 100)
 !11 = !{!12}
-!12 = !DISubrange(count: 100)
-!13 = !{i32 2, !"Dwarf Version", i32 4}
-!14 = !{i32 2, !"Debug Info Version", i32 3}
-!15 = !{!"clang version 3.8.0 (trunk 248518) (llvm/trunk 248512)"}
-!16 = !DILocation(line: 5, column: 3, scope: !4)
-!17 = !DIExpression()
-!18 = !DILocation(line: 5, column: 7, scope: !4)
-!19 = !DILocation(line: 6, column: 11, scope: !4)
-!20 = !DILocation(line: 6, column: 3, scope: !4)
-!21 = !DILocation(line: 7, column: 1, scope: !4)
+!12 = distinct !DISubprogram(name: "f", linkageName: "_Z1f1S", scope: !1, file: !1, line: 10, type: !13, isLocal: false, isDefinition: true, scopeLine: 10, flags: DIFlagPrototyped, isOptimized: false, variables: !2)
+!13 = !DISubroutineType(types: !14)
+!14 = !{null, !"_ZTS1S"}
+!15 = !{i32 2, !"Dwarf Version", i32 4}
+!16 = !{i32 2, !"Debug Info Version", i32 3}
+!17 = !{!"clang version 3.8.0 (trunk 254019) (llvm/trunk 254036)"}
+!18 = !DILocalVariable(name: "zzz", arg: 1, scope: !12, file: !1, line: 10, type: !"_ZTS1S")
+!19 = !DIExpression()
+!20 = !DILocation(line: 10, column: 10, scope: !12)
+!21 = !DILocalVariable(name: "xxx", scope: !12, file: !1, line: 11, type: !"_ZTS1S")
+!22 = !DILocation(line: 11, column: 5, scope: !12)
+!23 = !DILocation(line: 12, column: 3, scope: !12)
+!24 = !DILocation(line: 13, column: 3, scope: !12)
+!25 = !DILocation(line: 14, column: 1, scope: !12)

From 629180472c0c66f4c44ef9bf06949e79331cd031 Mon Sep 17 00:00:00 2001
From: David Blaikie <dblaikie@gmail.com>
Date: Tue, 1 Dec 2015 00:48:34 +0000
Subject: [PATCH 035/186] llvm-dwp: Initial layout

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254354 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/LLVMBuild.txt           |  1 +
 tools/Makefile                |  2 +-
 tools/llvm-dwp/CMakeLists.txt | 13 +++++++++++++
 tools/llvm-dwp/LLVMBuild.txt  | 23 +++++++++++++++++++++++
 tools/llvm-dwp/llvm-dwp.cpp   |  2 ++
 5 files changed, 40 insertions(+), 1 deletion(-)
 create mode 100644 tools/llvm-dwp/CMakeLists.txt
 create mode 100644 tools/llvm-dwp/LLVMBuild.txt
 create mode 100644 tools/llvm-dwp/llvm-dwp.cpp

diff --git a/tools/LLVMBuild.txt b/tools/LLVMBuild.txt
index d6c08d25d944..d4b014771859 100644
--- a/tools/LLVMBuild.txt
+++ b/tools/LLVMBuild.txt
@@ -28,6 +28,7 @@ subdirectories =
  llvm-diff
  llvm-dis
  llvm-dwarfdump
+ llvm-dwp
  llvm-extract
  llvm-jitlistener
  llvm-link
diff --git a/tools/Makefile b/tools/Makefile
index a2ec8b065ebd..92d495451879 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -33,7 +33,7 @@ PARALLEL_DIRS := opt llvm-as llvm-dis llc llvm-ar llvm-nm llvm-link \
                  llvm-dwarfdump llvm-cov llvm-size llvm-stress llvm-mcmarkup \
                  llvm-profdata llvm-symbolizer obj2yaml yaml2obj llvm-c-test \
                  llvm-cxxdump verify-uselistorder dsymutil llvm-pdbdump \
-                 llvm-split sancov
+                 llvm-split sancov llvm-dwp
 
 # If Intel JIT Events support is configured, build an extra tool to test it.
 ifeq ($(USE_INTEL_JITEVENTS), 1)
diff --git a/tools/llvm-dwp/CMakeLists.txt b/tools/llvm-dwp/CMakeLists.txt
new file mode 100644
index 000000000000..b29c00d49c3d
--- /dev/null
+++ b/tools/llvm-dwp/CMakeLists.txt
@@ -0,0 +1,13 @@
+set(LLVM_LINK_COMPONENTS
+  ${LLVM_TARGETS_TO_BUILD}
+  AsmPrinter
+  DebugInfoDWARF
+  MC
+  Object
+  Support
+  Target
+  )
+
+add_llvm_tool(llvm-dwp
+  llvm-dwp.cpp
+  )
diff --git a/tools/llvm-dwp/LLVMBuild.txt b/tools/llvm-dwp/LLVMBuild.txt
new file mode 100644
index 000000000000..345a73757255
--- /dev/null
+++ b/tools/llvm-dwp/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===- ./tools/llvm-dwp/LLVMBuild.txt ---------------------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Tool
+name = llvm-dwp
+parent = Tools
+required_libraries = AsmPrinter DebugInfoDWARF MC Object Support all-targets
+
diff --git a/tools/llvm-dwp/llvm-dwp.cpp b/tools/llvm-dwp/llvm-dwp.cpp
new file mode 100644
index 000000000000..b2f997621bea
--- /dev/null
+++ b/tools/llvm-dwp/llvm-dwp.cpp
@@ -0,0 +1,2 @@
+int main() {
+}

From bb823837e04dc83f63c5a6c021aae05655b68dc9 Mon Sep 17 00:00:00 2001
From: David Blaikie <dblaikie@gmail.com>
Date: Tue, 1 Dec 2015 00:48:39 +0000
Subject: [PATCH 036/186] [llvm-dwp] Initial partial prototype

This just concatenates the common DWP sections without doing any of the
fancy DWP things like:

1) update str_offsets
2) deduplicating strings
3) merging/creating cu/tu_index

Patches for these will follow shortly.

(also not sure about target triple/object file type for this tool - do I
really need a whole triple just to write an object file that contains
purely static/hardcoded bytes in each section? & I guess I should just
pick it based on the first input, maybe, rather than hardcoding for now
- but we only produce .dwo on ELF platforms with objcopy for now anyway)

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254355 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ADT/StringMap.h            |   7 ++
 include/llvm/ADT/StringSet.h            |   5 +
 test/tools/llvm-dwp/Inputs/simple/a.cpp |   2 +
 test/tools/llvm-dwp/Inputs/simple/a.dwo | Bin 0 -> 1193 bytes
 test/tools/llvm-dwp/Inputs/simple/b.cpp |   3 +
 test/tools/llvm-dwp/Inputs/simple/b.dwo | Bin 0 -> 1241 bytes
 test/tools/llvm-dwp/X86/lit.local.cfg   |   4 +
 test/tools/llvm-dwp/X86/simple.test     |  58 ++++++++++
 tools/llvm-dwp/llvm-dwp.cpp             | 144 +++++++++++++++++++++++-
 9 files changed, 222 insertions(+), 1 deletion(-)
 create mode 100644 test/tools/llvm-dwp/Inputs/simple/a.cpp
 create mode 100644 test/tools/llvm-dwp/Inputs/simple/a.dwo
 create mode 100644 test/tools/llvm-dwp/Inputs/simple/b.cpp
 create mode 100644 test/tools/llvm-dwp/Inputs/simple/b.dwo
 create mode 100644 test/tools/llvm-dwp/X86/lit.local.cfg
 create mode 100644 test/tools/llvm-dwp/X86/simple.test

diff --git a/include/llvm/ADT/StringMap.h b/include/llvm/ADT/StringMap.h
index 194235fac570..700bb9e10ef7 100644
--- a/include/llvm/ADT/StringMap.h
+++ b/include/llvm/ADT/StringMap.h
@@ -232,6 +232,13 @@ class StringMap : public StringMapImpl {
     : StringMapImpl(InitialSize, static_cast<unsigned>(sizeof(MapEntryTy))),
       Allocator(A) {}
 
+  StringMap(std::initializer_list<std::pair<StringRef, ValueTy>> List)
+      : StringMapImpl(static_cast<unsigned>(sizeof(MapEntryTy))) {
+    for (const auto &P : List) {
+      insert(P);
+    }
+  }
+
   StringMap(StringMap &&RHS)
       : StringMapImpl(std::move(RHS)), Allocator(std::move(RHS.Allocator)) {}
 
diff --git a/include/llvm/ADT/StringSet.h b/include/llvm/ADT/StringSet.h
index 3e0cc200b6dd..08626dc7af84 100644
--- a/include/llvm/ADT/StringSet.h
+++ b/include/llvm/ADT/StringSet.h
@@ -23,6 +23,11 @@ namespace llvm {
   class StringSet : public llvm::StringMap<char, AllocatorTy> {
     typedef llvm::StringMap<char, AllocatorTy> base;
   public:
+    StringSet() = default;
+    StringSet(std::initializer_list<StringRef> S) {
+      for (StringRef X : S)
+        insert(X);
+    }
 
     std::pair<typename base::iterator, bool> insert(StringRef Key) {
       assert(!Key.empty());
diff --git a/test/tools/llvm-dwp/Inputs/simple/a.cpp b/test/tools/llvm-dwp/Inputs/simple/a.cpp
new file mode 100644
index 000000000000..f85d105a99f8
--- /dev/null
+++ b/test/tools/llvm-dwp/Inputs/simple/a.cpp
@@ -0,0 +1,2 @@
+struct foo { };
+foo a;
diff --git a/test/tools/llvm-dwp/Inputs/simple/a.dwo b/test/tools/llvm-dwp/Inputs/simple/a.dwo
new file mode 100644
index 0000000000000000000000000000000000000000..7bdb2a7b9f826e0dfcbafb94b64ef9285b42f4c6
GIT binary patch
literal 1193
zcmbtTT}s115T4ytTP=#EFN!Y#5n3N&wTf1;EsBC5;tgtAQ!S=RNq<C8l!_<t5`Fbu
z@D8593wQ!SoK1Gy?HcgGfthc<Z)SEUB)LC0KBR<zGy(+}GN}SQC}Js)OE3j7s5+K=
zVRUW3Rrg#YXKtBUBi;5p?v=5&k;`ZE86)jD-NwpLTV2b5WmcO_u%PC7!C{*ichQUJ
zOXy?h0C9o|IeEW(d-`CvlrEx1DE$J)2<}XhWT{t3jk8|iWiJLm*8$U`RB?Vgv6G;4
z04&9oM1pgiokpj)G@e4fX804Ct(!*Msvx%g5HDqRZdW=NWyh<U=hq&XA;qFb%U-S4
zvfC|HuDdl)<yNKQ+g(+0>aHDd+~HrgfYZvW8JtxbWTT1RyNqS7Wk=hfB>29j%Xx`;
ztjk<&eH3I?6(f32BQ!qp%mZ2`ghABrI>U;t@#a?^&{jhjM7`I?imvhIPce}cC_(S#
zbta1nev7=PXgt;DIj34rK*K)(`l2?8$Uoe;;P`O|#`UQC`(i%oJ~8?yuwUDTBP@Dj
W+#_lezMoh|yOF`TrpWUre(OK6DqH*j

literal 0
HcmV?d00001

diff --git a/test/tools/llvm-dwp/Inputs/simple/b.cpp b/test/tools/llvm-dwp/Inputs/simple/b.cpp
new file mode 100644
index 000000000000..fedcc160f41e
--- /dev/null
+++ b/test/tools/llvm-dwp/Inputs/simple/b.cpp
@@ -0,0 +1,3 @@
+struct bar { };
+void b(bar) {
+}
diff --git a/test/tools/llvm-dwp/Inputs/simple/b.dwo b/test/tools/llvm-dwp/Inputs/simple/b.dwo
new file mode 100644
index 0000000000000000000000000000000000000000..f41243dc722b011d346dccede7fd91268525d578
GIT binary patch
literal 1241
zcmbtU%`O8`6h8M(>#rfHT2yRiBf(;{8ll7(f`nLDk=RtHrZmQkOsiOl+FI}gEG>Bo
zPayFgmNw4my)(wmBo=OR?m6FgzVkC}r`PM-8=Nr+4+9f^P{;zj#7L`4TQC5<P;;x^
zk=}NKX2bXNk}+=-^odq*?w#l}v!!yOJgHB(Zu@llhbzuZf^F2!&cLn)P}wiqB?|0r
z*EzhG&{oj4(28g&G=M&a0o%KMyTAW@c;vV=)1U_vY*$M$p5hOejOV*jmSy7|Gndvn
z=Id%N0M7x2nao0FF$16(EPG=P_gHzaV<J!N@+gi;1J4uK%wd;7G>#95{n^(LJ-yMq
zI&D>L^sPYn<1jjO?DM0F>(`9K3m*&tF{!|cU#~ZvRx?gFyt*I9Rof1ncARk=p3}v+
z!@tiMc8GnIQ9mn+eg~>{3e&_De#^j;cw6Da`p}m#AI5R%odhFsGC{kOz){H)U3g}N
zi*9w+0cKQ=#P4+BSrab0Rc(M7l_T+}B(qVa2vjZQHAO>kN(wtlPOT@*6%E0HcuC+{
z=K9gokGD@^@nQfFpEUnN)+f!AE&g;`;wMA%k~StjX%ph*GRi~^(h4J>_#04C-$$xq
AOaK4?

literal 0
HcmV?d00001

diff --git a/test/tools/llvm-dwp/X86/lit.local.cfg b/test/tools/llvm-dwp/X86/lit.local.cfg
new file mode 100644
index 000000000000..05f8b38b3346
--- /dev/null
+++ b/test/tools/llvm-dwp/X86/lit.local.cfg
@@ -0,0 +1,4 @@
+if not 'X86' in config.root.targets:
+    config.unsupported = True
+
+config.suffixes = ['.test', '.cpp', '.m', '.s']
diff --git a/test/tools/llvm-dwp/X86/simple.test b/test/tools/llvm-dwp/X86/simple.test
new file mode 100644
index 000000000000..26215bdc8c91
--- /dev/null
+++ b/test/tools/llvm-dwp/X86/simple.test
@@ -0,0 +1,58 @@
+RUN: llvm-dwp %p/../Inputs/simple/a.dwo %p/../Inputs/simple/b.dwo -o %t
+RUN: llvm-dwarfdump %t | FileCheck %s
+
+FIXME: For some reason, piping straight from llvm-dwp to llvm-dwarfdump doesn't behave well - looks like dwarfdump is reading/closes before dwp has finished.
+
+DWP from non-type-unit debug info for these two translation units:
+a.cpp:
+  struct foo { };
+  foo a;
+
+b.cpp:
+  struct bar { };
+  void b(bar) {
+  }
+
+CHECK: .debug_abbrev.dwo contents:
+CHECK: Abbrev table for offset: 0x00000000
+CHECK: DW_TAG_compile_unit
+CHECK: DW_TAG_variable
+CHECK: DW_TAG_structure_type
+CHECK: Abbrev table for offset: 0x00000031
+CHECK: DW_TAG_compile_unit
+CHECK: DW_TAG_structure_type
+CHECK: DW_TAG_subprogram
+CHECK: DW_TAG_formal_parameter
+
+CHECK: .debug_info.dwo contents:
+CHECK: 0x00000000: Compile Unit: length = 0x00000025 version = 0x0004 abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x00000029)
+CHECK: DW_TAG_compile_unit
+CHECK:   DW_AT_name {{.*}} "a.cpp"
+CHECK:   DW_TAG_variable
+CHECK:     DW_AT_name {{.*}} "a"
+CHECK:   DW_TAG_structure_type
+CHECK:     DW_AT_name {{.*}} "foo"
+
+FIXME: Using cu_index, identify that abbr_offset is 0x0031, not 0x0000
+CHECK: 0x00000029: Compile Unit: length = 0x00000031 version = 0x0004 abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x0000005e)
+FIXME: Using cu_index, use strings based on the right str index offset
+CHECK:   DW_AT_name {{.*}} "a.cpp"
+FIXME: Using cu_index to find the right abbrevs at abbr_offset, this abbrevation should actually be structure_type
+CHECK:   DW_TAG_variable
+
+CHECK: .debug_cu_index contents:
+FIXME: Emit and verify the cu_index contents
+
+CHECK: .debug_str.dwo contents:
+CHECK: "clang version
+CHECK: 0x[[ACPP:.*]]: "a.cpp"
+FIXME: Remove duplicates
+CHECK: "clang version
+CHECK: 0x[[BCPP:.*]]: "b.cpp"
+
+CHECK: .debug_str_offsets.dwo contents:
+CHECK: : 00000000
+CHECK: : [[ACPP]]
+CHECK: : 00000000
+FIXME: Update str offset indexes, this should be BCPP \/
+CHECK: : [[ACPP]]
diff --git a/tools/llvm-dwp/llvm-dwp.cpp b/tools/llvm-dwp/llvm-dwp.cpp
index b2f997621bea..7f9f6678db0b 100644
--- a/tools/llvm-dwp/llvm-dwp.cpp
+++ b/tools/llvm-dwp/llvm-dwp.cpp
@@ -1,2 +1,144 @@
-int main() {
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Options.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/TargetSelect.h"
+#include <memory>
+#include <list>
+#include <unordered_set>
+
+using namespace llvm;
+using namespace cl;
+
+OptionCategory DwpCategory("Specific Options");
+static list<std::string> InputFiles(Positional, OneOrMore,
+                                    desc("<input files>"), cat(DwpCategory));
+
+static opt<std::string> OutputFilename(Required, "o", desc("Specify the output file."),
+                                      value_desc("filename"), cat(DwpCategory));
+
+static int error(const Twine &Error, const Twine &Context) {
+  errs() << Twine("while processing ") + Context + ":\n";
+  errs() << Twine("error: ") + Error + "\n";
+  return 1;
+}
+
+static std::error_code writeSection(MCStreamer &Out, MCSection *OutSection,
+                                    const object::SectionRef &Sym) {
+  StringRef Contents;
+  if (auto Err = Sym.getContents(Contents))
+    return Err;
+  Out.SwitchSection(OutSection);
+  Out.EmitBytes(Contents);
+  return std::error_code();
+}
+
+static std::error_code write(MCStreamer &Out, ArrayRef<std::string> Inputs) {
+  for (const auto &Input : Inputs) {
+    auto ErrOrObj = object::ObjectFile::createObjectFile(Input);
+    if (!ErrOrObj)
+      return ErrOrObj.getError();
+    const auto *Obj = ErrOrObj->getBinary();
+    for (const auto &Section : Obj->sections()) {
+      const auto &MCOFI = *Out.getContext().getObjectFileInfo();
+      static const StringMap<MCSection *> KnownSections = {
+          {"debug_info.dwo", MCOFI.getDwarfInfoDWOSection()},
+          {"debug_types.dwo", MCOFI.getDwarfTypesDWOSection()},
+          {"debug_str_offsets.dwo", MCOFI.getDwarfStrOffDWOSection()},
+          {"debug_str.dwo", MCOFI.getDwarfStrDWOSection()},
+          {"debug_loc.dwo", MCOFI.getDwarfLocDWOSection()},
+          {"debug_abbrev.dwo", MCOFI.getDwarfAbbrevDWOSection()}};
+      StringRef Name;
+      if (std::error_code Err = Section.getName(Name))
+        return Err;
+      if (MCSection *OutSection =
+              KnownSections.lookup(Name.substr(Name.find_first_not_of("._"))))
+        if (auto Err = writeSection(Out, OutSection, Section))
+          return Err;
+    }
+  }
+  return std::error_code();
+}
+
+int main(int argc, char** argv) {
+
+  ParseCommandLineOptions(argc, argv, "merge split dwarf (.dwo) files");
+
+  llvm::InitializeAllTargetInfos();
+  llvm::InitializeAllTargetMCs();
+  llvm::InitializeAllTargets();
+  llvm::InitializeAllAsmPrinters();
+
+  std::string ErrorStr;
+  StringRef Context = "dwarf streamer init";
+
+  Triple TheTriple("x86_64-linux-gnu");
+
+  // Get the target.
+  const Target *TheTarget =
+      TargetRegistry::lookupTarget("", TheTriple, ErrorStr);
+  if (!TheTarget)
+    return error(ErrorStr, Context);
+  std::string TripleName = TheTriple.getTriple();
+
+  // Create all the MC Objects.
+  std::unique_ptr<MCRegisterInfo> MRI(TheTarget->createMCRegInfo(TripleName));
+  if (!MRI)
+    return error(Twine("no register info for target ") + TripleName, Context);
+
+  std::unique_ptr<MCAsmInfo> MAI(TheTarget->createMCAsmInfo(*MRI, TripleName));
+  if (!MAI)
+    return error("no asm info for target " + TripleName, Context);
+
+  MCObjectFileInfo MOFI;
+  MCContext MC(MAI.get(), MRI.get(), &MOFI);
+  MOFI.InitMCObjectFileInfo(TheTriple, Reloc::Default, CodeModel::Default,
+                             MC);
+
+  auto MAB = TheTarget->createMCAsmBackend(*MRI, TripleName, "");
+  if (!MAB)
+    return error("no asm backend for target " + TripleName, Context);
+
+  std::unique_ptr<MCInstrInfo> MII(TheTarget->createMCInstrInfo());
+  if (!MII)
+    return error("no instr info info for target " + TripleName, Context);
+
+  std::unique_ptr<MCSubtargetInfo> MSTI(
+      TheTarget->createMCSubtargetInfo(TripleName, "", ""));
+  if (!MSTI)
+    return error("no subtarget info for target " + TripleName, Context);
+
+  MCCodeEmitter *MCE = TheTarget->createMCCodeEmitter(*MII, *MRI, MC);
+  if (!MCE)
+    return error("no code emitter for target " + TripleName, Context);
+
+  // Create the output file.
+  std::error_code EC;
+  raw_fd_ostream OutFile(OutputFilename, EC, sys::fs::F_None);
+  if (EC)
+    return error(Twine(OutputFilename) + ": " + EC.message(), Context);
+
+  std::unique_ptr<MCStreamer> MS(TheTarget->createMCObjectStreamer(
+      TheTriple, MC, *MAB, OutFile, MCE, *MSTI, false,
+      /*DWARFMustBeAtTheEnd*/ false));
+  if (!MS)
+    return error("no object streamer for target " + TripleName, Context);
+
+  if (auto Err = write(*MS, InputFiles))
+    return error(Err.message(), "Writing DWP file");
+
+  MS->Finish();
 }

From 08802fa03338ebbe7f759586ef039d53f0d53194 Mon Sep 17 00:00:00 2001
From: Cong Hou <congh@google.com>
Date: Tue, 1 Dec 2015 00:55:42 +0000
Subject: [PATCH 037/186] Fix a bug in MachineBlockPlacement that may cause
 assertion failure during BranchProbability construction.

The root cause is the rounding behavior in BranchProbability construction. We may consider to use truncation instead in the future.




git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254356 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/MachineBlockPlacement.cpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp
index ddddd483e801..fcddf346cf68 100644
--- a/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/lib/CodeGen/MachineBlockPlacement.cpp
@@ -423,9 +423,13 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
 
   DEBUG(dbgs() << "Attempting merge from: " << getBlockName(BB) << "\n");
   for (MachineBasicBlock *Succ : Successors) {
-    BranchProbability SuccProb(
-        MBPI->getEdgeProbability(BB, Succ).getNumerator(),
-        AdjustedSumProb.getNumerator());
+    BranchProbability SuccProb;
+    uint32_t SuccProbN = MBPI->getEdgeProbability(BB, Succ).getNumerator();
+    uint32_t SuccProbD = AdjustedSumProb.getNumerator();
+    if (SuccProbN >= SuccProbD)
+      SuccProb = BranchProbability::getOne();
+    else
+      SuccProb = BranchProbability(SuccProbN, SuccProbD);
 
     // If we outline optional branches, look whether Succ is unavoidable, i.e.
     // dominates all terminators of the MachineFunction. If it does, other

From a825191bcb5a553d4333464592353842d601988f Mon Sep 17 00:00:00 2001
From: David Blaikie <dblaikie@gmail.com>
Date: Tue, 1 Dec 2015 00:57:05 +0000
Subject: [PATCH 038/186] [llvm-dwp] Add missing dependency from llvm tests on
 the llvm-dwp tool

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254357 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 9bee504efece..138450ba8e02 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -39,6 +39,7 @@ set(LLVM_TEST_DEPENDS
           llvm-dis
           llvm-dsymutil
           llvm-dwarfdump
+          llvm-dwp
           llvm-extract
           llvm-lib
           llvm-link

From 316c5ff46f55ca4bff15829f504f72da48acfc1d Mon Sep 17 00:00:00 2001
From: David Blaikie <dblaikie@gmail.com>
Date: Tue, 1 Dec 2015 01:07:20 +0000
Subject: [PATCH 039/186] [llvm-dwp] Add missing Makefile for the old
 configure+make build

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254358 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-dwp/Makefile | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100644 tools/llvm-dwp/Makefile

diff --git a/tools/llvm-dwp/Makefile b/tools/llvm-dwp/Makefile
new file mode 100644
index 000000000000..826371ecf915
--- /dev/null
+++ b/tools/llvm-dwp/Makefile
@@ -0,0 +1,18 @@
+##===- tools/llvm-dwp/Makefile -----------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL := ../..
+TOOLNAME := llvm-dwp
+LINK_COMPONENTS := all-targets AsmPrinter DebugInfoDWARF MC Object Support
+
+# This tool has no plugins, optimize startup time.
+TOOL_NO_EXPORTS := 1
+
+include $(LEVEL)/Makefile.common
+

From be0d74465db88aa17438edbd8389180b19c47bf3 Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi <geek4civic@gmail.com>
Date: Tue, 1 Dec 2015 01:14:58 +0000
Subject: [PATCH 040/186] check-llvm: Introduce the new feature "tls".

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254360 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/lit.cfg | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test/lit.cfg b/test/lit.cfg
index 3f710debf23a..b77f892d4d92 100644
--- a/test/lit.cfg
+++ b/test/lit.cfg
@@ -374,6 +374,10 @@ if config.target_triple:
 if config.host_triple == config.target_triple:
     config.available_features.add("native")
 
+# Not set for targeting tls-incapable targets.
+if not re.match(r'.*-cygwin$', config.target_triple):
+    config.available_features.add('tls')
+
 import subprocess
 
 def have_ld_plugin_support():

From a1d72288c959c435c11fc74f5338aeb693709e01 Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi <geek4civic@gmail.com>
Date: Tue, 1 Dec 2015 01:15:03 +0000
Subject: [PATCH 041/186] llvm/test/DebugInfo/Generic/safestack-byval.ll is
 using tls.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254361 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/DebugInfo/Generic/safestack-byval.ll | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/DebugInfo/Generic/safestack-byval.ll b/test/DebugInfo/Generic/safestack-byval.ll
index 01ef064a78b6..24df3815068e 100644
--- a/test/DebugInfo/Generic/safestack-byval.ll
+++ b/test/DebugInfo/Generic/safestack-byval.ll
@@ -13,6 +13,8 @@
 ;   return zzz.a[len];
 ; }
 
+; REQUIRES: tls
+
 ; CHECK: ![[ZZZ:.*]] = !DILocalVariable(name: "zzz",
 ; CHECK: ![[ZZZ_EXPR:.*]] = !DIExpression(DW_OP_deref, DW_OP_minus, 400)
 ; CHECK: DBG_VALUE {{.*}} ![[ZZZ]], ![[ZZZ_EXPR]]

From be0d94b85bd29d2947d2cbeef741b2d736daf143 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 1 Dec 2015 02:14:33 +0000
Subject: [PATCH 042/186] Squelch unused variable warning in
 SIRegisterInfo.cpp.

Patch by Justin Lebar

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254362 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/SIRegisterInfo.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index d9799812d453..bf87f0225272 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -566,8 +566,9 @@ bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
 unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
                                            enum PreloadedValue Value) const {
 
-  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+  (void)ST;
   switch (Value) {
   case SIRegisterInfo::WORKGROUP_ID_X:
     assert(MFI->hasWorkGroupIDX());

From e1ef1867304e6858a290939d38193ce6e4cfe1ad Mon Sep 17 00:00:00 2001
From: Davide Italiano <davide@freebsd.org>
Date: Tue, 1 Dec 2015 02:35:04 +0000
Subject: [PATCH 043/186] [Windows] Simplify assertion code. NFC.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254363 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Support/Windows/DynamicLibrary.inc | 6 ++----
 lib/Support/Windows/Signals.inc        | 5 +----
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/lib/Support/Windows/DynamicLibrary.inc b/lib/Support/Windows/DynamicLibrary.inc
index 17418b015c75..77146d47cf29 100644
--- a/lib/Support/Windows/DynamicLibrary.inc
+++ b/lib/Support/Windows/DynamicLibrary.inc
@@ -61,10 +61,8 @@ DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
       OpenedHandles = new DenseSet<HMODULE>();
 
     if (!fEnumerateLoadedModules) {
-      if (!loadDebugHelp()) {
-        assert(false && "These APIs should always be available");
-        return DynamicLibrary();
-      }
+      assert(loadDebugHelp() && "These APIs should always be available");
+      return DynamicLibrary();
     }
 
     fEnumerateLoadedModules(GetCurrentProcess(), ELM_Callback, 0);
diff --git a/lib/Support/Windows/Signals.inc b/lib/Support/Windows/Signals.inc
index f40ca72996a1..d109a66d7035 100644
--- a/lib/Support/Windows/Signals.inc
+++ b/lib/Support/Windows/Signals.inc
@@ -405,10 +405,7 @@ static void RegisterHandler() {
   // If we cannot load up the APIs (which would be unexpected as they should
   // exist on every version of Windows we support), we will bail out since
   // there would be nothing to report.
-  if (!load64BitDebugHelp()) {
-    assert(false && "These APIs should always be available");
-    return;
-  }
+  assert(load64BitDebugHelp() && "These APIs should always be available");
 
   if (RegisteredUnhandledExceptionFilter) {
     EnterCriticalSection(&CriticalSection);

From 5d2885f554db8eb89a3e2b376c4d6507d456eed2 Mon Sep 17 00:00:00 2001
From: Davide Italiano <davide@freebsd.org>
Date: Tue, 1 Dec 2015 02:38:42 +0000
Subject: [PATCH 044/186] [Windows] Follow-up r254363, remove return.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254364 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Support/Windows/DynamicLibrary.inc | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/lib/Support/Windows/DynamicLibrary.inc b/lib/Support/Windows/DynamicLibrary.inc
index 77146d47cf29..e612283e630f 100644
--- a/lib/Support/Windows/DynamicLibrary.inc
+++ b/lib/Support/Windows/DynamicLibrary.inc
@@ -60,10 +60,8 @@ DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
     if (OpenedHandles == 0)
       OpenedHandles = new DenseSet<HMODULE>();
 
-    if (!fEnumerateLoadedModules) {
+    if (!fEnumerateLoadedModules)
       assert(loadDebugHelp() && "These APIs should always be available");
-      return DynamicLibrary();
-    }
 
     fEnumerateLoadedModules(GetCurrentProcess(), ELM_Callback, 0);
     // Dummy library that represents "search all handles".

From 8e83fe2e97c232086674f9f04b00043bccb89629 Mon Sep 17 00:00:00 2001
From: Hans Wennborg <hans@hanshq.net>
Date: Tue, 1 Dec 2015 03:49:42 +0000
Subject: [PATCH 045/186] Revert r254348: "Replace all weight-based interfaces
 in MBB with probability-based interfaces, and update all uses of old
 interfaces."

and the follow-up r254356: "Fix a bug in MachineBlockPlacement that may cause assertion failure during BranchProbability construction."

Asserts were firing in Chromium builds. See PR25687.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254366 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/BranchProbabilityInfo.h |   3 -
 include/llvm/CodeGen/MachineBasicBlock.h      |  46 +++++-
 .../CodeGen/MachineBranchProbabilityInfo.h    |  22 ++-
 include/llvm/Support/BranchProbability.h      |  31 +---
 lib/Analysis/BranchProbabilityInfo.cpp        |   6 -
 lib/CodeGen/BranchFolding.cpp                 |   9 +-
 lib/CodeGen/IfConversion.cpp                  | 155 ++++++++++--------
 lib/CodeGen/MIRParser/MIParser.cpp            |   3 +-
 lib/CodeGen/MIRPrinter.cpp                    |   4 +-
 lib/CodeGen/MachineBasicBlock.cpp             | 142 ++++++++++++----
 lib/CodeGen/MachineBlockPlacement.cpp         |  71 ++++----
 lib/CodeGen/MachineBranchProbabilityInfo.cpp  |  82 ++++++---
 lib/CodeGen/TailDuplication.cpp               |   6 +-
 lib/Support/BranchProbability.cpp             |  20 +--
 lib/Target/AMDGPU/AMDILCFGStructurizer.cpp    |   6 +-
 lib/Target/ARM/ARMConstantIslandPass.cpp      |   3 +-
 lib/Target/ARM/ARMISelLowering.cpp            |   2 +-
 lib/Target/Hexagon/HexagonCFGOptimizer.cpp    |   6 +-
 lib/Target/Mips/MipsLongBranch.cpp            |   3 +-
 test/CodeGen/ARM/ifcvt-branch-weight-bug.ll   |  14 +-
 test/CodeGen/ARM/ifcvt-branch-weight.ll       |   2 +-
 test/CodeGen/ARM/ifcvt-iter-indbr.ll          |  10 +-
 test/CodeGen/ARM/tail-merge-branch-weight.ll  |   2 +-
 test/CodeGen/ARM/taildup-branch-weight.ll     |   4 +-
 test/CodeGen/Generic/MachineBranchProb.ll     |   8 +-
 test/CodeGen/Hexagon/ifcvt-edge-weight.ll     |   2 +-
 test/CodeGen/MIR/X86/newline-handling.mir     |   4 +-
 .../X86/successor-basic-blocks-weights.mir    |   6 +-
 .../MIR/X86/successor-basic-blocks.mir        |   4 +-
 test/CodeGen/X86/MachineBranchProb.ll         |   4 +-
 test/CodeGen/X86/catchpad-weight.ll           |   2 +-
 test/CodeGen/X86/stack-protector-weight.ll    |   4 +-
 test/CodeGen/X86/switch-edge-weight.ll        |  22 +--
 test/CodeGen/X86/switch-jump-table.ll         |   8 +-
 34 files changed, 420 insertions(+), 296 deletions(-)

diff --git a/include/llvm/Analysis/BranchProbabilityInfo.h b/include/llvm/Analysis/BranchProbabilityInfo.h
index 69dae5e90785..89dec14b2b3e 100644
--- a/include/llvm/Analysis/BranchProbabilityInfo.h
+++ b/include/llvm/Analysis/BranchProbabilityInfo.h
@@ -61,9 +61,6 @@ class BranchProbabilityInfo {
   BranchProbability getEdgeProbability(const BasicBlock *Src,
                                        const BasicBlock *Dst) const;
 
-  BranchProbability getEdgeProbability(const BasicBlock *Src,
-                                       succ_const_iterator Dst) const;
-
   /// \brief Test if an edge is hot relative to other out-edges of the Src.
   ///
   /// Check whether this edge out of the source block is 'hot'. We define hot
diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h
index ac87f4f901f5..a2b1a850ec76 100644
--- a/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/include/llvm/CodeGen/MachineBasicBlock.h
@@ -91,6 +91,13 @@ class MachineBasicBlock
   std::vector<MachineBasicBlock *> Predecessors;
   std::vector<MachineBasicBlock *> Successors;
 
+  /// Keep track of the weights to the successors. This vector has the same
+  /// order as Successors, or it is empty if we don't use it (disable
+  /// optimization).
+  std::vector<uint32_t> Weights;
+  typedef std::vector<uint32_t>::iterator weight_iterator;
+  typedef std::vector<uint32_t>::const_iterator const_weight_iterator;
+
   /// Keep track of the probabilities to the successors. This vector has the
   /// same order as Successors, or it is empty if we don't use it (disable
   /// optimization).
@@ -433,16 +440,26 @@ class MachineBasicBlock
 
   // Machine-CFG mutators
 
+  /// Add Succ as a successor of this MachineBasicBlock.  The Predecessors list
+  /// of Succ is automatically updated. WEIGHT parameter is stored in Weights
+  /// list and it may be used by MachineBranchProbabilityInfo analysis to
+  /// calculate branch probability.
+  ///
+  /// Note that duplicate Machine CFG edges are not allowed.
+  void addSuccessor(MachineBasicBlock *Succ, uint32_t Weight = 0);
+
+  /// Add Succ as a successor of this MachineBasicBlock.  The Predecessors list
+  /// of Succ is automatically updated. The weight is not provided because BPI
+  /// is not available (e.g. -O0 is used), in which case edge weights won't be
+  /// used. Using this interface can save some space.
+  void addSuccessorWithoutWeight(MachineBasicBlock *Succ);
+
   /// Add Succ as a successor of this MachineBasicBlock.  The Predecessors list
   /// of Succ is automatically updated. PROB parameter is stored in
-  /// Probabilities list. The default probability is set as unknown. Mixing
-  /// known and unknown probabilities in successor list is not allowed. When all
-  /// successors have unknown probabilities, 1 / N is returned as the
-  /// probability for each successor, where N is the number of successors.
+  /// Probabilities list.
   ///
   /// Note that duplicate Machine CFG edges are not allowed.
-  void addSuccessor(MachineBasicBlock *Succ,
-                    BranchProbability Prob = BranchProbability::getUnknown());
+  void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob);
 
   /// Add Succ as a successor of this MachineBasicBlock.  The Predecessors list
   /// of Succ is automatically updated. The probability is not provided because
@@ -450,6 +467,9 @@ class MachineBasicBlock
   /// won't be used. Using this interface can save some space.
   void addSuccessorWithoutProb(MachineBasicBlock *Succ);
 
+  /// Set successor weight of a given iterator.
+  void setSuccWeight(succ_iterator I, uint32_t Weight);
+
   /// Set successor probability of a given iterator.
   void setSuccProbability(succ_iterator I, BranchProbability Prob);
 
@@ -468,7 +488,7 @@ class MachineBasicBlock
   /// Return the iterator to the element after the one removed.
   succ_iterator removeSuccessor(succ_iterator I);
 
-  /// Replace successor OLD with NEW and update probability info.
+  /// Replace successor OLD with NEW and update weight info.
   void replaceSuccessor(MachineBasicBlock *Old, MachineBasicBlock *New);
 
   /// Transfers all the successors from MBB to this machine basic block (i.e.,
@@ -480,6 +500,9 @@ class MachineBasicBlock
   /// operands in the successor blocks which refer to FromMBB to refer to this.
   void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB);
 
+  /// Return true if any of the successors have weights attached to them.
+  bool hasSuccessorWeights() const { return !Weights.empty(); }
+
   /// Return true if any of the successors have probabilities attached to them.
   bool hasSuccessorProbabilities() const { return !Probs.empty(); }
 
@@ -736,6 +759,10 @@ class MachineBasicBlock
 
 
 private:
+  /// Return weight iterator corresponding to the I successor iterator.
+  weight_iterator getWeightIterator(succ_iterator I);
+  const_weight_iterator getWeightIterator(const_succ_iterator I) const;
+
   /// Return probability iterator corresponding to the I successor iterator.
   probability_iterator getProbabilityIterator(succ_iterator I);
   const_probability_iterator
@@ -744,6 +771,11 @@ class MachineBasicBlock
   friend class MachineBranchProbabilityInfo;
   friend class MIPrinter;
 
+  /// Return weight of the edge from this block to MBB. This method should NOT
+  /// be called directly, but by using getEdgeWeight method from
+  /// MachineBranchProbabilityInfo class.
+  uint32_t getSuccWeight(const_succ_iterator Succ) const;
+
   /// Return probability of the edge from this block to MBB. This method should
   /// NOT be called directly, but by using getEdgeProbability method from
   /// MachineBranchProbabilityInfo class.
diff --git a/include/llvm/CodeGen/MachineBranchProbabilityInfo.h b/include/llvm/CodeGen/MachineBranchProbabilityInfo.h
index 608e8d257874..058ab32f3aa9 100644
--- a/include/llvm/CodeGen/MachineBranchProbabilityInfo.h
+++ b/include/llvm/CodeGen/MachineBranchProbabilityInfo.h
@@ -55,15 +55,10 @@ class MachineBranchProbabilityInfo : public ImmutablePass {
   uint32_t getEdgeWeight(const MachineBasicBlock *Src,
                          MachineBasicBlock::const_succ_iterator Dst) const;
 
-  // Return edge probability.
-  BranchProbability getEdgeProbability(const MachineBasicBlock *Src,
-                                       const MachineBasicBlock *Dst) const;
-
-  // Same as above, but using a const_succ_iterator from Src. This is faster
-  // when the iterator is already available.
-  BranchProbability
-  getEdgeProbability(const MachineBasicBlock *Src,
-                     MachineBasicBlock::const_succ_iterator Dst) const;
+  // Get sum of the block successors' weights, potentially scaling them to fit
+  // within 32-bits. If scaling is required, sets Scale based on the necessary
+  // adjustment. Any edge weights used with the sum should be divided by Scale.
+  uint32_t getSumForBlock(const MachineBasicBlock *MBB, uint32_t &Scale) const;
 
   // A 'Hot' edge is an edge which probability is >= 80%.
   bool isEdgeHot(const MachineBasicBlock *Src,
@@ -73,6 +68,15 @@ class MachineBranchProbabilityInfo : public ImmutablePass {
   // NB: This routine's complexity is linear on the number of successors.
   MachineBasicBlock *getHotSucc(MachineBasicBlock *MBB) const;
 
+  // Return a probability as a fraction between 0 (0% probability) and
+  // 1 (100% probability), however the value is never equal to 0, and can be 1
+  // only iff SRC block has only one successor.
+  // NB: This routine's complexity is linear on the number of successors of
+  // Src. Querying sequentially for each successor's probability is a quadratic
+  // query pattern.
+  BranchProbability getEdgeProbability(const MachineBasicBlock *Src,
+                                       const MachineBasicBlock *Dst) const;
+
   // Print value between 0 (0% probability) and 1 (100% probability),
   // however the value is never equal to 0, and can be 1 only iff SRC block
   // has only one successor.
diff --git a/include/llvm/Support/BranchProbability.h b/include/llvm/Support/BranchProbability.h
index 2548384f346b..3620d4d5d772 100644
--- a/include/llvm/Support/BranchProbability.h
+++ b/include/llvm/Support/BranchProbability.h
@@ -53,9 +53,6 @@ class BranchProbability {
   // Create a BranchProbability object with the given numerator and 1<<31
   // as denominator.
   static BranchProbability getRaw(uint32_t N) { return BranchProbability(N); }
-  // Create a BranchProbability object from 64-bit integers.
-  static BranchProbability getBranchProbability(uint64_t Numerator,
-                                                uint64_t Denominator);
 
   // Normalize given probabilties so that the sum of them becomes approximate
   // one.
@@ -134,30 +131,10 @@ class BranchProbability {
 
   bool operator==(BranchProbability RHS) const { return N == RHS.N; }
   bool operator!=(BranchProbability RHS) const { return !(*this == RHS); }
-
-  bool operator<(BranchProbability RHS) const {
-    assert(N != UnknownN && RHS.N != UnknownN &&
-           "Unknown probability cannot participate in comparisons.");
-    return N < RHS.N;
-  }
-
-  bool operator>(BranchProbability RHS) const {
-    assert(N != UnknownN && RHS.N != UnknownN &&
-           "Unknown probability cannot participate in comparisons.");
-    return RHS < *this;
-  }
-
-  bool operator<=(BranchProbability RHS) const {
-    assert(N != UnknownN && RHS.N != UnknownN &&
-           "Unknown probability cannot participate in comparisons.");
-    return !(RHS < *this);
-  }
-
-  bool operator>=(BranchProbability RHS) const {
-    assert(N != UnknownN && RHS.N != UnknownN &&
-           "Unknown probability cannot participate in comparisons.");
-    return !(*this < RHS);
-  }
+  bool operator<(BranchProbability RHS) const { return N < RHS.N; }
+  bool operator>(BranchProbability RHS) const { return RHS < *this; }
+  bool operator<=(BranchProbability RHS) const { return !(RHS < *this); }
+  bool operator>=(BranchProbability RHS) const { return !(*this < RHS); }
 };
 
 inline raw_ostream &operator<<(raw_ostream &OS, BranchProbability Prob) {
diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp
index 6cdf43a06a9f..f48394698699 100644
--- a/lib/Analysis/BranchProbabilityInfo.cpp
+++ b/lib/Analysis/BranchProbabilityInfo.cpp
@@ -647,12 +647,6 @@ getEdgeProbability(const BasicBlock *Src, const BasicBlock *Dst) const {
   return BranchProbability(N, D);
 }
 
-BranchProbability
-BranchProbabilityInfo::getEdgeProbability(const BasicBlock *Src,
-                                          succ_const_iterator Dst) const {
-  return getEdgeProbability(Src, Dst.getSuccessorIndex());
-}
-
 raw_ostream &
 BranchProbabilityInfo::printEdgeProbability(raw_ostream &OS,
                                             const BasicBlock *Src,
diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp
index ba21d9cc9d55..0b2495cc996e 100644
--- a/lib/CodeGen/BranchFolding.cpp
+++ b/lib/CodeGen/BranchFolding.cpp
@@ -1099,16 +1099,13 @@ void BranchFolder::setCommonTailEdgeWeights(MachineBasicBlock &TailMBB) {
   if (TailMBB.succ_size() <= 1)
     return;
 
-  auto SumEdgeFreq =
-      std::accumulate(EdgeFreqLs.begin(), EdgeFreqLs.end(), BlockFrequency(0))
-          .getFrequency();
+  auto MaxEdgeFreq = *std::max_element(EdgeFreqLs.begin(), EdgeFreqLs.end());
+  uint64_t Scale = MaxEdgeFreq.getFrequency() / UINT32_MAX + 1;
   auto EdgeFreq = EdgeFreqLs.begin();
 
   for (auto SuccI = TailMBB.succ_begin(), SuccE = TailMBB.succ_end();
        SuccI != SuccE; ++SuccI, ++EdgeFreq)
-    TailMBB.setSuccProbability(
-        SuccI, BranchProbability::getBranchProbability(EdgeFreq->getFrequency(),
-                                                       SumEdgeFreq));
+    TailMBB.setSuccWeight(SuccI, EdgeFreq->getFrequency() / Scale);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp
index ff28f95cc33d..0b2f3ea165f8 100644
--- a/lib/CodeGen/IfConversion.cpp
+++ b/lib/CodeGen/IfConversion.cpp
@@ -32,7 +32,6 @@
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
-#include <algorithm>
 
 using namespace llvm;
 
@@ -1152,6 +1151,28 @@ bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) {
   return true;
 }
 
+/// Scale down weights to fit into uint32_t. NewTrue is the new weight
+/// for successor TrueBB, and NewFalse is the new weight for successor
+/// FalseBB.
+static void ScaleWeights(uint64_t NewTrue, uint64_t NewFalse,
+                         MachineBasicBlock *MBB,
+                         const MachineBasicBlock *TrueBB,
+                         const MachineBasicBlock *FalseBB,
+                         const MachineBranchProbabilityInfo *MBPI) {
+  uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse;
+  uint32_t Scale = (NewMax / UINT32_MAX) + 1;
+  for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+                                        SE = MBB->succ_end();
+       SI != SE; ++SI) {
+    if (*SI == TrueBB)
+      MBB->setSuccWeight(SI, (uint32_t)(NewTrue / Scale));
+    else if (*SI == FalseBB)
+      MBB->setSuccWeight(SI, (uint32_t)(NewFalse / Scale));
+    else
+      MBB->setSuccWeight(SI, MBPI->getEdgeWeight(MBB, SI) / Scale);
+  }
+}
+
 /// IfConvertTriangle - If convert a triangle sub-CFG.
 ///
 bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
@@ -1208,14 +1229,16 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
   DontKill.clear();
 
   bool HasEarlyExit = CvtBBI->FalseBB != nullptr;
-  BranchProbability CvtNext, CvtFalse, BBNext, BBCvt;
+  uint64_t CvtNext = 0, CvtFalse = 0, BBNext = 0, BBCvt = 0, SumWeight = 0;
+  uint32_t WeightScale = 0;
 
   if (HasEarlyExit) {
-    // Get probabilities before modifying CvtBBI->BB and BBI.BB.
-    CvtNext = MBPI->getEdgeProbability(CvtBBI->BB, NextBBI->BB);
-    CvtFalse = MBPI->getEdgeProbability(CvtBBI->BB, CvtBBI->FalseBB);
-    BBNext = MBPI->getEdgeProbability(BBI.BB, NextBBI->BB);
-    BBCvt = MBPI->getEdgeProbability(BBI.BB, CvtBBI->BB);
+    // Get weights before modifying CvtBBI->BB and BBI.BB.
+    CvtNext = MBPI->getEdgeWeight(CvtBBI->BB, NextBBI->BB);
+    CvtFalse = MBPI->getEdgeWeight(CvtBBI->BB, CvtBBI->FalseBB);
+    BBNext = MBPI->getEdgeWeight(BBI.BB, NextBBI->BB);
+    BBCvt = MBPI->getEdgeWeight(BBI.BB, CvtBBI->BB);
+    SumWeight = MBPI->getSumForBlock(CvtBBI->BB, WeightScale);
   }
 
   if (CvtBBI->BB->pred_size() > 1) {
@@ -1243,24 +1266,22 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
                                            CvtBBI->BrCond.end());
     if (TII->ReverseBranchCondition(RevCond))
       llvm_unreachable("Unable to reverse branch condition!");
-
-    // Update the edge probability for both CvtBBI->FalseBB and NextBBI.
-    // NewNext = New_Prob(BBI.BB, NextBBI->BB) =
-    //   Prob(BBI.BB, NextBBI->BB) +
-    //   Prob(BBI.BB, CvtBBI->BB) * Prob(CvtBBI->BB, NextBBI->BB)
-    // NewFalse = New_Prob(BBI.BB, CvtBBI->FalseBB) =
-    //   Prob(BBI.BB, CvtBBI->BB) * Prob(CvtBBI->BB, CvtBBI->FalseBB)
-    auto NewTrueBB = getNextBlock(BBI.BB);
-    auto NewNext = BBNext + BBCvt * CvtNext;
-    auto NewTrueBBIter =
-        std::find(BBI.BB->succ_begin(), BBI.BB->succ_end(), NewTrueBB);
-    assert(NewTrueBBIter != BBI.BB->succ_end() &&
-           "NewTrueBB is not a successor of BBI.BB.");
-    BBI.BB->setSuccProbability(NewTrueBBIter, NewNext);
-
-    auto NewFalse = BBCvt * CvtFalse;
     TII->InsertBranch(*BBI.BB, CvtBBI->FalseBB, nullptr, RevCond, dl);
-    BBI.BB->addSuccessor(CvtBBI->FalseBB, NewFalse);
+    BBI.BB->addSuccessor(CvtBBI->FalseBB);
+    // Update the edge weight for both CvtBBI->FalseBB and NextBBI.
+    // New_Weight(BBI.BB, NextBBI->BB) =
+    //   Weight(BBI.BB, NextBBI->BB) * getSumForBlock(CvtBBI->BB) +
+    //   Weight(BBI.BB, CvtBBI->BB) * Weight(CvtBBI->BB, NextBBI->BB)
+    // New_Weight(BBI.BB, CvtBBI->FalseBB) =
+    //   Weight(BBI.BB, CvtBBI->BB) * Weight(CvtBBI->BB, CvtBBI->FalseBB)
+
+    uint64_t NewNext = BBNext * SumWeight + (BBCvt * CvtNext) / WeightScale;
+    uint64_t NewFalse = (BBCvt * CvtFalse) / WeightScale;
+    // We need to scale down all weights of BBI.BB to fit uint32_t.
+    // Here BBI.BB is connected to CvtBBI->FalseBB and will fall through to
+    // the next block.
+    ScaleWeights(NewNext, NewFalse, BBI.BB, getNextBlock(BBI.BB),
+                 CvtBBI->FalseBB, MBPI);
   }
 
   // Merge in the 'false' block if the 'false' block has no other
@@ -1503,7 +1524,7 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
       MergeBlocks(BBI, TailBBI);
       TailBBI.IsDone = true;
     } else {
-      BBI.BB->addSuccessor(TailBB, BranchProbability::getOne());
+      BBI.BB->addSuccessor(TailBB);
       InsertUncondBranch(BBI.BB, TailBB, TII);
       BBI.HasFallThrough = false;
     }
@@ -1667,26 +1688,21 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) {
                                                 FromBBI.BB->succ_end());
   MachineBasicBlock *NBB = getNextBlock(FromBBI.BB);
   MachineBasicBlock *FallThrough = FromBBI.HasFallThrough ? NBB : nullptr;
-  // The edge probability from ToBBI.BB to FromBBI.BB, which is only needed when
-  // AddEdges is true and FromBBI.BB is a successor of ToBBI.BB.
-  auto To2FromProb = BranchProbability::getZero();
-  if (AddEdges && ToBBI.BB->isSuccessor(FromBBI.BB)) {
-    To2FromProb = MBPI->getEdgeProbability(ToBBI.BB, FromBBI.BB);
-    // Set the edge probability from ToBBI.BB to FromBBI.BB to zero to avoid the
-    // edge probability being merged to other edges when this edge is removed
-    // later.
-    ToBBI.BB->setSuccProbability(
-        std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), FromBBI.BB),
-        BranchProbability::getZero());
-  }
 
+  // The edge weight from ToBBI.BB to FromBBI.BB, which is only needed when
+  // AddEdges is true and FromBBI.BB is a successor of ToBBI.BB.
+  uint32_t To2FromWeight = 0;
+  // WeightScale and SumWeight are for calculating successor probabilities of
+  // FromBBI.BB.
+  uint32_t WeightScale = 0;
+  uint32_t SumWeight = 0;
   if (AddEdges && ToBBI.BB->isSuccessor(FromBBI.BB)) {
-    // Set the edge probability from ToBBI.BB to FromBBI.BB to zero to avoid the
-    // edge probability being merged to other edges when this edge is removed
-    // later.
-    ToBBI.BB->setSuccProbability(
-        std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), FromBBI.BB),
-        BranchProbability::getZero());
+    To2FromWeight = MBPI->getEdgeWeight(ToBBI.BB, FromBBI.BB);
+    // Set the edge weight from ToBBI.BB to FromBBI.BB to zero to avoid the edge
+    // weight being merged to other edges when this edge is removed later.
+    ToBBI.BB->setSuccWeight(
+        std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), FromBBI.BB), 0);
+    SumWeight = MBPI->getSumForBlock(FromBBI.BB, WeightScale);
   }
 
   for (unsigned i = 0, e = FromSuccs.size(); i != e; ++i) {
@@ -1695,38 +1711,39 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) {
     if (Succ == FallThrough)
       continue;
 
-    auto NewProb = BranchProbability::getZero();
+    uint32_t NewWeight = 0;
     if (AddEdges) {
-      // Calculate the edge probability for the edge from ToBBI.BB to Succ,
-      // which is a portion of the edge probability from FromBBI.BB to Succ. The
-      // portion ratio is the edge probability from ToBBI.BB to FromBBI.BB (if
-      // FromBBI is a successor of ToBBI.BB. See comment below for excepion).
-      NewProb = MBPI->getEdgeProbability(FromBBI.BB, Succ);
+      // Calculate the edge weight for the edge from ToBBI.BB to Succ, which is
+      // a portion of the edge weight from FromBBI.BB to Succ. The portion ratio
+      // is the edge probability from ToBBI.BB to FromBBI.BB (if FromBBI is a
+      // successor of ToBBI.BB. See comment below for excepion).
+      NewWeight = MBPI->getEdgeWeight(FromBBI.BB, Succ);
 
-      // To2FromProb is 0 when FromBBI.BB is not a successor of ToBBI.BB. This
+      // To2FromWeight is 0 when FromBBI.BB is not a successor of ToBBI.BB. This
       // only happens when if-converting a diamond CFG and FromBBI.BB is the
       // tail BB.  In this case FromBBI.BB post-dominates ToBBI.BB and hence we
-      // could just use the probabilities on FromBBI.BB's out-edges when adding
-      // new successors.
-      if (!To2FromProb.isZero())
-        NewProb *= To2FromProb;
+      // could just use the weights on FromBBI.BB's out-edges when adding new
+      // successors.
+      if (To2FromWeight > 0) {
+        BranchProbability Prob(NewWeight / WeightScale, SumWeight);
+        NewWeight = Prob.scale(To2FromWeight);
+      }
     }
 
     FromBBI.BB->removeSuccessor(Succ);
 
     if (AddEdges) {
-      // If the edge from ToBBI.BB to Succ already exists, update the
-      // probability of this edge by adding NewWeight to it. An example is shown
-      // below, in which A is ToBBI.BB and B is FromBBI.BB. In this case we
-      // don't have to set C as A's successor as it already is. We only need to
-      // update the edge probability on A->C. Note that B will not be
-      // immediately removed from A's successors. It is possible that B->D is
-      // not removed either if D is a fallthrough of B. Later the edge A->D
-      // (generated here) and B->D will be combined into one edge. To maintain
-      // correct edge probability of this combined edge, we need to set the edge
-      // probability of A->B to zero, which is already done above. The edge
-      // probability on A->D is calculated by scaling the original probability
-      // on A->B by the probability of B->D.
+      // If the edge from ToBBI.BB to Succ already exists, update the weight of
+      // this edge by adding NewWeight to it. An example is shown below, in
+      // which A is ToBBI.BB and B is FromBBI.BB. In this case we don't have to
+      // set C as A's successor as it already is. We only need to update the
+      // edge weight on A->C. Note that B will not be immediately removed from
+      // A's successors. It is possible that B->D is not removed either if D is
+      // a fallthrough of B. Later the edge A->D (generated here) and B->D will
+      // be combined into one edge. To maintain correct edge weight of this
+      // combined edge, we need to set the edge weight of A->B to zero, which is
+      // already done above. The edge weight on A->D is calculated by scaling
+      // the original weight on A->B by the probability of B->D.
       //
       // Before ifcvt:      After ifcvt (assume B->D is kept):
       //
@@ -1738,11 +1755,11 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) {
       //    C  D             C  D
       //
       if (ToBBI.BB->isSuccessor(Succ))
-        ToBBI.BB->setSuccProbability(
+        ToBBI.BB->setSuccWeight(
             std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), Succ),
-            MBPI->getEdgeProbability(ToBBI.BB, Succ) + NewProb);
+            MBPI->getEdgeWeight(ToBBI.BB, Succ) + NewWeight);
       else
-        ToBBI.BB->addSuccessor(Succ, NewProb);
+        ToBBI.BB->addSuccessor(Succ, NewWeight);
     }
   }
 
diff --git a/lib/CodeGen/MIRParser/MIParser.cpp b/lib/CodeGen/MIRParser/MIParser.cpp
index c9c2d62cec30..5a8e96df7603 100644
--- a/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/lib/CodeGen/MIRParser/MIParser.cpp
@@ -459,9 +459,8 @@ bool MIParser::parseBasicBlockSuccessors(MachineBasicBlock &MBB) {
       if (expectAndConsume(MIToken::rparen))
         return true;
     }
-    MBB.addSuccessor(SuccMBB, BranchProbability::getRaw(Weight));
+    MBB.addSuccessor(SuccMBB, Weight);
   } while (consumeIfPresent(MIToken::comma));
-  MBB.normalizeSuccProbs();
   return false;
 }
 
diff --git a/lib/CodeGen/MIRPrinter.cpp b/lib/CodeGen/MIRPrinter.cpp
index 175cb0d51437..0be7807064fb 100644
--- a/lib/CodeGen/MIRPrinter.cpp
+++ b/lib/CodeGen/MIRPrinter.cpp
@@ -461,8 +461,8 @@ void MIPrinter::print(const MachineBasicBlock &MBB) {
       if (I != MBB.succ_begin())
         OS << ", ";
       printMBBReference(**I);
-      if (MBB.hasSuccessorProbabilities())
-        OS << '(' << MBB.getSuccProbability(I) << ')';
+      if (MBB.hasSuccessorWeights())
+        OS << '(' << MBB.getSuccWeight(I) << ')';
     }
     OS << "\n";
     HasLineAttributes = true;
diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp
index c9c6a9d62462..602b75182fca 100644
--- a/lib/CodeGen/MachineBasicBlock.cpp
+++ b/lib/CodeGen/MachineBasicBlock.cpp
@@ -319,8 +319,8 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST,
     OS << "    Successors according to CFG:";
     for (const_succ_iterator SI = succ_begin(), E = succ_end(); SI != E; ++SI) {
       OS << " BB#" << (*SI)->getNumber();
-      if (!Probs.empty())
-        OS << '(' << *getProbabilityIterator(SI) << ')';
+      if (!Weights.empty())
+        OS << '(' << *getWeightIterator(SI) << ')';
     }
     OS << '\n';
   }
@@ -506,16 +506,34 @@ void MachineBasicBlock::updateTerminator() {
   }
 }
 
+void MachineBasicBlock::addSuccessor(MachineBasicBlock *Succ, uint32_t Weight) {
+  // Weight list is either empty (if successor list isn't empty, this means
+  // disabled optimization) or has the same size as successor list.
+  if (!(Weights.empty() && !Successors.empty()))
+    Weights.push_back(Weight);
+  Successors.push_back(Succ);
+  Succ->addPredecessor(this);
+}
+
+void MachineBasicBlock::addSuccessorWithoutWeight(MachineBasicBlock *Succ) {
+  // We need to make sure weight list is either empty or has the same size of
+  // successor list. When this function is called, we can safely delete all
+  // weight in the list.
+  Weights.clear();
+  Successors.push_back(Succ);
+  Succ->addPredecessor(this);
+}
+
 void MachineBasicBlock::addSuccessor(MachineBasicBlock *Succ,
                                      BranchProbability Prob) {
   // Probability list is either empty (if successor list isn't empty, this means
   // disabled optimization) or has the same size as successor list.
   if (!(Probs.empty() && !Successors.empty())) {
-    assert((Probs.empty() || (Prob.isUnknown() && Probs.back().isUnknown()) ||
-            (!Prob.isUnknown() && !Probs.back().isUnknown())) &&
-           "Successors with both known and unknwon probabilities are not "
-           "allowed.");
     Probs.push_back(Prob);
+    // FIXME: Temporarily use the numerator of the probability to represent edge
+    // weight. This will be removed once all weight-version interfaces in MBB
+    // are replaced with probability-version interfaces.
+    Weights.push_back(Prob.getNumerator());
   }
   Successors.push_back(Succ);
   Succ->addPredecessor(this);
@@ -526,6 +544,7 @@ void MachineBasicBlock::addSuccessorWithoutProb(MachineBasicBlock *Succ) {
   // of successor list. When this function is called, we can safely delete all
   // probability in the list.
   Probs.clear();
+  Weights.clear();
   Successors.push_back(Succ);
   Succ->addPredecessor(this);
 }
@@ -539,12 +558,23 @@ MachineBasicBlock::succ_iterator
 MachineBasicBlock::removeSuccessor(succ_iterator I) {
   assert(I != Successors.end() && "Not a current successor!");
 
+  // If Weight list is empty it means we don't use it (disabled optimization).
+  if (!Weights.empty()) {
+    weight_iterator WI = getWeightIterator(I);
+    Weights.erase(WI);
+  }
+
+  // FIXME: Temporarily comment the following code as probabilities are now only
+  // used during instruction lowering, but this interface is called in later
+  // passes. Uncomment it once all edge weights are replaced with probabilities.
+#if 0
   // If probability list is empty it means we don't use it (disabled
   // optimization).
   if (!Probs.empty()) {
     probability_iterator WI = getProbabilityIterator(I);
     Probs.erase(WI);
   }
+#endif
 
   (*I)->removePredecessor(this);
   return Successors.erase(I);
@@ -581,12 +611,17 @@ void MachineBasicBlock::replaceSuccessor(MachineBasicBlock *Old,
   }
 
   // New is already a successor.
+  // Update its weight instead of adding a duplicate edge.
+  if (!Weights.empty())
+    *getWeightIterator(NewI) += *getWeightIterator(OldI);
+  // FIXME: Temporarily comment the following code as probabilities are now only
+  // used during instruction lowering, but this interface is called in later
+  // passes. Uncomment it once all edge weights are replaced with probabilities.
+#if 0
   // Update its probability instead of adding a duplicate edge.
-  if (!Probs.empty()) {
-    auto ProbIter = getProbabilityIterator(NewI);
-    if (!ProbIter->isUnknown())
-      *ProbIter += *getProbabilityIterator(OldI);
-  }
+  if (!Probs.empty())
+    *getProbabilityIterator(NewI) += *getProbabilityIterator(OldI);
+#endif
   removeSuccessor(OldI);
 }
 
@@ -606,14 +641,13 @@ void MachineBasicBlock::transferSuccessors(MachineBasicBlock *FromMBB) {
 
   while (!FromMBB->succ_empty()) {
     MachineBasicBlock *Succ = *FromMBB->succ_begin();
+    uint32_t Weight = 0;
 
-    // If probability list is empty it means we don't use it (disabled optimization).
-    if (!FromMBB->Probs.empty()) {
-      auto Prob = *FromMBB->Probs.begin();
-      addSuccessor(Succ, Prob);
-    } else
-      addSuccessorWithoutProb(Succ);
+    // If Weight list is empty it means we don't use it (disabled optimization).
+    if (!FromMBB->Weights.empty())
+      Weight = *FromMBB->Weights.begin();
 
+    addSuccessor(Succ, Weight);
     FromMBB->removeSuccessor(Succ);
   }
 }
@@ -625,11 +659,10 @@ MachineBasicBlock::transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB) {
 
   while (!FromMBB->succ_empty()) {
     MachineBasicBlock *Succ = *FromMBB->succ_begin();
-    if (!FromMBB->Probs.empty()) {
-      auto Prob = *FromMBB->Probs.begin();
-      addSuccessor(Succ, Prob);
-    } else
-      addSuccessorWithoutProb(Succ);
+    uint32_t Weight = 0;
+    if (!FromMBB->Weights.empty())
+      Weight = *FromMBB->Weights.begin();
+    addSuccessor(Succ, Weight);
     FromMBB->removeSuccessor(Succ);
 
     // Fix up any PHI nodes in the successor.
@@ -1113,32 +1146,65 @@ MachineBasicBlock::findDebugLoc(instr_iterator MBBI) {
   return DL;
 }
 
-/// Return probability of the edge from this block to MBB.
+/// Return weight of the edge from this block to MBB.
+uint32_t MachineBasicBlock::getSuccWeight(const_succ_iterator Succ) const {
+  if (Weights.empty())
+    return 0;
+
+  return *getWeightIterator(Succ);
+}
+
+/// Return probability of the edge from this block to MBB. If probability list
+/// is empty, return a default probability which is 1/N, where N is the number
+/// of successors. If the probability of the given successor is unknown, then
+/// sum up all known probabilities and return the complement of the sum divided
+/// by the number of unknown probabilities.
 BranchProbability
 MachineBasicBlock::getSuccProbability(const_succ_iterator Succ) const {
-  if (Probs.empty() || Probs.back().isUnknown())
+  if (Probs.empty())
     return BranchProbability(1, succ_size());
 
-  return *getProbabilityIterator(Succ);
+  auto Prob = *getProbabilityIterator(Succ);
+  assert(!Prob.isUnknown());
+  return Prob;
+}
+
+/// Set successor weight of a given iterator.
+void MachineBasicBlock::setSuccWeight(succ_iterator I, uint32_t Weight) {
+  if (Weights.empty())
+    return;
+  *getWeightIterator(I) = Weight;
 }
 
 /// Set successor probability of a given iterator.
 void MachineBasicBlock::setSuccProbability(succ_iterator I,
                                            BranchProbability Prob) {
   assert(!Prob.isUnknown());
-  if (Probs.empty())
+  if (Probs.empty() || Weights.empty())
     return;
   *getProbabilityIterator(I) = Prob;
+  // FIXME: Temporarily use the numerator of the probability to represent edge
+  // weight. This will be removed once all weight-version interfaces in MBB
+  // are replaces with probability-version interfaces.
+  *getWeightIterator(I) = Prob.getNumerator();
 }
 
-/// Return probability iterator corresonding to the I successor iterator
-MachineBasicBlock::const_probability_iterator
-MachineBasicBlock::getProbabilityIterator(
-    MachineBasicBlock::const_succ_iterator I) const {
-  assert(Probs.size() == Successors.size() && "Async probability list!");
+/// Return wight iterator corresonding to the I successor iterator.
+MachineBasicBlock::weight_iterator MachineBasicBlock::
+getWeightIterator(MachineBasicBlock::succ_iterator I) {
+  assert(Weights.size() == Successors.size() && "Async weight list!");
+  size_t index = std::distance(Successors.begin(), I);
+  assert(index < Weights.size() && "Not a current successor!");
+  return Weights.begin() + index;
+}
+
+/// Return wight iterator corresonding to the I successor iterator.
+MachineBasicBlock::const_weight_iterator MachineBasicBlock::
+getWeightIterator(MachineBasicBlock::const_succ_iterator I) const {
+  assert(Weights.size() == Successors.size() && "Async weight list!");
   const size_t index = std::distance(Successors.begin(), I);
-  assert(index < Probs.size() && "Not a current successor!");
-  return Probs.begin() + index;
+  assert(index < Weights.size() && "Not a current successor!");
+  return Weights.begin() + index;
 }
 
 /// Return probability iterator corresonding to the I successor iterator.
@@ -1150,6 +1216,16 @@ MachineBasicBlock::getProbabilityIterator(MachineBasicBlock::succ_iterator I) {
   return Probs.begin() + index;
 }
 
+/// Return probability iterator corresonding to the I successor iterator
+MachineBasicBlock::const_probability_iterator
+MachineBasicBlock::getProbabilityIterator(
+    MachineBasicBlock::const_succ_iterator I) const {
+  assert(Probs.size() == Successors.size() && "Async probability list!");
+  const size_t index = std::distance(Successors.begin(), I);
+  assert(index < Probs.size() && "Not a current successor!");
+  return Probs.begin() + index;
+}
+
 /// Return whether (physical) register "Reg" has been <def>ined and not <kill>ed
 /// as of just before "MI".
 /// 
diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp
index fcddf346cf68..fba33eb93d5f 100644
--- a/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/lib/CodeGen/MachineBlockPlacement.cpp
@@ -380,11 +380,19 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
   const BranchProbability HotProb(4, 5); // 80%
 
   MachineBasicBlock *BestSucc = nullptr;
-  auto BestProb = BranchProbability::getZero();
-
-  // Adjust edge probabilities by excluding edges pointing to blocks that is
-  // either not in BlockFilter or is already in the current chain. Consider the
-  // following CFG:
+  // FIXME: Due to the performance of the probability and weight routines in
+  // the MBPI analysis, we manually compute probabilities using the edge
+  // weights. This is suboptimal as it means that the somewhat subtle
+  // definition of edge weight semantics is encoded here as well. We should
+  // improve the MBPI interface to efficiently support query patterns such as
+  // this.
+  uint32_t BestWeight = 0;
+  uint32_t WeightScale = 0;
+  uint32_t SumWeight = MBPI->getSumForBlock(BB, WeightScale);
+
+  // Adjust sum of weights by excluding weights on edges pointing to blocks that
+  // is either not in BlockFilter or is already in the current chain. Consider
+  // the following CFG:
   //
   //     --->A
   //     |  / \
@@ -398,7 +406,7 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
   // HotProb). If we exclude E that is not in BlockFilter when calculating the
   // probability of C->D, D will be selected and we will get A C D B as the
   // layout of this loop.
-  auto AdjustedSumProb = BranchProbability::getOne();
+  uint32_t AdjustedSumWeight = SumWeight;
   SmallVector<MachineBasicBlock *, 4> Successors;
   for (MachineBasicBlock *Succ : BB->successors()) {
     bool SkipSucc = false;
@@ -416,20 +424,15 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
       }
     }
     if (SkipSucc)
-      AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ);
+      AdjustedSumWeight -= MBPI->getEdgeWeight(BB, Succ) / WeightScale;
     else
       Successors.push_back(Succ);
   }
 
   DEBUG(dbgs() << "Attempting merge from: " << getBlockName(BB) << "\n");
   for (MachineBasicBlock *Succ : Successors) {
-    BranchProbability SuccProb;
-    uint32_t SuccProbN = MBPI->getEdgeProbability(BB, Succ).getNumerator();
-    uint32_t SuccProbD = AdjustedSumProb.getNumerator();
-    if (SuccProbN >= SuccProbD)
-      SuccProb = BranchProbability::getOne();
-    else
-      SuccProb = BranchProbability(SuccProbN, SuccProbD);
+    uint32_t SuccWeight = MBPI->getEdgeWeight(BB, Succ);
+    BranchProbability SuccProb(SuccWeight / WeightScale, AdjustedSumWeight);
 
     // If we outline optional branches, look whether Succ is unavoidable, i.e.
     // dominates all terminators of the MachineFunction. If it does, other
@@ -467,7 +470,7 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
 
       // Make sure that a hot successor doesn't have a globally more
       // important predecessor.
-      auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
+      BranchProbability RealSuccProb(SuccWeight / WeightScale, SumWeight);
       BlockFrequency CandidateEdgeFreq =
           MBFI->getBlockFreq(BB) * RealSuccProb * HotProb.getCompl();
       bool BadCFGConflict = false;
@@ -493,10 +496,10 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
                  << " (prob)"
                  << (SuccChain.LoopPredecessors != 0 ? " (CFG break)" : "")
                  << "\n");
-    if (BestSucc && BestProb >= SuccProb)
+    if (BestSucc && BestWeight >= SuccWeight)
       continue;
     BestSucc = Succ;
-    BestProb = SuccProb;
+    BestWeight = SuccWeight;
   }
   return BestSucc;
 }
@@ -725,6 +728,11 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L,
     MachineBasicBlock *OldExitingBB = ExitingBB;
     BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
     bool HasLoopingSucc = false;
+    // FIXME: Due to the performance of the probability and weight routines in
+    // the MBPI analysis, we use the internal weights and manually compute the
+    // probabilities to avoid quadratic behavior.
+    uint32_t WeightScale = 0;
+    uint32_t SumWeight = MBPI->getSumForBlock(MBB, WeightScale);
     for (MachineBasicBlock *Succ : MBB->successors()) {
       if (Succ->isEHPad())
         continue;
@@ -738,10 +746,10 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L,
         continue;
       }
 
-      auto SuccProb = MBPI->getEdgeProbability(MBB, Succ);
+      uint32_t SuccWeight = MBPI->getEdgeWeight(MBB, Succ);
       if (LoopBlockSet.count(Succ)) {
         DEBUG(dbgs() << "    looping: " << getBlockName(MBB) << " -> "
-                     << getBlockName(Succ) << " (" << SuccProb << ")\n");
+                     << getBlockName(Succ) << " (" << SuccWeight << ")\n");
         HasLoopingSucc = true;
         continue;
       }
@@ -753,6 +761,7 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L,
           BlocksExitingToOuterLoop.insert(MBB);
       }
 
+      BranchProbability SuccProb(SuccWeight / WeightScale, SumWeight);
       BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb;
       DEBUG(dbgs() << "    exiting: " << getBlockName(MBB) << " -> "
                    << getBlockName(Succ) << " [L:" << SuccLoopDepth << "] (";
@@ -895,17 +904,21 @@ void MachineBlockPlacement::rotateLoopWithProfile(
   // edge from the tail of the loop chain.
   SmallVector<std::pair<MachineBasicBlock *, BlockFrequency>, 4> ExitsWithFreq;
   for (auto BB : LoopChain) {
-    auto LargestExitEdgeProb = BranchProbability::getZero();
+    uint32_t LargestExitEdgeWeight = 0;
     for (auto *Succ : BB->successors()) {
       BlockChain *SuccChain = BlockToChain[Succ];
       if (!LoopBlockSet.count(Succ) &&
           (!SuccChain || Succ == *SuccChain->begin())) {
-        auto SuccProb = MBPI->getEdgeProbability(BB, Succ);
-        LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb);
+        uint32_t SuccWeight = MBPI->getEdgeWeight(BB, Succ);
+        LargestExitEdgeWeight = std::max(LargestExitEdgeWeight, SuccWeight);
       }
     }
-    if (LargestExitEdgeProb > BranchProbability::getZero()) {
-      auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb;
+    if (LargestExitEdgeWeight > 0) {
+      uint32_t WeightScale = 0;
+      uint32_t SumWeight = MBPI->getSumForBlock(BB, WeightScale);
+      auto ExitFreq =
+          MBFI->getBlockFreq(BB) *
+          BranchProbability(LargestExitEdgeWeight / WeightScale, SumWeight);
       ExitsWithFreq.emplace_back(BB, ExitFreq);
     }
   }
@@ -1277,16 +1290,14 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) {
       }
 
       // If PrevBB has a two-way branch, try to re-order the branches
-      // such that we branch to the successor with higher probability first.
+      // such that we branch to the successor with higher weight first.
       if (TBB && !Cond.empty() && FBB &&
-          MBPI->getEdgeProbability(PrevBB, FBB) >
-              MBPI->getEdgeProbability(PrevBB, TBB) &&
+          MBPI->getEdgeWeight(PrevBB, FBB) > MBPI->getEdgeWeight(PrevBB, TBB) &&
           !TII->ReverseBranchCondition(Cond)) {
         DEBUG(dbgs() << "Reverse order of the two branches: "
                      << getBlockName(PrevBB) << "\n");
-        DEBUG(dbgs() << "    Edge probability: "
-                     << MBPI->getEdgeProbability(PrevBB, FBB) << " vs "
-                     << MBPI->getEdgeProbability(PrevBB, TBB) << "\n");
+        DEBUG(dbgs() << "    Edge weight: " << MBPI->getEdgeWeight(PrevBB, FBB)
+                     << " vs " << MBPI->getEdgeWeight(PrevBB, TBB) << "\n");
         DebugLoc dl; // FIXME: this is nowhere
         TII->RemoveBranch(*PrevBB);
         TII->InsertBranch(*PrevBB, FBB, TBB, Cond, dl);
diff --git a/lib/CodeGen/MachineBranchProbabilityInfo.cpp b/lib/CodeGen/MachineBranchProbabilityInfo.cpp
index 5478dcba261a..6fbc2be70486 100644
--- a/lib/CodeGen/MachineBranchProbabilityInfo.cpp
+++ b/lib/CodeGen/MachineBranchProbabilityInfo.cpp
@@ -28,61 +28,91 @@ char MachineBranchProbabilityInfo::ID = 0;
 
 void MachineBranchProbabilityInfo::anchor() { }
 
-uint32_t MachineBranchProbabilityInfo::getEdgeWeight(
-    const MachineBasicBlock *Src,
-    MachineBasicBlock::const_succ_iterator Dst) const {
-  return Src->getSuccProbability(Dst).getNumerator();
-}
+uint32_t MachineBranchProbabilityInfo::
+getSumForBlock(const MachineBasicBlock *MBB, uint32_t &Scale) const {
+  // First we compute the sum with 64-bits of precision, ensuring that cannot
+  // overflow by bounding the number of weights considered. Hopefully no one
+  // actually needs 2^32 successors.
+  assert(MBB->succ_size() < UINT32_MAX);
+  uint64_t Sum = 0;
+  Scale = 1;
+  for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(),
+       E = MBB->succ_end(); I != E; ++I) {
+    uint32_t Weight = getEdgeWeight(MBB, I);
+    Sum += Weight;
+  }
 
-uint32_t MachineBranchProbabilityInfo::getEdgeWeight(
-    const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const {
-  // This is a linear search. Try to use the const_succ_iterator version when
-  // possible.
-  return getEdgeWeight(Src, std::find(Src->succ_begin(), Src->succ_end(), Dst));
+  // If the computed sum fits in 32-bits, we're done.
+  if (Sum <= UINT32_MAX)
+    return Sum;
+
+  // Otherwise, compute the scale necessary to cause the weights to fit, and
+  // re-sum with that scale applied.
+  assert((Sum / UINT32_MAX) < UINT32_MAX);
+  Scale = (Sum / UINT32_MAX) + 1;
+  Sum = 0;
+  for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(),
+       E = MBB->succ_end(); I != E; ++I) {
+    uint32_t Weight = getEdgeWeight(MBB, I);
+    Sum += Weight / Scale;
+  }
+  assert(Sum <= UINT32_MAX);
+  return Sum;
 }
 
-BranchProbability MachineBranchProbabilityInfo::getEdgeProbability(
-    const MachineBasicBlock *Src,
-    MachineBasicBlock::const_succ_iterator Dst) const {
-  return Src->getSuccProbability(Dst);
+uint32_t MachineBranchProbabilityInfo::
+getEdgeWeight(const MachineBasicBlock *Src,
+              MachineBasicBlock::const_succ_iterator Dst) const {
+  uint32_t Weight = Src->getSuccWeight(Dst);
+  if (!Weight)
+    return DEFAULT_WEIGHT;
+  return Weight;
 }
 
-BranchProbability MachineBranchProbabilityInfo::getEdgeProbability(
-    const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const {
+uint32_t MachineBranchProbabilityInfo::
+getEdgeWeight(const MachineBasicBlock *Src,
+              const MachineBasicBlock *Dst) const {
   // This is a linear search. Try to use the const_succ_iterator version when
   // possible.
-  return getEdgeProbability(Src,
-                            std::find(Src->succ_begin(), Src->succ_end(), Dst));
+  return getEdgeWeight(Src, std::find(Src->succ_begin(), Src->succ_end(), Dst));
 }
 
 bool
 MachineBranchProbabilityInfo::isEdgeHot(const MachineBasicBlock *Src,
                                         const MachineBasicBlock *Dst) const {
   // Hot probability is at least 4/5 = 80%
-  static BranchProbability HotProb(4, 5);
-  return getEdgeProbability(Src, Dst) > HotProb;
+  // FIXME: Compare against a static "hot" BranchProbability.
+  return getEdgeProbability(Src, Dst) > BranchProbability(4, 5);
 }
 
 MachineBasicBlock *
 MachineBranchProbabilityInfo::getHotSucc(MachineBasicBlock *MBB) const {
-  auto MaxProb = BranchProbability::getZero();
+  uint32_t MaxWeight = 0;
   MachineBasicBlock *MaxSucc = nullptr;
   for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(),
        E = MBB->succ_end(); I != E; ++I) {
-    auto Prob = getEdgeProbability(MBB, I);
-    if (Prob > MaxProb) {
-      MaxProb = Prob;
+    uint32_t Weight = getEdgeWeight(MBB, I);
+    if (Weight > MaxWeight) {
+      MaxWeight = Weight;
       MaxSucc = *I;
     }
   }
 
-  static BranchProbability HotProb(4, 5);
-  if (getEdgeProbability(MBB, MaxSucc) >= HotProb)
+  if (getEdgeProbability(MBB, MaxSucc) >= BranchProbability(4, 5))
     return MaxSucc;
 
   return nullptr;
 }
 
+BranchProbability MachineBranchProbabilityInfo::getEdgeProbability(
+    const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const {
+  uint32_t Scale = 1;
+  uint32_t D = getSumForBlock(Src, Scale);
+  uint32_t N = getEdgeWeight(Src, Dst) / Scale;
+
+  return BranchProbability(N, D);
+}
+
 raw_ostream &MachineBranchProbabilityInfo::printEdgeProbability(
     raw_ostream &OS, const MachineBasicBlock *Src,
     const MachineBasicBlock *Dst) const {
diff --git a/lib/CodeGen/TailDuplication.cpp b/lib/CodeGen/TailDuplication.cpp
index 1f5b54866ac6..ff86dabfac59 100644
--- a/lib/CodeGen/TailDuplication.cpp
+++ b/lib/CodeGen/TailDuplication.cpp
@@ -745,12 +745,12 @@ TailDuplicatePass::duplicateSimpleBB(MachineBasicBlock *TailBB,
     if (PredTBB)
       TII->InsertBranch(*PredBB, PredTBB, PredFBB, PredCond, DebugLoc());
 
-    auto Prob = MBPI->getEdgeProbability(PredBB, TailBB);
+    uint32_t Weight = MBPI->getEdgeWeight(PredBB, TailBB);
     PredBB->removeSuccessor(TailBB);
     unsigned NumSuccessors = PredBB->succ_size();
     assert(NumSuccessors <= 1);
     if (NumSuccessors == 0 || *PredBB->succ_begin() != NewTarget)
-      PredBB->addSuccessor(NewTarget, Prob);
+      PredBB->addSuccessor(NewTarget, Weight);
 
     TDBBs.push_back(PredBB);
   }
@@ -858,7 +858,7 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB,
            "TailDuplicate called on block with multiple successors!");
     for (MachineBasicBlock::succ_iterator I = TailBB->succ_begin(),
            E = TailBB->succ_end(); I != E; ++I)
-      PredBB->addSuccessor(*I, MBPI->getEdgeProbability(TailBB, I));
+      PredBB->addSuccessor(*I, MBPI->getEdgeWeight(TailBB, I));
 
     Changed = true;
     ++NumTailDups;
diff --git a/lib/Support/BranchProbability.cpp b/lib/Support/BranchProbability.cpp
index 771d02c0aa3c..3b0f6e6f06e4 100644
--- a/lib/Support/BranchProbability.cpp
+++ b/lib/Support/BranchProbability.cpp
@@ -22,14 +22,11 @@ using namespace llvm;
 const uint32_t BranchProbability::D;
 
 raw_ostream &BranchProbability::print(raw_ostream &OS) const {
-  if (isUnknown())
-    return OS << "?%";
-
   // Get a percentage rounded to two decimal digits. This avoids
   // implementation-defined rounding inside printf.
   double Percent = rint(((double)N / D) * 100.0 * 100.0) / 100.0;
-  return OS << format("0x%08" PRIx32 " / 0x%08" PRIx32 " = %.2f%%", N, D,
-                      Percent);
+  OS << format("0x%08" PRIx32 " / 0x%08" PRIx32 " = %.2f%%", N, D, Percent);
+  return OS;
 }
 
 void BranchProbability::dump() const { print(dbgs()) << '\n'; }
@@ -46,19 +43,6 @@ BranchProbability::BranchProbability(uint32_t Numerator, uint32_t Denominator) {
   }
 }
 
-BranchProbability
-BranchProbability::getBranchProbability(uint64_t Numerator,
-                                        uint64_t Denominator) {
-  assert(Numerator <= Denominator && "Probability cannot be bigger than 1!");
-  // Scale down Denominator to fit in a 32-bit integer.
-  int Scale = 0;
-  while (Denominator > UINT32_MAX) {
-    Denominator >>= 1;
-    Scale++;
-  }
-  return BranchProbability(Numerator >> Scale, Denominator);
-}
-
 // If ConstD is not zero, then replace D by ConstD so that division and modulo
 // operations by D can be optimized, in case this function is not inlined by the
 // compiler.
diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
index cdbd12092150..f1b383017901 100644
--- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
+++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
@@ -1570,7 +1570,8 @@ void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk,
 
   insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc());
   insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc());
-  DstBlk->replaceSuccessor(DstBlk, LandMBB);
+  DstBlk->addSuccessor(LandMBB);
+  DstBlk->removeSuccessor(DstBlk);
 }
 
 
@@ -1665,7 +1666,8 @@ AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB,
   replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB);
   //srcBlk, oldBlk, newBlk
 
-  PredMBB->replaceSuccessor(MBB, CloneMBB);
+  PredMBB->removeSuccessor(MBB);
+  PredMBB->addSuccessor(CloneMBB);
 
   // add all successor to cloneBlk
   cloneSuccessorList(CloneMBB, MBB);
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
index e89757c19ecc..0bf2d374df6a 100644
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -2274,7 +2274,8 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) {
 
   // Update the CFG.
   NewBB->addSuccessor(BB);
-  JTBB->replaceSuccessor(BB, NewBB);
+  JTBB->removeSuccessor(BB);
+  JTBB->addSuccessor(NewBB);
 
   ++NumJTInserted;
   return NewBB;
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index e8f3ab65bdbe..0cc41812d71c 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -7346,7 +7346,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI,
       }
     }
 
-    BB->addSuccessor(DispatchBB, BranchProbability::getZero());
+    BB->addSuccessor(DispatchBB);
 
     // Find the invoke call and mark all of the callee-saved registers as
     // 'implicit defined' so that they're spilled. This prevents code from
diff --git a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
index efafdd007289..96bb61750805 100644
--- a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
+++ b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
@@ -186,11 +186,13 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) {
 
             if (case1 || case2) {
               InvertAndChangeJumpTarget(MI, UncondTarget);
-              MBB->replaceSuccessor(JumpAroundTarget, UncondTarget);
+              MBB->removeSuccessor(JumpAroundTarget);
+              MBB->addSuccessor(UncondTarget);
 
               // Remove the unconditional branch in LayoutSucc.
               LayoutSucc->erase(LayoutSucc->begin());
-              LayoutSucc->replaceSuccessor(UncondTarget, JumpAroundTarget);
+              LayoutSucc->removeSuccessor(UncondTarget);
+              LayoutSucc->addSuccessor(JumpAroundTarget);
 
               // This code performs the conversion for case 2, which moves
               // the block to the fall-thru case (BB3 in the code above).
diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp
index e75858a181e5..d09843ed0e53 100644
--- a/lib/Target/Mips/MipsLongBranch.cpp
+++ b/lib/Target/Mips/MipsLongBranch.cpp
@@ -262,7 +262,8 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
       static_cast<const MipsInstrInfo *>(Subtarget.getInstrInfo());
 
   MF->insert(FallThroughMBB, LongBrMBB);
-  MBB->replaceSuccessor(TgtMBB, LongBrMBB);
+  MBB->removeSuccessor(TgtMBB);
+  MBB->addSuccessor(LongBrMBB);
 
   if (IsPIC) {
     MachineBasicBlock *BalTgtMBB = MF->CreateMachineBasicBlock(BB);
diff --git a/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll b/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll
index a44c9721d6c1..e17da7a97205 100644
--- a/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll
+++ b/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll
@@ -14,15 +14,15 @@ entry:
   br i1 undef, label %for.end, label %for.body
 
 ; Before if conversion, we have
-; for.body -> lor.lhs.false.i (50%)
-;          -> for.cond.backedge (50%)
-; lor.lhs.false.i -> for.cond.backedge (100%)
-;                 -> cond.false.i (0%)
+; for.body -> lor.lhs.false.i (62)
+;          -> for.cond.backedge (62)
+; lor.lhs.false.i -> for.cond.backedge (1048575)
+;                 -> cond.false.i (1)
 ; Afer if conversion, we have
-; for.body -> for.cond.backedge (100%)
-;          -> cond.false.i (0%)
+; for.body -> for.cond.backedge (130023362)
+;          -> cond.false.i (62)
 ; CHECK: BB#1: derived from LLVM BB %for.body
-; CHECK: Successors according to CFG: BB#2(0x7ffffc00 / 0x80000000 = 100.00%) BB#4(0x00000400 / 0x80000000 = 0.00%)
+; CHECK: Successors according to CFG: BB#2(4294967291) BB#4(2048)
 for.body:
   br i1 undef, label %for.cond.backedge, label %lor.lhs.false.i, !prof !1
 
diff --git a/test/CodeGen/ARM/ifcvt-branch-weight.ll b/test/CodeGen/ARM/ifcvt-branch-weight.ll
index 0de039cde23c..f2a1229d0d8a 100644
--- a/test/CodeGen/ARM/ifcvt-branch-weight.ll
+++ b/test/CodeGen/ARM/ifcvt-branch-weight.ll
@@ -19,7 +19,7 @@ bb:
   br i1 %9, label %return, label %bb2
 
 ; CHECK: BB#2: derived from LLVM BB %bb2
-; CHECK: Successors according to CFG: BB#3({{[0-9a-fx/= ]+}}50.00%) BB#4({{[0-9a-fx/= ]+}}50.00%)
+; CHECK: Successors according to CFG: BB#3(4294967289) BB#4(4294967287)
 
 bb2:
   %v10 = icmp eq i32 %3, 16
diff --git a/test/CodeGen/ARM/ifcvt-iter-indbr.ll b/test/CodeGen/ARM/ifcvt-iter-indbr.ll
index a96b6e8a1e83..6ce9bcb56ef4 100644
--- a/test/CodeGen/ARM/ifcvt-iter-indbr.ll
+++ b/test/CodeGen/ARM/ifcvt-iter-indbr.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple thumbv7s-apple-darwin  -asm-verbose=false | FileCheck %s
-; RUN: llc < %s -mtriple thumbv7s-apple-darwin  -asm-verbose=false -print-machineinstrs=if-converter 2>&1 | FileCheck --check-prefix=CHECK-PROB %s
+; RUN: llc < %s -mtriple thumbv7s-apple-darwin  -asm-verbose=false -print-machineinstrs=if-converter 2>&1 | FileCheck --check-prefix=CHECK-WEIGHT %s
 
 declare i32 @foo(i32)
 declare i8* @bar(i32, i8*, i8*)
@@ -29,10 +29,10 @@ declare i8* @bar(i32, i8*, i8*)
 ; CHECK-NEXT: [[FOOCALL]]:
 ; CHECK-NEXT:  blx _foo
 ;
-; CHECK-PROB: BB#0:
-; CHECK-PROB: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}50.00%) BB#2({{[0-9a-fx/= ]+}}25.00%) BB#4({{[0-9a-fx/= ]+}}25.00%)
-; CHECK-PROB: BB#1:
-; CHECK-PROB: Successors according to CFG: BB#2({{[0-9a-fx/= ]+}}75.00%) BB#4({{[0-9a-fx/= ]+}}25.00%)
+; CHECK-WEIGHT: BB#0:
+; CHECK-WEIGHT: Successors according to CFG: BB#1(1073741824) BB#2(536870912) BB#4(536870912)
+; CHECK-WEIGHT: BB#1:
+; CHECK-WEIGHT: Successors according to CFG: BB#2(1610612736) BB#4(536870912)
 
 define i32 @test(i32 %a, i32 %a2, i32* %p, i32* %p2) {
 entry:
diff --git a/test/CodeGen/ARM/tail-merge-branch-weight.ll b/test/CodeGen/ARM/tail-merge-branch-weight.ll
index f83f28815793..95b0a202e7ff 100644
--- a/test/CodeGen/ARM/tail-merge-branch-weight.ll
+++ b/test/CodeGen/ARM/tail-merge-branch-weight.ll
@@ -9,7 +9,7 @@
 ;                = 0.2 * 0.4 + 0.8 * 0.7 = 0.64
 
 ; CHECK: # Machine code for function test0:
-; CHECK: Successors according to CFG: BB#{{[0-9]+}}({{[0-9a-fx/= ]+}}20.00%) BB#{{[0-9]+}}({{[0-9a-fx/= ]+}}80.00%)
+; CHECK: Successors according to CFG: BB#{{[0-9]+}}(13) BB#{{[0-9]+}}(24)
 ; CHECK: BB#{{[0-9]+}}:
 ; CHECK: BB#{{[0-9]+}}:
 ; CHECK: # End machine code for function test0.
diff --git a/test/CodeGen/ARM/taildup-branch-weight.ll b/test/CodeGen/ARM/taildup-branch-weight.ll
index 799ef62416e6..576c120b444e 100644
--- a/test/CodeGen/ARM/taildup-branch-weight.ll
+++ b/test/CodeGen/ARM/taildup-branch-weight.ll
@@ -3,7 +3,7 @@
 ; RUN:	| FileCheck %s
 
 ; CHECK: Machine code for function test0:
-; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}3.12%) BB#2({{[0-9a-fx/= ]+}}96.88%)
+; CHECK: Successors according to CFG: BB#1(67108864) BB#2(2080374784)
 
 define void @test0(i32 %a, i32 %b, i32* %c, i32* %d) {
 entry:
@@ -30,7 +30,7 @@ B4:
 !0 = !{!"branch_weights", i32 4, i32 124}
 
 ; CHECK: Machine code for function test1:
-; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}3.12%) BB#2({{[0-9a-fx/= ]+}}96.88%)
+; CHECK: Successors according to CFG: BB#1(67108864) BB#2(2080374784)
 
 @g0 = common global i32 0, align 4
 
diff --git a/test/CodeGen/Generic/MachineBranchProb.ll b/test/CodeGen/Generic/MachineBranchProb.ll
index ae3c8da21471..5a4a4672f7eb 100644
--- a/test/CodeGen/Generic/MachineBranchProb.ll
+++ b/test/CodeGen/Generic/MachineBranchProb.ll
@@ -16,11 +16,11 @@ entry:
     i64 5, label %sw.bb1
   ], !prof !0
 ; CHECK: BB#0: derived from LLVM BB %entry
-; CHECK: Successors according to CFG: BB#2({{[0-9a-fx/= ]+}}75.29%) BB#4({{[0-9a-fx/= ]+}}24.71%)
+; CHECK: Successors according to CFG: BB#2(1616928864) BB#4(530554784)
 ; CHECK: BB#4: derived from LLVM BB %entry
-; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}47.62%) BB#5({{[0-9a-fx/= ]+}}52.38%)
+; CHECK: Successors according to CFG: BB#1(252645135) BB#5(277909649)
 ; CHECK: BB#5: derived from LLVM BB %entry
-; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}36.36%) BB#3({{[0-9a-fx/= ]+}}63.64%)
+; CHECK: Successors according to CFG: BB#1(101058054) BB#3(176851595)
 
 sw.bb:
   br label %return
@@ -62,7 +62,7 @@ return: ret void
 ; CHECK-LABEL: Machine code for function left_leaning_weight_balanced_tree:
 ; CHECK: BB#0: derived from LLVM BB %entry
 ; CHECK-NOT: Successors
-; CHECK: Successors according to CFG: BB#8({{[0-9a-fx/= ]+}}39.71%) BB#9({{[0-9a-fx/= ]+}}60.29%)
+; CHECK: Successors according to CFG: BB#8(852677332) BB#9(1294806318)
 }
 
 !1 = !{!"branch_weights",
diff --git a/test/CodeGen/Hexagon/ifcvt-edge-weight.ll b/test/CodeGen/Hexagon/ifcvt-edge-weight.ll
index 341567e1d02f..f84fd95e4fbd 100644
--- a/test/CodeGen/Hexagon/ifcvt-edge-weight.ll
+++ b/test/CodeGen/Hexagon/ifcvt-edge-weight.ll
@@ -2,7 +2,7 @@
 ; Check that the edge weights are updated correctly after if-conversion.
 
 ; CHECK: BB#3:
-; CHECK: Successors according to CFG: BB#2({{[0-9a-fx/= ]+}}10.00%) BB#1({{[0-9a-fx/= ]+}}90.00%)
+; CHECK: Successors according to CFG: BB#2(214748365) BB#1(1932735283)
 @a = external global i32
 @d = external global i32
 
diff --git a/test/CodeGen/MIR/X86/newline-handling.mir b/test/CodeGen/MIR/X86/newline-handling.mir
index bce06d540114..b5ed3b7f27e1 100644
--- a/test/CodeGen/MIR/X86/newline-handling.mir
+++ b/test/CodeGen/MIR/X86/newline-handling.mir
@@ -35,7 +35,7 @@ liveins:
 # CHECK-LABEL: name: foo
 # CHECK: body: |
 # CHECK-NEXT: bb.0.entry:
-# CHECK-NEXT: successors: %bb.1.less(0x40000000 / 0x80000000 = 50.00%), %bb.2.exit(0x40000000 / 0x80000000 = 50.00%)
+# CHECK-NEXT: successors: %bb.1.less(0), %bb.2.exit(0)
 # CHECK-NEXT: liveins: %edi
 # CHECK:      CMP32ri8 %edi, 10, implicit-def %eflags
 # CHECK-NEXT: JG_1 %bb.2.exit, implicit killed %eflags
@@ -79,7 +79,7 @@ liveins:
 # CHECK-LABEL: name: bar
 # CHECK: body: |
 # CHECK-NEXT: bb.0.entry:
-# CHECK-NEXT: successors: %bb.1.less(0x40000000 / 0x80000000 = 50.00%), %bb.2.exit(0x40000000 / 0x80000000 = 50.00%)
+# CHECK-NEXT: successors: %bb.1.less(0), %bb.2.exit(0)
 # CHECK-NEXT: liveins: %edi
 # CHECK:      CMP32ri8 %edi, 10, implicit-def %eflags
 # CHECK-NEXT: JG_1 %bb.2.exit, implicit killed %eflags
diff --git a/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir b/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir
index 64af6121189a..fc5e5d640f7f 100644
--- a/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir
+++ b/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir
@@ -1,6 +1,6 @@
 # RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
 # This test ensures that the MIR parser parses basic block successors and
-# probabilities correctly.
+# weights correctly.
 
 --- |
 
@@ -21,10 +21,10 @@
 name:            foo
 body: |
   ; CHECK-LABEL: bb.0.entry:
-  ; CHECK:         successors: %bb.1.less({{[0-9a-fx/= ]+}}33.00%), %bb.2.exit({{[0-9a-fx/= ]+}}67.00%)
+  ; CHECK:         successors: %bb.1.less(16), %bb.2.exit(32)
   ; CHECK-LABEL: bb.1.less:
   bb.0.entry:
-    successors: %bb.1.less (33), %bb.2.exit(67)
+    successors: %bb.1.less (16), %bb.2.exit(32)
     liveins: %edi
 
     CMP32ri8 %edi, 10, implicit-def %eflags
diff --git a/test/CodeGen/MIR/X86/successor-basic-blocks.mir b/test/CodeGen/MIR/X86/successor-basic-blocks.mir
index a6c14f70bc7c..aa80fe9fbeef 100644
--- a/test/CodeGen/MIR/X86/successor-basic-blocks.mir
+++ b/test/CodeGen/MIR/X86/successor-basic-blocks.mir
@@ -32,7 +32,7 @@
 name:            foo
 body: |
   ; CHECK-LABEL: bb.0.entry:
-  ; CHECK:         successors: %bb.1.less(0x40000000 / 0x80000000 = 50.00%), %bb.2.exit(0x40000000 / 0x80000000 = 50.00%)
+  ; CHECK:         successors: %bb.1.less(0), %bb.2.exit(0)
   ; CHECK-LABEL: bb.1.less:
   bb.0.entry:
     successors: %bb.1.less, %bb.2.exit
@@ -58,7 +58,7 @@ body: |
   ; Verify that we can have multiple lists of successors that will be merged
   ; into one.
   ; CHECK-LABEL: bb.0.entry:
-  ; CHECK:         successors: %bb.1(0x80000000 / 0x80000000 = 100.00%), %bb.2(0x00000000 / 0x80000000 = 0.00%)
+  ; CHECK:         successors: %bb.1(0), %bb.2(0)
   bb.0.entry:
     liveins: %edi
     successors: %bb.1
diff --git a/test/CodeGen/X86/MachineBranchProb.ll b/test/CodeGen/X86/MachineBranchProb.ll
index ee1c658d4c55..da0bf517ecfa 100644
--- a/test/CodeGen/X86/MachineBranchProb.ll
+++ b/test/CodeGen/X86/MachineBranchProb.ll
@@ -18,9 +18,9 @@ for.cond2:                                        ; preds = %for.inc, %for.cond
   %or.cond = or i1 %tobool, %cmp4
   br i1 %or.cond, label %for.inc20, label %for.inc, !prof !0
 ; CHECK: BB#1: derived from LLVM BB %for.cond2
-; CHECK: Successors according to CFG: BB#3({{[0-9a-fx/= ]+}}1.53%) BB#4({{[0-9a-fx/= ]+}}98.47%)
+; CHECK: Successors according to CFG: BB#3(32756933) BB#4(2114726715)
 ; CHECK: BB#4: derived from LLVM BB %for.cond2
-; CHECK: Successors according to CFG: BB#3({{[0-9a-fx/= ]+}}1.55%) BB#2({{[0-9a-fx/= ]+}}98.45%)
+; CHECK: Successors according to CFG: BB#3(33264335) BB#2(2114219313)
 
 for.inc:                                          ; preds = %for.cond2
   %shl = shl i32 %bit.0, 1
diff --git a/test/CodeGen/X86/catchpad-weight.ll b/test/CodeGen/X86/catchpad-weight.ll
index e8b416845ec1..9b06f2abc81c 100644
--- a/test/CodeGen/X86/catchpad-weight.ll
+++ b/test/CodeGen/X86/catchpad-weight.ll
@@ -2,7 +2,7 @@
 
 ; Check if the edge weight to the catchpad is calculated correctly.
 
-; CHECK: Successors according to CFG: BB#3(0x7ffff100 / 0x80000000 = 100.00%) BB#1(0x00000800 / 0x80000000 = 0.00%) BB#4(0x00000400 / 0x80000000 = 0.00%) BB#6(0x00000200 / 0x80000000 = 0.00%) BB#8(0x00000100 / 0x80000000 = 0.00%)
+; CHECK: Successors according to CFG: BB#3(2147481600) BB#1(2048) BB#4(1024) BB#6(512) BB#8(256)
 
 target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64--windows-msvc18.0.0"
diff --git a/test/CodeGen/X86/stack-protector-weight.ll b/test/CodeGen/X86/stack-protector-weight.ll
index dea66d28e3dd..16877ef70a31 100644
--- a/test/CodeGen/X86/stack-protector-weight.ll
+++ b/test/CodeGen/X86/stack-protector-weight.ll
@@ -2,13 +2,13 @@
 ; RUN: llc -mtriple=x86_64-apple-darwin -print-machineinstrs=expand-isel-pseudos -enable-selectiondag-sp=false %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=IR
 
 ; SELDAG: # Machine code for function test_branch_weights:
-; SELDAG: Successors according to CFG: BB#[[SUCCESS:[0-9]+]]({{[0-9a-fx/= ]+}}100.00%) BB#[[FAILURE:[0-9]+]]
+; SELDAG: Successors according to CFG: BB#[[SUCCESS:[0-9]+]](2147481600) BB#[[FAILURE:[0-9]+]](2048)
 ; SELDAG: BB#[[FAILURE]]:
 ; SELDAG: CALL64pcrel32 <es:__stack_chk_fail>
 ; SELDAG: BB#[[SUCCESS]]:
 
 ; IR: # Machine code for function test_branch_weights:
-; IR: Successors according to CFG: BB#[[SUCCESS:[0-9]+]]({{[0-9a-fx/= ]+}}100.00%) BB#[[FAILURE:[0-9]+]]
+; IR: Successors according to CFG: BB#[[SUCCESS:[0-9]+]](2147481600) BB#[[FAILURE:[0-9]+]](2048)
 ; IR: BB#[[SUCCESS]]:
 ; IR: BB#[[FAILURE]]:
 ; IR: CALL64pcrel32 <ga:@__stack_chk_fail>
diff --git a/test/CodeGen/X86/switch-edge-weight.ll b/test/CodeGen/X86/switch-edge-weight.ll
index 6f594868c7ad..9026f6f05f0e 100644
--- a/test/CodeGen/X86/switch-edge-weight.ll
+++ b/test/CodeGen/X86/switch-edge-weight.ll
@@ -34,22 +34,22 @@ sw.epilog:
 ; CHECK: BB#0:
 ; BB#0 to BB#4: [0, 1133] (65 = 60 + 5)
 ; BB#0 to BB#5: [1134, UINT32_MAX] (25 = 20 + 5)
-; CHECK: Successors according to CFG: BB#4({{[0-9a-fx/= ]+}}72.22%) BB#5({{[0-9a-fx/= ]+}}27.78%)
+; CHECK: Successors according to CFG: BB#4(1550960411) BB#5(596523235)
 ;
 ; CHECK: BB#4:
 ; BB#4 to BB#1: [155, 159] (50)
 ; BB#4 to BB#5: [0, 1133] - [155, 159] (15 = 10 + 5)
-; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}76.92%) BB#7({{[0-9a-fx/= ]+}}23.08%)
+; CHECK: Successors according to CFG: BB#1(1193046470) BB#7(357913941)
 ;
 ; CHECK: BB#5:
 ; BB#5 to BB#1: {1140} (10)
 ; BB#5 to BB#6: [1134, UINT32_MAX] - {1140} (15 = 10 + 5)
-; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}40.00%) BB#6({{[0-9a-fx/= ]+}}60.00%)
+; CHECK: Successors according to CFG: BB#1(238609294) BB#6(357913941)
 ;
 ; CHECK: BB#6:
 ; BB#6 to BB#1: {1134} (10)
 ; BB#6 to BB#2: [1134, UINT32_MAX] - {1134, 1140} (5)
-; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}66.67%) BB#2({{[0-9a-fx/= ]+}}33.33%)
+; CHECK: Successors according to CFG: BB#1(238609294) BB#2(119304647)
 }
 
 ; CHECK-LABEL: test2
@@ -102,7 +102,7 @@ sw.epilog:
 ; CHECK: BB#0:
 ; BB#0 to BB#6: {0} + [15, UINT32_MAX] (5)
 ; BB#0 to BB#8: [1, 14] (jump table) (65 = 60 + 5)
-; CHECK: Successors according to CFG: BB#6({{[0-9a-fx/= ]+}}7.14%) BB#8({{[0-9a-fx/= ]+}}92.86%
+; CHECK: Successors according to CFG: BB#6(153391689) BB#8(1994091957)
 ;
 ; CHECK: BB#8:
 ; BB#8 to BB#1: {1} (10)
@@ -111,7 +111,7 @@ sw.epilog:
 ; BB#8 to BB#3: {11} (10)
 ; BB#8 to BB#4: {12} (10)
 ; BB#8 to BB#5: {13, 14} (20)
-; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}14.29%) BB#6({{[0-9a-fx/= ]+}}7.14%) BB#2({{[0-9a-fx/= ]+}}14.29%) BB#3({{[0-9a-fx/= ]+}}14.29%) BB#4({{[0-9a-fx/= ]+}}14.29%) BB#5({{[0-9a-fx/= ]+}}28.57%)
+; CHECK: Successors according to CFG: BB#1(306783378) BB#6(153391689) BB#2(306783378) BB#3(306783378) BB#4(306783378) BB#5(613566756)
 }
 
 ; CHECK-LABEL: test3
@@ -163,7 +163,7 @@ sw.epilog:
 ; CHECK: BB#0:
 ; BB#0 to BB#6: [0, 9] + [15, UINT32_MAX] {10}
 ; BB#0 to BB#8: [10, 14] (jump table) (50)
-; CHECK: Successors according to CFG: BB#6({{[0-9a-fx/= ]+}}16.67%) BB#8({{[0-9a-fx/= ]+}}83.33%)
+; CHECK: Successors according to CFG: BB#6(357913941) BB#8(1789569705)
 ;
 ; CHECK: BB#8:
 ; BB#8 to BB#1: {10} (10)
@@ -171,7 +171,7 @@ sw.epilog:
 ; BB#8 to BB#3: {12} (10)
 ; BB#8 to BB#4: {13} (10)
 ; BB#8 to BB#5: {14} (10)
-; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}20.00%) BB#2({{[0-9a-fx/= ]+}}20.00%) BB#3({{[0-9a-fx/= ]+}}20.00%) BB#4({{[0-9a-fx/= ]+}}20.00%) BB#5({{[0-9a-fx/= ]+}}20.00%)
+; CHECK: Successors according to CFG: BB#1(357913941) BB#2(357913941) BB#3(357913941) BB#4(357913941) BB#5(357913941)
 }
 
 ; CHECK-LABEL: test4
@@ -216,12 +216,12 @@ sw.epilog:
 ; CHECK: BB#0:
 ; BB#0 to BB#6: [0, 110] + [116, UINT32_MAX] (20)
 ; BB#0 to BB#7: [111, 115] (bit test) (50)
-; CHECK: Successors according to CFG: BB#6({{[0-9a-fx/= ]+}}28.57%) BB#7({{[0-9a-fx/= ]+}}71.43%)
+; CHECK: Successors according to CFG: BB#6(613566756) BB#7(1533916890)
 ;
 ; CHECK: BB#7:
 ; BB#7 to BB#2: {111, 114, 115} (30)
 ; BB#7 to BB#3: {112, 113} (20)
-; CHECK: Successors according to CFG: BB#2({{[0-9a-fx/= ]+}}60.00%) BB#3({{[0-9a-fx/= ]+}}40.00%)
+; CHECK: Successors according to CFG: BB#2(920350134) BB#3(613566756)
 }
 
 ; CHECK-LABEL: test5
@@ -273,7 +273,7 @@ sw.epilog:
 ; CHECK: BB#0:
 ; BB#0 to BB#6: [10, UINT32_MAX] (15)
 ; BB#0 to BB#8: [1, 5, 7, 9] (jump table) (45)
-; CHECK: Successors according to CFG: BB#8({{[0-9a-fx/= ]+}}25.00%) BB#9({{[0-9a-fx/= ]+}}75.00%)
+; CHECK: Successors according to CFG: BB#8(536870912) BB#9(1610612734)
 }
 
 !1 = !{!"branch_weights", i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10} 
diff --git a/test/CodeGen/X86/switch-jump-table.ll b/test/CodeGen/X86/switch-jump-table.ll
index 896a067da230..3cfee1cd80e6 100644
--- a/test/CodeGen/X86/switch-jump-table.ll
+++ b/test/CodeGen/X86/switch-jump-table.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -mtriple=i686-pc-gnu-linux < %s | FileCheck %s -check-prefix=CHECK
-; RUN: llc -mtriple=i686-pc-gnu-linux -print-machineinstrs=expand-isel-pseudos %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=CHECK-JT-PROB
+; RUN: llc -mtriple=i686-pc-gnu-linux -print-machineinstrs=expand-isel-pseudos %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=CHECK-JT-WEIGHT
 
 
 ; An unreachable default destination is replaced with the most popular case label.
@@ -54,9 +54,9 @@ default:
 ; Check if branch probabilities are correctly assigned to the jump table.
 
 define void @bar(i32 %x, i32* %to) {
-; CHECK-JT-PROB-LABEL: bar:
-; CHECK-JT-PROB: Successors according to CFG: BB#6({{[0-9a-fx/= ]+}}14.29%) BB#8({{[0-9a-fx/= ]+}}85.71%)
-; CHECK-JT-PROB: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}16.67%) BB#2({{[0-9a-fx/= ]+}}16.67%) BB#3({{[0-9a-fx/= ]+}}16.67%) BB#4({{[0-9a-fx/= ]+}}16.67%) BB#5({{[0-9a-fx/= ]+}}33.33%)
+; CHECK-JT-WEIGHT-LABEL: bar:
+; CHECK-JT-WEIGHT: Successors according to CFG: BB#6(306783378) BB#8(1840700268)
+; CHECK-JT-WEIGHT: Successors according to CFG: BB#1(306783378) BB#2(306783378) BB#3(306783378) BB#4(306783378) BB#5(613566756)
 
 entry:
   switch i32 %x, label %default [

From 85e0db955c7d5f7b1baf3d337ff0fb0cd8fb0404 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Tue, 1 Dec 2015 04:19:56 +0000
Subject: [PATCH 046/186] RegisterPressure: Split RegisterOperands analysis
 code from result object; NFC

This is in preparation to expose the RegisterOperands class as
RegisterPressure API.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254368 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/RegisterPressure.cpp | 103 ++++++++++++++++++-------------
 1 file changed, 60 insertions(+), 43 deletions(-)

diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp
index 6ff16e551be2..3c9da13c4a39 100644
--- a/lib/CodeGen/RegisterPressure.cpp
+++ b/lib/CodeGen/RegisterPressure.cpp
@@ -314,71 +314,88 @@ static bool containsReg(ArrayRef<unsigned> RegUnits, unsigned RegUnit) {
 }
 
 namespace {
-/// Collect this instruction's unique uses and defs into SmallVectors for
-/// processing defs and uses in order.
-///
-/// FIXME: always ignore tied opers
-class RegisterOperands {
-  const TargetRegisterInfo *TRI;
-  const MachineRegisterInfo *MRI;
-  bool IgnoreDead;
 
+/// List of register defined and used by a machine instruction.
+class RegisterOperands {
 public:
   SmallVector<unsigned, 8> Uses;
   SmallVector<unsigned, 8> Defs;
   SmallVector<unsigned, 8> DeadDefs;
 
-  RegisterOperands(const TargetRegisterInfo *tri,
-                   const MachineRegisterInfo *mri, bool ID = false):
-    TRI(tri), MRI(mri), IgnoreDead(ID) {}
+  void collect(const MachineInstr &MI, const TargetRegisterInfo &TRI,
+               const MachineRegisterInfo &MRI, bool IgnoreDead = false);
+};
 
-  /// Push this operand's register onto the correct vector.
-  void collect(const MachineOperand &MO) {
+/// Collect this instruction's unique uses and defs into SmallVectors for
+/// processing defs and uses in order.
+///
+/// FIXME: always ignore tied opers
+class RegisterOperandsCollector {
+  RegisterOperands &RegOpers;
+  const TargetRegisterInfo &TRI;
+  const MachineRegisterInfo &MRI;
+  bool IgnoreDead;
+
+  RegisterOperandsCollector(RegisterOperands &RegOpers,
+                            const TargetRegisterInfo &TRI,
+                            const MachineRegisterInfo &MRI,
+                            bool IgnoreDead)
+    : RegOpers(RegOpers), TRI(TRI), MRI(MRI), IgnoreDead(IgnoreDead) {}
+
+  void collectInstr(const MachineInstr &MI) const {
+    for (ConstMIBundleOperands OperI(&MI); OperI.isValid(); ++OperI)
+      collectOperand(*OperI);
+
+    // Remove redundant physreg dead defs.
+    SmallVectorImpl<unsigned>::iterator I =
+      std::remove_if(RegOpers.DeadDefs.begin(), RegOpers.DeadDefs.end(),
+                     std::bind1st(std::ptr_fun(containsReg), RegOpers.Defs));
+    RegOpers.DeadDefs.erase(I, RegOpers.DeadDefs.end());
+  }
+
+  /// Push this operand's register onto the correct vectors.
+  void collectOperand(const MachineOperand &MO) const {
     if (!MO.isReg() || !MO.getReg())
       return;
+    unsigned Reg = MO.getReg();
     if (MO.readsReg())
-      pushRegUnits(MO.getReg(), Uses);
+      pushRegUnits(Reg, RegOpers.Uses);
     if (MO.isDef()) {
       if (MO.isDead()) {
         if (!IgnoreDead)
-          pushRegUnits(MO.getReg(), DeadDefs);
-      }
-      else
-        pushRegUnits(MO.getReg(), Defs);
+          pushRegUnits(Reg, RegOpers.DeadDefs);
+      } else
+        pushRegUnits(Reg, RegOpers.Defs);
     }
   }
 
-protected:
-  void pushRegUnits(unsigned Reg, SmallVectorImpl<unsigned> &RegUnits) {
+  void pushRegUnits(unsigned Reg, SmallVectorImpl<unsigned> &RegUnits) const {
     if (TargetRegisterInfo::isVirtualRegister(Reg)) {
       if (containsReg(RegUnits, Reg))
         return;
       RegUnits.push_back(Reg);
-    }
-    else if (MRI->isAllocatable(Reg)) {
-      for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) {
+    } else if (MRI.isAllocatable(Reg)) {
+      for (MCRegUnitIterator Units(Reg, &TRI); Units.isValid(); ++Units) {
         if (containsReg(RegUnits, *Units))
           continue;
         RegUnits.push_back(*Units);
       }
     }
   }
+
+  friend class RegisterOperands;
 };
-} // namespace
 
-/// Collect physical and virtual register operands.
-static void collectOperands(const MachineInstr *MI,
-                            RegisterOperands &RegOpers) {
-  for (ConstMIBundleOperands OperI(MI); OperI.isValid(); ++OperI)
-    RegOpers.collect(*OperI);
-
-  // Remove redundant physreg dead defs.
-  SmallVectorImpl<unsigned>::iterator I =
-    std::remove_if(RegOpers.DeadDefs.begin(), RegOpers.DeadDefs.end(),
-                   std::bind1st(std::ptr_fun(containsReg), RegOpers.Defs));
-  RegOpers.DeadDefs.erase(I, RegOpers.DeadDefs.end());
+void RegisterOperands::collect(const MachineInstr &MI,
+                               const TargetRegisterInfo &TRI,
+                               const MachineRegisterInfo &MRI,
+                               bool IgnoreDead) {
+  RegisterOperandsCollector Collector(*this, TRI, MRI, IgnoreDead);
+  Collector.collectInstr(MI);
 }
 
+} // namespace
+
 /// Initialize an array of N PressureDiffs.
 void PressureDiffs::init(unsigned N) {
   Size = N;
@@ -505,8 +522,8 @@ bool RegPressureTracker::recede(SmallVectorImpl<unsigned> *LiveUses,
   if (RequireIntervals && isTopClosed())
     static_cast<IntervalPressure&>(P).openTop(SlotIdx);
 
-  RegisterOperands RegOpers(TRI, MRI);
-  collectOperands(CurrPos, RegOpers);
+  RegisterOperands RegOpers;
+  RegOpers.collect(*CurrPos, *TRI, *MRI);
 
   if (PDiff)
     collectPDiff(*PDiff, RegOpers, MRI);
@@ -594,8 +611,8 @@ bool RegPressureTracker::advance() {
       static_cast<RegionPressure&>(P).openBottom(CurrPos);
   }
 
-  RegisterOperands RegOpers(TRI, MRI);
-  collectOperands(CurrPos, RegOpers);
+  RegisterOperands RegOpers;
+  RegOpers.collect(*CurrPos, *TRI, *MRI);
 
   for (unsigned i = 0, e = RegOpers.Uses.size(); i < e; ++i) {
     unsigned Reg = RegOpers.Uses[i];
@@ -728,8 +745,8 @@ void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) {
   assert(!MI->isDebugValue() && "Expect a nondebug instruction.");
 
   // Account for register pressure similar to RegPressureTracker::recede().
-  RegisterOperands RegOpers(TRI, MRI, /*IgnoreDead=*/true);
-  collectOperands(MI, RegOpers);
+  RegisterOperands RegOpers;
+  RegOpers.collect(*MI, *TRI, *MRI, /*IgnoreDead=*/true);
 
   // Boost max pressure for all dead defs together.
   // Since CurrSetPressure and MaxSetPressure
@@ -923,8 +940,8 @@ void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) {
   assert(!MI->isDebugValue() && "Expect a nondebug instruction.");
 
   // Account for register pressure similar to RegPressureTracker::recede().
-  RegisterOperands RegOpers(TRI, MRI);
-  collectOperands(MI, RegOpers);
+  RegisterOperands RegOpers;
+  RegOpers.collect(*MI, *TRI, *MRI);
 
   // Kill liveness at last uses. Assume allocatable physregs are single-use
   // rather than checking LiveIntervals.

From acac7f59d5e9e0da8ae63d26ca2ae946902d784c Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Tue, 1 Dec 2015 04:19:58 +0000
Subject: [PATCH 047/186] RegisterPressure: There is no need to make
 discoverLive{In|Out} public

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254369 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/RegisterPressure.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/llvm/CodeGen/RegisterPressure.h b/include/llvm/CodeGen/RegisterPressure.h
index 166bd8686891..b6e0a20526fc 100644
--- a/include/llvm/CodeGen/RegisterPressure.h
+++ b/include/llvm/CodeGen/RegisterPressure.h
@@ -365,9 +365,6 @@ class RegPressureTracker {
     return CurrSetPressure;
   }
 
-  void discoverLiveOut(unsigned Reg);
-  void discoverLiveIn(unsigned Reg);
-
   bool isTopClosed() const;
   bool isBottomClosed() const;
 
@@ -442,6 +439,9 @@ class RegPressureTracker {
   void dump() const;
 
 protected:
+  void discoverLiveOut(unsigned Reg);
+  void discoverLiveIn(unsigned Reg);
+
   const LiveRange *getLiveRange(unsigned Reg) const;
 
   void increaseRegPressure(ArrayRef<unsigned> Regs);

From 4bee6aee5acbc40b98a19e09739470ae1fdd46c9 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Tue, 1 Dec 2015 04:20:01 +0000
Subject: [PATCH 048/186] RegisterPressure: There is no need to make
 getCurSlot() public

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254370 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/RegisterPressure.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/llvm/CodeGen/RegisterPressure.h b/include/llvm/CodeGen/RegisterPressure.h
index b6e0a20526fc..42131c831ea1 100644
--- a/include/llvm/CodeGen/RegisterPressure.h
+++ b/include/llvm/CodeGen/RegisterPressure.h
@@ -328,10 +328,6 @@ class RegPressureTracker {
   // position changes while pressure does not.
   void setPos(MachineBasicBlock::const_iterator Pos) { CurrPos = Pos; }
 
-  /// \brief Get the SlotIndex for the first nondebug instruction including or
-  /// after the current position.
-  SlotIndex getCurrSlot() const;
-
   /// Recede across the previous instruction.
   bool recede(SmallVectorImpl<unsigned> *LiveUses = nullptr,
               PressureDiff *PDiff = nullptr);
@@ -442,6 +438,10 @@ class RegPressureTracker {
   void discoverLiveOut(unsigned Reg);
   void discoverLiveIn(unsigned Reg);
 
+  /// \brief Get the SlotIndex for the first nondebug instruction including or
+  /// after the current position.
+  SlotIndex getCurrSlot() const;
+
   const LiveRange *getLiveRange(unsigned Reg) const;
 
   void increaseRegPressure(ArrayRef<unsigned> Regs);

From bce24cfb9ee40462a5fdc09503dfd3305e45326c Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Tue, 1 Dec 2015 04:20:04 +0000
Subject: [PATCH 049/186] RegisterPressure: Remove support for
 recede()/advance() at MBB boundaries

Nobody was checking the returnvalue of recede()/advance() so we can
simply replace this code with asserts.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254371 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/RegisterPressure.h |  7 +++----
 lib/CodeGen/RegisterPressure.cpp        | 23 +++++------------------
 2 files changed, 8 insertions(+), 22 deletions(-)

diff --git a/include/llvm/CodeGen/RegisterPressure.h b/include/llvm/CodeGen/RegisterPressure.h
index 42131c831ea1..e296701d8e8c 100644
--- a/include/llvm/CodeGen/RegisterPressure.h
+++ b/include/llvm/CodeGen/RegisterPressure.h
@@ -329,11 +329,11 @@ class RegPressureTracker {
   void setPos(MachineBasicBlock::const_iterator Pos) { CurrPos = Pos; }
 
   /// Recede across the previous instruction.
-  bool recede(SmallVectorImpl<unsigned> *LiveUses = nullptr,
+  void recede(SmallVectorImpl<unsigned> *LiveUses = nullptr,
               PressureDiff *PDiff = nullptr);
 
   /// Advance across the current instruction.
-  bool advance();
+  void advance();
 
   /// Finalize the region boundaries and recored live ins and live outs.
   void closeRegion();
@@ -350,8 +350,7 @@ class RegPressureTracker {
   ArrayRef<unsigned> getLiveThru() const { return LiveThruPressure; }
 
   /// Get the resulting register pressure over the traversed region.
-  /// This result is complete if either advance() or recede() has returned true,
-  /// or if closeRegion() was explicitly invoked.
+  /// This result is complete if closeRegion() was explicitly invoked.
   RegisterPressure &getPressure() { return P; }
   const RegisterPressure &getPressure() const { return P; }
 
diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp
index 3c9da13c4a39..18002c8fbd56 100644
--- a/lib/CodeGen/RegisterPressure.cpp
+++ b/lib/CodeGen/RegisterPressure.cpp
@@ -491,13 +491,9 @@ void RegPressureTracker::discoverLiveOut(unsigned Reg) {
 /// registers that are both defined and used by the instruction.  If a pressure
 /// difference pointer is provided record the changes is pressure caused by this
 /// instruction independent of liveness.
-bool RegPressureTracker::recede(SmallVectorImpl<unsigned> *LiveUses,
+void RegPressureTracker::recede(SmallVectorImpl<unsigned> *LiveUses,
                                 PressureDiff *PDiff) {
-  // Check for the top of the analyzable region.
-  if (CurrPos == MBB->begin()) {
-    closeRegion();
-    return false;
-  }
+  assert(CurrPos != MBB->begin());
   if (!isBottomClosed())
     closeBottom();
 
@@ -509,11 +505,8 @@ bool RegPressureTracker::recede(SmallVectorImpl<unsigned> *LiveUses,
   do
     --CurrPos;
   while (CurrPos != MBB->begin() && CurrPos->isDebugValue());
+  assert(!CurrPos->isDebugValue());
 
-  if (CurrPos->isDebugValue()) {
-    closeRegion();
-    return false;
-  }
   SlotIndex SlotIdx;
   if (RequireIntervals)
     SlotIdx = LIS->getInstructionIndex(CurrPos).getRegSlot();
@@ -584,18 +577,13 @@ bool RegPressureTracker::recede(SmallVectorImpl<unsigned> *LiveUses,
         UntiedDefs.insert(Reg);
     }
   }
-  return true;
 }
 
 /// Advance across the current instruction.
-bool RegPressureTracker::advance() {
+void RegPressureTracker::advance() {
   assert(!TrackUntiedDefs && "unsupported mode");
 
-  // Check for the bottom of the analyzable region.
-  if (CurrPos == MBB->end()) {
-    closeRegion();
-    return false;
-  }
+  assert(CurrPos != MBB->end());
   if (!isTopClosed())
     closeTop();
 
@@ -653,7 +641,6 @@ bool RegPressureTracker::advance() {
   do
     ++CurrPos;
   while (CurrPos != MBB->end() && CurrPos->isDebugValue());
-  return true;
 }
 
 /// Find the max change in excess pressure across all sets.

From 104db6b94a127e4e68db7528f50d3fcf24d543bd Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Tue, 1 Dec 2015 04:20:06 +0000
Subject: [PATCH 050/186] RegisterPressure: If we do not collect dead defs the
 list must be empty

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254372 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/RegisterPressure.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp
index 18002c8fbd56..8eba0edc8bfe 100644
--- a/lib/CodeGen/RegisterPressure.cpp
+++ b/lib/CodeGen/RegisterPressure.cpp
@@ -734,11 +734,7 @@ void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) {
   // Account for register pressure similar to RegPressureTracker::recede().
   RegisterOperands RegOpers;
   RegOpers.collect(*MI, *TRI, *MRI, /*IgnoreDead=*/true);
-
-  // Boost max pressure for all dead defs together.
-  // Since CurrSetPressure and MaxSetPressure
-  increaseRegPressure(RegOpers.DeadDefs);
-  decreaseRegPressure(RegOpers.DeadDefs);
+  assert(RegOpers.DeadDefs.size() == 0);
 
   // Kill liveness at live defs.
   for (unsigned i = 0, e = RegOpers.Defs.size(); i < e; ++i) {

From 442a04a282ce8f9998be131be50f5a5c73d7791f Mon Sep 17 00:00:00 2001
From: Colin LeMahieu <colinl@codeaurora.org>
Date: Tue, 1 Dec 2015 04:56:25 +0000
Subject: [PATCH 051/186] [Hexagon] Disabling failing safestack test

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254375 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/DebugInfo/Generic/safestack-byval.ll | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/DebugInfo/Generic/safestack-byval.ll b/test/DebugInfo/Generic/safestack-byval.ll
index 24df3815068e..1329a95a2201 100644
--- a/test/DebugInfo/Generic/safestack-byval.ll
+++ b/test/DebugInfo/Generic/safestack-byval.ll
@@ -2,6 +2,7 @@
 ; points to neither an argument nor an alloca. This kind of IR is generated by
 ; SafeStack for unsafe byval arguments.
 ; RUN: llc -stop-after expand-isel-pseudos %s -o /dev/null | FileCheck %s
+; XFAIL: hexagon
 
 ; This was built by compiling the following source with SafeStack and
 ; simplifying the result a little.

From 5155021519d454bbd00404a5a0ac66377dffe7be Mon Sep 17 00:00:00 2001
From: Cong Hou <congh@google.com>
Date: Tue, 1 Dec 2015 05:29:22 +0000
Subject: [PATCH 052/186] Replace all weight-based interfaces in MBB with
 probability-based interfaces, and update all uses of old interfaces.

(This is the second attempt to submit this patch. The first caused two assertion
 failures and was reverted. See https://llvm.org/bugs/show_bug.cgi?id=25687)

The patch in http://reviews.llvm.org/D13745 is broken into four parts:

1. New interfaces without functional changes (http://reviews.llvm.org/D13908).
2. Use new interfaces in SelectionDAG, while in other passes treat probabilities
as weights (http://reviews.llvm.org/D14361).
3. Use new interfaces in all other passes.
4. Remove old interfaces.

This patch is 3+4 above. In this patch, MBB won't provide weight-based
interfaces any more, which are totally replaced by probability-based ones.
The interface addSuccessor() is redesigned so that the default probability is
unknown. We allow unknown probabilities but don't allow using it together
with known probabilities in successor list. That is to say, we either have a
list of successors with all known probabilities, or all unknown
probabilities. In the latter case, we assume each successor has 1/N
probability where N is the number of successors. An assertion checks if the
user is attempting to add a successor with the disallowed mixed use as stated
above. This can help us catch many misuses.

All uses of weight-based interfaces are now updated to use probability-based
ones.


Differential revision: http://reviews.llvm.org/D14973




git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254377 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/BranchProbabilityInfo.h |   3 +
 include/llvm/CodeGen/MachineBasicBlock.h      |  46 +-----
 .../CodeGen/MachineBranchProbabilityInfo.h    |  22 +--
 include/llvm/Support/BranchProbability.h      |  31 +++-
 lib/Analysis/BranchProbabilityInfo.cpp        |   6 +
 lib/CodeGen/BranchFolding.cpp                 |  16 +-
 lib/CodeGen/IfConversion.cpp                  | 155 ++++++++----------
 lib/CodeGen/MIRParser/MIParser.cpp            |   3 +-
 lib/CodeGen/MIRPrinter.cpp                    |   4 +-
 lib/CodeGen/MachineBasicBlock.cpp             | 142 ++++------------
 lib/CodeGen/MachineBlockPlacement.cpp         |  71 ++++----
 lib/CodeGen/MachineBranchProbabilityInfo.cpp  |  82 +++------
 lib/CodeGen/TailDuplication.cpp               |   6 +-
 lib/Support/BranchProbability.cpp             |  20 ++-
 lib/Target/AMDGPU/AMDILCFGStructurizer.cpp    |   6 +-
 lib/Target/ARM/ARMConstantIslandPass.cpp      |   3 +-
 lib/Target/ARM/ARMISelLowering.cpp            |   2 +-
 lib/Target/Hexagon/HexagonCFGOptimizer.cpp    |   6 +-
 lib/Target/Mips/MipsLongBranch.cpp            |   3 +-
 test/CodeGen/ARM/ifcvt-branch-weight-bug.ll   |  14 +-
 test/CodeGen/ARM/ifcvt-branch-weight.ll       |   2 +-
 test/CodeGen/ARM/ifcvt-iter-indbr.ll          |  10 +-
 test/CodeGen/ARM/tail-merge-branch-weight.ll  |   2 +-
 test/CodeGen/ARM/taildup-branch-weight.ll     |   4 +-
 test/CodeGen/Generic/MachineBranchProb.ll     |   8 +-
 test/CodeGen/Hexagon/ifcvt-edge-weight.ll     |   2 +-
 test/CodeGen/MIR/X86/newline-handling.mir     |   4 +-
 .../X86/successor-basic-blocks-weights.mir    |   6 +-
 .../MIR/X86/successor-basic-blocks.mir        |   4 +-
 test/CodeGen/X86/MachineBranchProb.ll         |   4 +-
 test/CodeGen/X86/catchpad-weight.ll           |   2 +-
 test/CodeGen/X86/stack-protector-weight.ll    |   4 +-
 test/CodeGen/X86/switch-edge-weight.ll        |  22 +--
 test/CodeGen/X86/switch-jump-table.ll         |   8 +-
 34 files changed, 301 insertions(+), 422 deletions(-)

diff --git a/include/llvm/Analysis/BranchProbabilityInfo.h b/include/llvm/Analysis/BranchProbabilityInfo.h
index 89dec14b2b3e..69dae5e90785 100644
--- a/include/llvm/Analysis/BranchProbabilityInfo.h
+++ b/include/llvm/Analysis/BranchProbabilityInfo.h
@@ -61,6 +61,9 @@ class BranchProbabilityInfo {
   BranchProbability getEdgeProbability(const BasicBlock *Src,
                                        const BasicBlock *Dst) const;
 
+  BranchProbability getEdgeProbability(const BasicBlock *Src,
+                                       succ_const_iterator Dst) const;
+
   /// \brief Test if an edge is hot relative to other out-edges of the Src.
   ///
   /// Check whether this edge out of the source block is 'hot'. We define hot
diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h
index a2b1a850ec76..ac87f4f901f5 100644
--- a/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/include/llvm/CodeGen/MachineBasicBlock.h
@@ -91,13 +91,6 @@ class MachineBasicBlock
   std::vector<MachineBasicBlock *> Predecessors;
   std::vector<MachineBasicBlock *> Successors;
 
-  /// Keep track of the weights to the successors. This vector has the same
-  /// order as Successors, or it is empty if we don't use it (disable
-  /// optimization).
-  std::vector<uint32_t> Weights;
-  typedef std::vector<uint32_t>::iterator weight_iterator;
-  typedef std::vector<uint32_t>::const_iterator const_weight_iterator;
-
   /// Keep track of the probabilities to the successors. This vector has the
   /// same order as Successors, or it is empty if we don't use it (disable
   /// optimization).
@@ -440,26 +433,16 @@ class MachineBasicBlock
 
   // Machine-CFG mutators
 
-  /// Add Succ as a successor of this MachineBasicBlock.  The Predecessors list
-  /// of Succ is automatically updated. WEIGHT parameter is stored in Weights
-  /// list and it may be used by MachineBranchProbabilityInfo analysis to
-  /// calculate branch probability.
-  ///
-  /// Note that duplicate Machine CFG edges are not allowed.
-  void addSuccessor(MachineBasicBlock *Succ, uint32_t Weight = 0);
-
-  /// Add Succ as a successor of this MachineBasicBlock.  The Predecessors list
-  /// of Succ is automatically updated. The weight is not provided because BPI
-  /// is not available (e.g. -O0 is used), in which case edge weights won't be
-  /// used. Using this interface can save some space.
-  void addSuccessorWithoutWeight(MachineBasicBlock *Succ);
-
   /// Add Succ as a successor of this MachineBasicBlock.  The Predecessors list
   /// of Succ is automatically updated. PROB parameter is stored in
-  /// Probabilities list.
+  /// Probabilities list. The default probability is set as unknown. Mixing
+  /// known and unknown probabilities in successor list is not allowed. When all
+  /// successors have unknown probabilities, 1 / N is returned as the
+  /// probability for each successor, where N is the number of successors.
   ///
   /// Note that duplicate Machine CFG edges are not allowed.
-  void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob);
+  void addSuccessor(MachineBasicBlock *Succ,
+                    BranchProbability Prob = BranchProbability::getUnknown());
 
   /// Add Succ as a successor of this MachineBasicBlock.  The Predecessors list
   /// of Succ is automatically updated. The probability is not provided because
@@ -467,9 +450,6 @@ class MachineBasicBlock
   /// won't be used. Using this interface can save some space.
   void addSuccessorWithoutProb(MachineBasicBlock *Succ);
 
-  /// Set successor weight of a given iterator.
-  void setSuccWeight(succ_iterator I, uint32_t Weight);
-
   /// Set successor probability of a given iterator.
   void setSuccProbability(succ_iterator I, BranchProbability Prob);
 
@@ -488,7 +468,7 @@ class MachineBasicBlock
   /// Return the iterator to the element after the one removed.
   succ_iterator removeSuccessor(succ_iterator I);
 
-  /// Replace successor OLD with NEW and update weight info.
+  /// Replace successor OLD with NEW and update probability info.
   void replaceSuccessor(MachineBasicBlock *Old, MachineBasicBlock *New);
 
   /// Transfers all the successors from MBB to this machine basic block (i.e.,
@@ -500,9 +480,6 @@ class MachineBasicBlock
   /// operands in the successor blocks which refer to FromMBB to refer to this.
   void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB);
 
-  /// Return true if any of the successors have weights attached to them.
-  bool hasSuccessorWeights() const { return !Weights.empty(); }
-
   /// Return true if any of the successors have probabilities attached to them.
   bool hasSuccessorProbabilities() const { return !Probs.empty(); }
 
@@ -759,10 +736,6 @@ class MachineBasicBlock
 
 
 private:
-  /// Return weight iterator corresponding to the I successor iterator.
-  weight_iterator getWeightIterator(succ_iterator I);
-  const_weight_iterator getWeightIterator(const_succ_iterator I) const;
-
   /// Return probability iterator corresponding to the I successor iterator.
   probability_iterator getProbabilityIterator(succ_iterator I);
   const_probability_iterator
@@ -771,11 +744,6 @@ class MachineBasicBlock
   friend class MachineBranchProbabilityInfo;
   friend class MIPrinter;
 
-  /// Return weight of the edge from this block to MBB. This method should NOT
-  /// be called directly, but by using getEdgeWeight method from
-  /// MachineBranchProbabilityInfo class.
-  uint32_t getSuccWeight(const_succ_iterator Succ) const;
-
   /// Return probability of the edge from this block to MBB. This method should
   /// NOT be called directly, but by using getEdgeProbability method from
   /// MachineBranchProbabilityInfo class.
diff --git a/include/llvm/CodeGen/MachineBranchProbabilityInfo.h b/include/llvm/CodeGen/MachineBranchProbabilityInfo.h
index 058ab32f3aa9..608e8d257874 100644
--- a/include/llvm/CodeGen/MachineBranchProbabilityInfo.h
+++ b/include/llvm/CodeGen/MachineBranchProbabilityInfo.h
@@ -55,10 +55,15 @@ class MachineBranchProbabilityInfo : public ImmutablePass {
   uint32_t getEdgeWeight(const MachineBasicBlock *Src,
                          MachineBasicBlock::const_succ_iterator Dst) const;
 
-  // Get sum of the block successors' weights, potentially scaling them to fit
-  // within 32-bits. If scaling is required, sets Scale based on the necessary
-  // adjustment. Any edge weights used with the sum should be divided by Scale.
-  uint32_t getSumForBlock(const MachineBasicBlock *MBB, uint32_t &Scale) const;
+  // Return edge probability.
+  BranchProbability getEdgeProbability(const MachineBasicBlock *Src,
+                                       const MachineBasicBlock *Dst) const;
+
+  // Same as above, but using a const_succ_iterator from Src. This is faster
+  // when the iterator is already available.
+  BranchProbability
+  getEdgeProbability(const MachineBasicBlock *Src,
+                     MachineBasicBlock::const_succ_iterator Dst) const;
 
   // A 'Hot' edge is an edge which probability is >= 80%.
   bool isEdgeHot(const MachineBasicBlock *Src,
@@ -68,15 +73,6 @@ class MachineBranchProbabilityInfo : public ImmutablePass {
   // NB: This routine's complexity is linear on the number of successors.
   MachineBasicBlock *getHotSucc(MachineBasicBlock *MBB) const;
 
-  // Return a probability as a fraction between 0 (0% probability) and
-  // 1 (100% probability), however the value is never equal to 0, and can be 1
-  // only iff SRC block has only one successor.
-  // NB: This routine's complexity is linear on the number of successors of
-  // Src. Querying sequentially for each successor's probability is a quadratic
-  // query pattern.
-  BranchProbability getEdgeProbability(const MachineBasicBlock *Src,
-                                       const MachineBasicBlock *Dst) const;
-
   // Print value between 0 (0% probability) and 1 (100% probability),
   // however the value is never equal to 0, and can be 1 only iff SRC block
   // has only one successor.
diff --git a/include/llvm/Support/BranchProbability.h b/include/llvm/Support/BranchProbability.h
index 3620d4d5d772..2548384f346b 100644
--- a/include/llvm/Support/BranchProbability.h
+++ b/include/llvm/Support/BranchProbability.h
@@ -53,6 +53,9 @@ class BranchProbability {
   // Create a BranchProbability object with the given numerator and 1<<31
   // as denominator.
   static BranchProbability getRaw(uint32_t N) { return BranchProbability(N); }
+  // Create a BranchProbability object from 64-bit integers.
+  static BranchProbability getBranchProbability(uint64_t Numerator,
+                                                uint64_t Denominator);
 
   // Normalize given probabilties so that the sum of them becomes approximate
   // one.
@@ -131,10 +134,30 @@ class BranchProbability {
 
   bool operator==(BranchProbability RHS) const { return N == RHS.N; }
   bool operator!=(BranchProbability RHS) const { return !(*this == RHS); }
-  bool operator<(BranchProbability RHS) const { return N < RHS.N; }
-  bool operator>(BranchProbability RHS) const { return RHS < *this; }
-  bool operator<=(BranchProbability RHS) const { return !(RHS < *this); }
-  bool operator>=(BranchProbability RHS) const { return !(*this < RHS); }
+
+  bool operator<(BranchProbability RHS) const {
+    assert(N != UnknownN && RHS.N != UnknownN &&
+           "Unknown probability cannot participate in comparisons.");
+    return N < RHS.N;
+  }
+
+  bool operator>(BranchProbability RHS) const {
+    assert(N != UnknownN && RHS.N != UnknownN &&
+           "Unknown probability cannot participate in comparisons.");
+    return RHS < *this;
+  }
+
+  bool operator<=(BranchProbability RHS) const {
+    assert(N != UnknownN && RHS.N != UnknownN &&
+           "Unknown probability cannot participate in comparisons.");
+    return !(RHS < *this);
+  }
+
+  bool operator>=(BranchProbability RHS) const {
+    assert(N != UnknownN && RHS.N != UnknownN &&
+           "Unknown probability cannot participate in comparisons.");
+    return !(*this < RHS);
+  }
 };
 
 inline raw_ostream &operator<<(raw_ostream &OS, BranchProbability Prob) {
diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp
index f48394698699..6cdf43a06a9f 100644
--- a/lib/Analysis/BranchProbabilityInfo.cpp
+++ b/lib/Analysis/BranchProbabilityInfo.cpp
@@ -647,6 +647,12 @@ getEdgeProbability(const BasicBlock *Src, const BasicBlock *Dst) const {
   return BranchProbability(N, D);
 }
 
+BranchProbability
+BranchProbabilityInfo::getEdgeProbability(const BasicBlock *Src,
+                                          succ_const_iterator Dst) const {
+  return getEdgeProbability(Src, Dst.getSuccessorIndex());
+}
+
 raw_ostream &
 BranchProbabilityInfo::printEdgeProbability(raw_ostream &OS,
                                             const BasicBlock *Src,
diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp
index 0b2495cc996e..54d92ad67a97 100644
--- a/lib/CodeGen/BranchFolding.cpp
+++ b/lib/CodeGen/BranchFolding.cpp
@@ -1099,13 +1099,19 @@ void BranchFolder::setCommonTailEdgeWeights(MachineBasicBlock &TailMBB) {
   if (TailMBB.succ_size() <= 1)
     return;
 
-  auto MaxEdgeFreq = *std::max_element(EdgeFreqLs.begin(), EdgeFreqLs.end());
-  uint64_t Scale = MaxEdgeFreq.getFrequency() / UINT32_MAX + 1;
+  auto SumEdgeFreq =
+      std::accumulate(EdgeFreqLs.begin(), EdgeFreqLs.end(), BlockFrequency(0))
+          .getFrequency();
   auto EdgeFreq = EdgeFreqLs.begin();
 
-  for (auto SuccI = TailMBB.succ_begin(), SuccE = TailMBB.succ_end();
-       SuccI != SuccE; ++SuccI, ++EdgeFreq)
-    TailMBB.setSuccWeight(SuccI, EdgeFreq->getFrequency() / Scale);
+  if (SumEdgeFreq > 0) {
+    for (auto SuccI = TailMBB.succ_begin(), SuccE = TailMBB.succ_end();
+         SuccI != SuccE; ++SuccI, ++EdgeFreq) {
+      auto Prob = BranchProbability::getBranchProbability(
+          EdgeFreq->getFrequency(), SumEdgeFreq);
+      TailMBB.setSuccProbability(SuccI, Prob);
+    }
+  }
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp
index 0b2f3ea165f8..ff28f95cc33d 100644
--- a/lib/CodeGen/IfConversion.cpp
+++ b/lib/CodeGen/IfConversion.cpp
@@ -32,6 +32,7 @@
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
 
 using namespace llvm;
 
@@ -1151,28 +1152,6 @@ bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) {
   return true;
 }
 
-/// Scale down weights to fit into uint32_t. NewTrue is the new weight
-/// for successor TrueBB, and NewFalse is the new weight for successor
-/// FalseBB.
-static void ScaleWeights(uint64_t NewTrue, uint64_t NewFalse,
-                         MachineBasicBlock *MBB,
-                         const MachineBasicBlock *TrueBB,
-                         const MachineBasicBlock *FalseBB,
-                         const MachineBranchProbabilityInfo *MBPI) {
-  uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse;
-  uint32_t Scale = (NewMax / UINT32_MAX) + 1;
-  for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
-                                        SE = MBB->succ_end();
-       SI != SE; ++SI) {
-    if (*SI == TrueBB)
-      MBB->setSuccWeight(SI, (uint32_t)(NewTrue / Scale));
-    else if (*SI == FalseBB)
-      MBB->setSuccWeight(SI, (uint32_t)(NewFalse / Scale));
-    else
-      MBB->setSuccWeight(SI, MBPI->getEdgeWeight(MBB, SI) / Scale);
-  }
-}
-
 /// IfConvertTriangle - If convert a triangle sub-CFG.
 ///
 bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
@@ -1229,16 +1208,14 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
   DontKill.clear();
 
   bool HasEarlyExit = CvtBBI->FalseBB != nullptr;
-  uint64_t CvtNext = 0, CvtFalse = 0, BBNext = 0, BBCvt = 0, SumWeight = 0;
-  uint32_t WeightScale = 0;
+  BranchProbability CvtNext, CvtFalse, BBNext, BBCvt;
 
   if (HasEarlyExit) {
-    // Get weights before modifying CvtBBI->BB and BBI.BB.
-    CvtNext = MBPI->getEdgeWeight(CvtBBI->BB, NextBBI->BB);
-    CvtFalse = MBPI->getEdgeWeight(CvtBBI->BB, CvtBBI->FalseBB);
-    BBNext = MBPI->getEdgeWeight(BBI.BB, NextBBI->BB);
-    BBCvt = MBPI->getEdgeWeight(BBI.BB, CvtBBI->BB);
-    SumWeight = MBPI->getSumForBlock(CvtBBI->BB, WeightScale);
+    // Get probabilities before modifying CvtBBI->BB and BBI.BB.
+    CvtNext = MBPI->getEdgeProbability(CvtBBI->BB, NextBBI->BB);
+    CvtFalse = MBPI->getEdgeProbability(CvtBBI->BB, CvtBBI->FalseBB);
+    BBNext = MBPI->getEdgeProbability(BBI.BB, NextBBI->BB);
+    BBCvt = MBPI->getEdgeProbability(BBI.BB, CvtBBI->BB);
   }
 
   if (CvtBBI->BB->pred_size() > 1) {
@@ -1266,22 +1243,24 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
                                            CvtBBI->BrCond.end());
     if (TII->ReverseBranchCondition(RevCond))
       llvm_unreachable("Unable to reverse branch condition!");
+
+    // Update the edge probability for both CvtBBI->FalseBB and NextBBI.
+    // NewNext = New_Prob(BBI.BB, NextBBI->BB) =
+    //   Prob(BBI.BB, NextBBI->BB) +
+    //   Prob(BBI.BB, CvtBBI->BB) * Prob(CvtBBI->BB, NextBBI->BB)
+    // NewFalse = New_Prob(BBI.BB, CvtBBI->FalseBB) =
+    //   Prob(BBI.BB, CvtBBI->BB) * Prob(CvtBBI->BB, CvtBBI->FalseBB)
+    auto NewTrueBB = getNextBlock(BBI.BB);
+    auto NewNext = BBNext + BBCvt * CvtNext;
+    auto NewTrueBBIter =
+        std::find(BBI.BB->succ_begin(), BBI.BB->succ_end(), NewTrueBB);
+    assert(NewTrueBBIter != BBI.BB->succ_end() &&
+           "NewTrueBB is not a successor of BBI.BB.");
+    BBI.BB->setSuccProbability(NewTrueBBIter, NewNext);
+
+    auto NewFalse = BBCvt * CvtFalse;
     TII->InsertBranch(*BBI.BB, CvtBBI->FalseBB, nullptr, RevCond, dl);
-    BBI.BB->addSuccessor(CvtBBI->FalseBB);
-    // Update the edge weight for both CvtBBI->FalseBB and NextBBI.
-    // New_Weight(BBI.BB, NextBBI->BB) =
-    //   Weight(BBI.BB, NextBBI->BB) * getSumForBlock(CvtBBI->BB) +
-    //   Weight(BBI.BB, CvtBBI->BB) * Weight(CvtBBI->BB, NextBBI->BB)
-    // New_Weight(BBI.BB, CvtBBI->FalseBB) =
-    //   Weight(BBI.BB, CvtBBI->BB) * Weight(CvtBBI->BB, CvtBBI->FalseBB)
-
-    uint64_t NewNext = BBNext * SumWeight + (BBCvt * CvtNext) / WeightScale;
-    uint64_t NewFalse = (BBCvt * CvtFalse) / WeightScale;
-    // We need to scale down all weights of BBI.BB to fit uint32_t.
-    // Here BBI.BB is connected to CvtBBI->FalseBB and will fall through to
-    // the next block.
-    ScaleWeights(NewNext, NewFalse, BBI.BB, getNextBlock(BBI.BB),
-                 CvtBBI->FalseBB, MBPI);
+    BBI.BB->addSuccessor(CvtBBI->FalseBB, NewFalse);
   }
 
   // Merge in the 'false' block if the 'false' block has no other
@@ -1524,7 +1503,7 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
       MergeBlocks(BBI, TailBBI);
       TailBBI.IsDone = true;
     } else {
-      BBI.BB->addSuccessor(TailBB);
+      BBI.BB->addSuccessor(TailBB, BranchProbability::getOne());
       InsertUncondBranch(BBI.BB, TailBB, TII);
       BBI.HasFallThrough = false;
     }
@@ -1688,21 +1667,26 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) {
                                                 FromBBI.BB->succ_end());
   MachineBasicBlock *NBB = getNextBlock(FromBBI.BB);
   MachineBasicBlock *FallThrough = FromBBI.HasFallThrough ? NBB : nullptr;
-
-  // The edge weight from ToBBI.BB to FromBBI.BB, which is only needed when
+  // The edge probability from ToBBI.BB to FromBBI.BB, which is only needed when
   // AddEdges is true and FromBBI.BB is a successor of ToBBI.BB.
-  uint32_t To2FromWeight = 0;
-  // WeightScale and SumWeight are for calculating successor probabilities of
-  // FromBBI.BB.
-  uint32_t WeightScale = 0;
-  uint32_t SumWeight = 0;
+  auto To2FromProb = BranchProbability::getZero();
   if (AddEdges && ToBBI.BB->isSuccessor(FromBBI.BB)) {
-    To2FromWeight = MBPI->getEdgeWeight(ToBBI.BB, FromBBI.BB);
-    // Set the edge weight from ToBBI.BB to FromBBI.BB to zero to avoid the edge
-    // weight being merged to other edges when this edge is removed later.
-    ToBBI.BB->setSuccWeight(
-        std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), FromBBI.BB), 0);
-    SumWeight = MBPI->getSumForBlock(FromBBI.BB, WeightScale);
+    To2FromProb = MBPI->getEdgeProbability(ToBBI.BB, FromBBI.BB);
+    // Set the edge probability from ToBBI.BB to FromBBI.BB to zero to avoid the
+    // edge probability being merged to other edges when this edge is removed
+    // later.
+    ToBBI.BB->setSuccProbability(
+        std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), FromBBI.BB),
+        BranchProbability::getZero());
+  }
+
+  if (AddEdges && ToBBI.BB->isSuccessor(FromBBI.BB)) {
+    // Set the edge probability from ToBBI.BB to FromBBI.BB to zero to avoid the
+    // edge probability being merged to other edges when this edge is removed
+    // later.
+    ToBBI.BB->setSuccProbability(
+        std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), FromBBI.BB),
+        BranchProbability::getZero());
   }
 
   for (unsigned i = 0, e = FromSuccs.size(); i != e; ++i) {
@@ -1711,39 +1695,38 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) {
     if (Succ == FallThrough)
       continue;
 
-    uint32_t NewWeight = 0;
+    auto NewProb = BranchProbability::getZero();
     if (AddEdges) {
-      // Calculate the edge weight for the edge from ToBBI.BB to Succ, which is
-      // a portion of the edge weight from FromBBI.BB to Succ. The portion ratio
-      // is the edge probability from ToBBI.BB to FromBBI.BB (if FromBBI is a
-      // successor of ToBBI.BB. See comment below for excepion).
-      NewWeight = MBPI->getEdgeWeight(FromBBI.BB, Succ);
+      // Calculate the edge probability for the edge from ToBBI.BB to Succ,
+      // which is a portion of the edge probability from FromBBI.BB to Succ. The
+      // portion ratio is the edge probability from ToBBI.BB to FromBBI.BB (if
+      // FromBBI is a successor of ToBBI.BB. See comment below for excepion).
+      NewProb = MBPI->getEdgeProbability(FromBBI.BB, Succ);
 
-      // To2FromWeight is 0 when FromBBI.BB is not a successor of ToBBI.BB. This
+      // To2FromProb is 0 when FromBBI.BB is not a successor of ToBBI.BB. This
       // only happens when if-converting a diamond CFG and FromBBI.BB is the
       // tail BB.  In this case FromBBI.BB post-dominates ToBBI.BB and hence we
-      // could just use the weights on FromBBI.BB's out-edges when adding new
-      // successors.
-      if (To2FromWeight > 0) {
-        BranchProbability Prob(NewWeight / WeightScale, SumWeight);
-        NewWeight = Prob.scale(To2FromWeight);
-      }
+      // could just use the probabilities on FromBBI.BB's out-edges when adding
+      // new successors.
+      if (!To2FromProb.isZero())
+        NewProb *= To2FromProb;
     }
 
     FromBBI.BB->removeSuccessor(Succ);
 
     if (AddEdges) {
-      // If the edge from ToBBI.BB to Succ already exists, update the weight of
-      // this edge by adding NewWeight to it. An example is shown below, in
-      // which A is ToBBI.BB and B is FromBBI.BB. In this case we don't have to
-      // set C as A's successor as it already is. We only need to update the
-      // edge weight on A->C. Note that B will not be immediately removed from
-      // A's successors. It is possible that B->D is not removed either if D is
-      // a fallthrough of B. Later the edge A->D (generated here) and B->D will
-      // be combined into one edge. To maintain correct edge weight of this
-      // combined edge, we need to set the edge weight of A->B to zero, which is
-      // already done above. The edge weight on A->D is calculated by scaling
-      // the original weight on A->B by the probability of B->D.
+      // If the edge from ToBBI.BB to Succ already exists, update the
+      // probability of this edge by adding NewWeight to it. An example is shown
+      // below, in which A is ToBBI.BB and B is FromBBI.BB. In this case we
+      // don't have to set C as A's successor as it already is. We only need to
+      // update the edge probability on A->C. Note that B will not be
+      // immediately removed from A's successors. It is possible that B->D is
+      // not removed either if D is a fallthrough of B. Later the edge A->D
+      // (generated here) and B->D will be combined into one edge. To maintain
+      // correct edge probability of this combined edge, we need to set the edge
+      // probability of A->B to zero, which is already done above. The edge
+      // probability on A->D is calculated by scaling the original probability
+      // on A->B by the probability of B->D.
       //
       // Before ifcvt:      After ifcvt (assume B->D is kept):
       //
@@ -1755,11 +1738,11 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) {
       //    C  D             C  D
       //
       if (ToBBI.BB->isSuccessor(Succ))
-        ToBBI.BB->setSuccWeight(
+        ToBBI.BB->setSuccProbability(
             std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), Succ),
-            MBPI->getEdgeWeight(ToBBI.BB, Succ) + NewWeight);
+            MBPI->getEdgeProbability(ToBBI.BB, Succ) + NewProb);
       else
-        ToBBI.BB->addSuccessor(Succ, NewWeight);
+        ToBBI.BB->addSuccessor(Succ, NewProb);
     }
   }
 
diff --git a/lib/CodeGen/MIRParser/MIParser.cpp b/lib/CodeGen/MIRParser/MIParser.cpp
index 5a8e96df7603..c9c2d62cec30 100644
--- a/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/lib/CodeGen/MIRParser/MIParser.cpp
@@ -459,8 +459,9 @@ bool MIParser::parseBasicBlockSuccessors(MachineBasicBlock &MBB) {
       if (expectAndConsume(MIToken::rparen))
         return true;
     }
-    MBB.addSuccessor(SuccMBB, Weight);
+    MBB.addSuccessor(SuccMBB, BranchProbability::getRaw(Weight));
   } while (consumeIfPresent(MIToken::comma));
+  MBB.normalizeSuccProbs();
   return false;
 }
 
diff --git a/lib/CodeGen/MIRPrinter.cpp b/lib/CodeGen/MIRPrinter.cpp
index 0be7807064fb..175cb0d51437 100644
--- a/lib/CodeGen/MIRPrinter.cpp
+++ b/lib/CodeGen/MIRPrinter.cpp
@@ -461,8 +461,8 @@ void MIPrinter::print(const MachineBasicBlock &MBB) {
       if (I != MBB.succ_begin())
         OS << ", ";
       printMBBReference(**I);
-      if (MBB.hasSuccessorWeights())
-        OS << '(' << MBB.getSuccWeight(I) << ')';
+      if (MBB.hasSuccessorProbabilities())
+        OS << '(' << MBB.getSuccProbability(I) << ')';
     }
     OS << "\n";
     HasLineAttributes = true;
diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp
index 602b75182fca..c9c6a9d62462 100644
--- a/lib/CodeGen/MachineBasicBlock.cpp
+++ b/lib/CodeGen/MachineBasicBlock.cpp
@@ -319,8 +319,8 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST,
     OS << "    Successors according to CFG:";
     for (const_succ_iterator SI = succ_begin(), E = succ_end(); SI != E; ++SI) {
       OS << " BB#" << (*SI)->getNumber();
-      if (!Weights.empty())
-        OS << '(' << *getWeightIterator(SI) << ')';
+      if (!Probs.empty())
+        OS << '(' << *getProbabilityIterator(SI) << ')';
     }
     OS << '\n';
   }
@@ -506,34 +506,16 @@ void MachineBasicBlock::updateTerminator() {
   }
 }
 
-void MachineBasicBlock::addSuccessor(MachineBasicBlock *Succ, uint32_t Weight) {
-  // Weight list is either empty (if successor list isn't empty, this means
-  // disabled optimization) or has the same size as successor list.
-  if (!(Weights.empty() && !Successors.empty()))
-    Weights.push_back(Weight);
-  Successors.push_back(Succ);
-  Succ->addPredecessor(this);
-}
-
-void MachineBasicBlock::addSuccessorWithoutWeight(MachineBasicBlock *Succ) {
-  // We need to make sure weight list is either empty or has the same size of
-  // successor list. When this function is called, we can safely delete all
-  // weight in the list.
-  Weights.clear();
-  Successors.push_back(Succ);
-  Succ->addPredecessor(this);
-}
-
 void MachineBasicBlock::addSuccessor(MachineBasicBlock *Succ,
                                      BranchProbability Prob) {
   // Probability list is either empty (if successor list isn't empty, this means
   // disabled optimization) or has the same size as successor list.
   if (!(Probs.empty() && !Successors.empty())) {
+    assert((Probs.empty() || (Prob.isUnknown() && Probs.back().isUnknown()) ||
+            (!Prob.isUnknown() && !Probs.back().isUnknown())) &&
+           "Successors with both known and unknwon probabilities are not "
+           "allowed.");
     Probs.push_back(Prob);
-    // FIXME: Temporarily use the numerator of the probability to represent edge
-    // weight. This will be removed once all weight-version interfaces in MBB
-    // are replaced with probability-version interfaces.
-    Weights.push_back(Prob.getNumerator());
   }
   Successors.push_back(Succ);
   Succ->addPredecessor(this);
@@ -544,7 +526,6 @@ void MachineBasicBlock::addSuccessorWithoutProb(MachineBasicBlock *Succ) {
   // of successor list. When this function is called, we can safely delete all
   // probability in the list.
   Probs.clear();
-  Weights.clear();
   Successors.push_back(Succ);
   Succ->addPredecessor(this);
 }
@@ -558,23 +539,12 @@ MachineBasicBlock::succ_iterator
 MachineBasicBlock::removeSuccessor(succ_iterator I) {
   assert(I != Successors.end() && "Not a current successor!");
 
-  // If Weight list is empty it means we don't use it (disabled optimization).
-  if (!Weights.empty()) {
-    weight_iterator WI = getWeightIterator(I);
-    Weights.erase(WI);
-  }
-
-  // FIXME: Temporarily comment the following code as probabilities are now only
-  // used during instruction lowering, but this interface is called in later
-  // passes. Uncomment it once all edge weights are replaced with probabilities.
-#if 0
   // If probability list is empty it means we don't use it (disabled
   // optimization).
   if (!Probs.empty()) {
     probability_iterator WI = getProbabilityIterator(I);
     Probs.erase(WI);
   }
-#endif
 
   (*I)->removePredecessor(this);
   return Successors.erase(I);
@@ -611,17 +581,12 @@ void MachineBasicBlock::replaceSuccessor(MachineBasicBlock *Old,
   }
 
   // New is already a successor.
-  // Update its weight instead of adding a duplicate edge.
-  if (!Weights.empty())
-    *getWeightIterator(NewI) += *getWeightIterator(OldI);
-  // FIXME: Temporarily comment the following code as probabilities are now only
-  // used during instruction lowering, but this interface is called in later
-  // passes. Uncomment it once all edge weights are replaced with probabilities.
-#if 0
   // Update its probability instead of adding a duplicate edge.
-  if (!Probs.empty())
-    *getProbabilityIterator(NewI) += *getProbabilityIterator(OldI);
-#endif
+  if (!Probs.empty()) {
+    auto ProbIter = getProbabilityIterator(NewI);
+    if (!ProbIter->isUnknown())
+      *ProbIter += *getProbabilityIterator(OldI);
+  }
   removeSuccessor(OldI);
 }
 
@@ -641,13 +606,14 @@ void MachineBasicBlock::transferSuccessors(MachineBasicBlock *FromMBB) {
 
   while (!FromMBB->succ_empty()) {
     MachineBasicBlock *Succ = *FromMBB->succ_begin();
-    uint32_t Weight = 0;
 
-    // If Weight list is empty it means we don't use it (disabled optimization).
-    if (!FromMBB->Weights.empty())
-      Weight = *FromMBB->Weights.begin();
+    // If probability list is empty it means we don't use it (disabled optimization).
+    if (!FromMBB->Probs.empty()) {
+      auto Prob = *FromMBB->Probs.begin();
+      addSuccessor(Succ, Prob);
+    } else
+      addSuccessorWithoutProb(Succ);
 
-    addSuccessor(Succ, Weight);
     FromMBB->removeSuccessor(Succ);
   }
 }
@@ -659,10 +625,11 @@ MachineBasicBlock::transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB) {
 
   while (!FromMBB->succ_empty()) {
     MachineBasicBlock *Succ = *FromMBB->succ_begin();
-    uint32_t Weight = 0;
-    if (!FromMBB->Weights.empty())
-      Weight = *FromMBB->Weights.begin();
-    addSuccessor(Succ, Weight);
+    if (!FromMBB->Probs.empty()) {
+      auto Prob = *FromMBB->Probs.begin();
+      addSuccessor(Succ, Prob);
+    } else
+      addSuccessorWithoutProb(Succ);
     FromMBB->removeSuccessor(Succ);
 
     // Fix up any PHI nodes in the successor.
@@ -1146,80 +1113,37 @@ MachineBasicBlock::findDebugLoc(instr_iterator MBBI) {
   return DL;
 }
 
-/// Return weight of the edge from this block to MBB.
-uint32_t MachineBasicBlock::getSuccWeight(const_succ_iterator Succ) const {
-  if (Weights.empty())
-    return 0;
-
-  return *getWeightIterator(Succ);
-}
-
-/// Return probability of the edge from this block to MBB. If probability list
-/// is empty, return a default probability which is 1/N, where N is the number
-/// of successors. If the probability of the given successor is unknown, then
-/// sum up all known probabilities and return the complement of the sum divided
-/// by the number of unknown probabilities.
+/// Return probability of the edge from this block to MBB.
 BranchProbability
 MachineBasicBlock::getSuccProbability(const_succ_iterator Succ) const {
-  if (Probs.empty())
+  if (Probs.empty() || Probs.back().isUnknown())
     return BranchProbability(1, succ_size());
 
-  auto Prob = *getProbabilityIterator(Succ);
-  assert(!Prob.isUnknown());
-  return Prob;
-}
-
-/// Set successor weight of a given iterator.
-void MachineBasicBlock::setSuccWeight(succ_iterator I, uint32_t Weight) {
-  if (Weights.empty())
-    return;
-  *getWeightIterator(I) = Weight;
+  return *getProbabilityIterator(Succ);
 }
 
 /// Set successor probability of a given iterator.
 void MachineBasicBlock::setSuccProbability(succ_iterator I,
                                            BranchProbability Prob) {
   assert(!Prob.isUnknown());
-  if (Probs.empty() || Weights.empty())
+  if (Probs.empty())
     return;
   *getProbabilityIterator(I) = Prob;
-  // FIXME: Temporarily use the numerator of the probability to represent edge
-  // weight. This will be removed once all weight-version interfaces in MBB
-  // are replaces with probability-version interfaces.
-  *getWeightIterator(I) = Prob.getNumerator();
-}
-
-/// Return wight iterator corresonding to the I successor iterator.
-MachineBasicBlock::weight_iterator MachineBasicBlock::
-getWeightIterator(MachineBasicBlock::succ_iterator I) {
-  assert(Weights.size() == Successors.size() && "Async weight list!");
-  size_t index = std::distance(Successors.begin(), I);
-  assert(index < Weights.size() && "Not a current successor!");
-  return Weights.begin() + index;
 }
 
-/// Return wight iterator corresonding to the I successor iterator.
-MachineBasicBlock::const_weight_iterator MachineBasicBlock::
-getWeightIterator(MachineBasicBlock::const_succ_iterator I) const {
-  assert(Weights.size() == Successors.size() && "Async weight list!");
-  const size_t index = std::distance(Successors.begin(), I);
-  assert(index < Weights.size() && "Not a current successor!");
-  return Weights.begin() + index;
-}
-
-/// Return probability iterator corresonding to the I successor iterator.
-MachineBasicBlock::probability_iterator
-MachineBasicBlock::getProbabilityIterator(MachineBasicBlock::succ_iterator I) {
+/// Return probability iterator corresonding to the I successor iterator
+MachineBasicBlock::const_probability_iterator
+MachineBasicBlock::getProbabilityIterator(
+    MachineBasicBlock::const_succ_iterator I) const {
   assert(Probs.size() == Successors.size() && "Async probability list!");
   const size_t index = std::distance(Successors.begin(), I);
   assert(index < Probs.size() && "Not a current successor!");
   return Probs.begin() + index;
 }
 
-/// Return probability iterator corresonding to the I successor iterator
-MachineBasicBlock::const_probability_iterator
-MachineBasicBlock::getProbabilityIterator(
-    MachineBasicBlock::const_succ_iterator I) const {
+/// Return probability iterator corresonding to the I successor iterator.
+MachineBasicBlock::probability_iterator
+MachineBasicBlock::getProbabilityIterator(MachineBasicBlock::succ_iterator I) {
   assert(Probs.size() == Successors.size() && "Async probability list!");
   const size_t index = std::distance(Successors.begin(), I);
   assert(index < Probs.size() && "Not a current successor!");
diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp
index fba33eb93d5f..fcddf346cf68 100644
--- a/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/lib/CodeGen/MachineBlockPlacement.cpp
@@ -380,19 +380,11 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
   const BranchProbability HotProb(4, 5); // 80%
 
   MachineBasicBlock *BestSucc = nullptr;
-  // FIXME: Due to the performance of the probability and weight routines in
-  // the MBPI analysis, we manually compute probabilities using the edge
-  // weights. This is suboptimal as it means that the somewhat subtle
-  // definition of edge weight semantics is encoded here as well. We should
-  // improve the MBPI interface to efficiently support query patterns such as
-  // this.
-  uint32_t BestWeight = 0;
-  uint32_t WeightScale = 0;
-  uint32_t SumWeight = MBPI->getSumForBlock(BB, WeightScale);
-
-  // Adjust sum of weights by excluding weights on edges pointing to blocks that
-  // is either not in BlockFilter or is already in the current chain. Consider
-  // the following CFG:
+  auto BestProb = BranchProbability::getZero();
+
+  // Adjust edge probabilities by excluding edges pointing to blocks that is
+  // either not in BlockFilter or is already in the current chain. Consider the
+  // following CFG:
   //
   //     --->A
   //     |  / \
@@ -406,7 +398,7 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
   // HotProb). If we exclude E that is not in BlockFilter when calculating the
   // probability of C->D, D will be selected and we will get A C D B as the
   // layout of this loop.
-  uint32_t AdjustedSumWeight = SumWeight;
+  auto AdjustedSumProb = BranchProbability::getOne();
   SmallVector<MachineBasicBlock *, 4> Successors;
   for (MachineBasicBlock *Succ : BB->successors()) {
     bool SkipSucc = false;
@@ -424,15 +416,20 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
       }
     }
     if (SkipSucc)
-      AdjustedSumWeight -= MBPI->getEdgeWeight(BB, Succ) / WeightScale;
+      AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ);
     else
       Successors.push_back(Succ);
   }
 
   DEBUG(dbgs() << "Attempting merge from: " << getBlockName(BB) << "\n");
   for (MachineBasicBlock *Succ : Successors) {
-    uint32_t SuccWeight = MBPI->getEdgeWeight(BB, Succ);
-    BranchProbability SuccProb(SuccWeight / WeightScale, AdjustedSumWeight);
+    BranchProbability SuccProb;
+    uint32_t SuccProbN = MBPI->getEdgeProbability(BB, Succ).getNumerator();
+    uint32_t SuccProbD = AdjustedSumProb.getNumerator();
+    if (SuccProbN >= SuccProbD)
+      SuccProb = BranchProbability::getOne();
+    else
+      SuccProb = BranchProbability(SuccProbN, SuccProbD);
 
     // If we outline optional branches, look whether Succ is unavoidable, i.e.
     // dominates all terminators of the MachineFunction. If it does, other
@@ -470,7 +467,7 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
 
       // Make sure that a hot successor doesn't have a globally more
       // important predecessor.
-      BranchProbability RealSuccProb(SuccWeight / WeightScale, SumWeight);
+      auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
       BlockFrequency CandidateEdgeFreq =
           MBFI->getBlockFreq(BB) * RealSuccProb * HotProb.getCompl();
       bool BadCFGConflict = false;
@@ -496,10 +493,10 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
                  << " (prob)"
                  << (SuccChain.LoopPredecessors != 0 ? " (CFG break)" : "")
                  << "\n");
-    if (BestSucc && BestWeight >= SuccWeight)
+    if (BestSucc && BestProb >= SuccProb)
       continue;
     BestSucc = Succ;
-    BestWeight = SuccWeight;
+    BestProb = SuccProb;
   }
   return BestSucc;
 }
@@ -728,11 +725,6 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L,
     MachineBasicBlock *OldExitingBB = ExitingBB;
     BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
     bool HasLoopingSucc = false;
-    // FIXME: Due to the performance of the probability and weight routines in
-    // the MBPI analysis, we use the internal weights and manually compute the
-    // probabilities to avoid quadratic behavior.
-    uint32_t WeightScale = 0;
-    uint32_t SumWeight = MBPI->getSumForBlock(MBB, WeightScale);
     for (MachineBasicBlock *Succ : MBB->successors()) {
       if (Succ->isEHPad())
         continue;
@@ -746,10 +738,10 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L,
         continue;
       }
 
-      uint32_t SuccWeight = MBPI->getEdgeWeight(MBB, Succ);
+      auto SuccProb = MBPI->getEdgeProbability(MBB, Succ);
       if (LoopBlockSet.count(Succ)) {
         DEBUG(dbgs() << "    looping: " << getBlockName(MBB) << " -> "
-                     << getBlockName(Succ) << " (" << SuccWeight << ")\n");
+                     << getBlockName(Succ) << " (" << SuccProb << ")\n");
         HasLoopingSucc = true;
         continue;
       }
@@ -761,7 +753,6 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L,
           BlocksExitingToOuterLoop.insert(MBB);
       }
 
-      BranchProbability SuccProb(SuccWeight / WeightScale, SumWeight);
       BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb;
       DEBUG(dbgs() << "    exiting: " << getBlockName(MBB) << " -> "
                    << getBlockName(Succ) << " [L:" << SuccLoopDepth << "] (";
@@ -904,21 +895,17 @@ void MachineBlockPlacement::rotateLoopWithProfile(
   // edge from the tail of the loop chain.
   SmallVector<std::pair<MachineBasicBlock *, BlockFrequency>, 4> ExitsWithFreq;
   for (auto BB : LoopChain) {
-    uint32_t LargestExitEdgeWeight = 0;
+    auto LargestExitEdgeProb = BranchProbability::getZero();
     for (auto *Succ : BB->successors()) {
       BlockChain *SuccChain = BlockToChain[Succ];
       if (!LoopBlockSet.count(Succ) &&
           (!SuccChain || Succ == *SuccChain->begin())) {
-        uint32_t SuccWeight = MBPI->getEdgeWeight(BB, Succ);
-        LargestExitEdgeWeight = std::max(LargestExitEdgeWeight, SuccWeight);
+        auto SuccProb = MBPI->getEdgeProbability(BB, Succ);
+        LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb);
       }
     }
-    if (LargestExitEdgeWeight > 0) {
-      uint32_t WeightScale = 0;
-      uint32_t SumWeight = MBPI->getSumForBlock(BB, WeightScale);
-      auto ExitFreq =
-          MBFI->getBlockFreq(BB) *
-          BranchProbability(LargestExitEdgeWeight / WeightScale, SumWeight);
+    if (LargestExitEdgeProb > BranchProbability::getZero()) {
+      auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb;
       ExitsWithFreq.emplace_back(BB, ExitFreq);
     }
   }
@@ -1290,14 +1277,16 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) {
       }
 
       // If PrevBB has a two-way branch, try to re-order the branches
-      // such that we branch to the successor with higher weight first.
+      // such that we branch to the successor with higher probability first.
       if (TBB && !Cond.empty() && FBB &&
-          MBPI->getEdgeWeight(PrevBB, FBB) > MBPI->getEdgeWeight(PrevBB, TBB) &&
+          MBPI->getEdgeProbability(PrevBB, FBB) >
+              MBPI->getEdgeProbability(PrevBB, TBB) &&
           !TII->ReverseBranchCondition(Cond)) {
         DEBUG(dbgs() << "Reverse order of the two branches: "
                      << getBlockName(PrevBB) << "\n");
-        DEBUG(dbgs() << "    Edge weight: " << MBPI->getEdgeWeight(PrevBB, FBB)
-                     << " vs " << MBPI->getEdgeWeight(PrevBB, TBB) << "\n");
+        DEBUG(dbgs() << "    Edge probability: "
+                     << MBPI->getEdgeProbability(PrevBB, FBB) << " vs "
+                     << MBPI->getEdgeProbability(PrevBB, TBB) << "\n");
         DebugLoc dl; // FIXME: this is nowhere
         TII->RemoveBranch(*PrevBB);
         TII->InsertBranch(*PrevBB, FBB, TBB, Cond, dl);
diff --git a/lib/CodeGen/MachineBranchProbabilityInfo.cpp b/lib/CodeGen/MachineBranchProbabilityInfo.cpp
index 6fbc2be70486..5478dcba261a 100644
--- a/lib/CodeGen/MachineBranchProbabilityInfo.cpp
+++ b/lib/CodeGen/MachineBranchProbabilityInfo.cpp
@@ -28,91 +28,61 @@ char MachineBranchProbabilityInfo::ID = 0;
 
 void MachineBranchProbabilityInfo::anchor() { }
 
-uint32_t MachineBranchProbabilityInfo::
-getSumForBlock(const MachineBasicBlock *MBB, uint32_t &Scale) const {
-  // First we compute the sum with 64-bits of precision, ensuring that cannot
-  // overflow by bounding the number of weights considered. Hopefully no one
-  // actually needs 2^32 successors.
-  assert(MBB->succ_size() < UINT32_MAX);
-  uint64_t Sum = 0;
-  Scale = 1;
-  for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(),
-       E = MBB->succ_end(); I != E; ++I) {
-    uint32_t Weight = getEdgeWeight(MBB, I);
-    Sum += Weight;
-  }
-
-  // If the computed sum fits in 32-bits, we're done.
-  if (Sum <= UINT32_MAX)
-    return Sum;
+uint32_t MachineBranchProbabilityInfo::getEdgeWeight(
+    const MachineBasicBlock *Src,
+    MachineBasicBlock::const_succ_iterator Dst) const {
+  return Src->getSuccProbability(Dst).getNumerator();
+}
 
-  // Otherwise, compute the scale necessary to cause the weights to fit, and
-  // re-sum with that scale applied.
-  assert((Sum / UINT32_MAX) < UINT32_MAX);
-  Scale = (Sum / UINT32_MAX) + 1;
-  Sum = 0;
-  for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(),
-       E = MBB->succ_end(); I != E; ++I) {
-    uint32_t Weight = getEdgeWeight(MBB, I);
-    Sum += Weight / Scale;
-  }
-  assert(Sum <= UINT32_MAX);
-  return Sum;
+uint32_t MachineBranchProbabilityInfo::getEdgeWeight(
+    const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const {
+  // This is a linear search. Try to use the const_succ_iterator version when
+  // possible.
+  return getEdgeWeight(Src, std::find(Src->succ_begin(), Src->succ_end(), Dst));
 }
 
-uint32_t MachineBranchProbabilityInfo::
-getEdgeWeight(const MachineBasicBlock *Src,
-              MachineBasicBlock::const_succ_iterator Dst) const {
-  uint32_t Weight = Src->getSuccWeight(Dst);
-  if (!Weight)
-    return DEFAULT_WEIGHT;
-  return Weight;
+BranchProbability MachineBranchProbabilityInfo::getEdgeProbability(
+    const MachineBasicBlock *Src,
+    MachineBasicBlock::const_succ_iterator Dst) const {
+  return Src->getSuccProbability(Dst);
 }
 
-uint32_t MachineBranchProbabilityInfo::
-getEdgeWeight(const MachineBasicBlock *Src,
-              const MachineBasicBlock *Dst) const {
+BranchProbability MachineBranchProbabilityInfo::getEdgeProbability(
+    const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const {
   // This is a linear search. Try to use the const_succ_iterator version when
   // possible.
-  return getEdgeWeight(Src, std::find(Src->succ_begin(), Src->succ_end(), Dst));
+  return getEdgeProbability(Src,
+                            std::find(Src->succ_begin(), Src->succ_end(), Dst));
 }
 
 bool
 MachineBranchProbabilityInfo::isEdgeHot(const MachineBasicBlock *Src,
                                         const MachineBasicBlock *Dst) const {
   // Hot probability is at least 4/5 = 80%
-  // FIXME: Compare against a static "hot" BranchProbability.
-  return getEdgeProbability(Src, Dst) > BranchProbability(4, 5);
+  static BranchProbability HotProb(4, 5);
+  return getEdgeProbability(Src, Dst) > HotProb;
 }
 
 MachineBasicBlock *
 MachineBranchProbabilityInfo::getHotSucc(MachineBasicBlock *MBB) const {
-  uint32_t MaxWeight = 0;
+  auto MaxProb = BranchProbability::getZero();
   MachineBasicBlock *MaxSucc = nullptr;
   for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(),
        E = MBB->succ_end(); I != E; ++I) {
-    uint32_t Weight = getEdgeWeight(MBB, I);
-    if (Weight > MaxWeight) {
-      MaxWeight = Weight;
+    auto Prob = getEdgeProbability(MBB, I);
+    if (Prob > MaxProb) {
+      MaxProb = Prob;
       MaxSucc = *I;
     }
   }
 
-  if (getEdgeProbability(MBB, MaxSucc) >= BranchProbability(4, 5))
+  static BranchProbability HotProb(4, 5);
+  if (getEdgeProbability(MBB, MaxSucc) >= HotProb)
     return MaxSucc;
 
   return nullptr;
 }
 
-BranchProbability MachineBranchProbabilityInfo::getEdgeProbability(
-    const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const {
-  uint32_t Scale = 1;
-  uint32_t D = getSumForBlock(Src, Scale);
-  uint32_t N = getEdgeWeight(Src, Dst) / Scale;
-
-  return BranchProbability(N, D);
-}
-
 raw_ostream &MachineBranchProbabilityInfo::printEdgeProbability(
     raw_ostream &OS, const MachineBasicBlock *Src,
     const MachineBasicBlock *Dst) const {
diff --git a/lib/CodeGen/TailDuplication.cpp b/lib/CodeGen/TailDuplication.cpp
index ff86dabfac59..1f5b54866ac6 100644
--- a/lib/CodeGen/TailDuplication.cpp
+++ b/lib/CodeGen/TailDuplication.cpp
@@ -745,12 +745,12 @@ TailDuplicatePass::duplicateSimpleBB(MachineBasicBlock *TailBB,
     if (PredTBB)
       TII->InsertBranch(*PredBB, PredTBB, PredFBB, PredCond, DebugLoc());
 
-    uint32_t Weight = MBPI->getEdgeWeight(PredBB, TailBB);
+    auto Prob = MBPI->getEdgeProbability(PredBB, TailBB);
     PredBB->removeSuccessor(TailBB);
     unsigned NumSuccessors = PredBB->succ_size();
     assert(NumSuccessors <= 1);
     if (NumSuccessors == 0 || *PredBB->succ_begin() != NewTarget)
-      PredBB->addSuccessor(NewTarget, Weight);
+      PredBB->addSuccessor(NewTarget, Prob);
 
     TDBBs.push_back(PredBB);
   }
@@ -858,7 +858,7 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB,
            "TailDuplicate called on block with multiple successors!");
     for (MachineBasicBlock::succ_iterator I = TailBB->succ_begin(),
            E = TailBB->succ_end(); I != E; ++I)
-      PredBB->addSuccessor(*I, MBPI->getEdgeWeight(TailBB, I));
+      PredBB->addSuccessor(*I, MBPI->getEdgeProbability(TailBB, I));
 
     Changed = true;
     ++NumTailDups;
diff --git a/lib/Support/BranchProbability.cpp b/lib/Support/BranchProbability.cpp
index 3b0f6e6f06e4..771d02c0aa3c 100644
--- a/lib/Support/BranchProbability.cpp
+++ b/lib/Support/BranchProbability.cpp
@@ -22,11 +22,14 @@ using namespace llvm;
 const uint32_t BranchProbability::D;
 
 raw_ostream &BranchProbability::print(raw_ostream &OS) const {
+  if (isUnknown())
+    return OS << "?%";
+
   // Get a percentage rounded to two decimal digits. This avoids
   // implementation-defined rounding inside printf.
   double Percent = rint(((double)N / D) * 100.0 * 100.0) / 100.0;
-  OS << format("0x%08" PRIx32 " / 0x%08" PRIx32 " = %.2f%%", N, D, Percent);
-  return OS;
+  return OS << format("0x%08" PRIx32 " / 0x%08" PRIx32 " = %.2f%%", N, D,
+                      Percent);
 }
 
 void BranchProbability::dump() const { print(dbgs()) << '\n'; }
@@ -43,6 +46,19 @@ BranchProbability::BranchProbability(uint32_t Numerator, uint32_t Denominator) {
   }
 }
 
+BranchProbability
+BranchProbability::getBranchProbability(uint64_t Numerator,
+                                        uint64_t Denominator) {
+  assert(Numerator <= Denominator && "Probability cannot be bigger than 1!");
+  // Scale down Denominator to fit in a 32-bit integer.
+  int Scale = 0;
+  while (Denominator > UINT32_MAX) {
+    Denominator >>= 1;
+    Scale++;
+  }
+  return BranchProbability(Numerator >> Scale, Denominator);
+}
+
 // If ConstD is not zero, then replace D by ConstD so that division and modulo
 // operations by D can be optimized, in case this function is not inlined by the
 // compiler.
diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
index f1b383017901..cdbd12092150 100644
--- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
+++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
@@ -1570,8 +1570,7 @@ void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk,
 
   insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc());
   insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc());
-  DstBlk->addSuccessor(LandMBB);
-  DstBlk->removeSuccessor(DstBlk);
+  DstBlk->replaceSuccessor(DstBlk, LandMBB);
 }
 
 
@@ -1666,8 +1665,7 @@ AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB,
   replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB);
   //srcBlk, oldBlk, newBlk
 
-  PredMBB->removeSuccessor(MBB);
-  PredMBB->addSuccessor(CloneMBB);
+  PredMBB->replaceSuccessor(MBB, CloneMBB);
 
   // add all successor to cloneBlk
   cloneSuccessorList(CloneMBB, MBB);
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
index 0bf2d374df6a..e89757c19ecc 100644
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -2274,8 +2274,7 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) {
 
   // Update the CFG.
   NewBB->addSuccessor(BB);
-  JTBB->removeSuccessor(BB);
-  JTBB->addSuccessor(NewBB);
+  JTBB->replaceSuccessor(BB, NewBB);
 
   ++NumJTInserted;
   return NewBB;
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 0cc41812d71c..e8f3ab65bdbe 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -7346,7 +7346,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI,
       }
     }
 
-    BB->addSuccessor(DispatchBB);
+    BB->addSuccessor(DispatchBB, BranchProbability::getZero());
 
     // Find the invoke call and mark all of the callee-saved registers as
     // 'implicit defined' so that they're spilled. This prevents code from
diff --git a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
index 96bb61750805..efafdd007289 100644
--- a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
+++ b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
@@ -186,13 +186,11 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) {
 
             if (case1 || case2) {
               InvertAndChangeJumpTarget(MI, UncondTarget);
-              MBB->removeSuccessor(JumpAroundTarget);
-              MBB->addSuccessor(UncondTarget);
+              MBB->replaceSuccessor(JumpAroundTarget, UncondTarget);
 
               // Remove the unconditional branch in LayoutSucc.
               LayoutSucc->erase(LayoutSucc->begin());
-              LayoutSucc->removeSuccessor(UncondTarget);
-              LayoutSucc->addSuccessor(JumpAroundTarget);
+              LayoutSucc->replaceSuccessor(UncondTarget, JumpAroundTarget);
 
               // This code performs the conversion for case 2, which moves
               // the block to the fall-thru case (BB3 in the code above).
diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp
index d09843ed0e53..e75858a181e5 100644
--- a/lib/Target/Mips/MipsLongBranch.cpp
+++ b/lib/Target/Mips/MipsLongBranch.cpp
@@ -262,8 +262,7 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
       static_cast<const MipsInstrInfo *>(Subtarget.getInstrInfo());
 
   MF->insert(FallThroughMBB, LongBrMBB);
-  MBB->removeSuccessor(TgtMBB);
-  MBB->addSuccessor(LongBrMBB);
+  MBB->replaceSuccessor(TgtMBB, LongBrMBB);
 
   if (IsPIC) {
     MachineBasicBlock *BalTgtMBB = MF->CreateMachineBasicBlock(BB);
diff --git a/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll b/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll
index e17da7a97205..a44c9721d6c1 100644
--- a/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll
+++ b/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll
@@ -14,15 +14,15 @@ entry:
   br i1 undef, label %for.end, label %for.body
 
 ; Before if conversion, we have
-; for.body -> lor.lhs.false.i (62)
-;          -> for.cond.backedge (62)
-; lor.lhs.false.i -> for.cond.backedge (1048575)
-;                 -> cond.false.i (1)
+; for.body -> lor.lhs.false.i (50%)
+;          -> for.cond.backedge (50%)
+; lor.lhs.false.i -> for.cond.backedge (100%)
+;                 -> cond.false.i (0%)
 ; Afer if conversion, we have
-; for.body -> for.cond.backedge (130023362)
-;          -> cond.false.i (62)
+; for.body -> for.cond.backedge (100%)
+;          -> cond.false.i (0%)
 ; CHECK: BB#1: derived from LLVM BB %for.body
-; CHECK: Successors according to CFG: BB#2(4294967291) BB#4(2048)
+; CHECK: Successors according to CFG: BB#2(0x7ffffc00 / 0x80000000 = 100.00%) BB#4(0x00000400 / 0x80000000 = 0.00%)
 for.body:
   br i1 undef, label %for.cond.backedge, label %lor.lhs.false.i, !prof !1
 
diff --git a/test/CodeGen/ARM/ifcvt-branch-weight.ll b/test/CodeGen/ARM/ifcvt-branch-weight.ll
index f2a1229d0d8a..0de039cde23c 100644
--- a/test/CodeGen/ARM/ifcvt-branch-weight.ll
+++ b/test/CodeGen/ARM/ifcvt-branch-weight.ll
@@ -19,7 +19,7 @@ bb:
   br i1 %9, label %return, label %bb2
 
 ; CHECK: BB#2: derived from LLVM BB %bb2
-; CHECK: Successors according to CFG: BB#3(4294967289) BB#4(4294967287)
+; CHECK: Successors according to CFG: BB#3({{[0-9a-fx/= ]+}}50.00%) BB#4({{[0-9a-fx/= ]+}}50.00%)
 
 bb2:
   %v10 = icmp eq i32 %3, 16
diff --git a/test/CodeGen/ARM/ifcvt-iter-indbr.ll b/test/CodeGen/ARM/ifcvt-iter-indbr.ll
index 6ce9bcb56ef4..a96b6e8a1e83 100644
--- a/test/CodeGen/ARM/ifcvt-iter-indbr.ll
+++ b/test/CodeGen/ARM/ifcvt-iter-indbr.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple thumbv7s-apple-darwin  -asm-verbose=false | FileCheck %s
-; RUN: llc < %s -mtriple thumbv7s-apple-darwin  -asm-verbose=false -print-machineinstrs=if-converter 2>&1 | FileCheck --check-prefix=CHECK-WEIGHT %s
+; RUN: llc < %s -mtriple thumbv7s-apple-darwin  -asm-verbose=false -print-machineinstrs=if-converter 2>&1 | FileCheck --check-prefix=CHECK-PROB %s
 
 declare i32 @foo(i32)
 declare i8* @bar(i32, i8*, i8*)
@@ -29,10 +29,10 @@ declare i8* @bar(i32, i8*, i8*)
 ; CHECK-NEXT: [[FOOCALL]]:
 ; CHECK-NEXT:  blx _foo
 ;
-; CHECK-WEIGHT: BB#0:
-; CHECK-WEIGHT: Successors according to CFG: BB#1(1073741824) BB#2(536870912) BB#4(536870912)
-; CHECK-WEIGHT: BB#1:
-; CHECK-WEIGHT: Successors according to CFG: BB#2(1610612736) BB#4(536870912)
+; CHECK-PROB: BB#0:
+; CHECK-PROB: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}50.00%) BB#2({{[0-9a-fx/= ]+}}25.00%) BB#4({{[0-9a-fx/= ]+}}25.00%)
+; CHECK-PROB: BB#1:
+; CHECK-PROB: Successors according to CFG: BB#2({{[0-9a-fx/= ]+}}75.00%) BB#4({{[0-9a-fx/= ]+}}25.00%)
 
 define i32 @test(i32 %a, i32 %a2, i32* %p, i32* %p2) {
 entry:
diff --git a/test/CodeGen/ARM/tail-merge-branch-weight.ll b/test/CodeGen/ARM/tail-merge-branch-weight.ll
index 95b0a202e7ff..f83f28815793 100644
--- a/test/CodeGen/ARM/tail-merge-branch-weight.ll
+++ b/test/CodeGen/ARM/tail-merge-branch-weight.ll
@@ -9,7 +9,7 @@
 ;                = 0.2 * 0.4 + 0.8 * 0.7 = 0.64
 
 ; CHECK: # Machine code for function test0:
-; CHECK: Successors according to CFG: BB#{{[0-9]+}}(13) BB#{{[0-9]+}}(24)
+; CHECK: Successors according to CFG: BB#{{[0-9]+}}({{[0-9a-fx/= ]+}}20.00%) BB#{{[0-9]+}}({{[0-9a-fx/= ]+}}80.00%)
 ; CHECK: BB#{{[0-9]+}}:
 ; CHECK: BB#{{[0-9]+}}:
 ; CHECK: # End machine code for function test0.
diff --git a/test/CodeGen/ARM/taildup-branch-weight.ll b/test/CodeGen/ARM/taildup-branch-weight.ll
index 576c120b444e..799ef62416e6 100644
--- a/test/CodeGen/ARM/taildup-branch-weight.ll
+++ b/test/CodeGen/ARM/taildup-branch-weight.ll
@@ -3,7 +3,7 @@
 ; RUN:	| FileCheck %s
 
 ; CHECK: Machine code for function test0:
-; CHECK: Successors according to CFG: BB#1(67108864) BB#2(2080374784)
+; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}3.12%) BB#2({{[0-9a-fx/= ]+}}96.88%)
 
 define void @test0(i32 %a, i32 %b, i32* %c, i32* %d) {
 entry:
@@ -30,7 +30,7 @@ B4:
 !0 = !{!"branch_weights", i32 4, i32 124}
 
 ; CHECK: Machine code for function test1:
-; CHECK: Successors according to CFG: BB#1(67108864) BB#2(2080374784)
+; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}3.12%) BB#2({{[0-9a-fx/= ]+}}96.88%)
 
 @g0 = common global i32 0, align 4
 
diff --git a/test/CodeGen/Generic/MachineBranchProb.ll b/test/CodeGen/Generic/MachineBranchProb.ll
index 5a4a4672f7eb..ae3c8da21471 100644
--- a/test/CodeGen/Generic/MachineBranchProb.ll
+++ b/test/CodeGen/Generic/MachineBranchProb.ll
@@ -16,11 +16,11 @@ entry:
     i64 5, label %sw.bb1
   ], !prof !0
 ; CHECK: BB#0: derived from LLVM BB %entry
-; CHECK: Successors according to CFG: BB#2(1616928864) BB#4(530554784)
+; CHECK: Successors according to CFG: BB#2({{[0-9a-fx/= ]+}}75.29%) BB#4({{[0-9a-fx/= ]+}}24.71%)
 ; CHECK: BB#4: derived from LLVM BB %entry
-; CHECK: Successors according to CFG: BB#1(252645135) BB#5(277909649)
+; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}47.62%) BB#5({{[0-9a-fx/= ]+}}52.38%)
 ; CHECK: BB#5: derived from LLVM BB %entry
-; CHECK: Successors according to CFG: BB#1(101058054) BB#3(176851595)
+; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}36.36%) BB#3({{[0-9a-fx/= ]+}}63.64%)
 
 sw.bb:
   br label %return
@@ -62,7 +62,7 @@ return: ret void
 ; CHECK-LABEL: Machine code for function left_leaning_weight_balanced_tree:
 ; CHECK: BB#0: derived from LLVM BB %entry
 ; CHECK-NOT: Successors
-; CHECK: Successors according to CFG: BB#8(852677332) BB#9(1294806318)
+; CHECK: Successors according to CFG: BB#8({{[0-9a-fx/= ]+}}39.71%) BB#9({{[0-9a-fx/= ]+}}60.29%)
 }
 
 !1 = !{!"branch_weights",
diff --git a/test/CodeGen/Hexagon/ifcvt-edge-weight.ll b/test/CodeGen/Hexagon/ifcvt-edge-weight.ll
index f84fd95e4fbd..341567e1d02f 100644
--- a/test/CodeGen/Hexagon/ifcvt-edge-weight.ll
+++ b/test/CodeGen/Hexagon/ifcvt-edge-weight.ll
@@ -2,7 +2,7 @@
 ; Check that the edge weights are updated correctly after if-conversion.
 
 ; CHECK: BB#3:
-; CHECK: Successors according to CFG: BB#2(214748365) BB#1(1932735283)
+; CHECK: Successors according to CFG: BB#2({{[0-9a-fx/= ]+}}10.00%) BB#1({{[0-9a-fx/= ]+}}90.00%)
 @a = external global i32
 @d = external global i32
 
diff --git a/test/CodeGen/MIR/X86/newline-handling.mir b/test/CodeGen/MIR/X86/newline-handling.mir
index b5ed3b7f27e1..bce06d540114 100644
--- a/test/CodeGen/MIR/X86/newline-handling.mir
+++ b/test/CodeGen/MIR/X86/newline-handling.mir
@@ -35,7 +35,7 @@ liveins:
 # CHECK-LABEL: name: foo
 # CHECK: body: |
 # CHECK-NEXT: bb.0.entry:
-# CHECK-NEXT: successors: %bb.1.less(0), %bb.2.exit(0)
+# CHECK-NEXT: successors: %bb.1.less(0x40000000 / 0x80000000 = 50.00%), %bb.2.exit(0x40000000 / 0x80000000 = 50.00%)
 # CHECK-NEXT: liveins: %edi
 # CHECK:      CMP32ri8 %edi, 10, implicit-def %eflags
 # CHECK-NEXT: JG_1 %bb.2.exit, implicit killed %eflags
@@ -79,7 +79,7 @@ liveins:
 # CHECK-LABEL: name: bar
 # CHECK: body: |
 # CHECK-NEXT: bb.0.entry:
-# CHECK-NEXT: successors: %bb.1.less(0), %bb.2.exit(0)
+# CHECK-NEXT: successors: %bb.1.less(0x40000000 / 0x80000000 = 50.00%), %bb.2.exit(0x40000000 / 0x80000000 = 50.00%)
 # CHECK-NEXT: liveins: %edi
 # CHECK:      CMP32ri8 %edi, 10, implicit-def %eflags
 # CHECK-NEXT: JG_1 %bb.2.exit, implicit killed %eflags
diff --git a/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir b/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir
index fc5e5d640f7f..64af6121189a 100644
--- a/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir
+++ b/test/CodeGen/MIR/X86/successor-basic-blocks-weights.mir
@@ -1,6 +1,6 @@
 # RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
 # This test ensures that the MIR parser parses basic block successors and
-# weights correctly.
+# probabilities correctly.
 
 --- |
 
@@ -21,10 +21,10 @@
 name:            foo
 body: |
   ; CHECK-LABEL: bb.0.entry:
-  ; CHECK:         successors: %bb.1.less(16), %bb.2.exit(32)
+  ; CHECK:         successors: %bb.1.less({{[0-9a-fx/= ]+}}33.00%), %bb.2.exit({{[0-9a-fx/= ]+}}67.00%)
   ; CHECK-LABEL: bb.1.less:
   bb.0.entry:
-    successors: %bb.1.less (16), %bb.2.exit(32)
+    successors: %bb.1.less (33), %bb.2.exit(67)
     liveins: %edi
 
     CMP32ri8 %edi, 10, implicit-def %eflags
diff --git a/test/CodeGen/MIR/X86/successor-basic-blocks.mir b/test/CodeGen/MIR/X86/successor-basic-blocks.mir
index aa80fe9fbeef..a6c14f70bc7c 100644
--- a/test/CodeGen/MIR/X86/successor-basic-blocks.mir
+++ b/test/CodeGen/MIR/X86/successor-basic-blocks.mir
@@ -32,7 +32,7 @@
 name:            foo
 body: |
   ; CHECK-LABEL: bb.0.entry:
-  ; CHECK:         successors: %bb.1.less(0), %bb.2.exit(0)
+  ; CHECK:         successors: %bb.1.less(0x40000000 / 0x80000000 = 50.00%), %bb.2.exit(0x40000000 / 0x80000000 = 50.00%)
   ; CHECK-LABEL: bb.1.less:
   bb.0.entry:
     successors: %bb.1.less, %bb.2.exit
@@ -58,7 +58,7 @@ body: |
   ; Verify that we can have multiple lists of successors that will be merged
   ; into one.
   ; CHECK-LABEL: bb.0.entry:
-  ; CHECK:         successors: %bb.1(0), %bb.2(0)
+  ; CHECK:         successors: %bb.1(0x80000000 / 0x80000000 = 100.00%), %bb.2(0x00000000 / 0x80000000 = 0.00%)
   bb.0.entry:
     liveins: %edi
     successors: %bb.1
diff --git a/test/CodeGen/X86/MachineBranchProb.ll b/test/CodeGen/X86/MachineBranchProb.ll
index da0bf517ecfa..ee1c658d4c55 100644
--- a/test/CodeGen/X86/MachineBranchProb.ll
+++ b/test/CodeGen/X86/MachineBranchProb.ll
@@ -18,9 +18,9 @@ for.cond2:                                        ; preds = %for.inc, %for.cond
   %or.cond = or i1 %tobool, %cmp4
   br i1 %or.cond, label %for.inc20, label %for.inc, !prof !0
 ; CHECK: BB#1: derived from LLVM BB %for.cond2
-; CHECK: Successors according to CFG: BB#3(32756933) BB#4(2114726715)
+; CHECK: Successors according to CFG: BB#3({{[0-9a-fx/= ]+}}1.53%) BB#4({{[0-9a-fx/= ]+}}98.47%)
 ; CHECK: BB#4: derived from LLVM BB %for.cond2
-; CHECK: Successors according to CFG: BB#3(33264335) BB#2(2114219313)
+; CHECK: Successors according to CFG: BB#3({{[0-9a-fx/= ]+}}1.55%) BB#2({{[0-9a-fx/= ]+}}98.45%)
 
 for.inc:                                          ; preds = %for.cond2
   %shl = shl i32 %bit.0, 1
diff --git a/test/CodeGen/X86/catchpad-weight.ll b/test/CodeGen/X86/catchpad-weight.ll
index 9b06f2abc81c..e8b416845ec1 100644
--- a/test/CodeGen/X86/catchpad-weight.ll
+++ b/test/CodeGen/X86/catchpad-weight.ll
@@ -2,7 +2,7 @@
 
 ; Check if the edge weight to the catchpad is calculated correctly.
 
-; CHECK: Successors according to CFG: BB#3(2147481600) BB#1(2048) BB#4(1024) BB#6(512) BB#8(256)
+; CHECK: Successors according to CFG: BB#3(0x7ffff100 / 0x80000000 = 100.00%) BB#1(0x00000800 / 0x80000000 = 0.00%) BB#4(0x00000400 / 0x80000000 = 0.00%) BB#6(0x00000200 / 0x80000000 = 0.00%) BB#8(0x00000100 / 0x80000000 = 0.00%)
 
 target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64--windows-msvc18.0.0"
diff --git a/test/CodeGen/X86/stack-protector-weight.ll b/test/CodeGen/X86/stack-protector-weight.ll
index 16877ef70a31..dea66d28e3dd 100644
--- a/test/CodeGen/X86/stack-protector-weight.ll
+++ b/test/CodeGen/X86/stack-protector-weight.ll
@@ -2,13 +2,13 @@
 ; RUN: llc -mtriple=x86_64-apple-darwin -print-machineinstrs=expand-isel-pseudos -enable-selectiondag-sp=false %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=IR
 
 ; SELDAG: # Machine code for function test_branch_weights:
-; SELDAG: Successors according to CFG: BB#[[SUCCESS:[0-9]+]](2147481600) BB#[[FAILURE:[0-9]+]](2048)
+; SELDAG: Successors according to CFG: BB#[[SUCCESS:[0-9]+]]({{[0-9a-fx/= ]+}}100.00%) BB#[[FAILURE:[0-9]+]]
 ; SELDAG: BB#[[FAILURE]]:
 ; SELDAG: CALL64pcrel32 <es:__stack_chk_fail>
 ; SELDAG: BB#[[SUCCESS]]:
 
 ; IR: # Machine code for function test_branch_weights:
-; IR: Successors according to CFG: BB#[[SUCCESS:[0-9]+]](2147481600) BB#[[FAILURE:[0-9]+]](2048)
+; IR: Successors according to CFG: BB#[[SUCCESS:[0-9]+]]({{[0-9a-fx/= ]+}}100.00%) BB#[[FAILURE:[0-9]+]]
 ; IR: BB#[[SUCCESS]]:
 ; IR: BB#[[FAILURE]]:
 ; IR: CALL64pcrel32 <ga:@__stack_chk_fail>
diff --git a/test/CodeGen/X86/switch-edge-weight.ll b/test/CodeGen/X86/switch-edge-weight.ll
index 9026f6f05f0e..6f594868c7ad 100644
--- a/test/CodeGen/X86/switch-edge-weight.ll
+++ b/test/CodeGen/X86/switch-edge-weight.ll
@@ -34,22 +34,22 @@ sw.epilog:
 ; CHECK: BB#0:
 ; BB#0 to BB#4: [0, 1133] (65 = 60 + 5)
 ; BB#0 to BB#5: [1134, UINT32_MAX] (25 = 20 + 5)
-; CHECK: Successors according to CFG: BB#4(1550960411) BB#5(596523235)
+; CHECK: Successors according to CFG: BB#4({{[0-9a-fx/= ]+}}72.22%) BB#5({{[0-9a-fx/= ]+}}27.78%)
 ;
 ; CHECK: BB#4:
 ; BB#4 to BB#1: [155, 159] (50)
 ; BB#4 to BB#5: [0, 1133] - [155, 159] (15 = 10 + 5)
-; CHECK: Successors according to CFG: BB#1(1193046470) BB#7(357913941)
+; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}76.92%) BB#7({{[0-9a-fx/= ]+}}23.08%)
 ;
 ; CHECK: BB#5:
 ; BB#5 to BB#1: {1140} (10)
 ; BB#5 to BB#6: [1134, UINT32_MAX] - {1140} (15 = 10 + 5)
-; CHECK: Successors according to CFG: BB#1(238609294) BB#6(357913941)
+; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}40.00%) BB#6({{[0-9a-fx/= ]+}}60.00%)
 ;
 ; CHECK: BB#6:
 ; BB#6 to BB#1: {1134} (10)
 ; BB#6 to BB#2: [1134, UINT32_MAX] - {1134, 1140} (5)
-; CHECK: Successors according to CFG: BB#1(238609294) BB#2(119304647)
+; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}66.67%) BB#2({{[0-9a-fx/= ]+}}33.33%)
 }
 
 ; CHECK-LABEL: test2
@@ -102,7 +102,7 @@ sw.epilog:
 ; CHECK: BB#0:
 ; BB#0 to BB#6: {0} + [15, UINT32_MAX] (5)
 ; BB#0 to BB#8: [1, 14] (jump table) (65 = 60 + 5)
-; CHECK: Successors according to CFG: BB#6(153391689) BB#8(1994091957)
+; CHECK: Successors according to CFG: BB#6({{[0-9a-fx/= ]+}}7.14%) BB#8({{[0-9a-fx/= ]+}}92.86%
 ;
 ; CHECK: BB#8:
 ; BB#8 to BB#1: {1} (10)
@@ -111,7 +111,7 @@ sw.epilog:
 ; BB#8 to BB#3: {11} (10)
 ; BB#8 to BB#4: {12} (10)
 ; BB#8 to BB#5: {13, 14} (20)
-; CHECK: Successors according to CFG: BB#1(306783378) BB#6(153391689) BB#2(306783378) BB#3(306783378) BB#4(306783378) BB#5(613566756)
+; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}14.29%) BB#6({{[0-9a-fx/= ]+}}7.14%) BB#2({{[0-9a-fx/= ]+}}14.29%) BB#3({{[0-9a-fx/= ]+}}14.29%) BB#4({{[0-9a-fx/= ]+}}14.29%) BB#5({{[0-9a-fx/= ]+}}28.57%)
 }
 
 ; CHECK-LABEL: test3
@@ -163,7 +163,7 @@ sw.epilog:
 ; CHECK: BB#0:
 ; BB#0 to BB#6: [0, 9] + [15, UINT32_MAX] {10}
 ; BB#0 to BB#8: [10, 14] (jump table) (50)
-; CHECK: Successors according to CFG: BB#6(357913941) BB#8(1789569705)
+; CHECK: Successors according to CFG: BB#6({{[0-9a-fx/= ]+}}16.67%) BB#8({{[0-9a-fx/= ]+}}83.33%)
 ;
 ; CHECK: BB#8:
 ; BB#8 to BB#1: {10} (10)
@@ -171,7 +171,7 @@ sw.epilog:
 ; BB#8 to BB#3: {12} (10)
 ; BB#8 to BB#4: {13} (10)
 ; BB#8 to BB#5: {14} (10)
-; CHECK: Successors according to CFG: BB#1(357913941) BB#2(357913941) BB#3(357913941) BB#4(357913941) BB#5(357913941)
+; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}20.00%) BB#2({{[0-9a-fx/= ]+}}20.00%) BB#3({{[0-9a-fx/= ]+}}20.00%) BB#4({{[0-9a-fx/= ]+}}20.00%) BB#5({{[0-9a-fx/= ]+}}20.00%)
 }
 
 ; CHECK-LABEL: test4
@@ -216,12 +216,12 @@ sw.epilog:
 ; CHECK: BB#0:
 ; BB#0 to BB#6: [0, 110] + [116, UINT32_MAX] (20)
 ; BB#0 to BB#7: [111, 115] (bit test) (50)
-; CHECK: Successors according to CFG: BB#6(613566756) BB#7(1533916890)
+; CHECK: Successors according to CFG: BB#6({{[0-9a-fx/= ]+}}28.57%) BB#7({{[0-9a-fx/= ]+}}71.43%)
 ;
 ; CHECK: BB#7:
 ; BB#7 to BB#2: {111, 114, 115} (30)
 ; BB#7 to BB#3: {112, 113} (20)
-; CHECK: Successors according to CFG: BB#2(920350134) BB#3(613566756)
+; CHECK: Successors according to CFG: BB#2({{[0-9a-fx/= ]+}}60.00%) BB#3({{[0-9a-fx/= ]+}}40.00%)
 }
 
 ; CHECK-LABEL: test5
@@ -273,7 +273,7 @@ sw.epilog:
 ; CHECK: BB#0:
 ; BB#0 to BB#6: [10, UINT32_MAX] (15)
 ; BB#0 to BB#8: [1, 5, 7, 9] (jump table) (45)
-; CHECK: Successors according to CFG: BB#8(536870912) BB#9(1610612734)
+; CHECK: Successors according to CFG: BB#8({{[0-9a-fx/= ]+}}25.00%) BB#9({{[0-9a-fx/= ]+}}75.00%)
 }
 
 !1 = !{!"branch_weights", i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10} 
diff --git a/test/CodeGen/X86/switch-jump-table.ll b/test/CodeGen/X86/switch-jump-table.ll
index 3cfee1cd80e6..896a067da230 100644
--- a/test/CodeGen/X86/switch-jump-table.ll
+++ b/test/CodeGen/X86/switch-jump-table.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -mtriple=i686-pc-gnu-linux < %s | FileCheck %s -check-prefix=CHECK
-; RUN: llc -mtriple=i686-pc-gnu-linux -print-machineinstrs=expand-isel-pseudos %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=CHECK-JT-WEIGHT
+; RUN: llc -mtriple=i686-pc-gnu-linux -print-machineinstrs=expand-isel-pseudos %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=CHECK-JT-PROB
 
 
 ; An unreachable default destination is replaced with the most popular case label.
@@ -54,9 +54,9 @@ default:
 ; Check if branch probabilities are correctly assigned to the jump table.
 
 define void @bar(i32 %x, i32* %to) {
-; CHECK-JT-WEIGHT-LABEL: bar:
-; CHECK-JT-WEIGHT: Successors according to CFG: BB#6(306783378) BB#8(1840700268)
-; CHECK-JT-WEIGHT: Successors according to CFG: BB#1(306783378) BB#2(306783378) BB#3(306783378) BB#4(306783378) BB#5(613566756)
+; CHECK-JT-PROB-LABEL: bar:
+; CHECK-JT-PROB: Successors according to CFG: BB#6({{[0-9a-fx/= ]+}}14.29%) BB#8({{[0-9a-fx/= ]+}}85.71%)
+; CHECK-JT-PROB: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}16.67%) BB#2({{[0-9a-fx/= ]+}}16.67%) BB#3({{[0-9a-fx/= ]+}}16.67%) BB#4({{[0-9a-fx/= ]+}}16.67%) BB#5({{[0-9a-fx/= ]+}}33.33%)
 
 entry:
   switch i32 %x, label %default [

From ed8c8fab46478302926233d2339c7c28b3711edc Mon Sep 17 00:00:00 2001
From: Davide Italiano <davide@freebsd.org>
Date: Tue, 1 Dec 2015 05:33:24 +0000
Subject: [PATCH 053/186] [Windows] Partially revert r254363 until I can test
 the right fix.

Reported by:  David Blaikie


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254378 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Support/Windows/DynamicLibrary.inc | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/lib/Support/Windows/DynamicLibrary.inc b/lib/Support/Windows/DynamicLibrary.inc
index e612283e630f..17418b015c75 100644
--- a/lib/Support/Windows/DynamicLibrary.inc
+++ b/lib/Support/Windows/DynamicLibrary.inc
@@ -60,8 +60,12 @@ DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
     if (OpenedHandles == 0)
       OpenedHandles = new DenseSet<HMODULE>();
 
-    if (!fEnumerateLoadedModules)
-      assert(loadDebugHelp() && "These APIs should always be available");
+    if (!fEnumerateLoadedModules) {
+      if (!loadDebugHelp()) {
+        assert(false && "These APIs should always be available");
+        return DynamicLibrary();
+      }
+    }
 
     fEnumerateLoadedModules(GetCurrentProcess(), ELM_Callback, 0);
     // Dummy library that represents "search all handles".

From 9654ce949f87b5e7713504ad79fb6bee41b3ce1c Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Tue, 1 Dec 2015 06:12:59 +0000
Subject: [PATCH 054/186] Use array_lengthof instead of manually calculating
 it. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254380 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Support/Unix/Signals.inc | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc
index de98d4adf996..061cdb3da216 100644
--- a/lib/Support/Unix/Signals.inc
+++ b/lib/Support/Unix/Signals.inc
@@ -86,12 +86,11 @@ static unsigned NumRegisteredSignals = 0;
 static struct {
   struct sigaction SA;
   int SigNo;
-} RegisteredSignalInfo[(sizeof(IntSigs)+sizeof(KillSigs))/sizeof(KillSigs[0])];
+} RegisteredSignalInfo[array_lengthof(IntSigs) + array_lengthof(KillSigs)];
 
 
 static void RegisterHandler(int Signal) {
-  assert(NumRegisteredSignals <
-         sizeof(RegisteredSignalInfo)/sizeof(RegisteredSignalInfo[0]) &&
+  assert(NumRegisteredSignals < array_lengthof(RegisteredSignalInfo) &&
          "Out of space for signal handlers!");
 
   struct sigaction NewHandler;

From 08bbdebfc9c6ec9a83db984d97687414ef9e2989 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Tue, 1 Dec 2015 06:13:01 +0000
Subject: [PATCH 055/186] [ARM] Use range-based for loops to avoid the need for
 calculating an array size that I would have otherwise cconverted to
 array_lengthof. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254381 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMFrameLowering.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index 5b3229456317..c5990bb7d1fb 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -1889,10 +1889,9 @@ void ARMFrameLowering::adjustForSegmentedStacks(
   // first in the list.
   MachineBasicBlock *AddedBlocks[] = {PrevStackMBB, McrMBB, GetMBB, AllocMBB,
                                       PostStackMBB};
-  const int NbAddedBlocks = sizeof(AddedBlocks) / sizeof(AddedBlocks[0]);
 
-  for (int Idx = 0; Idx < NbAddedBlocks; ++Idx)
-    BeforePrologueRegion.insert(AddedBlocks[Idx]);
+  for (MachineBasicBlock *B : AddedBlocks)
+    BeforePrologueRegion.insert(B);
 
   for (const auto &LI : PrologueMBB.liveins()) {
     for (MachineBasicBlock *PredBB : BeforePrologueRegion)
@@ -1901,9 +1900,9 @@ void ARMFrameLowering::adjustForSegmentedStacks(
 
   // Remove the newly added blocks from the list, since we know
   // we do not have to do the following updates for them.
-  for (int Idx = 0; Idx < NbAddedBlocks; ++Idx) {
-    BeforePrologueRegion.erase(AddedBlocks[Idx]);
-    MF.insert(PrologueMBB.getIterator(), AddedBlocks[Idx]);
+  for (MachineBasicBlock *B : AddedBlocks) {
+    BeforePrologueRegion.erase(B);
+    MF.insert(PrologueMBB.getIterator(), B);
   }
 
   for (MachineBasicBlock *MBB : BeforePrologueRegion) {

From 586117052154a19a311af41201d360c698d33499 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Tue, 1 Dec 2015 06:13:04 +0000
Subject: [PATCH 056/186] [Hexagon] Use ArrayRef to avoid needing to calculate
 an array size. Interestingly the original code may have had a bug because it
 was passing the byte size of a uint16_t array instead of the number of
 entries.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254382 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../Disassembler/HexagonDisassembler.cpp      | 27 ++++++++-----------
 1 file changed, 11 insertions(+), 16 deletions(-)

diff --git a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
index 4036650bf74b..51bce8fb01fc 100644
--- a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
+++ b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
@@ -471,12 +471,13 @@ extern const MCInstrDesc HexagonInsts[];
 }
 
 static DecodeStatus DecodeRegisterClass(MCInst &Inst, unsigned RegNo,
-                                        const uint16_t Table[], size_t Size) {
-  if (RegNo < Size) {
+                                        ArrayRef<uint16_t> Table) {
+  if (RegNo < Table.size()) {
     Inst.addOperand(MCOperand::createReg(Table[RegNo]));
     return MCDisassembler::Success;
-  } else
-    return MCDisassembler::Fail;
+  }
+
+  return MCDisassembler::Fail;
 }
 
 static DecodeStatus DecodeIntRegsLow8RegisterClass(MCInst &Inst, unsigned RegNo,
@@ -497,8 +498,7 @@ static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo,
       Hexagon::R25, Hexagon::R26, Hexagon::R27, Hexagon::R28, Hexagon::R29,
       Hexagon::R30, Hexagon::R31};
 
-  return (DecodeRegisterClass(Inst, RegNo, IntRegDecoderTable,
-                              sizeof(IntRegDecoderTable)));
+  return (DecodeRegisterClass(Inst, RegNo, IntRegDecoderTable));
 }
 
 static DecodeStatus DecodeVectorRegsRegisterClass(MCInst &Inst, unsigned RegNo,
@@ -513,8 +513,7 @@ static DecodeStatus DecodeVectorRegsRegisterClass(MCInst &Inst, unsigned RegNo,
       Hexagon::V25, Hexagon::V26, Hexagon::V27, Hexagon::V28, Hexagon::V29,
       Hexagon::V30, Hexagon::V31};
 
-  return (DecodeRegisterClass(Inst, RegNo, VecRegDecoderTable,
-                              sizeof(VecRegDecoderTable)));
+  return (DecodeRegisterClass(Inst, RegNo, VecRegDecoderTable));
 }
 
 static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo,
@@ -526,8 +525,7 @@ static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo,
       Hexagon::D8,  Hexagon::D9,  Hexagon::D10, Hexagon::D11,
       Hexagon::D12, Hexagon::D13, Hexagon::D14, Hexagon::D15};
 
-  return (DecodeRegisterClass(Inst, RegNo >> 1, DoubleRegDecoderTable,
-                              sizeof(DoubleRegDecoderTable)));
+  return (DecodeRegisterClass(Inst, RegNo >> 1, DoubleRegDecoderTable));
 }
 
 static DecodeStatus DecodeVecDblRegsRegisterClass(MCInst &Inst, unsigned RegNo,
@@ -539,8 +537,7 @@ static DecodeStatus DecodeVecDblRegsRegisterClass(MCInst &Inst, unsigned RegNo,
       Hexagon::W8,  Hexagon::W9,  Hexagon::W10, Hexagon::W11,
       Hexagon::W12, Hexagon::W13, Hexagon::W14, Hexagon::W15};
 
-  return (DecodeRegisterClass(Inst, RegNo >> 1, VecDblRegDecoderTable,
-                              sizeof(VecDblRegDecoderTable)));
+  return (DecodeRegisterClass(Inst, RegNo >> 1, VecDblRegDecoderTable));
 }
 
 static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo,
@@ -549,8 +546,7 @@ static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo,
   static const uint16_t PredRegDecoderTable[] = {Hexagon::P0, Hexagon::P1,
                                                  Hexagon::P2, Hexagon::P3};
 
-  return (DecodeRegisterClass(Inst, RegNo, PredRegDecoderTable,
-                              sizeof(PredRegDecoderTable)));
+  return (DecodeRegisterClass(Inst, RegNo, PredRegDecoderTable));
 }
 
 static DecodeStatus DecodeVecPredRegsRegisterClass(MCInst &Inst, unsigned RegNo,
@@ -559,8 +555,7 @@ static DecodeStatus DecodeVecPredRegsRegisterClass(MCInst &Inst, unsigned RegNo,
   static const uint16_t VecPredRegDecoderTable[] = {Hexagon::Q0, Hexagon::Q1,
                                                     Hexagon::Q2, Hexagon::Q3};
 
-  return (DecodeRegisterClass(Inst, RegNo, VecPredRegDecoderTable,
-                              sizeof(VecPredRegDecoderTable)));
+  return (DecodeRegisterClass(Inst, RegNo, VecPredRegDecoderTable));
 }
 
 static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo,

From cac7246b4c5ae1841bbcfd9a9ea30f062b2f1963 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Tue, 1 Dec 2015 06:13:06 +0000
Subject: [PATCH 057/186] Use array_lengthof instead of manually calculating
 it. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254383 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
index 51bce8fb01fc..ee7f569c4bfd 100644
--- a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
+++ b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
@@ -568,7 +568,7 @@ static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo,
     Hexagon::CS0, Hexagon::CS1, Hexagon::UPCL, Hexagon::UPC
   };
 
-  if (RegNo >= sizeof(CtrlRegDecoderTable) / sizeof(CtrlRegDecoderTable[0]))
+  if (RegNo >= array_lengthof(CtrlRegDecoderTable))
     return MCDisassembler::Fail;
 
   if (CtrlRegDecoderTable[RegNo] == Hexagon::NoRegister)
@@ -592,7 +592,7 @@ static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo,
       Hexagon::UPC,    Hexagon::NoRegister
   };
 
-  if (RegNo >= sizeof(CtrlReg64DecoderTable) / sizeof(CtrlReg64DecoderTable[0]))
+  if (RegNo >= array_lengthof(CtrlReg64DecoderTable))
     return MCDisassembler::Fail;
 
   if (CtrlReg64DecoderTable[RegNo] == Hexagon::NoRegister)

From 97beeddf5b474f56e63f9ebb956deea036f4bec4 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Tue, 1 Dec 2015 06:13:08 +0000
Subject: [PATCH 058/186] [Hexagon] Use array_lengthof and const correct and
 type correct the array and array size. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254384 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../Hexagon/Disassembler/HexagonDisassembler.cpp     | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
index ee7f569c4bfd..1db59e1dd99d 100644
--- a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
+++ b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
@@ -779,7 +779,7 @@ static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
 // Please note that the instructions must be ordered in the descending order
 // of their opcode.
 // HexagonII::INST_ICLASS_ST
-static unsigned int StoreConditionalOpcodeData[][2] = {
+static const unsigned int StoreConditionalOpcodeData[][2] = {
     {S4_pstorerdfnew_abs, 0xafc02084},
     {S4_pstorerdtnew_abs, 0xafc02080},
     {S4_pstorerdf_abs, 0xafc00084},
@@ -825,18 +825,16 @@ static unsigned int LoadStoreOpcodeData[][2] = {{L4_loadrd_abs, 0x49c00000},
                                                 {S2_storerfabs, 0x48600000},
                                                 {S2_storerhabs, 0x48400000},
                                                 {S2_storerbabs, 0x48000000}};
-static int NumCondS =
-    sizeof(StoreConditionalOpcodeData) / sizeof(StoreConditionalOpcodeData[0]);
-static int NumLS = sizeof(LoadStoreOpcodeData) / sizeof(LoadStoreOpcodeData[0]);
+static const size_t NumCondS = array_lengthof(StoreConditionalOpcodeData);
+static const size_t NumLS = array_lengthof(LoadStoreOpcodeData);
 
 static DecodeStatus decodeSpecial(MCInst &MI, uint32_t insn) {
 
   unsigned MachineOpcode = 0;
   unsigned LLVMOpcode = 0;
-  int i;
 
   if ((insn & HexagonII::INST_ICLASS_MASK) == HexagonII::INST_ICLASS_ST) {
-    for (i = 0; i < NumCondS; ++i) {
+    for (size_t i = 0; i < NumCondS; ++i) {
       if ((insn & StoreConditionalOpcodeData[i][1]) ==
           StoreConditionalOpcodeData[i][1]) {
         MachineOpcode = StoreConditionalOpcodeData[i][1];
@@ -846,7 +844,7 @@ static DecodeStatus decodeSpecial(MCInst &MI, uint32_t insn) {
     }
   }
   if ((insn & HexagonII::INST_ICLASS_MASK) == HexagonII::INST_ICLASS_LD_ST_2) {
-    for (i = 0; i < NumLS; ++i) {
+    for (size_t i = 0; i < NumLS; ++i) {
       if ((insn & LoadStoreOpcodeData[i][1]) == LoadStoreOpcodeData[i][1]) {
         MachineOpcode = LoadStoreOpcodeData[i][1];
         LLVMOpcode = LoadStoreOpcodeData[i][0];

From b6f477da5ec08012d2fa8739139eac13123411da Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Tue, 1 Dec 2015 06:13:10 +0000
Subject: [PATCH 059/186] [Hexagon] Use std::begin() and std::end() instead of
 doing the same manually. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254385 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
index 34817cd98f2f..e6194f61a6ba 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
@@ -81,8 +81,7 @@ static const std::pair<unsigned, unsigned> opcodeData[] = {
     std::make_pair((unsigned)V4_SS2_storewi1, 4352)};
 
 static std::map<unsigned, unsigned>
-    subinstOpcodeMap(opcodeData,
-                     opcodeData + sizeof(opcodeData) / sizeof(opcodeData[0]));
+    subinstOpcodeMap(std::begin(opcodeData), std::end(opcodeData));
 
 bool HexagonMCInstrInfo::isDuplexPairMatch(unsigned Ga, unsigned Gb) {
   switch (Ga) {

From 9a293e525c52342f7d286ac999dc84d04c8897e9 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Tue, 1 Dec 2015 06:13:13 +0000
Subject: [PATCH 060/186] [X86] Use array_lengthof instead of calculating
 manually. Also change index types to size_t to match.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254386 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86InstrInfo.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index aaeef465bf50..04b68cd509fb 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -3517,23 +3517,23 @@ unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(MachineInstr *MI,
   bool IsIntrinOpcode;
   isFMA3(Opc, &IsIntrinOpcode);
 
-  unsigned GroupsNum;
+  size_t GroupsNum;
   const unsigned (*OpcodeGroups)[3];
   if (IsIntrinOpcode) {
-    GroupsNum = sizeof(IntrinOpcodeGroups) / sizeof(IntrinOpcodeGroups[0]);
+    GroupsNum = array_lengthof(IntrinOpcodeGroups);
     OpcodeGroups = IntrinOpcodeGroups;
   } else {
-    GroupsNum = sizeof(RegularOpcodeGroups) / sizeof(RegularOpcodeGroups[0]);
+    GroupsNum = array_lengthof(RegularOpcodeGroups);
     OpcodeGroups = RegularOpcodeGroups;
   }
 
   const unsigned *FoundOpcodesGroup = nullptr;
-  unsigned FormIndex;
+  size_t FormIndex;
 
   // Look for the input opcode in the corresponding opcodes table.
-  unsigned GroupIndex = 0;
-  for (; GroupIndex < GroupsNum && !FoundOpcodesGroup; GroupIndex++) {
-    for (FormIndex = 0; FormIndex < FormsNum; FormIndex++) {
+  for (size_t GroupIndex = 0; GroupIndex < GroupsNum && !FoundOpcodesGroup;
+         ++GroupIndex) {
+    for (FormIndex = 0; FormIndex < FormsNum; ++FormIndex) {
       if (OpcodeGroups[GroupIndex][FormIndex] == Opc) {
         FoundOpcodesGroup = OpcodeGroups[GroupIndex];
         break;

From b676925e7b5b51a5dda9b5af06594a830ae8915a Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Tue, 1 Dec 2015 06:13:15 +0000
Subject: [PATCH 061/186] [X86] Use range-based for loops. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254387 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86InstrInfo.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 04b68cd509fb..12da3a9319e6 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -6715,16 +6715,16 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = {
 // domains, but they require a bit more work than just switching opcodes.
 
 static const uint16_t *lookup(unsigned opcode, unsigned domain) {
-  for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i)
-    if (ReplaceableInstrs[i][domain-1] == opcode)
-      return ReplaceableInstrs[i];
+  for (const uint16_t (&Row)[3] : ReplaceableInstrs)
+    if (Row[domain-1] == opcode)
+      return Row;
   return nullptr;
 }
 
 static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) {
-  for (unsigned i = 0, e = array_lengthof(ReplaceableInstrsAVX2); i != e; ++i)
-    if (ReplaceableInstrsAVX2[i][domain-1] == opcode)
-      return ReplaceableInstrsAVX2[i];
+  for (const uint16_t (&Row)[3] : ReplaceableInstrsAVX2)
+    if (Row[domain-1] == opcode)
+      return Row;
   return nullptr;
 }
 

From c8fda7a60ac134e3658d9db112319897b8a6f36f Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Tue, 1 Dec 2015 06:13:16 +0000
Subject: [PATCH 062/186] [X86] Fix patterns for memory forms of FP FSUBR and
 FDIVR. They need to have memory on the left hand side of the fsub/fdiv
 operations in their patterns.

Not sure how to test this. I noticed by inspection in the isel tables where the same pattern tried to produce DIV and DIVR or SUB and SUBR.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254388 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86InstrFPStack.td | 108 +++++++++++++++++++-----------
 1 file changed, 69 insertions(+), 39 deletions(-)

diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td
index 51648c6c567e..8fecc9b1a3fd 100644
--- a/lib/Target/X86/X86InstrFPStack.td
+++ b/lib/Target/X86/X86InstrFPStack.td
@@ -137,69 +137,99 @@ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP,
 // The FopST0 series are not included here because of the irregularities
 // in where the 'r' goes in assembly output.
 // These instructions cannot address 80-bit memory.
-multiclass FPBinary<SDNode OpNode, Format fp, string asmstring> {
+multiclass FPBinary<SDNode OpNode, Format fp, string asmstring,
+                    bit Forward = 1> {
 // ST(0) = ST(0) + [mem]
 def _Fp32m  : FpIf32<(outs RFP32:$dst),
                      (ins RFP32:$src1, f32mem:$src2), OneArgFPRW,
-                  [(set RFP32:$dst,
-                    (OpNode RFP32:$src1, (loadf32 addr:$src2)))]>;
+                  [!if(Forward,
+                       (set RFP32:$dst,
+                        (OpNode RFP32:$src1, (loadf32 addr:$src2))),
+                       (set RFP32:$dst,
+                        (OpNode (loadf32 addr:$src2), RFP32:$src1)))]>;
 def _Fp64m  : FpIf64<(outs RFP64:$dst),
                      (ins RFP64:$src1, f64mem:$src2), OneArgFPRW,
-                  [(set RFP64:$dst,
-                    (OpNode RFP64:$src1, (loadf64 addr:$src2)))]>;
+                  [!if(Forward,
+                       (set RFP64:$dst,
+                        (OpNode RFP64:$src1, (loadf64 addr:$src2))),
+                       (set RFP64:$dst,
+                        (OpNode (loadf64 addr:$src2), RFP64:$src1)))]>;
 def _Fp64m32: FpIf64<(outs RFP64:$dst),
                      (ins RFP64:$src1, f32mem:$src2), OneArgFPRW,
-                  [(set RFP64:$dst,
-                    (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2))))]>;
+                  [!if(Forward,
+                       (set RFP64:$dst,
+                        (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2)))),
+                       (set RFP64:$dst,
+                        (OpNode (f64 (extloadf32 addr:$src2)), RFP64:$src1)))]>;
 def _Fp80m32: FpI_<(outs RFP80:$dst),
                    (ins RFP80:$src1, f32mem:$src2), OneArgFPRW,
-                  [(set RFP80:$dst,
-                    (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2))))]>;
+                  [!if(Forward,
+                       (set RFP80:$dst,
+                        (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2)))),
+                       (set RFP80:$dst,
+                        (OpNode (f80 (extloadf32 addr:$src2)), RFP80:$src1)))]>;
 def _Fp80m64: FpI_<(outs RFP80:$dst),
                    (ins RFP80:$src1, f64mem:$src2), OneArgFPRW,
-                  [(set RFP80:$dst,
-                    (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2))))]>;
+                  [!if(Forward,
+                       (set RFP80:$dst,
+                        (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2)))),
+                       (set RFP80:$dst,
+                        (OpNode (f80 (extloadf64 addr:$src2)), RFP80:$src1)))]>;
+let mayLoad = 1 in
 def _F32m  : FPI<0xD8, fp, (outs), (ins f32mem:$src),
-                 !strconcat("f", asmstring, "{s}\t$src")> {
-  let mayLoad = 1;
-}
+                 !strconcat("f", asmstring, "{s}\t$src")>;
+let mayLoad = 1 in
 def _F64m  : FPI<0xDC, fp, (outs), (ins f64mem:$src),
-                 !strconcat("f", asmstring, "{l}\t$src")> {
-  let mayLoad = 1;
-}
+                 !strconcat("f", asmstring, "{l}\t$src")>;
 // ST(0) = ST(0) + [memint]
 def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2),
                        OneArgFPRW,
-                    [(set RFP32:$dst, (OpNode RFP32:$src1,
-                                       (X86fild addr:$src2, i16)))]>;
+                       [!if(Forward,
+                            (set RFP32:$dst,
+                             (OpNode RFP32:$src1, (X86fild addr:$src2, i16))),
+                            (set RFP32:$dst,
+                             (OpNode (X86fild addr:$src2, i16), RFP32:$src1)))]>;
 def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2),
                        OneArgFPRW,
-                    [(set RFP32:$dst, (OpNode RFP32:$src1,
-                                       (X86fild addr:$src2, i32)))]>;
+                       [!if(Forward,
+                            (set RFP32:$dst,
+                             (OpNode RFP32:$src1, (X86fild addr:$src2, i32))),
+                            (set RFP32:$dst,
+                             (OpNode (X86fild addr:$src2, i32), RFP32:$src1)))]>;
 def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2),
                        OneArgFPRW,
-                    [(set RFP64:$dst, (OpNode RFP64:$src1,
-                                       (X86fild addr:$src2, i16)))]>;
+                       [!if(Forward,
+                            (set RFP64:$dst,
+                             (OpNode RFP64:$src1, (X86fild addr:$src2, i16))),
+                            (set RFP64:$dst,
+                             (OpNode (X86fild addr:$src2, i16), RFP64:$src1)))]>;
 def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2),
                        OneArgFPRW,
-                    [(set RFP64:$dst, (OpNode RFP64:$src1,
-                                       (X86fild addr:$src2, i32)))]>;
+                       [!if(Forward,
+                            (set RFP64:$dst,
+                             (OpNode RFP64:$src1, (X86fild addr:$src2, i32))),
+                            (set RFP64:$dst,
+                             (OpNode (X86fild addr:$src2, i32), RFP64:$src1)))]>;
 def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2),
-                       OneArgFPRW,
-                    [(set RFP80:$dst, (OpNode RFP80:$src1,
-                                       (X86fild addr:$src2, i16)))]>;
+                     OneArgFPRW,
+                     [!if(Forward,
+                          (set RFP80:$dst,
+                           (OpNode RFP80:$src1, (X86fild addr:$src2, i16))),
+                          (set RFP80:$dst,
+                           (OpNode (X86fild addr:$src2, i16), RFP80:$src1)))]>;
 def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2),
-                       OneArgFPRW,
-                    [(set RFP80:$dst, (OpNode RFP80:$src1,
-                                       (X86fild addr:$src2, i32)))]>;
+                     OneArgFPRW,
+                     [!if(Forward,
+                          (set RFP80:$dst,
+                           (OpNode RFP80:$src1, (X86fild addr:$src2, i32))),
+                          (set RFP80:$dst,
+                           (OpNode (X86fild addr:$src2, i32), RFP80:$src1)))]>;
+let mayLoad = 1 in
 def _FI16m  : FPI<0xDE, fp, (outs), (ins i16mem:$src),
-                  !strconcat("fi", asmstring, "{s}\t$src")> {
-  let mayLoad = 1;
-}
+                  !strconcat("fi", asmstring, "{s}\t$src")>;
+let mayLoad = 1 in
 def _FI32m  : FPI<0xDA, fp, (outs), (ins i32mem:$src),
-                  !strconcat("fi", asmstring, "{l}\t$src")> {
-  let mayLoad = 1;
-}
+                  !strconcat("fi", asmstring, "{l}\t$src")>;
 }
 
 let Defs = [FPSW] in {
@@ -213,14 +243,14 @@ defm DIV : FPBinary_rr<fdiv>;
 let SchedRW = [WriteFAddLd] in {
 defm ADD : FPBinary<fadd, MRM0m, "add">;
 defm SUB : FPBinary<fsub, MRM4m, "sub">;
-defm SUBR: FPBinary<fsub ,MRM5m, "subr">;
+defm SUBR: FPBinary<fsub ,MRM5m, "subr", 0>;
 }
 let SchedRW = [WriteFMulLd] in {
 defm MUL : FPBinary<fmul, MRM1m, "mul">;
 }
 let SchedRW = [WriteFDivLd] in {
 defm DIV : FPBinary<fdiv, MRM6m, "div">;
-defm DIVR: FPBinary<fdiv, MRM7m, "divr">;
+defm DIVR: FPBinary<fdiv, MRM7m, "divr", 0>;
 }
 }
 

From b1ce35cdfb58d4e93de58445aef744cded16b386 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@playingwithpointers.com>
Date: Tue, 1 Dec 2015 07:49:23 +0000
Subject: [PATCH 063/186] Introduce a range version of std::any_of, and use it
 in SCEV

Reviewers: dblaikie, pcc

Subscribers: llvm-commits

Differential Revision: http://reviews.llvm.org/D15063

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254390 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ADT/STLExtras.h       | 8 ++++++++
 lib/Analysis/ScalarEvolution.cpp   | 7 +++----
 tools/llvm-pdbdump/LinePrinter.cpp | 9 +++------
 3 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/include/llvm/ADT/STLExtras.h b/include/llvm/ADT/STLExtras.h
index 1bd3b291e0ef..3655a20d8831 100644
--- a/include/llvm/ADT/STLExtras.h
+++ b/include/llvm/ADT/STLExtras.h
@@ -371,6 +371,14 @@ bool all_of(R &&Range, UnaryPredicate &&P) {
                      std::forward<UnaryPredicate>(P));
 }
 
+/// Provide wrappers to std::any_of which take ranges instead of having to pass
+/// begin/end explicitly.
+template <typename R, class UnaryPredicate>
+bool any_of(R &&Range, UnaryPredicate &&P) {
+  return std::any_of(Range.begin(), Range.end(),
+                     std::forward<UnaryPredicate>(P));
+}
+
 //===----------------------------------------------------------------------===//
 //     Extra additions to <memory>
 //===----------------------------------------------------------------------===//
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index 23daeb67d653..4c8b6e7de84e 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -8403,8 +8403,7 @@ const SCEV *SCEVAddRecExpr::getNumIterationsInRange(ConstantRange Range,
 
   // The only time we can solve this is when we have all constant indices.
   // Otherwise, we cannot determine the overflow conditions.
-  if (std::any_of(op_begin(), op_end(),
-                  [](const SCEV *Op) { return !isa<SCEVConstant>(Op);}))
+  if (any_of(operands(), [](const SCEV *Op) { return !isa<SCEVConstant>(Op); }))
     return SE.getCouldNotCompute();
 
   // Okay at this point we know that all elements of the chrec are constants and
@@ -9694,8 +9693,8 @@ bool SCEVUnionPredicate::implies(const SCEVPredicate *N) const {
     return false;
   auto &SCEVPreds = ScevPredsIt->second;
 
-  return std::any_of(SCEVPreds.begin(), SCEVPreds.end(),
-                     [N](const SCEVPredicate *I) { return I->implies(N); });
+  return any_of(SCEVPreds,
+                [N](const SCEVPredicate *I) { return I->implies(N); });
 }
 
 const SCEV *SCEVUnionPredicate::getExpr() const { return nullptr; }
diff --git a/tools/llvm-pdbdump/LinePrinter.cpp b/tools/llvm-pdbdump/LinePrinter.cpp
index 4f3ee54c7691..a43727f02b5e 100644
--- a/tools/llvm-pdbdump/LinePrinter.cpp
+++ b/tools/llvm-pdbdump/LinePrinter.cpp
@@ -11,15 +11,12 @@
 
 #include "llvm-pdbdump.h"
 
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Regex.h"
 
 #include <algorithm>
 
 namespace {
-template <class T, class Pred> bool any_of_range(T &&R, Pred P) {
-  return std::any_of(R.begin(), R.end(), P);
-}
-
 bool IsItemExcluded(llvm::StringRef Item,
                     std::list<llvm::Regex> &IncludeFilters,
                     std::list<llvm::Regex> &ExcludeFilters) {
@@ -30,10 +27,10 @@ bool IsItemExcluded(llvm::StringRef Item,
 
   // Include takes priority over exclude.  If the user specified include
   // filters, and none of them include this item, them item is gone.
-  if (!IncludeFilters.empty() && !any_of_range(IncludeFilters, match_pred))
+  if (!IncludeFilters.empty() && !any_of(IncludeFilters, match_pred))
     return true;
 
-  if (any_of_range(ExcludeFilters, match_pred))
+  if (any_of(ExcludeFilters, match_pred))
     return true;
 
   return false;

From 94214379c485ff7c89cf49c4c56aca8ec790ec4a Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@playingwithpointers.com>
Date: Tue, 1 Dec 2015 07:49:27 +0000
Subject: [PATCH 064/186] Introduce a range version of std::find, and use in
 SCEV

Reviewers: dblaikie, pcc

Subscribers: llvm-commits

Differential Revision: http://reviews.llvm.org/D15064

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254391 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ADT/STLExtras.h     | 7 +++++++
 lib/Analysis/ScalarEvolution.cpp | 3 +--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/include/llvm/ADT/STLExtras.h b/include/llvm/ADT/STLExtras.h
index 3655a20d8831..d4360fa8d218 100644
--- a/include/llvm/ADT/STLExtras.h
+++ b/include/llvm/ADT/STLExtras.h
@@ -379,6 +379,13 @@ bool any_of(R &&Range, UnaryPredicate &&P) {
                      std::forward<UnaryPredicate>(P));
 }
 
+/// Provide wrappers to std::find which take ranges instead of having to pass
+/// begin/end explicitly.
+template<typename R, class T>
+auto find(R &&Range, const T &val) -> decltype(Range.begin()) {
+  return std::find(Range.begin(), Range.end(), val);
+}
+
 //===----------------------------------------------------------------------===//
 //     Extra additions to <memory>
 //===----------------------------------------------------------------------===//
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index 4c8b6e7de84e..d04028b15e2f 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -7964,8 +7964,7 @@ static bool IsMaxConsistingOf(const SCEV *MaybeMaxExpr,
   const MaxExprType *MaxExpr = dyn_cast<MaxExprType>(MaybeMaxExpr);
   if (!MaxExpr) return false;
 
-  auto It = std::find(MaxExpr->op_begin(), MaxExpr->op_end(), Candidate);
-  return It != MaxExpr->op_end();
+  return find(MaxExpr->operands(), Candidate) != MaxExpr->op_end();
 }
 
 

From eade7630a7b4cac1dbefbe3600458b4267a8ff3a Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi <geek4civic@gmail.com>
Date: Tue, 1 Dec 2015 10:07:37 +0000
Subject: [PATCH 065/186] Move llvm/test/DebugInfo/Generic/safestack-byval.ll
 to X86. It depends on x86-64.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254396 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/DebugInfo/{Generic => X86}/safestack-byval.ll | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename test/DebugInfo/{Generic => X86}/safestack-byval.ll (100%)

diff --git a/test/DebugInfo/Generic/safestack-byval.ll b/test/DebugInfo/X86/safestack-byval.ll
similarity index 100%
rename from test/DebugInfo/Generic/safestack-byval.ll
rename to test/DebugInfo/X86/safestack-byval.ll

From 175c9617cce54dd0b30b990863d066f52d73f93e Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi <geek4civic@gmail.com>
Date: Tue, 1 Dec 2015 10:07:41 +0000
Subject: [PATCH 066/186] llvm/test/DebugInfo/X86/safestack-byval.ll: Give an
 explicit triple for now. It crashes for targeting *-win32.

Also revert r254375 and r254361.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254397 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/DebugInfo/X86/safestack-byval.ll | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/test/DebugInfo/X86/safestack-byval.ll b/test/DebugInfo/X86/safestack-byval.ll
index 1329a95a2201..f1f6b6c1d911 100644
--- a/test/DebugInfo/X86/safestack-byval.ll
+++ b/test/DebugInfo/X86/safestack-byval.ll
@@ -1,8 +1,7 @@
 ; Test dwarf codegen for DILocalVariable of a byval function argument that
 ; points to neither an argument nor an alloca. This kind of IR is generated by
 ; SafeStack for unsafe byval arguments.
-; RUN: llc -stop-after expand-isel-pseudos %s -o /dev/null | FileCheck %s
-; XFAIL: hexagon
+; RUN: llc -mtriple=x86_64-unknown-unknown -stop-after expand-isel-pseudos %s -o /dev/null | FileCheck %s
 
 ; This was built by compiling the following source with SafeStack and
 ; simplifying the result a little.
@@ -14,8 +13,6 @@
 ;   return zzz.a[len];
 ; }
 
-; REQUIRES: tls
-
 ; CHECK: ![[ZZZ:.*]] = !DILocalVariable(name: "zzz",
 ; CHECK: ![[ZZZ_EXPR:.*]] = !DIExpression(DW_OP_deref, DW_OP_minus, 400)
 ; CHECK: DBG_VALUE {{.*}} ![[ZZZ]], ![[ZZZ_EXPR]]

From f025ca33c1fee1854bfa4476a771d3fff1c8b0b9 Mon Sep 17 00:00:00 2001
From: Oliver Stannard <oliver.stannard@arm.com>
Date: Tue, 1 Dec 2015 10:23:06 +0000
Subject: [PATCH 067/186] [ARM] Add subtarget features for ARMv8.2-A

This adds subtarget features for ARMv8.2-A, which builds on (and
requires the features from) ARMv8.1-A. Most assembler-visible features
of ARMv8.2-A are system instructions, and are all required parts of the
architecture, so just depend on the HasV8_2aOps subtarget feature.
There is also one large, optional feature, which adds 16-bit floating
point versions of all existing floating-point instructions (VFP and
SIMD), this is represented by the FeatureFullFP16 subtarget feature.

Differential Revision: http://reviews.llvm.org/D15036



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254399 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARM.td           | 6 ++++++
 lib/Target/ARM/ARMInstrInfo.td  | 6 +++++-
 lib/Target/ARM/ARMSubtarget.cpp | 2 ++
 lib/Target/ARM/ARMSubtarget.h   | 9 +++++++--
 4 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index ceb48d83cd84..57d5429e0aab 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -62,6 +62,9 @@ def FeatureVFP4   : SubtargetFeature<"vfp4", "HasVFPv4", "true",
 def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8",
                                    "true", "Enable ARMv8 FP",
                                    [FeatureVFP4]>;
+def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true",
+                                       "Enable full half-precision floating point",
+                                       [FeatureFPARMv8]>;
 def FeatureD16    : SubtargetFeature<"d16", "HasD16", "true",
                                      "Restrict FP to 16 double registers">;
 def FeatureHWDiv  : SubtargetFeature<"hwdiv", "HasHardwareDivide", "true",
@@ -212,6 +215,9 @@ def HasV8Ops    : SubtargetFeature<"v8", "HasV8Ops", "true",
 def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true",
                                    "Support ARM v8.1a instructions",
                                    [HasV8Ops]>;
+def HasV8_2aOps   : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
+                                   "Support ARM v8.2a instructions",
+                                   [HasV8_1aOps]>;
 
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 238dc338d141..4c7107aee6a2 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -215,6 +215,8 @@ def PreV8            : Predicate<"!Subtarget->hasV8Ops()">,
                                  AssemblerPredicate<"!HasV8Ops", "armv7 or earlier">;
 def HasV8_1a         : Predicate<"Subtarget->hasV8_1aOps()">,
                                  AssemblerPredicate<"HasV8_1aOps", "armv8.1a">;
+def HasV8_2a         : Predicate<"Subtarget->hasV8_2aOps()">,
+                                 AssemblerPredicate<"HasV8_2aOps", "armv8.2a">;
 def NoVFP            : Predicate<"!Subtarget->hasVFP2()">;
 def HasVFP2          : Predicate<"Subtarget->hasVFP2()">,
                                  AssemblerPredicate<"FeatureVFP2", "VFP2">;
@@ -234,7 +236,9 @@ def HasCrypto        : Predicate<"Subtarget->hasCrypto()">,
 def HasCRC           : Predicate<"Subtarget->hasCRC()">,
                                  AssemblerPredicate<"FeatureCRC", "crc">;
 def HasFP16          : Predicate<"Subtarget->hasFP16()">,
-                                 AssemblerPredicate<"FeatureFP16","half-float">;
+                                 AssemblerPredicate<"FeatureFP16","half-float conversions">;
+def HasFullFP16      : Predicate<"Subtarget->hasFullFP16()">,
+                                 AssemblerPredicate<"FeatureFullFP16","full half-float">;
 def HasDivide        : Predicate<"Subtarget->hasDivide()">,
                                  AssemblerPredicate<"FeatureHWDiv", "divide in THUMB">;
 def HasDivideInARM   : Predicate<"Subtarget->hasDivideInARMMode()">,
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index 9e3cd36d49ef..bb6ae28065bd 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -112,6 +112,7 @@ void ARMSubtarget::initializeEnvironment() {
   HasV7Ops = false;
   HasV8Ops = false;
   HasV8_1aOps = false;
+  HasV8_2aOps = false;
   HasVFPv2 = false;
   HasVFPv3 = false;
   HasVFPv4 = false;
@@ -130,6 +131,7 @@ void ARMSubtarget::initializeEnvironment() {
   NoMovt = false;
   SupportsTailCall = false;
   HasFP16 = false;
+  HasFullFP16 = false;
   HasD16 = false;
   HasHardwareDivide = false;
   HasHardwareDivideInARM = false;
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index c194149e8452..3addd4175a04 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -77,6 +77,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
   bool HasV7Ops;
   bool HasV8Ops;
   bool HasV8_1aOps;
+  bool HasV8_2aOps;
 
   /// HasVFPv2, HasVFPv3, HasVFPv4, HasFPARMv8, HasNEON - Specify what
   /// floating point ISAs are supported.
@@ -130,10 +131,12 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
   /// Thumb.
   bool SupportsTailCall;
 
-  /// HasFP16 - True if subtarget supports half-precision FP (We support VFP+HF
-  /// only so far)
+  /// HasFP16 - True if subtarget supports half-precision FP conversions
   bool HasFP16;
 
+  /// HasFullFP16 - True if subtarget supports half-precision FP operations
+  bool HasFullFP16;
+
   /// HasD16 - True if subtarget is limited to 16 double precision
   /// FP registers for VFPv3.
   bool HasD16;
@@ -309,6 +312,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
   bool hasV7Ops()   const { return HasV7Ops;  }
   bool hasV8Ops()   const { return HasV8Ops;  }
   bool hasV8_1aOps() const { return HasV8_1aOps; }
+  bool hasV8_2aOps() const { return HasV8_2aOps; }
 
   bool isCortexA5() const { return ARMProcFamily == CortexA5; }
   bool isCortexA7() const { return ARMProcFamily == CortexA7; }
@@ -362,6 +366,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
 
   bool hasFP16() const { return HasFP16; }
   bool hasD16() const { return HasD16; }
+  bool hasFullFP16() const { return HasFullFP16; }
 
   const Triple &getTargetTriple() const { return TargetTriple; }
 

From 27fff2c5ff6be081149b7e72ff8e667cb423be7d Mon Sep 17 00:00:00 2001
From: Oliver Stannard <oliver.stannard@arm.com>
Date: Tue, 1 Dec 2015 10:33:56 +0000
Subject: [PATCH 068/186] [ARM] Add ARMv8.2-A to TargetParser

Add ARMv8.2-A to TargetParser, so that it can be used by the clang
command-line options and the .arch directive.

Most testing of this will be done in clang, checking that the
command-line options that this enables work.

Differential Revision: http://reviews.llvm.org/D15037



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254400 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ADT/Triple.h                     |  1 +
 include/llvm/Support/ARMTargetParser.def      |  4 ++
 include/llvm/Support/TargetParser.h           |  1 +
 lib/Support/TargetParser.cpp                  |  3 ++
 lib/Support/Triple.cpp                        |  2 +
 lib/Target/ARM/ARM.td                         | 12 +++++
 lib/Target/ARM/ARMSubtarget.h                 |  2 +-
 lib/Target/ARM/AsmParser/ARMAsmParser.cpp     |  1 +
 .../ARM/MCTargetDesc/ARMELFStreamer.cpp       |  1 +
 test/MC/ARM/directive-arch-armv8.2-a.s        | 46 +++++++++++++++++++
 10 files changed, 72 insertions(+), 1 deletion(-)
 create mode 100644 test/MC/ARM/directive-arch-armv8.2-a.s

diff --git a/include/llvm/ADT/Triple.h b/include/llvm/ADT/Triple.h
index e50cec1f5e80..e01db0a61fd5 100644
--- a/include/llvm/ADT/Triple.h
+++ b/include/llvm/ADT/Triple.h
@@ -93,6 +93,7 @@ class Triple {
   enum SubArchType {
     NoSubArch,
 
+    ARMSubArch_v8_2a,
     ARMSubArch_v8_1a,
     ARMSubArch_v8,
     ARMSubArch_v7,
diff --git a/include/llvm/Support/ARMTargetParser.def b/include/llvm/Support/ARMTargetParser.def
index f76ac8899359..bc007923b383 100644
--- a/include/llvm/Support/ARMTargetParser.def
+++ b/include/llvm/Support/ARMTargetParser.def
@@ -88,6 +88,9 @@ ARM_ARCH("armv8-a", AK_ARMV8A, "8-A", "v8", ARMBuildAttrs::CPUArch::v8,
 ARM_ARCH("armv8.1-a", AK_ARMV8_1A, "8.1-A", "v8.1a", ARMBuildAttrs::CPUArch::v8,
           FK_CRYPTO_NEON_FP_ARMV8, (AEK_SEC | AEK_MP | AEK_VIRT | AEK_HWDIVARM |
                              AEK_HWDIV | AEK_DSP | AEK_CRC))
+ARM_ARCH("armv8.2-a", AK_ARMV8_2A, "8.2-A", "v8.2a", ARMBuildAttrs::CPUArch::v8,
+          FK_CRYPTO_NEON_FP_ARMV8, (AEK_SEC | AEK_MP | AEK_VIRT | AEK_HWDIVARM |
+                             AEK_HWDIV | AEK_DSP | AEK_CRC))
 // Non-standard Arch names.
 ARM_ARCH("iwmmxt", AK_IWMMXT, "iwmmxt", "", ARMBuildAttrs::CPUArch::v5TE,
           FK_NONE, AEK_NONE)
@@ -115,6 +118,7 @@ ARM_ARCH_EXT_NAME("mp",       AEK_MP,       nullptr,  nullptr)
 ARM_ARCH_EXT_NAME("simd",     AEK_SIMD,     nullptr,  nullptr)
 ARM_ARCH_EXT_NAME("sec",      AEK_SEC,      nullptr,  nullptr)
 ARM_ARCH_EXT_NAME("virt",     AEK_VIRT,     nullptr,  nullptr)
+ARM_ARCH_EXT_NAME("fp16",     AEK_FP16,     "+fullfp16",  "-fullfp16")
 ARM_ARCH_EXT_NAME("os",       AEK_OS,       nullptr,  nullptr)
 ARM_ARCH_EXT_NAME("iwmmxt",   AEK_IWMMXT,   nullptr,  nullptr)
 ARM_ARCH_EXT_NAME("iwmmxt2",  AEK_IWMMXT2,  nullptr,  nullptr)
diff --git a/include/llvm/Support/TargetParser.h b/include/llvm/Support/TargetParser.h
index 6ca0281515e2..c21019d0c5b8 100644
--- a/include/llvm/Support/TargetParser.h
+++ b/include/llvm/Support/TargetParser.h
@@ -82,6 +82,7 @@ enum ArchExtKind : unsigned {
   AEK_SEC = 0x100,
   AEK_VIRT = 0x200,
   AEK_DSP = 0x400,
+  AEK_FP16 = 0x800,
   // Unsupported extensions.
   AEK_OS = 0x8000000,
   AEK_IWMMXT = 0x10000000,
diff --git a/lib/Support/TargetParser.cpp b/lib/Support/TargetParser.cpp
index 3aa55b3c8850..aa3a4235d794 100644
--- a/lib/Support/TargetParser.cpp
+++ b/lib/Support/TargetParser.cpp
@@ -410,6 +410,7 @@ static StringRef getArchSynonym(StringRef Arch) {
       .Case("v7em", "v7e-m")
       .Cases("v8", "v8a", "aarch64", "arm64", "v8-a")
       .Case("v8.1a", "v8.1-a")
+      .Case("v8.2a", "v8.2-a")
       .Default(Arch);
 }
 
@@ -554,6 +555,7 @@ unsigned llvm::ARM::parseArchProfile(StringRef Arch) {
   case ARM::AK_ARMV7K:
   case ARM::AK_ARMV8A:
   case ARM::AK_ARMV8_1A:
+  case ARM::AK_ARMV8_2A:
     return ARM::PK_A;
   }
   return ARM::PK_INVALID;
@@ -594,6 +596,7 @@ unsigned llvm::ARM::parseArchVersion(StringRef Arch) {
     return 7;
   case ARM::AK_ARMV8A:
   case ARM::AK_ARMV8_1A:
+  case ARM::AK_ARMV8_2A:
     return 8;
   }
   return 0;
diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp
index f1f2d26b4e70..ed91c209d545 100644
--- a/lib/Support/Triple.cpp
+++ b/lib/Support/Triple.cpp
@@ -519,6 +519,8 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) {
     return Triple::ARMSubArch_v8;
   case ARM::AK_ARMV8_1A:
     return Triple::ARMSubArch_v8_1a;
+  case ARM::AK_ARMV8_2A:
+    return Triple::ARMSubArch_v8_2a;
   default:
     return Triple::NoSubArch;
   }
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index 57d5429e0aab..a0fc5f68eb25 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -360,6 +360,18 @@ def ARMv81a   : Architecture<"armv8.1-a", "ARMv81a",  [HasV8_1aOps,
                                                        FeatureCrypto,
                                                        FeatureCRC]>;
 
+def ARMv82a   : Architecture<"armv8.2-a", "ARMv82a",  [HasV8_2aOps,
+                                                       FeatureAClass,
+                                                       FeatureDB,
+                                                       FeatureFPARMv8,
+                                                       FeatureNEON,
+                                                       FeatureDSP,
+                                                       FeatureTrustZone,
+                                                       FeatureMP,
+                                                       FeatureVirtualization,
+                                                       FeatureCrypto,
+                                                       FeatureCRC]>;
+
 // Aliases
 def IWMMXT   : Architecture<"iwmmxt",      "ARMv5te",  [ARMv5te]>;
 def IWMMXT2  : Architecture<"iwmmxt2",     "ARMv5te",  [ARMv5te]>;
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index 3addd4175a04..3ad35d24ebab 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -52,7 +52,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
   enum ARMArchEnum {
     ARMv2, ARMv2a, ARMv3, ARMv3m, ARMv4, ARMv4t, ARMv5, ARMv5t, ARMv5te,
     ARMv5tej, ARMv6, ARMv6k, ARMv6kz, ARMv6t2, ARMv6m, ARMv6sm, ARMv7a, ARMv7r,
-    ARMv7m, ARMv7em, ARMv8a, ARMv81a
+    ARMv7m, ARMv7em, ARMv8a, ARMv81a, ARMv82a
   };
 
   /// ARMProcFamily - ARM processor family: Cortex-A8, Cortex-A9, and others.
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index ba144458386d..8341fbc4efd1 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -9930,6 +9930,7 @@ static const struct {
   { ARM::AEK_SEC, Feature_HasV6K, {ARM::FeatureTrustZone} },
   // FIXME: Only available in A-class, isel not predicated
   { ARM::AEK_VIRT, Feature_HasV7, {ARM::FeatureVirtualization} },
+  { ARM::AEK_FP16, Feature_HasV8_2a, {ARM::FeatureFPARMv8, ARM::FeatureFullFP16} },
   // FIXME: Unsupported extensions.
   { ARM::AEK_OS, Feature_None, {} },
   { ARM::AEK_IWMMXT, Feature_None, {} },
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index 42591c25d6e6..f316ad17576a 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -749,6 +749,7 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() {
 
   case ARM::AK_ARMV8A:
   case ARM::AK_ARMV8_1A:
+  case ARM::AK_ARMV8_2A:
     setAttributeItem(CPU_arch_profile, ApplicationProfile, false);
     setAttributeItem(ARM_ISA_use, Allowed, false);
     setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
diff --git a/test/MC/ARM/directive-arch-armv8.2-a.s b/test/MC/ARM/directive-arch-armv8.2-a.s
new file mode 100644
index 000000000000..c9f4469fb0ae
--- /dev/null
+++ b/test/MC/ARM/directive-arch-armv8.2-a.s
@@ -0,0 +1,46 @@
+@ Test the .arch directive for armv8.2-a
+
+@ This test case will check the default .ARM.attributes value for the
+@ armv8-a architecture.
+
+@ RUN: llvm-mc -triple arm-eabi -filetype asm %s \
+@ RUN:   | FileCheck %s -check-prefix CHECK-ASM
+@ RUN: llvm-mc -triple arm-eabi -filetype obj %s \
+@ RUN:   | llvm-readobj -arm-attributes | FileCheck %s -check-prefix CHECK-ATTR
+
+	.syntax	unified
+	.arch	armv8.2-a
+
+@ CHECK-ASM: 	.arch	armv8.2-a
+
+@ CHECK-ATTR: FileAttributes {
+@ CHECK-ATTR:   Attribute {
+@ CHECK-ATTR:     TagName: CPU_name
+@ CHECK-ATTR:     Value: 8.2-A
+@ CHECK-ATTR:   }
+@ CHECK-ATTR:   Attribute {
+@ CHECK-ATTR:     TagName: CPU_arch
+@ CHECK-ATTR:     Description: ARM v8
+@ CHECK-ATTR:   }
+@ CHECK-ATTR:   Attribute {
+@ CHECK-ATTR:     TagName: CPU_arch_profile
+@ CHECK-ATTR:     Description: Application
+@ CHECK-ATTR:   }
+@ CHECK-ATTR:   Attribute {
+@ CHECK-ATTR:     TagName: ARM_ISA_use
+@ CHECK-ATTR:     Description: Permitted
+@ CHECK-ATTR:   }
+@ CHECK-ATTR:   Attribute {
+@ CHECK-ATTR:     TagName: THUMB_ISA_use
+@ CHECK-ATTR:     Description: Thumb-2
+@ CHECK-ATTR:   }
+@ CHECK-ATTR:   Attribute {
+@ CHECK-ATTR:     TagName: MPextension_use
+@ CHECK-ATTR:     Description: Permitted
+@ CHECK-ATTR:   }
+@ CHECK-ATTR:   Attribute {
+@ CHECK-ATTR:     TagName: Virtualization_use
+@ CHECK-ATTR:     Description: TrustZone + Virtualization Extensions
+@ CHECK-ATTR:   }
+@ CHECK-ATTR: }
+

From ce8e2a0d91724b07d3a262c6005e5705a3b7837e Mon Sep 17 00:00:00 2001
From: Oliver Stannard <oliver.stannard@arm.com>
Date: Tue, 1 Dec 2015 10:48:51 +0000
Subject: [PATCH 069/186] [AArch64] Add ARMv8.2-A Statistical Profiling
 Extension

The Statistical Profiling Extension is an optional extension to
ARMv8.2-A. Since it is an optional extension, I have added the
FeatureSPE subtarget feature to control it. The assembler-visible parts
of this extension are the new "psb csync" instruction, which is
equivalent to "hint #17", and a number of system registers.

Differential Revision: http://reviews.llvm.org/D15021



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254401 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64.td                 |  3 +
 lib/Target/AArch64/AArch64InstrFormats.td     | 19 ++++
 lib/Target/AArch64/AArch64InstrInfo.td        |  5 +
 lib/Target/AArch64/AArch64Subtarget.h         |  2 +
 .../AArch64/AsmParser/AArch64AsmParser.cpp    | 73 ++++++++++++++-
 .../InstPrinter/AArch64InstPrinter.cpp        | 13 +++
 .../AArch64/InstPrinter/AArch64InstPrinter.h  |  3 +
 lib/Target/AArch64/Utils/AArch64BaseInfo.cpp  | 23 +++++
 lib/Target/AArch64/Utils/AArch64BaseInfo.h    | 30 ++++++
 .../AArch64/armv8.2a-statistical-profiling.s  | 87 ++++++++++++++++++
 .../armv8.2a-statistical-profiling.txt        | 91 +++++++++++++++++++
 utils/TableGen/AsmWriterEmitter.cpp           | 12 ++-
 12 files changed, 355 insertions(+), 6 deletions(-)
 create mode 100644 test/MC/AArch64/armv8.2a-statistical-profiling.s
 create mode 100644 test/MC/Disassembler/AArch64/armv8.2a-statistical-profiling.txt

diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index e82cdd00ba1e..5c19b3efdb11 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -38,6 +38,9 @@ def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true",
 def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true",
   "Full FP16", [FeatureFPARMv8]>;
 
+def FeatureSPE : SubtargetFeature<"spe", "HasSPE", "true",
+  "Enable Statistical Profiling extension">;
+
 /// Cyclone has register move instructions which are "free".
 def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
                                         "Has zero-cycle register moves">;
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index 752a153c0574..5eef82153e39 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -911,6 +911,25 @@ def msr_sysreg_op : Operand<i32> {
   let PrintMethod = "printMSRSystemRegister";
 }
 
+def PSBHintOperand : AsmOperandClass {
+  let Name = "PSBHint";
+  let ParserMethod = "tryParsePSBHint";
+}
+def psbhint_op : Operand<i32> {
+  let ParserMatchClass = PSBHintOperand;
+  let PrintMethod = "printPSBHintOp";
+  let MCOperandPredicate = [{
+    // Check, if operand is valid, to fix exhaustive aliasing in disassembly.
+    // "psb" is an alias to "hint" only for certain values of CRm:Op2 fields.
+    if (!MCOp.isImm())
+      return false;
+    bool ValidNamed;
+    (void)AArch64PSBHint::PSBHintMapper().toString(MCOp.getImm(),
+      STI.getFeatureBits(), ValidNamed);
+    return ValidNamed;
+  }];
+}
+
 class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg),
                        "mrs", "\t$Rt, $systemreg"> {
   bits<16> systemreg;
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 0c43003975c5..881f55ebeef9 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -29,6 +29,8 @@ def HasCRC           : Predicate<"Subtarget->hasCRC()">,
 def HasPerfMon       : Predicate<"Subtarget->hasPerfMon()">;
 def HasFullFP16      : Predicate<"Subtarget->hasFullFP16()">,
                                  AssemblerPredicate<"FeatureFullFP16", "fullfp16">;
+def HasSPE           : Predicate<"Subtarget->hasSPE()">,
+                                 AssemblerPredicate<"FeatureSPE", "spe">;
 
 def IsLE             : Predicate<"Subtarget->isLittleEndian()">;
 def IsBE             : Predicate<"!Subtarget->isLittleEndian()">;
@@ -382,6 +384,9 @@ def : InstAlias<"wfi",  (HINT 0b011)>;
 def : InstAlias<"sev",  (HINT 0b100)>;
 def : InstAlias<"sevl", (HINT 0b101)>;
 
+// v8.2a Statistical Profiling extension
+def : InstAlias<"psb $op",  (HINT psbhint_op:$op)>, Requires<[HasSPE]>;
+
 // As far as LLVM is concerned this writes to the system's exclusive monitors.
 let mayLoad = 1, mayStore = 1 in
 def CLREX : CRmSystemI<imm0_15, 0b010, "clrex">;
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index 9aa6ef9ab670..73daf6051b70 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -47,6 +47,7 @@ class AArch64Subtarget : public AArch64GenSubtargetInfo {
   bool HasCRC;
   bool HasPerfMon;
   bool HasFullFP16;
+  bool HasSPE;
 
   // HasZeroCycleRegMove - Has zero-cycle register mov instructions.
   bool HasZeroCycleRegMove;
@@ -124,6 +125,7 @@ class AArch64Subtarget : public AArch64GenSubtargetInfo {
 
   bool hasPerfMon() const { return HasPerfMon; }
   bool hasFullFP16() const { return HasFullFP16; }
+  bool hasSPE() const { return HasSPE; }
 
   bool isLittleEndian() const { return IsLittle; }
 
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 165843fc84c9..f0ad855ed5e6 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -100,6 +100,7 @@ class AArch64AsmParser : public MCTargetAsmParser {
   OperandMatchResultTy tryParseSysReg(OperandVector &Operands);
   OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands);
   OperandMatchResultTy tryParsePrefetch(OperandVector &Operands);
+  OperandMatchResultTy tryParsePSBHint(OperandVector &Operands);
   OperandMatchResultTy tryParseAdrpLabel(OperandVector &Operands);
   OperandMatchResultTy tryParseAdrLabel(OperandVector &Operands);
   OperandMatchResultTy tryParseFPImm(OperandVector &Operands);
@@ -159,7 +160,8 @@ class AArch64Operand : public MCParsedAsmOperand {
     k_Prefetch,
     k_ShiftExtend,
     k_FPImm,
-    k_Barrier
+    k_Barrier,
+    k_PSBHint,
   } Kind;
 
   SMLoc StartLoc, EndLoc;
@@ -227,6 +229,12 @@ class AArch64Operand : public MCParsedAsmOperand {
     unsigned Length;
   };
 
+  struct PSBHintOp {
+    unsigned Val;
+    const char *Data;
+    unsigned Length;
+  };
+
   struct ShiftExtendOp {
     AArch64_AM::ShiftExtendType Type;
     unsigned Amount;
@@ -250,6 +258,7 @@ class AArch64Operand : public MCParsedAsmOperand {
     struct SysRegOp SysReg;
     struct SysCRImmOp SysCRImm;
     struct PrefetchOp Prefetch;
+    struct PSBHintOp PSBHint;
     struct ShiftExtendOp ShiftExtend;
   };
 
@@ -301,6 +310,9 @@ class AArch64Operand : public MCParsedAsmOperand {
     case k_Prefetch:
       Prefetch = o.Prefetch;
       break;
+    case k_PSBHint:
+      PSBHint = o.PSBHint;
+      break;
     case k_ShiftExtend:
       ShiftExtend = o.ShiftExtend;
       break;
@@ -392,6 +404,16 @@ class AArch64Operand : public MCParsedAsmOperand {
     return Prefetch.Val;
   }
 
+  unsigned getPSBHint() const {
+    assert(Kind == k_PSBHint && "Invalid access!");
+    return PSBHint.Val;
+  }
+
+  StringRef getPSBHintName() const {
+    assert(Kind == k_PSBHint && "Invalid access!");
+    return StringRef(PSBHint.Data, PSBHint.Length);
+  }
+
   StringRef getPrefetchName() const {
     assert(Kind == k_Prefetch && "Invalid access!");
     return StringRef(Prefetch.Data, Prefetch.Length);
@@ -961,6 +983,7 @@ class AArch64Operand : public MCParsedAsmOperand {
   }
   bool isSysCR() const { return Kind == k_SysCR; }
   bool isPrefetch() const { return Kind == k_Prefetch; }
+  bool isPSBHint() const { return Kind == k_PSBHint; }
   bool isShiftExtend() const { return Kind == k_ShiftExtend; }
   bool isShifter() const {
     if (!isShiftExtend())
@@ -1534,6 +1557,11 @@ class AArch64Operand : public MCParsedAsmOperand {
     Inst.addOperand(MCOperand::createImm(getPrefetch()));
   }
 
+  void addPSBHintOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createImm(getPSBHint()));
+  }
+
   void addShifterOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
     unsigned Imm =
@@ -1730,6 +1758,19 @@ class AArch64Operand : public MCParsedAsmOperand {
     return Op;
   }
 
+  static std::unique_ptr<AArch64Operand> CreatePSBHint(unsigned Val,
+                                                       StringRef Str,
+                                                       SMLoc S,
+                                                       MCContext &Ctx) {
+    auto Op = make_unique<AArch64Operand>(k_PSBHint, Ctx);
+    Op->PSBHint.Val = Val;
+    Op->PSBHint.Data = Str.data();
+    Op->PSBHint.Length = Str.size();
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
   static std::unique_ptr<AArch64Operand>
   CreateShiftExtend(AArch64_AM::ShiftExtendType ShOp, unsigned Val,
                     bool HasExplicitAmount, SMLoc S, SMLoc E, MCContext &Ctx) {
@@ -1803,6 +1844,10 @@ void AArch64Operand::print(raw_ostream &OS) const {
       OS << "<prfop invalid #" << getPrefetch() << ">";
     break;
   }
+  case k_PSBHint: {
+    OS << getPSBHintName();
+    break;
+  }
   case k_ShiftExtend: {
     OS << "<" << AArch64_AM::getShiftExtendName(getShiftExtendType()) << " #"
        << getShiftExtendAmount();
@@ -2069,6 +2114,32 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
   return MatchOperand_Success;
 }
 
+/// tryParsePSBHint - Try to parse a PSB operand, mapped to Hint command
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::tryParsePSBHint(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier)) {
+    TokError("invalid operand for instruction");
+    return MatchOperand_ParseFail;
+  }
+
+  bool Valid;
+  auto Mapper = AArch64PSBHint::PSBHintMapper();
+  unsigned psbhint =
+      Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid);
+  if (!Valid) {
+    TokError("invalid operand for instruction");
+    return MatchOperand_ParseFail;
+  }
+
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(AArch64Operand::CreatePSBHint(psbhint, Tok.getString(),
+                                                   S, getContext()));
+  return MatchOperand_Success;
+}
+
 /// tryParseAdrpLabel - Parse and validate a source label for the ADRP
 /// instruction.
 AArch64AsmParser::OperandMatchResultTy
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
index d8937b57e490..480ed0d263ac 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -1144,6 +1144,19 @@ void AArch64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum,
     O << '#' << prfop;
 }
 
+void AArch64InstPrinter::printPSBHintOp(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  unsigned psbhintop = MI->getOperand(OpNum).getImm();
+  bool Valid;
+  StringRef Name =
+      AArch64PSBHint::PSBHintMapper().toString(psbhintop, STI.getFeatureBits(), Valid);
+  if (Valid)
+    O << Name;
+  else
+    O << '#' << psbhintop;
+}
+
 void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum,
                                            const MCSubtargetInfo &STI,
                                            raw_ostream &O) {
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
index a94721816d33..a767aa451c6a 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
@@ -123,6 +123,9 @@ class AArch64InstPrinter : public MCInstPrinter {
   void printPrefetchOp(const MCInst *MI, unsigned OpNum,
                        const MCSubtargetInfo &STI, raw_ostream &O);
 
+  void printPSBHintOp(const MCInst *MI, unsigned OpNum,
+                      const MCSubtargetInfo &STI, raw_ostream &O);
+
   void printFPImmOperand(const MCInst *MI, unsigned OpNum,
                          const MCSubtargetInfo &STI, raw_ostream &O);
 
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
index f657eaab8151..78f5289ec26d 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
@@ -154,6 +154,14 @@ const AArch64NamedImmMapper::Mapping AArch64PState::PStateMapper::PStateMappings
 AArch64PState::PStateMapper::PStateMapper()
   : AArch64NamedImmMapper(PStateMappings, 0) {}
 
+const AArch64NamedImmMapper::Mapping AArch64PSBHint::PSBHintMapper::PSBHintMappings[] = {
+  // v8.2a "Statistical Profiling" extension-specific PSB operand
+  {"csync", CSync, {AArch64::FeatureSPE}},
+};
+
+AArch64PSBHint::PSBHintMapper::PSBHintMapper()
+  : AArch64NamedImmMapper(PSBHintMappings, 0) {}
+
 const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSMappings[] = {
   {"mdccsr_el0", MDCCSR_EL0, {}},
   {"dbgdtrrx_el0", DBGDTRRX_EL0, {}},
@@ -808,6 +816,21 @@ const AArch64NamedImmMapper::Mapping AArch64SysReg::SysRegMapper::SysRegMappings
 
   // v8.2a registers
   {"uao",           UAO,           {AArch64::HasV8_2aOps}},
+
+  // v8.2a "Statistical Profiling extension" registers
+  {"pmblimitr_el1", PMBLIMITR_EL1, {AArch64::FeatureSPE}},
+  {"pmbptr_el1",    PMBPTR_EL1,    {AArch64::FeatureSPE}},
+  {"pmbsr_el1",     PMBSR_EL1,     {AArch64::FeatureSPE}},
+  {"pmbidr_el1",    PMBIDR_EL1,    {AArch64::FeatureSPE}},
+  {"pmscr_el2",     PMSCR_EL2,     {AArch64::FeatureSPE}},
+  {"pmscr_el12",    PMSCR_EL12,    {AArch64::FeatureSPE}},
+  {"pmscr_el1",     PMSCR_EL1,     {AArch64::FeatureSPE}},
+  {"pmsicr_el1",    PMSICR_EL1,    {AArch64::FeatureSPE}},
+  {"pmsirr_el1",    PMSIRR_EL1,    {AArch64::FeatureSPE}},
+  {"pmsfcr_el1",    PMSFCR_EL1,    {AArch64::FeatureSPE}},
+  {"pmsevfr_el1",   PMSEVFR_EL1,   {AArch64::FeatureSPE}},
+  {"pmslatfr_el1",  PMSLATFR_EL1,  {AArch64::FeatureSPE}},
+  {"pmsidr_el1",    PMSIDR_EL1,    {AArch64::FeatureSPE}},
 };
 
 uint32_t
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index 5a6b54bbee83..f649cb9b8a8d 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -478,6 +478,21 @@ namespace AArch64PState {
 
 }
 
+namespace AArch64PSBHint {
+  enum PSBHintValues {
+    Invalid = -1,
+    // v8.2a "Statistical Profiling" extension-specific PSB operands
+    CSync = 0x11,  // psb csync = hint #0x11
+  };
+
+  struct PSBHintMapper : AArch64NamedImmMapper {
+    const static Mapping PSBHintMappings[];
+
+    PSBHintMapper();
+  };
+
+}
+
 namespace AArch64SE {
     enum ShiftExtSpecifiers {
         Invalid = -1,
@@ -1199,6 +1214,21 @@ namespace AArch64SysReg {
     // v8.2a registers
     UAO               = 0xc214, // 11  000  0100  0010  100
 
+    // v8.2a "Statistical Profiling extension" registers
+    PMBLIMITR_EL1     = 0xc4d0, // 11  000  1001  1010  000
+    PMBPTR_EL1        = 0xc4d1, // 11  000  1001  1010  001
+    PMBSR_EL1         = 0xc4d3, // 11  000  1001  1010  011
+    PMBIDR_EL1        = 0xc4d7, // 11  000  1001  1010  111
+    PMSCR_EL2         = 0xe4c8, // 11  100  1001  1001  000
+    PMSCR_EL12        = 0xecc8, // 11  101  1001  1001  000
+    PMSCR_EL1         = 0xc4c8, // 11  000  1001  1001  000
+    PMSICR_EL1        = 0xc4ca, // 11  000  1001  1001  010
+    PMSIRR_EL1        = 0xc4cb, // 11  000  1001  1001  011
+    PMSFCR_EL1        = 0xc4cc, // 11  000  1001  1001  100
+    PMSEVFR_EL1       = 0xc4cd, // 11  000  1001  1001  101
+    PMSLATFR_EL1      = 0xc4ce, // 11  000  1001  1001  110
+    PMSIDR_EL1        = 0xc4cf, // 11  000  1001  1001  111
+
     // Cyclone specific system registers
     CPM_IOACC_CTL_EL3 = 0xff90,
   };
diff --git a/test/MC/AArch64/armv8.2a-statistical-profiling.s b/test/MC/AArch64/armv8.2a-statistical-profiling.s
new file mode 100644
index 000000000000..5cb109318786
--- /dev/null
+++ b/test/MC/AArch64/armv8.2a-statistical-profiling.s
@@ -0,0 +1,87 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+spe < %s | FileCheck %s
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu -show-encoding < %s 2>&1 | FileCheck --check-prefix=NO_SPE %s
+
+  psb csync
+// CHECK: psb csync              // encoding: [0x3f,0x22,0x03,0xd5]
+// NO_SPE:  invalid operand for instruction
+
+  msr pmblimitr_el1, x0
+  msr pmbptr_el1, x0
+  msr pmbsr_el1, x0
+  msr pmbidr_el1, x0
+  msr pmscr_el2, x0
+  msr pmscr_el12, x0
+  msr pmscr_el1, x0
+  msr pmsicr_el1, x0
+  msr pmsirr_el1, x0
+  msr pmsfcr_el1, x0
+  msr pmsevfr_el1, x0
+  msr pmslatfr_el1, x0
+  msr pmsidr_el1, x0
+// CHECK:     msr PMBLIMITR_EL1, x0       // encoding: [0x00,0x9a,0x18,0xd5]
+// CHECK:     msr PMBPTR_EL1, x0          // encoding: [0x20,0x9a,0x18,0xd5]
+// CHECK:     msr PMBSR_EL1, x0           // encoding: [0x60,0x9a,0x18,0xd5]
+// CHECK:     msr PMBIDR_EL1, x0          // encoding: [0xe0,0x9a,0x18,0xd5]
+// CHECK:     msr PMSCR_EL2, x0           // encoding: [0x00,0x99,0x1c,0xd5]
+// CHECK:     msr PMSCR_EL12, x0          // encoding: [0x00,0x99,0x1d,0xd5]
+// CHECK:     msr PMSCR_EL1, x0           // encoding: [0x00,0x99,0x18,0xd5]
+// CHECK:     msr PMSICR_EL1, x0          // encoding: [0x40,0x99,0x18,0xd5]
+// CHECK:     msr PMSIRR_EL1, x0          // encoding: [0x60,0x99,0x18,0xd5]
+// CHECK:     msr PMSFCR_EL1, x0          // encoding: [0x80,0x99,0x18,0xd5]
+// CHECK:     msr PMSEVFR_EL1, x0         // encoding: [0xa0,0x99,0x18,0xd5]
+// CHECK:     msr PMSLATFR_EL1, x0        // encoding: [0xc0,0x99,0x18,0xd5]
+// CHECK:     msr PMSIDR_EL1, x0          // encoding: [0xe0,0x99,0x18,0xd5]
+// NO_SPE: error: expected writable system register or pstate
+// NO_SPE: error: expected writable system register or pstate
+// NO_SPE: error: expected writable system register or pstate
+// NO_SPE: error: expected writable system register or pstate
+// NO_SPE: error: expected writable system register or pstate
+// NO_SPE: error: expected writable system register or pstate
+// NO_SPE: error: expected writable system register or pstate
+// NO_SPE: error: expected writable system register or pstate
+// NO_SPE: error: expected writable system register or pstate
+// NO_SPE: error: expected writable system register or pstate
+// NO_SPE: error: expected writable system register or pstate
+// NO_SPE: error: expected writable system register or pstate
+// NO_SPE: error: expected writable system register or pstate
+
+mrs x0, pmblimitr_el1
+  mrs x0, pmbptr_el1
+  mrs x0, pmbsr_el1
+  mrs x0, pmbidr_el1
+  mrs x0, pmscr_el2
+  mrs x0, pmscr_el12
+  mrs x0, pmscr_el1
+  mrs x0, pmsicr_el1
+  mrs x0, pmsirr_el1
+  mrs x0, pmsfcr_el1
+  mrs x0, pmsevfr_el1
+  mrs x0, pmslatfr_el1
+  mrs x0, pmsidr_el1
+
+// CHECK:    mrs x0, PMBLIMITR_EL1       // encoding: [0x00,0x9a,0x38,0xd5]
+// CHECK:    mrs x0, PMBPTR_EL1          // encoding: [0x20,0x9a,0x38,0xd5]
+// CHECK:    mrs x0, PMBSR_EL1           // encoding: [0x60,0x9a,0x38,0xd5]
+// CHECK:    mrs x0, PMBIDR_EL1          // encoding: [0xe0,0x9a,0x38,0xd5]
+// CHECK:    mrs x0, PMSCR_EL2           // encoding: [0x00,0x99,0x3c,0xd5]
+// CHECK:    mrs x0, PMSCR_EL12          // encoding: [0x00,0x99,0x3d,0xd5]
+// CHECK:    mrs x0, PMSCR_EL1           // encoding: [0x00,0x99,0x38,0xd5]
+// CHECK:    mrs x0, PMSICR_EL1          // encoding: [0x40,0x99,0x38,0xd5]
+// CHECK:    mrs x0, PMSIRR_EL1          // encoding: [0x60,0x99,0x38,0xd5]
+// CHECK:    mrs x0, PMSFCR_EL1          // encoding: [0x80,0x99,0x38,0xd5]
+// CHECK:    mrs x0, PMSEVFR_EL1         // encoding: [0xa0,0x99,0x38,0xd5]
+// CHECK:    mrs x0, PMSLATFR_EL1        // encoding: [0xc0,0x99,0x38,0xd5]
+// CHECK:    mrs x0, PMSIDR_EL1          // encoding: [0xe0,0x99,0x38,0xd5]
+// NO_SPE: error: expected readable system register
+// NO_SPE: error: expected readable system register
+// NO_SPE: error: expected readable system register
+// NO_SPE: error: expected readable system register
+// NO_SPE: error: expected readable system register
+// NO_SPE: error: expected readable system register
+// NO_SPE: error: expected readable system register
+// NO_SPE: error: expected readable system register
+// NO_SPE: error: expected readable system register
+// NO_SPE: error: expected readable system register
+// NO_SPE: error: expected readable system register
+// NO_SPE: error: expected readable system register
+// NO_SPE: error: expected readable system register
diff --git a/test/MC/Disassembler/AArch64/armv8.2a-statistical-profiling.txt b/test/MC/Disassembler/AArch64/armv8.2a-statistical-profiling.txt
new file mode 100644
index 000000000000..e83d750e715e
--- /dev/null
+++ b/test/MC/Disassembler/AArch64/armv8.2a-statistical-profiling.txt
@@ -0,0 +1,91 @@
+# RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+spe --disassemble < %s | FileCheck %s
+# RUN: llvm-mc -triple aarch64-none-linux-gnu --disassemble < %s | FileCheck --check-prefix=NO_SPE %s
+
+[0x1f,0x22,0x03,0xd5]
+# CHECK: hint #0x10
+# NO_SPE: hint #0x10
+
+[0x3f,0x22,0x03,0xd5]
+# CHECK: psb csync
+# NO_SPE: hint #0x11
+
+[0x00,0x9a,0x18,0xd5]
+[0x20,0x9a,0x18,0xd5]
+[0x60,0x9a,0x18,0xd5]
+[0xe0,0x9a,0x18,0xd5]
+[0x00,0x99,0x1c,0xd5]
+[0x00,0x99,0x1d,0xd5]
+[0x00,0x99,0x18,0xd5]
+[0x40,0x99,0x18,0xd5]
+[0x60,0x99,0x18,0xd5]
+[0x80,0x99,0x18,0xd5]
+[0xa0,0x99,0x18,0xd5]
+[0xc0,0x99,0x18,0xd5]
+[0xe0,0x99,0x18,0xd5]
+# CHECK: msr PMBLIMITR_EL1, x0
+# NO_SPE: msr S3_0_C9_C10_0, x0
+# CHECK: msr PMBPTR_EL1, x0
+# NO_SPE: msr S3_0_C9_C10_1, x0
+# CHECK: msr PMBSR_EL1, x0
+# NO_SPE: msr S3_0_C9_C10_3, x0
+# CHECK: msr PMBIDR_EL1, x0
+# NO_SPE: msr S3_0_C9_C10_7, x0
+# CHECK: msr PMSCR_EL2, x0
+# NO_SPE: msr S3_4_C9_C9_0, x0
+# CHECK: msr PMSCR_EL12, x0
+# NO_SPE: msr S3_5_C9_C9_0, x0
+# CHECK: msr PMSCR_EL1, x0
+# NO_SPE: msr S3_0_C9_C9_0, x0
+# CHECK: msr PMSICR_EL1, x0
+# NO_SPE: msr S3_0_C9_C9_2, x0
+# CHECK: msr PMSIRR_EL1, x0
+# NO_SPE: msr S3_0_C9_C9_3, x0
+# CHECK: msr PMSFCR_EL1, x0
+# NO_SPE: msr S3_0_C9_C9_4, x0
+# CHECK: msr PMSEVFR_EL1, x0
+# NO_SPE: msr S3_0_C9_C9_5, x0
+# CHECK: msr PMSLATFR_EL1, x0
+# NO_SPE: msr S3_0_C9_C9_6, x0
+# CHECK: msr PMSIDR_EL1, x0
+# NO_SPE: msr S3_0_C9_C9_7, x0
+
+[0x00,0x9a,0x38,0xd5]
+[0x20,0x9a,0x38,0xd5]
+[0x60,0x9a,0x38,0xd5]
+[0xe0,0x9a,0x38,0xd5]
+[0x00,0x99,0x3c,0xd5]
+[0x00,0x99,0x3d,0xd5]
+[0x00,0x99,0x38,0xd5]
+[0x40,0x99,0x38,0xd5]
+[0x60,0x99,0x38,0xd5]
+[0x80,0x99,0x38,0xd5]
+[0xa0,0x99,0x38,0xd5]
+[0xc0,0x99,0x38,0xd5]
+[0xe0,0x99,0x38,0xd5]
+
+# CHECK: mrs x0, PMBLIMITR_EL1
+# NO_SPE: mrs x0, S3_0_C9_C10_0
+# CHECK: mrs x0, PMBPTR_EL1
+# NO_SPE: mrs x0, S3_0_C9_C10_1
+# CHECK: mrs x0, PMBSR_EL1
+# NO_SPE: mrs x0, S3_0_C9_C10_3
+# CHECK: mrs x0, PMBIDR_EL1
+# NO_SPE: mrs x0, S3_0_C9_C10_7
+# CHECK: mrs x0, PMSCR_EL2
+# NO_SPE: mrs x0, S3_4_C9_C9_0
+# CHECK: mrs x0, PMSCR_EL12
+# NO_SPE: mrs x0, S3_5_C9_C9_0
+# CHECK: mrs x0, PMSCR_EL1
+# NO_SPE: mrs x0, S3_0_C9_C9_0
+# CHECK: mrs x0, PMSICR_EL1
+# NO_SPE: mrs x0, S3_0_C9_C9_2
+# CHECK: mrs x0, PMSIRR_EL1
+# NO_SPE: mrs x0, S3_0_C9_C9_3
+# CHECK: mrs x0, PMSFCR_EL1
+# NO_SPE: mrs x0, S3_0_C9_C9_4
+# CHECK: mrs x0, PMSEVFR_EL1
+# NO_SPE: mrs x0, S3_0_C9_C9_5
+# CHECK: mrs x0, PMSLATFR_EL1
+# NO_SPE: mrs x0, S3_0_C9_C9_6
+# CHECK: mrs x0, PMSIDR_EL1
+# NO_SPE: mrs x0, S3_0_C9_C9_7
diff --git a/utils/TableGen/AsmWriterEmitter.cpp b/utils/TableGen/AsmWriterEmitter.cpp
index 4b543d3f9fc6..a954998d36e9 100644
--- a/utils/TableGen/AsmWriterEmitter.cpp
+++ b/utils/TableGen/AsmWriterEmitter.cpp
@@ -901,7 +901,7 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
                 break; // No conditions on this operand at all
             }
             Cond = Target.getName() + ClassName + "ValidateMCOperand(" +
-                   Op + ", " + llvm::utostr(Entry) + ")";
+                   Op + ", STI, " + llvm::utostr(Entry) + ")";
           }
           // for all subcases of ResultOperand::K_Record:
           IAP.addCond(Cond);
@@ -996,8 +996,9 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
 
   if (!MCOpPredicates.empty())
     O << "static bool " << Target.getName() << ClassName
-      << "ValidateMCOperand(\n"
-      << "       const MCOperand &MCOp, unsigned PredicateIndex);\n";
+      << "ValidateMCOperand(const MCOperand &MCOp,\n"
+      << "                  const MCSubtargetInfo &STI,\n"
+      << "                  unsigned PredicateIndex);\n";
 
   O << HeaderO.str();
   O.indent(2) << "const char *AsmString;\n";
@@ -1069,8 +1070,9 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
 
   if (!MCOpPredicates.empty()) {
     O << "static bool " << Target.getName() << ClassName
-      << "ValidateMCOperand(\n"
-      << "       const MCOperand &MCOp, unsigned PredicateIndex) {\n"
+      << "ValidateMCOperand(const MCOperand &MCOp,\n"
+      << "                  const MCSubtargetInfo &STI,\n"
+      << "                  unsigned PredicateIndex) {\n"      
       << "  switch (PredicateIndex) {\n"
       << "  default:\n"
       << "    llvm_unreachable(\"Unknown MCOperandPredicate kind\");\n"

From 1b91dc22354928116e2af525b10c86a1c645b8a0 Mon Sep 17 00:00:00 2001
From: Cong Hou <congh@google.com>
Date: Tue, 1 Dec 2015 11:05:39 +0000
Subject: [PATCH 070/186] Allow known and unknown probabilities coexist in
 MBB's successor list.

Previously it is not allowed for each MBB to have successors with both known and
unknown probabilities. However, this may be too strict as at this stage we could
not always guarantee that. It is better to remove this restriction now, and I
will work on validating MBB's successors' probabilities first (for example,
check if the sum is approximate one).




git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254402 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/MachineBasicBlock.cpp | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp
index c9c6a9d62462..de91f0db75a8 100644
--- a/lib/CodeGen/MachineBasicBlock.cpp
+++ b/lib/CodeGen/MachineBasicBlock.cpp
@@ -510,13 +510,8 @@ void MachineBasicBlock::addSuccessor(MachineBasicBlock *Succ,
                                      BranchProbability Prob) {
   // Probability list is either empty (if successor list isn't empty, this means
   // disabled optimization) or has the same size as successor list.
-  if (!(Probs.empty() && !Successors.empty())) {
-    assert((Probs.empty() || (Prob.isUnknown() && Probs.back().isUnknown()) ||
-            (!Prob.isUnknown() && !Probs.back().isUnknown())) &&
-           "Successors with both known and unknwon probabilities are not "
-           "allowed.");
+  if (!(Probs.empty() && !Successors.empty()))
     Probs.push_back(Prob);
-  }
   Successors.push_back(Succ);
   Succ->addPredecessor(this);
 }
@@ -1116,10 +1111,24 @@ MachineBasicBlock::findDebugLoc(instr_iterator MBBI) {
 /// Return probability of the edge from this block to MBB.
 BranchProbability
 MachineBasicBlock::getSuccProbability(const_succ_iterator Succ) const {
-  if (Probs.empty() || Probs.back().isUnknown())
+  if (Probs.empty())
     return BranchProbability(1, succ_size());
 
-  return *getProbabilityIterator(Succ);
+  const auto &Prob = *getProbabilityIterator(Succ);
+  if (Prob.isUnknown()) {
+    // For unknown probabilities, collect the sum of all known ones, and evenly
+    // ditribute the complemental of the sum to each unknown probability.
+    unsigned KnownProbNum = 0;
+    auto Sum = BranchProbability::getZero();
+    for (auto &P : Probs) {
+      if (!P.isUnknown()) {
+        Sum += P;
+        KnownProbNum++;
+      }
+    }
+    return Sum.getCompl() / (Probs.size() - KnownProbNum);
+  } else
+    return Prob;
 }
 
 /// Set successor probability of a given iterator.

From 2fdb3b844fe006f3b28030a5c2e99940daa74d28 Mon Sep 17 00:00:00 2001
From: Yury Gribov <y.gribov@samsung.com>
Date: Tue, 1 Dec 2015 11:40:55 +0000
Subject: [PATCH 071/186] Introduce new @llvm.get.dynamic.area.offset.i{32, 64}
 intrinsics.

The @llvm.get.dynamic.area.offset.* intrinsic family is used to get the offset
from native stack pointer to the address of the most recent dynamic alloca on
the caller's stack. These intrinsics are intendend for use in combination with
@llvm.stacksave and @llvm.restore to get a pointer to the most recent dynamic
alloca. This is useful, for example, for AddressSanitizer's stack unpoisoning
routines.

Patch by Max Ostapenko.

Differential Revision: http://reviews.llvm.org/D14983


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254404 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/LangRef.rst                              | 42 +++++++++++++++++++
 include/llvm/CodeGen/ISDOpcodes.h             |  6 +++
 include/llvm/IR/Intrinsics.td                 |  2 +
 lib/CodeGen/IntrinsicLowering.cpp             |  7 ++++
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp      |  8 ++++
 .../SelectionDAG/SelectionDAGBuilder.cpp      | 15 +++++++
 .../SelectionDAG/SelectionDAGDumper.cpp       |  1 +
 lib/CodeGen/TargetLoweringBase.cpp            |  3 ++
 lib/Target/PowerPC/PPCISelLowering.cpp        | 20 +++++++++
 lib/Target/PowerPC/PPCISelLowering.h          |  7 ++++
 lib/Target/PowerPC/PPCInstr64Bit.td           |  2 +
 lib/Target/PowerPC/PPCInstrInfo.td            |  4 ++
 lib/Target/PowerPC/PPCRegisterInfo.cpp        | 26 ++++++++++++
 lib/Target/PowerPC/PPCRegisterInfo.h          |  1 +
 test/CodeGen/PowerPC/dyn-alloca-offset.ll     | 21 ++++++++++
 15 files changed, 165 insertions(+)
 create mode 100644 test/CodeGen/PowerPC/dyn-alloca-offset.ll

diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index be8e63bf071f..36bfb5167795 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -9337,6 +9337,48 @@ Semantics:
 
 See the description for :ref:`llvm.stacksave <int_stacksave>`.
 
+.. _int_get_dynamic_area_offset:
+
+'``llvm.get.dynamic.area.offset``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.get.dynamic.area.offset.i32()
+      declare i64 @llvm.get.dynamic.area.offset.i64()
+
+      Overview:
+      """""""""
+
+      The '``llvm.get.dynamic.area.offset.*``' intrinsic family is used to
+      get the offset from native stack pointer to the address of the most
+      recent dynamic alloca on the caller's stack. These intrinsics are
+      intendend for use in combination with
+      :ref:`llvm.stacksave <int_stacksave>` to get a
+      pointer to the most recent dynamic alloca. This is useful, for example,
+      for AddressSanitizer's stack unpoisoning routines.
+
+Semantics:
+""""""""""
+
+      These intrinsics return a non-negative integer value that can be used to
+      get the address of the most recent dynamic alloca, allocated by :ref:`alloca <i_alloca>`
+      on the caller's stack. In particular, for targets where stack grows downwards,
+      adding this offset to the native stack pointer would get the address of the most
+      recent dynamic alloca. For targets where stack grows upwards, the situation is a bit more
+      complicated, because substracting this value from stack pointer would get the address
+      one past the end of the most recent dynamic alloca.
+
+      Although for most targets `llvm.get.dynamic.area.offset <int_get_dynamic_area_offset>`
+      returns just a zero, for others, such as PowerPC and PowerPC64, it returns a
+      compile-time-known constant value.
+
+      The return value type of :ref:`llvm.get.dynamic.area.offset <int_get_dynamic_area_offset>`
+      must match the target's generic address space's (address space 0) pointer type.
+
 '``llvm.prefetch``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h
index aaf08e14f57d..4be993a9fbbb 100644
--- a/include/llvm/CodeGen/ISDOpcodes.h
+++ b/include/llvm/CodeGen/ISDOpcodes.h
@@ -754,6 +754,12 @@ namespace ISD {
     GC_TRANSITION_START,
     GC_TRANSITION_END,
 
+    /// GET_DYNAMIC_AREA_OFFSET - get offset from native SP to the address of
+    /// the most recent dynamic alloca. For most targets that would be 0, but
+    /// for some others (e.g. PowerPC, PowerPC64) that would be compile-time
+    /// known nonzero constant. The only operand here is the chain.
+    GET_DYNAMIC_AREA_OFFSET,
+
     /// BUILTIN_OP_END - This must be the last enum value in this list.
     /// The target-specific pre-isel opcode values start here.
     BUILTIN_OP_END
diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td
index 917cf56e2e88..e838fb332de9 100644
--- a/include/llvm/IR/Intrinsics.td
+++ b/include/llvm/IR/Intrinsics.td
@@ -304,6 +304,8 @@ def int_stacksave     : Intrinsic<[llvm_ptr_ty]>,
 def int_stackrestore  : Intrinsic<[], [llvm_ptr_ty]>,
                         GCCBuiltin<"__builtin_stack_restore">;
 
+def int_get_dynamic_area_offset : Intrinsic<[llvm_anyint_ty]>;
+
 // IntrReadWriteArgMem is more pessimistic than strictly necessary for prefetch,
 // however it does conveniently prevent the prefetch from being reordered
 // with respect to nearby accesses to the same memory.
diff --git a/lib/CodeGen/IntrinsicLowering.cpp b/lib/CodeGen/IntrinsicLowering.cpp
index 5b895fff5c43..47a9f64e9080 100644
--- a/lib/CodeGen/IntrinsicLowering.cpp
+++ b/lib/CodeGen/IntrinsicLowering.cpp
@@ -424,6 +424,13 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) {
     break;
   }
     
+  case Intrinsic::get_dynamic_area_offset:
+    errs() << "WARNING: this target does not support the custom llvm.get."
+              "dynamic.area.offset.  It is being lowered to a constant 0\n";
+    // Just lower it to a constant 0 because for most targets
+    // @llvm.get.dynamic.area.offset is lowered to zero.
+    CI->replaceAllUsesWith(ConstantInt::get(CI->getType(), 0));
+    break;
   case Intrinsic::returnaddress:
   case Intrinsic::frameaddress:
     errs() << "WARNING: this target does not support the llvm."
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index c5810525f3c7..8238cdeb59ca 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1213,6 +1213,10 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
   case ISD::STACKSAVE:
     Action = TLI.getOperationAction(Node->getOpcode(), MVT::Other);
     break;
+  case ISD::GET_DYNAMIC_AREA_OFFSET:
+    Action = TLI.getOperationAction(Node->getOpcode(),
+                                    Node->getValueType(0));
+    break;
   case ISD::VAARG:
     Action = TLI.getOperationAction(Node->getOpcode(),
                                     Node->getValueType(0));
@@ -3295,6 +3299,10 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
       Results.push_back(Node->getOperand(0));
     }
     break;
+  case ISD::GET_DYNAMIC_AREA_OFFSET:
+    Results.push_back(DAG.getConstant(0, dl, Node->getValueType(0)));
+    Results.push_back(Results[0].getValue(0));
+    break;
   case ISD::FCOPYSIGN:
     Results.push_back(ExpandFCOPYSIGN(Node));
     break;
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index d880bcfbdf64..38b8bced2399 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4928,6 +4928,21 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     DAG.setRoot(DAG.getNode(ISD::STACKRESTORE, sdl, MVT::Other, getRoot(), Res));
     return nullptr;
   }
+  case Intrinsic::get_dynamic_area_offset: {
+    SDValue Op = getRoot();
+    EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout());
+    EVT ResTy = TLI.getValueType(DAG.getDataLayout(), I.getType());
+    // Result type for @llvm.get.dynamic.area.offset should match PtrTy for
+    // target.
+    if (PtrTy != ResTy)
+      report_fatal_error("Wrong result type for @llvm.get.dynamic.area.offset"
+                         " intrinsic!");
+    Res = DAG.getNode(ISD::GET_DYNAMIC_AREA_OFFSET, sdl, DAG.getVTList(ResTy),
+                      Op);
+    DAG.setRoot(Op);
+    setValue(&I, Res);
+    return nullptr;
+  }
   case Intrinsic::stackprotector: {
     // Emit code into the DAG to store the stack guard onto the stack.
     MachineFunction &MF = DAG.getMachineFunction();
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 195b48498605..a6f9699bb29c 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -310,6 +310,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::LIFETIME_END:               return "lifetime.end";
   case ISD::GC_TRANSITION_START:        return "gc_transition.start";
   case ISD::GC_TRANSITION_END:          return "gc_transition.end";
+  case ISD::GET_DYNAMIC_AREA_OFFSET:    return "get.dynamic.area.offset";
 
   // Bit manipulation
   case ISD::BITREVERSE:                 return "bitreverse";
diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp
index e348095aa8fc..69c130809bb8 100644
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@@ -840,6 +840,9 @@ void TargetLoweringBase::initActions() {
       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Expand);
       setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Expand);
     }
+
+    // For most targets @llvm.get.dynamic.area.offest just returns 0.
+    setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, VT, Expand);
   }
 
   // Most targets ignore the @llvm.prefetch intrinsic.
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 1b1e0cf57865..72a3fbe83e13 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -329,6 +329,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
   setOperationAction(ISD::STACKRESTORE      , MVT::Other, Custom);
   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Custom);
   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64  , Custom);
+  setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i32, Custom);
+  setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i64, Custom);
 
   // We want to custom lower some of our intrinsics.
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
@@ -998,6 +1000,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case PPCISD::Lo:              return "PPCISD::Lo";
   case PPCISD::TOC_ENTRY:       return "PPCISD::TOC_ENTRY";
   case PPCISD::DYNALLOC:        return "PPCISD::DYNALLOC";
+  case PPCISD::DYNAREAOFFSET:   return "PPCISD::DYNAREAOFFSET";
   case PPCISD::GlobalBaseReg:   return "PPCISD::GlobalBaseReg";
   case PPCISD::SRL:             return "PPCISD::SRL";
   case PPCISD::SRA:             return "PPCISD::SRA";
@@ -5808,6 +5811,22 @@ PPCTargetLowering::LowerReturn(SDValue Chain,
   return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, RetOps);
 }
 
+SDValue PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET(
+    SDValue Op, SelectionDAG &DAG, const PPCSubtarget &Subtarget) const {
+  SDLoc dl(Op);
+
+  // Get the corect type for integers.
+  EVT IntVT = Op.getValueType();
+
+  // Get the inputs.
+  SDValue Chain = Op.getOperand(0);
+  SDValue FPSIdx = getFramePointerFrameIndex(DAG);
+  // Build a DYNAREAOFFSET node.
+  SDValue Ops[2] = {Chain, FPSIdx};
+  SDVTList VTs = DAG.getVTList(IntVT);
+  return DAG.getNode(PPCISD::DYNAREAOFFSET, dl, VTs, Ops);
+}
+
 SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG,
                                    const PPCSubtarget &Subtarget) const {
   // When we pop the dynamic allocation we need to restore the SP link.
@@ -7938,6 +7957,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::STACKRESTORE:       return LowerSTACKRESTORE(Op, DAG, Subtarget);
   case ISD::DYNAMIC_STACKALLOC:
     return LowerDYNAMIC_STACKALLOC(Op, DAG, Subtarget);
+  case ISD::GET_DYNAMIC_AREA_OFFSET: return LowerGET_DYNAMIC_AREA_OFFSET(Op, DAG, Subtarget);
 
   case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
   case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 415c47c286e3..c0aafbac1aa0 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -79,6 +79,11 @@ namespace llvm {
       /// compute an allocation on the stack.
       DYNALLOC,
 
+      /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to
+      /// compute an offset from native SP to the address  of the most recent
+      /// dynamic alloca.
+      DYNAREAOFFSET,
+
       /// GlobalBaseReg - On Darwin, this node represents the result of the mflr
       /// at function entry, used for PIC code.
       GlobalBaseReg,
@@ -728,6 +733,8 @@ namespace llvm {
                         const PPCSubtarget &Subtarget) const;
     SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG,
                                 const PPCSubtarget &Subtarget) const;
+    SDValue LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op, SelectionDAG &DAG,
+                                         const PPCSubtarget &Subtarget) const;
     SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG,
                                       const PPCSubtarget &Subtarget) const;
     SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index d62833037db5..075e093e41a1 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -369,6 +369,8 @@ let Defs = [X1], Uses = [X1] in
 def DYNALLOC8 : Pseudo<(outs g8rc:$result), (ins g8rc:$negsize, memri:$fpsi),"#DYNALLOC8",
                        [(set i64:$result,
                              (PPCdynalloc i64:$negsize, iaddr:$fpsi))]>;
+def DYNAREAOFFSET8 : Pseudo<(outs i64imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET8",
+                       [(set i64:$result, (PPCdynareaoffset iaddr:$fpsi))]>;
 
 let Defs = [LR8] in {
 def MTLR8  : XFXForm_7_ext<31, 467, 8, (outs), (ins g8rc:$rS),
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index cc1af1a7132f..6c4364aad331 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -226,7 +226,9 @@ def PPCcr6unset : SDNode<"PPCISD::CR6UNSET", SDTNone,
 
 // Instructions to support dynamic alloca.
 def SDTDynOp  : SDTypeProfile<1, 2, []>;
+def SDTDynAreaOp  : SDTypeProfile<1, 1, []>;
 def PPCdynalloc   : SDNode<"PPCISD::DYNALLOC", SDTDynOp, [SDNPHasChain]>;
+def PPCdynareaoffset   : SDNode<"PPCISD::DYNAREAOFFSET", SDTDynAreaOp, [SDNPHasChain]>;
 
 //===----------------------------------------------------------------------===//
 // PowerPC specific transformation functions and pattern fragments.
@@ -1029,6 +1031,8 @@ let Defs = [R1], Uses = [R1] in
 def DYNALLOC : Pseudo<(outs gprc:$result), (ins gprc:$negsize, memri:$fpsi), "#DYNALLOC",
                        [(set i32:$result,
                              (PPCdynalloc i32:$negsize, iaddr:$fpsi))]>;
+def DYNAREAOFFSET : Pseudo<(outs i32imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET",
+                       [(set i32:$result, (PPCdynareaoffset iaddr:$fpsi))]>;
                          
 // SELECT_CC_* - Used to implement the SELECT_CC DAG operation.  Expanded after
 // instruction selection into a branch sequence.
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 6d53f876c062..934bdf622418 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -430,6 +430,27 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const {
   MBB.erase(II);
 }
 
+void PPCRegisterInfo::lowerDynamicAreaOffset(
+    MachineBasicBlock::iterator II) const {
+  // Get the instruction.
+  MachineInstr &MI = *II;
+  // Get the instruction's basic block.
+  MachineBasicBlock &MBB = *MI.getParent();
+  // Get the basic block's function.
+  MachineFunction &MF = *MBB.getParent();
+  // Get the frame info.
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  // Get the instruction info.
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+
+  unsigned maxCallFrameSize = MFI->getMaxCallFrameSize();
+  DebugLoc dl = MI.getDebugLoc();
+  BuildMI(MBB, II, dl, TII.get(PPC::LI), MI.getOperand(0).getReg())
+      .addImm(maxCallFrameSize);
+  MBB.erase(II);
+}
+
 /// lowerCRSpilling - Generate the code for spilling a CR register. Instead of
 /// reserving a whole register (R0), we scrounge for one here. This generates
 /// code like this:
@@ -754,6 +775,11 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   // Get the instruction opcode.
   unsigned OpC = MI.getOpcode();
 
+  if ((OpC == PPC::DYNAREAOFFSET || OpC == PPC::DYNAREAOFFSET8)) {
+    lowerDynamicAreaOffset(II);
+    return;
+  }
+
   // Special case for dynamic alloca.
   if (FPSI && FrameIndex == FPSI &&
       (OpC == PPC::DYNALLOC || OpC == PPC::DYNALLOC8)) {
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h
index 1b1e160d836c..b15fde83c9f3 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -101,6 +101,7 @@ class PPCRegisterInfo : public PPCGenRegisterInfo {
   }
 
   void lowerDynamicAlloc(MachineBasicBlock::iterator II) const;
+  void lowerDynamicAreaOffset(MachineBasicBlock::iterator II) const;
   void lowerCRSpilling(MachineBasicBlock::iterator II,
                        unsigned FrameIndex) const;
   void lowerCRRestore(MachineBasicBlock::iterator II,
diff --git a/test/CodeGen/PowerPC/dyn-alloca-offset.ll b/test/CodeGen/PowerPC/dyn-alloca-offset.ll
new file mode 100644
index 000000000000..7159b9da736d
--- /dev/null
+++ b/test/CodeGen/PowerPC/dyn-alloca-offset.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+declare i64 @llvm.get.dynamic.area.offset.i64()
+
+declare i64 @bar(i64)
+
+attributes #0 = { nounwind }
+
+; Function Attrs: nounwind sanitize_address uwtable
+define signext i64 @foo(i32 signext %N, i32 signext %M) #0 {
+  %1 = alloca i64, align 32
+  %dynamic_area_offset = call i64 @llvm.get.dynamic.area.offset.i64()
+  %2 = call i64 @bar(i64 %dynamic_area_offset)
+  ret i64 %2
+
+; CHECK-DAG: li [[REG1:[0-9]+]], 112
+; CHECK: blr
+
+}

From c309d733467cec5a6b04706a6bccc0f399ad0fdf Mon Sep 17 00:00:00 2001
From: Hrvoje Varga <Hrvoje.Varga@imgtec.com>
Date: Tue, 1 Dec 2015 11:59:21 +0000
Subject: [PATCH 072/186] [mips][microMIPS] Implement RECIP.fmt, RINT.fmt,
 ROUND.L.fmt, ROUND.W.fmt, SEL.fmt, SELEQZ.fmt, SELNEQZ.fmt and CLASS.fmt
 Differential Revision: http://reviews.llvm.org/D13885

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254405 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Mips/MicroMips32r6InstrFormats.td  | 62 +++++++++++++++
 lib/Target/Mips/MicroMips32r6InstrInfo.td     | 75 +++++++++++++++++++
 lib/Target/Mips/MicroMipsInstrFPU.td          |  4 +-
 lib/Target/Mips/Mips32r6InstrInfo.td          | 44 ++++++-----
 lib/Target/Mips/MipsInstrFPU.td               | 10 +--
 .../Disassembler/Mips/micromips32r6/valid.txt | 16 ++++
 .../Disassembler/Mips/micromips64r6/valid.txt | 16 ++++
 test/MC/Mips/micromips32r6/valid.s            | 16 ++++
 test/MC/Mips/micromips64r6/valid.s            | 16 ++++
 9 files changed, 233 insertions(+), 26 deletions(-)

diff --git a/lib/Target/Mips/MicroMips32r6InstrFormats.td b/lib/Target/Mips/MicroMips32r6InstrFormats.td
index c4cdb0c2fadd..349b3b88a07a 100644
--- a/lib/Target/Mips/MicroMips32r6InstrFormats.td
+++ b/lib/Target/Mips/MicroMips32r6InstrFormats.td
@@ -796,3 +796,65 @@ class POOL32A_WRPGPR_WSBH_FM_MMR6<bits<10> funct> : MipsR6Inst {
   let Inst{15-6}  = funct;
   let Inst{5-0}   = 0x3c;
 }
+
+class POOL32F_RECIP_ROUND_FM_MMR6<string instr_asm, bits<1> fmt, bits<8> funct>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  bits<5> ft;
+  bits<5> fs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b010101;
+  let Inst{25-21} = ft;
+  let Inst{20-16} = fs;
+  let Inst{15}    = 0;
+  let Inst{14}    = fmt;
+  let Inst{13-6}  = funct;
+  let Inst{5-0}   = 0b111011;
+}
+
+class POOL32F_RINT_FM_MMR6<string instr_asm, bits<2> fmt>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  bits<5> fs;
+  bits<5> fd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b010101;
+  let Inst{25-21} = fs;
+  let Inst{20-16} = fd;
+  let Inst{15-11} = 0;
+  let Inst{10-9}  = fmt;
+  let Inst{8-0}   = 0b000100000;
+}
+
+class POOL32F_SEL_FM_MMR6<string instr_asm, bits<2> fmt, bits<9> funct>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  bits<5> ft;
+  bits<5> fs;
+  bits<5> fd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b010101;
+  let Inst{25-21} = ft;
+  let Inst{20-16} = fs;
+  let Inst{15-11} = fd;
+  let Inst{10-9}  = fmt;
+  let Inst{8-0}   = funct;
+}
+
+class POOL32F_CLASS_FM_MMR6<string instr_asm, bits<2> fmt, bits<9> funct>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  bits<5> fs;
+  bits<5> fd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b010101;
+  let Inst{25-21} = fs;
+  let Inst{20-16} = fd;
+  let Inst{15-11} = 0b00000;
+  let Inst{10-9}  = fmt;
+  let Inst{8-0}   = funct;
+}
diff --git a/lib/Target/Mips/MicroMips32r6InstrInfo.td b/lib/Target/Mips/MicroMips32r6InstrInfo.td
index 2dbd20cfad99..8c744d8924bb 100644
--- a/lib/Target/Mips/MicroMips32r6InstrInfo.td
+++ b/lib/Target/Mips/MicroMips32r6InstrInfo.td
@@ -132,6 +132,26 @@ class LLE_MMR6_ENC : LOAD_WORD_EVA_FM_MMR6<0b110>;
 class LWE_MMR6_ENC : LOAD_WORD_EVA_FM_MMR6<0b111>;
 class LW_MMR6_ENC : LOAD_WORD_FM_MMR6;
 class LUI_MMR6_ENC : LOAD_UPPER_IMM_FM_MMR6;
+class RECIP_S_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"recip.s", 0, 0b01001000>;
+class RECIP_D_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"recip.d", 1, 0b01001000>;
+class RINT_S_MMR6_ENC : POOL32F_RINT_FM_MMR6<"rint.s", 0>;
+class RINT_D_MMR6_ENC : POOL32F_RINT_FM_MMR6<"rint.d", 1>;
+class ROUND_L_S_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.l.s", 0,
+                                                       0b11001100>;
+class ROUND_L_D_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.l.d", 1,
+                                                       0b11001100>;
+class ROUND_W_S_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.w.s", 0,
+                                                       0b11101100>;
+class ROUND_W_D_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.w.d", 1,
+                                                       0b11101100>;
+class SEL_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"sel.s", 0, 0b010111000>;
+class SEL_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"sel.d", 1, 0b010111000>;
+class SELEQZ_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"seleqz.s", 0, 0b000111000>;
+class SELEQZ_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"seleqz.d", 1, 0b000111000>;
+class SELENZ_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"selenz.s", 0, 0b001111000>;
+class SELENZ_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"selenz.d", 1, 0b001111000>;
+class CLASS_S_MMR6_ENC : POOL32F_CLASS_FM_MMR6<"class.s", 0, 0b001100000>;
+class CLASS_D_MMR6_ENC : POOL32F_CLASS_FM_MMR6<"class.d", 1, 0b001100000>;
 
 class ADDU16_MMR6_ENC : POOL16A_ADDU16_FM_MMR6;
 class AND16_MMR6_ENC : POOL16C_AND16_FM_MMR6;
@@ -724,6 +744,33 @@ class RSQRT_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"rsqrt.s", FGR32Opnd,
                                                   FGR32Opnd, II_TRUNC>;
 class RSQRT_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"rsqrt.d", FGR32Opnd,
                                                   AFGR64Opnd, II_TRUNC>;
+class RECIP_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"recip.s", FGR32Opnd,
+                                                 FGR32Opnd, II_ROUND>;
+class RECIP_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"recip.d", FGR32Opnd, FGR32Opnd,
+                                                 II_ROUND>;
+class ROUND_L_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.l.s", FGR64Opnd,
+                                                   FGR32Opnd, II_ROUND>;
+class ROUND_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.l.d", FGR64Opnd,
+                                                   FGR64Opnd, II_ROUND>;
+class ROUND_W_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.w.s", FGR32Opnd,
+                                                   FGR32Opnd, II_ROUND>;
+class ROUND_W_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.w.d", FGR64Opnd,
+                                                   FGR64Opnd, II_ROUND>;
+
+class SEL_S_MMR6_DESC : COP1_SEL_DESC_BASE<"sel.s", FGR32Opnd>;
+class SEL_D_MMR6_DESC : COP1_SEL_DESC_BASE<"sel.d", FGR64Opnd> {
+  // We must insert a SUBREG_TO_REG around $fd_in
+  bit usesCustomInserter = 1;
+}
+
+class SELEQZ_S_MMR6_DESC : SELEQNEZ_DESC_BASE<"seleqz.s", FGR32Opnd>;
+class SELEQZ_D_MMR6_DESC : SELEQNEZ_DESC_BASE<"seleqz.d", FGR64Opnd>;
+class SELENZ_S_MMR6_DESC : SELEQNEZ_DESC_BASE<"selnez.s", FGR32Opnd>;
+class SELENZ_D_MMR6_DESC : SELEQNEZ_DESC_BASE<"selnez.d", FGR64Opnd>;
+class RINT_S_MMR6_DESC : CLASS_RINT_DESC_BASE<"rint.s", FGR32Opnd>;
+class RINT_D_MMR6_DESC : CLASS_RINT_DESC_BASE<"rint.d", FGR64Opnd>;
+class CLASS_S_MMR6_DESC  : CLASS_RINT_DESC_BASE<"class.s", FGR32Opnd>;
+class CLASS_D_MMR6_DESC  : CLASS_RINT_DESC_BASE<"class.d", FGR64Opnd>;
 
 class STORE_MMR6_DESC_BASE<string opstr, DAGOperand RO>
     : Store<opstr, RO>, MMR6Arch<opstr> {
@@ -1121,6 +1168,34 @@ def SUBU16_MMR6 : StdMMR6Rel, SUBU16_MMR6_DESC, SUBU16_MMR6_ENC,
                   ISA_MICROMIPS32R6;
 def XOR16_MMR6 : StdMMR6Rel, XOR16_MMR6_DESC, XOR16_MMR6_ENC,
                  ISA_MICROMIPS32R6;
+def RECIP_S_MMR6 : StdMMR6Rel, RECIP_S_MMR6_ENC, RECIP_S_MMR6_DESC,
+                   ISA_MICROMIPS32R6;
+def RECIP_D_MMR6 : StdMMR6Rel, RECIP_D_MMR6_ENC, RECIP_D_MMR6_DESC, ISA_MICROMIPS32R6;
+def RINT_S_MMR6 : StdMMR6Rel, RINT_S_MMR6_ENC, RINT_S_MMR6_DESC,
+                  ISA_MICROMIPS32R6;
+def RINT_D_MMR6 : StdMMR6Rel, RINT_D_MMR6_ENC, RINT_D_MMR6_DESC, ISA_MICROMIPS32R6;
+def ROUND_L_S_MMR6 : StdMMR6Rel, ROUND_L_S_MMR6_ENC, ROUND_L_S_MMR6_DESC,
+                     ISA_MICROMIPS32R6;
+def ROUND_L_D_MMR6 : StdMMR6Rel, ROUND_L_D_MMR6_ENC, ROUND_L_D_MMR6_DESC,
+                     ISA_MICROMIPS32R6;
+def ROUND_W_S_MMR6 : StdMMR6Rel, ROUND_W_S_MMR6_ENC, ROUND_W_S_MMR6_DESC,
+                     ISA_MICROMIPS32R6;
+def ROUND_W_D_MMR6 : StdMMR6Rel, ROUND_W_D_MMR6_ENC, ROUND_W_D_MMR6_DESC,
+                     ISA_MICROMIPS32R6;
+def SEL_S_MMR6 : StdMMR6Rel, SEL_S_MMR6_ENC, SEL_S_MMR6_DESC, ISA_MICROMIPS32R6;
+def SEL_D_MMR6 : StdMMR6Rel, SEL_D_MMR6_ENC, SEL_D_MMR6_DESC, ISA_MICROMIPS32R6;
+def SELEQZ_S_MMR6 : StdMMR6Rel, SELEQZ_S_MMR6_ENC, SELEQZ_S_MMR6_DESC,
+                    ISA_MICROMIPS32R6;
+def SELEQZ_D_MMR6 : StdMMR6Rel, SELEQZ_D_MMR6_ENC, SELEQZ_D_MMR6_DESC,
+                    ISA_MICROMIPS32R6;
+def SELENZ_S_MMR6 : StdMMR6Rel, SELENZ_S_MMR6_ENC, SELENZ_S_MMR6_DESC,
+                    ISA_MICROMIPS32R6;
+def SELENZ_D_MMR6 : StdMMR6Rel, SELENZ_D_MMR6_ENC, SELENZ_D_MMR6_DESC,
+                    ISA_MICROMIPS32R6;
+def CLASS_S_MMR6 : StdMMR6Rel, CLASS_S_MMR6_ENC, CLASS_S_MMR6_DESC,
+                   ISA_MICROMIPS32R6;
+def CLASS_D_MMR6 : StdMMR6Rel, CLASS_D_MMR6_ENC, CLASS_D_MMR6_DESC,
+                   ISA_MICROMIPS32R6;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/MicroMipsInstrFPU.td b/lib/Target/Mips/MicroMipsInstrFPU.td
index 120a841c3d9d..756e6c92c1d1 100644
--- a/lib/Target/Mips/MicroMipsInstrFPU.td
+++ b/lib/Target/Mips/MicroMipsInstrFPU.td
@@ -43,7 +43,7 @@ def BC1T_MM : MMRel, BC1F_FT<"bc1t", brtarget_mm, II_BC1T, MIPS_BRANCH_T>,
               BC1F_FM_MM<0x1d>, ISA_MIPS1_NOT_32R6_64R6;
 def CVT_W_S_MM   : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>,
                    ROUND_W_FM_MM<0, 0x24>;
-def ROUND_W_S_MM : MMRel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>,
+def ROUND_W_S_MM : MMRel, StdMMR6Rel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>,
                    ROUND_W_FM_MM<0, 0xec>;
 
 def CEIL_W_MM  : MMRel, ABSS_FT<"ceil.w.d", FGR32Opnd, AFGR64Opnd, II_CEIL>,
@@ -52,7 +52,7 @@ def CVT_W_MM   : MMRel, ABSS_FT<"cvt.w.d", FGR32Opnd, AFGR64Opnd, II_CVT>,
                  ROUND_W_FM_MM<1, 0x24>;
 def FLOOR_W_MM : MMRel, ABSS_FT<"floor.w.d", FGR32Opnd, AFGR64Opnd, II_FLOOR>,
                  ROUND_W_FM_MM<1, 0x2c>;
-def ROUND_W_MM : MMRel, ABSS_FT<"round.w.d", FGR32Opnd, AFGR64Opnd, II_ROUND>,
+def ROUND_W_MM : MMRel, StdMMR6Rel, ABSS_FT<"round.w.d", FGR32Opnd, AFGR64Opnd, II_ROUND>,
                  ROUND_W_FM_MM<1, 0xec>;
 def TRUNC_W_MM : MMRel, ABSS_FT<"trunc.w.d", FGR32Opnd, AFGR64Opnd, II_TRUNC>,
                  ROUND_W_FM_MM<1, 0xac>;
diff --git a/lib/Target/Mips/Mips32r6InstrInfo.td b/lib/Target/Mips/Mips32r6InstrInfo.td
index 9dd4d1e034e9..c36a45acbf79 100644
--- a/lib/Target/Mips/Mips32r6InstrInfo.td
+++ b/lib/Target/Mips/Mips32r6InstrInfo.td
@@ -687,8 +687,10 @@ def BNEZC : BNEZC_ENC, BNEZC_DESC, ISA_MIPS32R6;
 def BNVC : BNVC_ENC, BNVC_DESC, ISA_MIPS32R6;
 def BOVC : BOVC_ENC, BOVC_DESC, ISA_MIPS32R6;
 def CACHE_R6 : R6MMR6Rel, CACHE_ENC, CACHE_DESC, ISA_MIPS32R6;
-def CLASS_D : CLASS_D_ENC, CLASS_D_DESC, ISA_MIPS32R6, HARDFLOAT;
-def CLASS_S : CLASS_S_ENC, CLASS_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def CLASS_D : CLASS_D_ENC, CLASS_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+  def CLASS_S : CLASS_S_ENC, CLASS_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+}
 def CLO_R6 : R6MMR6Rel, CLO_R6_ENC, CLO_R6_DESC, ISA_MIPS32R6;
 def CLZ_R6 : R6MMR6Rel, CLZ_R6_ENC, CLZ_R6_DESC, ISA_MIPS32R6;
 defm S : CMP_CC_M<FIELD_CMP_FORMAT_S, "s", FGR32Opnd>;
@@ -707,14 +709,14 @@ def LWUPC : LWUPC_ENC, LWUPC_DESC, ISA_MIPS32R6;
 let AdditionalPredicates = [NotInMicroMips] in {
   def MADDF_S : MADDF_S_ENC, MADDF_S_DESC, ISA_MIPS32R6, HARDFLOAT;
   def MADDF_D : MADDF_D_ENC, MADDF_D_DESC, ISA_MIPS32R6, HARDFLOAT;
-  def MAXA_D  : MAXA_D_ENC, MAXA_D_DESC, ISA_MIPS32R6, HARDFLOAT;
-  def MAXA_S  : MAXA_S_ENC, MAXA_S_DESC, ISA_MIPS32R6, HARDFLOAT;
-  def MAX_D   : MAX_D_ENC, MAX_D_DESC, ISA_MIPS32R6, HARDFLOAT;
-  def MAX_S   : MAX_S_ENC, MAX_S_DESC, ISA_MIPS32R6, HARDFLOAT;
-  def MINA_D  : MINA_D_ENC, MINA_D_DESC, ISA_MIPS32R6, HARDFLOAT;
-  def MINA_S  : MINA_S_ENC, MINA_S_DESC, ISA_MIPS32R6, HARDFLOAT;
-  def MIN_D   : MIN_D_ENC, MIN_D_DESC, ISA_MIPS32R6, HARDFLOAT;
-  def MIN_S   : MIN_S_ENC, MIN_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+  def MAXA_D : MAXA_D_ENC, MAXA_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+  def MAXA_S : MAXA_S_ENC, MAXA_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+  def MAX_D : MAX_D_ENC, MAX_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+  def MAX_S : MAX_S_ENC, MAX_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+  def MINA_D : MINA_D_ENC, MINA_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+  def MINA_S : MINA_S_ENC, MINA_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+  def MIN_D : MIN_D_ENC, MIN_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+  def MIN_S : MIN_S_ENC, MIN_S_DESC, ISA_MIPS32R6, HARDFLOAT;
 }
 def MOD : R6MMR6Rel, MOD_ENC, MOD_DESC, ISA_MIPS32R6;
 def MODU : R6MMR6Rel, MODU_ENC, MODU_DESC, ISA_MIPS32R6;
@@ -728,21 +730,27 @@ def MUL_R6 : R6MMR6Rel, MUL_R6_ENC, MUL_R6_DESC, ISA_MIPS32R6;
 def MULU   : R6MMR6Rel, MULU_ENC, MULU_DESC, ISA_MIPS32R6;
 def NAL; // BAL with rd=0
 def PREF_R6 : R6MMR6Rel, PREF_ENC, PREF_DESC, ISA_MIPS32R6;
-def RINT_D : RINT_D_ENC, RINT_D_DESC, ISA_MIPS32R6, HARDFLOAT;
-def RINT_S : RINT_S_ENC, RINT_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def RINT_D : RINT_D_ENC, RINT_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+  def RINT_S : RINT_S_ENC, RINT_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+}
 def SC_R6 : SC_R6_ENC, SC_R6_DESC, ISA_MIPS32R6;
 let AdditionalPredicates = [NotInMicroMips] in {
 def SDBBP_R6 : SDBBP_R6_ENC, SDBBP_R6_DESC, ISA_MIPS32R6;
 }
 def SDC2_R6 : SDC2_R6_ENC, SDC2_R6_DESC, ISA_MIPS32R6;
 def SELEQZ : R6MMR6Rel, SELEQZ_ENC, SELEQZ_DESC, ISA_MIPS32R6, GPR_32;
-def SELEQZ_D : SELEQZ_D_ENC, SELEQZ_D_DESC, ISA_MIPS32R6, HARDFLOAT;
-def SELEQZ_S : SELEQZ_S_ENC, SELEQZ_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def SELEQZ_D : SELEQZ_D_ENC, SELEQZ_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+  def SELEQZ_S : SELEQZ_S_ENC, SELEQZ_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+}
 def SELNEZ : R6MMR6Rel, SELNEZ_ENC, SELNEZ_DESC, ISA_MIPS32R6, GPR_32;
-def SELNEZ_D : SELNEZ_D_ENC, SELNEZ_D_DESC, ISA_MIPS32R6, HARDFLOAT;
-def SELNEZ_S : SELNEZ_S_ENC, SELNEZ_S_DESC, ISA_MIPS32R6, HARDFLOAT;
-def SEL_D    : SEL_D_ENC, SEL_D_DESC, ISA_MIPS32R6, HARDFLOAT;
-def SEL_S    : SEL_S_ENC, SEL_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def SELNEZ_D : SELNEZ_D_ENC, SELNEZ_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+  def SELNEZ_S : SELNEZ_S_ENC, SELNEZ_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+  def SEL_D : SEL_D_ENC, SEL_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+  def SEL_S : SEL_S_ENC, SEL_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+}
 def SWC2_R6 : SWC2_R6_ENC, SWC2_R6_DESC, ISA_MIPS32R6;
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td
index d1a724944335..377260f89d10 100644
--- a/lib/Target/Mips/MipsInstrFPU.td
+++ b/lib/Target/Mips/MipsInstrFPU.td
@@ -136,7 +136,7 @@ multiclass ABSS_M<string opstr, InstrItinClass Itin,
 
 multiclass ROUND_M<string opstr, InstrItinClass Itin> {
   def _D32 : MMRel, ABSS_FT<opstr, FGR32Opnd, AFGR64Opnd, Itin>, FGR_32;
-  def _D64 : ABSS_FT<opstr, FGR32Opnd, FGR64Opnd, Itin>, FGR_64 {
+  def _D64 : StdMMR6Rel, ABSS_FT<opstr, FGR32Opnd, FGR64Opnd, Itin>, FGR_64 {
     let DecoderNamespace = "Mips64";
   }
 }
@@ -267,31 +267,29 @@ defm D64 : C_COND_M<"d", FGR64Opnd, 17, II_C_CC_D>, ISA_MIPS1_NOT_32R6_64R6,
 //===----------------------------------------------------------------------===//
 // Floating Point Instructions
 //===----------------------------------------------------------------------===//
-def ROUND_W_S  : MMRel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>,
+def ROUND_W_S  : MMRel, StdMMR6Rel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>,
                  ABSS_FM<0xc, 16>, ISA_MIPS2;
-let AdditionalPredicates = [NotInMicroMips] in {
+defm ROUND_W : ROUND_M<"round.w.d", II_ROUND>, ABSS_FM<0xc, 17>, ISA_MIPS2;
 def TRUNC_W_S  : MMRel, StdMMR6Rel, ABSS_FT<"trunc.w.s", FGR32Opnd, FGR32Opnd, II_TRUNC>,
                  ABSS_FM<0xd, 16>, ISA_MIPS2;
 def CEIL_W_S   : MMRel, StdMMR6Rel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>,
                  ABSS_FM<0xe, 16>, ISA_MIPS2;
 def FLOOR_W_S  : MMRel, StdMMR6Rel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, II_FLOOR>,
                  ABSS_FM<0xf, 16>, ISA_MIPS2;
-}
 def CVT_W_S    : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>,
                  ABSS_FM<0x24, 16>;
 
-defm ROUND_W : ROUND_M<"round.w.d", II_ROUND>, ABSS_FM<0xc, 17>, ISA_MIPS2;
 defm TRUNC_W : ROUND_M<"trunc.w.d", II_TRUNC>, ABSS_FM<0xd, 17>, ISA_MIPS2;
 defm CEIL_W  : ROUND_M<"ceil.w.d", II_CEIL>, ABSS_FM<0xe, 17>, ISA_MIPS2;
 defm FLOOR_W : ROUND_M<"floor.w.d", II_FLOOR>, ABSS_FM<0xf, 17>, ISA_MIPS2;
 defm CVT_W   : ROUND_M<"cvt.w.d", II_CVT>, ABSS_FM<0x24, 17>;
 
 let DecoderNamespace = "Mips64" in {
+  let AdditionalPredicates = [NotInMicroMips] in {
   def ROUND_L_S : ABSS_FT<"round.l.s", FGR64Opnd, FGR32Opnd, II_ROUND>,
                   ABSS_FM<0x8, 16>, FGR_64;
   def ROUND_L_D64 : ABSS_FT<"round.l.d", FGR64Opnd, FGR64Opnd, II_ROUND>,
                     ABSS_FM<0x8, 17>, FGR_64;
-  let AdditionalPredicates = [NotInMicroMips] in {
   def TRUNC_L_S : ABSS_FT<"trunc.l.s", FGR64Opnd, FGR32Opnd, II_TRUNC>,
                   ABSS_FM<0x9, 16>, FGR_64;
   def TRUNC_L_D64 : ABSS_FT<"trunc.l.d", FGR64Opnd, FGR64Opnd, II_TRUNC>,
diff --git a/test/MC/Disassembler/Mips/micromips32r6/valid.txt b/test/MC/Disassembler/Mips/micromips32r6/valid.txt
index 82c5d50df92d..5fa2138262a4 100644
--- a/test/MC/Disassembler/Mips/micromips32r6/valid.txt
+++ b/test/MC/Disassembler/Mips/micromips32r6/valid.txt
@@ -237,3 +237,19 @@
 0xea 0x11 # CHECK: sw16 $4, 4($17)
 0xe8 0x11 # CHECK: sw16 $zero, 4($17)
 0x45 0x2a # CHECK: swm16 $16, $17, $ra, 8($sp)
+0x54 0x44 0x12 0x3b # CHECK: recip.s $f2, $f4
+0x54 0x44 0x52 0x3b # CHECK: recip.d $f2, $f4
+0x54 0x82 0x00 0x20 # CHECK: rint.s $f2, $f4
+0x54 0x82 0x02 0x20 # CHECK: rint.d $f2, $f4
+0x54 0x44 0x33 0x3b # CHECK: round.l.s $f2, $f4
+0x54 0x44 0x73 0x3b # CHECK: round.l.d $f2, $f4
+0x54 0x44 0x3b 0x3b # CHECK: round.w.s $f2, $f4
+0x54 0x44 0x7b 0x3b # CHECK: round.w.d $f2, $f4
+0x54 0x41 0x08 0xb8 # CHECK: sel.s $f1, $f1, $f2
+0x54 0x82 0x02 0xb8 # CHECK: sel.d $f0, $f2, $f4
+0x54 0x62 0x08 0x38 # CHECK: seleqz.s $f1, $f2, $f3
+0x55 0x04 0x12 0x38 # CHECK: seleqz.d $f2, $f4, $f8
+0x54 0x62 0x08 0x78 # CHECK: selnez.s $f1, $f2, $f3
+0x55 0x04 0x12 0x78 # CHECK: selnez.d $f2, $f4, $f8
+0x54 0x62 0x00 0x60 # CHECK: class.s $f2, $f3
+0x54 0x82 0x02 0x60 # CHECK: class.d $f2, $f4
diff --git a/test/MC/Disassembler/Mips/micromips64r6/valid.txt b/test/MC/Disassembler/Mips/micromips64r6/valid.txt
index fb641c06bfba..10a9687384ea 100644
--- a/test/MC/Disassembler/Mips/micromips64r6/valid.txt
+++ b/test/MC/Disassembler/Mips/micromips64r6/valid.txt
@@ -150,3 +150,19 @@
 0xea 0x11 # CHECK: sw16 $4, 4($17)
 0xe8 0x11 # CHECK: sw16 $zero, 4($17)
 0x45 0x2a # CHECK: swm16 $16, $17, $ra, 8($sp)
+0x54 0x44 0x12 0x3b # CHECK: recip.s $f2, $f4
+0x54 0x44 0x52 0x3b # CHECK: recip.d $f2, $f4
+0x54 0x82 0x00 0x20 # CHECK: rint.s $f2, $f4
+0x54 0x82 0x02 0x20 # CHECK: rint.d $f2, $f4
+0x54 0x44 0x33 0x3b # CHECK: round.l.s $f2, $f4
+0x54 0x44 0x73 0x3b # CHECK: round.l.d $f2, $f4
+0x54 0x44 0x3b 0x3b # CHECK: round.w.s $f2, $f4
+0x54 0x44 0x7b 0x3b # CHECK: round.w.d $f2, $f4
+0x54 0x41 0x08 0xb8 # CHECK: sel.s $f1, $f1, $f2
+0x54 0x82 0x02 0xb8 # CHECK: sel.d $f0, $f2, $f4
+0x54 0x62 0x08 0x38 # CHECK: seleqz.s $f1, $f2, $f3
+0x55 0x04 0x12 0x38 # CHECK: seleqz.d $f2, $f4, $f8
+0x54 0x62 0x08 0x78 # CHECK: selnez.s $f1, $f2, $f3
+0x55 0x04 0x12 0x78 # CHECK: selnez.d $f2, $f4, $f8
+0x54 0x62 0x00 0x60 # CHECK: class.s $f2, $f3
+0x54 0x82 0x02 0x60 # CHECK: class.d $f2, $f4
diff --git a/test/MC/Mips/micromips32r6/valid.s b/test/MC/Mips/micromips32r6/valid.s
index 81d4b6c6d456..194b15e1a4f6 100644
--- a/test/MC/Mips/micromips32r6/valid.s
+++ b/test/MC/Mips/micromips32r6/valid.s
@@ -230,3 +230,19 @@
   lbu $4, 8($5)            # CHECK: lbu $4, 8($5)       # encoding: [0x14,0x85,0x00,0x08]
   lbe $4, 8($5)            # CHECK: lbe $4, 8($5)       # encoding: [0x60,0x85,0x68,0x08]
   lbue $4, 8($5)           # CHECK: lbue $4, 8($5)      # encoding: [0x60,0x85,0x60,0x08]
+  recip.s $f2, $f4         # CHECK: recip.s $f2, $f4    # encoding: [0x54,0x44,0x12,0x3b]
+  recip.d $f2, $f4         # CHECK: recip.d $f2, $f4    # encoding: [0x54,0x44,0x52,0x3b]
+  rint.s $f2, $f4          # CHECK: rint.s $f2, $f4     # encoding: [0x54,0x82,0x00,0x20]
+  rint.d $f2, $f4          # CHECK: rint.d $f2, $f4     # encoding: [0x54,0x82,0x02,0x20]
+  round.l.s $f2, $f4       # CHECK: round.l.s $f2, $f4  # encoding: [0x54,0x44,0x33,0x3b]
+  round.l.d $f2, $f4       # CHECK: round.l.d $f2, $f4  # encoding: [0x54,0x44,0x73,0x3b]
+  round.w.s $f2, $f4       # CHECK: round.w.s $f2, $f4  # encoding: [0x54,0x44,0x3b,0x3b]
+  round.w.d $f2, $f4       # CHECK: round.w.d $f2, $f4  # encoding: [0x54,0x44,0x7b,0x3b]
+  sel.s $f1, $f1, $f2      # CHECK: sel.s $f1, $f1, $f2 # encoding: [0x54,0x41,0x08,0xb8]
+  sel.d $f0, $f2, $f4      # CHECK: sel.d $f0, $f2, $f4 # encoding: [0x54,0x82,0x02,0xb8]
+  seleqz.s $f1, $f2, $f3   # CHECK: seleqz.s $f1, $f2, $f3 # encoding: [0x54,0x62,0x08,0x38]
+  seleqz.d $f2, $f4, $f8   # CHECK: seleqz.d $f2, $f4, $f8 # encoding: [0x55,0x04,0x12,0x38]
+  selnez.s $f1, $f2, $f3   # CHECK: selnez.s $f1, $f2, $f3 # encoding: [0x54,0x62,0x08,0x78]
+  selnez.d $f2, $f4, $f8   # CHECK: selnez.d $f2, $f4, $f8 # encoding: [0x55,0x04,0x12,0x78]
+  class.s $f2, $f3         # CHECK: class.s $f2, $f3       # encoding: [0x54,0x62,0x00,0x60]
+  class.d $f2, $f4         # CHECK: class.d $f2, $f4       # encoding: [0x54,0x82,0x02,0x60]
diff --git a/test/MC/Mips/micromips64r6/valid.s b/test/MC/Mips/micromips64r6/valid.s
index 685810741780..1c8781b6e966 100644
--- a/test/MC/Mips/micromips64r6/valid.s
+++ b/test/MC/Mips/micromips64r6/valid.s
@@ -130,5 +130,21 @@ a:
         sw16 $0, 4($17)          # CHECK: sw16 $zero, 4($17)  # encoding: [0xe8,0x11]
         swm $16, $17, $ra, 8($sp)   # CHECK: swm16 $16, $17, $ra, 8($sp) # encoding: [0x45,0x2a]
         swm16 $16, $17, $ra, 8($sp) # CHECK: swm16 $16, $17, $ra, 8($sp) # encoding: [0x45,0x2a]
+        recip.s $f2, $f4         # CHECK: recip.s $f2, $f4    # encoding: [0x54,0x44,0x12,0x3b]
+        recip.d $f2, $f4         # CHECK: recip.d $f2, $f4    # encoding: [0x54,0x44,0x52,0x3b]
+        rint.s $f2, $f4          # CHECK: rint.s $f2, $f4     # encoding: [0x54,0x82,0x00,0x20]
+        rint.d $f2, $f4          # CHECK: rint.d $f2, $f4     # encoding: [0x54,0x82,0x02,0x20]
+        round.l.s $f2, $f4       # CHECK: round.l.s $f2, $f4  # encoding: [0x54,0x44,0x33,0x3b]
+        round.l.d $f2, $f4       # CHECK: round.l.d $f2, $f4  # encoding: [0x54,0x44,0x73,0x3b]
+        round.w.s $f2, $f4       # CHECK: round.w.s $f2, $f4  # encoding: [0x54,0x44,0x3b,0x3b]
+        round.w.d $f2, $f4       # CHECK: round.w.d $f2, $f4  # encoding: [0x54,0x44,0x7b,0x3b]
+        sel.s $f1, $f1, $f2      # CHECK: sel.s $f1, $f1, $f2 # encoding: [0x54,0x41,0x08,0xb8]
+        sel.d $f0, $f2, $f4      # CHECK: sel.d $f0, $f2, $f4 # encoding: [0x54,0x82,0x02,0xb8]
+        seleqz.s $f1, $f2, $f3   # CHECK: seleqz.s $f1, $f2, $f3  # encoding: [0x54,0x62,0x08,0x38]
+        seleqz.d $f2, $f4, $f8   # CHECK: seleqz.d $f2, $f4, $f8  # encoding: [0x55,0x04,0x12,0x38]
+        selnez.s $f1, $f2, $f3   # CHECK: selnez.s $f1, $f2, $f3  # encoding: [0x54,0x62,0x08,0x78]
+        selnez.d $f2, $f4, $f8   # CHECK: selnez.d $f2, $f4, $f8  # encoding: [0x55,0x04,0x12,0x78]
+        class.s $f2, $f3         # CHECK: class.s $f2, $f3        # encoding: [0x54,0x62,0x00,0x60]
+        class.d $f2, $f4         # CHECK: class.d $f2, $f4        # encoding: [0x54,0x82,0x02,0x60]
 
 1:

From ef5008e6d0966564098a04dbddb058a01c195062 Mon Sep 17 00:00:00 2001
From: Elena Demikhovsky <elena.demikhovsky@intel.com>
Date: Tue, 1 Dec 2015 12:08:36 +0000
Subject: [PATCH 073/186] Fixed a failure in cost calculation for vector GEP

Cost calculation for vector GEP failed with due to invalid cast to GEP index operand.
The bug is fixed, added a test.

http://reviews.llvm.org/D14976



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254408 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../llvm/Analysis/TargetTransformInfoImpl.h    | 18 +++++++++++++-----
 include/llvm/Analysis/VectorUtils.h            |  2 +-
 lib/Analysis/VectorUtils.cpp                   |  7 ++++---
 .../SelectionDAG/SelectionDAGBuilder.cpp       | 16 ++++++++--------
 test/Analysis/CostModel/X86/vector_gep.ll      | 17 +++++++++++++++++
 5 files changed, 43 insertions(+), 17 deletions(-)
 create mode 100644 test/Analysis/CostModel/X86/vector_gep.ll

diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h
index 415a85e99069..eedf1a61ba82 100644
--- a/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -22,6 +22,7 @@
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/Type.h"
+#include "llvm/Analysis/VectorUtils.h"
 
 namespace llvm {
 
@@ -415,21 +416,28 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
         (Ptr == nullptr ? 0 : Ptr->getType()->getPointerAddressSpace());
     auto GTI = gep_type_begin(PointerType::get(PointeeType, AS), Operands);
     for (auto I = Operands.begin(); I != Operands.end(); ++I, ++GTI) {
+      // We assume that the cost of Scalar GEP with constant index and the
+      // cost of Vector GEP with splat constant index are the same.
+      const ConstantInt *ConstIdx = dyn_cast<ConstantInt>(*I);
+      if (!ConstIdx)
+        if (auto Splat = getSplatValue(*I))
+          ConstIdx = dyn_cast<ConstantInt>(Splat);
       if (isa<SequentialType>(*GTI)) {
         int64_t ElementSize = DL.getTypeAllocSize(GTI.getIndexedType());
-        if (const ConstantInt *ConstIdx = dyn_cast<ConstantInt>(*I)) {
+        if (ConstIdx)
           BaseOffset += ConstIdx->getSExtValue() * ElementSize;
-        } else {
+        else {
           // Needs scale register.
-          if (Scale != 0) {
+          if (Scale != 0)
             // No addressing mode takes two scale registers.
             return TTI::TCC_Basic;
-          }
           Scale = ElementSize;
         }
       } else {
         StructType *STy = cast<StructType>(*GTI);
-        uint64_t Field = cast<ConstantInt>(*I)->getZExtValue();
+        // For structures the index is always splat or scalar constant
+        assert(ConstIdx && "Unexpected GEP index");
+        uint64_t Field = ConstIdx->getZExtValue();
         BaseOffset += DL.getStructLayout(STy)->getElementOffset(Field);
       }
     }
diff --git a/include/llvm/Analysis/VectorUtils.h b/include/llvm/Analysis/VectorUtils.h
index 48ef76a9c8da..531803adf5e4 100644
--- a/include/llvm/Analysis/VectorUtils.h
+++ b/include/llvm/Analysis/VectorUtils.h
@@ -86,7 +86,7 @@ Value *findScalarElement(Value *V, unsigned EltNo);
 /// \brief Get splat value if the input is a splat vector or return nullptr.
 /// The value may be extracted from a splat constants vector or from
 /// a sequence of instructions that broadcast a single value into a vector.
-Value *getSplatValue(Value *V);
+const Value *getSplatValue(const Value *V);
 
 /// \brief Compute a map of integer instructions to their minimum legal type
 /// size.
diff --git a/lib/Analysis/VectorUtils.cpp b/lib/Analysis/VectorUtils.cpp
index fbf387b3ee20..e461294c1bee 100644
--- a/lib/Analysis/VectorUtils.cpp
+++ b/lib/Analysis/VectorUtils.cpp
@@ -417,9 +417,10 @@ Value *llvm::findScalarElement(Value *V, unsigned EltNo) {
 /// the input value is (1) a splat constants vector or (2) a sequence
 /// of instructions that broadcast a single value into a vector.
 ///
-llvm::Value *llvm::getSplatValue(Value *V) {
-  if (auto *CV = dyn_cast<ConstantDataVector>(V))
-    return CV->getSplatValue();
+const llvm::Value *llvm::getSplatValue(const Value *V) {
+
+  if (auto *C = dyn_cast<Constant>(V))
+    return C->getSplatValue();
 
   auto *ShuffleInst = dyn_cast<ShuffleVectorInst>(V);
   if (!ShuffleInst)
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 38b8bced2399..85e7e3c1bc8c 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -3301,18 +3301,18 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I) {
 // extract the spalt value and use it as a uniform base.
 // In all other cases the function returns 'false'.
 //
-static bool getUniformBase(Value *& Ptr, SDValue& Base, SDValue& Index,
+static bool getUniformBase(const Value *& Ptr, SDValue& Base, SDValue& Index,
                            SelectionDAGBuilder* SDB) {
 
   SelectionDAG& DAG = SDB->DAG;
   LLVMContext &Context = *DAG.getContext();
 
   assert(Ptr->getType()->isVectorTy() && "Uexpected pointer type");
-  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+  const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
   if (!GEP || GEP->getNumOperands() > 2)
     return false;
 
-  Value *GEPPtr = GEP->getPointerOperand();
+  const Value *GEPPtr = GEP->getPointerOperand();
   if (!GEPPtr->getType()->isVectorTy())
     Ptr = GEPPtr;
   else if (!(Ptr = getSplatValue(GEPPtr)))
@@ -3348,7 +3348,7 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) {
   SDLoc sdl = getCurSDLoc();
 
   // llvm.masked.scatter.*(Src0, Ptrs, alignemt, Mask)
-  Value  *Ptr = I.getArgOperand(1);
+  const Value *Ptr = I.getArgOperand(1);
   SDValue Src0 = getValue(I.getArgOperand(0));
   SDValue Mask = getValue(I.getArgOperand(3));
   EVT VT = Src0.getValueType();
@@ -3362,10 +3362,10 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) {
 
   SDValue Base;
   SDValue Index;
-  Value *BasePtr = Ptr;
+  const Value *BasePtr = Ptr;
   bool UniformBase = getUniformBase(BasePtr, Base, Index, this);
 
-  Value *MemOpBasePtr = UniformBase ? BasePtr : nullptr;
+  const Value *MemOpBasePtr = UniformBase ? BasePtr : nullptr;
   MachineMemOperand *MMO = DAG.getMachineFunction().
     getMachineMemOperand(MachinePointerInfo(MemOpBasePtr),
                          MachineMemOperand::MOStore,  VT.getStoreSize(),
@@ -3425,7 +3425,7 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
   SDLoc sdl = getCurSDLoc();
 
   // @llvm.masked.gather.*(Ptrs, alignment, Mask, Src0)
-  Value  *Ptr = I.getArgOperand(0);
+  const Value *Ptr = I.getArgOperand(0);
   SDValue Src0 = getValue(I.getArgOperand(3));
   SDValue Mask = getValue(I.getArgOperand(2));
 
@@ -3442,7 +3442,7 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
   SDValue Root = DAG.getRoot();
   SDValue Base;
   SDValue Index;
-  Value *BasePtr = Ptr;
+  const Value *BasePtr = Ptr;
   bool UniformBase = getUniformBase(BasePtr, Base, Index, this);
   bool ConstantMemory = false;
   if (UniformBase &&
diff --git a/test/Analysis/CostModel/X86/vector_gep.ll b/test/Analysis/CostModel/X86/vector_gep.ll
new file mode 100644
index 000000000000..e49f25871d66
--- /dev/null
+++ b/test/Analysis/CostModel/X86/vector_gep.ll
@@ -0,0 +1,17 @@
+; RUN: opt < %s  -cost-model -analyze -mtriple=x86_64-linux-unknown-unknown -mattr=+avx512f | FileCheck %s
+
+%struct.S = type { [1000 x i32] }
+
+
+declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
+
+define <4 x i32> @foov(<4 x %struct.S*> %s, i64 %base){
+  %temp = insertelement <4 x i64> undef, i64 %base, i32 0
+  %vector = shufflevector <4 x i64> %temp, <4 x i64> undef, <4 x i32> zeroinitializer
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds %struct.S
+  %B = getelementptr inbounds %struct.S, <4 x %struct.S*> %s, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds [1000 x i32]
+  %arrayidx = getelementptr inbounds [1000 x i32], <4 x [1000 x i32]*> %B, <4 x i64> zeroinitializer, <4 x i64> %vector
+  %res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %arrayidx, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  ret <4 x i32> %res
+}

From 235dd3bb99ceb616e6688ad74427bc8d519c662f Mon Sep 17 00:00:00 2001
From: Elena Demikhovsky <elena.demikhovsky@intel.com>
Date: Tue, 1 Dec 2015 12:30:40 +0000
Subject: [PATCH 074/186] Fixed a failure in getSpaltValue()

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254409 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Analysis/VectorUtils.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/Analysis/VectorUtils.cpp b/lib/Analysis/VectorUtils.cpp
index e461294c1bee..5fb517e8edb5 100644
--- a/lib/Analysis/VectorUtils.cpp
+++ b/lib/Analysis/VectorUtils.cpp
@@ -420,7 +420,8 @@ Value *llvm::findScalarElement(Value *V, unsigned EltNo) {
 const llvm::Value *llvm::getSplatValue(const Value *V) {
 
   if (auto *C = dyn_cast<Constant>(V))
-    return C->getSplatValue();
+    if (isa<VectorType>(V->getType()))
+      return C->getSplatValue();
 
   auto *ShuffleInst = dyn_cast<ShuffleVectorInst>(V);
   if (!ShuffleInst)

From 8de1fedc98a9c57039597498438a31ab8b1a6c23 Mon Sep 17 00:00:00 2001
From: Elena Demikhovsky <elena.demikhovsky@intel.com>
Date: Tue, 1 Dec 2015 12:35:03 +0000
Subject: [PATCH 075/186] AVX-512: regenerated test for avx512 arithmetics, NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254410 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/avx512-arith.ll | 283 ++++++++++++++++++++++++-------
 1 file changed, 222 insertions(+), 61 deletions(-)

diff --git a/test/CodeGen/X86/avx512-arith.ll b/test/CodeGen/X86/avx512-arith.ll
index 522abd261472..d7da77a5eb54 100644
--- a/test/CodeGen/X86/avx512-arith.ll
+++ b/test/CodeGen/X86/avx512-arith.ll
@@ -1,4 +1,8 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck --check-prefix=CHECK --check-prefix=AVX512F %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vl | FileCheck --check-prefix=CHECK --check-prefix=AVX512VL %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512bw | FileCheck --check-prefix=CHECK --check-prefix=AVX512BW %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512dq | FileCheck --check-prefix=CHECK --check-prefix=AVX512DQ %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512dq -mattr=+avx512bw -mattr=+avx512vl| FileCheck --check-prefix=CHECK --check-prefix=SKX %s
 
 define <8 x double> @addpd512(<8 x double> %y, <8 x double> %x) {
 ; CHECK-LABEL: addpd512:
@@ -83,18 +87,54 @@ entry:
 }
 
 define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) {
-; CHECK-LABEL: imulq512:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpmuludq %zmm0, %zmm1, %zmm2
-; CHECK-NEXT:    vpsrlq $32, %zmm0, %zmm3
-; CHECK-NEXT:    vpmuludq %zmm3, %zmm1, %zmm3
-; CHECK-NEXT:    vpsllq $32, %zmm3, %zmm3
-; CHECK-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
-; CHECK-NEXT:    vpsrlq $32, %zmm1, %zmm1
-; CHECK-NEXT:    vpmuludq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    vpsllq $32, %zmm0, %zmm0
-; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; AVX512F-LABEL: imulq512:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vpmuludq %zmm0, %zmm1, %zmm2
+; AVX512F-NEXT:    vpsrlq $32, %zmm0, %zmm3
+; AVX512F-NEXT:    vpmuludq %zmm3, %zmm1, %zmm3
+; AVX512F-NEXT:    vpsllq $32, %zmm3, %zmm3
+; AVX512F-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT:    vpsrlq $32, %zmm1, %zmm1
+; AVX512F-NEXT:    vpmuludq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT:    vpsllq $32, %zmm0, %zmm0
+; AVX512F-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: imulq512:
+; AVX512VL:       ## BB#0:
+; AVX512VL-NEXT:    vpmuludq %zmm0, %zmm1, %zmm2
+; AVX512VL-NEXT:    vpsrlq $32, %zmm0, %zmm3
+; AVX512VL-NEXT:    vpmuludq %zmm3, %zmm1, %zmm3
+; AVX512VL-NEXT:    vpsllq $32, %zmm3, %zmm3
+; AVX512VL-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512VL-NEXT:    vpsrlq $32, %zmm1, %zmm1
+; AVX512VL-NEXT:    vpmuludq %zmm0, %zmm1, %zmm0
+; AVX512VL-NEXT:    vpsllq $32, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: imulq512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpmuludq %zmm0, %zmm1, %zmm2
+; AVX512BW-NEXT:    vpsrlq $32, %zmm0, %zmm3
+; AVX512BW-NEXT:    vpmuludq %zmm3, %zmm1, %zmm3
+; AVX512BW-NEXT:    vpsllq $32, %zmm3, %zmm3
+; AVX512BW-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsrlq $32, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmuludq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpsllq $32, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: imulq512:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    vpmullq %zmm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    retq
+;
+; SKX-LABEL: imulq512:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpmullq %zmm0, %zmm1, %zmm0
+; SKX-NEXT:    retq
   %z = mul <8 x i64>%x, %y
   ret <8 x i64>%z
 }
@@ -463,10 +503,13 @@ entry:
   ret <8 x i64>%d
 }
 
-; CHECK-LABEL: test_mask_vaddps
-; CHECK: vaddps {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}}
-; CHECK: ret
 define <16 x float> @test_mask_vaddps(<16 x float> %dst, <16 x float> %i,
+; CHECK-LABEL: test_mask_vaddps:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpxord %zmm4, %zmm4, %zmm4
+; CHECK-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; CHECK-NEXT:    vaddps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
                                      <16 x float> %j, <16 x i32> %mask1)
                                      nounwind readnone {
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -475,10 +518,13 @@ define <16 x float> @test_mask_vaddps(<16 x float> %dst, <16 x float> %i,
   ret <16 x float> %r
 }
 
-; CHECK-LABEL: test_mask_vmulps
-; CHECK: vmulps {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}}
-; CHECK: ret
 define <16 x float> @test_mask_vmulps(<16 x float> %dst, <16 x float> %i,
+; CHECK-LABEL: test_mask_vmulps:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpxord %zmm4, %zmm4, %zmm4
+; CHECK-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; CHECK-NEXT:    vmulps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
                                      <16 x float> %j, <16 x i32> %mask1)
                                      nounwind readnone {
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -487,10 +533,13 @@ define <16 x float> @test_mask_vmulps(<16 x float> %dst, <16 x float> %i,
   ret <16 x float> %r
 }
 
-; CHECK-LABEL: test_mask_vminps
-; CHECK: vminps {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}}
-; CHECK: ret
 define <16 x float> @test_mask_vminps(<16 x float> %dst, <16 x float> %i,
+; CHECK-LABEL: test_mask_vminps:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpxord %zmm4, %zmm4, %zmm4
+; CHECK-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; CHECK-NEXT:    vminps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
                                      <16 x float> %j, <16 x i32> %mask1)
                                      nounwind readnone {
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -500,10 +549,41 @@ define <16 x float> @test_mask_vminps(<16 x float> %dst, <16 x float> %i,
   ret <16 x float> %r
 }
 
-; CHECK-LABEL: test_mask_vminpd
-; CHECK: vminpd {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}}
-; CHECK: ret
 define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i,
+; AVX512F-LABEL: test_mask_vminpd:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vpxor %ymm4, %ymm4, %ymm4
+; AVX512F-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; AVX512F-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: test_mask_vminpd:
+; AVX512VL:       ## BB#0:
+; AVX512VL-NEXT:    vpxor %ymm4, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpcmpneqd %ymm4, %ymm3, %k1
+; AVX512VL-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: test_mask_vminpd:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpxor %ymm4, %ymm4, %ymm4
+; AVX512BW-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; AVX512BW-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_mask_vminpd:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    vpxor %ymm4, %ymm4, %ymm4
+; AVX512DQ-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; AVX512DQ-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
+; AVX512DQ-NEXT:    retq
+;
+; SKX-LABEL: test_mask_vminpd:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpxor %ymm4, %ymm4, %ymm4
+; SKX-NEXT:    vpcmpneqd %ymm4, %ymm3, %k1
+; SKX-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
+; SKX-NEXT:    retq
                                      <8 x double> %j, <8 x i32> %mask1)
                                      nounwind readnone {
   %mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -513,10 +593,13 @@ define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i,
   ret <8 x double> %r
 }
 
-; CHECK-LABEL: test_mask_vmaxps
-; CHECK: vmaxps {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}}
-; CHECK: ret
 define <16 x float> @test_mask_vmaxps(<16 x float> %dst, <16 x float> %i,
+; CHECK-LABEL: test_mask_vmaxps:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpxord %zmm4, %zmm4, %zmm4
+; CHECK-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; CHECK-NEXT:    vmaxps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
                                      <16 x float> %j, <16 x i32> %mask1)
                                      nounwind readnone {
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -526,10 +609,41 @@ define <16 x float> @test_mask_vmaxps(<16 x float> %dst, <16 x float> %i,
   ret <16 x float> %r
 }
 
-; CHECK-LABEL: test_mask_vmaxpd
-; CHECK: vmaxpd {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}}
-; CHECK: ret
 define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i,
+; AVX512F-LABEL: test_mask_vmaxpd:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vpxor %ymm4, %ymm4, %ymm4
+; AVX512F-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; AVX512F-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: test_mask_vmaxpd:
+; AVX512VL:       ## BB#0:
+; AVX512VL-NEXT:    vpxor %ymm4, %ymm4, %ymm4
+; AVX512VL-NEXT:    vpcmpneqd %ymm4, %ymm3, %k1
+; AVX512VL-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: test_mask_vmaxpd:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpxor %ymm4, %ymm4, %ymm4
+; AVX512BW-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; AVX512BW-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_mask_vmaxpd:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    vpxor %ymm4, %ymm4, %ymm4
+; AVX512DQ-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; AVX512DQ-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
+; AVX512DQ-NEXT:    retq
+;
+; SKX-LABEL: test_mask_vmaxpd:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpxor %ymm4, %ymm4, %ymm4
+; SKX-NEXT:    vpcmpneqd %ymm4, %ymm3, %k1
+; SKX-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
+; SKX-NEXT:    retq
                                      <8 x double> %j, <8 x i32> %mask1)
                                      nounwind readnone {
   %mask = icmp ne <8 x i32> %mask1, zeroinitializer
@@ -539,10 +653,13 @@ define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i,
   ret <8 x double> %r
 }
 
-; CHECK-LABEL: test_mask_vsubps
-; CHECK: vsubps {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}}
-; CHECK: ret
 define <16 x float> @test_mask_vsubps(<16 x float> %dst, <16 x float> %i,
+; CHECK-LABEL: test_mask_vsubps:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpxord %zmm4, %zmm4, %zmm4
+; CHECK-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; CHECK-NEXT:    vsubps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
                                      <16 x float> %j, <16 x i32> %mask1)
                                      nounwind readnone {
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -551,10 +668,13 @@ define <16 x float> @test_mask_vsubps(<16 x float> %dst, <16 x float> %i,
   ret <16 x float> %r
 }
 
-; CHECK-LABEL: test_mask_vdivps
-; CHECK: vdivps {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}}
-; CHECK: ret
 define <16 x float> @test_mask_vdivps(<16 x float> %dst, <16 x float> %i,
+; CHECK-LABEL: test_mask_vdivps:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpxord %zmm4, %zmm4, %zmm4
+; CHECK-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; CHECK-NEXT:    vdivps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
                                      <16 x float> %j, <16 x i32> %mask1)
                                      nounwind readnone {
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
@@ -563,10 +683,13 @@ define <16 x float> @test_mask_vdivps(<16 x float> %dst, <16 x float> %i,
   ret <16 x float> %r
 }
 
-; CHECK-LABEL: test_mask_vaddpd
-; CHECK: vaddpd {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}}
-; CHECK: ret
 define <8 x double> @test_mask_vaddpd(<8 x double> %dst, <8 x double> %i,
+; CHECK-LABEL: test_mask_vaddpd:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpxord %zmm4, %zmm4, %zmm4
+; CHECK-NEXT:    vpcmpneqq %zmm4, %zmm3, %k1
+; CHECK-NEXT:    vaddpd %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
                                      <8 x double> %j, <8 x i64> %mask1)
                                      nounwind readnone {
   %mask = icmp ne <8 x i64> %mask1, zeroinitializer
@@ -575,10 +698,13 @@ define <8 x double> @test_mask_vaddpd(<8 x double> %dst, <8 x double> %i,
   ret <8 x double> %r
 }
 
-; CHECK-LABEL: test_maskz_vaddpd
-; CHECK: vaddpd {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} {z}}}
-; CHECK: ret
 define <8 x double> @test_maskz_vaddpd(<8 x double> %i, <8 x double> %j,
+; CHECK-LABEL: test_maskz_vaddpd:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpxord %zmm3, %zmm3, %zmm3
+; CHECK-NEXT:    vpcmpneqq %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
                                       <8 x i64> %mask1) nounwind readnone {
   %mask = icmp ne <8 x i64> %mask1, zeroinitializer
   %x = fadd <8 x double> %i, %j
@@ -586,10 +712,13 @@ define <8 x double> @test_maskz_vaddpd(<8 x double> %i, <8 x double> %j,
   ret <8 x double> %r
 }
 
-; CHECK-LABEL: test_mask_fold_vaddpd
-; CHECK: vaddpd (%rdi), {{.*%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}.*}}
-; CHECK: ret
 define <8 x double> @test_mask_fold_vaddpd(<8 x double> %dst, <8 x double> %i,
+; CHECK-LABEL: test_mask_fold_vaddpd:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpxord %zmm3, %zmm3, %zmm3
+; CHECK-NEXT:    vpcmpneqq %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vaddpd (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
                                      <8 x double>* %j,  <8 x i64> %mask1)
                                      nounwind {
   %mask = icmp ne <8 x i64> %mask1, zeroinitializer
@@ -599,10 +728,13 @@ define <8 x double> @test_mask_fold_vaddpd(<8 x double> %dst, <8 x double> %i,
   ret <8 x double> %r
 }
 
-; CHECK-LABEL: test_maskz_fold_vaddpd
-; CHECK: vaddpd (%rdi), {{.*%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} {z}.*}}
-; CHECK: ret
 define <8 x double> @test_maskz_fold_vaddpd(<8 x double> %i, <8 x double>* %j,
+; CHECK-LABEL: test_maskz_fold_vaddpd:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; CHECK-NEXT:    vpcmpneqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vaddpd (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
                                       <8 x i64> %mask1) nounwind {
   %mask = icmp ne <8 x i64> %mask1, zeroinitializer
   %tmp = load <8 x double>, <8 x double>* %j, align 8
@@ -611,10 +743,11 @@ define <8 x double> @test_maskz_fold_vaddpd(<8 x double> %i, <8 x double>* %j,
   ret <8 x double> %r
 }
 
-; CHECK-LABEL: test_broadcast_vaddpd
-; CHECK: vaddpd (%rdi){1to8}, %zmm{{.*}}
-; CHECK: ret
 define <8 x double> @test_broadcast_vaddpd(<8 x double> %i, double* %j) nounwind {
+; CHECK-LABEL: test_broadcast_vaddpd:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vaddpd (%rdi){1to8}, %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %tmp = load double, double* %j
   %b = insertelement <8 x double> undef, double %tmp, i32 0
   %c = shufflevector <8 x double> %b, <8 x double> undef,
@@ -623,10 +756,14 @@ define <8 x double> @test_broadcast_vaddpd(<8 x double> %i, double* %j) nounwind
   ret <8 x double> %x
 }
 
-; CHECK-LABEL: test_mask_broadcast_vaddpd
-; CHECK: vaddpd (%rdi){1to8}, %zmm{{.*{%k[1-7]}.*}}
-; CHECK: ret
 define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double> %i,
+; CHECK-LABEL: test_mask_broadcast_vaddpd:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; CHECK-NEXT:    vpcmpneqq %zmm0, %zmm2, %k1
+; CHECK-NEXT:    vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
                                       double* %j, <8 x i64> %mask1) nounwind {
   %mask = icmp ne <8 x i64> %mask1, zeroinitializer
   %tmp = load double, double* %j
@@ -638,10 +775,13 @@ define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double>
   ret <8 x double> %r
 }
 
-; CHECK-LABEL: test_maskz_broadcast_vaddpd
-; CHECK: vaddpd (%rdi){1to8}, %zmm{{.*{%k[1-7]} {z}.*}}
-; CHECK: ret
 define <8 x double> @test_maskz_broadcast_vaddpd(<8 x double> %i, double* %j,
+; CHECK-LABEL: test_maskz_broadcast_vaddpd:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; CHECK-NEXT:    vpcmpneqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vaddpd (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
                                        <8 x i64> %mask1) nounwind {
   %mask = icmp ne <8 x i64> %mask1, zeroinitializer
   %tmp = load double, double* %j
@@ -653,10 +793,31 @@ define <8 x double> @test_maskz_broadcast_vaddpd(<8 x double> %i, double* %j,
   ret <8 x double> %r
 }
 
-; CHECK-LABEL: test_fxor
-; CHECK: vpxord
-; CHECK: ret
 define <16 x float>  @test_fxor(<16 x float> %a) {
+; AVX512F-LABEL: test_fxor:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vpxord {{.*}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: test_fxor:
+; AVX512VL:       ## BB#0:
+; AVX512VL-NEXT:    vpxord {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: test_fxor:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpxord {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_fxor:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    vxorps {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT:    retq
+;
+; SKX-LABEL: test_fxor:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vxorps {{.*}}(%rip), %zmm0, %zmm0
+; SKX-NEXT:    retq
 
   %res = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
   ret <16 x float>%res

From 5836554b41d37c6e41cbbb13d1858107e5365960 Mon Sep 17 00:00:00 2001
From: Elena Demikhovsky <elena.demikhovsky@intel.com>
Date: Tue, 1 Dec 2015 12:43:46 +0000
Subject: [PATCH 076/186] AVX-512: fixed asm string of vsqrtss

(vvsqrtss was generated before)


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254411 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86InstrAVX512.td | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 16b1f3b59b0d..1dfd8d4510f5 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -5911,12 +5911,12 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
                          EVEX_B, EVEX_RC;
 
   let isCodeGenOnly = 1 in {
-    def r : SI<opc, MRMSrcReg, (outs _.FRC:$dst),
+    def r : I<opc, MRMSrcReg, (outs _.FRC:$dst),
                (ins _.FRC:$src1, _.FRC:$src2),
                OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>;
 
     let mayLoad = 1 in
-      def m : SI<opc, MRMSrcMem, (outs _.FRC:$dst),
+      def m : I<opc, MRMSrcMem, (outs _.FRC:$dst),
                  (ins _.FRC:$src1, _.ScalarMemOp:$src2),
                  OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>;
   }

From f0052c5d2a4706ff0693509f79199b473484918e Mon Sep 17 00:00:00 2001
From: Yury Gribov <y.gribov@samsung.com>
Date: Tue, 1 Dec 2015 13:24:48 +0000
Subject: [PATCH 077/186] Fix "WARNING: Title underline too short." introduced
 by r254404.

Patch by Max Ostapenko.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254413 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/LangRef.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index 36bfb5167795..cf1ceab1f1c6 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -9340,7 +9340,7 @@ See the description for :ref:`llvm.stacksave <int_stacksave>`.
 .. _int_get_dynamic_area_offset:
 
 '``llvm.get.dynamic.area.offset``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
 """""""

From 6979eb43962a1a5ad230f01cb5213f3490d22afd Mon Sep 17 00:00:00 2001
From: Chad Rosier <mcrosier@codeaurora.org>
Date: Tue, 1 Dec 2015 14:26:35 +0000
Subject: [PATCH 078/186] [LIR] Push check into helper function. NFC.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254416 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index a3658ed64976..9dc41ba2f328 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -257,6 +257,10 @@ static unsigned getStoreStride(const SCEVAddRecExpr *StoreEv) {
 }
 
 bool LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
+  // Don't touch volatile stores.
+  if (!SI->isSimple())
+    return false;
+
   Value *StoredVal = SI->getValueOperand();
   Value *StorePtr = SI->getPointerOperand();
 
@@ -287,10 +291,6 @@ void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
     if (!SI)
       continue;
 
-    // Don't touch volatile stores.
-    if (!SI->isSimple())
-      continue;
-
     // Make sure this is a strided store with a constant stride.
     if (!isLegalStore(SI))
       continue;

From 39e89e8fd833c2e13cf6c29379ff608d6b7c52a2 Mon Sep 17 00:00:00 2001
From: Rafael Espindola <rafael.espindola@gmail.com>
Date: Tue, 1 Dec 2015 15:19:48 +0000
Subject: [PATCH 079/186] Bring r254336 back:

The difference is that now we don't error on out-of-comdat access to
internal global values. We copy them instead. This seems to match the
expectation of COFF linkers (see pr25686).

Original message:

    Start deciding earlier what to link.

    A traditional linker is roughly split in symbol resolution and
"copying
    stuff".

    The two tasks are badly mixed in lib/Linker.

    This starts splitting them apart.

    With this patch there are no direct call to linkGlobalValueBody or
    linkGlobalValueProto. Everything is linked via WapValue.

    This also includes a few fixes:
    * A GV goes undefined if the comdat is dropped (comdat11.ll).
    * We error if an internal GV goes undefined (comdat13.ll).
    * We don't link an unused comdat.

    The first two match the behavior of an ELF linker. The second one is
    equivalent to running globaldce on the input.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254418 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Linker/LinkModules.cpp           | 155 ++++++++++++++-------------
 lib/Transforms/Utils/ValueMapper.cpp |   6 +-
 test/Linker/Inputs/comdat11.ll       |   9 ++
 test/Linker/Inputs/comdat13.ll       |   9 ++
 test/Linker/comdat11.ll              |  13 +++
 test/Linker/comdat12.ll              |   8 ++
 test/Linker/comdat13.ll              |  30 ++++++
 test/Linker/comdat9.ll               |   3 +
 8 files changed, 153 insertions(+), 80 deletions(-)
 create mode 100644 test/Linker/Inputs/comdat11.ll
 create mode 100644 test/Linker/Inputs/comdat13.ll
 create mode 100644 test/Linker/comdat11.ll
 create mode 100644 test/Linker/comdat12.ll
 create mode 100644 test/Linker/comdat13.ll

diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index cdf1decc8131..680b19ae2337 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -436,6 +436,8 @@ class ModuleLinker {
   /// references.
   bool DoneLinkingBodies;
 
+  bool HasError = false;
+
 public:
   ModuleLinker(Module *dstM, Linker::IdentifiedStructTypeSet &Set, Module *srcM,
                DiagnosticHandlerFunction DiagnosticHandler, unsigned Flags,
@@ -483,6 +485,7 @@ class ModuleLinker {
   /// Helper method for setting a message and returning an error code.
   bool emitError(const Twine &Message) {
     DiagnosticHandler(LinkDiagnosticInfo(DS_Error, Message));
+    HasError = true;
     return true;
   }
 
@@ -531,6 +534,7 @@ class ModuleLinker {
   void upgradeMismatchedGlobalArray(StringRef Name);
   void upgradeMismatchedGlobals();
 
+  bool linkIfNeeded(GlobalValue &GV);
   bool linkAppendingVarProto(GlobalVariable *DstGV,
                              const GlobalVariable *SrcGV);
 
@@ -904,16 +908,12 @@ Value *ModuleLinker::materializeDeclFor(Value *V) {
   if (doneLinkingBodies())
     return nullptr;
 
-  GlobalValue *DGV = copyGlobalValueProto(TypeMap, SGV);
-
-  if (Comdat *SC = SGV->getComdat()) {
-    if (auto *DGO = dyn_cast<GlobalObject>(DGV)) {
-      Comdat *DC = DstM->getOrInsertComdat(SC->getName());
-      DGO->setComdat(DC);
-    }
-  }
-
-  return DGV;
+  linkGlobalValueProto(SGV);
+  if (HasError)
+    return nullptr;
+  Value *Ret = ValueMap[SGV];
+  assert(Ret);
+  return Ret;
 }
 
 void ValueMaterializerTy::materializeInitFor(GlobalValue *New,
@@ -922,15 +922,27 @@ void ValueMaterializerTy::materializeInitFor(GlobalValue *New,
 }
 
 void ModuleLinker::materializeInitFor(GlobalValue *New, GlobalValue *Old) {
+  if (auto *F = dyn_cast<Function>(New)) {
+    if (!F->isDeclaration())
+      return;
+  } else if (auto *V = dyn_cast<GlobalVariable>(New)) {
+    if (V->hasInitializer())
+      return;
+  } else {
+    auto *A = cast<GlobalAlias>(New);
+    if (A->getAliasee())
+      return;
+  }
+
+  if (Old->isDeclaration())
+    return;
+
   if (isPerformingImport() && !doImportAsDefinition(Old))
     return;
 
-  // Skip declarations that ValueMaterializer may have created in
-  // case we link in only some of SrcM.
-  if (shouldLinkOnlyNeeded() && Old->isDeclaration())
+  if (!New->hasLocalLinkage() && DoNotLinkFromSource.count(Old))
     return;
 
-  assert(!Old->isDeclaration() && "users should not pass down decls");
   linkGlobalValueBody(*Old);
 }
 
@@ -1405,7 +1417,8 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) {
     std::tie(SK, LinkFromSrc) = ComdatsChosen[SC];
     C = DstM->getOrInsertComdat(SC->getName());
     C->setSelectionKind(SK);
-    ComdatMembers[SC].push_back(SGV);
+    if (SGV->hasInternalLinkage())
+      LinkFromSrc = true;
   } else if (DGV) {
     if (shouldLinkFromSource(LinkFromSrc, *DGV, *SGV))
       return true;
@@ -1425,31 +1438,12 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) {
   if (DGV)
     HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr();
 
-  if (!LinkFromSrc && !DGV)
-    return false;
-
   GlobalValue *NewGV;
-  if (!LinkFromSrc) {
+  if (!LinkFromSrc && DGV) {
     NewGV = DGV;
     // When linking from source we setVisibility from copyGlobalValueProto.
     setVisibility(NewGV, SGV, DGV);
   } else {
-    // If the GV is to be lazily linked, don't create it just yet.
-    // The ValueMaterializerTy will deal with creating it if it's used.
-    if (!DGV && !shouldOverrideFromSrc() && SGV != ImportFunction &&
-        (SGV->hasLocalLinkage() || SGV->hasLinkOnceLinkage() ||
-         SGV->hasAvailableExternallyLinkage())) {
-      DoNotLinkFromSource.insert(SGV);
-      return false;
-    }
-
-    // When we only want to link in unresolved dependencies, blacklist
-    // the symbol unless unless DestM has a matching declaration (DGV).
-    if (shouldLinkOnlyNeeded() && !(DGV && DGV->isDeclaration())) {
-      DoNotLinkFromSource.insert(SGV);
-      return false;
-    }
-
     NewGV = copyGlobalValueProto(TypeMap, SGV, DGV);
 
     if (isPerformingImport() && !doImportAsDefinition(SGV))
@@ -1459,7 +1453,7 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) {
   NewGV->setUnnamedAddr(HasUnnamedAddr);
 
   if (auto *NewGO = dyn_cast<GlobalObject>(NewGV)) {
-    if (C)
+    if (C && LinkFromSrc)
       NewGO->setComdat(C);
 
     if (DGV && DGV->hasCommonLinkage() && SGV->hasCommonLinkage())
@@ -1842,6 +1836,38 @@ static std::string mergeTriples(const Triple &SrcTriple, const Triple &DstTriple
   return DstTriple.str();
 }
 
+bool ModuleLinker::linkIfNeeded(GlobalValue &GV) {
+  GlobalValue *DGV = getLinkedToGlobal(&GV);
+
+  if (shouldLinkOnlyNeeded() && !(DGV && DGV->isDeclaration()))
+    return false;
+
+  if (DGV && !GV.hasLocalLinkage()) {
+    GlobalValue::VisibilityTypes Visibility =
+        getMinVisibility(DGV->getVisibility(), GV.getVisibility());
+    DGV->setVisibility(Visibility);
+    GV.setVisibility(Visibility);
+  }
+
+  if (const Comdat *SC = GV.getComdat()) {
+    bool LinkFromSrc;
+    Comdat::SelectionKind SK;
+    std::tie(SK, LinkFromSrc) = ComdatsChosen[SC];
+    if (!LinkFromSrc) {
+      DoNotLinkFromSource.insert(&GV);
+      return false;
+    }
+  }
+
+  if (!DGV && !shouldOverrideFromSrc() &&
+      (GV.hasLocalLinkage() || GV.hasLinkOnceLinkage() ||
+       GV.hasAvailableExternallyLinkage())) {
+    return false;
+  }
+  MapValue(&GV, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer);
+  return HasError;
+}
+
 bool ModuleLinker::run() {
   assert(DstM && "Null destination module");
   assert(SrcM && "Null source module");
@@ -1901,24 +1927,30 @@ bool ModuleLinker::run() {
   // Upgrade mismatched global arrays.
   upgradeMismatchedGlobals();
 
+  for (GlobalVariable &GV : SrcM->globals())
+    if (const Comdat *SC = GV.getComdat())
+      ComdatMembers[SC].push_back(&GV);
+
+  for (Function &SF : *SrcM)
+    if (const Comdat *SC = SF.getComdat())
+      ComdatMembers[SC].push_back(&SF);
+
+  for (GlobalAlias &GA : SrcM->aliases())
+    if (const Comdat *SC = GA.getComdat())
+      ComdatMembers[SC].push_back(&GA);
+
   // Insert all of the globals in src into the DstM module... without linking
   // initializers (which could refer to functions not yet mapped over).
   for (GlobalVariable &GV : SrcM->globals())
-    if (linkGlobalValueProto(&GV))
+    if (linkIfNeeded(GV))
       return true;
 
-  // Link the functions together between the two modules, without doing function
-  // bodies... this just adds external function prototypes to the DstM
-  // function...  We do this so that when we begin processing function bodies,
-  // all of the global values that may be referenced are available in our
-  // ValueMap.
-  for (Function &F :*SrcM)
-    if (linkGlobalValueProto(&F))
+  for (Function &SF : *SrcM)
+    if (linkIfNeeded(SF))
       return true;
 
-  // If there were any aliases, link them now.
   for (GlobalAlias &GA : SrcM->aliases())
-    if (linkGlobalValueProto(&GA))
+    if (linkIfNeeded(GA))
       return true;
 
   for (AppendingVarInfo &AppendingVar : AppendingVars)
@@ -1933,37 +1965,6 @@ bool ModuleLinker::run() {
       MapValue(GV, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer);
   }
 
-  // Link in the function bodies that are defined in the source module into
-  // DstM.
-  for (Function &SF : *SrcM) {
-    // Skip if no body (function is external).
-    if (SF.isDeclaration())
-      continue;
-
-    // Skip if not linking from source.
-    if (DoNotLinkFromSource.count(&SF))
-      continue;
-
-    if (linkGlobalValueBody(SF))
-      return true;
-  }
-
-  // Resolve all uses of aliases with aliasees.
-  for (GlobalAlias &Src : SrcM->aliases()) {
-    if (DoNotLinkFromSource.count(&Src))
-      continue;
-    linkGlobalValueBody(Src);
-  }
-
-  // Update the initializers in the DstM module now that all globals that may
-  // be referenced are in DstM.
-  for (GlobalVariable &Src : SrcM->globals()) {
-    // Only process initialized GV's or ones not already in dest.
-    if (!Src.hasInitializer() || DoNotLinkFromSource.count(&Src))
-      continue;
-    linkGlobalValueBody(Src);
-  }
-
   // Note that we are done linking global value bodies. This prevents
   // metadata linking from creating new references.
   DoneLinkingBodies = true;
diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp
index 0a63c1d5153c..00a8984845dd 100644
--- a/lib/Transforms/Utils/ValueMapper.cpp
+++ b/lib/Transforms/Utils/ValueMapper.cpp
@@ -41,9 +41,9 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,
     if (Value *NewV =
             Materializer->materializeDeclFor(const_cast<Value *>(V))) {
       VM[V] = NewV;
-      if (auto *GV = dyn_cast<GlobalValue>(V))
-        Materializer->materializeInitFor(cast<GlobalValue>(NewV),
-                                         const_cast<GlobalValue *>(GV));
+      if (auto *NewGV = dyn_cast<GlobalValue>(NewV))
+        Materializer->materializeInitFor(
+            NewGV, const_cast<GlobalValue *>(cast<GlobalValue>(V)));
       return NewV;
     }
   }
diff --git a/test/Linker/Inputs/comdat11.ll b/test/Linker/Inputs/comdat11.ll
new file mode 100644
index 000000000000..5b7f74cf0b24
--- /dev/null
+++ b/test/Linker/Inputs/comdat11.ll
@@ -0,0 +1,9 @@
+$foo = comdat any
+@foo = global i8 1, comdat
+define void @zed() {
+  call void @bar()
+  ret void
+}
+define void @bar() comdat($foo) {
+  ret void
+}
diff --git a/test/Linker/Inputs/comdat13.ll b/test/Linker/Inputs/comdat13.ll
new file mode 100644
index 000000000000..85515210ed7e
--- /dev/null
+++ b/test/Linker/Inputs/comdat13.ll
@@ -0,0 +1,9 @@
+$foo = comdat any
+@foo = internal global i8 1, comdat
+define i8* @zed() {
+  call void @bax()
+  ret i8* @foo
+}
+define internal void @bax() comdat($foo) {
+  ret void
+}
diff --git a/test/Linker/comdat11.ll b/test/Linker/comdat11.ll
new file mode 100644
index 000000000000..dbade4104fe3
--- /dev/null
+++ b/test/Linker/comdat11.ll
@@ -0,0 +1,13 @@
+; RUN: llvm-link -S %s %p/Inputs/comdat11.ll -o - | FileCheck %s
+
+$foo = comdat any
+@foo = global i8 0, comdat
+
+; CHECK: @foo = global i8 0, comdat
+
+; CHECK: define void @zed() {
+; CHECK:   call void @bar()
+; CHECK:   ret void
+; CHECK: }
+
+; CHECK: declare void @bar()
diff --git a/test/Linker/comdat12.ll b/test/Linker/comdat12.ll
new file mode 100644
index 000000000000..d06e222b63ac
--- /dev/null
+++ b/test/Linker/comdat12.ll
@@ -0,0 +1,8 @@
+; RUN: llvm-link %s -S -o - | FileCheck %s
+
+$foo = comdat largest
+define internal void @foo() comdat($foo) {
+  ret void
+}
+
+; CHECK-NOT: foo
diff --git a/test/Linker/comdat13.ll b/test/Linker/comdat13.ll
new file mode 100644
index 000000000000..d1e382a2f278
--- /dev/null
+++ b/test/Linker/comdat13.ll
@@ -0,0 +1,30 @@
+; RUN: llvm-link -S %s %p/Inputs/comdat13.ll -o - | FileCheck %s
+
+; In Inputs/comdat13.ll a function not in the $foo comdat (zed) references an
+; internal function in the comdat $foo.
+; The IR would be ilegal on ELF ("relocation refers to discarded section"),
+; but COFF linkers seem to just duplicate the comdat.
+
+$foo = comdat any
+@foo = internal global i8 0, comdat
+define i8* @bar() {
+       ret i8* @foo
+}
+
+; CHECK: $foo = comdat any
+
+; CHECK: @foo = internal global i8 0, comdat
+; CHECK: @foo.1 = internal global i8 1, comdat($foo)
+
+; CHECK:      define i8* @bar() {
+; CHECK-NEXT:   ret i8* @foo
+; CHECK-NEXT: }
+
+; CHECK:      define i8* @zed() {
+; CHECK-NEXT:   call void @bax()
+; CHECK-NEXT:   ret i8* @foo.1
+; CHECK-NEXT: }
+
+; CHECK:      define internal void @bax() comdat($foo) {
+; CHECK-NEXT:   ret void
+; CHECK-NEXT: }
diff --git a/test/Linker/comdat9.ll b/test/Linker/comdat9.ll
index 274957401aac..4f6f2cfb845d 100644
--- a/test/Linker/comdat9.ll
+++ b/test/Linker/comdat9.ll
@@ -14,6 +14,9 @@ $f2 = comdat largest
 define internal void @f2() comdat($f2) {
   ret void
 }
+define void @f3() comdat($f2) {
+  ret void
+}
 
 ; CHECK-DAG: $f2 = comdat largest
 ; CHECK-DAG: define internal void @f2() comdat {

From d0903bddbc09ae3fc2aaca4679cb71a5f6158b6e Mon Sep 17 00:00:00 2001
From: Rafael Espindola <rafael.espindola@gmail.com>
Date: Tue, 1 Dec 2015 15:46:46 +0000
Subject: [PATCH 080/186] Simplify test. NFC.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254419 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Linker/global_ctors.ll | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/test/Linker/global_ctors.ll b/test/Linker/global_ctors.ll
index 49df81a00759..cc28471df59d 100644
--- a/test/Linker/global_ctors.ll
+++ b/test/Linker/global_ctors.ll
@@ -1,6 +1,5 @@
-; RUN: llvm-as %s -o %t.new.bc
-; RUN: llvm-link %t.new.bc %S/Inputs/old_global_ctors.3.4.bc | llvm-dis | FileCheck %s
-; RUN: llvm-link %S/Inputs/old_global_ctors.3.4.bc %t.new.bc | llvm-dis | FileCheck %s
+; RUN: llvm-link -S %s %S/Inputs/old_global_ctors.3.4.bc | FileCheck %s
+; RUN: llvm-link -S %S/Inputs/old_global_ctors.3.4.bc %s | FileCheck %s
 
 ; old_global_ctors.3.4.bc contains the following LLVM IL, assembled into
 ; bitcode by llvm-as from 3.4.  It uses a two element @llvm.global_ctors array.

From 2b9196851ccf22799f23c75377672a93a5c02665 Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson@google.com>
Date: Tue, 1 Dec 2015 16:45:23 +0000
Subject: [PATCH 081/186] [ThinLTO] Remove stale comment (NFC)

Stale as of r254036 which added basic profitability check.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254421 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/IPO/FunctionImport.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp
index ab0f7114957b..345b2f540adf 100644
--- a/lib/Transforms/IPO/FunctionImport.cpp
+++ b/lib/Transforms/IPO/FunctionImport.cpp
@@ -136,10 +136,6 @@ bool FunctionImporter::importFunctions(Module &M) {
       continue;
     }
 
-    //
-    // No profitability notion right now, just import all the time...
-    //
-
     // Get the module path from the summary.
     auto FileName = Summary->modulePath();
     DEBUG(dbgs() << "Importing " << CalledFunctionName << " from " << FileName

From 3c43768b6387f77f9e11450b49e5ea7b56ce1800 Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson@google.com>
Date: Tue, 1 Dec 2015 17:12:10 +0000
Subject: [PATCH 082/186] [ThinLTO] Wrap dbgs() output in DEBUG macro

Missed in a couple places.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254422 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/IPO/FunctionImport.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp
index 345b2f540adf..10bba1939270 100644
--- a/lib/Transforms/IPO/FunctionImport.cpp
+++ b/lib/Transforms/IPO/FunctionImport.cpp
@@ -124,15 +124,15 @@ bool FunctionImporter::importFunctions(Module &M) {
     auto *Summary = Info->functionSummary();
     if (!Summary) {
       // FIXME: in case we are lazyloading summaries, we can do it now.
-      dbgs() << "Missing summary for  " << CalledFunctionName
-             << ", error at import?\n";
+      DEBUG(dbgs() << "Missing summary for  " << CalledFunctionName
+                   << ", error at import?\n");
       llvm_unreachable("Missing summary");
     }
 
     if (Summary->instCount() > ImportInstrLimit) {
-      dbgs() << "Skip import of " << CalledFunctionName << " with "
-             << Summary->instCount() << " instructions (limit "
-             << ImportInstrLimit << ")\n";
+      DEBUG(dbgs() << "Skip import of " << CalledFunctionName << " with "
+                   << Summary->instCount() << " instructions (limit "
+                   << ImportInstrLimit << ")\n");
       continue;
     }
 

From 2b3f97809d173409a1442d7925e4a64806a738f7 Mon Sep 17 00:00:00 2001
From: Rafael Espindola <rafael.espindola@gmail.com>
Date: Tue, 1 Dec 2015 17:17:04 +0000
Subject: [PATCH 083/186] Make appending var linking less of a special case.

It has to be a bit special because:
* materializeInitFor is not really supposed to call replaceAllUsesWith.
  The caller has a plain variable with Dst and expects just the
  initializer to be set, not for it to be removed.
* Calling mutateType as we used to do before gets some type
  inconsistency which breaks the bitcode writer.
* If linkAppendingVarProto create a dest decl with the correct type to
  avoid the above problems, it needs to put the original dst init in
  some side table for materializeInitFor to use.

In the end the simplest solution seems to be to just have
linkAppendingVarProto do all the work and set ValueMap[SrcGV to avoid
recursion.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254424 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Linker/LinkModules.cpp | 111 ++++++++++++-------------------------
 1 file changed, 36 insertions(+), 75 deletions(-)

diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index 680b19ae2337..f745df56f9aa 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -401,14 +401,6 @@ class ModuleLinker {
   /// but this allows us to reuse the ValueMapper code.
   ValueToValueMapTy ValueMap;
 
-  struct AppendingVarInfo {
-    GlobalVariable *NewGV;   // New aggregate global in dest module.
-    const Constant *DstInit; // Old initializer from dest module.
-    const Constant *SrcInit; // Old initializer from src module.
-  };
-
-  std::vector<AppendingVarInfo> AppendingVars;
-
   // Set of items not to link in from source.
   SmallPtrSet<const GlobalValue *, 16> DoNotLinkFromSource;
 
@@ -541,8 +533,6 @@ class ModuleLinker {
   bool linkGlobalValueProto(GlobalValue *GV);
   bool linkModuleFlagsMetadata();
 
-  void linkAppendingVarInit(AppendingVarInfo &AVI);
-
   void linkGlobalInit(GlobalVariable &Dst, GlobalVariable &Src);
   bool linkFunctionBody(Function &Dst, Function &Src);
   void linkAliasBody(GlobalAlias &Dst, GlobalAlias &Src);
@@ -1318,6 +1308,14 @@ void ModuleLinker::upgradeMismatchedGlobals() {
   upgradeMismatchedGlobalArray("llvm.global_dtors");
 }
 
+static void getArrayElements(const Constant *C,
+                             SmallVectorImpl<Constant *> &Dest) {
+  unsigned NumElements = cast<ArrayType>(C->getType())->getNumElements();
+
+  for (unsigned i = 0; i != NumElements; ++i)
+    Dest.push_back(C->getAggregateElement(i));
+}
+
 /// If there were any appending global variables, link them together now.
 /// Return true on error.
 bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV,
@@ -1326,10 +1324,8 @@ bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV,
       cast<ArrayType>(TypeMap.get(SrcGV->getType()->getElementType()));
   Type *EltTy = SrcTy->getElementType();
 
-  uint64_t NewSize = SrcTy->getNumElements();
   if (DstGV) {
     ArrayType *DstTy = cast<ArrayType>(DstGV->getType()->getElementType());
-    NewSize += DstTy->getNumElements();
 
     if (!SrcGV->hasAppendingLinkage() || !DstGV->hasAppendingLinkage())
       return emitError(
@@ -1359,6 +1355,27 @@ bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV,
           "Appending variables with different section name need to be linked!");
   }
 
+  SmallVector<Constant *, 16> DstElements;
+  if (DstGV)
+    getArrayElements(DstGV->getInitializer(), DstElements);
+
+  SmallVector<Constant *, 16> SrcElements;
+  getArrayElements(SrcGV->getInitializer(), SrcElements);
+
+  StringRef Name = SrcGV->getName();
+  bool IsNewStructor =
+      (Name == "llvm.global_ctors" || Name == "llvm.global_dtors") &&
+      cast<StructType>(EltTy)->getNumElements() == 3;
+  if (IsNewStructor)
+    SrcElements.erase(
+        std::remove_if(SrcElements.begin(), SrcElements.end(),
+                       [this](Constant *E) {
+                         auto *Key = dyn_cast<GlobalValue>(
+                             E->getAggregateElement(2)->stripPointerCasts());
+                         return DoNotLinkFromSource.count(Key);
+                       }),
+        SrcElements.end());
+  uint64_t NewSize = DstElements.size() + SrcElements.size();
   ArrayType *NewType = ArrayType::get(EltTy, NewSize);
 
   // Create the new global variable.
@@ -1370,24 +1387,22 @@ bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV,
   // Propagate alignment, visibility and section info.
   copyGVAttributes(NG, SrcGV);
 
-  AppendingVarInfo AVI;
-  AVI.NewGV = NG;
-  AVI.DstInit = DstGV ? DstGV->getInitializer() : nullptr;
-  AVI.SrcInit = SrcGV->getInitializer();
-  AppendingVars.push_back(AVI);
-
   // Replace any uses of the two global variables with uses of the new
   // global.
   ValueMap[SrcGV] = ConstantExpr::getBitCast(NG, TypeMap.get(SrcGV->getType()));
 
+  for (auto *V : SrcElements) {
+    DstElements.push_back(
+        MapValue(V, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer));
+  }
+
+  NG->setInitializer(ConstantArray::get(NewType, DstElements));
+
   if (DstGV) {
     DstGV->replaceAllUsesWith(ConstantExpr::getBitCast(NG, DstGV->getType()));
     DstGV->eraseFromParent();
   }
 
-  // Track the source variable so we don't try to link it.
-  DoNotLinkFromSource.insert(SrcGV);
-
   return false;
 }
 
@@ -1480,57 +1495,6 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) {
   return false;
 }
 
-static void getArrayElements(const Constant *C,
-                             SmallVectorImpl<Constant *> &Dest) {
-  unsigned NumElements = cast<ArrayType>(C->getType())->getNumElements();
-
-  for (unsigned i = 0; i != NumElements; ++i)
-    Dest.push_back(C->getAggregateElement(i));
-}
-
-void ModuleLinker::linkAppendingVarInit(AppendingVarInfo &AVI) {
-  // Merge the initializer.
-  SmallVector<Constant *, 16> DstElements;
-  if (AVI.DstInit)
-    getArrayElements(AVI.DstInit, DstElements);
-
-  SmallVector<Constant *, 16> SrcElements;
-  getArrayElements(AVI.SrcInit, SrcElements);
-
-  ArrayType *NewType = cast<ArrayType>(AVI.NewGV->getType()->getElementType());
-
-  StringRef Name = AVI.NewGV->getName();
-  bool IsNewStructor =
-      (Name == "llvm.global_ctors" || Name == "llvm.global_dtors") &&
-      cast<StructType>(NewType->getElementType())->getNumElements() == 3;
-
-  for (auto *V : SrcElements) {
-    if (IsNewStructor) {
-      auto *Key =
-          dyn_cast<GlobalValue>(V->getAggregateElement(2)->stripPointerCasts());
-      if (DoNotLinkFromSource.count(Key))
-        continue;
-    }
-    DstElements.push_back(
-        MapValue(V, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer));
-  }
-  if (DstElements.size() != NewType->getNumElements()) {
-    NewType = ArrayType::get(NewType->getElementType(), DstElements.size());
-    GlobalVariable *Old = AVI.NewGV;
-    GlobalVariable *NG = new GlobalVariable(
-        *DstM, NewType, Old->isConstant(), Old->getLinkage(), /*init*/ nullptr,
-        /*name*/ "", Old, Old->getThreadLocalMode(),
-        Old->getType()->getAddressSpace());
-    copyGVAttributes(NG, Old);
-    AVI.NewGV->replaceAllUsesWith(
-        ConstantExpr::getBitCast(NG, AVI.NewGV->getType()));
-    AVI.NewGV->eraseFromParent();
-    AVI.NewGV = NG;
-  }
-
-  AVI.NewGV->setInitializer(ConstantArray::get(NewType, DstElements));
-}
-
 /// Update the initializers in the Dest module now that all globals that may be
 /// referenced are in Dest.
 void ModuleLinker::linkGlobalInit(GlobalVariable &Dst, GlobalVariable &Src) {
@@ -1953,9 +1917,6 @@ bool ModuleLinker::run() {
     if (linkIfNeeded(GA))
       return true;
 
-  for (AppendingVarInfo &AppendingVar : AppendingVars)
-    linkAppendingVarInit(AppendingVar);
-
   for (const auto &Entry : DstM->getComdatSymbolTable()) {
     const Comdat &C = Entry.getValue();
     if (C.getSelectionKind() == Comdat::Any)

From 40adef82a5ba533fb6a1c60789f396c2baa976cc Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 1 Dec 2015 17:27:55 +0000
Subject: [PATCH 084/186] [x86] add a convenience method to check for FMA
 capability; NFCI

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254425 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 8 +++-----
 lib/Target/X86/X86Subtarget.h      | 1 +
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 4ec4ec280675..7ddcd8fcda48 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1148,7 +1148,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::CTTZ_ZERO_UNDEF,   MVT::v8i32, Custom);
     setOperationAction(ISD::CTTZ_ZERO_UNDEF,   MVT::v4i64, Custom);
 
-    if (Subtarget->hasFMA() || Subtarget->hasFMA4() || Subtarget->hasAVX512()) {
+    if (Subtarget->hasAnyFMA()) {
       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
@@ -20463,7 +20463,7 @@ bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
 
 bool
 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
-  if (!(Subtarget->hasFMA() || Subtarget->hasFMA4() || Subtarget->hasAVX512()))
+  if (!Subtarget->hasAnyFMA())
     return false;
 
   VT = VT.getScalarType();
@@ -26471,9 +26471,7 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
     return SDValue();
 
   EVT ScalarVT = VT.getScalarType();
-  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
-      (!Subtarget->hasFMA() && !Subtarget->hasFMA4() &&
-       !Subtarget->hasAVX512()))
+  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasAnyFMA())
     return SDValue();
 
   SDValue A = N->getOperand(0);
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index eb0199aecbeb..353b4f7f5ebd 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -358,6 +358,7 @@ class X86Subtarget final : public X86GenSubtargetInfo {
   // has equal or better performance on all supported targets.
   bool hasFMA() const { return HasFMA && !HasFMA4; }
   bool hasFMA4() const { return HasFMA4; }
+  bool hasAnyFMA() const { return hasFMA() || hasFMA4() || hasAVX512(); }
   bool hasXOP() const { return HasXOP; }
   bool hasTBM() const { return HasTBM; }
   bool hasMOVBE() const { return HasMOVBE; }

From 368c76102b9d3f5a430f842964b12fefe3d3a18c Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Tue, 1 Dec 2015 17:45:17 +0000
Subject: [PATCH 085/186] AMDGPU: Use the default strings for data emission
 directives

Summary:
This makes the assembly output look nicer and there is no reason to
have custom strings for these.

Reviewers: arsenm

Subscribers: arsenm, llvm-commits

Differential Revision: http://reviews.llvm.org/D14671

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254426 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index d79ffdf52a74..01548e50f7c5 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -22,13 +22,6 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() {
   InlineAsmEnd = ";#ASMEND";
 
   //===--- Data Emission Directives -------------------------------------===//
-  ZeroDirective = ".zero";
-  AsciiDirective = ".ascii\t";
-  AscizDirective = ".asciz\t";
-  Data8bitsDirective = ".byte\t";
-  Data16bitsDirective = ".short\t";
-  Data32bitsDirective = ".long\t";
-  Data64bitsDirective = ".quad\t";
   SunStyleELFSectionSwitchSyntax = true;
   UsesELFSectionDirectiveForBSS = true;
 

From 76eb0395686d266a7416d578867fc62ef5e4ee82 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Tue, 1 Dec 2015 17:45:22 +0000
Subject: [PATCH 086/186] AMDGPU/SI: Remove REGISTER_STORE/REGISTER_LOAD code
 which is now dead

Reviewers: arsenm

Subscribers: arsenm, llvm-commits

Differential Revision: http://reviews.llvm.org/D15050

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254427 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 35 ------------------------
 lib/Target/AMDGPU/SIISelLowering.cpp     | 16 -----------
 lib/Target/AMDGPU/SIInstructions.td      | 30 --------------------
 3 files changed, 81 deletions(-)

diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 6aa4fddd3ec4..710c6771b171 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -458,41 +458,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
     N = glueCopyToM0(N);
     break;
   }
-  case AMDGPUISD::REGISTER_LOAD: {
-    if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
-      break;
-    SDValue Addr, Offset;
-
-    SDLoc DL(N);
-    SelectADDRIndirect(N->getOperand(1), Addr, Offset);
-    const SDValue Ops[] = {
-      Addr,
-      Offset,
-      CurDAG->getTargetConstant(0, DL, MVT::i32),
-      N->getOperand(0),
-    };
-    return CurDAG->getMachineNode(AMDGPU::SI_RegisterLoad, DL,
-                                  CurDAG->getVTList(MVT::i32, MVT::i64,
-                                                    MVT::Other),
-                                  Ops);
-  }
-  case AMDGPUISD::REGISTER_STORE: {
-    if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
-      break;
-    SDValue Addr, Offset;
-    SelectADDRIndirect(N->getOperand(2), Addr, Offset);
-    SDLoc DL(N);
-    const SDValue Ops[] = {
-      N->getOperand(1),
-      Addr,
-      Offset,
-      CurDAG->getTargetConstant(0, DL, MVT::i32),
-      N->getOperand(0),
-    };
-    return CurDAG->getMachineNode(AMDGPU::SI_RegisterStorePseudo, DL,
-                                        CurDAG->getVTList(MVT::Other),
-                                        Ops);
-  }
 
   case AMDGPUISD::BFE_I32:
   case AMDGPUISD::BFE_U32: {
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 1b0cc87206f4..ab93bceb96ec 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -848,27 +848,11 @@ SDValue SITargetLowering::LowerFormalArguments(
 MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
     MachineInstr * MI, MachineBasicBlock * BB) const {
 
-  MachineBasicBlock::iterator I = *MI;
-  const SIInstrInfo *TII =
-      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
-
   switch (MI->getOpcode()) {
   default:
     return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
   case AMDGPU::BRANCH:
     return BB;
-  case AMDGPU::SI_RegisterStorePseudo: {
-    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
-    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
-    MachineInstrBuilder MIB =
-        BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore),
-                Reg);
-    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i)
-      MIB.addOperand(MI->getOperand(i));
-
-    MI->eraseFromParent();
-    break;
-  }
   }
   return BB;
 }
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index bc2b0c6c07fb..2cee993d751c 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -1942,36 +1942,6 @@ def SI_KILL : InstSI <
 
 let Uses = [EXEC], Defs = [EXEC,VCC,M0] in {
 
-//defm SI_ : RegisterLoadStore <VGPR_32, FRAMEri, ADDRIndirect>;
-
-let UseNamedOperandTable = 1 in {
-
-def SI_RegisterLoad : InstSI <
-  (outs VGPR_32:$dst, SReg_64:$temp),
-  (ins FRAMEri32:$addr, i32imm:$chan),
-  "", []
-> {
-  let isRegisterLoad = 1;
-  let mayLoad = 1;
-}
-
-class SIRegStore<dag outs> : InstSI <
-  outs,
-  (ins VGPR_32:$val, FRAMEri32:$addr, i32imm:$chan),
-  "", []
-> {
-  let isRegisterStore = 1;
-  let mayStore = 1;
-}
-
-let usesCustomInserter = 1 in {
-def SI_RegisterStorePseudo : SIRegStore<(outs)>;
-} // End usesCustomInserter = 1
-def SI_RegisterStore : SIRegStore<(outs SReg_64:$temp)>;
-
-
-} // End UseNamedOperandTable = 1
-
 class SI_INDIRECT_SRC<RegisterClass rc> : InstSI <
   (outs VGPR_32:$dst, SReg_64:$temp),
   (ins rc:$src, VSrc_32:$idx, i32imm:$off),

From 0fb89f7b8241946815f6eb9ae59589e184d9b016 Mon Sep 17 00:00:00 2001
From: David Blaikie <dblaikie@gmail.com>
Date: Tue, 1 Dec 2015 18:07:07 +0000
Subject: [PATCH 087/186] [llvm-dwp] Correctly update debug_str_offsets.dwo
 when linking dwo files

This doesn't deduplicate strings in the debug_str section, nor does it
properly wire up the index so that debug_info can /find/ these strings,
but it does correct the str_offsets specifically.

Follow up patches to address those related/next issues.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254431 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/tools/llvm-dwp/X86/simple.test |  6 +-
 tools/llvm-dwp/llvm-dwp.cpp         | 85 +++++++++++++++++++++++------
 2 files changed, 71 insertions(+), 20 deletions(-)

diff --git a/test/tools/llvm-dwp/X86/simple.test b/test/tools/llvm-dwp/X86/simple.test
index 26215bdc8c91..d70bdda072c5 100644
--- a/test/tools/llvm-dwp/X86/simple.test
+++ b/test/tools/llvm-dwp/X86/simple.test
@@ -47,12 +47,12 @@ CHECK: .debug_str.dwo contents:
 CHECK: "clang version
 CHECK: 0x[[ACPP:.*]]: "a.cpp"
 FIXME: Remove duplicates
-CHECK: "clang version
+CHECK: 0x[[SECONDREV:.*]]: "clang version
 CHECK: 0x[[BCPP:.*]]: "b.cpp"
 
 CHECK: .debug_str_offsets.dwo contents:
 CHECK: : 00000000
 CHECK: : [[ACPP]]
-CHECK: : 00000000
+CHECK: : [[SECONDREV]]
 FIXME: Update str offset indexes, this should be BCPP \/
-CHECK: : [[ACPP]]
+CHECK: : [[BCPP]]
diff --git a/tools/llvm-dwp/llvm-dwp.cpp b/tools/llvm-dwp/llvm-dwp.cpp
index 7f9f6678db0b..c89be222e6c9 100644
--- a/tools/llvm-dwp/llvm-dwp.cpp
+++ b/tools/llvm-dwp/llvm-dwp.cpp
@@ -9,6 +9,7 @@
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/DataExtractor.h"
 #include "llvm/Support/Options.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
@@ -21,6 +22,7 @@
 #include <unordered_set>
 
 using namespace llvm;
+using namespace llvm::object;
 using namespace cl;
 
 OptionCategory DwpCategory("Specific Options");
@@ -36,39 +38,88 @@ static int error(const Twine &Error, const Twine &Context) {
   return 1;
 }
 
-static std::error_code writeSection(MCStreamer &Out, MCSection *OutSection,
-                                    const object::SectionRef &Sym) {
-  StringRef Contents;
-  if (auto Err = Sym.getContents(Contents))
-    return Err;
-  Out.SwitchSection(OutSection);
-  Out.EmitBytes(Contents);
+static std::error_code
+writeStringsAndOffsets(MCStreamer &Out, StringMap<uint32_t> &Strings,
+                       uint32_t &StringOffset, MCSection *StrOffsetSection,
+                       StringRef CurStrSection, StringRef CurStrOffsetSection) {
+  // Could possibly produce an error or warning if one of these was non-null but
+  // the other was null.
+  if (CurStrSection.empty() || CurStrOffsetSection.empty())
+    return std::error_code();
+
+  DenseMap<uint32_t, uint32_t> OffsetRemapping;
+
+  DataExtractor Data(CurStrSection, true, 0);
+  uint32_t LocalOffset = 0;
+  uint32_t PrevOffset = 0;
+  while (const char *s = Data.getCStr(&LocalOffset)) {
+    StringRef Str(s, LocalOffset - PrevOffset - 1);
+    OffsetRemapping[PrevOffset] = StringOffset;
+    // insert, if successful, write new string to the str.dwo section
+    StringOffset += Str.size() + 1;
+    PrevOffset = LocalOffset;
+  }
+
+  Data = DataExtractor(CurStrOffsetSection, true, 0);
+
+  Out.SwitchSection(StrOffsetSection);
+
+  uint32_t Offset = 0;
+  uint64_t Size = CurStrOffsetSection.size();
+  while (Offset < Size) {
+    auto OldOffset = Data.getU32(&Offset);
+    auto NewOffset = OffsetRemapping[OldOffset];
+    Out.EmitIntValue(NewOffset, 4);
+  }
+
   return std::error_code();
 }
 
 static std::error_code write(MCStreamer &Out, ArrayRef<std::string> Inputs) {
+  const auto &MCOFI = *Out.getContext().getObjectFileInfo();
+  MCSection *const StrSection = MCOFI.getDwarfStrDWOSection();
+  MCSection *const StrOffsetSection = MCOFI.getDwarfStrOffDWOSection();
+  const StringMap<MCSection *> KnownSections = {
+      {"debug_info.dwo", MCOFI.getDwarfInfoDWOSection()},
+      {"debug_types.dwo", MCOFI.getDwarfTypesDWOSection()},
+      {"debug_str_offsets.dwo", StrOffsetSection},
+      {"debug_str.dwo", StrSection},
+      {"debug_loc.dwo", MCOFI.getDwarfLocDWOSection()},
+      {"debug_abbrev.dwo", MCOFI.getDwarfAbbrevDWOSection()}};
+
+  StringMap<uint32_t> Strings;
+  uint32_t StringOffset = 0;
+
   for (const auto &Input : Inputs) {
     auto ErrOrObj = object::ObjectFile::createObjectFile(Input);
     if (!ErrOrObj)
       return ErrOrObj.getError();
     const auto *Obj = ErrOrObj->getBinary();
+    StringRef CurStrSection;
+    StringRef CurStrOffsetSection;
     for (const auto &Section : Obj->sections()) {
-      const auto &MCOFI = *Out.getContext().getObjectFileInfo();
-      static const StringMap<MCSection *> KnownSections = {
-          {"debug_info.dwo", MCOFI.getDwarfInfoDWOSection()},
-          {"debug_types.dwo", MCOFI.getDwarfTypesDWOSection()},
-          {"debug_str_offsets.dwo", MCOFI.getDwarfStrOffDWOSection()},
-          {"debug_str.dwo", MCOFI.getDwarfStrDWOSection()},
-          {"debug_loc.dwo", MCOFI.getDwarfLocDWOSection()},
-          {"debug_abbrev.dwo", MCOFI.getDwarfAbbrevDWOSection()}};
       StringRef Name;
       if (std::error_code Err = Section.getName(Name))
         return Err;
       if (MCSection *OutSection =
-              KnownSections.lookup(Name.substr(Name.find_first_not_of("._"))))
-        if (auto Err = writeSection(Out, OutSection, Section))
+              KnownSections.lookup(Name.substr(Name.find_first_not_of("._")))) {
+        StringRef Contents;
+        if (auto Err = Section.getContents(Contents))
           return Err;
+        if (OutSection == StrOffsetSection) {
+          CurStrOffsetSection = Contents;
+          continue;
+        }
+        if (OutSection == StrSection)
+          CurStrSection = Contents;
+        Out.SwitchSection(OutSection);
+        Out.EmitBytes(Contents);
+      }
     }
+    if (auto Err =
+            writeStringsAndOffsets(Out, Strings, StringOffset, StrOffsetSection,
+                                   CurStrSection, CurStrOffsetSection))
+      return Err;
   }
   return std::error_code();
 }

From 1d06f0bf4e070385fbb6d112318c190fc9aadda8 Mon Sep 17 00:00:00 2001
From: Rafael Espindola <rafael.espindola@gmail.com>
Date: Tue, 1 Dec 2015 18:41:30 +0000
Subject: [PATCH 088/186] Delete the setModule method from the Linker.

It was only used from LTO for a debug feature, and LTO can just create
another linker.

It is pretty odd to have a method to reset the module in the middle of a
link. It would make IdentifiedStructTypes inconsistent with the Module
for example.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254434 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/LTO/LTOCodeGenerator.h | 4 ++--
 include/llvm/Linker/Linker.h        | 3 ---
 lib/LTO/LTOCodeGenerator.cpp        | 8 ++++----
 lib/Linker/LinkModules.cpp          | 4 ----
 4 files changed, 6 insertions(+), 13 deletions(-)

diff --git a/include/llvm/LTO/LTOCodeGenerator.h b/include/llvm/LTO/LTOCodeGenerator.h
index 0d3c79bf5e84..c322288a1ae9 100644
--- a/include/llvm/LTO/LTOCodeGenerator.h
+++ b/include/llvm/LTO/LTOCodeGenerator.h
@@ -39,7 +39,6 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/StringMap.h"
-#include "llvm/Linker/Linker.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include <string>
@@ -49,6 +48,7 @@ namespace llvm {
   class LLVMContext;
   class DiagnosticInfo;
   class GlobalValue;
+  class Linker;
   class Mangler;
   class MemoryBuffer;
   class TargetLibraryInfo;
@@ -171,7 +171,7 @@ struct LTOCodeGenerator {
   std::unique_ptr<LLVMContext> OwnedContext;
   LLVMContext &Context;
   std::unique_ptr<Module> MergedModule;
-  Linker IRLinker;
+  std::unique_ptr<Linker> IRLinker;
   std::unique_ptr<TargetMachine> TargetMach;
   bool EmitDwarfDebugInfo = false;
   bool ScopeRestrictionsDone = false;
diff --git a/include/llvm/Linker/Linker.h b/include/llvm/Linker/Linker.h
index 610b1ddf9893..3f6c7b6c6942 100644
--- a/include/llvm/Linker/Linker.h
+++ b/include/llvm/Linker/Linker.h
@@ -85,9 +85,6 @@ class Linker {
                     const FunctionInfoIndex *Index = nullptr,
                     Function *FuncToImport = nullptr);
 
-  /// \brief Set the composite to the passed-in module.
-  void setModule(Module *Dst);
-
   static bool LinkModules(Module *Dest, Module *Src,
                           DiagnosticHandlerFunction DiagnosticHandler,
                           unsigned Flags = Flags::None);
diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp
index 931bcf0d23fc..37ee7e8c53cc 100644
--- a/lib/LTO/LTOCodeGenerator.cpp
+++ b/lib/LTO/LTOCodeGenerator.cpp
@@ -67,14 +67,14 @@ const char* LTOCodeGenerator::getVersionString() {
 LTOCodeGenerator::LTOCodeGenerator()
     : Context(getGlobalContext()),
       MergedModule(new Module("ld-temp.o", Context)),
-      IRLinker(MergedModule.get()) {
+      IRLinker(new Linker(MergedModule.get())) {
   initializeLTOPasses();
 }
 
 LTOCodeGenerator::LTOCodeGenerator(std::unique_ptr<LLVMContext> Context)
     : OwnedContext(std::move(Context)), Context(*OwnedContext),
       MergedModule(new Module("ld-temp.o", *OwnedContext)),
-      IRLinker(MergedModule.get()) {
+      IRLinker(new Linker(MergedModule.get())) {
   initializeLTOPasses();
 }
 
@@ -114,7 +114,7 @@ bool LTOCodeGenerator::addModule(LTOModule *Mod) {
   assert(&Mod->getModule().getContext() == &Context &&
          "Expected module in same context");
 
-  bool ret = IRLinker.linkInModule(&Mod->getModule());
+  bool ret = IRLinker->linkInModule(&Mod->getModule());
 
   const std::vector<const char *> &undefs = Mod->getAsmUndefinedRefs();
   for (int i = 0, e = undefs.size(); i != e; ++i)
@@ -130,7 +130,7 @@ void LTOCodeGenerator::setModule(std::unique_ptr<LTOModule> Mod) {
   AsmUndefinedRefs.clear();
 
   MergedModule = Mod->takeModule();
-  IRLinker.setModule(MergedModule.get());
+  IRLinker = make_unique<Linker>(MergedModule.get());
 
   const std::vector<const char*> &Undefs = Mod->getAsmUndefinedRefs();
   for (int I = 0, E = Undefs.size(); I != E; ++I)
diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index f745df56f9aa..c57c70e322ab 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -2071,10 +2071,6 @@ bool Linker::linkInModule(Module *Src, unsigned Flags,
   return RetCode;
 }
 
-void Linker::setModule(Module *Dst) {
-  init(Dst, DiagnosticHandler);
-}
-
 //===----------------------------------------------------------------------===//
 // LinkModules entrypoint.
 //===----------------------------------------------------------------------===//

From f0edaf8e00d7977407738b770f1d7adc7f1b25b8 Mon Sep 17 00:00:00 2001
From: Rafael Espindola <rafael.espindola@gmail.com>
Date: Tue, 1 Dec 2015 18:46:19 +0000
Subject: [PATCH 089/186] Use a forwarding constructor instead of an init
 method.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254435 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Linker/Linker.h |  1 -
 lib/Linker/LinkModules.cpp   | 15 +++++----------
 2 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/include/llvm/Linker/Linker.h b/include/llvm/Linker/Linker.h
index 3f6c7b6c6942..7ac457856a19 100644
--- a/include/llvm/Linker/Linker.h
+++ b/include/llvm/Linker/Linker.h
@@ -93,7 +93,6 @@ class Linker {
                           unsigned Flags = Flags::None);
 
 private:
-  void init(Module *M, DiagnosticHandlerFunction DiagnosticHandler);
   Module *Composite;
 
   IdentifiedStructTypeSet IdentifiedStructTypes;
diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index c57c70e322ab..9aff43f31990 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -2032,7 +2032,7 @@ bool Linker::IdentifiedStructTypeSet::hasType(StructType *Ty) {
   return *I == Ty;
 }
 
-void Linker::init(Module *M, DiagnosticHandlerFunction DiagnosticHandler) {
+Linker::Linker(Module *M, DiagnosticHandlerFunction DiagnosticHandler) {
   this->Composite = M;
   this->DiagnosticHandler = DiagnosticHandler;
 
@@ -2046,15 +2046,10 @@ void Linker::init(Module *M, DiagnosticHandlerFunction DiagnosticHandler) {
   }
 }
 
-Linker::Linker(Module *M, DiagnosticHandlerFunction DiagnosticHandler) {
-  init(M, DiagnosticHandler);
-}
-
-Linker::Linker(Module *M) {
-  init(M, [this](const DiagnosticInfo &DI) {
-    Composite->getContext().diagnose(DI);
-  });
-}
+Linker::Linker(Module *M)
+    : Linker(M, [this](const DiagnosticInfo &DI) {
+        Composite->getContext().diagnose(DI);
+      }) {}
 
 void Linker::deleteModule() {
   delete Composite;

From c6cd4955cdd9643c7a3ad7eeb30713a8dcb503dc Mon Sep 17 00:00:00 2001
From: Rafael Espindola <rafael.espindola@gmail.com>
Date: Tue, 1 Dec 2015 18:50:35 +0000
Subject: [PATCH 090/186] Delete dead code.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254436 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Linker/Linker.h | 1 -
 lib/Linker/LinkModules.cpp   | 5 -----
 2 files changed, 6 deletions(-)

diff --git a/include/llvm/Linker/Linker.h b/include/llvm/Linker/Linker.h
index 7ac457856a19..e307e06f50be 100644
--- a/include/llvm/Linker/Linker.h
+++ b/include/llvm/Linker/Linker.h
@@ -72,7 +72,6 @@ class Linker {
   Linker(Module *M);
 
   Module *getModule() const { return Composite; }
-  void deleteModule();
 
   /// \brief Link \p Src into the composite. The source is destroyed.
   /// Passing OverrideSymbols as true will have symbols from Src
diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index 9aff43f31990..7ba622f6ee1d 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -2051,11 +2051,6 @@ Linker::Linker(Module *M)
         Composite->getContext().diagnose(DI);
       }) {}
 
-void Linker::deleteModule() {
-  delete Composite;
-  Composite = nullptr;
-}
-
 bool Linker::linkInModule(Module *Src, unsigned Flags,
                           const FunctionInfoIndex *Index,
                           Function *FuncToImport) {

From 6cb642d909d9120d35d6ce3a499b075cc3b99a5d Mon Sep 17 00:00:00 2001
From: Keno Fischer <kfischer@college.harvard.edu>
Date: Tue, 1 Dec 2015 19:06:36 +0000
Subject: [PATCH 091/186] [Verifier] Improve error for cross-module refs

By including the module name in the error message.
This makes the error message much more useful and
saves a trip to the debugger.

Reviewers: dexonsmith

Subscribers: dexonsmith, llvm-commits

Differential Revision: http://reviews.llvm.org/D14473

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254437 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/IR/Verifier.cpp           | 13 +++++++---
 unittests/IR/VerifierTest.cpp | 47 +++++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+), 3 deletions(-)

diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp
index 617d965f4cfc..5cbb597ca269 100644
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@@ -95,6 +95,12 @@ struct VerifierSupport {
     Write(&*I);
   }
 
+  void Write(const Module *M) {
+    if (!M)
+      return;
+    OS << "; ModuleID = '" << M->getModuleIdentifier() << "'\n";
+  }
+
   void Write(const Value *V) {
     if (!V)
       return;
@@ -1721,7 +1727,8 @@ void Verifier::visitFunction(const Function &F) {
     auto *Per = dyn_cast<Function>(F.getPersonalityFn()->stripPointerCasts());
     if (Per)
       Assert(Per->getParent() == F.getParent(),
-             "Referencing personality function in another module!", &F, Per);
+             "Referencing personality function in another module!",
+             &F, F.getParent(), Per, Per->getParent());
   }
 
   if (F.isMaterializable()) {
@@ -3165,7 +3172,7 @@ void Verifier::visitInstruction(Instruction &I) {
           " donothing or patchpoint",
           &I);
       Assert(F->getParent() == M, "Referencing function in another module!",
-             &I);
+             &I, M, F, F->getParent());
     } else if (BasicBlock *OpBB = dyn_cast<BasicBlock>(I.getOperand(i))) {
       Assert(OpBB->getParent() == BB->getParent(),
              "Referring to a basic block in another function!", &I);
@@ -3173,7 +3180,7 @@ void Verifier::visitInstruction(Instruction &I) {
       Assert(OpArg->getParent() == BB->getParent(),
              "Referring to an argument in another function!", &I);
     } else if (GlobalValue *GV = dyn_cast<GlobalValue>(I.getOperand(i))) {
-      Assert(GV->getParent() == M, "Referencing global in another module!", &I);
+      Assert(GV->getParent() == M, "Referencing global in another module!", &I, M, GV, GV->getParent());
     } else if (isa<Instruction>(I.getOperand(i))) {
       verifyDominatesUse(I, i);
     } else if (isa<InlineAsm>(I.getOperand(i))) {
diff --git a/unittests/IR/VerifierTest.cpp b/unittests/IR/VerifierTest.cpp
index 71e3168b686b..4e94b4375f92 100644
--- a/unittests/IR/VerifierTest.cpp
+++ b/unittests/IR/VerifierTest.cpp
@@ -60,5 +60,52 @@ TEST(VerifierTest, InvalidRetAttribute) {
       "Attribute 'uwtable' only applies to functions!"));
 }
 
+TEST(VerifierTest, CrossModuleRef) {
+  LLVMContext &C = getGlobalContext();
+  Module M1("M1", C);
+  Module M2("M2", C);
+  Module M3("M2", C);
+  FunctionType *FTy = FunctionType::get(Type::getInt32Ty(C), /*isVarArg=*/false);
+  Function *F1 = cast<Function>(M1.getOrInsertFunction("foo1", FTy));
+  Function *F2 = cast<Function>(M2.getOrInsertFunction("foo2", FTy));
+  Function *F3 = cast<Function>(M3.getOrInsertFunction("foo3", FTy));
+
+  BasicBlock *Entry1 = BasicBlock::Create(C, "entry", F1);
+  BasicBlock *Entry3 = BasicBlock::Create(C, "entry", F3);
+
+  // BAD: Referencing function in another module
+  CallInst::Create(F2,"call",Entry1);
+
+  // BAD: Referencing personality routine in another module
+  F3->setPersonalityFn(F2);
+
+  // Fill in the body
+  Constant *ConstZero = ConstantInt::get(Type::getInt32Ty(C), 0);
+  ReturnInst::Create(C, ConstZero, Entry1);
+  ReturnInst::Create(C, ConstZero, Entry3);
+
+  std::string Error;
+  raw_string_ostream ErrorOS(Error);
+  EXPECT_FALSE(verifyModule(M2, &ErrorOS));
+  EXPECT_TRUE(verifyModule(M1, &ErrorOS));
+  EXPECT_TRUE(StringRef(ErrorOS.str()).equals(
+      "Referencing function in another module!\n"
+      "  %call = call i32 @foo2()\n"
+      "; ModuleID = 'M1'\n"
+      "i32 ()* @foo2\n"
+      "; ModuleID = 'M2'\n"));
+
+  Error.clear();
+  EXPECT_TRUE(verifyModule(M3, &ErrorOS));
+  EXPECT_TRUE(StringRef(ErrorOS.str()).startswith(
+      "Referencing personality function in another module!"));
+
+  // Erase bad methods to avoid triggering an assertion failure on destruction
+  F1->eraseFromParent();
+  F3->eraseFromParent();
+}
+
+
+
 }
 }

From 2320de6adb4f38633ae1698ac522def2bf0aacc5 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 1 Dec 2015 19:08:39 +0000
Subject: [PATCH 092/186] AMDGPU: Report extractelement as free in cost model

The cost for scalarized operations is computed as N * (scalar operation
cost + 1 extractelement + 1 insertelement). This partially fixes
inflating the cost of scalarized operations since every operation is
scalarized and free. I don't think we want any cost asociated with
scalarization, but for now insertelement is still counted. I'm not sure
if we should pretend that insertelement is also free, or add a way
to compute a custom scalarization cost.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254438 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      |  11 ++
 lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h |   2 +
 .../CostModel/AMDGPU/extractelement.ll        | 110 ++++++++++++++++++
 test/Analysis/CostModel/AMDGPU/lit.local.cfg  |   2 +
 4 files changed, 125 insertions(+)
 create mode 100644 test/Analysis/CostModel/AMDGPU/extractelement.ll
 create mode 100644 test/Analysis/CostModel/AMDGPU/lit.local.cfg

diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 6dacc742b129..4afcc60984fc 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -80,3 +80,14 @@ unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
   // Semi-arbitrary large amount.
   return 64;
 }
+
+int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
+                                      unsigned Index) {
+  switch (Opcode) {
+  case Instruction::ExtractElement:
+    // Dynamic indexing isn't free and is best avoided.
+    return Index == ~0u ? 2 : 0;
+  default:
+    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
+  }
+}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index dee0a69d1e68..5a94a0ba4706 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -60,6 +60,8 @@ class AMDGPUTTIImpl : public BasicTTIImplBase<AMDGPUTTIImpl> {
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector);
   unsigned getMaxInterleaveFactor(unsigned VF);
+
+  int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
 };
 
 } // end namespace llvm
diff --git a/test/Analysis/CostModel/AMDGPU/extractelement.ll b/test/Analysis/CostModel/AMDGPU/extractelement.ll
new file mode 100644
index 000000000000..c328d7686466
--- /dev/null
+++ b/test/Analysis/CostModel/AMDGPU/extractelement.ll
@@ -0,0 +1,110 @@
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
+
+; CHECK: 'extractelement_v2i32'
+; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i32>
+define void @extractelement_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
+  %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
+  %elt = extractelement <2 x i32> %vec, i32 1
+  store i32 %elt, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'extractelement_v2f32'
+; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x float>
+define void @extractelement_v2f32(float addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) {
+  %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
+  %elt = extractelement <2 x float> %vec, i32 1
+  store float %elt, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'extractelement_v3i32'
+; CHECK: estimated cost of 0 for {{.*}} extractelement <3 x i32>
+define void @extractelement_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr) {
+  %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
+  %elt = extractelement <3 x i32> %vec, i32 1
+  store i32 %elt, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'extractelement_v4i32'
+; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i32>
+define void @extractelement_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr) {
+  %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
+  %elt = extractelement <4 x i32> %vec, i32 1
+  store i32 %elt, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'extractelement_v8i32'
+; CHECK: estimated cost of 0 for {{.*}} extractelement <8 x i32>
+define void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr) {
+  %vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr
+  %elt = extractelement <8 x i32> %vec, i32 1
+  store i32 %elt, i32 addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Should be non-0
+; CHECK: 'extractelement_v8i32_dynindex'
+; CHECK: estimated cost of 2 for {{.*}} extractelement <8 x i32>
+define void @extractelement_v8i32_dynindex(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr, i32 %idx) {
+  %vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr
+  %elt = extractelement <8 x i32> %vec, i32 %idx
+  store i32 %elt, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'extractelement_v2i64'
+; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i64>
+define void @extractelement_v2i64(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
+  %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
+  %elt = extractelement <2 x i64> %vec, i64 1
+  store i64 %elt, i64 addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'extractelement_v3i64'
+; CHECK: estimated cost of 0 for {{.*}} extractelement <3 x i64>
+define void @extractelement_v3i64(i64 addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr) {
+  %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
+  %elt = extractelement <3 x i64> %vec, i64 1
+  store i64 %elt, i64 addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'extractelement_v4i64'
+; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i64>
+define void @extractelement_v4i64(i64 addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr) {
+  %vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
+  %elt = extractelement <4 x i64> %vec, i64 1
+  store i64 %elt, i64 addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'extractelement_v8i64'
+; CHECK: estimated cost of 0 for {{.*}} extractelement <8 x i64>
+define void @extractelement_v8i64(i64 addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr) {
+  %vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr
+  %elt = extractelement <8 x i64> %vec, i64 1
+  store i64 %elt, i64 addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'extractelement_v4i8'
+; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i8>
+define void @extractelement_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %vaddr) {
+  %vec = load <4 x i8>, <4 x i8> addrspace(1)* %vaddr
+  %elt = extractelement <4 x i8> %vec, i8 1
+  store i8 %elt, i8 addrspace(1)* %out
+  ret void
+}
+
+; CHECK: 'extractelement_v2i16'
+; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i16>
+define void @extractelement_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
+  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
+  %elt = extractelement <2 x i16> %vec, i16 1
+  store i16 %elt, i16 addrspace(1)* %out
+  ret void
+}
diff --git a/test/Analysis/CostModel/AMDGPU/lit.local.cfg b/test/Analysis/CostModel/AMDGPU/lit.local.cfg
new file mode 100644
index 000000000000..2a665f06be72
--- /dev/null
+++ b/test/Analysis/CostModel/AMDGPU/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'AMDGPU' in config.root.targets:
+    config.unsupported = True

From 6a07b977968eafc8ead24c5b973fd44091c8aa73 Mon Sep 17 00:00:00 2001
From: Weiming Zhao <weimingz@codeaurora.org>
Date: Tue, 1 Dec 2015 19:17:49 +0000
Subject: [PATCH 093/186] [AArch64] Fix a corner case in BitFeild select

Summary:
When not useful bits, BitWidth becomes 0 and APInt will not be happy.

See https://llvm.org/bugs/show_bug.cgi?id=25571

We can just mark the operand as IMPLICIT_DEF is none bits of it is used.

Reviewers: t.p.northover, jmolloy

Subscribers: gberry, jmolloy, mgrang, aemerson, llvm-commits, rengolin

Differential Revision: http://reviews.llvm.org/D14803

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254440 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 16 +++++++++++-----
 test/CodeGen/AArch64/bitfield-insert.ll    | 22 ++++++++++++++++++++++
 2 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 4311198403fa..6c868880bcac 100644
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -1974,7 +1974,8 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
 // f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2
 static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst,
                                      SDValue &Src, unsigned &ImmR,
-                                     unsigned &ImmS, SelectionDAG *CurDAG) {
+                                     unsigned &ImmS, const APInt &UsefulBits,
+                                     SelectionDAG *CurDAG) {
   assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
 
   // Set Opc
@@ -1988,8 +1989,6 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst,
 
   // Because of simplify-demanded-bits in DAGCombine, involved masks may not
   // have the expected shape. Try to undo that.
-  APInt UsefulBits;
-  getUsefulBits(SDValue(N, 0), UsefulBits);
 
   unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros();
   unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros();
@@ -2083,11 +2082,18 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) {
   unsigned Opc;
   unsigned LSB, MSB;
   SDValue Opd0, Opd1;
+  EVT VT = N->getValueType(0);
+  APInt NUsefulBits;
+  getUsefulBits(SDValue(N, 0), NUsefulBits);
+
+  // If all bits are not useful, just return UNDEF.
+  if (!NUsefulBits)
+    return CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, VT);
 
-  if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, CurDAG))
+  if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, NUsefulBits,
+                                CurDAG))
     return nullptr;
 
-  EVT VT = N->getValueType(0);
   SDLoc dl(N);
   SDValue Ops[] = { Opd0,
                     Opd1,
diff --git a/test/CodeGen/AArch64/bitfield-insert.ll b/test/CodeGen/AArch64/bitfield-insert.ll
index 9df51dcc4478..509b547a5c82 100644
--- a/test/CodeGen/AArch64/bitfield-insert.ll
+++ b/test/CodeGen/AArch64/bitfield-insert.ll
@@ -215,3 +215,25 @@ define void @test_32bit_opnd1_better(i32* %existing, i32* %new) {
 
   ret void
 }
+
+; Tests when all the bits from one operand are not useful
+define i32 @test_nouseful_bits(i8 %a, i32 %b) {
+; CHECK-LABEL: test_nouseful_bits:
+; CHECK: bfi
+; CHECK: bfi
+; CHECK: bfi
+; CHECK-NOT: bfi
+; CHECK-NOT: or
+; CHECK: lsl
+  %conv = zext i8 %a to i32     ;   0  0  0  A
+  %shl = shl i32 %b, 8          ;   B2 B1 B0 0
+  %or = or i32 %conv, %shl      ;   B2 B1 B0 A
+  %shl.1 = shl i32 %or, 8       ;   B1 B0 A 0
+  %or.1 = or i32 %conv, %shl.1  ;   B1 B0 A A
+  %shl.2 = shl i32 %or.1, 8     ;   B0 A A 0
+  %or.2 = or i32 %conv, %shl.2  ;   B0 A A A
+  %shl.3 = shl i32 %or.2, 8     ;   A A A 0
+  %or.3 = or i32 %conv, %shl.3  ;   A A A A
+  %shl.4 = shl i32 %or.3, 8     ;   A A A 0
+  ret i32 %shl.4
+}

From 8e31526513c32e9e432bcd98abee665aca8c0dcf Mon Sep 17 00:00:00 2001
From: David Blaikie <dblaikie@gmail.com>
Date: Tue, 1 Dec 2015 19:17:58 +0000
Subject: [PATCH 094/186] [llvm-dwp] Deduplicate strings in the debug_str.dwo
 section

Also, ensure that references to those strings in debug_str_offsets.dwo
correctly refer to the deduplicated strings.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254441 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/tools/llvm-dwp/X86/simple.test |  6 ++---
 tools/llvm-dwp/llvm-dwp.cpp         | 34 +++++++++++++++++------------
 2 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/test/tools/llvm-dwp/X86/simple.test b/test/tools/llvm-dwp/X86/simple.test
index d70bdda072c5..754450f13f3f 100644
--- a/test/tools/llvm-dwp/X86/simple.test
+++ b/test/tools/llvm-dwp/X86/simple.test
@@ -46,13 +46,11 @@ FIXME: Emit and verify the cu_index contents
 CHECK: .debug_str.dwo contents:
 CHECK: "clang version
 CHECK: 0x[[ACPP:.*]]: "a.cpp"
-FIXME: Remove duplicates
-CHECK: 0x[[SECONDREV:.*]]: "clang version
+CHECK-NOT: "clang version
 CHECK: 0x[[BCPP:.*]]: "b.cpp"
 
 CHECK: .debug_str_offsets.dwo contents:
 CHECK: : 00000000
 CHECK: : [[ACPP]]
-CHECK: : [[SECONDREV]]
-FIXME: Update str offset indexes, this should be BCPP \/
+CHECK: : 00000000
 CHECK: : [[BCPP]]
diff --git a/tools/llvm-dwp/llvm-dwp.cpp b/tools/llvm-dwp/llvm-dwp.cpp
index c89be222e6c9..9ce37ec2ceee 100644
--- a/tools/llvm-dwp/llvm-dwp.cpp
+++ b/tools/llvm-dwp/llvm-dwp.cpp
@@ -40,8 +40,9 @@ static int error(const Twine &Error, const Twine &Context) {
 
 static std::error_code
 writeStringsAndOffsets(MCStreamer &Out, StringMap<uint32_t> &Strings,
-                       uint32_t &StringOffset, MCSection *StrOffsetSection,
-                       StringRef CurStrSection, StringRef CurStrOffsetSection) {
+                       uint32_t &StringOffset, MCSection *StrSection,
+                       MCSection *StrOffsetSection, StringRef CurStrSection,
+                       StringRef CurStrOffsetSection) {
   // Could possibly produce an error or warning if one of these was non-null but
   // the other was null.
   if (CurStrSection.empty() || CurStrOffsetSection.empty())
@@ -54,9 +55,14 @@ writeStringsAndOffsets(MCStreamer &Out, StringMap<uint32_t> &Strings,
   uint32_t PrevOffset = 0;
   while (const char *s = Data.getCStr(&LocalOffset)) {
     StringRef Str(s, LocalOffset - PrevOffset - 1);
-    OffsetRemapping[PrevOffset] = StringOffset;
-    // insert, if successful, write new string to the str.dwo section
-    StringOffset += Str.size() + 1;
+    auto Pair = Strings.insert(std::make_pair(Str, StringOffset));
+    if (Pair.second) {
+      Out.SwitchSection(StrSection);
+      Out.EmitBytes(
+          StringRef(Pair.first->getKeyData(), Pair.first->getKeyLength() + 1));
+      StringOffset += Str.size() + 1;
+    }
+    OffsetRemapping[PrevOffset] = Pair.first->second;
     PrevOffset = LocalOffset;
   }
 
@@ -106,19 +112,19 @@ static std::error_code write(MCStreamer &Out, ArrayRef<std::string> Inputs) {
         StringRef Contents;
         if (auto Err = Section.getContents(Contents))
           return Err;
-        if (OutSection == StrOffsetSection) {
+        if (OutSection == StrOffsetSection)
           CurStrOffsetSection = Contents;
-          continue;
-        }
-        if (OutSection == StrSection)
+        else if (OutSection == StrSection)
           CurStrSection = Contents;
-        Out.SwitchSection(OutSection);
-        Out.EmitBytes(Contents);
+        else {
+          Out.SwitchSection(OutSection);
+          Out.EmitBytes(Contents);
+        }
       }
     }
-    if (auto Err =
-            writeStringsAndOffsets(Out, Strings, StringOffset, StrOffsetSection,
-                                   CurStrSection, CurStrOffsetSection))
+    if (auto Err = writeStringsAndOffsets(Out, Strings, StringOffset,
+                                          StrSection, StrOffsetSection,
+                                          CurStrSection, CurStrOffsetSection))
       return Err;
   }
   return std::error_code();

From bedc55e06360f5baa33cd63e7543294463773931 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 1 Dec 2015 19:19:18 +0000
Subject: [PATCH 095/186] fix typo; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254442 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/TwoAddressInstructionPass.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index c96c813b0c9b..c407d594addd 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1181,7 +1181,7 @@ bool TwoAddressInstructionPass::tryInstructionCommute(MachineInstr *MI,
   unsigned OtherOpIdx = MI->getDesc().getNumDefs();
   for (; OtherOpIdx < OpsNum; OtherOpIdx++) {
     // The call of findCommutedOpIndices below only checks if BaseOpIdx
-    // and OtherOpIdx are commutable, it does not really searches for
+    // and OtherOpIdx are commutable, it does not really search for
     // other commutable operands and does not change the values of passed
     // variables.
     if (OtherOpIdx == BaseOpIdx ||

From 8efaea8c4583732bd030292a59d3081a176242bf Mon Sep 17 00:00:00 2001
From: Artyom Skrobov <Artyom.Skrobov@arm.com>
Date: Tue, 1 Dec 2015 19:25:11 +0000
Subject: [PATCH 096/186] Fix Thumb1 epilogue generation

Summary:
This had been broken for a very long time, but nobody noticed until
D14357 enabled shrink-wrapping by default.

Reviewers: jroelofs, qcolombet

Subscribers: tyomitch, llvm-commits, rengolin

Differential Revision: http://reviews.llvm.org/D14986

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254444 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/Thumb1FrameLowering.cpp  | 67 ++++++++++++++++++++-----
 test/CodeGen/Thumb/pop-special-fixup.ll | 60 ++++++++++++++++++++++
 2 files changed, 115 insertions(+), 12 deletions(-)
 create mode 100644 test/CodeGen/Thumb/pop-special-fixup.ll

diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp
index 064cff6f5704..fd96af6cb6e0 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -406,11 +406,15 @@ bool Thumb1FrameLowering::needPopSpecialFixUp(const MachineFunction &MF) const {
   if (AFI->getArgRegsSaveSize())
     return true;
 
-  bool IsV4PopReturn = false;
+  // FIXME: this doesn't make sense, and the following patch will remove it.
+  if (!STI.hasV4TOps()) return false;
+
+  // LR cannot be encoded with Thumb1, i.e., it requires a special fix-up.
   for (const CalleeSavedInfo &CSI : MF.getFrameInfo()->getCalleeSavedInfo())
     if (CSI.getReg() == ARM::LR)
-      IsV4PopReturn = true;
-  return IsV4PopReturn && STI.hasV4TOps() && !STI.hasV5TOps();
+      return true;
+
+  return false;
 }
 
 bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
@@ -422,12 +426,45 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
   const ThumbRegisterInfo *RegInfo =
       static_cast<const ThumbRegisterInfo *>(STI.getRegisterInfo());
 
-  // When we need a special fix up for POP, this means that
-  // we either cannot use PC in POP or we have to update
-  // SP after poping the return address.
-  // In other words, we cannot use a pop {pc} like construction
-  // here, no matter what.
+  // If MBBI is a return instruction, or is a tPOP followed by a return
+  // instruction in the successor BB, we may be able to directly restore
+  // LR in the PC.
+  // This is only possible with v5T ops (v4T can't change the Thumb bit via
+  // a POP PC instruction), and only if we do not need to emit any SP update.
+  // Otherwise, we need a temporary register to pop the value
+  // and copy that value into LR.
   auto MBBI = MBB.getFirstTerminator();
+  bool CanRestoreDirectly = STI.hasV5TOps() && !ArgRegsSaveSize;
+  if (CanRestoreDirectly) {
+    if (MBBI != MBB.end())
+      CanRestoreDirectly = (MBBI->getOpcode() == ARM::tBX_RET ||
+                            MBBI->getOpcode() == ARM::tPOP_RET);
+    else {
+      assert(MBB.back().getOpcode() == ARM::tPOP);
+      assert(MBB.succ_size() == 1);
+      if ((*MBB.succ_begin())->begin()->getOpcode() == ARM::tBX_RET)
+        MBBI--; // Replace the final tPOP with a tPOP_RET.
+      else
+        CanRestoreDirectly = false;
+    }
+  }
+
+  if (CanRestoreDirectly) {
+    if (!DoIt || MBBI->getOpcode() == ARM::tPOP_RET)
+      return true;
+    MachineInstrBuilder MIB =
+        AddDefaultPred(
+            BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP_RET)));
+    // Copy implicit ops and popped registers, if any.
+    for (auto MO: MBBI->operands())
+      if (MO.isReg() && (MO.isImplicit() || MO.isDef()) &&
+          MO.getReg() != ARM::LR)
+        MIB.addOperand(MO);
+    MIB.addReg(ARM::PC, RegState::Define);
+    // Erase the old instruction (tBX_RET or tPOP).
+    MBB.erase(MBBI);
+    return true;
+  }
 
   // Look for a temporary register to use.
   // First, compute the liveness information.
@@ -446,10 +483,10 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
   if (MBBI != MBB.end()) {
     dl = MBBI->getDebugLoc();
     auto InstUpToMBBI = MBB.end();
-    // The post-decrement is on purpose here.
-    // We want to have the liveness right before MBBI.
-    while (InstUpToMBBI-- != MBBI)
-      UsedRegs.stepBackward(*InstUpToMBBI);
+    while (InstUpToMBBI != MBBI)
+      // The pre-decrement is on purpose here.
+      // We want to have the liveness right before MBBI.
+      UsedRegs.stepBackward(*--InstUpToMBBI);
   }
 
   // Look for a register that can be directly use in the POP.
@@ -495,6 +532,12 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
                        .addReg(PopReg, RegState::Kill));
   }
 
+  if (MBBI == MBB.end()) {
+    MachineInstr& Pop = MBB.back();
+    assert(Pop.getOpcode() == ARM::tPOP);
+    Pop.RemoveOperand(Pop.findRegisterDefOperandIdx(ARM::LR));
+  }
+
   assert(PopReg && "Do not know how to get LR");
   AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)))
       .addReg(PopReg, RegState::Define);
diff --git a/test/CodeGen/Thumb/pop-special-fixup.ll b/test/CodeGen/Thumb/pop-special-fixup.ll
new file mode 100644
index 000000000000..9ba589d6cec3
--- /dev/null
+++ b/test/CodeGen/Thumb/pop-special-fixup.ll
@@ -0,0 +1,60 @@
+; RUN: llc %s -enable-shrink-wrap=true -o - | FileCheck %s
+
+target triple = "thumbv6m-none-none-eabi"
+
+@retval = global i32 0, align 4
+
+define i32 @test(i32 %i, i32 %argc, i8** nocapture readonly %argv) {
+  %1 = icmp sgt i32 %argc, %i
+  br i1 %1, label %2, label %19
+
+  %3 = getelementptr inbounds i8*, i8** %argv, i32 %i
+  %4 = load i8*, i8** %3, align 4
+  %5 = load i8, i8* %4, align 1
+  %6 = icmp eq i8 %5, 45
+  %7 = getelementptr inbounds i8, i8* %4, i32 1
+  %. = select i1 %6, i8* %7, i8* %4
+  %.1 = select i1 %6, i32 -1, i32 1
+  %8 = load i8, i8* %., align 1
+  %.off2 = add i8 %8, -48
+  %9 = icmp ult i8 %.off2, 10
+  %.pre = load i32, i32* @retval, align 4
+  br i1 %9, label %.lr.ph.preheader, label %.critedge
+
+.lr.ph.preheader:                                 ; preds = %2
+  br label %.lr.ph
+
+.lr.ph:                                           ; preds = %.lr.ph.preheader, %.lr.ph
+  %10 = phi i32 [ %14, %.lr.ph ], [ %.pre, %.lr.ph.preheader ]
+  %11 = phi i8 [ %15, %.lr.ph ], [ %8, %.lr.ph.preheader ]
+  %valstring.03 = phi i8* [ %13, %.lr.ph ], [ %., %.lr.ph.preheader ]
+  %12 = zext i8 %11 to i32
+  %13 = getelementptr inbounds i8, i8* %valstring.03, i32 1
+  %14 = add nsw i32 %10, %12
+  store i32 %14, i32* @retval, align 4
+  %15 = load i8, i8* %13, align 1
+  %.off = add i8 %15, -48
+  %16 = icmp ult i8 %.off, 10
+  br i1 %16, label %.lr.ph, label %.critedge.loopexit
+
+.critedge.loopexit:                               ; preds = %.lr.ph
+  %.lcssa = phi i32 [ %14, %.lr.ph ]
+  br label %.critedge
+
+.critedge:                                        ; preds = %.critedge.loopexit, %2
+  %17 = phi i32 [ %.pre, %2 ], [ %.lcssa, %.critedge.loopexit ]
+  %18 = mul nsw i32 %17, %.1
+  store i32 %18, i32* @retval, align 4
+  br label %19
+
+; <label>:19                                      ; preds = %.critedge, %0
+  ret i32 0
+}
+
+; CHECK: push {r4, r5, r7, lr}
+; CHECK: pop {r4, r5, r7}
+; CHECK: pop {r0}
+; CHECK: mov lr, r0
+; CHECK: movs r0, #0
+; CHECK: bx  lr
+

From 936f2daba20c6f5a049c2e5cf47e4ee98f89b88e Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 1 Dec 2015 19:32:35 +0000
Subject: [PATCH 097/186] don't repeat function/variable names in comments; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254445 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/TwoAddressInstructionPass.cpp | 121 ++++++++++------------
 1 file changed, 57 insertions(+), 64 deletions(-)

diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index c407d594addd..eb45f1ee31bd 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -83,21 +83,20 @@ class TwoAddressInstructionPass : public MachineFunctionPass {
   // The current basic block being processed.
   MachineBasicBlock *MBB;
 
-  // DistanceMap - Keep track the distance of a MI from the start of the
-  // current basic block.
+  // Keep track the distance of a MI from the start of the current basic block.
   DenseMap<MachineInstr*, unsigned> DistanceMap;
 
   // Set of already processed instructions in the current block.
   SmallPtrSet<MachineInstr*, 8> Processed;
 
-  // SrcRegMap - A map from virtual registers to physical registers which are
-  // likely targets to be coalesced to due to copies from physical registers to
-  // virtual registers. e.g. v1024 = move r0.
+  // A map from virtual registers to physical registers which are likely targets
+  // to be coalesced to due to copies from physical registers to virtual
+  // registers. e.g. v1024 = move r0.
   DenseMap<unsigned, unsigned> SrcRegMap;
 
-  // DstRegMap - A map from virtual registers to physical registers which are
-  // likely targets to be coalesced to due to copies to physical registers from
-  // virtual registers. e.g. r1 = move v1024.
+  // A map from virtual registers to physical registers which are likely targets
+  // to be coalesced to due to copies to physical registers from virtual
+  // registers. e.g. r1 = move v1024.
   DenseMap<unsigned, unsigned> DstRegMap;
 
   bool sink3AddrInstruction(MachineInstr *MI, unsigned Reg,
@@ -165,7 +164,7 @@ class TwoAddressInstructionPass : public MachineFunctionPass {
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
-  /// runOnMachineFunction - Pass entry point.
+  /// Pass entry point.
   bool runOnMachineFunction(MachineFunction&) override;
 };
 } // end anonymous namespace
@@ -181,10 +180,9 @@ char &llvm::TwoAddressInstructionPassID = TwoAddressInstructionPass::ID;
 
 static bool isPlainlyKilled(MachineInstr *MI, unsigned Reg, LiveIntervals *LIS);
 
-/// sink3AddrInstruction - A two-address instruction has been converted to a
-/// three-address instruction to avoid clobbering a register. Try to sink it
-/// past the instruction that would kill the above mentioned register to reduce
-/// register pressure.
+/// A two-address instruction has been converted to a three-address instruction
+/// to avoid clobbering a register. Try to sink it past the instruction that
+/// would kill the above mentioned register to reduce register pressure.
 bool TwoAddressInstructionPass::
 sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg,
                      MachineBasicBlock::iterator OldPos) {
@@ -313,8 +311,7 @@ sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg,
   return true;
 }
 
-/// getSingleDef -- return the MachineInstr* if it is the single def of the Reg
-/// in current BB.
+/// Return the MachineInstr* if it is the single def of the Reg in current BB.
 static MachineInstr *getSingleDef(unsigned Reg, MachineBasicBlock *BB,
                                   const MachineRegisterInfo *MRI) {
   MachineInstr *Ret = nullptr;
@@ -352,10 +349,10 @@ bool TwoAddressInstructionPass::isRevCopyChain(unsigned FromReg, unsigned ToReg,
   return false;
 }
 
-/// noUseAfterLastDef - Return true if there are no intervening uses between the
-/// last instruction in the MBB that defines the specified register and the
-/// two-address instruction which is being processed. It also returns the last
-/// def location by reference
+/// Return true if there are no intervening uses between the last instruction
+/// in the MBB that defines the specified register and the two-address
+/// instruction which is being processed. It also returns the last def location
+/// by reference.
 bool TwoAddressInstructionPass::noUseAfterLastDef(unsigned Reg, unsigned Dist,
                                                   unsigned &LastDef) {
   LastDef = 0;
@@ -376,9 +373,9 @@ bool TwoAddressInstructionPass::noUseAfterLastDef(unsigned Reg, unsigned Dist,
   return !(LastUse > LastDef && LastUse < Dist);
 }
 
-/// isCopyToReg - Return true if the specified MI is a copy instruction or
-/// a extract_subreg instruction. It also returns the source and destination
-/// registers and whether they are physical registers by reference.
+/// Return true if the specified MI is a copy instruction or an extract_subreg
+/// instruction. It also returns the source and destination registers and
+/// whether they are physical registers by reference.
 static bool isCopyToReg(MachineInstr &MI, const TargetInstrInfo *TII,
                         unsigned &SrcReg, unsigned &DstReg,
                         bool &IsSrcPhys, bool &IsDstPhys) {
@@ -398,8 +395,8 @@ static bool isCopyToReg(MachineInstr &MI, const TargetInstrInfo *TII,
   return true;
 }
 
-/// isPLainlyKilled - Test if the given register value, which is used by the
-// given instruction, is killed by the given instruction.
+/// Test if the given register value, which is used by the
+/// given instruction, is killed by the given instruction.
 static bool isPlainlyKilled(MachineInstr *MI, unsigned Reg,
                             LiveIntervals *LIS) {
   if (LIS && TargetRegisterInfo::isVirtualRegister(Reg) &&
@@ -425,7 +422,7 @@ static bool isPlainlyKilled(MachineInstr *MI, unsigned Reg,
   return MI->killsRegister(Reg);
 }
 
-/// isKilled - Test if the given register value, which is used by the given
+/// Test if the given register value, which is used by the given
 /// instruction, is killed by the given instruction. This looks through
 /// coalescable copies to see if the original value is potentially not killed.
 ///
@@ -473,8 +470,8 @@ static bool isKilled(MachineInstr &MI, unsigned Reg,
   }
 }
 
-/// isTwoAddrUse - Return true if the specified MI uses the specified register
-/// as a two-address use. If so, return the destination register by reference.
+/// Return true if the specified MI uses the specified register as a two-address
+/// use. If so, return the destination register by reference.
 static bool isTwoAddrUse(MachineInstr &MI, unsigned Reg, unsigned &DstReg) {
   for (unsigned i = 0, NumOps = MI.getNumOperands(); i != NumOps; ++i) {
     const MachineOperand &MO = MI.getOperand(i);
@@ -489,8 +486,8 @@ static bool isTwoAddrUse(MachineInstr &MI, unsigned Reg, unsigned &DstReg) {
   return false;
 }
 
-/// findOnlyInterestingUse - Given a register, if has a single in-basic block
-/// use, return the use instruction if it's a copy or a two-address use.
+/// Given a register, if has a single in-basic block use, return the use
+/// instruction if it's a copy or a two-address use.
 static
 MachineInstr *findOnlyInterestingUse(unsigned Reg, MachineBasicBlock *MBB,
                                      MachineRegisterInfo *MRI,
@@ -517,8 +514,8 @@ MachineInstr *findOnlyInterestingUse(unsigned Reg, MachineBasicBlock *MBB,
   return nullptr;
 }
 
-/// getMappedReg - Return the physical register the specified virtual register
-/// might be mapped to.
+/// Return the physical register the specified virtual register might be mapped
+/// to.
 static unsigned
 getMappedReg(unsigned Reg, DenseMap<unsigned, unsigned> &RegMap) {
   while (TargetRegisterInfo::isVirtualRegister(Reg))  {
@@ -532,8 +529,7 @@ getMappedReg(unsigned Reg, DenseMap<unsigned, unsigned> &RegMap) {
   return 0;
 }
 
-/// regsAreCompatible - Return true if the two registers are equal or aliased.
-///
+/// Return true if the two registers are equal or aliased.
 static bool
 regsAreCompatible(unsigned RegA, unsigned RegB, const TargetRegisterInfo *TRI) {
   if (RegA == RegB)
@@ -544,8 +540,8 @@ regsAreCompatible(unsigned RegA, unsigned RegB, const TargetRegisterInfo *TRI) {
 }
 
 
-/// isProfitableToCommute - Return true if it's potentially profitable to commute
-/// the two-address instruction that's being processed.
+/// Return true if it's potentially profitable to commute the two-address
+/// instruction that's being processed.
 bool
 TwoAddressInstructionPass::
 isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC,
@@ -643,9 +639,8 @@ isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC,
   return LastDefB && LastDefC && LastDefC > LastDefB;
 }
 
-/// commuteInstruction - Commute a two-address instruction and update the basic
-/// block, distance map, and live variables if needed. Return true if it is
-/// successful.
+/// Commute a two-address instruction and update the basic block, distance map,
+/// and live variables if needed. Return true if it is successful.
 bool TwoAddressInstructionPass::commuteInstruction(MachineInstr *MI,
                                                    unsigned RegBIdx,
                                                    unsigned RegCIdx,
@@ -674,8 +669,8 @@ bool TwoAddressInstructionPass::commuteInstruction(MachineInstr *MI,
   return true;
 }
 
-/// isProfitableToConv3Addr - Return true if it is profitable to convert the
-/// given 2-address instruction to a 3-address one.
+/// Return true if it is profitable to convert the given 2-address instruction
+/// to a 3-address one.
 bool
 TwoAddressInstructionPass::isProfitableToConv3Addr(unsigned RegA,unsigned RegB){
   // Look for situations like this:
@@ -691,8 +686,8 @@ TwoAddressInstructionPass::isProfitableToConv3Addr(unsigned RegA,unsigned RegB){
   return (ToRegA && !regsAreCompatible(FromRegB, ToRegA, TRI));
 }
 
-/// convertInstTo3Addr - Convert the specified two-address instruction into a
-/// three address one. Return true if this transformation was successful.
+/// Convert the specified two-address instruction into a three address one.
+/// Return true if this transformation was successful.
 bool
 TwoAddressInstructionPass::convertInstTo3Addr(MachineBasicBlock::iterator &mi,
                                               MachineBasicBlock::iterator &nmi,
@@ -733,8 +728,8 @@ TwoAddressInstructionPass::convertInstTo3Addr(MachineBasicBlock::iterator &mi,
   return true;
 }
 
-/// scanUses - Scan forward recursively for only uses, update maps if the use
-/// is a copy or a two-address instruction.
+/// Scan forward recursively for only uses, update maps if the use is a copy or
+/// a two-address instruction.
 void
 TwoAddressInstructionPass::scanUses(unsigned DstReg) {
   SmallVector<unsigned, 4> VirtRegPairs;
@@ -780,8 +775,8 @@ TwoAddressInstructionPass::scanUses(unsigned DstReg) {
   }
 }
 
-/// processCopy - If the specified instruction is not yet processed, process it
-/// if it's a copy. For a copy instruction, we find the physical registers the
+/// If the specified instruction is not yet processed, process it if it's a
+/// copy. For a copy instruction, we find the physical registers the
 /// source and destination registers might be mapped to. These are kept in
 /// point-to maps used to determine future optimizations. e.g.
 /// v1024 = mov r0
@@ -816,9 +811,9 @@ void TwoAddressInstructionPass::processCopy(MachineInstr *MI) {
   return;
 }
 
-/// rescheduleMIBelowKill - If there is one more local instruction that reads
-/// 'Reg' and it kills 'Reg, consider moving the instruction below the kill
-/// instruction in order to eliminate the need for the copy.
+/// If there is one more local instruction that reads 'Reg' and it kills 'Reg,
+/// consider moving the instruction below the kill instruction in order to
+/// eliminate the need for the copy.
 bool TwoAddressInstructionPass::
 rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,
                       MachineBasicBlock::iterator &nmi,
@@ -987,8 +982,8 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,
   return true;
 }
 
-/// isDefTooClose - Return true if the re-scheduling will put the given
-/// instruction too close to the defs of its register dependencies.
+/// Return true if the re-scheduling will put the given instruction too close
+/// to the defs of its register dependencies.
 bool TwoAddressInstructionPass::isDefTooClose(unsigned Reg, unsigned Dist,
                                               MachineInstr *MI) {
   for (MachineInstr &DefMI : MRI->def_instructions(Reg)) {
@@ -1007,10 +1002,9 @@ bool TwoAddressInstructionPass::isDefTooClose(unsigned Reg, unsigned Dist,
   return false;
 }
 
-/// rescheduleKillAboveMI - If there is one more local instruction that reads
-/// 'Reg' and it kills 'Reg, consider moving the kill instruction above the
-/// current two-address instruction in order to eliminate the need for the
-/// copy.
+/// If there is one more local instruction that reads 'Reg' and it kills 'Reg,
+/// consider moving the kill instruction above the current two-address
+/// instruction in order to eliminate the need for the copy.
 bool TwoAddressInstructionPass::
 rescheduleKillAboveMI(MachineBasicBlock::iterator &mi,
                       MachineBasicBlock::iterator &nmi,
@@ -1213,13 +1207,13 @@ bool TwoAddressInstructionPass::tryInstructionCommute(MachineInstr *MI,
   return false;
 }
 
-/// tryInstructionTransform - For the case where an instruction has a single
-/// pair of tied register operands, attempt some transformations that may
-/// either eliminate the tied operands or improve the opportunities for
-/// coalescing away the register copy.  Returns true if no copy needs to be
-/// inserted to untie mi's operands (either because they were untied, or
-/// because mi was rescheduled, and will be visited again later). If the
-/// shouldOnlyCommute flag is true, only instruction commutation is attempted.
+/// For the case where an instruction has a single pair of tied register
+/// operands, attempt some transformations that may either eliminate the tied
+/// operands or improve the opportunities for coalescing away the register copy.
+/// Returns true if no copy needs to be inserted to untie mi's operands
+/// (either because they were untied, or because mi was rescheduled, and will
+/// be visited again later). If the shouldOnlyCommute flag is true, only
+/// instruction commutation is attempted.
 bool TwoAddressInstructionPass::
 tryInstructionTransform(MachineBasicBlock::iterator &mi,
                         MachineBasicBlock::iterator &nmi,
@@ -1618,8 +1612,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI,
   }
 }
 
-/// runOnMachineFunction - Reduce two-address instructions to two operands.
-///
+/// Reduce two-address instructions to two operands.
 bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
   MF = &Func;
   const TargetMachine &TM = MF->getTarget();

From 67b32b28109a8a8121e5183e6b66002e7aed430f Mon Sep 17 00:00:00 2001
From: Xinliang David Li <davidxl@google.com>
Date: Tue, 1 Dec 2015 19:47:32 +0000
Subject: [PATCH 098/186] Use nullptr (NFC)

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254447 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/ProfileData/InstrProf.cpp           | 2 +-
 unittests/ProfileData/InstrProfTest.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/ProfileData/InstrProf.cpp b/lib/ProfileData/InstrProf.cpp
index d08ec9d73176..530be8ac044a 100644
--- a/lib/ProfileData/InstrProf.cpp
+++ b/lib/ProfileData/InstrProf.cpp
@@ -207,7 +207,7 @@ ValueProfData::serializeFrom(const InstrProfRecord &Record) {
   InstrProfRecordClosure.Record = &Record;
 
   std::unique_ptr<ValueProfData> VPD(
-      serializeValueProfDataFrom(&InstrProfRecordClosure, 0));
+      serializeValueProfDataFrom(&InstrProfRecordClosure, nullptr));
   return VPD;
 }
 
diff --git a/unittests/ProfileData/InstrProfTest.cpp b/unittests/ProfileData/InstrProfTest.cpp
index d50944ff77fd..2f3adb65a0ee 100644
--- a/unittests/ProfileData/InstrProfTest.cpp
+++ b/unittests/ProfileData/InstrProfTest.cpp
@@ -409,7 +409,7 @@ TEST_F(InstrProfTest, runtime_value_prof_data_read_write) {
   initializeValueProfRuntimeRecord(&RTRecord, &NumValueSites[0],
                                    &ValueProfNodes[0]);
 
-  ValueProfData *VPData = serializeValueProfDataFromRT(&RTRecord, 0);
+  ValueProfData *VPData = serializeValueProfDataFromRT(&RTRecord, nullptr);
 
   InstrProfRecord Record("caller", 0x1234, {1ULL << 31, 2});
 

From b0b27c572556a67b2a9f5246b8ceeeb0895c6eec Mon Sep 17 00:00:00 2001
From: Quentin Colombet <qcolombet@apple.com>
Date: Tue, 1 Dec 2015 19:49:31 +0000
Subject: [PATCH 099/186] [X86] Make sure the prologue does not clobber EFLAGS
 when it lives accross it.

This fixes PR25629.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254448 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86FrameLowering.cpp     | 33 ++++++++-
 test/CodeGen/X86/x86-shrink-wrapping.ll | 91 +++++++++++++++++++++++++
 2 files changed, 123 insertions(+), 1 deletion(-)

diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index f4f7f0cf33b2..4ce3dfe0dcb3 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -297,6 +297,28 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
   }
 }
 
+// Check if \p MBB defines the flags register before the first terminator.
+static bool flagsDefinedLocally(const MachineBasicBlock &MBB) {
+  MachineBasicBlock::const_iterator FirstTerminator = MBB.getFirstTerminator();
+  for (MachineBasicBlock::const_iterator MII : MBB) {
+    if (MII == FirstTerminator)
+      return false;
+
+    for (const MachineOperand &MO : MII->operands()) {
+      if (!MO.isReg())
+        continue;
+      unsigned Reg = MO.getReg();
+      if (Reg != X86::EFLAGS)
+        continue;
+
+      // This instruction sets the eflag.
+      if (MO.isDef())
+        return true;
+    }
+  }
+  return false;
+}
+
 MachineInstrBuilder X86FrameLowering::BuildStackAdjustment(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL,
     int64_t Offset, bool InEpilogue) const {
@@ -306,7 +328,16 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment(
   // is tricky.
   bool UseLEA;
   if (!InEpilogue) {
-    UseLEA = STI.useLeaForSP();
+    // Check if inserting the prologue at the beginning
+    // of MBB would require to use LEA operations.
+    // We need to use LEA operations if both conditions are true:
+    // 1. One of the terminators need the flags.
+    // 2. The flags are not defined after the insertion point of the prologue.
+    // Note: Checking for the predecessors is a shortcut when obviously nothing
+    // will live accross the prologue.
+    UseLEA = STI.useLeaForSP() ||
+             (!MBB.pred_empty() && terminatorsNeedFlagsAsInput(MBB) &&
+              !flagsDefinedLocally(MBB));
   } else {
     // If we can use LEA for SP but we shouldn't, check that none
     // of the terminators uses the eflags. Otherwise we will insert
diff --git a/test/CodeGen/X86/x86-shrink-wrapping.ll b/test/CodeGen/X86/x86-shrink-wrapping.ll
index 0cab17f9de89..34e56919468b 100644
--- a/test/CodeGen/X86/x86-shrink-wrapping.ll
+++ b/test/CodeGen/X86/x86-shrink-wrapping.ll
@@ -788,3 +788,94 @@ end:
   %tmp.0 = phi i32 [ %tmp4, %true ], [ %tmp5, %false ]
   ret i32 %tmp.0
 }
+
+@b = internal unnamed_addr global i1 false
+@c = internal unnamed_addr global i8 0, align 1
+@a = common global i32 0, align 4
+
+; Make sure the prologue does not clobber the EFLAGS when
+; it is live accross.
+; PR25629.
+; Note: The registers may change in the following patterns, but
+; because they imply register hierarchy (e.g., eax, al) this is
+; tricky to write robust patterns.
+;
+; CHECK-LABEL: useLEAForPrologue:
+;
+; Prologue is at the beginning of the function when shrink-wrapping
+; is disabled.
+; DISABLE: pushq
+; The stack adjustment can use SUB instr because we do not need to
+; preserve the EFLAGS at this point.
+; DISABLE-NEXT: subq $16, %rsp
+;
+; Load the value of b.
+; CHECK: movb _b(%rip), [[BOOL:%cl]]
+; Extract i1 from the loaded value.
+; CHECK-NEXT: andb $1, [[BOOL]]
+; Create the zero value for the select assignment.
+; CHECK-NEXT: xorl [[CMOVE_VAL:%eax]], [[CMOVE_VAL]]
+; CHECK-NEXT: testb [[BOOL]], [[BOOL]]
+; CHECK-NEXT: jne [[STOREC_LABEL:LBB[0-9_]+]]
+;
+; CHECK: movb $48, [[CMOVE_VAL:%al]]
+;
+; CHECK: [[STOREC_LABEL]]:
+;
+; ENABLE-NEXT: pushq
+; For the stack adjustment, we need to preserve the EFLAGS.
+; ENABLE-NEXT: leaq -16(%rsp), %rsp
+;
+; Technically, we should use CMOVE_VAL here or its subregister.
+; CHECK-NEXT: movb %al, _c(%rip)
+; testb set the EFLAGS read here.
+; CHECK-NEXT: je [[VARFUNC_CALL:LBB[0-9_]+]]
+;
+; The code of the loop is not interesting.
+; [...]
+;
+; CHECK: [[VARFUNC_CALL]]:
+; Set the null parameter.
+; CHECK-NEXT: xorl %edi, %edi
+; CHECK-NEXT: callq _varfunc
+;
+; Set the return value.
+; CHECK-NEXT: xorl %eax, %eax
+;
+; Epilogue code.
+; CHECK-NEXT: addq $16, %rsp
+; CHECK-NEXT: popq
+; CHECK-NEXT: retq
+define i32 @useLEAForPrologue(i32 %d, i32 %a, i8 %c) #3 {
+entry:
+  %tmp = alloca i3
+  %.b = load i1, i1* @b, align 1
+  %bool = select i1 %.b, i8 0, i8 48
+  store i8 %bool, i8* @c, align 1
+  br i1 %.b, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry
+  tail call void asm sideeffect "nop", "~{ebx}"()
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %inc6 = phi i8 [ %c, %for.body.lr.ph ], [ %inc, %for.body ]
+  %cond5 = phi i32 [ %a, %for.body.lr.ph ], [ %conv3, %for.body ]
+  %cmp2 = icmp slt i32 %d, %cond5
+  %conv3 = zext i1 %cmp2 to i32
+  %inc = add i8 %inc6, 1
+  %cmp = icmp slt i8 %inc, 45
+  br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:                       ; preds = %for.body
+  store i32 %conv3, i32* @a, align 4
+  br label %for.end
+
+for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry
+  %call = tail call i32 (i8*) @varfunc(i8* null)
+  ret i32 0
+}
+
+declare i32 @varfunc(i8* nocapture readonly)
+
+attributes #3 = { nounwind }

From 76c60c37de0a6250ce3be524e121d95ba7035c2e Mon Sep 17 00:00:00 2001
From: Rafael Espindola <rafael.espindola@gmail.com>
Date: Tue, 1 Dec 2015 19:50:54 +0000
Subject: [PATCH 100/186] Use references now that it is natural to do so.

The linker never takes ownership of a module or changes which module it
is refering to, making it natural to use references.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254449 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Linker/Linker.h          |  14 +--
 lib/LTO/LTOCodeGenerator.cpp          |   8 +-
 lib/Linker/LinkModules.cpp            | 149 +++++++++++++-------------
 lib/Transforms/IPO/FunctionImport.cpp |   4 +-
 tools/bugpoint/BugDriver.cpp          |   2 +-
 tools/bugpoint/Miscompilation.cpp     |   8 +-
 tools/gold/gold-plugin.cpp            |   6 +-
 tools/llvm-link/llvm-link.cpp         |   6 +-
 unittests/Linker/LinkModulesTest.cpp  |  11 +-
 9 files changed, 101 insertions(+), 107 deletions(-)

diff --git a/include/llvm/Linker/Linker.h b/include/llvm/Linker/Linker.h
index e307e06f50be..38fa5562f300 100644
--- a/include/llvm/Linker/Linker.h
+++ b/include/llvm/Linker/Linker.h
@@ -68,10 +68,10 @@ class Linker {
     InternalizeLinkedSymbols = (1 << 2)
   };
 
-  Linker(Module *M, DiagnosticHandlerFunction DiagnosticHandler);
-  Linker(Module *M);
+  Linker(Module &M, DiagnosticHandlerFunction DiagnosticHandler);
+  Linker(Module &M);
 
-  Module *getModule() const { return Composite; }
+  Module &getModule() const { return Composite; }
 
   /// \brief Link \p Src into the composite. The source is destroyed.
   /// Passing OverrideSymbols as true will have symbols from Src
@@ -80,19 +80,19 @@ class Linker {
   /// is passed. If a \p FuncToImport is provided, only that single
   /// function is imported from the source module.
   /// Returns true on error.
-  bool linkInModule(Module *Src, unsigned Flags = Flags::None,
+  bool linkInModule(Module &Src, unsigned Flags = Flags::None,
                     const FunctionInfoIndex *Index = nullptr,
                     Function *FuncToImport = nullptr);
 
-  static bool LinkModules(Module *Dest, Module *Src,
+  static bool linkModules(Module &Dest, Module &Src,
                           DiagnosticHandlerFunction DiagnosticHandler,
                           unsigned Flags = Flags::None);
 
-  static bool LinkModules(Module *Dest, Module *Src,
+  static bool linkModules(Module &Dest, Module &Src,
                           unsigned Flags = Flags::None);
 
 private:
-  Module *Composite;
+  Module &Composite;
 
   IdentifiedStructTypeSet IdentifiedStructTypes;
 
diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp
index 37ee7e8c53cc..468ec24e3a06 100644
--- a/lib/LTO/LTOCodeGenerator.cpp
+++ b/lib/LTO/LTOCodeGenerator.cpp
@@ -67,14 +67,14 @@ const char* LTOCodeGenerator::getVersionString() {
 LTOCodeGenerator::LTOCodeGenerator()
     : Context(getGlobalContext()),
       MergedModule(new Module("ld-temp.o", Context)),
-      IRLinker(new Linker(MergedModule.get())) {
+      IRLinker(new Linker(*MergedModule)) {
   initializeLTOPasses();
 }
 
 LTOCodeGenerator::LTOCodeGenerator(std::unique_ptr<LLVMContext> Context)
     : OwnedContext(std::move(Context)), Context(*OwnedContext),
       MergedModule(new Module("ld-temp.o", *OwnedContext)),
-      IRLinker(new Linker(MergedModule.get())) {
+      IRLinker(new Linker(*MergedModule)) {
   initializeLTOPasses();
 }
 
@@ -114,7 +114,7 @@ bool LTOCodeGenerator::addModule(LTOModule *Mod) {
   assert(&Mod->getModule().getContext() == &Context &&
          "Expected module in same context");
 
-  bool ret = IRLinker->linkInModule(&Mod->getModule());
+  bool ret = IRLinker->linkInModule(Mod->getModule());
 
   const std::vector<const char *> &undefs = Mod->getAsmUndefinedRefs();
   for (int i = 0, e = undefs.size(); i != e; ++i)
@@ -130,7 +130,7 @@ void LTOCodeGenerator::setModule(std::unique_ptr<LTOModule> Mod) {
   AsmUndefinedRefs.clear();
 
   MergedModule = Mod->takeModule();
-  IRLinker = make_unique<Linker>(MergedModule.get());
+  IRLinker = make_unique<Linker>(*MergedModule);
 
   const std::vector<const char*> &Undefs = Mod->getAsmUndefinedRefs();
   for (int I = 0, E = Undefs.size(); I != E; ++I)
diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index 7ba622f6ee1d..c2f472c7e33f 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -390,7 +390,8 @@ void LinkDiagnosticInfo::print(DiagnosticPrinter &DP) const { DP << Msg; }
 /// This is an implementation class for the LinkModules function, which is the
 /// entrypoint for this file.
 class ModuleLinker {
-  Module *DstM, *SrcM;
+  Module &DstM;
+  Module &SrcM;
 
   TypeMapTy TypeMap;
   ValueMaterializerTy ValMaterializer;
@@ -431,11 +432,11 @@ class ModuleLinker {
   bool HasError = false;
 
 public:
-  ModuleLinker(Module *dstM, Linker::IdentifiedStructTypeSet &Set, Module *srcM,
+  ModuleLinker(Module &DstM, Linker::IdentifiedStructTypeSet &Set, Module &SrcM,
                DiagnosticHandlerFunction DiagnosticHandler, unsigned Flags,
                const FunctionInfoIndex *Index = nullptr,
                Function *FuncToImport = nullptr)
-      : DstM(dstM), SrcM(srcM), TypeMap(Set), ValMaterializer(this),
+      : DstM(DstM), SrcM(SrcM), TypeMap(Set), ValMaterializer(this),
         DiagnosticHandler(DiagnosticHandler), Flags(Flags), ImportIndex(Index),
         ImportFunction(FuncToImport), HasExportedFunctions(false),
         DoneLinkingBodies(false) {
@@ -446,7 +447,7 @@ class ModuleLinker {
     // backend compilation, and we need to see if it has functions that
     // may be exported to another backend compilation.
     if (ImportIndex && !ImportFunction)
-      HasExportedFunctions = ImportIndex->hasExportedFunctions(SrcM);
+      HasExportedFunctions = ImportIndex->hasExportedFunctions(&SrcM);
   }
 
   bool run();
@@ -485,7 +486,7 @@ class ModuleLinker {
     DiagnosticHandler(LinkDiagnosticInfo(DS_Warning, Message));
   }
 
-  bool getComdatLeader(Module *M, StringRef ComdatName,
+  bool getComdatLeader(Module &M, StringRef ComdatName,
                        const GlobalVariable *&GVar);
   bool computeResultingSelectionKind(StringRef ComdatName,
                                      Comdat::SelectionKind Src,
@@ -508,7 +509,7 @@ class ModuleLinker {
       return nullptr;
 
     // Otherwise see if we have a match in the destination module's symtab.
-    GlobalValue *DGV = DstM->getNamedValue(getName(SrcGV));
+    GlobalValue *DGV = DstM.getNamedValue(getName(SrcGV));
     if (!DGV)
       return nullptr;
 
@@ -793,7 +794,7 @@ ModuleLinker::copyGlobalVariableProto(TypeMapTy &TypeMap,
   // identical version of the symbol over in the dest module... the
   // initializer will be filled in later by LinkGlobalInits.
   GlobalVariable *NewDGV = new GlobalVariable(
-      *DstM, TypeMap.get(SGVar->getType()->getElementType()),
+      DstM, TypeMap.get(SGVar->getType()->getElementType()),
       SGVar->isConstant(), getLinkage(SGVar), /*init*/ nullptr, getName(SGVar),
       /*insertbefore*/ nullptr, SGVar->getThreadLocalMode(),
       SGVar->getType()->getAddressSpace());
@@ -808,7 +809,7 @@ Function *ModuleLinker::copyFunctionProto(TypeMapTy &TypeMap,
   // If there is no linkage to be performed or we are linking from the source,
   // bring SF over.
   return Function::Create(TypeMap.get(SF->getFunctionType()), getLinkage(SF),
-                          getName(SF), DstM);
+                          getName(SF), &DstM);
 }
 
 /// Set up prototypes for any aliases that come over from the source module.
@@ -842,7 +843,7 @@ GlobalValue *ModuleLinker::copyGlobalAliasProto(TypeMapTy &TypeMap,
   // bring over SGA.
   auto *Ty = TypeMap.get(SGA->getValueType());
   return GlobalAlias::create(Ty, SGA->getType()->getPointerAddressSpace(),
-                             getLinkage(SGA), getName(SGA), DstM);
+                             getLinkage(SGA), getName(SGA), &DstM);
 }
 
 static GlobalValue::VisibilityTypes
@@ -936,9 +937,9 @@ void ModuleLinker::materializeInitFor(GlobalValue *New, GlobalValue *Old) {
   linkGlobalValueBody(*Old);
 }
 
-bool ModuleLinker::getComdatLeader(Module *M, StringRef ComdatName,
+bool ModuleLinker::getComdatLeader(Module &M, StringRef ComdatName,
                                    const GlobalVariable *&GVar) {
-  const GlobalValue *GVal = M->getNamedValue(ComdatName);
+  const GlobalValue *GVal = M.getNamedValue(ComdatName);
   if (const auto *GA = dyn_cast_or_null<GlobalAlias>(GVal)) {
     GVal = GA->getBaseObject();
     if (!GVal)
@@ -997,8 +998,8 @@ bool ModuleLinker::computeResultingSelectionKind(StringRef ComdatName,
         getComdatLeader(SrcM, ComdatName, SrcGV))
       return true;
 
-    const DataLayout &DstDL = DstM->getDataLayout();
-    const DataLayout &SrcDL = SrcM->getDataLayout();
+    const DataLayout &DstDL = DstM.getDataLayout();
+    const DataLayout &SrcDL = SrcM.getDataLayout();
     uint64_t DstSize =
         DstDL.getTypeAllocSize(DstGV->getType()->getPointerElementType());
     uint64_t SrcSize =
@@ -1030,7 +1031,7 @@ bool ModuleLinker::getComdatResult(const Comdat *SrcC,
                                    bool &LinkFromSrc) {
   Comdat::SelectionKind SSK = SrcC->getSelectionKind();
   StringRef ComdatName = SrcC->getName();
-  Module::ComdatSymTabType &ComdatSymTab = DstM->getComdatSymbolTable();
+  Module::ComdatSymTabType &ComdatSymTab = DstM.getComdatSymbolTable();
   Module::ComdatSymTabType::iterator DstCI = ComdatSymTab.find(ComdatName);
 
   if (DstCI == ComdatSymTab.end()) {
@@ -1158,7 +1159,7 @@ bool ModuleLinker::shouldLinkFromSource(bool &LinkFromSrc,
 /// types 'Foo' but one got renamed when the module was loaded into the same
 /// LLVMContext.
 void ModuleLinker::computeTypeMapping() {
-  for (GlobalValue &SGV : SrcM->globals()) {
+  for (GlobalValue &SGV : SrcM.globals()) {
     GlobalValue *DGV = getLinkedToGlobal(&SGV);
     if (!DGV)
       continue;
@@ -1174,12 +1175,12 @@ void ModuleLinker::computeTypeMapping() {
     TypeMap.addTypeMapping(DAT->getElementType(), SAT->getElementType());
   }
 
-  for (GlobalValue &SGV : *SrcM) {
+  for (GlobalValue &SGV : SrcM) {
     if (GlobalValue *DGV = getLinkedToGlobal(&SGV))
       TypeMap.addTypeMapping(DGV->getType(), SGV.getType());
   }
 
-  for (GlobalValue &SGV : SrcM->aliases()) {
+  for (GlobalValue &SGV : SrcM.aliases()) {
     if (GlobalValue *DGV = getLinkedToGlobal(&SGV))
       TypeMap.addTypeMapping(DGV->getType(), SGV.getType());
   }
@@ -1188,7 +1189,7 @@ void ModuleLinker::computeTypeMapping() {
   // At this point, the destination module may have a type "%foo = { i32 }" for
   // example.  When the source module got loaded into the same LLVMContext, if
   // it had the same type, it would have been renamed to "%foo.42 = { i32 }".
-  std::vector<StructType *> Types = SrcM->getIdentifiedStructTypes();
+  std::vector<StructType *> Types = SrcM.getIdentifiedStructTypes();
   for (StructType *ST : Types) {
     if (!ST->hasName())
       continue;
@@ -1201,7 +1202,7 @@ void ModuleLinker::computeTypeMapping() {
       continue;
 
     // Check to see if the destination module has a struct with the prefix name.
-    StructType *DST = DstM->getTypeByName(ST->getName().substr(0, DotPos));
+    StructType *DST = DstM.getTypeByName(ST->getName().substr(0, DotPos));
     if (!DST)
       continue;
 
@@ -1275,10 +1276,10 @@ static void upgradeGlobalArray(GlobalVariable *GV) {
 
 void ModuleLinker::upgradeMismatchedGlobalArray(StringRef Name) {
   // Look for the global arrays.
-  auto *DstGV = dyn_cast_or_null<GlobalVariable>(DstM->getNamedValue(Name));
+  auto *DstGV = dyn_cast_or_null<GlobalVariable>(DstM.getNamedValue(Name));
   if (!DstGV)
     return;
-  auto *SrcGV = dyn_cast_or_null<GlobalVariable>(SrcM->getNamedValue(Name));
+  auto *SrcGV = dyn_cast_or_null<GlobalVariable>(SrcM.getNamedValue(Name));
   if (!SrcGV)
     return;
 
@@ -1380,7 +1381,7 @@ bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV,
 
   // Create the new global variable.
   GlobalVariable *NG = new GlobalVariable(
-      *DstM, NewType, SrcGV->isConstant(), SrcGV->getLinkage(),
+      DstM, NewType, SrcGV->isConstant(), SrcGV->getLinkage(),
       /*init*/ nullptr, /*name*/ "", DstGV, SrcGV->getThreadLocalMode(),
       SrcGV->getType()->getAddressSpace());
 
@@ -1430,7 +1431,7 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) {
   if (const Comdat *SC = SGV->getComdat()) {
     Comdat::SelectionKind SK;
     std::tie(SK, LinkFromSrc) = ComdatsChosen[SC];
-    C = DstM->getOrInsertComdat(SC->getName());
+    C = DstM.getOrInsertComdat(SC->getName());
     C->setSelectionKind(SK);
     if (SGV->hasInternalLinkage())
       LinkFromSrc = true;
@@ -1606,12 +1607,12 @@ bool ModuleLinker::linkGlobalValueBody(GlobalValue &Src) {
 
 /// Insert all of the named MDNodes in Src into the Dest module.
 void ModuleLinker::linkNamedMDNodes() {
-  const NamedMDNode *SrcModFlags = SrcM->getModuleFlagsMetadata();
-  for (const NamedMDNode &NMD : SrcM->named_metadata()) {
+  const NamedMDNode *SrcModFlags = SrcM.getModuleFlagsMetadata();
+  for (const NamedMDNode &NMD : SrcM.named_metadata()) {
     // Don't link module flags here. Do them separately.
     if (&NMD == SrcModFlags)
       continue;
-    NamedMDNode *DestNMD = DstM->getOrInsertNamedMetadata(NMD.getName());
+    NamedMDNode *DestNMD = DstM.getOrInsertNamedMetadata(NMD.getName());
     // Add Src elements into Dest node.
     for (const MDNode *op : NMD.operands())
       DestNMD->addOperand(MapMetadata(
@@ -1623,12 +1624,12 @@ void ModuleLinker::linkNamedMDNodes() {
 /// Merge the linker flags in Src into the Dest module.
 bool ModuleLinker::linkModuleFlagsMetadata() {
   // If the source module has no module flags, we are done.
-  const NamedMDNode *SrcModFlags = SrcM->getModuleFlagsMetadata();
+  const NamedMDNode *SrcModFlags = SrcM.getModuleFlagsMetadata();
   if (!SrcModFlags) return false;
 
   // If the destination module doesn't have module flags yet, then just copy
   // over the source module's flags.
-  NamedMDNode *DstModFlags = DstM->getOrInsertModuleFlagsMetadata();
+  NamedMDNode *DstModFlags = DstM.getOrInsertModuleFlagsMetadata();
   if (DstModFlags->getNumOperands() == 0) {
     for (unsigned I = 0, E = SrcModFlags->getNumOperands(); I != E; ++I)
       DstModFlags->addOperand(SrcModFlags->getOperand(I));
@@ -1711,7 +1712,7 @@ bool ModuleLinker::linkModuleFlagsMetadata() {
 
     auto replaceDstValue = [&](MDNode *New) {
       Metadata *FlagOps[] = {DstOp->getOperand(0), ID, New};
-      MDNode *Flag = MDNode::get(DstM->getContext(), FlagOps);
+      MDNode *Flag = MDNode::get(DstM.getContext(), FlagOps);
       DstModFlags->setOperand(DstIndex, Flag);
       Flags[ID].first = Flag;
     };
@@ -1744,7 +1745,7 @@ bool ModuleLinker::linkModuleFlagsMetadata() {
       MDs.append(DstValue->op_begin(), DstValue->op_end());
       MDs.append(SrcValue->op_begin(), SrcValue->op_end());
 
-      replaceDstValue(MDNode::get(DstM->getContext(), MDs));
+      replaceDstValue(MDNode::get(DstM.getContext(), MDs));
       break;
     }
     case Module::AppendUnique: {
@@ -1754,7 +1755,7 @@ bool ModuleLinker::linkModuleFlagsMetadata() {
       Elts.insert(DstValue->op_begin(), DstValue->op_end());
       Elts.insert(SrcValue->op_begin(), SrcValue->op_end());
 
-      replaceDstValue(MDNode::get(DstM->getContext(),
+      replaceDstValue(MDNode::get(DstM.getContext(),
                                   makeArrayRef(Elts.begin(), Elts.end())));
       break;
     }
@@ -1833,51 +1834,47 @@ bool ModuleLinker::linkIfNeeded(GlobalValue &GV) {
 }
 
 bool ModuleLinker::run() {
-  assert(DstM && "Null destination module");
-  assert(SrcM && "Null source module");
-
   // Inherit the target data from the source module if the destination module
   // doesn't have one already.
-  if (DstM->getDataLayout().isDefault())
-    DstM->setDataLayout(SrcM->getDataLayout());
+  if (DstM.getDataLayout().isDefault())
+    DstM.setDataLayout(SrcM.getDataLayout());
 
-  if (SrcM->getDataLayout() != DstM->getDataLayout()) {
+  if (SrcM.getDataLayout() != DstM.getDataLayout()) {
     emitWarning("Linking two modules of different data layouts: '" +
-                SrcM->getModuleIdentifier() + "' is '" +
-                SrcM->getDataLayoutStr() + "' whereas '" +
-                DstM->getModuleIdentifier() + "' is '" +
-                DstM->getDataLayoutStr() + "'\n");
+                SrcM.getModuleIdentifier() + "' is '" +
+                SrcM.getDataLayoutStr() + "' whereas '" +
+                DstM.getModuleIdentifier() + "' is '" +
+                DstM.getDataLayoutStr() + "'\n");
   }
 
   // Copy the target triple from the source to dest if the dest's is empty.
-  if (DstM->getTargetTriple().empty() && !SrcM->getTargetTriple().empty())
-    DstM->setTargetTriple(SrcM->getTargetTriple());
+  if (DstM.getTargetTriple().empty() && !SrcM.getTargetTriple().empty())
+    DstM.setTargetTriple(SrcM.getTargetTriple());
 
-  Triple SrcTriple(SrcM->getTargetTriple()), DstTriple(DstM->getTargetTriple());
+  Triple SrcTriple(SrcM.getTargetTriple()), DstTriple(DstM.getTargetTriple());
 
-  if (!SrcM->getTargetTriple().empty() && !triplesMatch(SrcTriple, DstTriple))
+  if (!SrcM.getTargetTriple().empty() && !triplesMatch(SrcTriple, DstTriple))
     emitWarning("Linking two modules of different target triples: " +
-                SrcM->getModuleIdentifier() + "' is '" +
-                SrcM->getTargetTriple() + "' whereas '" +
-                DstM->getModuleIdentifier() + "' is '" +
-                DstM->getTargetTriple() + "'\n");
+                SrcM.getModuleIdentifier() + "' is '" + SrcM.getTargetTriple() +
+                "' whereas '" + DstM.getModuleIdentifier() + "' is '" +
+                DstM.getTargetTriple() + "'\n");
 
-  DstM->setTargetTriple(mergeTriples(SrcTriple, DstTriple));
+  DstM.setTargetTriple(mergeTriples(SrcTriple, DstTriple));
 
   // Append the module inline asm string.
-  if (!SrcM->getModuleInlineAsm().empty()) {
-    if (DstM->getModuleInlineAsm().empty())
-      DstM->setModuleInlineAsm(SrcM->getModuleInlineAsm());
+  if (!SrcM.getModuleInlineAsm().empty()) {
+    if (DstM.getModuleInlineAsm().empty())
+      DstM.setModuleInlineAsm(SrcM.getModuleInlineAsm());
     else
-      DstM->setModuleInlineAsm(DstM->getModuleInlineAsm()+"\n"+
-                               SrcM->getModuleInlineAsm());
+      DstM.setModuleInlineAsm(DstM.getModuleInlineAsm() + "\n" +
+                              SrcM.getModuleInlineAsm());
   }
 
   // Loop over all of the linked values to compute type mappings.
   computeTypeMapping();
 
   ComdatsChosen.clear();
-  for (const auto &SMEC : SrcM->getComdatSymbolTable()) {
+  for (const auto &SMEC : SrcM.getComdatSymbolTable()) {
     const Comdat &C = SMEC.getValue();
     if (ComdatsChosen.count(&C))
       continue;
@@ -1891,37 +1888,37 @@ bool ModuleLinker::run() {
   // Upgrade mismatched global arrays.
   upgradeMismatchedGlobals();
 
-  for (GlobalVariable &GV : SrcM->globals())
+  for (GlobalVariable &GV : SrcM.globals())
     if (const Comdat *SC = GV.getComdat())
       ComdatMembers[SC].push_back(&GV);
 
-  for (Function &SF : *SrcM)
+  for (Function &SF : SrcM)
     if (const Comdat *SC = SF.getComdat())
       ComdatMembers[SC].push_back(&SF);
 
-  for (GlobalAlias &GA : SrcM->aliases())
+  for (GlobalAlias &GA : SrcM.aliases())
     if (const Comdat *SC = GA.getComdat())
       ComdatMembers[SC].push_back(&GA);
 
   // Insert all of the globals in src into the DstM module... without linking
   // initializers (which could refer to functions not yet mapped over).
-  for (GlobalVariable &GV : SrcM->globals())
+  for (GlobalVariable &GV : SrcM.globals())
     if (linkIfNeeded(GV))
       return true;
 
-  for (Function &SF : *SrcM)
+  for (Function &SF : SrcM)
     if (linkIfNeeded(SF))
       return true;
 
-  for (GlobalAlias &GA : SrcM->aliases())
+  for (GlobalAlias &GA : SrcM.aliases())
     if (linkIfNeeded(GA))
       return true;
 
-  for (const auto &Entry : DstM->getComdatSymbolTable()) {
+  for (const auto &Entry : DstM.getComdatSymbolTable()) {
     const Comdat &C = Entry.getValue();
     if (C.getSelectionKind() == Comdat::Any)
       continue;
-    const GlobalValue *GV = SrcM->getNamedValue(C.getName());
+    const GlobalValue *GV = SrcM.getNamedValue(C.getName());
     if (GV)
       MapValue(GV, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer);
   }
@@ -2032,12 +2029,10 @@ bool Linker::IdentifiedStructTypeSet::hasType(StructType *Ty) {
   return *I == Ty;
 }
 
-Linker::Linker(Module *M, DiagnosticHandlerFunction DiagnosticHandler) {
-  this->Composite = M;
-  this->DiagnosticHandler = DiagnosticHandler;
-
+Linker::Linker(Module &M, DiagnosticHandlerFunction DiagnosticHandler)
+    : Composite(M), DiagnosticHandler(DiagnosticHandler) {
   TypeFinder StructTypes;
-  StructTypes.run(*M, true);
+  StructTypes.run(M, true);
   for (StructType *Ty : StructTypes) {
     if (Ty->isOpaque())
       IdentifiedStructTypes.addOpaque(Ty);
@@ -2046,18 +2041,18 @@ Linker::Linker(Module *M, DiagnosticHandlerFunction DiagnosticHandler) {
   }
 }
 
-Linker::Linker(Module *M)
+Linker::Linker(Module &M)
     : Linker(M, [this](const DiagnosticInfo &DI) {
-        Composite->getContext().diagnose(DI);
+        Composite.getContext().diagnose(DI);
       }) {}
 
-bool Linker::linkInModule(Module *Src, unsigned Flags,
+bool Linker::linkInModule(Module &Src, unsigned Flags,
                           const FunctionInfoIndex *Index,
                           Function *FuncToImport) {
   ModuleLinker TheLinker(Composite, IdentifiedStructTypes, Src,
                          DiagnosticHandler, Flags, Index, FuncToImport);
   bool RetCode = TheLinker.run();
-  Composite->dropTriviallyDeadConstantArrays();
+  Composite.dropTriviallyDeadConstantArrays();
   return RetCode;
 }
 
@@ -2070,14 +2065,14 @@ bool Linker::linkInModule(Module *Src, unsigned Flags,
 /// true is returned and ErrorMsg (if not null) is set to indicate the problem.
 /// Upon failure, the Dest module could be in a modified state, and shouldn't be
 /// relied on to be consistent.
-bool Linker::LinkModules(Module *Dest, Module *Src,
+bool Linker::linkModules(Module &Dest, Module &Src,
                          DiagnosticHandlerFunction DiagnosticHandler,
                          unsigned Flags) {
   Linker L(Dest, DiagnosticHandler);
   return L.linkInModule(Src, Flags);
 }
 
-bool Linker::LinkModules(Module *Dest, Module *Src, unsigned Flags) {
+bool Linker::linkModules(Module &Dest, Module &Src, unsigned Flags) {
   Linker L(Dest);
   return L.linkInModule(Src, Flags);
 }
@@ -2093,8 +2088,8 @@ LLVMBool LLVMLinkModules(LLVMModuleRef Dest, LLVMModuleRef Src,
   raw_string_ostream Stream(Message);
   DiagnosticPrinterRawOStream DP(Stream);
 
-  LLVMBool Result = Linker::LinkModules(
-      D, unwrap(Src), [&](const DiagnosticInfo &DI) { DI.print(DP); });
+  LLVMBool Result = Linker::linkModules(
+      *D, *unwrap(Src), [&](const DiagnosticInfo &DI) { DI.print(DP); });
 
   if (OutMessages && Result) {
     Stream.flush();
diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp
index 10bba1939270..444da3c55d2a 100644
--- a/lib/Transforms/IPO/FunctionImport.cpp
+++ b/lib/Transforms/IPO/FunctionImport.cpp
@@ -102,7 +102,7 @@ bool FunctionImporter::importFunctions(Module &M) {
   /// Second step: for every call to an external function, try to import it.
 
   // Linker that will be used for importing function
-  Linker L(&M, DiagnosticHandler);
+  Linker L(M, DiagnosticHandler);
 
   while (!Worklist.empty()) {
     auto CalledFunctionName = Worklist.pop_back_val();
@@ -182,7 +182,7 @@ bool FunctionImporter::importFunctions(Module &M) {
     }
 
     // Link in the specified function.
-    if (L.linkInModule(&Module, Linker::Flags::None, &Index, F))
+    if (L.linkInModule(Module, Linker::Flags::None, &Index, F))
       report_fatal_error("Function Import: link error");
 
     // Process the newly imported function and add callees to the worklist.
diff --git a/tools/bugpoint/BugDriver.cpp b/tools/bugpoint/BugDriver.cpp
index d0731f5f6d32..39887d5d59dc 100644
--- a/tools/bugpoint/BugDriver.cpp
+++ b/tools/bugpoint/BugDriver.cpp
@@ -132,7 +132,7 @@ bool BugDriver::addSources(const std::vector<std::string> &Filenames) {
     if (!M.get()) return true;
 
     outs() << "Linking in input file: '" << Filenames[i] << "'\n";
-    if (Linker::LinkModules(Program, M.get()))
+    if (Linker::linkModules(*Program, *M))
       return true;
   }
 
diff --git a/tools/bugpoint/Miscompilation.cpp b/tools/bugpoint/Miscompilation.cpp
index 0ca47e770411..e7eae40ec95a 100644
--- a/tools/bugpoint/Miscompilation.cpp
+++ b/tools/bugpoint/Miscompilation.cpp
@@ -222,7 +222,7 @@ static Module *TestMergedProgram(const BugDriver &BD, Module *M1, Module *M2,
     M1 = CloneModule(M1);
     M2 = CloneModule(M2);
   }
-  if (Linker::LinkModules(M1, M2))
+  if (Linker::linkModules(*M1, *M2))
     exit(1);
   delete M2;   // We are done with this module.
 
@@ -390,7 +390,7 @@ static bool ExtractLoops(BugDriver &BD,
         MisCompFunctions.emplace_back(F->getName(), F->getFunctionType());
       }
 
-      if (Linker::LinkModules(ToNotOptimize, ToOptimizeLoopExtracted))
+      if (Linker::linkModules(*ToNotOptimize, *ToOptimizeLoopExtracted))
         exit(1);
 
       MiscompiledFunctions.clear();
@@ -418,7 +418,7 @@ static bool ExtractLoops(BugDriver &BD,
     // extraction both didn't break the program, and didn't mask the problem.
     // Replace the current program with the loop extracted version, and try to
     // extract another loop.
-    if (Linker::LinkModules(ToNotOptimize, ToOptimizeLoopExtracted))
+    if (Linker::linkModules(*ToNotOptimize, *ToOptimizeLoopExtracted))
       exit(1);
 
     delete ToOptimizeLoopExtracted;
@@ -594,7 +594,7 @@ static bool ExtractBlocks(BugDriver &BD,
     if (!I->isDeclaration())
       MisCompFunctions.emplace_back(I->getName(), I->getFunctionType());
 
-  if (Linker::LinkModules(ProgClone, Extracted.get()))
+  if (Linker::linkModules(*ProgClone, *Extracted))
     exit(1);
 
   // Set the new program and delete the old one.
diff --git a/tools/gold/gold-plugin.cpp b/tools/gold/gold-plugin.cpp
index b8318be27ce0..1bd2f8afb290 100644
--- a/tools/gold/gold-plugin.cpp
+++ b/tools/gold/gold-plugin.cpp
@@ -938,7 +938,7 @@ static ld_plugin_status allSymbolsReadHook(raw_fd_ostream *ApiFile) {
   }
 
   std::unique_ptr<Module> Combined(new Module("ld-temp.o", Context));
-  Linker L(Combined.get());
+  Linker L(*Combined);
 
   std::string DefaultTriple = sys::getDefaultTargetTriple();
 
@@ -956,7 +956,7 @@ static ld_plugin_status allSymbolsReadHook(raw_fd_ostream *ApiFile) {
       M->setTargetTriple(DefaultTriple);
     }
 
-    if (L.linkInModule(M.get()))
+    if (L.linkInModule(*M))
       message(LDPL_FATAL, "Failed to link module");
     if (release_input_file(F.handle) != LDPS_OK)
       message(LDPL_FATAL, "Failed to release file information");
@@ -986,7 +986,7 @@ static ld_plugin_status allSymbolsReadHook(raw_fd_ostream *ApiFile) {
       path = output_name;
     else
       path = output_name + ".bc";
-    saveBCFile(path, *L.getModule());
+    saveBCFile(path, *Combined);
     if (options::TheOutputType == options::OT_BC_ONLY)
       return LDPS_OK;
   }
diff --git a/tools/llvm-link/llvm-link.cpp b/tools/llvm-link/llvm-link.cpp
index 984dfdcf70e5..2b63649cec2e 100644
--- a/tools/llvm-link/llvm-link.cpp
+++ b/tools/llvm-link/llvm-link.cpp
@@ -198,7 +198,7 @@ static bool importFunctions(const char *argv0, LLVMContext &Context,
     }
 
     // Link in the specified function.
-    if (L.linkInModule(M.get(), Linker::Flags::None, Index.get(), F))
+    if (L.linkInModule(*M, Linker::Flags::None, Index.get(), F))
       return false;
   }
   return true;
@@ -238,7 +238,7 @@ static bool linkFiles(const char *argv0, LLVMContext &Context, Linker &L,
     if (Verbose)
       errs() << "Linking in '" << File << "'\n";
 
-    if (L.linkInModule(M.get(), ApplicableFlags, Index.get()))
+    if (L.linkInModule(*M, ApplicableFlags, Index.get()))
       return false;
     // All linker flags apply to linking of subsequent files.
     ApplicableFlags = Flags;
@@ -266,7 +266,7 @@ int main(int argc, char **argv) {
   cl::ParseCommandLineOptions(argc, argv, "llvm linker\n");
 
   auto Composite = make_unique<Module>("llvm-link", Context);
-  Linker L(Composite.get(), diagnosticHandler);
+  Linker L(*Composite, diagnosticHandler);
 
   unsigned Flags = Linker::Flags::None;
   if (Internalize)
diff --git a/unittests/Linker/LinkModulesTest.cpp b/unittests/Linker/LinkModulesTest.cpp
index a1d889a6b3dd..4eba718e2663 100644
--- a/unittests/Linker/LinkModulesTest.cpp
+++ b/unittests/Linker/LinkModulesTest.cpp
@@ -93,7 +93,7 @@ TEST_F(LinkModuleTest, BlockAddress) {
   Builder.CreateRet(ConstantPointerNull::get(Type::getInt8PtrTy(Ctx)));
 
   Module *LinkedModule = new Module("MyModuleLinked", Ctx);
-  Linker::LinkModules(LinkedModule, M.get());
+  Linker::linkModules(*LinkedModule, *M);
 
   // Delete the original module.
   M.reset();
@@ -169,13 +169,13 @@ static Module *getInternal(LLVMContext &Ctx) {
 TEST_F(LinkModuleTest, EmptyModule) {
   std::unique_ptr<Module> InternalM(getInternal(Ctx));
   std::unique_ptr<Module> EmptyM(new Module("EmptyModule1", Ctx));
-  Linker::LinkModules(EmptyM.get(), InternalM.get());
+  Linker::linkModules(*EmptyM, *InternalM);
 }
 
 TEST_F(LinkModuleTest, EmptyModule2) {
   std::unique_ptr<Module> InternalM(getInternal(Ctx));
   std::unique_ptr<Module> EmptyM(new Module("EmptyModule1", Ctx));
-  Linker::LinkModules(InternalM.get(), EmptyM.get());
+  Linker::linkModules(*InternalM, *EmptyM);
 }
 
 TEST_F(LinkModuleTest, TypeMerge) {
@@ -190,7 +190,7 @@ TEST_F(LinkModuleTest, TypeMerge) {
                       "@t2 = weak global %t zeroinitializer\n";
   std::unique_ptr<Module> M2 = parseAssemblyString(M2Str, Err, C);
 
-  Linker::LinkModules(M1.get(), M2.get(), [](const llvm::DiagnosticInfo &){});
+  Linker::linkModules(*M1, *M2, [](const llvm::DiagnosticInfo &) {});
 
   EXPECT_EQ(M1->getNamedGlobal("t1")->getType(),
             M1->getNamedGlobal("t2")->getType());
@@ -267,8 +267,7 @@ TEST_F(LinkModuleTest, MoveDistinctMDs) {
   // Link into destination module.
   auto Dst = llvm::make_unique<Module>("Linked", C);
   ASSERT_TRUE(Dst.get());
-  Linker::LinkModules(Dst.get(), Src.get(),
-                      [](const llvm::DiagnosticInfo &) {});
+  Linker::linkModules(*Dst, *Src, [](const llvm::DiagnosticInfo &) {});
 
   // Check that distinct metadata was moved, not cloned.  Even !4, the uniqued
   // node, should effectively be moved, since its only operand hasn't changed.

From dc53fde2a4db54e23bbda116cf66e9557ed2487f Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 1 Dec 2015 19:57:17 +0000
Subject: [PATCH 101/186] AMDGPU: Optimize VOP2 operand legalization

Don't use commuteInstruction, and don't commute if
doing so will not improve legality. Skip the more
complex checks for literal operands and constant bus restrictions,
which are not a concern for VOP2 instructions because src1
does not accept SGPRs or constants and few implicitly
read vcc.

This gets called quite a few times and the
attempts at commuting are a significant fraction
of the time spent in SIFixSGPRCopies, so it's
somewhat worthwhile to optimize. With this patch and others
leading up to it, this reduces the compile time of SIFixSGPRCopies
on some of the LuxMark 2 kernels from ~8ms to ~5ms on my system.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254452 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/SIInstrInfo.cpp  | 170 +++++++++++++++++++++--------
 lib/Target/AMDGPU/SIInstrInfo.h    |  17 +++
 lib/Target/AMDGPU/SIRegisterInfo.h |   7 ++
 3 files changed, 149 insertions(+), 45 deletions(-)

diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index e1668649139d..a3a2d8c01eb5 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -871,20 +871,26 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI,
 
   MachineOperand &Src1 = MI->getOperand(Src1Idx);
 
-  // Make sure it's legal to commute operands for VOP2.
-  if (isVOP2(*MI) &&
-      (!isOperandLegal(MI, Src0Idx, &Src1) ||
-       !isOperandLegal(MI, Src1Idx, &Src0))) {
-    return nullptr;
+
+  if (isVOP2(*MI)) {
+    const MCInstrDesc &InstrDesc = MI->getDesc();
+    // For VOP2 instructions, any operand type is valid to use for src0.  Make
+    // sure we can use the src1 as src0.
+    //
+    // We could be stricter here and only allow commuting if there is a reason
+    // to do so. i.e. if both operands are VGPRs there is no real benefit,
+    // although MachineCSE attempts to find matches by commuting.
+    const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+    if (!isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0))
+      return nullptr;
   }
 
   if (!Src1.isReg()) {
     // Allow commuting instructions with Imm operands.
     if (NewMI || !Src1.isImm() ||
-       (!isVOP2(*MI) && !isVOP3(*MI))) {
+        (!isVOP2(*MI) && !isVOP3(*MI))) {
       return nullptr;
     }
-
     // Be sure to copy the source modifiers to the right place.
     if (MachineOperand *Src0Mods
           = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
@@ -1720,6 +1726,41 @@ void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const {
   Inst->addOperand(Op1);
 }
 
+bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
+                                    const MCOperandInfo &OpInfo,
+                                    const MachineOperand &MO) const {
+  if (!MO.isReg())
+    return false;
+
+  unsigned Reg = MO.getReg();
+  const TargetRegisterClass *RC =
+    TargetRegisterInfo::isVirtualRegister(Reg) ?
+    MRI.getRegClass(Reg) :
+    RI.getPhysRegClass(Reg);
+
+  // In order to be legal, the common sub-class must be equal to the
+  // class of the current operand.  For example:
+  //
+  // v_mov_b32 s0 ; Operand defined as vsrc_32
+  //              ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL
+  //
+  // s_sendmsg 0, s0 ; Operand defined as m0reg
+  //                 ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
+
+  return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
+}
+
+bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
+                                     const MCOperandInfo &OpInfo,
+                                     const MachineOperand &MO) const {
+  if (MO.isReg())
+    return isLegalRegOperand(MRI, OpInfo, MO);
+
+  // Handle non-register types that are treated like immediates.
+  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
+  return true;
+}
+
 bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
                                  const MachineOperand *MO) const {
   const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
@@ -1747,21 +1788,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
 
   if (MO->isReg()) {
     assert(DefinedRC);
-    const TargetRegisterClass *RC =
-        TargetRegisterInfo::isVirtualRegister(MO->getReg()) ?
-            MRI.getRegClass(MO->getReg()) :
-            RI.getPhysRegClass(MO->getReg());
-
-    // In order to be legal, the common sub-class must be equal to the
-    // class of the current operand.  For example:
-    //
-    // v_mov_b32 s0 ; Operand defined as vsrc_32
-    //              ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL
-    //
-    // s_sendmsg 0, s0 ; Operand defined as m0reg
-    //                 ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
-
-    return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
+    return isLegalRegOperand(MRI, OpInfo, *MO);
   }
 
 
@@ -1776,6 +1803,81 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
   return isImmOperandLegal(MI, OpIdx, *MO);
 }
 
+void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
+                                       MachineInstr *MI) const {
+  unsigned Opc = MI->getOpcode();
+  const MCInstrDesc &InstrDesc = get(Opc);
+
+  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+  MachineOperand &Src1 = MI->getOperand(Src1Idx);
+
+  // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
+  // we need to only have one constant bus use.
+  //
+  // Note we do not need to worry about literal constants here. They are
+  // disabled for the operand type for instructions because they will always
+  // violate the one constant bus use rule.
+  bool HasImplicitSGPR = findImplicitSGPRRead(*MI) != AMDGPU::NoRegister;
+  if (HasImplicitSGPR) {
+    int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+    MachineOperand &Src0 = MI->getOperand(Src0Idx);
+
+    if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
+      legalizeOpWithMove(MI, Src0Idx);
+  }
+
+  // VOP2 src0 instructions support all operand types, so we don't need to check
+  // their legality. If src1 is already legal, we don't need to do anything.
+  if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
+    return;
+
+  // We do not use commuteInstruction here because it is too aggressive and will
+  // commute if it is possible. We only want to commute here if it improves
+  // legality. This can be called a fairly large number of times so don't waste
+  // compile time pointlessly swapping and checking legality again.
+  if (HasImplicitSGPR || !MI->isCommutable()) {
+    legalizeOpWithMove(MI, Src1Idx);
+    return;
+  }
+
+  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+  MachineOperand &Src0 = MI->getOperand(Src0Idx);
+
+  // If src0 can be used as src1, commuting will make the operands legal.
+  // Otherwise we have to give up and insert a move.
+  //
+  // TODO: Other immediate-like operand kinds could be commuted if there was a
+  // MachineOperand::ChangeTo* for them.
+  if ((!Src1.isImm() && !Src1.isReg()) ||
+      !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
+    legalizeOpWithMove(MI, Src1Idx);
+    return;
+  }
+
+  int CommutedOpc = commuteOpcode(*MI);
+  if (CommutedOpc == -1) {
+    legalizeOpWithMove(MI, Src1Idx);
+    return;
+  }
+
+  MI->setDesc(get(CommutedOpc));
+
+  unsigned Src0Reg = Src0.getReg();
+  unsigned Src0SubReg = Src0.getSubReg();
+  bool Src0Kill = Src0.isKill();
+
+  if (Src1.isImm())
+    Src0.ChangeToImmediate(Src1.getImm());
+  else if (Src1.isReg()) {
+    Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
+    Src0.setSubReg(Src1.getSubReg());
+  } else
+    llvm_unreachable("Should only have register or immediate operands");
+
+  Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
+  Src1.setSubReg(Src0SubReg);
+}
+
 // Legalize VOP3 operands. Because all operand types are supported for any
 // operand, and since literal constants are not allowed and should never be
 // seen, we only need to worry about inserting copies if we use multiple SGPR
@@ -1821,32 +1923,10 @@ void SIInstrInfo::legalizeOperandsVOP3(
 
 void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
   MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
-  unsigned Opc = MI->getOpcode();
 
   // Legalize VOP2
   if (isVOP2(*MI)) {
-    int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
-    int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
-
-    // Legalize src0
-    if (!isOperandLegal(MI, Src0Idx))
-      legalizeOpWithMove(MI, Src0Idx);
-
-    // Legalize src1
-    if (isOperandLegal(MI, Src1Idx))
-      return;
-
-    // Usually src0 of VOP2 instructions allow more types of inputs
-    // than src1, so try to commute the instruction to decrease our
-    // chances of having to insert a MOV instruction to legalize src1.
-    if (MI->isCommutable()) {
-      if (commuteInstruction(MI))
-        // If we are successful in commuting, then we know MI is legal, so
-        // we are done.
-        return;
-    }
-
-    legalizeOpWithMove(MI, Src1Idx);
+    legalizeOperandsVOP2(MRI, MI);
     return;
   }
 
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index 2bce87f3bd06..8d18d29196f7 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -381,6 +381,23 @@ class SIInstrInfo : public AMDGPUInstrInfo {
   bool isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
                       const MachineOperand *MO = nullptr) const;
 
+  /// \brief Check if \p MO would be a valid operand for the given operand
+  /// definition \p OpInfo. Note this does not attempt to validate constant bus
+  /// restrictions (e.g. literal constant usage).
+  bool isLegalVSrcOperand(const MachineRegisterInfo &MRI,
+                          const MCOperandInfo &OpInfo,
+                          const MachineOperand &MO) const;
+
+  /// \brief Check if \p MO (a register operand) is a legal register for the
+  /// given operand description.
+  bool isLegalRegOperand(const MachineRegisterInfo &MRI,
+                         const MCOperandInfo &OpInfo,
+                         const MachineOperand &MO) const;
+
+  /// \brief Legalize operands in \p MI by either commuting it or inserting a
+  /// copy of src1.
+  void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr *MI) const;
+
   /// \brief Fix operands in \p MI to satisfy constant bus requirements.
   void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr *MI) const;
 
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h
index eafe4053e878..1795237c2140 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -18,6 +18,7 @@
 
 #include "AMDGPURegisterInfo.h"
 #include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Support/Debug.h"
 
 namespace llvm {
@@ -65,6 +66,12 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
     return isSGPRClass(getRegClass(RCID));
   }
 
+  bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const {
+    if (TargetRegisterInfo::isVirtualRegister(Reg))
+      return isSGPRClass(MRI.getRegClass(Reg));
+    return getPhysRegClass(Reg);
+  }
+
   /// \returns true if this class contains VGPR registers.
   bool hasVGPRs(const TargetRegisterClass *RC) const;
 

From 116b08e39566251bc1a980f68e5f3107f24a4b24 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 1 Dec 2015 19:57:43 +0000
Subject: [PATCH 102/186] use range-based for loops; NFCI

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254453 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/TwoAddressInstructionPass.cpp | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index eb45f1ee31bd..c6bae2434586 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -869,8 +869,7 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,
   SmallSet<unsigned, 2> Uses;
   SmallSet<unsigned, 2> Kills;
   SmallSet<unsigned, 2> Defs;
-  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-    const MachineOperand &MO = MI->getOperand(i);
+  for (const MachineOperand &MO : MI->operands()) {
     if (!MO.isReg())
       continue;
     unsigned MOReg = MO.getReg();
@@ -912,8 +911,7 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,
         OtherMI->isBranch() || OtherMI->isTerminator())
       // Don't move pass calls, etc.
       return false;
-    for (unsigned i = 0, e = OtherMI->getNumOperands(); i != e; ++i) {
-      const MachineOperand &MO = OtherMI->getOperand(i);
+    for (const MachineOperand &MO : OtherMI->operands()) {
       if (!MO.isReg())
         continue;
       unsigned MOReg = MO.getReg();
@@ -1052,8 +1050,7 @@ rescheduleKillAboveMI(MachineBasicBlock::iterator &mi,
   SmallSet<unsigned, 2> Kills;
   SmallSet<unsigned, 2> Defs;
   SmallSet<unsigned, 2> LiveDefs;
-  for (unsigned i = 0, e = KillMI->getNumOperands(); i != e; ++i) {
-    const MachineOperand &MO = KillMI->getOperand(i);
+  for (const MachineOperand &MO : KillMI->operands()) {
     if (!MO.isReg())
       continue;
     unsigned MOReg = MO.getReg();
@@ -1091,8 +1088,7 @@ rescheduleKillAboveMI(MachineBasicBlock::iterator &mi,
       // Don't move pass calls, etc.
       return false;
     SmallVector<unsigned, 2> OtherDefs;
-    for (unsigned i = 0, e = OtherMI->getNumOperands(); i != e; ++i) {
-      const MachineOperand &MO = OtherMI->getOperand(i);
+    for (const MachineOperand &MO : OtherMI->operands()) {
       if (!MO.isReg())
         continue;
       unsigned MOReg = MO.getReg();
@@ -1564,8 +1560,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI,
   if (AllUsesCopied) {
     if (!IsEarlyClobber) {
       // Replace other (un-tied) uses of regB with LastCopiedReg.
-      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-        MachineOperand &MO = MI->getOperand(i);
+      for (MachineOperand &MO : MI->operands()) {
         if (MO.isReg() && MO.getReg() == RegB && MO.getSubReg() == SubRegB &&
             MO.isUse()) {
           if (MO.isKill()) {
@@ -1602,8 +1597,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI,
     // regB is still used in this instruction, but a kill flag was
     // removed from a different tied use of regB, so now we need to add
     // a kill flag to one of the remaining uses of regB.
-    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-      MachineOperand &MO = MI->getOperand(i);
+    for (MachineOperand &MO : MI->operands()) {
       if (MO.isReg() && MO.getReg() == RegB && MO.isUse()) {
         MO.setIsKill(true);
         break;

From e6b26fa8adb7f932fabb9880b7a84b2ab1370b8c Mon Sep 17 00:00:00 2001
From: Rafael Espindola <rafael.espindola@gmail.com>
Date: Tue, 1 Dec 2015 20:11:43 +0000
Subject: [PATCH 103/186] clang-format LinkModules.cpp.

Most of the file has been changed recently and was already clang-format
clean.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254454 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Linker/LinkModules.cpp | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index c2f472c7e33f..602d012d9df1 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -34,7 +34,6 @@
 #include <tuple>
 using namespace llvm;
 
-
 //===----------------------------------------------------------------------===//
 // TypeMap implementation.
 //===----------------------------------------------------------------------===//
@@ -42,22 +41,22 @@ using namespace llvm;
 namespace {
 class TypeMapTy : public ValueMapTypeRemapper {
   /// This is a mapping from a source type to a destination type to use.
-  DenseMap<Type*, Type*> MappedTypes;
+  DenseMap<Type *, Type *> MappedTypes;
 
   /// When checking to see if two subgraphs are isomorphic, we speculatively
   /// add types to MappedTypes, but keep track of them here in case we need to
   /// roll back.
-  SmallVector<Type*, 16> SpeculativeTypes;
+  SmallVector<Type *, 16> SpeculativeTypes;
 
-  SmallVector<StructType*, 16> SpeculativeDstOpaqueTypes;
+  SmallVector<StructType *, 16> SpeculativeDstOpaqueTypes;
 
   /// This is a list of non-opaque structs in the source module that are mapped
   /// to an opaque struct in the destination module.
-  SmallVector<StructType*, 16> SrcDefinitionsToResolve;
+  SmallVector<StructType *, 16> SrcDefinitionsToResolve;
 
   /// This is the set of opaque types in the destination modules who are
   /// getting a body from the source module.
-  SmallPtrSet<StructType*, 16> DstResolvedOpaqueTypes;
+  SmallPtrSet<StructType *, 16> DstResolvedOpaqueTypes;
 
 public:
   TypeMapTy(Linker::IdentifiedStructTypeSet &DstStructTypesSet)
@@ -179,7 +178,7 @@ bool TypeMapTy::areTypesIsomorphic(Type *DstTy, Type *SrcTy) {
 
   // Fail if any of the extra properties (e.g. array size) of the type disagree.
   if (isa<IntegerType>(DstTy))
-    return false;  // bitwidth disagrees.
+    return false; // bitwidth disagrees.
   if (PointerType *PT = dyn_cast<PointerType>(DstTy)) {
     if (PT->getAddressSpace() != cast<PointerType>(SrcTy)->getAddressSpace())
       return false;
@@ -215,7 +214,7 @@ bool TypeMapTy::areTypesIsomorphic(Type *DstTy, Type *SrcTy) {
 }
 
 void TypeMapTy::linkDefinedTypeBodies() {
-  SmallVector<Type*, 16> Elements;
+  SmallVector<Type *, 16> Elements;
   for (StructType *SrcSTy : SrcDefinitionsToResolve) {
     StructType *DstSTy = cast<StructType>(MappedTypes[SrcSTy]);
     assert(DstSTy->isOpaque());
@@ -596,10 +595,10 @@ static void forceRenaming(GlobalValue *GV, StringRef Name) {
   // If there is a conflict, rename the conflict.
   if (GlobalValue *ConflictGV = M->getNamedValue(Name)) {
     GV->takeName(ConflictGV);
-    ConflictGV->setName(Name);    // This will cause ConflictGV to get renamed
+    ConflictGV->setName(Name); // This will cause ConflictGV to get renamed
     assert(ConflictGV->getName() != Name && "forceRenaming didn't work");
   } else {
-    GV->setName(Name);              // Force the name back
+    GV->setName(Name); // Force the name back
   }
 }
 
@@ -1534,7 +1533,7 @@ bool ModuleLinker::linkFunctionBody(Function &Dst, Function &Src) {
   // Go through and convert function arguments over, remembering the mapping.
   Function::arg_iterator DI = Dst.arg_begin();
   for (Argument &Arg : Src.args()) {
-    DI->setName(Arg.getName());  // Copy the name over.
+    DI->setName(Arg.getName()); // Copy the name over.
 
     // Add a mapping to our mapping.
     ValueMap[&Arg] = &*DI;
@@ -1625,7 +1624,8 @@ void ModuleLinker::linkNamedMDNodes() {
 bool ModuleLinker::linkModuleFlagsMetadata() {
   // If the source module has no module flags, we are done.
   const NamedMDNode *SrcModFlags = SrcM.getModuleFlagsMetadata();
-  if (!SrcModFlags) return false;
+  if (!SrcModFlags)
+    return false;
 
   // If the destination module doesn't have module flags yet, then just copy
   // over the source module's flags.
@@ -1639,7 +1639,7 @@ bool ModuleLinker::linkModuleFlagsMetadata() {
 
   // First build a map of the existing module flags and requirements.
   DenseMap<MDString *, std::pair<MDNode *, unsigned>> Flags;
-  SmallSetVector<MDNode*, 16> Requirements;
+  SmallSetVector<MDNode *, 16> Requirements;
   for (unsigned I = 0, E = DstModFlags->getNumOperands(); I != E; ++I) {
     MDNode *Op = DstModFlags->getOperand(I);
     ConstantInt *Behavior = mdconst::extract<ConstantInt>(Op->getOperand(0));
@@ -1720,7 +1720,8 @@ bool ModuleLinker::linkModuleFlagsMetadata() {
     // Perform the merge for standard behavior types.
     switch (SrcBehaviorValue) {
     case Module::Require:
-    case Module::Override: llvm_unreachable("not possible");
+    case Module::Override:
+      llvm_unreachable("not possible");
     case Module::Error: {
       // Emit an error if the values differ.
       if (SrcOp->getOperand(2) != DstOp->getOperand(2)) {
@@ -1783,16 +1784,15 @@ bool ModuleLinker::linkModuleFlagsMetadata() {
 static bool triplesMatch(const Triple &T0, const Triple &T1) {
   // If vendor is apple, ignore the version number.
   if (T0.getVendor() == Triple::Apple)
-    return T0.getArch() == T1.getArch() &&
-           T0.getSubArch() == T1.getSubArch() &&
-           T0.getVendor() == T1.getVendor() &&
-           T0.getOS() == T1.getOS();
+    return T0.getArch() == T1.getArch() && T0.getSubArch() == T1.getSubArch() &&
+           T0.getVendor() == T1.getVendor() && T0.getOS() == T1.getOS();
 
   return T0 == T1;
 }
 
 // This function returns the merged triple.
-static std::string mergeTriples(const Triple &SrcTriple, const Triple &DstTriple) {
+static std::string mergeTriples(const Triple &SrcTriple,
+                                const Triple &DstTriple) {
   // If vendor is apple, pick the triple with the larger version number.
   if (SrcTriple.getVendor() == Triple::Apple)
     if (DstTriple.isOSVersionLT(SrcTriple))

From fc16153e104c53dcfd54e79eee6549820a8a12f4 Mon Sep 17 00:00:00 2001
From: Justin Bogner <mail@justinbogner.com>
Date: Tue, 1 Dec 2015 20:20:49 +0000
Subject: [PATCH 104/186] IR: Clean up some duplicated code in
 ConstantDataSequential creation. NFC

ConstantDataArray::getImpl and ConstantDataVector::getImpl had a lot
of copy pasta in how they handled sequences of constants. Break that
out into a couple of simple functions.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254456 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/IR/Constants.cpp | 193 +++++++++++++------------------------------
 1 file changed, 57 insertions(+), 136 deletions(-)

diff --git a/lib/IR/Constants.cpp b/lib/IR/Constants.cpp
index d79fb3e7b0ba..b4a07a1b6b4a 100644
--- a/lib/IR/Constants.cpp
+++ b/lib/IR/Constants.cpp
@@ -857,6 +857,57 @@ static bool rangeOnlyContains(ItTy Start, ItTy End, EltTy Elt) {
   return true;
 }
 
+template <typename SequentialTy, typename ElementTy>
+static Constant *getIntSequenceIfElementsMatch(ArrayRef<Constant *> V) {
+  assert(!V.empty() && "Cannot get empty int sequence.");
+
+  SmallVector<ElementTy, 16> Elts;
+  for (Constant *C : V)
+    if (auto *CI = dyn_cast<ConstantInt>(C))
+      Elts.push_back(CI->getZExtValue());
+    else
+      return nullptr;
+  return SequentialTy::get(V[0]->getContext(), Elts);
+}
+
+template <typename SequentialTy, typename ElementTy>
+static Constant *getFPSequenceIfElementsMatch(ArrayRef<Constant *> V) {
+  assert(!V.empty() && "Cannot get empty FP sequence.");
+
+  SmallVector<ElementTy, 16> Elts;
+  for (Constant *C : V)
+    if (auto *CFP = dyn_cast<ConstantFP>(C))
+      Elts.push_back(CFP->getValueAPF().bitcastToAPInt().getLimitedValue());
+    else
+      return nullptr;
+  return SequentialTy::getFP(V[0]->getContext(), Elts);
+}
+
+template <typename SequenceTy>
+static Constant *getSequenceIfElementsMatch(Constant *C,
+                                            ArrayRef<Constant *> V) {
+  // We speculatively build the elements here even if it turns out that there is
+  // a constantexpr or something else weird, since it is so uncommon for that to
+  // happen.
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(C)) {
+    if (CI->getType()->isIntegerTy(8))
+      return getIntSequenceIfElementsMatch<SequenceTy, uint8_t>(V);
+    else if (CI->getType()->isIntegerTy(16))
+      return getIntSequenceIfElementsMatch<SequenceTy, uint16_t>(V);
+    else if (CI->getType()->isIntegerTy(32))
+      return getIntSequenceIfElementsMatch<SequenceTy, uint32_t>(V);
+    else if (CI->getType()->isIntegerTy(64))
+      return getIntSequenceIfElementsMatch<SequenceTy, uint64_t>(V);
+  } else if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
+    if (CFP->getType()->isFloatTy())
+      return getFPSequenceIfElementsMatch<SequenceTy, uint32_t>(V);
+    else if (CFP->getType()->isDoubleTy())
+      return getFPSequenceIfElementsMatch<SequenceTy, uint64_t>(V);
+  }
+
+  return nullptr;
+}
+
 ConstantArray::ConstantArray(ArrayType *T, ArrayRef<Constant *> V)
   : Constant(T, ConstantArrayVal,
              OperandTraits<ConstantArray>::op_end(this) - V.size(),
@@ -874,6 +925,7 @@ Constant *ConstantArray::get(ArrayType *Ty, ArrayRef<Constant*> V) {
     return C;
   return Ty->getContext().pImpl->ArrayConstants.getOrCreate(Ty, V);
 }
+
 Constant *ConstantArray::getImpl(ArrayType *Ty, ArrayRef<Constant*> V) {
   // Empty arrays are canonicalized to ConstantAggregateZero.
   if (V.empty())
@@ -896,74 +948,8 @@ Constant *ConstantArray::getImpl(ArrayType *Ty, ArrayRef<Constant*> V) {
 
   // Check to see if all of the elements are ConstantFP or ConstantInt and if
   // the element type is compatible with ConstantDataVector.  If so, use it.
-  if (ConstantDataSequential::isElementTypeCompatible(C->getType())) {
-    // We speculatively build the elements here even if it turns out that there
-    // is a constantexpr or something else weird in the array, since it is so
-    // uncommon for that to happen.
-    if (ConstantInt *CI = dyn_cast<ConstantInt>(C)) {
-      if (CI->getType()->isIntegerTy(8)) {
-        SmallVector<uint8_t, 16> Elts;
-        for (unsigned i = 0, e = V.size(); i != e; ++i)
-          if (ConstantInt *CI = dyn_cast<ConstantInt>(V[i]))
-            Elts.push_back(CI->getZExtValue());
-          else
-            break;
-        if (Elts.size() == V.size())
-          return ConstantDataArray::get(C->getContext(), Elts);
-      } else if (CI->getType()->isIntegerTy(16)) {
-        SmallVector<uint16_t, 16> Elts;
-        for (unsigned i = 0, e = V.size(); i != e; ++i)
-          if (ConstantInt *CI = dyn_cast<ConstantInt>(V[i]))
-            Elts.push_back(CI->getZExtValue());
-          else
-            break;
-        if (Elts.size() == V.size())
-          return ConstantDataArray::get(C->getContext(), Elts);
-      } else if (CI->getType()->isIntegerTy(32)) {
-        SmallVector<uint32_t, 16> Elts;
-        for (unsigned i = 0, e = V.size(); i != e; ++i)
-          if (ConstantInt *CI = dyn_cast<ConstantInt>(V[i]))
-            Elts.push_back(CI->getZExtValue());
-          else
-            break;
-        if (Elts.size() == V.size())
-          return ConstantDataArray::get(C->getContext(), Elts);
-      } else if (CI->getType()->isIntegerTy(64)) {
-        SmallVector<uint64_t, 16> Elts;
-        for (unsigned i = 0, e = V.size(); i != e; ++i)
-          if (ConstantInt *CI = dyn_cast<ConstantInt>(V[i]))
-            Elts.push_back(CI->getZExtValue());
-          else
-            break;
-        if (Elts.size() == V.size())
-          return ConstantDataArray::get(C->getContext(), Elts);
-      }
-    }
-
-    if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
-      if (CFP->getType()->isFloatTy()) {
-        SmallVector<uint32_t, 16> Elts;
-        for (unsigned i = 0, e = V.size(); i != e; ++i)
-          if (ConstantFP *CFP = dyn_cast<ConstantFP>(V[i]))
-            Elts.push_back(
-                CFP->getValueAPF().bitcastToAPInt().getLimitedValue());
-          else
-            break;
-        if (Elts.size() == V.size())
-          return ConstantDataArray::getFP(C->getContext(), Elts);
-      } else if (CFP->getType()->isDoubleTy()) {
-        SmallVector<uint64_t, 16> Elts;
-        for (unsigned i = 0, e = V.size(); i != e; ++i)
-          if (ConstantFP *CFP = dyn_cast<ConstantFP>(V[i]))
-            Elts.push_back(
-                CFP->getValueAPF().bitcastToAPInt().getLimitedValue());
-          else
-            break;
-        if (Elts.size() == V.size())
-          return ConstantDataArray::getFP(C->getContext(), Elts);
-      }
-    }
-  }
+  if (ConstantDataSequential::isElementTypeCompatible(C->getType()))
+    return getSequenceIfElementsMatch<ConstantDataArray>(C, V);
 
   // Otherwise, we really do want to create a ConstantArray.
   return nullptr;
@@ -1059,6 +1045,7 @@ Constant *ConstantVector::get(ArrayRef<Constant*> V) {
   VectorType *Ty = VectorType::get(V.front()->getType(), V.size());
   return Ty->getContext().pImpl->VectorConstants.getOrCreate(Ty, V);
 }
+
 Constant *ConstantVector::getImpl(ArrayRef<Constant*> V) {
   assert(!V.empty() && "Vectors can't be empty");
   VectorType *T = VectorType::get(V.front()->getType(), V.size());
@@ -1084,74 +1071,8 @@ Constant *ConstantVector::getImpl(ArrayRef<Constant*> V) {
 
   // Check to see if all of the elements are ConstantFP or ConstantInt and if
   // the element type is compatible with ConstantDataVector.  If so, use it.
-  if (ConstantDataSequential::isElementTypeCompatible(C->getType())) {
-    // We speculatively build the elements here even if it turns out that there
-    // is a constantexpr or something else weird in the array, since it is so
-    // uncommon for that to happen.
-    if (ConstantInt *CI = dyn_cast<ConstantInt>(C)) {
-      if (CI->getType()->isIntegerTy(8)) {
-        SmallVector<uint8_t, 16> Elts;
-        for (unsigned i = 0, e = V.size(); i != e; ++i)
-          if (ConstantInt *CI = dyn_cast<ConstantInt>(V[i]))
-            Elts.push_back(CI->getZExtValue());
-          else
-            break;
-        if (Elts.size() == V.size())
-          return ConstantDataVector::get(C->getContext(), Elts);
-      } else if (CI->getType()->isIntegerTy(16)) {
-        SmallVector<uint16_t, 16> Elts;
-        for (unsigned i = 0, e = V.size(); i != e; ++i)
-          if (ConstantInt *CI = dyn_cast<ConstantInt>(V[i]))
-            Elts.push_back(CI->getZExtValue());
-          else
-            break;
-        if (Elts.size() == V.size())
-          return ConstantDataVector::get(C->getContext(), Elts);
-      } else if (CI->getType()->isIntegerTy(32)) {
-        SmallVector<uint32_t, 16> Elts;
-        for (unsigned i = 0, e = V.size(); i != e; ++i)
-          if (ConstantInt *CI = dyn_cast<ConstantInt>(V[i]))
-            Elts.push_back(CI->getZExtValue());
-          else
-            break;
-        if (Elts.size() == V.size())
-          return ConstantDataVector::get(C->getContext(), Elts);
-      } else if (CI->getType()->isIntegerTy(64)) {
-        SmallVector<uint64_t, 16> Elts;
-        for (unsigned i = 0, e = V.size(); i != e; ++i)
-          if (ConstantInt *CI = dyn_cast<ConstantInt>(V[i]))
-            Elts.push_back(CI->getZExtValue());
-          else
-            break;
-        if (Elts.size() == V.size())
-          return ConstantDataVector::get(C->getContext(), Elts);
-      }
-    }
-
-    if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
-      if (CFP->getType()->isFloatTy()) {
-        SmallVector<uint32_t, 16> Elts;
-        for (unsigned i = 0, e = V.size(); i != e; ++i)
-          if (ConstantFP *CFP = dyn_cast<ConstantFP>(V[i]))
-            Elts.push_back(
-                CFP->getValueAPF().bitcastToAPInt().getLimitedValue());
-          else
-            break;
-        if (Elts.size() == V.size())
-          return ConstantDataVector::getFP(C->getContext(), Elts);
-      } else if (CFP->getType()->isDoubleTy()) {
-        SmallVector<uint64_t, 16> Elts;
-        for (unsigned i = 0, e = V.size(); i != e; ++i)
-          if (ConstantFP *CFP = dyn_cast<ConstantFP>(V[i]))
-            Elts.push_back(
-                CFP->getValueAPF().bitcastToAPInt().getLimitedValue());
-          else
-            break;
-        if (Elts.size() == V.size())
-          return ConstantDataVector::getFP(C->getContext(), Elts);
-      }
-    }
-  }
+  if (ConstantDataSequential::isElementTypeCompatible(C->getType()))
+    return getSequenceIfElementsMatch<ConstantDataVector>(C, V);
 
   // Otherwise, the element type isn't compatible with ConstantDataVector, or
   // the operand list constants a ConstantExpr or something else strange.

From 146c16ca4dd4d3d06dcd78efb95906268fc67509 Mon Sep 17 00:00:00 2001
From: Rafael Espindola <rafael.espindola@gmail.com>
Date: Tue, 1 Dec 2015 20:23:19 +0000
Subject: [PATCH 105/186] Delete unused includes.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254457 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Linker/LinkModules.cpp | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index 602d012d9df1..3c6d6f067c4e 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -13,25 +13,16 @@
 
 #include "llvm/Linker/Linker.h"
 #include "llvm-c/Linker.h"
-#include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/DiagnosticPrinter.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/TypeFinder.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/Cloning.h"
-#include <cctype>
-#include <tuple>
 using namespace llvm;
 
 //===----------------------------------------------------------------------===//

From d89494bf0312145ad18320eab794b89b300f64e7 Mon Sep 17 00:00:00 2001
From: Xinliang David Li <davidxl@google.com>
Date: Tue, 1 Dec 2015 20:26:26 +0000
Subject: [PATCH 106/186] [PGO] Add support for reading multiple versions of
 indexed profile format profile data

Profile readers using incompatible on-disk hash table format can now share the same
implementation and interfaces.

Differential Revision: http://reviews.llvm.org/D15100


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254458 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ProfileData/InstrProfReader.h | 64 +++++++++++++---------
 include/llvm/Support/OnDiskHashTable.h     |  7 ++-
 lib/ProfileData/InstrProfReader.cpp        | 49 +++++++++--------
 3 files changed, 70 insertions(+), 50 deletions(-)

diff --git a/include/llvm/ProfileData/InstrProfReader.h b/include/llvm/ProfileData/InstrProfReader.h
index e0a93cec3208..49233366e164 100644
--- a/include/llvm/ProfileData/InstrProfReader.h
+++ b/include/llvm/ProfileData/InstrProfReader.h
@@ -259,36 +259,50 @@ class InstrProfLookupTrait {
   }
 };
 
-class InstrProfReaderIndex {
- private:
-  typedef OnDiskIterableChainedHashTable<InstrProfLookupTrait> IndexType;
+struct InstrProfReaderIndexBase {
+  // Read all the profile records with the same key pointed to the current
+  // iterator.
+  virtual std::error_code getRecords(ArrayRef<InstrProfRecord> &Data) = 0;
+  // Read all the profile records with the key equal to FuncName
+  virtual std::error_code getRecords(StringRef FuncName,
+                                     ArrayRef<InstrProfRecord> &Data) = 0;
+  virtual void advanceToNextKey() = 0;
+  virtual bool atEnd() const = 0;
+  virtual void setValueProfDataEndianness(support::endianness Endianness) = 0;
+  virtual ~InstrProfReaderIndexBase() {}
+};
+
+typedef OnDiskIterableChainedHashTable<InstrProfLookupTrait>
+    OnDiskHashTableImplV3;
 
-  std::unique_ptr<IndexType> Index;
-  IndexType::data_iterator RecordIterator;
+template <typename HashTableImpl>
+class InstrProfReaderIndex : public InstrProfReaderIndexBase {
+
+private:
+  std::unique_ptr<HashTableImpl> HashTable;
+  typename HashTableImpl::data_iterator RecordIterator;
   uint64_t FormatVersion;
 
   // String table for holding a unique copy of all the strings in the profile.
   InstrProfStringTable StringTable;
 
- public:
-  InstrProfReaderIndex() : Index(nullptr) {}
-  void Init(const unsigned char *Buckets, const unsigned char *const Payload,
-            const unsigned char *const Base, IndexedInstrProf::HashT HashType,
-            uint64_t Version);
+public:
+  InstrProfReaderIndex(const unsigned char *Buckets,
+                       const unsigned char *const Payload,
+                       const unsigned char *const Base,
+                       IndexedInstrProf::HashT HashType, uint64_t Version);
 
-  // Read all the pofile records with the same key pointed to the current
-  // iterator.
-  std::error_code getRecords(ArrayRef<InstrProfRecord> &Data);
-  // Read all the profile records with the key equal to FuncName
+  std::error_code getRecords(ArrayRef<InstrProfRecord> &Data) override;
   std::error_code getRecords(StringRef FuncName,
-                             ArrayRef<InstrProfRecord> &Data);
-
-  void advanceToNextKey() { RecordIterator++; }
-  bool atEnd() const { return RecordIterator == Index->data_end(); }
-  // Used for testing purpose only.
-  void setValueProfDataEndianness(support::endianness Endianness) {
-    Index->getInfoObj().setValueProfDataEndianness(Endianness);
+                             ArrayRef<InstrProfRecord> &Data) override;
+  void advanceToNextKey() override { RecordIterator++; }
+  bool atEnd() const override {
+    return RecordIterator == HashTable->data_end();
+  }
+  void setValueProfDataEndianness(support::endianness Endianness) override {
+    HashTable->getInfoObj().setValueProfDataEndianness(Endianness);
   }
+  ~InstrProfReaderIndex() override {}
 };
 
 /// Reader for the indexed binary instrprof format.
@@ -297,16 +311,16 @@ class IndexedInstrProfReader : public InstrProfReader {
   /// The profile data file contents.
   std::unique_ptr<MemoryBuffer> DataBuffer;
   /// The index into the profile data.
-  InstrProfReaderIndex Index;
+  std::unique_ptr<InstrProfReaderIndexBase> Index;
   /// The maximal execution count among all functions.
   uint64_t MaxFunctionCount;
 
   IndexedInstrProfReader(const IndexedInstrProfReader &) = delete;
   IndexedInstrProfReader &operator=(const IndexedInstrProfReader &) = delete;
 
- public:
+public:
   IndexedInstrProfReader(std::unique_ptr<MemoryBuffer> DataBuffer)
-      : DataBuffer(std::move(DataBuffer)), Index() {}
+      : DataBuffer(std::move(DataBuffer)), Index(nullptr) {}
 
   /// Return true if the given buffer is in an indexed instrprof format.
   static bool hasFormat(const MemoryBuffer &DataBuffer);
@@ -337,7 +351,7 @@ class IndexedInstrProfReader : public InstrProfReader {
 
   // Used for testing purpose only.
   void setValueProfDataEndianness(support::endianness Endianness) {
-    Index.setValueProfDataEndianness(Endianness);
+    Index->setValueProfDataEndianness(Endianness);
   }
 };
 
diff --git a/include/llvm/Support/OnDiskHashTable.h b/include/llvm/Support/OnDiskHashTable.h
index c47134f46c8d..ac978d4c242c 100644
--- a/include/llvm/Support/OnDiskHashTable.h
+++ b/include/llvm/Support/OnDiskHashTable.h
@@ -263,11 +263,12 @@ template <typename Info> class OnDiskChainedHashTable {
   Info InfoObj;
 
 public:
+  typedef Info InfoType;
   typedef typename Info::internal_key_type internal_key_type;
   typedef typename Info::external_key_type external_key_type;
-  typedef typename Info::data_type         data_type;
-  typedef typename Info::hash_value_type   hash_value_type;
-  typedef typename Info::offset_type       offset_type;
+  typedef typename Info::data_type data_type;
+  typedef typename Info::hash_value_type hash_value_type;
+  typedef typename Info::offset_type offset_type;
 
   OnDiskChainedHashTable(offset_type NumBuckets, offset_type NumEntries,
                          const unsigned char *Buckets,
diff --git a/lib/ProfileData/InstrProfReader.cpp b/lib/ProfileData/InstrProfReader.cpp
index e4348b19ac02..cfc968739806 100644
--- a/lib/ProfileData/InstrProfReader.cpp
+++ b/lib/ProfileData/InstrProfReader.cpp
@@ -441,11 +441,11 @@ data_type InstrProfLookupTrait::ReadData(StringRef K, const unsigned char *D,
   return DataBuffer;
 }
 
-std::error_code
-InstrProfReaderIndex::getRecords(StringRef FuncName,
-                                 ArrayRef<InstrProfRecord> &Data) {
-  auto Iter = Index->find(FuncName);
-  if (Iter == Index->end())
+template <typename HashTableImpl>
+std::error_code InstrProfReaderIndex<HashTableImpl>::getRecords(
+    StringRef FuncName, ArrayRef<InstrProfRecord> &Data) {
+  auto Iter = HashTable->find(FuncName);
+  if (Iter == HashTable->end())
     return instrprof_error::unknown_function;
 
   Data = (*Iter);
@@ -455,9 +455,11 @@ InstrProfReaderIndex::getRecords(StringRef FuncName,
   return instrprof_error::success;
 }
 
-std::error_code InstrProfReaderIndex::getRecords(
+template <typename HashTableImpl>
+std::error_code InstrProfReaderIndex<HashTableImpl>::getRecords(
     ArrayRef<InstrProfRecord> &Data) {
-  if (atEnd()) return instrprof_error::eof;
+  if (atEnd())
+    return instrprof_error::eof;
 
   Data = *RecordIterator;
 
@@ -466,25 +468,26 @@ std::error_code InstrProfReaderIndex::getRecords(
   return instrprof_error::success;
 }
 
-void InstrProfReaderIndex::Init(const unsigned char *Buckets,
-                                const unsigned char *const Payload,
-                                const unsigned char *const Base,
-                                IndexedInstrProf::HashT HashType,
-                                uint64_t Version) {
+template <typename HashTableImpl>
+InstrProfReaderIndex<HashTableImpl>::InstrProfReaderIndex(
+    const unsigned char *Buckets, const unsigned char *const Payload,
+    const unsigned char *const Base, IndexedInstrProf::HashT HashType,
+    uint64_t Version) {
   FormatVersion = Version;
-  Index.reset(IndexType::Create(Buckets, Payload, Base,
-                                InstrProfLookupTrait(HashType, Version)));
+  HashTable.reset(HashTableImpl::Create(
+      Buckets, Payload, Base,
+      typename HashTableImpl::InfoType(HashType, Version)));
   // Form the map of hash values to const char* keys in profiling data.
   std::vector<std::pair<uint64_t, const char *>> HashKeys;
-  for (auto Key : Index->keys()) {
+  for (auto Key : HashTable->keys()) {
     const char *KeyTableRef = StringTable.insertString(Key);
     HashKeys.push_back(std::make_pair(ComputeHash(HashType, Key), KeyTableRef));
   }
   std::sort(HashKeys.begin(), HashKeys.end(), less_first());
   HashKeys.erase(std::unique(HashKeys.begin(), HashKeys.end()), HashKeys.end());
   // Set the hash key map for the InstrLookupTrait
-  Index->getInfoObj().setHashKeys(std::move(HashKeys));
-  RecordIterator = Index->data_begin();
+  HashTable->getInfoObj().setHashKeys(std::move(HashKeys));
+  RecordIterator = HashTable->data_begin();
 }
 
 bool IndexedInstrProfReader::hasFormat(const MemoryBuffer &DataBuffer) {
@@ -532,8 +535,10 @@ std::error_code IndexedInstrProfReader::readHeader() {
   uint64_t HashOffset = endian::byte_swap<uint64_t, little>(Header->HashOffset);
 
   // The rest of the file is an on disk hash table.
-  Index.Init(Start + HashOffset, Cur, Start, HashType, FormatVersion);
-
+  InstrProfReaderIndexBase *IndexPtr = nullptr;
+  IndexPtr = new InstrProfReaderIndex<OnDiskHashTableImplV3>(
+      Start + HashOffset, Cur, Start, HashType, FormatVersion);
+  Index.reset(IndexPtr);
   return success();
 }
 
@@ -541,7 +546,7 @@ ErrorOr<InstrProfRecord>
 IndexedInstrProfReader::getInstrProfRecord(StringRef FuncName,
                                            uint64_t FuncHash) {
   ArrayRef<InstrProfRecord> Data;
-  std::error_code EC = Index.getRecords(FuncName, Data);
+  std::error_code EC = Index->getRecords(FuncName, Data);
   if (EC != instrprof_error::success)
     return EC;
   // Found it. Look for counters with the right hash.
@@ -571,13 +576,13 @@ std::error_code IndexedInstrProfReader::readNextRecord(
 
   ArrayRef<InstrProfRecord> Data;
 
-  std::error_code EC = Index.getRecords(Data);
+  std::error_code EC = Index->getRecords(Data);
   if (EC != instrprof_error::success)
     return error(EC);
 
   Record = Data[RecordIndex++];
   if (RecordIndex >= Data.size()) {
-    Index.advanceToNextKey();
+    Index->advanceToNextKey();
     RecordIndex = 0;
   }
   return success();

From 418d565e54fae3f339290787e25216011e894b0b Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 1 Dec 2015 20:31:08 +0000
Subject: [PATCH 107/186] AMDGPU: Disallow flat_scr in SI assembler

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254459 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp      | 27 ++++++++++++++++--
 test/MC/AMDGPU/flat-scratch.s                 | 28 +++++++++++++++++++
 2 files changed, 52 insertions(+), 3 deletions(-)
 create mode 100644 test/MC/AMDGPU/flat-scratch.s

diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index db5cebf6e42c..184cf0f65819 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -332,6 +332,14 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
 
   unsigned ForcedEncodingSize;
 
+  bool isSI() const {
+    return STI->getFeatureBits()[AMDGPU::FeatureSouthernIslands];
+  }
+
+  bool isCI() const {
+    return STI->getFeatureBits()[AMDGPU::FeatureSeaIslands];
+  }
+
   bool isVI() const {
     return getSTI().getFeatureBits()[AMDGPU::FeatureVolcanicIslands];
   }
@@ -504,12 +512,14 @@ bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &End
   const AsmToken Tok = Parser.getTok();
   StartLoc = Tok.getLoc();
   EndLoc = Tok.getEndLoc();
+  const MCRegisterInfo *TRI = getContext().getRegisterInfo();
+
   StringRef RegName = Tok.getString();
   RegNo = getRegForName(RegName);
 
   if (RegNo) {
     Parser.Lex();
-    return false;
+    return !subtargetHasRegister(*TRI, RegNo);
   }
 
   // Match vgprs and sgprs
@@ -562,7 +572,6 @@ bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &End
     }
   }
 
-  const MCRegisterInfo *TRI = getContext().getRegisterInfo();
   int RCID = getRegClass(IsVgpr, RegWidth);
   if (RCID == -1)
     return true;
@@ -980,9 +989,21 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
 
 bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
                                            unsigned RegNo) const {
-  if (!isVI())
+  if (isCI())
     return true;
 
+  if (isSI()) {
+    // No flat_scr
+    switch (RegNo) {
+    case AMDGPU::FLAT_SCR:
+    case AMDGPU::FLAT_SCR_LO:
+    case AMDGPU::FLAT_SCR_HI:
+      return false;
+    default:
+      return true;
+    }
+  }
+
   // VI only has 102 SGPRs, so make sure we aren't trying to use the 2 more that
   // SI/CI have.
   for (MCRegAliasIterator R(AMDGPU::SGPR102_SGPR103, &MRI, true);
diff --git a/test/MC/AMDGPU/flat-scratch.s b/test/MC/AMDGPU/flat-scratch.s
new file mode 100644
index 000000000000..e68f67f01516
--- /dev/null
+++ b/test/MC/AMDGPU/flat-scratch.s
@@ -0,0 +1,28 @@
+// RUN: not llvm-mc -arch=amdgcn -mcpu=tahiti %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=SI %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=hawaii %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=CI %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+s_mov_b64 flat_scratch, -1
+// SI: error: invalid operand for instruction
+// CI-NOT: error
+// VI-NOT: error
+
+s_mov_b32 flat_scratch_lo, -1
+// SI: error: invalid operand for instruction
+// CI-NOT: error
+// VI-NOT: error
+
+s_mov_b32 flat_scratch_hi, -1
+// SI: error: invalid operand for instruction
+// CI-NOT: error
+// VI-NOT: error
+
+
+s_mov_b64 flat_scratch_lo, -1
+// GCN: error: invalid operand for instruction
+
+s_mov_b64 flat_scratch_hi, -1
+// GCN: error: invalid operand for instruction
+
+s_mov_b32 flat_scratch, -1
+// GCN: error: invalid operand for instruction

From a46d791b1d6433e359764642e4545fab7a10e2cb Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Tue, 1 Dec 2015 21:48:52 +0000
Subject: [PATCH 108/186] ARM: Change ArchCheck field to uint64_t

The values in this field are compared against getAvailableFeatures()
which returns an uint64_t. This was causing problems in an internal
branch.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254462 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 8341fbc4efd1..73f330877566 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -9916,7 +9916,7 @@ extern "C" void LLVMInitializeARMAsmParser() {
 // flags below, that were generated by table-gen.
 static const struct {
   const unsigned Kind;
-  const unsigned ArchCheck;
+  const uint64_t ArchCheck;
   const FeatureBitset Features;
 } Extensions[] = {
   { ARM::AEK_CRC, Feature_HasV8, {ARM::FeatureCRC} },

From d715c5a2cfd096fc5ac18e81a4b7cdfc55cc4f0f Mon Sep 17 00:00:00 2001
From: Cong Hou <congh@google.com>
Date: Tue, 1 Dec 2015 21:50:20 +0000
Subject: [PATCH 109/186] Fix a bug in IfConversion.cpp.

The bug is introduced in r254377 which failed some tests on ARM, where a new
probability is assigned to a successor but the provided BB may not be a
successor.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254463 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/IfConversion.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp
index ff28f95cc33d..e90cb02bd280 100644
--- a/lib/CodeGen/IfConversion.cpp
+++ b/lib/CodeGen/IfConversion.cpp
@@ -1254,9 +1254,8 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
     auto NewNext = BBNext + BBCvt * CvtNext;
     auto NewTrueBBIter =
         std::find(BBI.BB->succ_begin(), BBI.BB->succ_end(), NewTrueBB);
-    assert(NewTrueBBIter != BBI.BB->succ_end() &&
-           "NewTrueBB is not a successor of BBI.BB.");
-    BBI.BB->setSuccProbability(NewTrueBBIter, NewNext);
+    if (NewTrueBBIter != BBI.BB->succ_end())
+      BBI.BB->setSuccProbability(NewTrueBBIter, NewNext);
 
     auto NewFalse = BBCvt * CvtFalse;
     TII->InsertBranch(*BBI.BB, CvtBBI->FalseBB, nullptr, RevCond, dl);

From 427ad06f8e54f93f47c17fdbc43ae613474d642a Mon Sep 17 00:00:00 2001
From: Rafael Espindola <rafael.espindola@gmail.com>
Date: Tue, 1 Dec 2015 22:40:40 +0000
Subject: [PATCH 110/186] Pass down the dst GV to linkGlobalValueBody. NFC.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254465 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Linker/LinkModules.cpp | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index 3c6d6f067c4e..271c765d63c8 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -527,7 +527,7 @@ class ModuleLinker {
   void linkGlobalInit(GlobalVariable &Dst, GlobalVariable &Src);
   bool linkFunctionBody(Function &Dst, Function &Src);
   void linkAliasBody(GlobalAlias &Dst, GlobalAlias &Src);
-  bool linkGlobalValueBody(GlobalValue &Src);
+  bool linkGlobalValueBody(GlobalValue &Dst, GlobalValue &Src);
 
   /// Functions that take care of cloning a specific global value type
   /// into the destination module.
@@ -924,7 +924,7 @@ void ModuleLinker::materializeInitFor(GlobalValue *New, GlobalValue *Old) {
   if (!New->hasLocalLinkage() && DoNotLinkFromSource.count(Old))
     return;
 
-  linkGlobalValueBody(*Old);
+  linkGlobalValueBody(*New, *Old);
 }
 
 bool ModuleLinker::getComdatLeader(Module &M, StringRef ComdatName,
@@ -1566,9 +1566,7 @@ void ModuleLinker::linkAliasBody(GlobalAlias &Dst, GlobalAlias &Src) {
   Dst.setAliasee(Val);
 }
 
-bool ModuleLinker::linkGlobalValueBody(GlobalValue &Src) {
-  Value *Dst = ValueMap[&Src];
-  assert(Dst);
+bool ModuleLinker::linkGlobalValueBody(GlobalValue &Dst, GlobalValue &Src) {
   if (const Comdat *SC = Src.getComdat()) {
     // To ensure that we don't generate an incomplete comdat group,
     // we must materialize and map in any other members that are not
@@ -1583,15 +1581,15 @@ bool ModuleLinker::linkGlobalValueBody(GlobalValue &Src) {
     }
   }
   if (shouldInternalizeLinkedSymbols())
-    if (auto *DGV = dyn_cast<GlobalValue>(Dst))
+    if (auto *DGV = dyn_cast<GlobalValue>(&Dst))
       DGV->setLinkage(GlobalValue::InternalLinkage);
   if (auto *F = dyn_cast<Function>(&Src))
-    return linkFunctionBody(cast<Function>(*Dst), *F);
+    return linkFunctionBody(cast<Function>(Dst), *F);
   if (auto *GVar = dyn_cast<GlobalVariable>(&Src)) {
-    linkGlobalInit(cast<GlobalVariable>(*Dst), *GVar);
+    linkGlobalInit(cast<GlobalVariable>(Dst), *GVar);
     return false;
   }
-  linkAliasBody(cast<GlobalAlias>(*Dst), cast<GlobalAlias>(Src));
+  linkAliasBody(cast<GlobalAlias>(Dst), cast<GlobalAlias>(Src));
   return false;
 }
 

From 97a345e35acedef05160f5578dec11bc06f19ff5 Mon Sep 17 00:00:00 2001
From: Rafael Espindola <rafael.espindola@gmail.com>
Date: Tue, 1 Dec 2015 23:01:51 +0000
Subject: [PATCH 111/186] Remove unnecessary getter.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254466 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Linker/LinkModules.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index 271c765d63c8..94010221846c 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -459,9 +459,6 @@ class ModuleLinker {
   /// Check if we should promote the given local value to global scope.
   bool doPromoteLocalToGlobal(const GlobalValue *SGV);
 
-  /// Check if all global value body linking is complete.
-  bool doneLinkingBodies() { return DoneLinkingBodies; }
-
   bool shouldLinkFromSource(bool &LinkFromSrc, const GlobalValue &Dest,
                             const GlobalValue &Src);
 
@@ -886,7 +883,7 @@ Value *ModuleLinker::materializeDeclFor(Value *V) {
   // If we are done linking global value bodies (i.e. we are performing
   // metadata linking), don't link in the global value due to this
   // reference, simply map it to null.
-  if (doneLinkingBodies())
+  if (DoneLinkingBodies)
     return nullptr;
 
   linkGlobalValueProto(SGV);

From 8f43193167c58861ee8c2a89031af0463828e1bf Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 1 Dec 2015 23:04:00 +0000
Subject: [PATCH 112/186] AMDGPU: Implement isNoopAddrSpaceCast

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254468 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/SIISelLowering.cpp          | 11 ++++
 lib/Target/AMDGPU/SIISelLowering.h            |  2 +
 .../AMDGPU/cgp-addressing-modes-flat.ll       | 66 +++++++++++++++++++
 3 files changed, 79 insertions(+)

diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index ab93bceb96ec..2cb801a707e1 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -492,6 +492,17 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
   return MVT::Other;
 }
 
+static bool isFlatGlobalAddrSpace(unsigned AS) {
+  return AS == AMDGPUAS::GLOBAL_ADDRESS ||
+    AS == AMDGPUAS::FLAT_ADDRESS ||
+    AS == AMDGPUAS::CONSTANT_ADDRESS;
+}
+
+bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
+                                           unsigned DestAS) const {
+  return isFlatGlobalAddrSpace(SrcAS) &&  isFlatGlobalAddrSpace(DestAS);
+}
+
 TargetLoweringBase::LegalizeTypeAction
 SITargetLowering::getPreferredVectorAction(EVT VT) const {
   if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index 0659dd7d5d05..b9f75cd11de0 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -80,6 +80,8 @@ class SITargetLowering : public AMDGPUTargetLowering {
                           bool MemcpyStrSrc,
                           MachineFunction &MF) const override;
 
+  bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
+
   TargetLoweringBase::LegalizeTypeAction
   getPreferredVectorAction(EVT VT) const override;
 
diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
index 4d70ba837816..1c5bed3b905f 100644
--- a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
+++ b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
@@ -30,3 +30,69 @@ endif:
 done:
   ret void
 }
+
+; OPT-LABEL: @test_sink_noop_addrspacecast_flat_to_global_i32(
+; OPT: getelementptr i32, i32 addrspace(4)* %out,
+; OPT-CI-NOT: getelementptr
+; OPT: br i1
+
+; OPT-CI: ptrtoint
+; OPT-CI: add
+; OPT-CI: inttoptr
+; OPT: br label
+
+; GCN-LABEL: {{^}}test_sink_noop_addrspacecast_flat_to_global_i32:
+; CI: buffer_load_dword {{v[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
+define void @test_sink_noop_addrspacecast_flat_to_global_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
+entry:
+  %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999
+  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
+  %cast = addrspacecast i32 addrspace(4)* %in.gep to i32 addrspace(1)*
+  %tmp0 = icmp eq i32 %cond, 0
+  br i1 %tmp0, label %endif, label %if
+
+if:
+  %tmp1 = load i32, i32 addrspace(1)* %cast
+  br label %endif
+
+endif:
+  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
+  store i32 %x, i32 addrspace(4)* %out.gep
+  br label %done
+
+done:
+  ret void
+}
+
+; OPT-LABEL: @test_sink_noop_addrspacecast_flat_to_constant_i32(
+; OPT: getelementptr i32, i32 addrspace(4)* %out,
+; OPT-CI-NOT: getelementptr
+; OPT: br i1
+
+; OPT-CI: ptrtoint
+; OPT-CI: add
+; OPT-CI: inttoptr
+; OPT: br label
+
+; GCN-LABEL: {{^}}test_sink_noop_addrspacecast_flat_to_constant_i32:
+; CI: s_load_dword {{s[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
+define void @test_sink_noop_addrspacecast_flat_to_constant_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
+entry:
+  %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999
+  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
+  %cast = addrspacecast i32 addrspace(4)* %in.gep to i32 addrspace(2)*
+  %tmp0 = icmp eq i32 %cond, 0
+  br i1 %tmp0, label %endif, label %if
+
+if:
+  %tmp1 = load i32, i32 addrspace(2)* %cast
+  br label %endif
+
+endif:
+  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
+  store i32 %x, i32 addrspace(4)* %out.gep
+  br label %done
+
+done:
+  ret void
+}

From 465e0cdebbe3ccb6f8301422bf8a175303c1a01d Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 1 Dec 2015 23:04:05 +0000
Subject: [PATCH 113/186] AMDGPU: Error on addrspacecasts that aren't actually
 implemented

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254469 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp  | 11 ++--
 test/CodeGen/AMDGPU/addrspacecast.ll      | 66 +++++++++++++++++++++++
 test/CodeGen/AMDGPU/flat-address-space.ll | 52 ------------------
 3 files changed, 73 insertions(+), 56 deletions(-)
 create mode 100644 test/CodeGen/AMDGPU/addrspacecast.ll

diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 710c6771b171..ea7c6429b7df 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -11,6 +11,8 @@
 /// \brief Defines an instruction selector for the AMDGPU target.
 //
 //===----------------------------------------------------------------------===//
+
+#include "AMDGPUDiagnosticInfoUnsupported.h"
 #include "AMDGPUInstrInfo.h"
 #include "AMDGPUISelLowering.h" // For AMDGPUISD
 #include "AMDGPURegisterInfo.h"
@@ -1208,13 +1210,14 @@ SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
   AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(N);
   SDLoc DL(N);
 
+  const MachineFunction &MF = CurDAG->getMachineFunction();
+  DiagnosticInfoUnsupported NotImplemented(*MF.getFunction(),
+                                           "addrspacecast not implemented");
+  CurDAG->getContext()->diagnose(NotImplemented);
+
   assert(Subtarget->hasFlatAddressSpace() &&
          "addrspacecast only supported with flat address space!");
 
-  assert((ASC->getSrcAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
-          ASC->getDestAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) &&
-         "Cannot cast address space to / from constant address!");
-
   assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
           ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) &&
          "Can only cast to / from flat address space!");
diff --git a/test/CodeGen/AMDGPU/addrspacecast.ll b/test/CodeGen/AMDGPU/addrspacecast.ll
new file mode 100644
index 000000000000..9be212feef00
--- /dev/null
+++ b/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -0,0 +1,66 @@
+; RUN: not llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+
+; ERROR: unsupported addrspacecast not implemented
+
+; XUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
+; XUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
+; XUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
+; XUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
+
+; Disable optimizations in case there are optimizations added that
+; specialize away generic pointer accesses.
+
+; CHECK-LABEL: {{^}}branch_use_flat_i32:
+; CHECK: flat_store_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
+; CHECK: s_endpgm
+define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
+entry:
+  %cmp = icmp ne i32 %c, 0
+  br i1 %cmp, label %local, label %global
+
+local:
+  %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32 addrspace(4)*
+  br label %end
+
+global:
+  %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
+  br label %end
+
+end:
+  %fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ]
+  store i32 %x, i32 addrspace(4)* %fptr, align 4
+;  %val = load i32, i32 addrspace(4)* %fptr, align 4
+;  store i32 %val, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; TODO: This should not be zero when registers are used for small
+; scratch allocations again.
+
+; Check for prologue initializing special SGPRs pointing to scratch.
+; CHECK-LABEL: {{^}}store_flat_scratch:
+; CHECK: s_movk_i32 flat_scratch_lo, 0
+; CHECK-NO-PROMOTE: s_movk_i32 flat_scratch_hi, 0x28{{$}}
+; CHECK-PROMOTE: s_movk_i32 flat_scratch_hi, 0x0{{$}}
+; CHECK: flat_store_dword
+; CHECK: s_barrier
+; CHECK: flat_load_dword
+define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
+  %alloca = alloca i32, i32 9, align 4
+  %x = call i32 @llvm.r600.read.tidig.x() #3
+  %pptr = getelementptr i32, i32* %alloca, i32 %x
+  %fptr = addrspacecast i32* %pptr to i32 addrspace(4)*
+  store i32 %x, i32 addrspace(4)* %fptr
+  ; Dummy call
+  call void @llvm.AMDGPU.barrier.local() #1
+  %reload = load i32, i32 addrspace(4)* %fptr, align 4
+  store i32 %reload, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+declare void @llvm.AMDGPU.barrier.local() #1
+declare i32 @llvm.r600.read.tidig.x() #3
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind noduplicate }
+attributes #3 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/flat-address-space.ll b/test/CodeGen/AMDGPU/flat-address-space.ll
index 571685ca6aeb..4b56d6f19832 100644
--- a/test/CodeGen/AMDGPU/flat-address-space.ll
+++ b/test/CodeGen/AMDGPU/flat-address-space.ll
@@ -7,32 +7,6 @@
 ; specialize away generic pointer accesses.
 
 
-; CHECK-LABEL: {{^}}branch_use_flat_i32:
-; CHECK: flat_store_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
-; CHECK: s_endpgm
-define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
-entry:
-  %cmp = icmp ne i32 %c, 0
-  br i1 %cmp, label %local, label %global
-
-local:
-  %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32 addrspace(4)*
-  br label %end
-
-global:
-  %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
-  br label %end
-
-end:
-  %fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ]
-  store i32 %x, i32 addrspace(4)* %fptr, align 4
-;  %val = load i32, i32 addrspace(4)* %fptr, align 4
-;  store i32 %val, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-
-
 ; These testcases might become useless when there are optimizations to
 ; remove generic pointers.
 
@@ -150,32 +124,6 @@ define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)*
   ret void
 }
 
-
-
-; TODO: This should not be zero when registers are used for small
-; scratch allocations again.
-
-; Check for prologue initializing special SGPRs pointing to scratch.
-; CHECK-LABEL: {{^}}store_flat_scratch:
-; CHECK: s_movk_i32 flat_scratch_lo, 0
-; CHECK-NO-PROMOTE: s_movk_i32 flat_scratch_hi, 0x28{{$}}
-; CHECK-PROMOTE: s_movk_i32 flat_scratch_hi, 0x0{{$}}
-; CHECK: flat_store_dword
-; CHECK: s_barrier
-; CHECK: flat_load_dword
-define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
-  %alloca = alloca i32, i32 9, align 4
-  %x = call i32 @llvm.r600.read.tidig.x() #3
-  %pptr = getelementptr i32, i32* %alloca, i32 %x
-  %fptr = addrspacecast i32* %pptr to i32 addrspace(4)*
-  store i32 %x, i32 addrspace(4)* %fptr
-  ; Dummy call
-  call void @llvm.AMDGPU.barrier.local() #1
-  %reload = load i32, i32 addrspace(4)* %fptr, align 4
-  store i32 %reload, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
 declare void @llvm.AMDGPU.barrier.local() #1
 declare i32 @llvm.r600.read.tidig.x() #3
 

From b0e1e32b73001e71fc702febc0066bd8d0fb07ce Mon Sep 17 00:00:00 2001
From: Xinliang David Li <davidxl@google.com>
Date: Tue, 1 Dec 2015 23:05:27 +0000
Subject: [PATCH 114/186] Define member operator delete

For the struct with trailing objects, define
a member operator delete. Without this, the program
will fail when -fsized-deallocation option is used
where the wrong size will be passed to the global
delete operator.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254471 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ProfileData/InstrProfData.inc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/llvm/ProfileData/InstrProfData.inc b/include/llvm/ProfileData/InstrProfData.inc
index 8ff7003a0b2d..a2f0a8ae684b 100644
--- a/include/llvm/ProfileData/InstrProfData.inc
+++ b/include/llvm/ProfileData/InstrProfData.inc
@@ -291,6 +291,7 @@ typedef struct ValueProfData {
    */
   void deserializeTo(InstrProfRecord &Record,
                      InstrProfRecord::ValueMapType *VMap);
+  void operator delete(void *ptr) { ::operator delete(ptr); }
 #endif
 } ValueProfData;
 

From c322221121b04c570046a4c7c5f9100b163cef06 Mon Sep 17 00:00:00 2001
From: Rafael Espindola <rafael.espindola@gmail.com>
Date: Tue, 1 Dec 2015 23:06:26 +0000
Subject: [PATCH 115/186] Use default member initializers.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254473 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Linker/LinkModules.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index 94010221846c..39ab5db8d6bd 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -412,12 +412,12 @@ class ModuleLinker {
   /// from this source module, in which case we must conservatively assume
   /// that any of its functions may be imported into another module
   /// as part of a different backend compilation process.
-  bool HasExportedFunctions;
+  bool HasExportedFunctions = false;
 
   /// Set to true when all global value body linking is complete (including
   /// lazy linking). Used to prevent metadata linking from creating new
   /// references.
-  bool DoneLinkingBodies;
+  bool DoneLinkingBodies = false;
 
   bool HasError = false;
 
@@ -428,8 +428,7 @@ class ModuleLinker {
                Function *FuncToImport = nullptr)
       : DstM(DstM), SrcM(SrcM), TypeMap(Set), ValMaterializer(this),
         DiagnosticHandler(DiagnosticHandler), Flags(Flags), ImportIndex(Index),
-        ImportFunction(FuncToImport), HasExportedFunctions(false),
-        DoneLinkingBodies(false) {
+        ImportFunction(FuncToImport) {
     assert((ImportIndex || !ImportFunction) &&
            "Expect a FunctionInfoIndex when importing");
     // If we have a FunctionInfoIndex but no function to import,

From 6837bcf91183ed877d02ecd1d78a4a56c957a1f9 Mon Sep 17 00:00:00 2001
From: Tim Northover <tnorthover@apple.com>
Date: Wed, 2 Dec 2015 00:33:54 +0000
Subject: [PATCH 116/186] AArch64: fix 128-bit shifts

We mustn't introduce a shift of exactly 64-bits for any inputs, since that's an
UNDEF value (and worse, it's not what you want with the natural Arch64
implementation).

The generated code is pretty horrific, but I couldn't come up with an obviously
better alternative (if the amount is constant EXTR could help). Turns out
128-bit shifts are just nasty.

rdar://22491037

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254475 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64ISelLowering.cpp | 87 ++++++++++++++--------
 test/CodeGen/AArch64/arm64-long-shift.ll   | 80 +++++++++++---------
 2 files changed, 97 insertions(+), 70 deletions(-)

diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2232e419a619..f0fb03451b2a 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4380,46 +4380,57 @@ SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
   SDValue ShOpLo = Op.getOperand(0);
   SDValue ShOpHi = Op.getOperand(1);
   SDValue ShAmt = Op.getOperand(2);
-  SDValue ARMcc;
   unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
 
   assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
 
   SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
                                  DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
-  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
+  SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
+
+  // Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which
+  // is "undef". We wanted 0, so CSEL it directly.
+  SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
+                               ISD::SETEQ, dl, DAG);
+  SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
+  HiBitsForLo =
+      DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
+                  HiBitsForLo, CCVal, Cmp);
+
   SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
                                    DAG.getConstant(VTBits, dl, MVT::i64));
-  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
 
-  SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64),
-                               ISD::SETGE, dl, DAG);
-  SDValue CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
+  SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
+  SDValue LoForNormalShift =
+      DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo);
 
-  SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
-  SDValue TrueValLo = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
-  SDValue Lo =
-      DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp);
+  Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
+                       dl, DAG);
+  CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
+  SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
+  SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
+                           LoForNormalShift, CCVal, Cmp);
 
   // AArch64 shifts larger than the register width are wrapped rather than
   // clamped, so we can't just emit "hi >> x".
-  SDValue FalseValHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
-  SDValue TrueValHi = Opc == ISD::SRA
-                          ? DAG.getNode(Opc, dl, VT, ShOpHi,
-                                        DAG.getConstant(VTBits - 1, dl,
-                                                        MVT::i64))
-                          : DAG.getConstant(0, dl, VT);
-  SDValue Hi =
-      DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp);
+  SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+  SDValue HiForBigShift =
+      Opc == ISD::SRA
+          ? DAG.getNode(Opc, dl, VT, ShOpHi,
+                        DAG.getConstant(VTBits - 1, dl, MVT::i64))
+          : DAG.getConstant(0, dl, VT);
+  SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
+                           HiForNormalShift, CCVal, Cmp);
 
   SDValue Ops[2] = { Lo, Hi };
   return DAG.getMergeValues(Ops, dl);
 }
 
+
 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
 /// i64 values and take a 2 x i64 value to shift plus a shift amount.
 SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
-                                                 SelectionDAG &DAG) const {
+                                                   SelectionDAG &DAG) const {
   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
   EVT VT = Op.getValueType();
   unsigned VTBits = VT.getSizeInBits();
@@ -4427,31 +4438,41 @@ SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
   SDValue ShOpLo = Op.getOperand(0);
   SDValue ShOpHi = Op.getOperand(1);
   SDValue ShAmt = Op.getOperand(2);
-  SDValue ARMcc;
 
   assert(Op.getOpcode() == ISD::SHL_PARTS);
   SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
                                  DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
-  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
+  SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
+
+  // Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which
+  // is "undef". We wanted 0, so CSEL it directly.
+  SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
+                               ISD::SETEQ, dl, DAG);
+  SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
+  LoBitsForHi =
+      DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
+                  LoBitsForHi, CCVal, Cmp);
+
   SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
                                    DAG.getConstant(VTBits, dl, MVT::i64));
-  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
-  SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
+  SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
+  SDValue HiForNormalShift =
+      DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi);
 
-  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+  SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
 
-  SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64),
-                               ISD::SETGE, dl, DAG);
-  SDValue CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
-  SDValue Hi =
-      DAG.getNode(AArch64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp);
+  Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
+                       dl, DAG);
+  CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
+  SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
+                           HiForNormalShift, CCVal, Cmp);
 
   // AArch64 shifts of larger than register sizes are wrapped rather than
   // clamped, so we can't just emit "lo << a" if a is too big.
-  SDValue TrueValLo = DAG.getConstant(0, dl, VT);
-  SDValue FalseValLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
-  SDValue Lo =
-      DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp);
+  SDValue LoForBigShift = DAG.getConstant(0, dl, VT);
+  SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+  SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
+                           LoForNormalShift, CCVal, Cmp);
 
   SDValue Ops[2] = { Lo, Hi };
   return DAG.getMergeValues(Ops, dl);
diff --git a/test/CodeGen/AArch64/arm64-long-shift.ll b/test/CodeGen/AArch64/arm64-long-shift.ll
index d5baf16bdd5c..ad89d3ff711b 100644
--- a/test/CodeGen/AArch64/arm64-long-shift.ll
+++ b/test/CodeGen/AArch64/arm64-long-shift.ll
@@ -2,18 +2,20 @@
 
 define i128 @shl(i128 %r, i128 %s) nounwind readnone {
 ; CHECK-LABEL: shl:
-; CHECK: lsl  [[XREG_0:x[0-9]+]], x1, x2
-; CHECK-NEXT: orr w[[XREG_1:[0-9]+]], wzr, #0x40
-; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], x[[XREG_1]], x2
-; CHECK-NEXT: lsr  [[XREG_3:x[0-9]+]], x0, [[XREG_2]]
-; CHECK-NEXT: orr [[XREG_6:x[0-9]+]], [[XREG_3]], [[XREG_0]]
-; CHECK-NEXT: sub [[XREG_4:x[0-9]+]], x2, #64
-; CHECK-NEXT: lsl  [[XREG_5:x[0-9]+]], x0, [[XREG_4]]
-; CHECK-NEXT: cmp   [[XREG_4]], #0
-; CHECK-NEXT: csel  x1, [[XREG_5]], [[XREG_6]], ge
-; CHECK-NEXT: lsl  [[SMALLSHIFT_LO:x[0-9]+]], x0, x2
-; CHECK-NEXT: csel  x0, xzr, [[SMALLSHIFT_LO]], ge
-; CHECK-NEXT: ret
+; CHECK: orr w[[SIXTY_FOUR:[0-9]+]], wzr, #0x40
+; CHECK: sub [[REV_SHIFT:x[0-9]+]], x[[SIXTY_FOUR]], x2
+; CHECK: lsr  [[LO_FOR_HI_NORMAL:x[0-9]+]], x0, [[REV_SHIFT]]
+; CHECK: cmp x2, #0
+; CHECK: csel [[LO_FOR_HI:x[0-9]+]], xzr, [[LO_FOR_HI_NORMAL]], eq
+; CHECK: lsl  [[HI_FOR_HI:x[0-9]+]], x1, x2
+; CHECK: orr [[HI_NORMAL:x[0-9]+]], [[LO_FOR_HI]], [[HI_FOR_HI]]
+; CHECK: sub [[EXTRA_SHIFT:x[0-9]+]], x2, #64
+; CHECK: lsl  [[HI_BIG_SHIFT:x[0-9]+]], x0, [[EXTRA_SHIFT]]
+; CHECK: cmp   [[EXTRA_SHIFT]], #0
+; CHECK: csel  x1, [[HI_BIG_SHIFT]], [[HI_NORMAL]], ge
+; CHECK: lsl  [[SMALLSHIFT_LO:x[0-9]+]], x0, x2
+; CHECK: csel  x0, xzr, [[SMALLSHIFT_LO]], ge
+; CHECK: ret
 
   %shl = shl i128 %r, %s
   ret i128 %shl
@@ -21,19 +23,21 @@ define i128 @shl(i128 %r, i128 %s) nounwind readnone {
 
 define i128 @ashr(i128 %r, i128 %s) nounwind readnone {
 ; CHECK-LABEL: ashr:
-; CHECK: lsr  [[XREG_0:x[0-9]+]], x0, x2
-; CHECK-NEXT: orr w[[XREG_1:[0-9]+]], wzr, #0x40
-; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], x[[XREG_1]], x2
-; CHECK-NEXT: lsl  [[XREG_3:x[0-9]+]], x1, [[XREG_2]]
-; CHECK-NEXT: orr [[XREG_4:x[0-9]+]], [[XREG_0]], [[XREG_3]]
-; CHECK-NEXT: sub [[XREG_5:x[0-9]+]], x2, #64
-; CHECK-NEXT: asr  [[XREG_6:x[0-9]+]], x1, [[XREG_5]]
-; CHECK-NEXT: cmp   [[XREG_5]], #0
-; CHECK-NEXT: csel  x0, [[XREG_6]], [[XREG_4]], ge
-; CHECK-NEXT: asr  [[SMALLSHIFT_HI:x[0-9]+]], x1, x2
-; CHECK-NEXT: asr [[BIGSHIFT_HI:x[0-9]+]], x1, #63
-; CHECK-NEXT: csel x1, [[BIGSHIFT_HI]], [[SMALLSHIFT_HI]], ge
-; CHECK-NEXT: ret
+; CHECK: orr w[[SIXTY_FOUR:[0-9]+]], wzr, #0x40
+; CHECK: sub [[REV_SHIFT:x[0-9]+]], x[[SIXTY_FOUR]], x2
+; CHECK: lsl  [[HI_FOR_LO_NORMAL:x[0-9]+]], x1, [[REV_SHIFT]]
+; CHECK: cmp x2, #0
+; CHECK: csel [[HI_FOR_LO:x[0-9]+]], xzr, [[HI_FOR_LO_NORMAL]], eq
+; CHECK: lsr  [[LO_FOR_LO:x[0-9]+]], x0, x2
+; CHECK: orr [[LO_NORMAL:x[0-9]+]], [[LO_FOR_LO]], [[HI_FOR_LO]]
+; CHECK: sub [[EXTRA_SHIFT:x[0-9]+]], x2, #64
+; CHECK: asr  [[LO_BIG_SHIFT:x[0-9]+]], x1, [[EXTRA_SHIFT]]
+; CHECK: cmp   [[EXTRA_SHIFT]], #0
+; CHECK: csel  x0, [[LO_BIG_SHIFT]], [[LO_NORMAL]], ge
+; CHECK: asr  [[SMALLSHIFT_HI:x[0-9]+]], x1, x2
+; CHECK: asr [[BIGSHIFT_HI:x[0-9]+]], x1, #63
+; CHECK: csel x1, [[BIGSHIFT_HI]], [[SMALLSHIFT_HI]], ge
+; CHECK: ret
 
   %shr = ashr i128 %r, %s
   ret i128 %shr
@@ -41,18 +45,20 @@ define i128 @ashr(i128 %r, i128 %s) nounwind readnone {
 
 define i128 @lshr(i128 %r, i128 %s) nounwind readnone {
 ; CHECK-LABEL: lshr:
-; CHECK: lsr  [[XREG_0:x[0-9]+]], x0, x2
-; CHECK-NEXT: orr w[[XREG_1:[0-9]+]], wzr, #0x40
-; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], x[[XREG_1]], x2
-; CHECK-NEXT: lsl  [[XREG_3:x[0-9]+]], x1, [[XREG_2]]
-; CHECK-NEXT: orr [[XREG_4:x[0-9]+]], [[XREG_0]], [[XREG_3]]
-; CHECK-NEXT: sub [[XREG_5:x[0-9]+]], x2, #64
-; CHECK-NEXT: lsr  [[XREG_6:x[0-9]+]], x1, [[XREG_5]]
-; CHECK-NEXT: cmp   [[XREG_5]], #0
-; CHECK-NEXT: csel  x0, [[XREG_6]], [[XREG_4]], ge
-; CHECK-NEXT: lsr  [[SMALLSHIFT_HI:x[0-9]+]], x1, x2
-; CHECK-NEXT: csel x1, xzr, [[SMALLSHIFT_HI]], ge
-; CHECK-NEXT: ret
+; CHECK: orr w[[SIXTY_FOUR:[0-9]+]], wzr, #0x40
+; CHECK: sub [[REV_SHIFT:x[0-9]+]], x[[SIXTY_FOUR]], x2
+; CHECK: lsl  [[HI_FOR_LO_NORMAL:x[0-9]+]], x1, [[REV_SHIFT]]
+; CHECK: cmp x2, #0
+; CHECK: csel [[HI_FOR_LO:x[0-9]+]], xzr, [[HI_FOR_LO_NORMAL]], eq
+; CHECK: lsr  [[LO_FOR_LO:x[0-9]+]], x0, x2
+; CHECK: orr [[LO_NORMAL:x[0-9]+]], [[LO_FOR_LO]], [[HI_FOR_LO]]
+; CHECK: sub [[EXTRA_SHIFT:x[0-9]+]], x2, #64
+; CHECK: lsr  [[LO_BIG_SHIFT:x[0-9]+]], x1, [[EXTRA_SHIFT]]
+; CHECK: cmp   [[EXTRA_SHIFT]], #0
+; CHECK: csel  x0, [[LO_BIG_SHIFT]], [[LO_NORMAL]], ge
+; CHECK: lsr  [[SMALLSHIFT_HI:x[0-9]+]], x1, x2
+; CHECK: csel x1, xzr, [[SMALLSHIFT_HI]], ge
+; CHECK: ret
 
   %shr = lshr i128 %r, %s
   ret i128 %shr

From 327823045115e7df540f8ed7f1871c432305066f Mon Sep 17 00:00:00 2001
From: Quentin Colombet <qcolombet@apple.com>
Date: Wed, 2 Dec 2015 01:22:54 +0000
Subject: [PATCH 117/186] [X86] Make sure the prologue does not clobber EFLAGS
 when it lives accross it.

This is a superset of the fix done in r254448.

This fixes PR25607.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254478 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86FrameLowering.cpp      |  69 ++++++--------
 test/CodeGen/X86/i386-shrink-wrapping.ll | 113 +++++++++++++++++++++++
 2 files changed, 139 insertions(+), 43 deletions(-)
 create mode 100644 test/CodeGen/X86/i386-shrink-wrapping.ll

diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 4ce3dfe0dcb3..cc8bbd09f507 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -204,10 +204,13 @@ static bool isEAXLiveIn(MachineFunction &MF) {
   return false;
 }
 
-/// Check whether or not the terminators of \p MBB needs to read EFLAGS.
-static bool terminatorsNeedFlagsAsInput(const MachineBasicBlock &MBB) {
+/// Check if the flags need to be preserved before the terminators.
+/// This would be the case, if the eflags is live-in of the region
+/// composed by the terminators or live-out of that region, without
+/// being defined by a terminator.
+static bool
+flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) {
   for (const MachineInstr &MI : MBB.terminators()) {
-    bool BreakNext = false;
     for (const MachineOperand &MO : MI.operands()) {
       if (!MO.isReg())
         continue;
@@ -215,15 +218,22 @@ static bool terminatorsNeedFlagsAsInput(const MachineBasicBlock &MBB) {
       if (Reg != X86::EFLAGS)
         continue;
 
-      // This terminator needs an eflag that is not defined
-      // by a previous terminator.
+      // This terminator needs an eflags that is not defined
+      // by a previous another terminator:
+      // EFLAGS is live-in of the region composed by the terminators.
       if (!MO.isDef())
         return true;
-      BreakNext = true;
+      // This terminator defines the eflags, i.e., we don't need to preserve it.
+      return false;
     }
-    if (BreakNext)
-      break;
   }
+
+  // None of the terminators use or define the eflags.
+  // Check if they are live-out, that would imply we need to preserve them.
+  for (const MachineBasicBlock *Succ : MBB.successors())
+    if (Succ->isLiveIn(X86::EFLAGS))
+      return true;
+
   return false;
 }
 
@@ -297,28 +307,6 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
   }
 }
 
-// Check if \p MBB defines the flags register before the first terminator.
-static bool flagsDefinedLocally(const MachineBasicBlock &MBB) {
-  MachineBasicBlock::const_iterator FirstTerminator = MBB.getFirstTerminator();
-  for (MachineBasicBlock::const_iterator MII : MBB) {
-    if (MII == FirstTerminator)
-      return false;
-
-    for (const MachineOperand &MO : MII->operands()) {
-      if (!MO.isReg())
-        continue;
-      unsigned Reg = MO.getReg();
-      if (Reg != X86::EFLAGS)
-        continue;
-
-      // This instruction sets the eflag.
-      if (MO.isDef())
-        return true;
-    }
-  }
-  return false;
-}
-
 MachineInstrBuilder X86FrameLowering::BuildStackAdjustment(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL,
     int64_t Offset, bool InEpilogue) const {
@@ -330,14 +318,9 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment(
   if (!InEpilogue) {
     // Check if inserting the prologue at the beginning
     // of MBB would require to use LEA operations.
-    // We need to use LEA operations if both conditions are true:
-    // 1. One of the terminators need the flags.
-    // 2. The flags are not defined after the insertion point of the prologue.
-    // Note: Checking for the predecessors is a shortcut when obviously nothing
-    // will live accross the prologue.
-    UseLEA = STI.useLeaForSP() ||
-             (!MBB.pred_empty() && terminatorsNeedFlagsAsInput(MBB) &&
-              !flagsDefinedLocally(MBB));
+    // We need to use LEA operations if EFLAGS is live in, because
+    // it means an instruction will read it before it gets defined.
+    UseLEA = STI.useLeaForSP() || MBB.isLiveIn(X86::EFLAGS);
   } else {
     // If we can use LEA for SP but we shouldn't, check that none
     // of the terminators uses the eflags. Otherwise we will insert
@@ -346,10 +329,10 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment(
     // and is an optimization anyway.
     UseLEA = canUseLEAForSPInEpilogue(*MBB.getParent());
     if (UseLEA && !STI.useLeaForSP())
-      UseLEA = terminatorsNeedFlagsAsInput(MBB);
+      UseLEA = flagsNeedToBePreservedBeforeTheTerminators(MBB);
     // If that assert breaks, that means we do not do the right thing
     // in canUseAsEpilogue.
-    assert((UseLEA || !terminatorsNeedFlagsAsInput(MBB)) &&
+    assert((UseLEA || !flagsNeedToBePreservedBeforeTheTerminators(MBB)) &&
            "We shouldn't have allowed this insertion point");
   }
 
@@ -2597,10 +2580,10 @@ bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const {
     return true;
 
   // If we cannot use LEA to adjust SP, we may need to use ADD, which
-  // clobbers the EFLAGS. Check that none of the terminators reads the
-  // EFLAGS, and if one uses it, conservatively assume this is not
+  // clobbers the EFLAGS. Check that we do not need to preserve it,
+  // otherwise, conservatively assume this is not
   // safe to insert the epilogue here.
-  return !terminatorsNeedFlagsAsInput(MBB);
+  return !flagsNeedToBePreservedBeforeTheTerminators(MBB);
 }
 
 MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
diff --git a/test/CodeGen/X86/i386-shrink-wrapping.ll b/test/CodeGen/X86/i386-shrink-wrapping.ll
new file mode 100644
index 000000000000..748c397143c5
--- /dev/null
+++ b/test/CodeGen/X86/i386-shrink-wrapping.ll
@@ -0,0 +1,113 @@
+; RUN: llc %s -o - -enable-shrink-wrap=true | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE
+; RUN: llc %s -o - -enable-shrink-wrap=false | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-apple-macosx"
+
+@a = common global i32 0, align 4
+@d = internal unnamed_addr global i1 false
+@b = common global i32 0, align 4
+@e = common global i8 0, align 1
+@f = common global i8 0, align 1
+@c = common global i32 0, align 4
+@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
+
+
+; Check that we are clobbering the flags when they are live-in of the
+; prologue block and the prologue needs to adjust the stack.
+; PR25607.
+;
+; CHECK-LABEL: eflagsLiveInPrologue:
+;
+; DISABLE: pushl
+; DISABLE-NEXT: pushl
+; DISABLE-NEXT: subl $20, %esp
+;
+; CHECK: movl L_a$non_lazy_ptr, [[A:%[a-z]+]]
+; CHECK-NEXT: cmpl $0, ([[A]])
+; CHECK-NEXT: je [[PREHEADER_LABEL:LBB[0-9_]+]]
+;
+; CHECK: movb $1, _d
+;
+; CHECK: [[PREHEADER_LABEL]]:
+; CHECK-NEXT: movl L_b$non_lazy_ptr, [[B:%[a-z]+]]
+; CHECK-NEXT: movl ([[B]]), [[TMP1:%[a-z]+]]
+; CHECK-NEXT: testl [[TMP1]], [[TMP1]]
+; CHECK-NEXT: je  [[FOREND_LABEL:LBB[0-9_]+]]
+;
+; Skip the loop.
+; [...]
+;
+; The for.end block is split to accomadate the different selects.
+; We are interested in the one with the call, so skip until the branch.
+; CHECK: [[FOREND_LABEL]]:
+; CHECK-NEXT: movb _d, [[D:%[a-z]+]]
+; [...]
+; CHECK: jne [[CALL_LABEL:LBB[0-9_]+]]
+;
+; CHECK: movb $6, [[D]]
+;
+; CHECK: [[CALL_LABEL]]
+;
+; ENABLE-NEXT: pushl
+; ENABLE-NEXT: pushl
+; We must not use sub here otherwise we will clobber the eflags.
+; ENABLE-NEXT: leal -20(%esp), %esp
+;
+; CHECK-NEXT: L_e$non_lazy_ptr, [[E:%[a-z]+]]
+; CHECK-NEXT: movb [[D]], ([[E]])
+; CHECK-NEXT: L_f$non_lazy_ptr, [[F:%[a-z]+]]
+; CHECK-NEXT: movsbl ([[F]]), [[CONV:%[a-z]+]]
+; CHECK-NEXT: movl $6, [[CONV:%[a-z]+]]
+; The eflags is used in the next instruction.
+; If that instruction disappear, we are not exercising the bug
+; anymore.
+; CHECK-NEXT: cmovnel {{%[a-z]+}}, [[CONV]]
+;
+; Skip all the crust of vaarg lowering.
+; CHECK: calll L_varfunc$stub
+; Set the return value to 0.
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: addl $20, %esp
+; CHECK-NEXT: popl
+; CHECK-NEXT: popl
+; CHECK-NEXT: retl
+define i32 @eflagsLiveInPrologue() #0 {
+entry:
+  %tmp = load i32, i32* @a, align 4
+  %tobool = icmp eq i32 %tmp, 0
+  br i1 %tobool, label %for.cond.preheader, label %if.then
+
+if.then:                                          ; preds = %entry
+  store i1 true, i1* @d, align 1
+  br label %for.cond.preheader
+
+for.cond.preheader:                               ; preds = %if.then, %entry
+  %tmp1 = load i32, i32* @b, align 4
+  %tobool14 = icmp eq i32 %tmp1, 0
+  br i1 %tobool14, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %for.cond.preheader
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  br label %for.body
+
+for.end:                                          ; preds = %for.cond.preheader
+  %.b3 = load i1, i1* @d, align 1
+  %tmp2 = select i1 %.b3, i8 0, i8 6
+  store i8 %tmp2, i8* @e, align 1
+  %tmp3 = load i8, i8* @f, align 1
+  %conv = sext i8 %tmp3 to i32
+  %add = add nsw i32 %conv, 1
+  %rem = srem i32 %tmp1, %add
+  store i32 %rem, i32* @c, align 4
+  %conv2 = select i1 %.b3, i32 0, i32 6
+  %call = tail call i32 (i8*, ...) @varfunc(i8* nonnull getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %conv2) #1
+  ret i32 0
+}
+
+; Function Attrs: nounwind
+declare i32 @varfunc(i8* nocapture readonly, ...) #0
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }

From f7d58f12fa2377090dd844e52662e5a942606d0f Mon Sep 17 00:00:00 2001
From: Mehdi Amini <mehdi.amini@apple.com>
Date: Wed, 2 Dec 2015 02:00:29 +0000
Subject: [PATCH 118/186] Modify FunctionImport to take a callback to load
 modules

When linking static archive, there is no individual module files to
load. Instead they can be mmap'ed and could be initialized from a
buffer directly. The callback provide flexibility to override the
scheme for loading module from the summary.

Differential Revision: http://reviews.llvm.org/D15101

From: Mehdi Amini <mehdi.amini@apple.com>

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254479 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Transforms/IPO/FunctionImport.h | 31 ++++++++++++++------
 lib/Transforms/IPO/FunctionImport.cpp        | 11 ++++---
 2 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/include/llvm/Transforms/IPO/FunctionImport.h b/include/llvm/Transforms/IPO/FunctionImport.h
index f06a19021750..0315c72811c1 100644
--- a/include/llvm/Transforms/IPO/FunctionImport.h
+++ b/include/llvm/Transforms/IPO/FunctionImport.h
@@ -18,15 +18,26 @@ class LLVMContext;
 class Module;
 class FunctionInfoIndex;
 
-/// The function importer is automatically importing function from other modules
-/// based on the provided summary informations.
-class FunctionImporter {
+/// Helper to load on demand a Module from file and cache it for subsequent
+/// queries. It can be used with the FunctionImporter.
+class ModuleLazyLoaderCache {
+  /// The context that will be used for importing.
+  LLVMContext &Context;
 
   /// Cache of lazily loaded module for import.
   StringMap<std::unique_ptr<Module>> ModuleMap;
 
-  /// The context that will be used for importing.
-  LLVMContext &Context;
+public:
+  /// Create the loader, Module will be initialized in \p Context.
+  ModuleLazyLoaderCache(LLVMContext &Context) : Context(Context) {}
+
+  /// Retrieve a Module from the cache or lazily load it on demand.
+  Module &operator()(StringRef FileName);
+};
+
+/// The function importer is automatically importing function from other modules
+/// based on the provided summary informations.
+class FunctionImporter {
 
   /// The summaries index used to trigger importing.
   const FunctionInfoIndex &Index;
@@ -35,13 +46,15 @@ class FunctionImporter {
   DiagnosticHandlerFunction DiagnosticHandler;
 
   /// Retrieve a Module from the cache or lazily load it on demand.
-  Module &getOrLoadModule(StringRef FileName);
+  std::function<Module &(StringRef FileName)> getLazyModule;
 
 public:
   /// Create a Function Importer.
-  FunctionImporter(LLVMContext &Context, const FunctionInfoIndex &Index,
-                   DiagnosticHandlerFunction DiagnosticHandler)
-      : Context(Context), Index(Index), DiagnosticHandler(DiagnosticHandler) {}
+  FunctionImporter(const FunctionInfoIndex &Index,
+                   DiagnosticHandlerFunction DiagnosticHandler,
+                   std::function<Module &(StringRef FileName)> ModuleLoader)
+      : Index(Index), DiagnosticHandler(DiagnosticHandler),
+        getLazyModule(ModuleLoader) {}
 
   /// Import functions in Module \p M based on the summary informations.
   bool importFunctions(Module &M);
diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp
index 444da3c55d2a..92764c9e8c3f 100644
--- a/lib/Transforms/IPO/FunctionImport.cpp
+++ b/lib/Transforms/IPO/FunctionImport.cpp
@@ -51,7 +51,7 @@ static std::unique_ptr<Module> loadFile(const std::string &FileName,
 }
 
 // Get a Module for \p FileName from the cache, or load it lazily.
-Module &FunctionImporter::getOrLoadModule(StringRef FileName) {
+Module &ModuleLazyLoaderCache::operator()(StringRef FileName) {
   auto &Module = ModuleMap[FileName];
   if (!Module)
     Module = loadFile(FileName, Context);
@@ -86,7 +86,6 @@ static void findExternalCalls(const Function &F, StringSet<> &CalledFunctions,
 // The current implementation imports every called functions that exists in the
 // summaries index.
 bool FunctionImporter::importFunctions(Module &M) {
-  assert(&Context == &M.getContext());
 
   bool Changed = false;
 
@@ -142,7 +141,8 @@ bool FunctionImporter::importFunctions(Module &M) {
                  << "\n");
 
     // Get the module for the import (potentially from the cache).
-    auto &Module = getOrLoadModule(FileName);
+    auto &Module = getLazyModule(FileName);
+    assert(&Module.getContext() == &M.getContext());
 
     // The function that we will import!
     GlobalValue *SGV = Module.getNamedValue(CalledFunctionName);
@@ -255,7 +255,10 @@ class FunctionImportPass : public ModulePass {
     }
 
     // Perform the import now.
-    FunctionImporter Importer(M.getContext(), *Index, diagnosticHandler);
+    ModuleLazyLoaderCache Loader(M.getContext());
+    FunctionImporter Importer(*Index, diagnosticHandler,
+                              [&](StringRef Name)
+                                  -> Module &{ return Loader(Name); });
     return Importer.importFunctions(M);
 
     return false;

From 653cbb792b87ae54ee7fdd2477411f450791f27b Mon Sep 17 00:00:00 2001
From: Quentin Colombet <qcolombet@apple.com>
Date: Wed, 2 Dec 2015 02:07:00 +0000
Subject: [PATCH 119/186] [X86] Fix a think-o when checking if the eflags needs
 to be preserved.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254480 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86FrameLowering.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index cc8bbd09f507..52cf9fd44cbd 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -211,6 +211,7 @@ static bool isEAXLiveIn(MachineFunction &MF) {
 static bool
 flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) {
   for (const MachineInstr &MI : MBB.terminators()) {
+    bool BreakNext = false;
     for (const MachineOperand &MO : MI.operands()) {
       if (!MO.isReg())
         continue;
@@ -224,8 +225,13 @@ flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) {
       if (!MO.isDef())
         return true;
       // This terminator defines the eflags, i.e., we don't need to preserve it.
-      return false;
+      // However, we still need to check this specific terminator does not
+      // read a live-in value.
+      BreakNext = true;
     }
+    // We found a definition of the eflags, no need to preserve them.
+    if (BreakNext)
+      return false;
   }
 
   // None of the terminators use or define the eflags.

From 0445c4d68b6e2b33f7079bf9df076fad654e50bb Mon Sep 17 00:00:00 2001
From: Kostya Serebryany <kcc@google.com>
Date: Wed, 2 Dec 2015 02:37:13 +0000
Subject: [PATCH 120/186] [sanitizer coverage] when adding a bb trace
 instrumentation, do it instead, not in addition to, regular coverage. Do the
 regular coverage in the run-time instead

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254482 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../Instrumentation/SanitizerCoverage.cpp     | 25 ++++++++-----------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index 3430ee01035e..719a436b05de 100644
--- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -156,7 +156,9 @@ class SanitizerCoverageModule : public ModulePass {
   void SetNoSanitizeMetadata(Instruction *I);
   void InjectCoverageAtBlock(Function &F, BasicBlock &BB, bool UseCalls);
   unsigned NumberOfInstrumentedBlocks() {
-    return SanCovFunction->getNumUses() + SanCovWithCheckFunction->getNumUses();
+    return SanCovFunction->getNumUses() +
+           SanCovWithCheckFunction->getNumUses() + SanCovTraceBB->getNumUses() +
+           SanCovTraceEnter->getNumUses();
   }
   Function *SanCovFunction;
   Function *SanCovWithCheckFunction;
@@ -211,12 +213,10 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
                             StringRef(""), StringRef(""),
                             /*hasSideEffects=*/true);
 
-  if (Options.TraceBB) {
-    SanCovTraceEnter = checkSanitizerInterfaceFunction(
-        M.getOrInsertFunction(kSanCovTraceEnter, VoidTy, Int32PtrTy, nullptr));
-    SanCovTraceBB = checkSanitizerInterfaceFunction(
-        M.getOrInsertFunction(kSanCovTraceBB, VoidTy, Int32PtrTy, nullptr));
-  }
+  SanCovTraceEnter = checkSanitizerInterfaceFunction(
+      M.getOrInsertFunction(kSanCovTraceEnter, VoidTy, Int32PtrTy, nullptr));
+  SanCovTraceBB = checkSanitizerInterfaceFunction(
+      M.getOrInsertFunction(kSanCovTraceBB, VoidTy, Int32PtrTy, nullptr));
 
   // At this point we create a dummy array of guards because we don't
   // know how many elements we will need.
@@ -466,7 +466,9 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
       ConstantInt::get(IntptrTy, (1 + NumberOfInstrumentedBlocks()) * 4));
   Type *Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty());
   GuardP = IRB.CreateIntToPtr(GuardP, Int32PtrTy);
-  if (UseCalls) {
+  if (Options.TraceBB) {
+    IRB.CreateCall(IsEntryBB ? SanCovTraceEnter : SanCovTraceBB, GuardP);
+  } else if (UseCalls) {
     IRB.CreateCall(SanCovWithCheckFunction, GuardP);
   } else {
     LoadInst *Load = IRB.CreateLoad(GuardP);
@@ -495,13 +497,6 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
     SetNoSanitizeMetadata(LI);
     SetNoSanitizeMetadata(SI);
   }
-
-  if (Options.TraceBB) {
-    // Experimental support for tracing.
-    // Insert a callback with the same guard variable as used for coverage.
-    IRB.SetInsertPoint(&*IP);
-    IRB.CreateCall(IsEntryBB ? SanCovTraceEnter : SanCovTraceBB, GuardP);
-  }
 }
 
 char SanitizerCoverageModule::ID = 0;

From 569415a25bdab720be12062c3a0540ba026453db Mon Sep 17 00:00:00 2001
From: Kostya Serebryany <kcc@google.com>
Date: Wed, 2 Dec 2015 02:49:37 +0000
Subject: [PATCH 121/186] [libFuzzer] add a test that is built with
 -fsanitize-coverage=trace-bb

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254484 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Fuzzer/test/CMakeLists.txt          |  8 ++++++++
 lib/Fuzzer/test/trace-bb/CMakeLists.txt | 14 ++++++++++++++
 2 files changed, 22 insertions(+)
 create mode 100644 lib/Fuzzer/test/trace-bb/CMakeLists.txt

diff --git a/lib/Fuzzer/test/CMakeLists.txt b/lib/Fuzzer/test/CMakeLists.txt
index 1e02af149ad8..85e8706f11cc 100644
--- a/lib/Fuzzer/test/CMakeLists.txt
+++ b/lib/Fuzzer/test/CMakeLists.txt
@@ -38,6 +38,9 @@ set(UninstrumentedTests
   UninstrumentedTest
   )
 
+set(TraceBBTests
+  SimpleTest
+  )
 
 set(TestBinaries)
 
@@ -99,6 +102,11 @@ foreach(Test ${UninstrumentedTests})
   set(TestBinaries ${TestBinaries} LLVMFuzzer-${Test}-Uninstrumented)
 endforeach()
 
+add_subdirectory(trace-bb)
+
+foreach(Test ${TraceBBTests})
+  set(TestBinaries ${TestBinaries} LLVMFuzzer-${Test}-TraceBB)
+endforeach()
 
 set_target_properties(${TestBinaries}
   PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
diff --git a/lib/Fuzzer/test/trace-bb/CMakeLists.txt b/lib/Fuzzer/test/trace-bb/CMakeLists.txt
new file mode 100644
index 000000000000..99af019565b5
--- /dev/null
+++ b/lib/Fuzzer/test/trace-bb/CMakeLists.txt
@@ -0,0 +1,14 @@
+# These tests are not instrumented with coverage.
+
+set(CMAKE_CXX_FLAGS_RELEASE
+  "${LIBFUZZER_FLAGS_BASE} -fsanitize-coverage=edge,trace-bb")
+
+foreach(Test ${TraceBBTests})
+  add_executable(LLVMFuzzer-${Test}-TraceBB
+    ../${Test}.cpp
+    )
+  target_link_libraries(LLVMFuzzer-${Test}-TraceBB
+    LLVMFuzzer
+    )
+endforeach()
+

From accfde3ad612bb30e426b1bc13760549a28fd9df Mon Sep 17 00:00:00 2001
From: Mehdi Amini <mehdi.amini@apple.com>
Date: Wed, 2 Dec 2015 04:34:28 +0000
Subject: [PATCH 122/186] Change ModuleLinker to take a set of GlobalValues to
 import instead of a single one

For efficiency reason, when importing multiple functions for the same Module,
we can avoid reparsing it every time.

Differential Revision: http://reviews.llvm.org/D15102

From: Mehdi Amini <mehdi.amini@apple.com>

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254486 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Linker/Linker.h          |  2 +-
 lib/Linker/LinkModules.cpp            | 10 +++++-----
 lib/Transforms/IPO/FunctionImport.cpp |  6 +++++-
 tools/llvm-link/llvm-link.cpp         |  5 ++++-
 4 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/include/llvm/Linker/Linker.h b/include/llvm/Linker/Linker.h
index 38fa5562f300..7c24eef74111 100644
--- a/include/llvm/Linker/Linker.h
+++ b/include/llvm/Linker/Linker.h
@@ -82,7 +82,7 @@ class Linker {
   /// Returns true on error.
   bool linkInModule(Module &Src, unsigned Flags = Flags::None,
                     const FunctionInfoIndex *Index = nullptr,
-                    Function *FuncToImport = nullptr);
+                    DenseSet<const GlobalValue *> *FuncToImport = nullptr);
 
   static bool linkModules(Module &Dest, Module &Src,
                           DiagnosticHandlerFunction DiagnosticHandler,
diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index 39ab5db8d6bd..fba231100ee3 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -406,7 +406,7 @@ class ModuleLinker {
 
   /// Function to import from source module, all other functions are
   /// imported as declarations instead of definitions.
-  Function *ImportFunction;
+  DenseSet<const GlobalValue *> *ImportFunction;
 
   /// Set to true if the given FunctionInfoIndex contains any functions
   /// from this source module, in which case we must conservatively assume
@@ -425,7 +425,7 @@ class ModuleLinker {
   ModuleLinker(Module &DstM, Linker::IdentifiedStructTypeSet &Set, Module &SrcM,
                DiagnosticHandlerFunction DiagnosticHandler, unsigned Flags,
                const FunctionInfoIndex *Index = nullptr,
-               Function *FuncToImport = nullptr)
+               DenseSet<const GlobalValue *> *FuncToImport = nullptr)
       : DstM(DstM), SrcM(SrcM), TypeMap(Set), ValMaterializer(this),
         DiagnosticHandler(DiagnosticHandler), Flags(Flags), ImportIndex(Index),
         ImportFunction(FuncToImport) {
@@ -632,7 +632,7 @@ bool ModuleLinker::doImportAsDefinition(const GlobalValue *SGV) {
     return true;
   // Only import the function requested for importing.
   auto *SF = dyn_cast<Function>(SGV);
-  if (SF && SF == ImportFunction)
+  if (SF && ImportFunction->count(SF))
     return true;
   // Otherwise no.
   return false;
@@ -1058,7 +1058,7 @@ bool ModuleLinker::shouldLinkFromSource(bool &LinkFromSrc,
     if (isa<Function>(&Src)) {
       // For functions, LinkFromSrc iff this is the function requested
       // for importing. For variables, decide below normally.
-      LinkFromSrc = (&Src == ImportFunction);
+      LinkFromSrc = ImportFunction->count(&Src);
       return false;
     }
 
@@ -2033,7 +2033,7 @@ Linker::Linker(Module &M)
 
 bool Linker::linkInModule(Module &Src, unsigned Flags,
                           const FunctionInfoIndex *Index,
-                          Function *FuncToImport) {
+                          DenseSet<const GlobalValue *> *FuncToImport) {
   ModuleLinker TheLinker(Composite, IdentifiedStructTypes, Src,
                          DiagnosticHandler, Flags, Index, FuncToImport);
   bool RetCode = TheLinker.run();
diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp
index 92764c9e8c3f..8230d64026c2 100644
--- a/lib/Transforms/IPO/FunctionImport.cpp
+++ b/lib/Transforms/IPO/FunctionImport.cpp
@@ -182,7 +182,10 @@ bool FunctionImporter::importFunctions(Module &M) {
     }
 
     // Link in the specified function.
-    if (L.linkInModule(Module, Linker::Flags::None, &Index, F))
+    DenseSet<const GlobalValue *> FunctionsToImport;
+    FunctionsToImport.insert(F);
+    if (L.linkInModule(Module, Linker::Flags::None, &Index,
+                       &FunctionsToImport))
       report_fatal_error("Function Import: link error");
 
     // Process the newly imported function and add callees to the worklist.
@@ -194,6 +197,7 @@ bool FunctionImporter::importFunctions(Module &M) {
 
     Changed = true;
   }
+
   return Changed;
 }
 
diff --git a/tools/llvm-link/llvm-link.cpp b/tools/llvm-link/llvm-link.cpp
index 2b63649cec2e..9a373c25cc5c 100644
--- a/tools/llvm-link/llvm-link.cpp
+++ b/tools/llvm-link/llvm-link.cpp
@@ -198,7 +198,10 @@ static bool importFunctions(const char *argv0, LLVMContext &Context,
     }
 
     // Link in the specified function.
-    if (L.linkInModule(*M, Linker::Flags::None, Index.get(), F))
+    DenseSet<const GlobalValue *> FunctionToImport;
+    FunctionToImport.insert(F);
+    if (L.linkInModule(*M, Linker::Flags::None, Index.get(),
+                       &FunctionToImport))
       return false;
   }
   return true;

From d217d70095d675d649e97281981c68ce2a8c7b54 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Wed, 2 Dec 2015 05:24:38 +0000
Subject: [PATCH 123/186] [X86] Fix weird identation. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254487 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 7ddcd8fcda48..55b9c4f1766d 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -26570,18 +26570,18 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
 
   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
     if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) {
-        SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS,
-                                   LHS.getOperand(1));
-        return DAG.getSetCC(DL, N->getValueType(0), addV,
-                            DAG.getConstant(0, DL, addV.getValueType()), CC);
-      }
+      SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS,
+                                 LHS.getOperand(1));
+      return DAG.getSetCC(DL, N->getValueType(0), addV,
+                          DAG.getConstant(0, DL, addV.getValueType()), CC);
+    }
   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
     if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) {
-        SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS,
-                                   RHS.getOperand(1));
-        return DAG.getSetCC(DL, N->getValueType(0), addV,
-                            DAG.getConstant(0, DL, addV.getValueType()), CC);
-      }
+      SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS,
+                                 RHS.getOperand(1));
+      return DAG.getSetCC(DL, N->getValueType(0), addV,
+                          DAG.getConstant(0, DL, addV.getValueType()), CC);
+    }
 
   if (VT.getScalarType() == MVT::i1 &&
       (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {

From b9f437f75925cb910934f522cde756ee532a2119 Mon Sep 17 00:00:00 2001
From: David Blaikie <dblaikie@gmail.com>
Date: Wed, 2 Dec 2015 06:21:28 +0000
Subject: [PATCH 124/186] DebugInfo\DWARF: Privatize some accidentally public
 members

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254488 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/DebugInfo/DWARF/DWARFUnitIndex.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/llvm/DebugInfo/DWARF/DWARFUnitIndex.h b/include/llvm/DebugInfo/DWARF/DWARFUnitIndex.h
index 0d49d7c0cf82..83da6fef4411 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFUnitIndex.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFUnitIndex.h
@@ -55,6 +55,7 @@ class DWARFUnitIndex {
     const SectionContribution *getOffset() const;
   };
 
+private:
   struct Header Header;
 
   int InfoColumn = -1;

From 5001b2a939f5f87aec2279a4476c76d0415f97ef Mon Sep 17 00:00:00 2001
From: David Blaikie <dblaikie@gmail.com>
Date: Wed, 2 Dec 2015 06:21:34 +0000
Subject: [PATCH 125/186] [llvm-dwp] Emit a rather fictional debug_cu_index

This is very rudimentary support for debug_cu_index, but it is enough to
allow llvm-dwarfdump to find the offsets for  contributions and
correctly dump debug_info.

It will need to actually find the real signature of the unit and build
the real hash table with the right number of buckets, as per the DWP
specification.

It will also need to be expanded to cover the tu_index as well.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254489 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/DebugInfo/DWARF/DWARFUnitIndex.h |  11 +-
 include/llvm/MC/MCObjectFileInfo.h            |   4 +
 lib/DebugInfo/DWARF/DWARFContext.cpp          |  16 +--
 lib/DebugInfo/DWARF/DWARFUnitIndex.cpp        |  12 +-
 lib/MC/MCObjectFileInfo.cpp                   |  12 ++
 test/tools/llvm-dwp/X86/simple.test           |  13 ++-
 tools/llvm-dwp/llvm-dwp.cpp                   | 103 ++++++++++++++----
 7 files changed, 128 insertions(+), 43 deletions(-)

diff --git a/include/llvm/DebugInfo/DWARF/DWARFUnitIndex.h b/include/llvm/DebugInfo/DWARF/DWARFUnitIndex.h
index 83da6fef4411..a85c2f9f0a23 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFUnitIndex.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFUnitIndex.h
@@ -41,12 +41,15 @@ class DWARFUnitIndex {
 
 public:
   class Entry {
-    const DWARFUnitIndex *Index;
-    uint64_t Signature;
+  public:
     struct SectionContribution {
       uint32_t Offset;
       uint32_t Length;
     };
+
+  private:
+    const DWARFUnitIndex *Index;
+    uint64_t Signature;
     std::unique_ptr<SectionContribution[]> Contributions;
     friend class DWARFUnitIndex;
 
@@ -58,14 +61,18 @@ class DWARFUnitIndex {
 private:
   struct Header Header;
 
+  DWARFSectionKind InfoColumnKind;
   int InfoColumn = -1;
   std::unique_ptr<DWARFSectionKind[]> ColumnKinds;
   std::unique_ptr<Entry[]> Rows;
 
   static StringRef getColumnHeader(DWARFSectionKind DS);
+  bool parseImpl(DataExtractor IndexData);
 
 public:
   bool parse(DataExtractor IndexData);
+  DWARFUnitIndex(DWARFSectionKind InfoColumnKind)
+      : InfoColumnKind(InfoColumnKind) {}
   void dump(raw_ostream &OS) const;
   const Entry *getFromOffset(uint32_t Offset) const;
 };
diff --git a/include/llvm/MC/MCObjectFileInfo.h b/include/llvm/MC/MCObjectFileInfo.h
index eaca3833dc89..388a208fb4a0 100644
--- a/include/llvm/MC/MCObjectFileInfo.h
+++ b/include/llvm/MC/MCObjectFileInfo.h
@@ -116,6 +116,9 @@ class MCObjectFileInfo {
   MCSection *DwarfStrOffDWOSection;
   MCSection *DwarfAddrSection;
 
+  // These are for Fission DWP files.
+  MCSection *DwarfCUIndexSection;
+
   /// Section for newer gnu pubnames.
   MCSection *DwarfGnuPubNamesSection;
   /// Section for newer gnu pubtypes.
@@ -262,6 +265,7 @@ class MCObjectFileInfo {
   MCSection *getDwarfLocDWOSection() const { return DwarfLocDWOSection; }
   MCSection *getDwarfStrOffDWOSection() const { return DwarfStrOffDWOSection; }
   MCSection *getDwarfAddrSection() const { return DwarfAddrSection; }
+  MCSection *getDwarfCUIndexSection() const { return DwarfCUIndexSection; }
 
   MCSection *getCOFFDebugSymbolsSection() const {
     return COFFDebugSymbolsSection;
diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp
index 2165d353ba09..a4195b75c47d 100644
--- a/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -163,20 +163,12 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType) {
 
   if (DumpType == DIDT_All || DumpType == DIDT_CUIndex) {
     OS << "\n.debug_cu_index contents:\n";
-    DataExtractor CUIndexData(getCUIndexSection(), isLittleEndian(),
-                              savedAddressByteSize);
-    DWARFUnitIndex CUIndex;
-    if (CUIndex.parse(CUIndexData))
-      CUIndex.dump(OS);
+    getCUIndex().dump(OS);
   }
 
   if (DumpType == DIDT_All || DumpType == DIDT_TUIndex) {
     OS << "\n.debug_tu_index contents:\n";
-    DataExtractor TUIndexData(getTUIndexSection(), isLittleEndian(),
-                              savedAddressByteSize);
-    DWARFUnitIndex TUIndex;
-    if (TUIndex.parse(TUIndexData))
-      TUIndex.dump(OS);
+    getTUIndex().dump(OS);
   }
 
   if (DumpType == DIDT_All || DumpType == DIDT_LineDwo) {
@@ -280,7 +272,7 @@ const DWARFUnitIndex &DWARFContext::getCUIndex() {
 
   DataExtractor CUIndexData(getCUIndexSection(), isLittleEndian(), 0);
 
-  CUIndex = llvm::make_unique<DWARFUnitIndex>();
+  CUIndex = llvm::make_unique<DWARFUnitIndex>(DW_SECT_INFO);
   CUIndex->parse(CUIndexData);
   return *CUIndex;
 }
@@ -291,7 +283,7 @@ const DWARFUnitIndex &DWARFContext::getTUIndex() {
 
   DataExtractor TUIndexData(getTUIndexSection(), isLittleEndian(), 0);
 
-  TUIndex = llvm::make_unique<DWARFUnitIndex>();
+  TUIndex = llvm::make_unique<DWARFUnitIndex>(DW_SECT_TYPES);
   TUIndex->parse(TUIndexData);
   return *TUIndex;
 }
diff --git a/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp b/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp
index baefed30f368..e8e7441d9769 100644
--- a/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp
+++ b/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp
@@ -30,6 +30,13 @@ void DWARFUnitIndex::Header::dump(raw_ostream &OS) const {
 }
 
 bool DWARFUnitIndex::parse(DataExtractor IndexData) {
+  bool b = parseImpl(IndexData);
+  if (!b)
+    *this = DWARFUnitIndex(InfoColumnKind);
+  return b;
+}
+
+bool DWARFUnitIndex::parseImpl(DataExtractor IndexData) {
   uint32_t Offset = 0;
   if (!Header.parse(IndexData, &Offset))
     return false;
@@ -62,7 +69,7 @@ bool DWARFUnitIndex::parse(DataExtractor IndexData) {
   // Read the Column Headers
   for (unsigned i = 0; i != Header.NumColumns; ++i) {
     ColumnKinds[i] = static_cast<DWARFSectionKind>(IndexData.getU32(&Offset));
-    if (ColumnKinds[i] == DW_SECT_INFO || ColumnKinds[i] == DW_SECT_TYPES) {
+    if (ColumnKinds[i] == InfoColumnKind) {
       if (InfoColumn != -1)
         return false;
       InfoColumn = i;
@@ -107,6 +114,9 @@ StringRef DWARFUnitIndex::getColumnHeader(DWARFSectionKind DS) {
 }
 
 void DWARFUnitIndex::dump(raw_ostream &OS) const {
+  if (!Header.NumBuckets)
+    return;
+
   Header.dump(OS);
   OS << "Index Signature         ";
   for (unsigned i = 0; i != Header.NumColumns; ++i)
diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp
index 8b75457a2460..41e28698b1cc 100644
--- a/lib/MC/MCObjectFileInfo.cpp
+++ b/lib/MC/MCObjectFileInfo.cpp
@@ -259,6 +259,9 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(Triple T) {
   DwarfDebugInlineSection =
       Ctx->getMachOSection("__DWARF", "__debug_inlined", MachO::S_ATTR_DEBUG,
                            SectionKind::getMetadata());
+  DwarfCUIndexSection =
+      Ctx->getMachOSection("__DWARF", "__debug_cu_index", MachO::S_ATTR_DEBUG,
+                           SectionKind::getMetadata());
   StackMapSection = Ctx->getMachOSection("__LLVM_STACKMAPS", "__llvm_stackmaps",
                                          0, SectionKind::getMetadata());
 
@@ -531,6 +534,10 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(Triple T) {
   DwarfAddrSection =
       Ctx->getELFSection(".debug_addr", ELF::SHT_PROGBITS, 0, "addr_sec");
 
+  // DWP Sections
+  DwarfCUIndexSection =
+      Ctx->getELFSection(".debug_cu_index", ELF::SHT_PROGBITS, 0);
+
   StackMapSection =
       Ctx->getELFSection(".llvm_stackmaps", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
 
@@ -713,6 +720,11 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(Triple T) {
       COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
           COFF::IMAGE_SCN_MEM_READ,
       SectionKind::getMetadata(), "addr_sec");
+  DwarfCUIndexSection = Ctx->getCOFFSection(
+      ".debug_cu_index",
+      COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+          COFF::IMAGE_SCN_MEM_READ,
+      SectionKind::getMetadata());
   DwarfAccelNamesSection = Ctx->getCOFFSection(
       ".apple_names",
       COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
diff --git a/test/tools/llvm-dwp/X86/simple.test b/test/tools/llvm-dwp/X86/simple.test
index 754450f13f3f..aa5ae40dc2b3 100644
--- a/test/tools/llvm-dwp/X86/simple.test
+++ b/test/tools/llvm-dwp/X86/simple.test
@@ -33,12 +33,13 @@ CHECK:     DW_AT_name {{.*}} "a"
 CHECK:   DW_TAG_structure_type
 CHECK:     DW_AT_name {{.*}} "foo"
 
-FIXME: Using cu_index, identify that abbr_offset is 0x0031, not 0x0000
-CHECK: 0x00000029: Compile Unit: length = 0x00000031 version = 0x0004 abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x0000005e)
-FIXME: Using cu_index, use strings based on the right str index offset
-CHECK:   DW_AT_name {{.*}} "a.cpp"
-FIXME: Using cu_index to find the right abbrevs at abbr_offset, this abbrevation should actually be structure_type
-CHECK:   DW_TAG_variable
+CHECK: 0x00000029: Compile Unit: length = 0x00000031 version = 0x0004 abbr_offset = 0x0031 addr_size = 0x08 (next unit at 0x0000005e)
+CHECK:   DW_AT_name {{.*}} "b.cpp"
+CHECK:   DW_TAG_structure_type
+CHECK:     DW_AT_name {{.*}} "bar"
+CHECK:   DW_TAG_subprogram
+CHECK:     DW_AT_name {{.*}} "b"
+CHECK:     DW_TAG_formal_parameter
 
 CHECK: .debug_cu_index contents:
 FIXME: Emit and verify the cu_index contents
diff --git a/tools/llvm-dwp/llvm-dwp.cpp b/tools/llvm-dwp/llvm-dwp.cpp
index 9ce37ec2ceee..6617b0b23aeb 100644
--- a/tools/llvm-dwp/llvm-dwp.cpp
+++ b/tools/llvm-dwp/llvm-dwp.cpp
@@ -17,6 +17,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Support/TargetSelect.h"
+#include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h"
 #include <memory>
 #include <list>
 #include <unordered_set>
@@ -85,48 +86,106 @@ static std::error_code write(MCStreamer &Out, ArrayRef<std::string> Inputs) {
   const auto &MCOFI = *Out.getContext().getObjectFileInfo();
   MCSection *const StrSection = MCOFI.getDwarfStrDWOSection();
   MCSection *const StrOffsetSection = MCOFI.getDwarfStrOffDWOSection();
-  const StringMap<MCSection *> KnownSections = {
-      {"debug_info.dwo", MCOFI.getDwarfInfoDWOSection()},
-      {"debug_types.dwo", MCOFI.getDwarfTypesDWOSection()},
-      {"debug_str_offsets.dwo", StrOffsetSection},
-      {"debug_str.dwo", StrSection},
-      {"debug_loc.dwo", MCOFI.getDwarfLocDWOSection()},
-      {"debug_abbrev.dwo", MCOFI.getDwarfAbbrevDWOSection()}};
+  const StringMap<std::pair<MCSection *, DWARFSectionKind>> KnownSections = {
+      {"debug_info.dwo", {MCOFI.getDwarfInfoDWOSection(), DW_SECT_INFO}},
+      {"debug_types.dwo", {MCOFI.getDwarfTypesDWOSection(), DW_SECT_TYPES}},
+      {"debug_str_offsets.dwo", {StrOffsetSection, DW_SECT_STR_OFFSETS}},
+      {"debug_str.dwo", {StrSection, static_cast<DWARFSectionKind>(0)}},
+      {"debug_loc.dwo", {MCOFI.getDwarfLocDWOSection(), DW_SECT_LOC}},
+      {"debug_abbrev.dwo", {MCOFI.getDwarfAbbrevDWOSection(), DW_SECT_ABBREV}}};
+
+  struct UnitIndexEntry {
+    uint64_t Signature;
+    DWARFUnitIndex::Entry::SectionContribution Contributions[8];
+  };
+
+  std::vector<UnitIndexEntry> IndexEntries;
 
   StringMap<uint32_t> Strings;
   uint32_t StringOffset = 0;
 
+  uint64_t UnitIndex = 0;
+  uint32_t ContributionOffsets[8] = {};
+
   for (const auto &Input : Inputs) {
     auto ErrOrObj = object::ObjectFile::createObjectFile(Input);
     if (!ErrOrObj)
       return ErrOrObj.getError();
-    const auto *Obj = ErrOrObj->getBinary();
+
+    IndexEntries.emplace_back();
+    UnitIndexEntry &CurEntry = IndexEntries.back();
+    CurEntry.Signature = UnitIndex++;
+
     StringRef CurStrSection;
     StringRef CurStrOffsetSection;
-    for (const auto &Section : Obj->sections()) {
+
+    for (const auto &Section : ErrOrObj->getBinary()->sections()) {
       StringRef Name;
       if (std::error_code Err = Section.getName(Name))
         return Err;
-      if (MCSection *OutSection =
-              KnownSections.lookup(Name.substr(Name.find_first_not_of("._")))) {
-        StringRef Contents;
-        if (auto Err = Section.getContents(Contents))
-          return Err;
-        if (OutSection == StrOffsetSection)
-          CurStrOffsetSection = Contents;
-        else if (OutSection == StrSection)
-          CurStrSection = Contents;
-        else {
-          Out.SwitchSection(OutSection);
-          Out.EmitBytes(Contents);
-        }
+
+      auto SectionPair =
+          KnownSections.find(Name.substr(Name.find_first_not_of("._")));
+      if (SectionPair == KnownSections.end())
+        continue;
+
+      StringRef Contents;
+      if (auto Err = Section.getContents(Contents))
+        return Err;
+
+      if (DWARFSectionKind Kind = SectionPair->second.second) {
+        auto Index = Kind - DW_SECT_INFO;
+        CurEntry.Contributions[Index].Offset = ContributionOffsets[Index];
+        ContributionOffsets[Index] +=
+            (CurEntry.Contributions[Index].Length = Contents.size());
+      }
+
+      MCSection *OutSection = SectionPair->second.first;
+      if (OutSection == StrOffsetSection)
+        CurStrOffsetSection = Contents;
+      else if (OutSection == StrSection)
+        CurStrSection = Contents;
+      else {
+        Out.SwitchSection(OutSection);
+        Out.EmitBytes(Contents);
       }
     }
+
     if (auto Err = writeStringsAndOffsets(Out, Strings, StringOffset,
                                           StrSection, StrOffsetSection,
                                           CurStrSection, CurStrOffsetSection))
       return Err;
   }
+
+  Out.SwitchSection(MCOFI.getDwarfCUIndexSection());
+  Out.EmitIntValue(2, 4);                   // Version
+  Out.EmitIntValue(8, 4);                   // Columns
+  Out.EmitIntValue(IndexEntries.size(), 4); // Num Units
+  // FIXME: This is not the right number of buckets for a real hash.
+  Out.EmitIntValue(IndexEntries.size(), 4); // Num Buckets
+
+  // Write the signatures.
+  for (const auto &E : IndexEntries)
+    Out.EmitIntValue(E.Signature, 8);
+
+  // Write the indexes.
+  for (size_t i = 0; i != IndexEntries.size(); ++i)
+    Out.EmitIntValue(i + 1, 4);
+
+  // Write the column headers (which sections will appear in the table)
+  for (size_t i = 1; i != 9; ++i)
+    Out.EmitIntValue(i, 4);
+
+  // Write the offsets.
+  for (const auto &E : IndexEntries)
+    for (const auto &C : E.Contributions)
+      Out.EmitIntValue(C.Offset, 4);
+
+  // Write the lengths.
+  for (const auto &E : IndexEntries)
+    for (const auto &C : E.Contributions)
+      Out.EmitIntValue(C.Length, 4);
+
   return std::error_code();
 }
 

From 1bd8c8c6356c8fc47152d886e332be93013b1efc Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Wed, 2 Dec 2015 06:39:19 +0000
Subject: [PATCH 126/186] [X86] Change getZeroVector to take an MVT instead of
 EVT. One minor change needed to only try to perform 256-it shuffle combines
 on legal vector types.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254490 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 55b9c4f1766d..5782284d46f2 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -4304,7 +4304,7 @@ static SDValue getConstVector(ArrayRef<int> Values, MVT VT,
 }
 
 /// Returns a vector of specified type with all zero elements.
-static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
+static SDValue getZeroVector(MVT VT, const X86Subtarget *Subtarget,
                              SelectionDAG &DAG, SDLoc dl) {
   assert(VT.isVector() && "Expected a vector type");
 
@@ -16803,7 +16803,7 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
   SDValue Segment = DAG.getRegister(0, MVT::i32);
   if (Src.getOpcode() == ISD::UNDEF)
-    Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
+    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
   SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
   SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
@@ -22319,7 +22319,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
   SDValue V1 = SVOp->getOperand(0);
   SDValue V2 = SVOp->getOperand(1);
-  EVT VT = SVOp->getValueType(0);
+  MVT VT = SVOp->getSimpleValueType(0);
   unsigned NumElems = VT.getVectorNumElements();
 
   if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
@@ -23202,7 +23202,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
       return AddSub;
 
   // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
-  if (Subtarget->hasFp256() && VT.is256BitVector() &&
+  if (TLI.isTypeLegal(VT) && Subtarget->hasFp256() && VT.is256BitVector() &&
       N->getOpcode() == ISD::VECTOR_SHUFFLE)
     return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
 
@@ -24677,7 +24677,7 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
       // the element size. The constant shift amount will be
       // encoded as a 8-bit immediate.
       if (ShiftAmt.trunc(8).uge(MaxAmount))
-        return getZeroVector(VT, Subtarget, DAG, DL);
+        return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
     }
 
   return SDValue();

From ec268660c8dee1cc4831ca36df0577e057742eb8 Mon Sep 17 00:00:00 2001
From: Akira Hatanaka <ahatanaka@apple.com>
Date: Wed, 2 Dec 2015 06:58:49 +0000
Subject: [PATCH 127/186] [AttributeSet] Overload AttributeSet::addAttribute to
 reduce compile time.

The new overloaded function is used when an attribute is added to a
large number of slots of an AttributeSet (for example, to function
parameters). This is much faster than calling AttributeSet::addAttribute
once per slot, because AttributeSet::getImpl (which calls
FoldingSet::FIndNodeOrInsertPos) is called only once per function
instead of once per slot.

With this commit, clang compiles a file which used to take over 22
minutes in just 13 seconds.

rdar://problem/23581000

Differential Revision: http://reviews.llvm.org/D15085


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254491 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IR/Attributes.h                  |  5 ++++
 lib/IR/Attributes.cpp                         | 30 +++++++++++++++++++
 .../InstCombine/InstCombineCalls.cpp          | 21 ++++++++-----
 .../Scalar/CorrelatedValuePropagation.cpp     | 24 ++++++++-------
 4 files changed, 63 insertions(+), 17 deletions(-)

diff --git a/include/llvm/IR/Attributes.h b/include/llvm/IR/Attributes.h
index 52a9ca83013b..286a6dae37d8 100644
--- a/include/llvm/IR/Attributes.h
+++ b/include/llvm/IR/Attributes.h
@@ -227,6 +227,11 @@ class AttributeSet {
   AttributeSet addAttribute(LLVMContext &C, unsigned Index,
                             StringRef Kind, StringRef Value) const;
 
+  /// Add an attribute to the attribute set at the given indices. Because
+  /// attribute sets are immutable, this returns a new set.
+  AttributeSet addAttribute(LLVMContext &C, ArrayRef<unsigned> Indices,
+                            Attribute A) const;
+
   /// \brief Add attributes to the attribute set at the given index. Because
   /// attribute sets are immutable, this returns a new set.
   AttributeSet addAttributes(LLVMContext &C, unsigned Index,
diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp
index bdefe5917fef..e9626ba7e3bc 100644
--- a/lib/IR/Attributes.cpp
+++ b/lib/IR/Attributes.cpp
@@ -770,6 +770,36 @@ AttributeSet AttributeSet::addAttribute(LLVMContext &C, unsigned Index,
   return addAttributes(C, Index, AttributeSet::get(C, Index, B));
 }
 
+AttributeSet AttributeSet::addAttribute(LLVMContext &C,
+                                        ArrayRef<unsigned> Indices,
+                                        Attribute A) const {
+  unsigned I = 0, E = pImpl ? pImpl->getNumAttributes() : 0;
+  auto IdxI = Indices.begin(), IdxE = Indices.end();
+  SmallVector<AttributeSet, 4> AttrSet;
+
+  while (I != E && IdxI != IdxE) {
+    if (getSlotIndex(I) < *IdxI)
+      AttrSet.emplace_back(getSlotAttributes(I++));
+    else if (getSlotIndex(I) > *IdxI)
+      AttrSet.emplace_back(AttributeSet::get(C, std::make_pair(*IdxI++, A)));
+    else {
+      AttrBuilder B(getSlotAttributes(I), *IdxI);
+      B.addAttribute(A);
+      AttrSet.emplace_back(AttributeSet::get(C, *IdxI, B));
+      ++I;
+      ++IdxI;
+    }
+  }
+
+  while (I != E)
+    AttrSet.emplace_back(getSlotAttributes(I++));
+
+  while (IdxI != IdxE)
+    AttrSet.emplace_back(AttributeSet::get(C, std::make_pair(*IdxI++, A)));
+
+  return get(C, AttrSet);
+}
+
 AttributeSet AttributeSet::addAttributes(LLVMContext &C, unsigned Index,
                                          AttributeSet Attrs) const {
   if (!pImpl) return Attrs;
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 26088bbe018a..d2341c8c05db 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1942,20 +1942,27 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
   // Mark any parameters that are known to be non-null with the nonnull
   // attribute.  This is helpful for inlining calls to functions with null
   // checks on their arguments.
+  SmallVector<unsigned, 4> Indices;
   unsigned ArgNo = 0;
+
   for (Value *V : CS.args()) {
     if (V->getType()->isPointerTy() && !CS.paramHasAttr(ArgNo+1, Attribute::NonNull) &&
-        isKnownNonNullAt(V, CS.getInstruction(), DT, TLI)) {
-      AttributeSet AS = CS.getAttributes();
-      AS = AS.addAttribute(CS.getInstruction()->getContext(), ArgNo+1,
-                           Attribute::NonNull);
-      CS.setAttributes(AS);
-      Changed = true;
-    }
+        isKnownNonNullAt(V, CS.getInstruction(), DT, TLI))
+      Indices.push_back(ArgNo + 1);
     ArgNo++;
   }
+
   assert(ArgNo == CS.arg_size() && "sanity check");
 
+  if (!Indices.empty()) {
+    AttributeSet AS = CS.getAttributes();
+    LLVMContext &Ctx = CS.getInstruction()->getContext();
+    AS = AS.addAttribute(Ctx, Indices,
+                         Attribute::get(Ctx, Attribute::NonNull));
+    CS.setAttributes(AS);
+    Changed = true;
+  }
+
   // If the callee is a pointer to a function, attempt to move any casts to the
   // arguments of the call/invoke.
   Value *Callee = CS.getCalledValue();
diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index d7e02b16a28e..686bd4071104 100644
--- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -307,27 +307,31 @@ bool CorrelatedValuePropagation::processSwitch(SwitchInst *SI) {
 /// processCallSite - Infer nonnull attributes for the arguments at the
 /// specified callsite.
 bool CorrelatedValuePropagation::processCallSite(CallSite CS) {
-  bool Changed = false;
-
+  SmallVector<unsigned, 4> Indices;
   unsigned ArgNo = 0;
+
   for (Value *V : CS.args()) {
     PointerType *Type = dyn_cast<PointerType>(V->getType());
 
     if (Type && !CS.paramHasAttr(ArgNo + 1, Attribute::NonNull) &&
         LVI->getPredicateAt(ICmpInst::ICMP_EQ, V,
                             ConstantPointerNull::get(Type),
-                            CS.getInstruction()) == LazyValueInfo::False) {
-      AttributeSet AS = CS.getAttributes();
-      AS = AS.addAttribute(CS.getInstruction()->getContext(), ArgNo + 1,
-                           Attribute::NonNull);
-      CS.setAttributes(AS);
-      Changed = true;
-    }
+                            CS.getInstruction()) == LazyValueInfo::False)
+      Indices.push_back(ArgNo + 1);
     ArgNo++;
   }
+
   assert(ArgNo == CS.arg_size() && "sanity check");
 
-  return Changed;
+  if (Indices.empty())
+    return false;
+
+  AttributeSet AS = CS.getAttributes();
+  LLVMContext &Ctx = CS.getInstruction()->getContext();
+  AS = AS.addAttribute(Ctx, Indices, Attribute::get(Ctx, Attribute::NonNull));
+  CS.setAttributes(AS);
+
+  return true;
 }
 
 Constant *CorrelatedValuePropagation::getConstantAt(Value *V, Instruction *At) {

From cf0403d373db0ec717f6f2678fc97dcb931ff114 Mon Sep 17 00:00:00 2001
From: David Blaikie <dblaikie@gmail.com>
Date: Wed, 2 Dec 2015 07:09:26 +0000
Subject: [PATCH 128/186] [llvm-dwp] Don't rely on implicit move assignment
 operator (MSVC won't synthesize one)

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254492 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/DebugInfo/DWARF/DWARFUnitIndex.cpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp b/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp
index e8e7441d9769..1f1921649b57 100644
--- a/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp
+++ b/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp
@@ -31,8 +31,13 @@ void DWARFUnitIndex::Header::dump(raw_ostream &OS) const {
 
 bool DWARFUnitIndex::parse(DataExtractor IndexData) {
   bool b = parseImpl(IndexData);
-  if (!b)
-    *this = DWARFUnitIndex(InfoColumnKind);
+  if (!b) {
+    // Make sure we don't try to dump anything
+    Header.NumBuckets = 0;
+    // Release any partially initialized data.
+    ColumnKinds.reset();
+    Rows.reset();
+  }
   return b;
 }
 

From d4f2260f2d8164e6a78021386622fe146e34c5d2 Mon Sep 17 00:00:00 2001
From: Asaf Badouh <asaf.badouh@intel.com>
Date: Wed, 2 Dec 2015 08:17:51 +0000
Subject: [PATCH 129/186] [X86][AVX512] add comi with Sae add
 builtin_ia32_vcomisd and builtin_ia32_vcomisd

Differential Revision: http://reviews.llvm.org/D14331



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254493 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IR/IntrinsicsX86.td        |   6 ++
 lib/Target/X86/X86ISelLowering.cpp      |  18 ++++
 lib/Target/X86/X86InstrAVX512.td        |  23 +++++
 lib/Target/X86/X86InstrFragmentsSIMD.td |   4 +
 lib/Target/X86/X86IntrinsicsInfo.h      |  93 ++++++++++++++++-
 test/CodeGen/X86/avx512-intrinsics.ll   |  75 ++++++++++++++
 test/MC/X86/avx512-encodings.s          | 127 ++++++++++++++++++++++++
 7 files changed, 345 insertions(+), 1 deletion(-)

diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index 7ec0f4b86f11..57ad278a68bd 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -6304,6 +6304,12 @@ let TargetPrefix = "x86" in {
 // Compares
 let TargetPrefix = "x86" in {
   // 512-bit
+  def int_x86_avx512_vcomi_sd : GCCBuiltin<"__builtin_ia32_vcomisd">,
+              Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty,
+                         llvm_v2f64_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_vcomi_ss : GCCBuiltin<"__builtin_ia32_vcomiss">,
+              Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty,
+                         llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_pcmpeq_b_512 : GCCBuiltin<"__builtin_ia32_pcmpeqb512_mask">,
         Intrinsic<[llvm_i64_ty], [llvm_v64i8_ty, llvm_v64i8_ty, llvm_i64_ty],
                   [IntrNoMem]>;
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 5782284d46f2..fb990e7499e2 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -16528,6 +16528,24 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
                                   DAG.getConstant(X86CC, dl, MVT::i8), Cond);
       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
     }
+    case COMI_RM: { // Comparison intrinsics with Sae
+      SDValue LHS = Op.getOperand(1);
+      SDValue RHS = Op.getOperand(2);
+      SDValue CC = Op.getOperand(3);
+      SDValue Sae = Op.getOperand(4);
+      auto ComiType = TranslateX86ConstCondToX86CC(CC);
+      // choose between ordered and unordered (comi/ucomi)
+      unsigned comiOp = std::get<0>(ComiType) ? IntrData->Opc0 : IntrData->Opc1;
+      SDValue Cond;
+      if (cast<ConstantSDNode>(Sae)->getZExtValue() !=
+                                           X86::STATIC_ROUNDING::CUR_DIRECTION)
+        Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS, Sae);
+      else
+        Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS);
+      SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+        DAG.getConstant(std::get<1>(ComiType), dl, MVT::i8), Cond);
+      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+    }
     case VSHIFT:
       return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
                                  Op.getOperand(1), Op.getOperand(2), DAG);
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 1dfd8d4510f5..d15d0dc96e6f 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -5605,6 +5605,29 @@ let Predicates = [HasAVX512] in {
                         EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
   }
 }
+
+//  Unordered/Ordered scalar fp compare with Sea and set EFLAGS
+multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _, SDNode OpNode,
+                            string OpcodeStr> {
+  def rb: AVX512<opc, MRMSrcReg, (outs), (ins _.RC:$src1, _.RC:$src2),
+                 !strconcat(OpcodeStr, "\t{{sae}, $src2, $src1|$src1, $src2, {sae}}"),
+                 [(set EFLAGS, (OpNode (_.VT _.RC:$src1), _.RC:$src2, 
+                                                        (i32 FROUND_NO_EXC)))],
+                 IIC_SSE_COMIS_RR>, EVEX, EVEX_B, VEX_LIG, EVEX_V128,
+                 Sched<[WriteFAdd]>;
+}
+
+let Defs = [EFLAGS], Predicates = [HasAVX512] in {
+  defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, X86ucomiSae, "vucomiss">,
+                                   AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>;
+  defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, X86ucomiSae, "vucomisd">,
+                                   AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>;
+  defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, X86comiSae, "vcomiss">,
+                                   AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>;
+  defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, X86comiSae, "vcomisd">,
+                                   AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>;
+}
+
 let Defs = [EFLAGS], Predicates = [HasAVX512] in {
   defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86cmp, f32, f32mem, loadf32,
                                  "ucomiss">, PS, EVEX, VEX_LIG,
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 25f247e9d620..b456460a5bb5 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -38,6 +38,8 @@ def bc_mmx  : PatFrag<(ops node:$in), (x86mmx  (bitconvert node:$in))>;
 def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>,
                                        SDTCisFP<1>, SDTCisVT<3, i8>,
                                        SDTCisVec<1>]>;
+def SDTX86CmpTestSae : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, 
+                                     SDTCisSameAs<1, 2>, SDTCisInt<3>]>;
 
 def X86fmin    : SDNode<"X86ISD::FMIN",      SDTFPBinOp>;
 def X86fmax    : SDNode<"X86ISD::FMAX",      SDTFPBinOp>;
@@ -66,7 +68,9 @@ def X86fhsub   : SDNode<"X86ISD::FHSUB",     SDTFPBinOp>;
 def X86hadd    : SDNode<"X86ISD::HADD",      SDTIntBinOp>;
 def X86hsub    : SDNode<"X86ISD::HSUB",      SDTIntBinOp>;
 def X86comi    : SDNode<"X86ISD::COMI",      SDTX86CmpTest>;
+def X86comiSae : SDNode<"X86ISD::COMI",      SDTX86CmpTestSae>;
 def X86ucomi   : SDNode<"X86ISD::UCOMI",     SDTX86CmpTest>;
+def X86ucomiSae: SDNode<"X86ISD::UCOMI",     SDTX86CmpTestSae>;
 def X86cmps    : SDNode<"X86ISD::FSETCC",     SDTX86Cmps>;
 //def X86cmpsd   : SDNode<"X86ISD::FSETCCsd",    SDTX86Cmpsd>;
 def X86cvtdq2pd: SDNode<"X86ISD::CVTDQ2PD",
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 80e55d04c1f3..bb2f7248b0e9 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -20,7 +20,7 @@ enum IntrinsicType {
   INTR_NO_TYPE,
   GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, FPCLASS, FPCLASSS,
   INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_2OP_IMM8, INTR_TYPE_3OP, INTR_TYPE_4OP,
-  CMP_MASK, CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, VSHIFT_MASK, COMI,
+  CMP_MASK, CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, VSHIFT_MASK, COMI, COMI_RM,
   INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM,
   INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, INTR_TYPE_2OP_IMM8_MASK,
   INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_MASK_RM, INTR_TYPE_3OP_IMM8_MASK,
@@ -1630,6 +1630,8 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
   X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0),
   X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0),
+  X86_INTRINSIC_DATA(avx512_vcomi_sd, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
+  X86_INTRINSIC_DATA(avx512_vcomi_ss, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
   X86_INTRINSIC_DATA(avx_hadd_pd_256,   INTR_TYPE_2OP, X86ISD::FHADD, 0),
   X86_INTRINSIC_DATA(avx_hadd_ps_256,   INTR_TYPE_2OP, X86ISD::FHADD, 0),
   X86_INTRINSIC_DATA(avx_hsub_pd_256,   INTR_TYPE_2OP, X86ISD::FHSUB, 0),
@@ -1821,6 +1823,95 @@ static void verifyIntrinsicTables() {
          "Intrinsic data tables should have unique entries");
 }
 
+// X86 specific compare constants.
+// They must be kept in synch with avxintrin.h
+#define _X86_CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
+#define _X86_CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
+#define _X86_CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
+#define _X86_CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
+#define _X86_CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
+#define _X86_CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
+#define _X86_CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
+#define _X86_CMP_ORD_Q    0x07 /* Ordered (nonsignaling)   */
+#define _X86_CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
+#define _X86_CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling)  */
+#define _X86_CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
+#define _X86_CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
+#define _X86_CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
+#define _X86_CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
+#define _X86_CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
+#define _X86_CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
+#define _X86_CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
+#define _X86_CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
+#define _X86_CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
+#define _X86_CMP_UNORD_S  0x13 /* Unordered (signaling)  */
+#define _X86_CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
+#define _X86_CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
+#define _X86_CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
+#define _X86_CMP_ORD_S    0x17 /* Ordered (signaling)  */
+#define _X86_CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
+#define _X86_CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign)  */
+#define _X86_CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
+#define _X86_CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
+#define _X86_CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
+#define _X86_CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
+#define _X86_CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
+#define _X86_CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
+
+/*
+* Get comparison modifier from _mm_comi_round_sd/ss intrinsic
+* Return tuple <isOrdered, X86 condcode>
+*/
+static std::tuple<bool,unsigned> TranslateX86ConstCondToX86CC(SDValue &imm) {
+  ConstantSDNode *CImm = dyn_cast<ConstantSDNode>(imm);
+  unsigned IntImm = CImm->getZExtValue();
+  // On a floating point condition, the flags are set as follows:
+  // ZF  PF  CF   op
+  //  0 | 0 | 0 | X > Y
+  //  0 | 0 | 1 | X < Y
+  //  1 | 0 | 0 | X == Y
+  //  1 | 1 | 1 | unordered
+  switch (IntImm) {
+  default: llvm_unreachable("Invalid floating point compare value for Comi!");
+  case _X86_CMP_EQ_OQ:      // 0x00 - Equal (ordered, nonsignaling)
+  case _X86_CMP_EQ_OS:      // 0x10 - Equal (ordered, signaling)
+    return std::make_tuple(true, X86::COND_E);
+  case _X86_CMP_EQ_UQ:      // 0x08 - Equal (unordered, non-signaling)
+  case _X86_CMP_EQ_US:      // 0x18 - Equal (unordered, signaling)
+    return std::make_tuple(false , X86::COND_E);
+  case _X86_CMP_LT_OS:      // 0x01 - Less-than (ordered, signaling)
+  case _X86_CMP_LT_OQ:      // 0x11 - Less-than (ordered, nonsignaling)
+    return std::make_tuple(true, X86::COND_B);
+  case _X86_CMP_NGE_US:     // 0x09 - Not-greater-than-or-equal (unordered, signaling)
+  case _X86_CMP_NGE_UQ:     // 0x19 - Not-greater-than-or-equal (unordered, nonsignaling)
+    return std::make_tuple(false , X86::COND_B);
+  case _X86_CMP_LE_OS:      // 0x02 - Less-than-or-equal (ordered, signaling)
+  case _X86_CMP_LE_OQ:      // 0x12 - Less-than-or-equal (ordered, nonsignaling)
+    return std::make_tuple(true, X86::COND_BE);
+  case _X86_CMP_NGT_US:     // 0x0A - Not-greater-than (unordered, signaling)
+  case _X86_CMP_NGT_UQ:     // 0x1A - Not-greater-than (unordered, nonsignaling)
+    return std::make_tuple(false, X86::COND_BE);
+  case _X86_CMP_GT_OS:      // 0x0E - Greater-than (ordered, signaling)
+  case _X86_CMP_GT_OQ:      // 0x1E - Greater-than (ordered, nonsignaling)
+    return std::make_tuple(true, X86::COND_A);
+  case _X86_CMP_NLE_US:     // 0x06 - Not-less-than-or-equal (unordered,signaling)
+  case _X86_CMP_NLE_UQ:     // 0x16 - Not-less-than-or-equal (unordered, nonsignaling)
+    return std::make_tuple(false, X86::COND_A);
+  case _X86_CMP_GE_OS:      // 0x0D - Greater-than-or-equal (ordered, signaling)
+  case _X86_CMP_GE_OQ:      // 0x1D - Greater-than-or-equal (ordered, nonsignaling)
+    return std::make_tuple(true, X86::COND_AE);
+  case _X86_CMP_NLT_US:     // 0x05 - Not-less-than (unordered, signaling)
+  case _X86_CMP_NLT_UQ:     // 0x15 - Not-less-than (unordered, nonsignaling)
+    return std::make_tuple(false, X86::COND_AE);
+  case _X86_CMP_NEQ_OQ:     // 0x0C - Not-equal (ordered, non-signaling)
+  case _X86_CMP_NEQ_OS:     // 0x1C - Not-equal (ordered, signaling)
+    return std::make_tuple(true, X86::COND_NE);
+  case _X86_CMP_NEQ_UQ:     // 0x04 - Not-equal (unordered, nonsignaling)
+  case _X86_CMP_NEQ_US:     // 0x14 - Not-equal (unordered, signaling)
+    return std::make_tuple(false, X86::COND_NE);
+  }
+}
+
 } // End llvm namespace
 
 #endif
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index ea0394c54854..c47027eed2b2 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -6159,3 +6159,78 @@ define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x
   ret <8 x double> %res4
 }
 
+define i32 @test_x86_avx512_comi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) {
+; CHECK-LABEL: test_x86_avx512_comi_sd_eq_sae
+; CHECK: vcomisd {sae}, %xmm1, %xmm0
+; CHECK-NEXT: sete	%al
+  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 8) 
+  ret i32 %res
+}
+
+define i32 @test_x86_avx512_ucomi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) {
+; CHECK-LABEL: test_x86_avx512_ucomi_sd_eq_sae
+; CHECK: vucomisd {sae}, %xmm1, %xmm0
+; CHECK-NEXT: sete %al
+  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 8) 
+  ret i32 %res
+}
+
+define i32 @test_x86_avx512_comi_sd_eq(<2 x double> %a0, <2 x double> %a1) {
+; CHECK-LABEL: test_x86_avx512_comi_sd_eq
+; CHECK: vcomisd %xmm1, %xmm0
+; CHECK-NEXT: sete %al
+  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 4) 
+  ret i32 %res
+}
+
+define i32 @test_x86_avx512_ucomi_sd_eq(<2 x double> %a0, <2 x double> %a1) {
+; CHECK-LABEL: test_x86_avx512_ucomi_sd_eq
+; CHECK: vucomisd %xmm1, %xmm0
+; CHECK-NEXT: sete %al
+  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 4) 
+  ret i32 %res
+}
+
+define i32 @test_x86_avx512_comi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) {
+; CHECK-LABEL: test_x86_avx512_comi_sd_lt_sae
+; CHECK: vcomisd {sae}, %xmm1, %xmm0
+; CHECK-NEXT: sbbl %eax, %eax
+  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 8) 
+  ret i32 %res
+}
+
+define i32 @test_x86_avx512_ucomi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) {
+; CHECK-LABEL: test_x86_avx512_ucomi_sd_lt_sae
+; CHECK: vucomisd {sae}, %xmm1, %xmm0
+; CHECK-NEXT: sbbl %eax, %eax
+  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 8) 
+  ret i32 %res
+}
+
+define i32 @test_x86_avx512_comi_sd_lt(<2 x double> %a0, <2 x double> %a1) {
+; CHECK-LABEL: test_x86_avx512_comi_sd_lt
+; CHECK: vcomisd %xmm1, %xmm0
+; CHECK-NEXT: sbbl %eax, %eax
+  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 4) 
+  ret i32 %res
+}
+
+define i32 @test_x86_avx512_ucomi_sd_lt(<2 x double> %a0, <2 x double> %a1) {
+; CHECK-LABEL: test_x86_avx512_ucomi_sd_lt
+; CHECK: vucomisd %xmm1, %xmm0
+; CHECK-NEXT: sbbl %eax, %eax
+  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 4) 
+  ret i32 %res
+}
+
+declare i32 @llvm.x86.avx512.vcomi.sd(<2 x double>, <2 x double>, i32, i32) 
+
+define i32 @test_x86_avx512_ucomi_ss_lt(<4 x float> %a0, <4 x float> %a1) {
+; CHECK-LABEL: test_x86_avx512_ucomi_ss_lt
+; CHECK: vucomiss %xmm1, %xmm0
+; CHECK-NEXT: sbbl %eax, %eax
+  %res = call i32 @llvm.x86.avx512.vcomi.ss(<4 x float> %a0, <4 x float> %a1, i32 9, i32 4) 
+  ret i32 %res
+}
+
+declare i32 @llvm.x86.avx512.vcomi.ss(<4 x float>, <4 x float>, i32, i32)
diff --git a/test/MC/X86/avx512-encodings.s b/test/MC/X86/avx512-encodings.s
index 7ab94e729611..2043100bf3e6 100644
--- a/test/MC/X86/avx512-encodings.s
+++ b/test/MC/X86/avx512-encodings.s
@@ -19093,3 +19093,130 @@ vpermilpd $0x23, 0x400(%rbx), %zmm2
 // CHECK:  encoding: [0x62,0x41,0xfd,0x08,0xd6,0xc4]
           vmovq.s  %xmm24, %xmm12
 
+// CHECK: vcomisd %xmm21, %xmm23
+// CHECK:  encoding: [0x62,0xa1,0xfd,0x08,0x2f,0xfd]
+          vcomisd %xmm21, %xmm23
+
+// CHECK: vcomisd {sae}, %xmm21, %xmm23
+// CHECK:  encoding: [0x62,0xa1,0xfd,0x18,0x2f,0xfd]
+          vcomisd {sae}, %xmm21, %xmm23
+
+// CHECK: vcomisd (%rcx), %xmm23
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x08,0x2f,0x39]
+          vcomisd (%rcx), %xmm23
+
+// CHECK: vcomisd 291(%rax,%r14,8), %xmm23
+// CHECK:  encoding: [0x62,0xa1,0xfd,0x08,0x2f,0xbc,0xf0,0x23,0x01,0x00,0x00]
+          vcomisd 291(%rax,%r14,8), %xmm23
+
+// CHECK: vcomisd 1016(%rdx), %xmm23
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x08,0x2f,0x7a,0x7f]
+          vcomisd 1016(%rdx), %xmm23
+
+// CHECK: vcomisd 1024(%rdx), %xmm23
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x08,0x2f,0xba,0x00,0x04,0x00,0x00]
+          vcomisd 1024(%rdx), %xmm23
+
+// CHECK: vcomisd -1024(%rdx), %xmm23
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x08,0x2f,0x7a,0x80]
+          vcomisd -1024(%rdx), %xmm23
+
+// CHECK: vcomisd -1032(%rdx), %xmm23
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x08,0x2f,0xba,0xf8,0xfb,0xff,0xff]
+          vcomisd -1032(%rdx), %xmm23
+
+// CHECK: vcomiss %xmm28, %xmm14
+// CHECK:  encoding: [0x62,0x11,0x7c,0x08,0x2f,0xf4]
+          vcomiss %xmm28, %xmm14
+
+// CHECK: vcomiss {sae}, %xmm28, %xmm14
+// CHECK:  encoding: [0x62,0x11,0x7c,0x18,0x2f,0xf4]
+          vcomiss {sae}, %xmm28, %xmm14
+
+// CHECK: vcomiss (%rcx), %xmm14
+// CHECK:  encoding: [0xc5,0x78,0x2f,0x31]
+          vcomiss (%rcx), %xmm14
+
+// CHECK: vcomiss 291(%rax,%r14,8), %xmm14
+// CHECK:  encoding: [0xc4,0x21,0x78,0x2f,0xb4,0xf0,0x23,0x01,0x00,0x00]
+          vcomiss 291(%rax,%r14,8), %xmm14
+
+// CHECK: vcomiss 508(%rdx), %xmm14
+// CHECK:  encoding: [0xc5,0x78,0x2f,0xb2,0xfc,0x01,0x00,0x00]
+          vcomiss 508(%rdx), %xmm14
+
+// CHECK: vcomiss 512(%rdx), %xmm14
+// CHECK:  encoding: [0xc5,0x78,0x2f,0xb2,0x00,0x02,0x00,0x00]
+          vcomiss 512(%rdx), %xmm14
+
+// CHECK: vcomiss -512(%rdx), %xmm14
+// CHECK:  encoding: [0xc5,0x78,0x2f,0xb2,0x00,0xfe,0xff,0xff]
+          vcomiss -512(%rdx), %xmm14
+
+// CHECK: vcomiss -516(%rdx), %xmm14
+// CHECK:  encoding: [0xc5,0x78,0x2f,0xb2,0xfc,0xfd,0xff,0xff]
+          vcomiss -516(%rdx), %xmm14
+
+// CHECK: vucomisd %xmm10, %xmm11
+// CHECK:  encoding: [0xc4,0x41,0x79,0x2e,0xda]
+          vucomisd %xmm10, %xmm11
+
+// CHECK: vucomisd {sae}, %xmm10, %xmm11
+// CHECK:  encoding: [0x62,0x51,0xfd,0x18,0x2e,0xda]
+          vucomisd {sae}, %xmm10, %xmm11
+
+// CHECK: vucomisd (%rcx), %xmm11
+// CHECK:  encoding: [0xc5,0x79,0x2e,0x19]
+          vucomisd (%rcx), %xmm11
+
+// CHECK: vucomisd 291(%rax,%r14,8), %xmm11
+// CHECK:  encoding: [0xc4,0x21,0x79,0x2e,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vucomisd 291(%rax,%r14,8), %xmm11
+
+// CHECK: vucomisd 1016(%rdx), %xmm11
+// CHECK:  encoding: [0xc5,0x79,0x2e,0x9a,0xf8,0x03,0x00,0x00]
+          vucomisd 1016(%rdx), %xmm11
+
+// CHECK: vucomisd 1024(%rdx), %xmm11
+// CHECK:  encoding: [0xc5,0x79,0x2e,0x9a,0x00,0x04,0x00,0x00]
+          vucomisd 1024(%rdx), %xmm11
+
+// CHECK: vucomisd -1024(%rdx), %xmm11
+// CHECK:  encoding: [0xc5,0x79,0x2e,0x9a,0x00,0xfc,0xff,0xff]
+          vucomisd -1024(%rdx), %xmm11
+
+// CHECK: vucomisd -1032(%rdx), %xmm11
+// CHECK:  encoding: [0xc5,0x79,0x2e,0x9a,0xf8,0xfb,0xff,0xff]
+          vucomisd -1032(%rdx), %xmm11
+
+// CHECK: vucomiss %xmm11, %xmm22
+// CHECK:  encoding: [0x62,0xc1,0x7c,0x08,0x2e,0xf3]
+          vucomiss %xmm11, %xmm22
+
+// CHECK: vucomiss {sae}, %xmm11, %xmm22
+// CHECK:  encoding: [0x62,0xc1,0x7c,0x18,0x2e,0xf3]
+          vucomiss {sae}, %xmm11, %xmm22
+
+// CHECK: vucomiss (%rcx), %xmm22
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x08,0x2e,0x31]
+          vucomiss (%rcx), %xmm22
+
+// CHECK: vucomiss 291(%rax,%r14,8), %xmm22
+// CHECK:  encoding: [0x62,0xa1,0x7c,0x08,0x2e,0xb4,0xf0,0x23,0x01,0x00,0x00]
+          vucomiss 291(%rax,%r14,8), %xmm22
+
+// CHECK: vucomiss 508(%rdx), %xmm22
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x08,0x2e,0x72,0x7f]
+          vucomiss 508(%rdx), %xmm22
+
+// CHECK: vucomiss 512(%rdx), %xmm22
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x08,0x2e,0xb2,0x00,0x02,0x00,0x00]
+          vucomiss 512(%rdx), %xmm22
+
+// CHECK: vucomiss -512(%rdx), %xmm22
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x08,0x2e,0x72,0x80]
+          vucomiss -512(%rdx), %xmm22
+
+// CHECK: vucomiss -516(%rdx), %xmm22
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x08,0x2e,0xb2,0xfc,0xfd,0xff,0xff]
+          vucomiss -516(%rdx), %xmm22

From cd9551564bc737c64e44a9d3742b470d32ca4fb9 Mon Sep 17 00:00:00 2001
From: Elena Demikhovsky <elena.demikhovsky@intel.com>
Date: Wed, 2 Dec 2015 08:59:47 +0000
Subject: [PATCH 130/186] AVX-512: Updated cost of FP/SINT/UINT conversion
 operations

I checked and updated the cost of AVX-512 conversion operations. Added cost of conversion operations in DQ mode.
Conversion of illegal types that requires vector split is not calculated right now (like for other X86 targets).

Differential Revision: http://reviews.llvm.org/D15074



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254494 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86TargetTransformInfo.cpp |  72 +++-
 test/Analysis/CostModel/X86/cast.ll       |  38 +-
 test/Analysis/CostModel/X86/sitofp.ll     | 340 +++++++++---------
 test/Analysis/CostModel/X86/uitofp.ll     | 418 +++++++++++++---------
 4 files changed, 493 insertions(+), 375 deletions(-)

diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index c648750b202e..cf7a826ea85d 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -528,17 +528,31 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
-  static const TypeConversionCostTblEntry AVX512ConversionTbl[] = {
+  static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
+    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  1 },
+    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  1 },
+    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  1 },
+    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  1 },
+    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  1 },
+    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i64,  1 },
+
+    { ISD::FP_TO_UINT,  MVT::v2i64, MVT::v2f64, 1 },
+    { ISD::FP_TO_UINT,  MVT::v4i64, MVT::v4f64, 1 },
+    { ISD::FP_TO_UINT,  MVT::v8i64, MVT::v8f64, 1 },
+    { ISD::FP_TO_UINT,  MVT::v2i64, MVT::v2f32, 1 },
+    { ISD::FP_TO_UINT,  MVT::v4i64, MVT::v4f32, 1 },
+    { ISD::FP_TO_UINT,  MVT::v8i64, MVT::v8f32, 1 },
+  };
+
+  static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
     { ISD::FP_EXTEND, MVT::v8f64,   MVT::v8f32,  1 },
     { ISD::FP_EXTEND, MVT::v8f64,   MVT::v16f32, 3 },
     { ISD::FP_ROUND,  MVT::v8f32,   MVT::v8f64,  1 },
-    { ISD::FP_ROUND,  MVT::v16f32,  MVT::v8f64,  3 },
 
     { ISD::TRUNCATE,  MVT::v16i8,   MVT::v16i32, 1 },
     { ISD::TRUNCATE,  MVT::v16i16,  MVT::v16i32, 1 },
     { ISD::TRUNCATE,  MVT::v8i16,   MVT::v8i64,  1 },
     { ISD::TRUNCATE,  MVT::v8i32,   MVT::v8i64,  1 },
-    { ISD::TRUNCATE,  MVT::v16i32,  MVT::v8i64,  4 },
 
     // v16i1 -> v16i32 - load + broadcast
     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1,  2 },
@@ -548,16 +562,46 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8,  1 },
     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
-    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v16i32, 3 },
-    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v16i32, 3 },
+    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i32,  1 },
+    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i32,  1 },
+    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i16,  1 },
+    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i16,  1 },
 
     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i1,  3 },
     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i8,  2 },
     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 2 },
     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 1 },
     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i1,   4 },
+    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i8,   2 },
     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i16,  2 },
     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  1 },
+
+    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i1,  3 },
+    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i8,  2 },
+    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 2 },
+    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 1 },
+    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  1 },
+    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  1 },
+    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i1,   4 },
+    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i16,  2 },
+    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  1 },
+    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i8,   2 },
+    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i8,   2 },
+    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  2 },
+    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i8,   2 },
+    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i16,  2 },
+    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  1 },
+    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i8,   2 },
+    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i16,  5 },
+    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  2 },
+    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  5 },
+    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64, 12 },
+    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64, 26 },
+
+    { ISD::FP_TO_UINT,  MVT::v2i32,  MVT::v2f32,  1 },
+    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  1 },
+    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  1 },
+    { ISD::FP_TO_UINT,  MVT::v16i32, MVT::v16f32, 1 },
   };
 
   static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
@@ -693,12 +737,6 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
       return LTSrc.first * Entry->Cost;
   }
 
-  if (ST->hasAVX512()) {
-    if (const auto *Entry = ConvertCostTableLookup(AVX512ConversionTbl, ISD,
-                                                   LTDest.second, LTSrc.second))
-      return Entry->Cost;
-  }
-
   EVT SrcTy = TLI->getValueType(DL, Src);
   EVT DstTy = TLI->getValueType(DL, Dst);
 
@@ -706,6 +744,18 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
   if (!SrcTy.isSimple() || !DstTy.isSimple())
     return BaseT::getCastInstrCost(Opcode, Dst, Src);
 
+  if (ST->hasDQI())
+    if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
+                                                   DstTy.getSimpleVT(),
+                                                   SrcTy.getSimpleVT()))
+      return Entry->Cost;
+
+  if (ST->hasAVX512())
+    if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
+                                                   DstTy.getSimpleVT(),
+                                                   SrcTy.getSimpleVT()))
+      return Entry->Cost;
+
   if (ST->hasAVX2()) {
     if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
                                                    DstTy.getSimpleVT(),
diff --git a/test/Analysis/CostModel/X86/cast.ll b/test/Analysis/CostModel/X86/cast.ll
index fb16af635f07..c518587c0e1a 100644
--- a/test/Analysis/CostModel/X86/cast.ll
+++ b/test/Analysis/CostModel/X86/cast.ll
@@ -84,11 +84,11 @@ define i32 @zext_sext(<8 x i1> %in) {
   ;CHECK-AVX: cost of 4 {{.*}} zext
   %D = zext <4 x i32> undef to <4 x i64>
 
-  ;CHECK-AVX512: cost of 3 {{.*}} %D1 = zext
-  %D1 = zext <16 x i32> undef to <16 x i64>
+  ;CHECK-AVX512: cost of 1 {{.*}} %D1 = zext
+  %D1 = zext <8 x i32> undef to <8 x i64>
 
-  ;CHECK-AVX512: cost of 3 {{.*}} %D2 = sext
-  %D2 = sext <16 x i32> undef to <16 x i64>
+  ;CHECK-AVX512: cost of 1 {{.*}} %D2 = sext
+  %D2 = sext <8 x i32> undef to <8 x i64>
 
   ;CHECK-AVX512: cost of 1 {{.*}} %D3 = zext
   %D3 = zext <16 x i16> undef to <16 x i32>
@@ -118,9 +118,11 @@ define i32 @zext_sext(<8 x i1> %in) {
   ;CHECK_AVX512: cost of 1 {{.*}} G = trunc
   %G = trunc <8 x i64> undef to <8 x i32>
 
-  ;CHECK-AVX512: cost of 4 {{.*}} %G1 = trunc
-  %G1 = trunc <16 x i64> undef to <16 x i32>
+  ;CHECK-AVX512: cost of 1 {{.*}} %G1 = trunc
+  %G1 = trunc <16 x i32> undef to <16 x i16>
 
+  ;CHECK-AVX512: cost of 1 {{.*}} %G2 = trunc
+  %G2 = trunc <16 x i32> undef to <16 x i8>
   ret i32 undef
 }
 
@@ -207,38 +209,40 @@ define void @uitofp4(<4 x i1> %a, <4 x i8> %b, <4 x i16> %c, <4 x i32> %d) {
   ; CHECK: cost of 2 {{.*}} uitofp
   %C2 = uitofp <4 x i16> %c to <4 x double>
 
-  ; CHECK: cost of 6 {{.*}} uitofp
+  ; CHECK-AVX2: cost of 6 {{.*}} uitofp
   %D1 = uitofp <4 x i32> %d to <4 x float>
-  ; CHECK: cost of 6 {{.*}} uitofp
+  ; CHECK-AVX2: cost of 6 {{.*}} uitofp
   %D2 = uitofp <4 x i32> %d to <4 x double>
   ret void
 }
 
 define void @uitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) {
 ; CHECK-LABEL: for function 'uitofp8'
-  ; CHECK: cost of 6 {{.*}} uitofp
+  ; CHECK-AVX2: cost of 6 {{.*}} uitofp
   %A1 = uitofp <8 x i1> %a to <8 x float>
 
-  ; CHECK: cost of 5 {{.*}} uitofp
+  ; CHECK-AVX2: cost of 5 {{.*}} uitofp
+  ; CHECK-AVX512: cost of 2 {{.*}} uitofp
   %B1 = uitofp <8 x i8> %b to <8 x float>
 
-  ; CHECK: cost of 5 {{.*}} uitofp
+  ; CHECK-AVX2: cost of 5 {{.*}} uitofp
+  ; CHECK-AVX512: cost of 2 {{.*}} uitofp
   %C1 = uitofp <8 x i16> %c to <8 x float>
 
   ; CHECK-AVX2: cost of 8 {{.*}} uitofp
-  ; CHECK-AVX512: cost of 8 {{.*}} uitofp
+  ; CHECK-AVX512: cost of 1 {{.*}} uitofp
   ; CHECK-AVX: cost of 9 {{.*}} uitofp
   %D1 = uitofp <8 x i32> %d to <8 x float>
   ret void
 }
 
-define void @fp_conv(<8 x float> %a, <16 x float>%b) {
+define void @fp_conv(<8 x float> %a, <16 x float>%b, <4 x float> %c) {
 ;CHECK-LABEL: for function 'fp_conv'
   ; CHECK-AVX512: cost of 1 {{.*}} fpext
   %A1 = fpext <8 x float> %a to <8 x double>
 
-  ; CHECK-AVX512: cost of 3 {{.*}} fpext
-  %A2 = fpext <16 x float> %b to <16 x double>
+  ; CHECK-AVX512: cost of 1 {{.*}} fpext
+  %A2 = fpext <4 x float> %c to <4 x double>
 
   ; CHECK-AVX2:   cost of 3 {{.*}} %A3 = fpext
   ; CHECK-AVX512: cost of 1 {{.*}} %A3 = fpext
@@ -248,7 +252,7 @@ define void @fp_conv(<8 x float> %a, <16 x float>%b) {
   ; CHECK-AVX512: cost of 1 {{.*}} %A4 = fptrunc
   %A4 = fptrunc <8 x double> undef to <8 x float>
 
-  ; CHECK-AVX512: cost of 3 {{.*}} %A5 = fptrunc
-  %A5 = fptrunc <16 x double> undef to <16 x float>
+  ; CHECK-AVX512: cost of 1 {{.*}} %A5 = fptrunc
+  %A5 = fptrunc <4 x double> undef to <4 x float>
   ret void
 }
diff --git a/test/Analysis/CostModel/X86/sitofp.ll b/test/Analysis/CostModel/X86/sitofp.ll
index dcd0088d0df7..9913a4896912 100644
--- a/test/Analysis/CostModel/X86/sitofp.ll
+++ b/test/Analysis/CostModel/X86/sitofp.ll
@@ -4,656 +4,656 @@
 ; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx512f -cost-model -analyze < %s | FileCheck --check-prefix=AVX512F %s
 
 define <2 x double> @sitofpv2i8v2double(<2 x i8> %a) {
-  ; SSE2: sitofpv2i8v2double
+  ; SSE2-LABEL: sitofpv2i8v2double
   ; SSE2: cost of 20 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv2i8v2double
+  ; AVX1-LABEL: sitofpv2i8v2double
   ; AVX1: cost of 4 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv2i8v2double
+  ; AVX2-LABEL: sitofpv2i8v2double
   ; AVX2: cost of 4 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv2i8v2double
+  ; AVX512F-LABEL: sitofpv2i8v2double
   ; AVX512F: cost of 4 {{.*}} sitofp
   %1 = sitofp <2 x i8> %a to <2 x double>
   ret <2 x double> %1
 }
 
 define <4 x double> @sitofpv4i8v4double(<4 x i8> %a) {
-  ; SSE2: sitofpv4i8v4double
+  ; SSE2-LABEL: sitofpv4i8v4double
   ; SSE2: cost of 40 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv4i8v4double
+  ; AVX1-LABEL: sitofpv4i8v4double
   ; AVX1: cost of 3 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv4i8v4double
+  ; AVX2-LABEL: sitofpv4i8v4double
   ; AVX2: cost of 3 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv4i8v4double
+  ; AVX512F-LABEL: sitofpv4i8v4double
   ; AVX512F: cost of 3 {{.*}} sitofp
   %1 = sitofp <4 x i8> %a to <4 x double>
   ret <4 x double> %1
 }
 
 define <8 x double> @sitofpv8i8v8double(<8 x i8> %a) {
-  ; SSE2: sitofpv8i8v8double
+  ; SSE2-LABEL: sitofpv8i8v8double
   ; SSE2: cost of 80 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv8i8v8double
+  ; AVX1-LABEL: sitofpv8i8v8double
   ; AVX1: cost of 20 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv8i8v8double
+  ; AVX2-LABEL: sitofpv8i8v8double
   ; AVX2: cost of 20 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv8i8v8double
+  ; AVX512F-LABEL: sitofpv8i8v8double
   ; AVX512F: cost of 2 {{.*}} sitofp
   %1 = sitofp <8 x i8> %a to <8 x double>
   ret <8 x double> %1
 }
 
 define <16 x double> @sitofpv16i8v16double(<16 x i8> %a) {
-  ; SSE2: sitofpv16i8v16double
+  ; SSE2-LABEL: sitofpv16i8v16double
   ; SSE2: cost of 160 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv16i8v16double
+  ; AVX1-LABEL: sitofpv16i8v16double
   ; AVX1: cost of 40 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv16i8v16double
+  ; AVX2-LABEL: sitofpv16i8v16double
   ; AVX2: cost of 40 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv16i8v16double
+  ; AVX512F-LABEL: sitofpv16i8v16double
   ; AVX512F: cost of 44 {{.*}} sitofp
   %1 = sitofp <16 x i8> %a to <16 x double>
   ret <16 x double> %1
 }
 
 define <32 x double> @sitofpv32i8v32double(<32 x i8> %a) {
-  ; SSE2: sitofpv32i8v32double
+  ; SSE2-LABEL: sitofpv32i8v32double
   ; SSE2: cost of 320 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv32i8v32double
+  ; AVX1-LABEL: sitofpv32i8v32double
   ; AVX1: cost of 80 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv32i8v32double
+  ; AVX2-LABEL: sitofpv32i8v32double
   ; AVX2: cost of 80 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv32i8v32double
+  ; AVX512F-LABEL: sitofpv32i8v32double
   ; AVX512F: cost of 88 {{.*}} sitofp
   %1 = sitofp <32 x i8> %a to <32 x double>
   ret <32 x double> %1
 }
 
 define <2 x double> @sitofpv2i16v2double(<2 x i16> %a) {
-  ; SSE2: sitofpv2i16v2double
+  ; SSE2-LABEL: sitofpv2i16v2double
   ; SSE2: cost of 20 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv2i16v2double
+  ; AVX1-LABEL: sitofpv2i16v2double
   ; AVX1: cost of 4 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv2i16v2double
+  ; AVX2-LABEL: sitofpv2i16v2double
   ; AVX2: cost of 4 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv2i16v2double
+  ; AVX512F-LABEL: sitofpv2i16v2double
   ; AVX512F: cost of 4 {{.*}} sitofp
   %1 = sitofp <2 x i16> %a to <2 x double>
   ret <2 x double> %1
 }
 
 define <4 x double> @sitofpv4i16v4double(<4 x i16> %a) {
-  ; SSE2: sitofpv4i16v4double
+  ; SSE2-LABEL: sitofpv4i16v4double
   ; SSE2: cost of 40 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv4i16v4double
+  ; AVX1-LABEL: sitofpv4i16v4double
   ; AVX1: cost of 3 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv4i16v4double
+  ; AVX2-LABEL: sitofpv4i16v4double
   ; AVX2: cost of 3 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv4i16v4double
+  ; AVX512F-LABEL: sitofpv4i16v4double
   ; AVX512F: cost of 3 {{.*}} sitofp
   %1 = sitofp <4 x i16> %a to <4 x double>
   ret <4 x double> %1
 }
 
 define <8 x double> @sitofpv8i16v8double(<8 x i16> %a) {
-  ; SSE2: sitofpv8i16v8double
+  ; SSE2-LABEL: sitofpv8i16v8double
   ; SSE2: cost of 80 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv8i16v8double
+  ; AVX1-LABEL: sitofpv8i16v8double
   ; AVX1: cost of 20 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv8i16v8double
+  ; AVX2-LABEL: sitofpv8i16v8double
   ; AVX2: cost of 20 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv8i16v8double
+  ; AVX512F-LABEL: sitofpv8i16v8double
   ; AVX512F: cost of 2 {{.*}} sitofp
   %1 = sitofp <8 x i16> %a to <8 x double>
   ret <8 x double> %1
 }
 
 define <16 x double> @sitofpv16i16v16double(<16 x i16> %a) {
-  ; SSE2: sitofpv16i16v16double
+  ; SSE2-LABEL: sitofpv16i16v16double
   ; SSE2: cost of 160 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv16i16v16double
+  ; AVX1-LABEL: sitofpv16i16v16double
   ; AVX1: cost of 40 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv16i16v16double
+  ; AVX2-LABEL: sitofpv16i16v16double
   ; AVX2: cost of 40 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv16i16v16double
+  ; AVX512F-LABEL: sitofpv16i16v16double
   ; AVX512F: cost of 44 {{.*}} sitofp
   %1 = sitofp <16 x i16> %a to <16 x double>
   ret <16 x double> %1
 }
 
 define <32 x double> @sitofpv32i16v32double(<32 x i16> %a) {
-  ; SSE2: sitofpv32i16v32double
+  ; SSE2-LABEL: sitofpv32i16v32double
   ; SSE2: cost of 320 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv32i16v32double
+  ; AVX1-LABEL: sitofpv32i16v32double
   ; AVX1: cost of 80 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv32i16v32double
+  ; AVX2-LABEL: sitofpv32i16v32double
   ; AVX2: cost of 80 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv32i16v32double
+  ; AVX512F-LABEL: sitofpv32i16v32double
   ; AVX512F: cost of 88 {{.*}} sitofp
   %1 = sitofp <32 x i16> %a to <32 x double>
   ret <32 x double> %1
 }
 
 define <2 x double> @sitofpv2i32v2double(<2 x i32> %a) {
-  ; SSE2: sitofpv2i32v2double
+  ; SSE2-LABEL: sitofpv2i32v2double
   ; SSE2: cost of 20 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv2i32v2double
+  ; AVX1-LABEL: sitofpv2i32v2double
   ; AVX1: cost of 4 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv2i32v2double
+  ; AVX2-LABEL: sitofpv2i32v2double
   ; AVX2: cost of 4 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv2i32v2double
+  ; AVX512F-LABEL: sitofpv2i32v2double
   ; AVX512F: cost of 4 {{.*}} sitofp
   %1 = sitofp <2 x i32> %a to <2 x double>
   ret <2 x double> %1
 }
 
 define <4 x double> @sitofpv4i32v4double(<4 x i32> %a) {
-  ; SSE2: sitofpv4i32v4double
+  ; SSE2-LABEL: sitofpv4i32v4double
   ; SSE2: cost of 40 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv4i32v4double
+  ; AVX1-LABEL: sitofpv4i32v4double
   ; AVX1: cost of 1 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv4i32v4double
+  ; AVX2-LABEL: sitofpv4i32v4double
   ; AVX2: cost of 1 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv4i32v4double
+  ; AVX512F-LABEL: sitofpv4i32v4double
   ; AVX512F: cost of 1 {{.*}} sitofp
   %1 = sitofp <4 x i32> %a to <4 x double>
   ret <4 x double> %1
 }
 
 define <8 x double> @sitofpv8i32v8double(<8 x i32> %a) {
-  ; SSE2: sitofpv8i32v8double
+  ; SSE2-LABEL: sitofpv8i32v8double
   ; SSE2: cost of 80 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv8i32v8double
+  ; AVX1-LABEL: sitofpv8i32v8double
   ; AVX1: cost of 20 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv8i32v8double
+  ; AVX2-LABEL: sitofpv8i32v8double
   ; AVX2: cost of 20 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv8i32v8double
+  ; AVX512F-LABEL: sitofpv8i32v8double
   ; AVX512F: cost of 1 {{.*}} sitofp
   %1 = sitofp <8 x i32> %a to <8 x double>
   ret <8 x double> %1
 }
 
 define <16 x double> @sitofpv16i32v16double(<16 x i32> %a) {
-  ; SSE2: sitofpv16i32v16double
+  ; SSE2-LABEL: sitofpv16i32v16double
   ; SSE2: cost of 160 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv16i32v16double
+  ; AVX1-LABEL: sitofpv16i32v16double
   ; AVX1: cost of 40 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv16i32v16double
+  ; AVX2-LABEL: sitofpv16i32v16double
   ; AVX2: cost of 40 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv16i32v16double
+  ; AVX512F-LABEL: sitofpv16i32v16double
   ; AVX512F: cost of 44 {{.*}} sitofp
   %1 = sitofp <16 x i32> %a to <16 x double>
   ret <16 x double> %1
 }
 
 define <32 x double> @sitofpv32i32v32double(<32 x i32> %a) {
-  ; SSE2: sitofpv32i32v32double
+  ; SSE2-LABEL: sitofpv32i32v32double
   ; SSE2: cost of 320 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv32i32v32double
+  ; AVX1-LABEL: sitofpv32i32v32double
   ; AVX1: cost of 80 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv32i32v32double
+  ; AVX2-LABEL: sitofpv32i32v32double
   ; AVX2: cost of 80 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv32i32v32double
+  ; AVX512F-LABEL: sitofpv32i32v32double
   ; AVX512F: cost of 88 {{.*}} sitofp
   %1 = sitofp <32 x i32> %a to <32 x double>
   ret <32 x double> %1
 }
 
 define <2 x double> @sitofpv2i64v2double(<2 x i64> %a) {
-  ; SSE2: sitofpv2i64v2double
+  ; SSE2-LABEL: sitofpv2i64v2double
   ; SSE2: cost of 20 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv2i64v2double
+  ; AVX1-LABEL: sitofpv2i64v2double
   ; AVX1: cost of 4 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv2i64v2double
+  ; AVX2-LABEL: sitofpv2i64v2double
   ; AVX2: cost of 4 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv2i64v2double
+  ; AVX512F-LABEL: sitofpv2i64v2double
   ; AVX512F: cost of 4 {{.*}} sitofp
   %1 = sitofp <2 x i64> %a to <2 x double>
   ret <2 x double> %1
 }
 
 define <4 x double> @sitofpv4i64v4double(<4 x i64> %a) {
-  ; SSE2: sitofpv4i64v4double
+  ; SSE2-LABEL: sitofpv4i64v4double
   ; SSE2: cost of 40 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv4i64v4double
+  ; AVX1-LABEL: sitofpv4i64v4double
   ; AVX1: cost of 10 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv4i64v4double
+  ; AVX2-LABEL: sitofpv4i64v4double
   ; AVX2: cost of 10 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv4i64v4double
+  ; AVX512F-LABEL: sitofpv4i64v4double
   ; AVX512F: cost of 10 {{.*}} sitofp
   %1 = sitofp <4 x i64> %a to <4 x double>
   ret <4 x double> %1
 }
 
 define <8 x double> @sitofpv8i64v8double(<8 x i64> %a) {
-  ; SSE2: sitofpv8i64v8double
+  ; SSE2-LABEL: sitofpv8i64v8double
   ; SSE2: cost of 80 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv8i64v8double
+  ; AVX1-LABEL: sitofpv8i64v8double
   ; AVX1: cost of 20 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv8i64v8double
+  ; AVX2-LABEL: sitofpv8i64v8double
   ; AVX2: cost of 20 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv8i64v8double
+  ; AVX512F-LABEL: sitofpv8i64v8double
   ; AVX512F: cost of 22 {{.*}} sitofp
   %1 = sitofp <8 x i64> %a to <8 x double>
   ret <8 x double> %1
 }
 
 define <16 x double> @sitofpv16i64v16double(<16 x i64> %a) {
-  ; SSE2: sitofpv16i64v16double
+  ; SSE2-LABEL: sitofpv16i64v16double
   ; SSE2: cost of 160 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv16i64v16double
+  ; AVX1-LABEL: sitofpv16i64v16double
   ; AVX1: cost of 40 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv16i64v16double
+  ; AVX2-LABEL: sitofpv16i64v16double
   ; AVX2: cost of 40 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv16i64v16double
+  ; AVX512F-LABEL: sitofpv16i64v16double
   ; AVX512F: cost of 44 {{.*}} sitofp
   %1 = sitofp <16 x i64> %a to <16 x double>
   ret <16 x double> %1
 }
 
 define <32 x double> @sitofpv32i64v32double(<32 x i64> %a) {
-  ; SSE2: sitofpv32i64v32double
+  ; SSE2-LABEL: sitofpv32i64v32double
   ; SSE2: cost of 320 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv32i64v32double
+  ; AVX1-LABEL: sitofpv32i64v32double
   ; AVX1: cost of 80 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv32i64v32double
+  ; AVX2-LABEL: sitofpv32i64v32double
   ; AVX2: cost of 80 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv32i64v32double
+  ; AVX512F-LABEL: sitofpv32i64v32double
   ; AVX512F: cost of 88 {{.*}} sitofp
   %1 = sitofp <32 x i64> %a to <32 x double>
   ret <32 x double> %1
 }
 
 define <2 x float> @sitofpv2i8v2float(<2 x i8> %a) {
-  ; SSE2: sitofpv2i8v2float
+  ; SSE2-LABEL: sitofpv2i8v2float
   ; SSE2: cost of 15 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv2i8v2float
+  ; AVX1-LABEL: sitofpv2i8v2float
   ; AVX1: cost of 4 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv2i8v2float
+  ; AVX2-LABEL: sitofpv2i8v2float
   ; AVX2: cost of 4 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv2i8v2float
+  ; AVX512F-LABEL: sitofpv2i8v2float
   ; AVX512F: cost of 4 {{.*}} sitofp
   %1 = sitofp <2 x i8> %a to <2 x float>
   ret <2 x float> %1
 }
 
 define <4 x float> @sitofpv4i8v4float(<4 x i8> %a) {
-  ; SSE2: sitofpv4i8v4float
+  ; SSE2-LABEL: sitofpv4i8v4float
   ; SSE2: cost of 15 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv4i8v4float
+  ; AVX1-LABEL: sitofpv4i8v4float
   ; AVX1: cost of 3 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv4i8v4float
+  ; AVX2-LABEL: sitofpv4i8v4float
   ; AVX2: cost of 3 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv4i8v4float
+  ; AVX512F-LABEL: sitofpv4i8v4float
   ; AVX512F: cost of 3 {{.*}} sitofp
   %1 = sitofp <4 x i8> %a to <4 x float>
   ret <4 x float> %1
 }
 
 define <8 x float> @sitofpv8i8v8float(<8 x i8> %a) {
-  ; SSE2: sitofpv8i8v8float
+  ; SSE2-LABEL: sitofpv8i8v8float
   ; SSE2: cost of 15 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv8i8v8float
+  ; AVX1-LABEL: sitofpv8i8v8float
   ; AVX1: cost of 8 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv8i8v8float
+  ; AVX2-LABEL: sitofpv8i8v8float
   ; AVX2: cost of 8 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv8i8v8float
+  ; AVX512F-LABEL: sitofpv8i8v8float
   ; AVX512F: cost of 8 {{.*}} sitofp
   %1 = sitofp <8 x i8> %a to <8 x float>
   ret <8 x float> %1
 }
 
 define <16 x float> @sitofpv16i8v16float(<16 x i8> %a) {
-  ; SSE2: sitofpv16i8v16float
+  ; SSE2-LABEL: sitofpv16i8v16float
   ; SSE2: cost of 8 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv16i8v16float
+  ; AVX1-LABEL: sitofpv16i8v16float
   ; AVX1: cost of 44 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv16i8v16float
+  ; AVX2-LABEL: sitofpv16i8v16float
   ; AVX2: cost of 44 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv16i8v16float
+  ; AVX512F-LABEL: sitofpv16i8v16float
   ; AVX512F: cost of 2 {{.*}} sitofp
   %1 = sitofp <16 x i8> %a to <16 x float>
   ret <16 x float> %1
 }
 
 define <32 x float> @sitofpv32i8v32float(<32 x i8> %a) {
-  ; SSE2: sitofpv32i8v32float
+  ; SSE2-LABEL: sitofpv32i8v32float
   ; SSE2: cost of 16 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv32i8v32float
+  ; AVX1-LABEL: sitofpv32i8v32float
   ; AVX1: cost of 88 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv32i8v32float
+  ; AVX2-LABEL: sitofpv32i8v32float
   ; AVX2: cost of 88 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv32i8v32float
+  ; AVX512F-LABEL: sitofpv32i8v32float
   ; AVX512F: cost of 92 {{.*}} sitofp
   %1 = sitofp <32 x i8> %a to <32 x float>
   ret <32 x float> %1
 }
 
 define <2 x float> @sitofpv2i16v2float(<2 x i16> %a) {
-  ; SSE2: sitofpv2i16v2float
+  ; SSE2-LABEL: sitofpv2i16v2float
   ; SSE2: cost of 15 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv2i16v2float
+  ; AVX1-LABEL: sitofpv2i16v2float
   ; AVX1: cost of 4 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv2i16v2float
+  ; AVX2-LABEL: sitofpv2i16v2float
   ; AVX2: cost of 4 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv2i16v2float
+  ; AVX512F-LABEL: sitofpv2i16v2float
   ; AVX512F: cost of 4 {{.*}} sitofp
   %1 = sitofp <2 x i16> %a to <2 x float>
   ret <2 x float> %1
 }
 
 define <4 x float> @sitofpv4i16v4float(<4 x i16> %a) {
-  ; SSE2: sitofpv4i16v4float
+  ; SSE2-LABEL: sitofpv4i16v4float
   ; SSE2: cost of 15 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv4i16v4float
+  ; AVX1-LABEL: sitofpv4i16v4float
   ; AVX1: cost of 3 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv4i16v4float
+  ; AVX2-LABEL: sitofpv4i16v4float
   ; AVX2: cost of 3 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv4i16v4float
+  ; AVX512F-LABEL: sitofpv4i16v4float
   ; AVX512F: cost of 3 {{.*}} sitofp
   %1 = sitofp <4 x i16> %a to <4 x float>
   ret <4 x float> %1
 }
 
 define <8 x float> @sitofpv8i16v8float(<8 x i16> %a) {
-  ; SSE2: sitofpv8i16v8float
+  ; SSE2-LABEL: sitofpv8i16v8float
   ; SSE2: cost of 15 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv8i16v8float
+  ; AVX1-LABEL: sitofpv8i16v8float
   ; AVX1: cost of 5 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv8i16v8float
+  ; AVX2-LABEL: sitofpv8i16v8float
   ; AVX2: cost of 5 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv8i16v8float
+  ; AVX512F-LABEL: sitofpv8i16v8float
   ; AVX512F: cost of 5 {{.*}} sitofp
   %1 = sitofp <8 x i16> %a to <8 x float>
   ret <8 x float> %1
 }
 
 define <16 x float> @sitofpv16i16v16float(<16 x i16> %a) {
-  ; SSE2: sitofpv16i16v16float
+  ; SSE2-LABEL: sitofpv16i16v16float
   ; SSE2: cost of 30 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv16i16v16float
+  ; AVX1-LABEL: sitofpv16i16v16float
   ; AVX1: cost of 44 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv16i16v16float
+  ; AVX2-LABEL: sitofpv16i16v16float
   ; AVX2: cost of 44 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv16i16v16float
+  ; AVX512F-LABEL: sitofpv16i16v16float
   ; AVX512F: cost of 2 {{.*}} sitofp
   %1 = sitofp <16 x i16> %a to <16 x float>
   ret <16 x float> %1
 }
 
 define <32 x float> @sitofpv32i16v32float(<32 x i16> %a) {
-  ; SSE2: sitofpv32i16v32float
+  ; SSE2-LABEL: sitofpv32i16v32float
   ; SSE2: cost of 60 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv32i16v32float
+  ; AVX1-LABEL: sitofpv32i16v32float
   ; AVX1: cost of 88 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv32i16v32float
+  ; AVX2-LABEL: sitofpv32i16v32float
   ; AVX2: cost of 88 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv32i16v32float
-  ; AVX512F: cost of 2 {{.*}} sitofp
+  ; AVX512F-LABEL: sitofpv32i16v32float
+  ; AVX512F: cost of 92 {{.*}} sitofp
   %1 = sitofp <32 x i16> %a to <32 x float>
   ret <32 x float> %1
 }
 
 define <2 x float> @sitofpv2i32v2float(<2 x i32> %a) {
-  ; SSE2: sitofpv2i32v2float
+  ; SSE2-LABEL: sitofpv2i32v2float
   ; SSE2: cost of 15 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv2i32v2float
+  ; AVX1-LABEL: sitofpv2i32v2float
   ; AVX1: cost of 4 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv2i32v2float
+  ; AVX2-LABEL: sitofpv2i32v2float
   ; AVX2: cost of 4 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv2i32v2float
+  ; AVX512F-LABEL: sitofpv2i32v2float
   ; AVX512F: cost of 4 {{.*}} sitofp
   %1 = sitofp <2 x i32> %a to <2 x float>
   ret <2 x float> %1
 }
 
 define <4 x float> @sitofpv4i32v4float(<4 x i32> %a) {
-  ; SSE2: sitofpv4i32v4float
+  ; SSE2-LABEL: sitofpv4i32v4float
   ; SSE2: cost of 15 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv4i32v4float
+  ; AVX1-LABEL: sitofpv4i32v4float
   ; AVX1: cost of 1 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv4i32v4float
+  ; AVX2-LABEL: sitofpv4i32v4float
   ; AVX2: cost of 1 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv4i32v4float
+  ; AVX512F-LABEL: sitofpv4i32v4float
   ; AVX512F: cost of 1 {{.*}} sitofp
   %1 = sitofp <4 x i32> %a to <4 x float>
   ret <4 x float> %1
 }
 
 define <8 x float> @sitofpv8i32v8float(<8 x i32> %a) {
-  ; SSE2: sitofpv8i32v8float
+  ; SSE2-LABEL: sitofpv8i32v8float
   ; SSE2: cost of 30 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv8i32v8float
+  ; AVX1-LABEL: sitofpv8i32v8float
   ; AVX1: cost of 1 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv8i32v8float
+  ; AVX2-LABEL: sitofpv8i32v8float
   ; AVX2: cost of 1 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv8i32v8float
+  ; AVX512F-LABEL: sitofpv8i32v8float
   ; AVX512F: cost of 1 {{.*}} sitofp
   %1 = sitofp <8 x i32> %a to <8 x float>
   ret <8 x float> %1
 }
 
 define <16 x float> @sitofpv16i32v16float(<16 x i32> %a) {
-  ; SSE2: sitofpv16i32v16float
+  ; SSE2-LABEL: sitofpv16i32v16float
   ; SSE2: cost of 60 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv16i32v16float
+  ; AVX1-LABEL: sitofpv16i32v16float
   ; AVX1: cost of 44 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv16i32v16float
+  ; AVX2-LABEL: sitofpv16i32v16float
   ; AVX2: cost of 44 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv16i32v16float
+  ; AVX512F-LABEL: sitofpv16i32v16float
   ; AVX512F: cost of 1 {{.*}} sitofp
   %1 = sitofp <16 x i32> %a to <16 x float>
   ret <16 x float> %1
 }
 
 define <32 x float> @sitofpv32i32v32float(<32 x i32> %a) {
-  ; SSE2: sitofpv32i32v32float
+  ; SSE2-LABEL: sitofpv32i32v32float
   ; SSE2: cost of 120 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv32i32v32float
+  ; AVX1-LABEL: sitofpv32i32v32float
   ; AVX1: cost of 88 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv32i32v32float
+  ; AVX2-LABEL: sitofpv32i32v32float
   ; AVX2: cost of 88 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv32i32v32float
-  ; AVX512F: cost of 1 {{.*}} sitofp
+  ; AVX512F-LABEL: sitofpv32i32v32float
+  ; AVX512F: cost of 92 {{.*}} sitofp
   %1 = sitofp <32 x i32> %a to <32 x float>
   ret <32 x float> %1
 }
 
 define <2 x float> @sitofpv2i64v2float(<2 x i64> %a) {
-  ; SSE2: sitofpv2i64v2float
+  ; SSE2-LABEL: sitofpv2i64v2float
   ; SSE2: cost of 15 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv2i64v2float
+  ; AVX1-LABEL: sitofpv2i64v2float
   ; AVX1: cost of 4 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv2i64v2float
+  ; AVX2-LABEL: sitofpv2i64v2float
   ; AVX2: cost of 4 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv2i64v2float
+  ; AVX512F-LABEL: sitofpv2i64v2float
   ; AVX512F: cost of 4 {{.*}} sitofp
   %1 = sitofp <2 x i64> %a to <2 x float>
   ret <2 x float> %1
 }
 
 define <4 x float> @sitofpv4i64v4float(<4 x i64> %a) {
-  ; SSE2: sitofpv4i64v4float
+  ; SSE2-LABEL: sitofpv4i64v4float
   ; SSE2: cost of 30 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv4i64v4float
+  ; AVX1-LABEL: sitofpv4i64v4float
   ; AVX1: cost of 10 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv4i64v4float
+  ; AVX2-LABEL: sitofpv4i64v4float
   ; AVX2: cost of 10 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv4i64v4float
+  ; AVX512F-LABEL: sitofpv4i64v4float
   ; AVX512F: cost of 10 {{.*}} sitofp
   %1 = sitofp <4 x i64> %a to <4 x float>
   ret <4 x float> %1
 }
 
 define <8 x float> @sitofpv8i64v8float(<8 x i64> %a) {
-  ; SSE2: sitofpv8i64v8float
+  ; SSE2-LABEL: sitofpv8i64v8float
   ; SSE2: cost of 60 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv8i64v8float
+  ; AVX1-LABEL: sitofpv8i64v8float
   ; AVX1: cost of 22 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv8i64v8float
+  ; AVX2-LABEL: sitofpv8i64v8float
   ; AVX2: cost of 22 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv8i64v8float
+  ; AVX512F-LABEL: sitofpv8i64v8float
   ; AVX512F: cost of 22 {{.*}} sitofp
   %1 = sitofp <8 x i64> %a to <8 x float>
   ret <8 x float> %1
 }
 
 define <16 x float> @sitofpv16i64v16float(<16 x i64> %a) {
-  ; SSE2: sitofpv16i64v16float
+  ; SSE2-LABEL: sitofpv16i64v16float
   ; SSE2: cost of 120 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv16i64v16float
+  ; AVX1-LABEL: sitofpv16i64v16float
   ; AVX1: cost of 44 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv16i64v16float
+  ; AVX2-LABEL: sitofpv16i64v16float
   ; AVX2: cost of 44 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv16i64v16float
+  ; AVX512F-LABEL: sitofpv16i64v16float
   ; AVX512F: cost of 46 {{.*}} sitofp
   %1 = sitofp <16 x i64> %a to <16 x float>
   ret <16 x float> %1
 }
 
 define <32 x float> @sitofpv32i64v32float(<32 x i64> %a) {
-  ; SSE2: sitofpv32i64v32float
+  ; SSE2-LABEL: sitofpv32i64v32float
   ; SSE2: cost of 240 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv32i64v32float
+  ; AVX1-LABEL: sitofpv32i64v32float
   ; AVX1: cost of 88 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv32i64v32float
+  ; AVX2-LABEL: sitofpv32i64v32float
   ; AVX2: cost of 88 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv32i64v32float
+  ; AVX512F-LABEL: sitofpv32i64v32float
   ; AVX512F: cost of 92 {{.*}} sitofp
   %1 = sitofp <32 x i64> %a to <32 x float>
   ret <32 x float> %1
 }
 
 define <8 x double> @sitofpv8i1v8double(<8 x double> %a) {
-  ; SSE2: sitofpv8i1v8double
+  ; SSE2-LABEL: sitofpv8i1v8double
   ; SSE2: cost of 80 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv8i1v8double
+  ; AVX1-LABEL: sitofpv8i1v8double
   ; AVX1: cost of 20 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv8i1v8double
+  ; AVX2-LABEL: sitofpv8i1v8double
   ; AVX2: cost of 20 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv8i1v8double
+  ; AVX512F-LABEL: sitofpv8i1v8double
   ; AVX512F: cost of 4 {{.*}} sitofp
   %cmpres = fcmp ogt <8 x double> %a, zeroinitializer
   %1 = sitofp <8 x i1> %cmpres to <8 x double>
@@ -661,16 +661,16 @@ define <8 x double> @sitofpv8i1v8double(<8 x double> %a) {
 }
 
 define <16 x float> @sitofpv16i1v16float(<16 x float> %a) {
-  ; SSE2: sitofpv16i1v16float
+  ; SSE2-LABEL: sitofpv16i1v16float
   ; SSE2: cost of 8 {{.*}} sitofp
   ;
-  ; AVX1: sitofpv16i1v16float
+  ; AVX1-LABEL: sitofpv16i1v16float
   ; AVX1: cost of 44 {{.*}} sitofp
   ;
-  ; AVX2: sitofpv16i1v16float
+  ; AVX2-LABEL: sitofpv16i1v16float
   ; AVX2: cost of 44 {{.*}} sitofp
   ;
-  ; AVX512F: sitofpv16i1v16float
+  ; AVX512F-LABEL: sitofpv16i1v16float
   ; AVX512F: cost of 3 {{.*}} sitofp
   %cmpres = fcmp ogt <16 x float> %a, zeroinitializer
   %1 = sitofp <16 x i1> %cmpres to <16 x float>
diff --git a/test/Analysis/CostModel/X86/uitofp.ll b/test/Analysis/CostModel/X86/uitofp.ll
index 9ffc483e3f5a..08e36650bec4 100644
--- a/test/Analysis/CostModel/X86/uitofp.ll
+++ b/test/Analysis/CostModel/X86/uitofp.ll
@@ -2,644 +2,708 @@
 ; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx  -cost-model -analyze < %s | FileCheck --check-prefix=AVX --check-prefix=AVX1 %s
 ; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx2 -cost-model -analyze < %s | FileCheck --check-prefix=AVX --check-prefix=AVX2 %s
 ; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx512f -cost-model -analyze < %s | FileCheck --check-prefix=AVX512F %s
+; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx512dq -cost-model -analyze < %s | FileCheck --check-prefix=AVX512DQ %s
 
 define <2 x double> @uitofpv2i8v2double(<2 x i8> %a) {
-  ; SSE2: uitofpv2i8v2double
+  ; SSE2-LABEL: uitofpv2i8v2double
   ; SSE2: cost of 20 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv2i8v2double
+  ; AVX1-LABEL: uitofpv2i8v2double
   ; AVX1: cost of 4 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv2i8v2double
+  ; AVX2-LABEL: uitofpv2i8v2double
   ; AVX2: cost of 4 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv2i8v2double
-  ; AVX512F: cost of 4 {{.*}} uitofp
+  ; AVX512F-LABEL: uitofpv2i8v2double
+  ; AVX512F: cost of 2 {{.*}} uitofp
   %1 = uitofp <2 x i8> %a to <2 x double>
   ret <2 x double> %1
 }
 
 define <4 x double> @uitofpv4i8v4double(<4 x i8> %a) {
-  ; SSE2: uitofpv4i8v4double
+  ; SSE2-LABEL: uitofpv4i8v4double
   ; SSE2: cost of 40 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv4i8v4double
+  ; AVX1-LABEL: uitofpv4i8v4double
   ; AVX1: cost of 2 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv4i8v4double
+  ; AVX2-LABEL: uitofpv4i8v4double
   ; AVX2: cost of 2 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv4i8v4double
+  ; AVX512F-LABEL: uitofpv4i8v4double
   ; AVX512F: cost of 2 {{.*}} uitofp
   %1 = uitofp <4 x i8> %a to <4 x double>
   ret <4 x double> %1
 }
 
 define <8 x double> @uitofpv8i8v8double(<8 x i8> %a) {
-  ; SSE2: uitofpv8i8v8double
+  ; SSE2-LABEL: uitofpv8i8v8double
   ; SSE2: cost of 80 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv8i8v8double
+  ; AVX1-LABEL: uitofpv8i8v8double
   ; AVX1: cost of 20 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv8i8v8double
+  ; AVX2-LABEL: uitofpv8i8v8double
   ; AVX2: cost of 20 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv8i8v8double
-  ; AVX512F: cost of 22 {{.*}} uitofp
+  ; AVX512F-LABEL: uitofpv8i8v8double
+  ; AVX512F: cost of 2 {{.*}} uitofp
   %1 = uitofp <8 x i8> %a to <8 x double>
   ret <8 x double> %1
 }
 
 define <16 x double> @uitofpv16i8v16double(<16 x i8> %a) {
-  ; SSE2: uitofpv16i8v16double
+  ; SSE2-LABEL: uitofpv16i8v16double
   ; SSE2: cost of 160 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv16i8v16double
+  ; AVX1-LABEL: uitofpv16i8v16double
   ; AVX1: cost of 40 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv16i8v16double
+  ; AVX2-LABEL: uitofpv16i8v16double
   ; AVX2: cost of 40 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv16i8v16double
+  ; AVX512F-LABEL: uitofpv16i8v16double
   ; AVX512F: cost of 44 {{.*}} uitofp
   %1 = uitofp <16 x i8> %a to <16 x double>
   ret <16 x double> %1
 }
 
 define <32 x double> @uitofpv32i8v32double(<32 x i8> %a) {
-  ; SSE2: uitofpv32i8v32double
+  ; SSE2-LABEL: uitofpv32i8v32double
   ; SSE2: cost of 320 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv32i8v32double
+  ; AVX1-LABEL: uitofpv32i8v32double
   ; AVX1: cost of 80 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv32i8v32double
+  ; AVX2-LABEL: uitofpv32i8v32double
   ; AVX2: cost of 80 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv32i8v32double
+  ; AVX512F-LABEL: uitofpv32i8v32double
   ; AVX512F: cost of 88 {{.*}} uitofp
   %1 = uitofp <32 x i8> %a to <32 x double>
   ret <32 x double> %1
 }
 
 define <2 x double> @uitofpv2i16v2double(<2 x i16> %a) {
-  ; SSE2: uitofpv2i16v2double
+  ; SSE2-LABEL: uitofpv2i16v2double
   ; SSE2: cost of 20 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv2i16v2double
+  ; AVX1-LABEL: uitofpv2i16v2double
   ; AVX1: cost of 4 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv2i16v2double
+  ; AVX2-LABEL: uitofpv2i16v2double
   ; AVX2: cost of 4 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv2i16v2double
-  ; AVX512F: cost of 4 {{.*}} uitofp
+  ; AVX512F-LABEL: uitofpv2i16v2double
+  ; AVX512F: cost of 5 {{.*}} uitofp
   %1 = uitofp <2 x i16> %a to <2 x double>
   ret <2 x double> %1
 }
 
 define <4 x double> @uitofpv4i16v4double(<4 x i16> %a) {
-  ; SSE2: uitofpv4i16v4double
+  ; SSE2-LABEL: uitofpv4i16v4double
   ; SSE2: cost of 40 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv4i16v4double
+  ; AVX1-LABEL: uitofpv4i16v4double
   ; AVX1: cost of 2 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv4i16v4double
+  ; AVX2-LABEL: uitofpv4i16v4double
   ; AVX2: cost of 2 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv4i16v4double
+  ; AVX512F-LABEL: uitofpv4i16v4double
   ; AVX512F: cost of 2 {{.*}} uitofp
   %1 = uitofp <4 x i16> %a to <4 x double>
   ret <4 x double> %1
 }
 
 define <8 x double> @uitofpv8i16v8double(<8 x i16> %a) {
-  ; SSE2: uitofpv8i16v8double
+  ; SSE2-LABEL: uitofpv8i16v8double
   ; SSE2: cost of 80 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv8i16v8double
+  ; AVX1-LABEL: uitofpv8i16v8double
   ; AVX1: cost of 20 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv8i16v8double
+  ; AVX2-LABEL: uitofpv8i16v8double
   ; AVX2: cost of 20 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv8i16v8double
-  ; AVX512F: cost of 22 {{.*}} uitofp
+  ; AVX512F-LABEL: uitofpv8i16v8double
+  ; AVX512F: cost of 2 {{.*}} uitofp
   %1 = uitofp <8 x i16> %a to <8 x double>
   ret <8 x double> %1
 }
 
 define <16 x double> @uitofpv16i16v16double(<16 x i16> %a) {
-  ; SSE2: uitofpv16i16v16double
+  ; SSE2-LABEL: uitofpv16i16v16double
   ; SSE2: cost of 160 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv16i16v16double
+  ; AVX1-LABEL: uitofpv16i16v16double
   ; AVX1: cost of 40 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv16i16v16double
+  ; AVX2-LABEL: uitofpv16i16v16double
   ; AVX2: cost of 40 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv16i16v16double
+  ; AVX512F-LABEL: uitofpv16i16v16double
   ; AVX512F: cost of 44 {{.*}} uitofp
   %1 = uitofp <16 x i16> %a to <16 x double>
   ret <16 x double> %1
 }
 
 define <32 x double> @uitofpv32i16v32double(<32 x i16> %a) {
-  ; SSE2: uitofpv32i16v32double
+  ; SSE2-LABEL: uitofpv32i16v32double
   ; SSE2: cost of 320 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv32i16v32double
+  ; AVX1-LABEL: uitofpv32i16v32double
   ; AVX1: cost of 80 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv32i16v32double
+  ; AVX2-LABEL: uitofpv32i16v32double
   ; AVX2: cost of 80 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv32i16v32double
+  ; AVX512F-LABEL: uitofpv32i16v32double
   ; AVX512F: cost of 88 {{.*}} uitofp
   %1 = uitofp <32 x i16> %a to <32 x double>
   ret <32 x double> %1
 }
 
 define <2 x double> @uitofpv2i32v2double(<2 x i32> %a) {
-  ; SSE2: uitofpv2i32v2double
+  ; SSE2-LABEL: uitofpv2i32v2double
   ; SSE2: cost of 20 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv2i32v2double
+  ; AVX1-LABEL: uitofpv2i32v2double
   ; AVX1: cost of 4 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv2i32v2double
+  ; AVX2-LABEL: uitofpv2i32v2double
   ; AVX2: cost of 4 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv2i32v2double
+  ; AVX512F-LABEL: uitofpv2i32v2double
   ; AVX512F: cost of 4 {{.*}} uitofp
   %1 = uitofp <2 x i32> %a to <2 x double>
   ret <2 x double> %1
 }
 
 define <4 x double> @uitofpv4i32v4double(<4 x i32> %a) {
-  ; SSE2: uitofpv4i32v4double
+  ; SSE2-LABEL: uitofpv4i32v4double
   ; SSE2: cost of 40 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv4i32v4double
+  ; AVX1-LABEL: uitofpv4i32v4double
   ; AVX1: cost of 6 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv4i32v4double
+  ; AVX2-LABEL: uitofpv4i32v4double
   ; AVX2: cost of 6 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv4i32v4double
-  ; AVX512F: cost of 6 {{.*}} uitofp
+  ; AVX512F-LABEL: uitofpv4i32v4double
+  ; AVX512F: cost of 1 {{.*}} uitofp
   %1 = uitofp <4 x i32> %a to <4 x double>
   ret <4 x double> %1
 }
 
 define <8 x double> @uitofpv8i32v8double(<8 x i32> %a) {
-  ; SSE2: uitofpv8i32v8double
+  ; SSE2-LABEL: uitofpv8i32v8double
   ; SSE2: cost of 80 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv8i32v8double
+  ; AVX1-LABEL: uitofpv8i32v8double
   ; AVX1: cost of 20 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv8i32v8double
+  ; AVX2-LABEL: uitofpv8i32v8double
   ; AVX2: cost of 20 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv8i32v8double
-  ; AVX512F: cost of 22 {{.*}} uitofp
+  ; AVX512F-LABEL: uitofpv8i32v8double
+  ; AVX512F: cost of 1 {{.*}} uitofp
   %1 = uitofp <8 x i32> %a to <8 x double>
   ret <8 x double> %1
 }
 
 define <16 x double> @uitofpv16i32v16double(<16 x i32> %a) {
-  ; SSE2: uitofpv16i32v16double
+  ; SSE2-LABEL: uitofpv16i32v16double
   ; SSE2: cost of 160 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv16i32v16double
+  ; AVX1-LABEL: uitofpv16i32v16double
   ; AVX1: cost of 40 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv16i32v16double
+  ; AVX2-LABEL: uitofpv16i32v16double
   ; AVX2: cost of 40 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv16i32v16double
+  ; AVX512F-LABEL: uitofpv16i32v16double
   ; AVX512F: cost of 44 {{.*}} uitofp
   %1 = uitofp <16 x i32> %a to <16 x double>
   ret <16 x double> %1
 }
 
 define <32 x double> @uitofpv32i32v32double(<32 x i32> %a) {
-  ; SSE2: uitofpv32i32v32double
+  ; SSE2-LABEL: uitofpv32i32v32double
   ; SSE2: cost of 320 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv32i32v32double
+  ; AVX1-LABEL: uitofpv32i32v32double
   ; AVX1: cost of 80 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv32i32v32double
+  ; AVX2-LABEL: uitofpv32i32v32double
   ; AVX2: cost of 80 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv32i32v32double
+  ; AVX512F-LABEL: uitofpv32i32v32double
   ; AVX512F: cost of 88 {{.*}} uitofp
   %1 = uitofp <32 x i32> %a to <32 x double>
   ret <32 x double> %1
 }
 
 define <2 x double> @uitofpv2i64v2double(<2 x i64> %a) {
-  ; SSE2: uitofpv2i64v2double
+  ; SSE2-LABEL: uitofpv2i64v2double
   ; SSE2: cost of 20 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv2i64v2double
+  ; AVX1-LABEL: uitofpv2i64v2double
   ; AVX1: cost of 20 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv2i64v2double
+  ; AVX2-LABEL: uitofpv2i64v2double
   ; AVX2: cost of 20 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv2i64v2double
-  ; AVX512F: cost of 20 {{.*}} uitofp
+  ; AVX512F-LABEL: uitofpv2i64v2double
+  ; AVX512F: cost of 5 {{.*}} uitofp
+  ;
+  ; AVX512DQ: uitofpv2i64v2double
+  ; AVX512DQ: cost of 1 {{.*}} uitofp
   %1 = uitofp <2 x i64> %a to <2 x double>
   ret <2 x double> %1
 }
 
 define <4 x double> @uitofpv4i64v4double(<4 x i64> %a) {
-  ; SSE2: uitofpv4i64v4double
+  ; SSE2-LABEL: uitofpv4i64v4double
   ; SSE2: cost of 40 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv4i64v4double
+  ; AVX1-LABEL: uitofpv4i64v4double
   ; AVX1: cost of 40 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv4i64v4double
+  ; AVX2-LABEL: uitofpv4i64v4double
   ; AVX2: cost of 40 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv4i64v4double
-  ; AVX512F: cost of 40 {{.*}} uitofp
+  ; AVX512F-LABEL: uitofpv4i64v4double
+  ; AVX512F: cost of 12 {{.*}} uitofp
+  ;
+  ; AVX512DQ: uitofpv4i64v4double
+  ; AVX512DQ: cost of 1 {{.*}} uitofp
   %1 = uitofp <4 x i64> %a to <4 x double>
   ret <4 x double> %1
 }
 
 define <8 x double> @uitofpv8i64v8double(<8 x i64> %a) {
-  ; SSE2: uitofpv8i64v8double
+  ; SSE2-LABEL: uitofpv8i64v8double
   ; SSE2: cost of 80 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv8i64v8double
+  ; AVX1-LABEL: uitofpv8i64v8double
   ; AVX1: cost of 20 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv8i64v8double
+  ; AVX2-LABEL: uitofpv8i64v8double
   ; AVX2: cost of 20 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv8i64v8double
-  ; AVX512F: cost of 22 {{.*}} uitofp
+  ; AVX512F-LABEL: uitofpv8i64v8double
+  ; AVX512F: cost of 26 {{.*}} uitofp
+  ;
+  ; AVX512DQ: uitofpv8i64v8double
+  ; AVX512DQ: cost of 1 {{.*}} uitofp
   %1 = uitofp <8 x i64> %a to <8 x double>
   ret <8 x double> %1
 }
 
 define <16 x double> @uitofpv16i64v16double(<16 x i64> %a) {
-  ; SSE2: uitofpv16i64v16double
+  ; SSE2-LABEL: uitofpv16i64v16double
   ; SSE2: cost of 160 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv16i64v16double
+  ; AVX1-LABEL: uitofpv16i64v16double
   ; AVX1: cost of 40 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv16i64v16double
+  ; AVX2-LABEL: uitofpv16i64v16double
   ; AVX2: cost of 40 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv16i64v16double
+  ; AVX512F-LABEL: uitofpv16i64v16double
   ; AVX512F: cost of 44 {{.*}} uitofp
+  ;
+  ; AVX512DQ: uitofpv16i64v16double
+  ; AVX512DQ: cost of 44 {{.*}} uitofp
   %1 = uitofp <16 x i64> %a to <16 x double>
   ret <16 x double> %1
 }
 
 define <32 x double> @uitofpv32i64v32double(<32 x i64> %a) {
-  ; SSE2: uitofpv32i64v32double
+  ; SSE2-LABEL: uitofpv32i64v32double
   ; SSE2: cost of 320 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv32i64v32double
+  ; AVX1-LABEL: uitofpv32i64v32double
   ; AVX1: cost of 80 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv32i64v32double
+  ; AVX2-LABEL: uitofpv32i64v32double
   ; AVX2: cost of 80 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv32i64v32double
+  ; AVX512F-LABEL: uitofpv32i64v32double
   ; AVX512F: cost of 88 {{.*}} uitofp
+  ;
+  ; AVX512DQ: uitofpv32i64v32double
+  ; AVX512DQ: cost of 88 {{.*}} uitofp
   %1 = uitofp <32 x i64> %a to <32 x double>
   ret <32 x double> %1
 }
 
 define <2 x float> @uitofpv2i8v2float(<2 x i8> %a) {
-  ; SSE2: uitofpv2i8v2float
+  ; SSE2-LABEL: uitofpv2i8v2float
   ; SSE2: cost of 15 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv2i8v2float
+  ; AVX1-LABEL: uitofpv2i8v2float
   ; AVX1: cost of 4 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv2i8v2float
+  ; AVX2-LABEL: uitofpv2i8v2float
   ; AVX2: cost of 4 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv2i8v2float
+  ; AVX512F-LABEL: uitofpv2i8v2float
   ; AVX512F: cost of 4 {{.*}} uitofp
   %1 = uitofp <2 x i8> %a to <2 x float>
   ret <2 x float> %1
 }
 
 define <4 x float> @uitofpv4i8v4float(<4 x i8> %a) {
-  ; SSE2: uitofpv4i8v4float
+  ; SSE2-LABEL: uitofpv4i8v4float
   ; SSE2: cost of 8 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv4i8v4float
+  ; AVX1-LABEL: uitofpv4i8v4float
   ; AVX1: cost of 2 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv4i8v4float
+  ; AVX2-LABEL: uitofpv4i8v4float
   ; AVX2: cost of 2 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv4i8v4float
+  ; AVX512F-LABEL: uitofpv4i8v4float
   ; AVX512F: cost of 2 {{.*}} uitofp
   %1 = uitofp <4 x i8> %a to <4 x float>
   ret <4 x float> %1
 }
 
 define <8 x float> @uitofpv8i8v8float(<8 x i8> %a) {
-  ; SSE2: uitofpv8i8v8float
+  ; SSE2-LABEL: uitofpv8i8v8float
   ; SSE2: cost of 15 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv8i8v8float
+  ; AVX1-LABEL: uitofpv8i8v8float
   ; AVX1: cost of 5 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv8i8v8float
+  ; AVX2-LABEL: uitofpv8i8v8float
   ; AVX2: cost of 5 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv8i8v8float
-  ; AVX512F: cost of 5 {{.*}} uitofp
+  ; AVX512F-LABEL: uitofpv8i8v8float
+  ; AVX512F: cost of 2 {{.*}} uitofp
   %1 = uitofp <8 x i8> %a to <8 x float>
   ret <8 x float> %1
 }
 
 define <16 x float> @uitofpv16i8v16float(<16 x i8> %a) {
-  ; SSE2: uitofpv16i8v16float
+  ; SSE2-LABEL: uitofpv16i8v16float
   ; SSE2: cost of 8 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv16i8v16float
+  ; AVX1-LABEL: uitofpv16i8v16float
   ; AVX1: cost of 44 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv16i8v16float
+  ; AVX2-LABEL: uitofpv16i8v16float
   ; AVX2: cost of 44 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv16i8v16float
-  ; AVX512F: cost of 46 {{.*}} uitofp
+  ; AVX512F-LABEL: uitofpv16i8v16float
+  ; AVX512F: cost of 2 {{.*}} uitofp
   %1 = uitofp <16 x i8> %a to <16 x float>
   ret <16 x float> %1
 }
 
 define <32 x float> @uitofpv32i8v32float(<32 x i8> %a) {
-  ; SSE2: uitofpv32i8v32float
+  ; SSE2-LABEL: uitofpv32i8v32float
   ; SSE2: cost of 16 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv32i8v32float
+  ; AVX1-LABEL: uitofpv32i8v32float
   ; AVX1: cost of 88 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv32i8v32float
+  ; AVX2-LABEL: uitofpv32i8v32float
   ; AVX2: cost of 88 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv32i8v32float
+  ; AVX512F-LABEL: uitofpv32i8v32float
   ; AVX512F: cost of 92 {{.*}} uitofp
   %1 = uitofp <32 x i8> %a to <32 x float>
   ret <32 x float> %1
 }
 
 define <2 x float> @uitofpv2i16v2float(<2 x i16> %a) {
-  ; SSE2: uitofpv2i16v2float
+  ; SSE2-LABEL: uitofpv2i16v2float
   ; SSE2: cost of 15 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv2i16v2float
+  ; AVX1-LABEL: uitofpv2i16v2float
   ; AVX1: cost of 4 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv2i16v2float
+  ; AVX2-LABEL: uitofpv2i16v2float
   ; AVX2: cost of 4 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv2i16v2float
+  ; AVX512F-LABEL: uitofpv2i16v2float
   ; AVX512F: cost of 4 {{.*}} uitofp
   %1 = uitofp <2 x i16> %a to <2 x float>
   ret <2 x float> %1
 }
 
 define <4 x float> @uitofpv4i16v4float(<4 x i16> %a) {
-  ; SSE2: uitofpv4i16v4float
+  ; SSE2-LABEL: uitofpv4i16v4float
   ; SSE2: cost of 8 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv4i16v4float
+  ; AVX1-LABEL: uitofpv4i16v4float
   ; AVX1: cost of 2 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv4i16v4float
+  ; AVX2-LABEL: uitofpv4i16v4float
   ; AVX2: cost of 2 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv4i16v4float
+  ; AVX512F-LABEL: uitofpv4i16v4float
   ; AVX512F: cost of 2 {{.*}} uitofp
   %1 = uitofp <4 x i16> %a to <4 x float>
   ret <4 x float> %1
 }
 
 define <8 x float> @uitofpv8i16v8float(<8 x i16> %a) {
-  ; SSE2: uitofpv8i16v8float
+  ; SSE2-LABEL: uitofpv8i16v8float
   ; SSE2: cost of 15 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv8i16v8float
+  ; AVX1-LABEL: uitofpv8i16v8float
   ; AVX1: cost of 5 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv8i16v8float
+  ; AVX2-LABEL: uitofpv8i16v8float
   ; AVX2: cost of 5 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv8i16v8float
-  ; AVX512F: cost of 5 {{.*}} uitofp
+  ; AVX512F-LABEL: uitofpv8i16v8float
+  ; AVX512F: cost of 2 {{.*}} uitofp
   %1 = uitofp <8 x i16> %a to <8 x float>
   ret <8 x float> %1
 }
 
 define <16 x float> @uitofpv16i16v16float(<16 x i16> %a) {
-  ; SSE2: uitofpv16i16v16float
+  ; SSE2-LABEL: uitofpv16i16v16float
   ; SSE2: cost of 30 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv16i16v16float
+  ; AVX1-LABEL: uitofpv16i16v16float
   ; AVX1: cost of 44 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv16i16v16float
+  ; AVX2-LABEL: uitofpv16i16v16float
   ; AVX2: cost of 44 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv16i16v16float
-  ; AVX512F: cost of 46 {{.*}} uitofp
+  ; AVX512F-LABEL: uitofpv16i16v16float
+  ; AVX512F: cost of 2 {{.*}} uitofp
   %1 = uitofp <16 x i16> %a to <16 x float>
   ret <16 x float> %1
 }
 
 define <32 x float> @uitofpv32i16v32float(<32 x i16> %a) {
-  ; SSE2: uitofpv32i16v32float
+  ; SSE2-LABEL: uitofpv32i16v32float
   ; SSE2: cost of 60 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv32i16v32float
+  ; AVX1-LABEL: uitofpv32i16v32float
   ; AVX1: cost of 88 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv32i16v32float
+  ; AVX2-LABEL: uitofpv32i16v32float
   ; AVX2: cost of 88 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv32i16v32float
+  ; AVX512F-LABEL: uitofpv32i16v32float
   ; AVX512F: cost of 92 {{.*}} uitofp
   %1 = uitofp <32 x i16> %a to <32 x float>
   ret <32 x float> %1
 }
 
 define <2 x float> @uitofpv2i32v2float(<2 x i32> %a) {
-  ; SSE2: uitofpv2i32v2float
+  ; SSE2-LABEL: uitofpv2i32v2float
   ; SSE2: cost of 15 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv2i32v2float
+  ; AVX1-LABEL: uitofpv2i32v2float
   ; AVX1: cost of 4 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv2i32v2float
+  ; AVX2-LABEL: uitofpv2i32v2float
   ; AVX2: cost of 4 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv2i32v2float
-  ; AVX512F: cost of 4 {{.*}} uitofp
+  ; AVX512F-LABEL: uitofpv2i32v2float
+  ; AVX512F: cost of 2 {{.*}} uitofp
   %1 = uitofp <2 x i32> %a to <2 x float>
   ret <2 x float> %1
 }
 
 define <4 x float> @uitofpv4i32v4float(<4 x i32> %a) {
-  ; SSE2: uitofpv4i32v4float
+  ; SSE2-LABEL: uitofpv4i32v4float
   ; SSE2: cost of 8 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv4i32v4float
+  ; AVX1-LABEL: uitofpv4i32v4float
   ; AVX1: cost of 6 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv4i32v4float
+  ; AVX2-LABEL: uitofpv4i32v4float
   ; AVX2: cost of 6 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv4i32v4float
-  ; AVX512F: cost of 6 {{.*}} uitofp
+  ; AVX512F-LABEL: uitofpv4i32v4float
+  ; AVX512F: cost of 1 {{.*}} uitofp
   %1 = uitofp <4 x i32> %a to <4 x float>
   ret <4 x float> %1
 }
 
 define <8 x float> @uitofpv8i32v8float(<8 x i32> %a) {
-  ; SSE2: uitofpv8i32v8float
+  ; SSE2-LABEL: uitofpv8i32v8float
   ; SSE2: cost of 16 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv8i32v8float
+  ; AVX1-LABEL: uitofpv8i32v8float
   ; AVX1: cost of 9 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv8i32v8float
+  ; AVX2-LABEL: uitofpv8i32v8float
   ; AVX2: cost of 8 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv8i32v8float
-  ; AVX512F: cost of 8 {{.*}} uitofp
+  ; AVX512F-LABEL: uitofpv8i32v8float
+  ; AVX512F: cost of 1 {{.*}} uitofp
   %1 = uitofp <8 x i32> %a to <8 x float>
   ret <8 x float> %1
 }
 
 define <16 x float> @uitofpv16i32v16float(<16 x i32> %a) {
-  ; SSE2: uitofpv16i32v16float
+  ; SSE2-LABEL: uitofpv16i32v16float
   ; SSE2: cost of 32 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv16i32v16float
+  ; AVX1-LABEL: uitofpv16i32v16float
   ; AVX1: cost of 44 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv16i32v16float
+  ; AVX2-LABEL: uitofpv16i32v16float
   ; AVX2: cost of 44 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv16i32v16float
-  ; AVX512F: cost of 46 {{.*}} uitofp
+  ; AVX512F-LABEL: uitofpv16i32v16float
+  ; AVX512F: cost of 1 {{.*}} uitofp
   %1 = uitofp <16 x i32> %a to <16 x float>
   ret <16 x float> %1
 }
 
 define <32 x float> @uitofpv32i32v32float(<32 x i32> %a) {
-  ; SSE2: uitofpv32i32v32float
+  ; SSE2-LABEL: uitofpv32i32v32float
   ; SSE2: cost of 64 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv32i32v32float
+  ; AVX1-LABEL: uitofpv32i32v32float
   ; AVX1: cost of 88 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv32i32v32float
+  ; AVX2-LABEL: uitofpv32i32v32float
   ; AVX2: cost of 88 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv32i32v32float
+  ; AVX512F-LABEL: uitofpv32i32v32float
   ; AVX512F: cost of 92 {{.*}} uitofp
   %1 = uitofp <32 x i32> %a to <32 x float>
   ret <32 x float> %1
 }
 
 define <2 x float> @uitofpv2i64v2float(<2 x i64> %a) {
-  ; SSE2: uitofpv2i64v2float
+  ; SSE2-LABEL: uitofpv2i64v2float
   ; SSE2: cost of 15 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv2i64v2float
+  ; AVX1-LABEL: uitofpv2i64v2float
   ; AVX1: cost of 4 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv2i64v2float
+  ; AVX2-LABEL: uitofpv2i64v2float
   ; AVX2: cost of 4 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv2i64v2float
+  ; AVX512F-LABEL: uitofpv2i64v2float
   ; AVX512F: cost of 4 {{.*}} uitofp
   %1 = uitofp <2 x i64> %a to <2 x float>
   ret <2 x float> %1
 }
 
 define <4 x float> @uitofpv4i64v4float(<4 x i64> %a) {
-  ; SSE2: uitofpv4i64v4float
+  ; SSE2-LABEL: uitofpv4i64v4float
   ; SSE2: cost of 30 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv4i64v4float
+  ; AVX1-LABEL: uitofpv4i64v4float
   ; AVX1: cost of 10 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv4i64v4float
+  ; AVX2-LABEL: uitofpv4i64v4float
   ; AVX2: cost of 10 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv4i64v4float
+  ; AVX512F-LABEL: uitofpv4i64v4float
   ; AVX512F: cost of 10 {{.*}} uitofp
   %1 = uitofp <4 x i64> %a to <4 x float>
   ret <4 x float> %1
 }
 
 define <8 x float> @uitofpv8i64v8float(<8 x i64> %a) {
-  ; SSE2: uitofpv8i64v8float
+  ; SSE2-LABEL: uitofpv8i64v8float
   ; SSE2: cost of 60 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv8i64v8float
+  ; AVX1-LABEL: uitofpv8i64v8float
   ; AVX1: cost of 22 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv8i64v8float
+  ; AVX2-LABEL: uitofpv8i64v8float
   ; AVX2: cost of 22 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv8i64v8float
+  ; AVX512F-LABEL: uitofpv8i64v8float
   ; AVX512F: cost of 22 {{.*}} uitofp
   %1 = uitofp <8 x i64> %a to <8 x float>
   ret <8 x float> %1
 }
 
 define <16 x float> @uitofpv16i64v16float(<16 x i64> %a) {
-  ; SSE2: uitofpv16i64v16float
+  ; SSE2-LABEL: uitofpv16i64v16float
   ; SSE2: cost of 120 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv16i64v16float
+  ; AVX1-LABEL: uitofpv16i64v16float
   ; AVX1: cost of 44 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv16i64v16float
+  ; AVX2-LABEL: uitofpv16i64v16float
   ; AVX2: cost of 44 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv16i64v16float
+  ; AVX512F-LABEL: uitofpv16i64v16float
   ; AVX512F: cost of 46 {{.*}} uitofp
   %1 = uitofp <16 x i64> %a to <16 x float>
   ret <16 x float> %1
 }
 
 define <32 x float> @uitofpv32i64v32float(<32 x i64> %a) {
-  ; SSE2: uitofpv32i64v32float
+  ; SSE2-LABEL: uitofpv32i64v32float
   ; SSE2: cost of 240 {{.*}} uitofp
   ;
-  ; AVX1: uitofpv32i64v32float
+  ; AVX1-LABEL: uitofpv32i64v32float
   ; AVX1: cost of 88 {{.*}} uitofp
   ;
-  ; AVX2: uitofpv32i64v32float
+  ; AVX2-LABEL: uitofpv32i64v32float
   ; AVX2: cost of 88 {{.*}} uitofp
   ;
-  ; AVX512F: uitofpv32i64v32float
+  ; AVX512F-LABEL: uitofpv32i64v32float
   ; AVX512F: cost of 92 {{.*}} uitofp
   %1 = uitofp <32 x i64> %a to <32 x float>
   ret <32 x float> %1
 }
 
+define <8 x i32> @fptouiv8f32v8i32(<8 x float> %a) {
+  ; AVX512F-LABEL: fptouiv8f32v8i32
+  ; AVX512F: cost of 1 {{.*}} fptoui
+  %1 = fptoui <8 x float> %a to <8 x i32>
+  ret <8 x i32> %1
+}
+
+define <4 x i32> @fptouiv4f32v4i32(<4 x float> %a) {
+  ; AVX512F-LABEL: fptouiv4f32v4i32
+  ; AVX512F: cost of 1 {{.*}} fptoui
+  %1 = fptoui <4 x float> %a to <4 x i32>
+  ret <4 x i32> %1
+}
+
+define <2 x i32> @fptouiv2f32v2i32(<2 x float> %a) {
+  ; AVX512F-LABEL: fptouiv2f32v2i32
+  ; AVX512F: cost of 1 {{.*}} fptoui
+  %1 = fptoui <2 x float> %a to <2 x i32>
+  ret <2 x i32> %1
+}
+
+define <16 x i32> @fptouiv16f32v16i32(<16 x float> %a) {
+  ; AVX512F-LABEL: fptouiv16f32v16i32
+  ; AVX512F: cost of 1 {{.*}} fptoui
+  %1 = fptoui <16 x float> %a to <16 x i32>
+  ret <16 x i32> %1
+}
+
+define <8 x i64> @fptouiv8f32v8i64(<8 x float> %a) {
+  ; AVX512DQ-LABEL: fptouiv8f32v8i64
+  ; AVX512DQ: cost of 1 {{.*}} fptoui
+  %1 = fptoui <8 x float> %a to <8 x i64>
+  ret <8 x i64> %1
+}
+
+define <4 x i64> @fptouiv4f32v4i64(<4 x float> %a) {
+  ; AVX512DQ-LABEL: fptouiv4f32v4i64
+  ; AVX512DQ: cost of 1 {{.*}} fptoui
+  %1 = fptoui <4 x float> %a to <4 x i64>
+  ret <4 x i64> %1
+}
+
+define <2 x i64> @fptouiv2f32v2i64(<2 x float> %a) {
+  ; AVX512DQ-LABEL: fptouiv2f32v2i64
+  ; AVX512DQ: cost of 1 {{.*}} fptoui
+  %1 = fptoui <2 x float> %a to <2 x i64>
+  ret <2 x i64> %1
+}

From 68cb3950c097e41c9ebb41fe0855bdf52e1cbe87 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 2 Dec 2015 09:07:55 +0000
Subject: [PATCH 131/186] [X86][FMA] Optimize FNEG(FMUL) Patterns

On FMA targets, we can avoid having to load a constant to negate a float/double multiply by instead using a FNMSUB (-(X*Y)-0)

Fix for PR24366

Differential Revision: http://reviews.llvm.org/D14909

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254495 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp    | 40 +++++++++----
 test/CodeGen/X86/fma_patterns.ll      | 85 ++++++++++++++++++++++++++-
 test/CodeGen/X86/fma_patterns_wide.ll | 83 +++++++++++++++++++++++++-
 3 files changed, 194 insertions(+), 14 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index fb990e7499e2..0877d96a32c8 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -26157,24 +26157,40 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
 static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG,
                                   const X86Subtarget *Subtarget) {
   EVT VT = N->getValueType(0);
+  EVT SVT = VT.getScalarType();
   SDValue Arg = N->getOperand(0);
+  SDLoc DL(N);
+
+  // Let legalize expand this if it isn't a legal type yet.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  // If we're negating a FMUL node on a target with FMA, then we can avoid the
+  // use of a constant by performing (-0 - A*B) instead.
+  // FIXME: Check rounding control flags as well once it becomes available. 
+  if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
+      Arg->getFlags()->hasNoSignedZeros() && Subtarget->hasAnyFMA()) {
+    SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
+    return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
+                       Arg.getOperand(1), Zero);
+  }
 
   // If we're negating a FMA node, then we can adjust the
   // instruction to include the extra negation.
   if (Arg.hasOneUse()) {
     switch (Arg.getOpcode()) {
-      case X86ISD::FMADD:
-        return DAG.getNode(X86ISD::FNMSUB, SDLoc(N), VT, Arg.getOperand(0),
-                           Arg.getOperand(1), Arg.getOperand(2));
-      case X86ISD::FMSUB:
-        return DAG.getNode(X86ISD::FNMADD, SDLoc(N), VT, Arg.getOperand(0),
-                           Arg.getOperand(1), Arg.getOperand(2));
-      case X86ISD::FNMADD:
-        return DAG.getNode(X86ISD::FMSUB, SDLoc(N), VT, Arg.getOperand(0),
-                           Arg.getOperand(1), Arg.getOperand(2));
-      case X86ISD::FNMSUB:
-        return DAG.getNode(X86ISD::FMADD, SDLoc(N), VT, Arg.getOperand(0),
-                           Arg.getOperand(1), Arg.getOperand(2));
+    case X86ISD::FMADD:
+      return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
+                         Arg.getOperand(1), Arg.getOperand(2));
+    case X86ISD::FMSUB:
+      return DAG.getNode(X86ISD::FNMADD, DL, VT, Arg.getOperand(0),
+                         Arg.getOperand(1), Arg.getOperand(2));
+    case X86ISD::FNMADD:
+      return DAG.getNode(X86ISD::FMSUB, DL, VT, Arg.getOperand(0),
+                         Arg.getOperand(1), Arg.getOperand(2));
+    case X86ISD::FNMSUB:
+      return DAG.getNode(X86ISD::FMADD, DL, VT, Arg.getOperand(0),
+                         Arg.getOperand(1), Arg.getOperand(2));
     }
   }
   return SDValue();
diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll
index e3295e45823c..0f0dd20da040 100644
--- a/test/CodeGen/X86/fma_patterns.ll
+++ b/test/CodeGen/X86/fma_patterns.ll
@@ -2,7 +2,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=FMA
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=FMA4
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=FMA4
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=AVX512
 
 ;
 ; Pattern: (fadd (fmul x, y), z) -> (fmadd x,y,z)
@@ -1109,4 +1109,87 @@ define <4 x float> @test_v4f32_fma_fmul_x_c1_c2_y(<4 x float> %x, <4 x float> %y
   ret <4 x float> %a
 }
 
+; Pattern: (fneg (fmul x, y)) -> (fnmsub x, y, 0)
+
+define double @test_f64_fneg_fmul(double %x, double %y) #0 {
+; FMA-LABEL: test_f64_fneg_fmul:
+; FMA:       # BB#0:
+; FMA-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FMA-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0
+; FMA-NEXT:    retq
+;
+; FMA4-LABEL: test_f64_fneg_fmul:
+; FMA4:       # BB#0:
+; FMA4-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FMA4-NEXT:    vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+; FMA4-NEXT:    retq
+;
+; AVX512-LABEL: test_f64_fneg_fmul:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vfnmsub213sd %xmm2, %xmm0, %xmm1
+; AVX512-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512-NEXT:    retq
+  %m = fmul nsz double %x, %y
+  %n = fsub double -0.0, %m
+  ret double %n
+}
+
+define <4 x float> @test_v4f32_fneg_fmul(<4 x float> %x, <4 x float> %y) #0 {
+; FMA-LABEL: test_v4f32_fneg_fmul:
+; FMA:       # BB#0:
+; FMA-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FMA-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0
+; FMA-NEXT:    retq
+;
+; FMA4-LABEL: test_v4f32_fneg_fmul:
+; FMA4:       # BB#0:
+; FMA4-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FMA4-NEXT:    vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
+; FMA4-NEXT:    retq
+;
+; AVX512-LABEL: test_v4f32_fneg_fmul:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+  %m = fmul nsz <4 x float> %x, %y
+  %n = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %m
+  ret <4 x float> %n
+}
+
+define <4 x double> @test_v4f64_fneg_fmul(<4 x double> %x, <4 x double> %y) #0 {
+; FMA-LABEL: test_v4f64_fneg_fmul:
+; FMA:       # BB#0:
+; FMA-NEXT:    vxorpd %ymm2, %ymm2, %ymm2
+; FMA-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0
+; FMA-NEXT:    retq
+;
+; FMA4-LABEL: test_v4f64_fneg_fmul:
+; FMA4:       # BB#0:
+; FMA4-NEXT:    vxorpd %ymm2, %ymm2, %ymm2
+; FMA4-NEXT:    vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0
+; FMA4-NEXT:    retq
+;
+; AVX512-LABEL: test_v4f64_fneg_fmul:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vxorps %ymm2, %ymm2, %ymm2
+; AVX512-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0
+; AVX512-NEXT:    retq
+  %m = fmul nsz <4 x double> %x, %y
+  %n = fsub <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %m
+  ret <4 x double> %n
+}
+
+define <4 x double> @test_v4f64_fneg_fmul_no_nsz(<4 x double> %x, <4 x double> %y) #0 {
+; ALL-LABEL: test_v4f64_fneg_fmul_no_nsz:
+; ALL:       # BB#0:
+; ALL-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
+; ALL-NEXT:    vxorpd {{.*}}(%rip), %ymm0, %ymm0
+; ALL-NEXT:    retq
+  %m = fmul <4 x double> %x, %y
+  %n = fsub <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %m
+  ret <4 x double> %n
+}
+
 attributes #0 = { "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/X86/fma_patterns_wide.ll b/test/CodeGen/X86/fma_patterns_wide.ll
index f412c174fe37..7b6509ad51c7 100644
--- a/test/CodeGen/X86/fma_patterns_wide.ll
+++ b/test/CodeGen/X86/fma_patterns_wide.ll
@@ -2,7 +2,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -fp-contract=fast | FileCheck %s --check-prefix=FMA
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast | FileCheck %s --check-prefix=FMA4
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -fp-contract=fast | FileCheck %s --check-prefix=FMA4
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -fp-contract=fast | FileCheck %s --check-prefix=AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq -fp-contract=fast | FileCheck %s --check-prefix=AVX512
 
 ;
 ; Pattern: (fadd (fmul x, y), z) -> (fmadd x,y,z)
@@ -737,4 +737,85 @@ define <16 x float> @test_v16f32_fma_fmul_x_c1_c2_y(<16 x float> %x, <16 x float
   ret <16 x float> %a
 }
 
+; Pattern: (fneg (fmul x, y)) -> (fnmsub x, y, 0)
+
+define <16 x float> @test_v16f32_fneg_fmul(<16 x float> %x, <16 x float> %y) #0 {
+; FMA-LABEL: test_v16f32_fneg_fmul:
+; FMA:       # BB#0:
+; FMA-NEXT:    vxorps %ymm4, %ymm4, %ymm4
+; FMA-NEXT:    vfnmsub213ps %ymm4, %ymm2, %ymm0
+; FMA-NEXT:    vfnmsub213ps %ymm4, %ymm3, %ymm1
+; FMA-NEXT:    retq
+;
+; FMA4-LABEL: test_v16f32_fneg_fmul:
+; FMA4:       # BB#0:
+; FMA4-NEXT:    vxorps %ymm4, %ymm4, %ymm4
+; FMA4-NEXT:    vfnmsubps %ymm4, %ymm2, %ymm0, %ymm0
+; FMA4-NEXT:    vfnmsubps %ymm4, %ymm3, %ymm1, %ymm1
+; FMA4-NEXT:    retq
+;
+; AVX512-LABEL: test_v16f32_fneg_fmul:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; AVX512-NEXT:    vfnmsub213ps %zmm2, %zmm1, %zmm0
+; AVX512-NEXT:    retq
+  %m = fmul nsz <16 x float> %x, %y
+  %n = fsub <16 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %m
+  ret <16 x float> %n
+}
+
+define <8 x double> @test_v8f64_fneg_fmul(<8 x double> %x, <8 x double> %y) #0 {
+; FMA-LABEL: test_v8f64_fneg_fmul:
+; FMA:       # BB#0:
+; FMA-NEXT:    vxorpd %ymm4, %ymm4, %ymm4
+; FMA-NEXT:    vfnmsub213pd %ymm4, %ymm2, %ymm0
+; FMA-NEXT:    vfnmsub213pd %ymm4, %ymm3, %ymm1
+; FMA-NEXT:    retq
+;
+; FMA4-LABEL: test_v8f64_fneg_fmul:
+; FMA4:       # BB#0:
+; FMA4-NEXT:    vxorpd %ymm4, %ymm4, %ymm4
+; FMA4-NEXT:    vfnmsubpd %ymm4, %ymm2, %ymm0, %ymm0
+; FMA4-NEXT:    vfnmsubpd %ymm4, %ymm3, %ymm1, %ymm1
+; FMA4-NEXT:    retq
+;
+; AVX512-LABEL: test_v8f64_fneg_fmul:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; AVX512-NEXT:    vfnmsub213pd %zmm2, %zmm1, %zmm0
+; AVX512-NEXT:    retq
+  %m = fmul nsz <8 x double> %x, %y
+  %n = fsub <8 x double> <double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0>, %m
+  ret <8 x double> %n
+}
+
+define <8 x double> @test_v8f64_fneg_fmul_no_nsz(<8 x double> %x, <8 x double> %y) #0 {
+; FMA-LABEL: test_v8f64_fneg_fmul_no_nsz:
+; FMA:       # BB#0:
+; FMA-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
+; FMA-NEXT:    vmulpd %ymm2, %ymm0, %ymm0
+; FMA-NEXT:    vmovapd {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; FMA-NEXT:    vxorpd %ymm2, %ymm0, %ymm0
+; FMA-NEXT:    vxorpd %ymm2, %ymm1, %ymm1
+; FMA-NEXT:    retq
+;
+; FMA4-LABEL: test_v8f64_fneg_fmul_no_nsz:
+; FMA4:       # BB#0:
+; FMA4-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
+; FMA4-NEXT:    vmulpd %ymm2, %ymm0, %ymm0
+; FMA4-NEXT:    vmovapd {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; FMA4-NEXT:    vxorpd %ymm2, %ymm0, %ymm0
+; FMA4-NEXT:    vxorpd %ymm2, %ymm1, %ymm1
+; FMA4-NEXT:    retq
+;
+; AVX512-LABEL: test_v8f64_fneg_fmul_no_nsz:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vxorpd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %m = fmul <8 x double> %x, %y
+  %n = fsub <8 x double> <double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0>, %m
+  ret <8 x double> %n
+}
+
 attributes #0 = { "unsafe-fp-math"="true" }

From 044b4b4a4327c5a84e1c5a0560612e2ee7e0846a Mon Sep 17 00:00:00 2001
From: Hrvoje Varga <Hrvoje.Varga@imgtec.com>
Date: Wed, 2 Dec 2015 09:31:24 +0000
Subject: [PATCH 132/186] [mips][microMIPS] Implement PREPEND, RADDU.W.QB,
 RDDSP, REPL.PH, REPL.QB, REPLV.PH, REPLV.QB and MTHLIP instructions
 Differential Revision: http://reviews.llvm.org/D14527

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254496 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Mips/MicroMipsDSPInstrFormats.td   | 34 +++++++++++++
 lib/Target/Mips/MicroMipsDSPInstrInfo.td      | 50 ++++++++++++++++++-
 lib/Target/Mips/MipsDSPInstrInfo.td           | 21 +++++---
 .../Disassembler/Mips/micromips-dsp/valid.txt |  7 +++
 .../Mips/micromips-dspr2/valid.txt            |  1 +
 test/MC/Mips/micromips-dsp/valid.s            |  7 +++
 test/MC/Mips/micromips-dspr2/valid.s          |  1 +
 7 files changed, 112 insertions(+), 9 deletions(-)

diff --git a/lib/Target/Mips/MicroMipsDSPInstrFormats.td b/lib/Target/Mips/MicroMipsDSPInstrFormats.td
index f231d3a5294d..f24f80282b5e 100644
--- a/lib/Target/Mips/MicroMipsDSPInstrFormats.td
+++ b/lib/Target/Mips/MicroMipsDSPInstrFormats.td
@@ -178,3 +178,37 @@ class POOL32A_1RAC_FMT<string instr_asm, bits<8> funct> : MMDSPInst<instr_asm> {
   let Inst{13-6}  = funct;
   let Inst{5-0}   = 0b111100;
 }
+
+class POOL32A_1RMASK7_FMT<string opstr, bits<8> op> : MMDSPInst<opstr> {
+  bits<5> rt;
+  bits<7> mask;
+
+  let Inst{31-26} = 0b000000;
+  let Inst{25-21} = rt;
+  let Inst{20-14} = mask;
+  let Inst{13-6}  = op;
+  let Inst{5-0}   = 0b111100;
+}
+
+class POOL32A_1RIMM10_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> {
+  bits<5> rd;
+  bits<10> imm;
+
+  let Inst{31-26} = 0;
+  let Inst{25-16} = imm;
+  let Inst{15-11} = rd;
+  let Inst{10}    = 0;
+  let Inst{9-0}   = op;
+}
+
+class POOL32A_1RIMM8_FMT<string opstr, bits<6> op> : MMDSPInst<opstr> {
+  bits<5> rt;
+  bits<8> imm;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = rt;
+  let Inst{20-13} = imm;
+  let Inst{12}    = 0;
+  let Inst{11-6}  = op;
+  let Inst{5-0}   = 0b111100;
+}
diff --git a/lib/Target/Mips/MicroMipsDSPInstrInfo.td b/lib/Target/Mips/MicroMipsDSPInstrInfo.td
index 204a4ec60c5a..9b4fb6853180 100644
--- a/lib/Target/Mips/MicroMipsDSPInstrInfo.td
+++ b/lib/Target/Mips/MicroMipsDSPInstrInfo.td
@@ -141,6 +141,14 @@ class MFHI_MM_ENC : POOL32A_1RAC_FMT<"mfhi", 0b00000001>;
 class MFLO_MM_ENC : POOL32A_1RAC_FMT<"mflo", 0b01000001>;
 class MTHI_MM_ENC : POOL32A_1RAC_FMT<"mthi", 0b10000001>;
 class MTLO_MM_ENC : POOL32A_1RAC_FMT<"mthi", 0b11000001>;
+class PREPEND_MMR2_ENC : POOL32A_2RSA5B0_FMT<"prepend", 0b1001010101>;
+class RADDU_W_QB_MM_ENC : POOL32A_2R_FMT<"raddu.w.qb", 0b1111000100>;
+class RDDSP_MM_ENC : POOL32A_1RMASK7_FMT<"rddsp", 0b00011001>;
+class REPL_PH_MM_ENC : POOL32A_1RIMM10_FMT<"repl.ph", 0b0000111101>;
+class REPL_QB_MM_ENC : POOL32A_1RIMM8_FMT<"repl.qb", 0b010111>;
+class REPLV_PH_MM_ENC : POOL32A_2R_FMT<"replv.ph", 0b0000001100>;
+class REPLV_QB_MM_ENC : POOL32A_2R_FMT<"replv.qb", 0b0001001100>;
+class MTHLIP_MM_ENC : POOL32A_1RAC_FMT<"mthlip", 0b00001001>;
 
 // Instruction desc.
 class ABSQ_S_PH_MM_R2_DESC_BASE<string opstr, SDPatternOperator OpNode,
@@ -262,6 +270,7 @@ class EXT_MM_1R_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
   string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $imm");
   InstrItinClass Itinerary = itin;
 }
+
 class EXTP_MM_DESC
     : EXT_MM_1R_DESC_BASE<"extp", MipsEXTP, NoItinerary>,
       Uses<[DSPPos]>, Defs<[DSPEFI]>;
@@ -313,7 +322,38 @@ class MFHI_MM_DESC : MFHI_MM_DESC_BASE<"mfhi", ACC64DSPOpnd, MipsMFHI,
 class MFLO_MM_DESC : MFHI_MM_DESC_BASE<"mflo", ACC64DSPOpnd, MipsMFLO,
                                        NoItinerary>;
 
-// Instruction defs.
+class RADDU_W_QB_MM_DESC {
+  dag OutOperandList = (outs GPR32Opnd:$rt);
+  dag InOperandList = (ins DSPROpnd:$rs);
+  string AsmString = !strconcat("raddu.w.qb", "\t$rt, $rs");
+  list<dag> Pattern = [(set GPR32Opnd:$rt, (int_mips_raddu_w_qb DSPROpnd:$rs))];
+  InstrItinClass Itinerary = NoItinerary;
+  string BaseOpcode = "raddu.w.qb";
+}
+
+class RDDSP_MM_DESC {
+  dag OutOperandList = (outs GPR32Opnd:$rt);
+  dag InOperandList = (ins uimm16:$mask);
+  string AsmString = !strconcat("rddsp", "\t$rt, $mask");
+  list<dag> Pattern = [(set GPR32Opnd:$rt, (int_mips_rddsp immZExt10:$mask))];
+  InstrItinClass Itinerary = NoItinerary;
+}
+
+class REPL_QB_MM_DESC {
+  dag OutOperandList = (outs DSPROpnd:$rt);
+  dag InOperandList = (ins uimm16:$imm);
+  string AsmString = !strconcat("repl.qb", "\t$rt, $imm");
+  list<dag> Pattern = [(set DSPROpnd:$rt, (int_mips_repl_qb immZExt8:$imm))];
+  InstrItinClass Itinerary = NoItinerary;
+}
+
+class REPLV_PH_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<"replv.ph", int_mips_repl_ph,
+                                                   NoItinerary, DSPROpnd,
+                                                   GPR32Opnd>;
+class REPLV_QB_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<"replv.qb", int_mips_repl_qb,
+                                                   NoItinerary, DSPROpnd,
+                                                   GPR32Opnd>;
+
 // microMIPS DSP Rev 1
 def ADDQ_PH_MM : DspMMRel, ADDQ_PH_MM_ENC, ADDQ_PH_DESC;
 def ADDQ_S_PH_MM : DspMMRel, ADDQ_S_PH_MM_ENC, ADDQ_S_PH_DESC;
@@ -404,6 +444,13 @@ def MFHI_DSP_MM : DspMMRel, MFHI_MM_ENC, MFHI_MM_DESC;
 def MFLO_DSP_MM : DspMMRel, MFLO_MM_ENC, MFLO_MM_DESC;
 def MTHI_DSP_MM : DspMMRel, MTHI_MM_ENC, MTHI_DESC;
 def MTLO_DSP_MM : DspMMRel, MTLO_MM_ENC, MTLO_DESC;
+def RADDU_W_QB_MM : DspMMRel, RADDU_W_QB_MM_ENC, RADDU_W_QB_MM_DESC;
+def RDDSP_MM : DspMMRel, RDDSP_MM_ENC, RDDSP_MM_DESC;
+def REPL_PH_MM : DspMMRel, REPL_PH_MM_ENC, REPL_PH_DESC;
+def REPL_QB_MM : DspMMRel, REPL_QB_MM_ENC, REPL_QB_MM_DESC;
+def REPLV_PH_MM : DspMMRel, REPLV_PH_MM_ENC, REPLV_PH_MM_DESC;
+def REPLV_QB_MM : DspMMRel, REPLV_QB_MM_ENC, REPLV_QB_MM_DESC;
+def MTHLIP_MM : DspMMRel, MTHLIP_MM_ENC, MTHLIP_DESC;
 // microMIPS DSP Rev 2
 def ABSQ_S_QB_MMR2 : DspMMRel, ABSQ_S_QB_MMR2_ENC, ABSQ_S_QB_MMR2_DESC,
                      ISA_DSPR2;
@@ -454,3 +501,4 @@ def PRECR_SRA_PH_W_MMR2 : DspMMRel, PRECR_SRA_PH_W_MMR2_ENC,
                           PRECR_SRA_PH_W_DESC, ISA_DSPR2;
 def PRECR_SRA_R_PH_W_MMR2 : DspMMRel, PRECR_SRA_R_PH_W_MMR2_ENC,
                             PRECR_SRA_R_PH_W_DESC, ISA_DSPR2;
+def PREPEND_MMR2 : DspMMRel, PREPEND_MMR2_ENC, PREPEND_DESC, ISA_DSPR2;
diff --git a/lib/Target/Mips/MipsDSPInstrInfo.td b/lib/Target/Mips/MipsDSPInstrInfo.td
index e8cfbcf572d7..f696a38ac0f0 100644
--- a/lib/Target/Mips/MipsDSPInstrInfo.td
+++ b/lib/Target/Mips/MipsDSPInstrInfo.td
@@ -275,6 +275,7 @@ class RADDU_W_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
   string AsmString = !strconcat(instr_asm, "\t$rd, $rs");
   list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs))];
   InstrItinClass Itinerary = itin;
+  string BaseOpcode = instr_asm;
 }
 
 class CMP_EQ_QB_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -328,6 +329,7 @@ class REPL_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
   string AsmString = !strconcat(instr_asm, "\t$rd, $imm");
   list<dag> Pattern = [(set RO:$rd, (OpNode immPat:$imm))];
   InstrItinClass Itinerary = itin;
+  string BaseOpcode = instr_asm;
 }
 
 class SHLL_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -383,6 +385,7 @@ class APPEND_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
                         (OpNode GPR32Opnd:$src, GPR32Opnd:$rs, Imm:$sa))];
   InstrItinClass Itinerary = itin;
   string Constraints = "$src = $rt";
+  string BaseOpcode = instr_asm;
 }
 
 class EXTR_W_TY1_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -428,6 +431,7 @@ class MTHLIP_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
   list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
                         (OpNode GPR32Opnd:$rs, ACC64DSPOpnd:$acin))];
   string Constraints = "$acin = $ac";
+  string BaseOpcode = instr_asm;
 }
 
 class RDDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -437,6 +441,7 @@ class RDDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
   string AsmString = !strconcat(instr_asm, "\t$rd, $mask");
   list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode immZExt10:$mask))];
   InstrItinClass Itinerary = itin;
+  string BaseOpcode = instr_asm;
 }
 
 class WRDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
@@ -1107,7 +1112,7 @@ def SUBQ_S_W : DspMMRel, SUBQ_S_W_ENC, SUBQ_S_W_DESC;
 def ADDSC : DspMMRel, ADDSC_ENC, ADDSC_DESC;
 def ADDWC : DspMMRel, ADDWC_ENC, ADDWC_DESC;
 def MODSUB : MODSUB_ENC, MODSUB_DESC;
-def RADDU_W_QB : RADDU_W_QB_ENC, RADDU_W_QB_DESC;
+def RADDU_W_QB : DspMMRel, RADDU_W_QB_ENC, RADDU_W_QB_DESC;
 def ABSQ_S_PH : DspMMRel, ABSQ_S_PH_ENC, ABSQ_S_PH_DESC;
 def ABSQ_S_W : DspMMRel, ABSQ_S_W_ENC, ABSQ_S_W_DESC;
 def PRECRQ_QB_PH : DspMMRel, PRECRQ_QB_PH_ENC, PRECRQ_QB_PH_DESC;
@@ -1179,10 +1184,10 @@ def CMP_LT_PH : CMP_LT_PH_ENC, CMP_LT_PH_DESC;
 def CMP_LE_PH : CMP_LE_PH_ENC, CMP_LE_PH_DESC;
 def BITREV : BITREV_ENC, BITREV_DESC;
 def PACKRL_PH : PACKRL_PH_ENC, PACKRL_PH_DESC;
-def REPL_QB : REPL_QB_ENC, REPL_QB_DESC;
-def REPL_PH : REPL_PH_ENC, REPL_PH_DESC;
-def REPLV_QB : REPLV_QB_ENC, REPLV_QB_DESC;
-def REPLV_PH : REPLV_PH_ENC, REPLV_PH_DESC;
+def REPL_QB : DspMMRel, REPL_QB_ENC, REPL_QB_DESC;
+def REPL_PH : DspMMRel, REPL_PH_ENC, REPL_PH_DESC;
+def REPLV_QB : DspMMRel, REPLV_QB_ENC, REPLV_QB_DESC;
+def REPLV_PH : DspMMRel, REPLV_PH_ENC, REPLV_PH_DESC;
 def PICK_QB : PICK_QB_ENC, PICK_QB_DESC;
 def PICK_PH : PICK_PH_ENC, PICK_PH_DESC;
 def LWX : DspMMRel, LWX_ENC, LWX_DESC;
@@ -1204,8 +1209,8 @@ def EXTR_S_H : DspMMRel, EXTR_S_H_ENC, EXTR_S_H_DESC;
 def EXTRV_S_H : DspMMRel, EXTRV_S_H_ENC, EXTRV_S_H_DESC;
 def SHILO : SHILO_ENC, SHILO_DESC;
 def SHILOV : SHILOV_ENC, SHILOV_DESC;
-def MTHLIP : MTHLIP_ENC, MTHLIP_DESC;
-def RDDSP : RDDSP_ENC, RDDSP_DESC;
+def MTHLIP : DspMMRel, MTHLIP_ENC, MTHLIP_DESC;
+def RDDSP : DspMMRel, RDDSP_ENC, RDDSP_DESC;
 def WRDSP : WRDSP_ENC, WRDSP_DESC;
 
 // MIPS DSP Rev 2
@@ -1256,7 +1261,7 @@ def SHRL_PH : DspMMRel, SHRL_PH_ENC, SHRL_PH_DESC;
 def SHRLV_PH : DspMMRel, SHRLV_PH_ENC, SHRLV_PH_DESC;
 def APPEND : APPEND_ENC, APPEND_DESC;
 def BALIGN : BALIGN_ENC, BALIGN_DESC;
-def PREPEND : PREPEND_ENC, PREPEND_DESC;
+def PREPEND : DspMMRel, PREPEND_ENC, PREPEND_DESC;
 
 }
 
diff --git a/test/MC/Disassembler/Mips/micromips-dsp/valid.txt b/test/MC/Disassembler/Mips/micromips-dsp/valid.txt
index ceba6a9823b8..7fef4e2d08cd 100644
--- a/test/MC/Disassembler/Mips/micromips-dsp/valid.txt
+++ b/test/MC/Disassembler/Mips/micromips-dsp/valid.txt
@@ -87,3 +87,10 @@
 0x00 0x01 0x50 0x7c # CHECK: mflo $1, $ac1
 0x00 0x01 0x60 0x7c # CHECK: mthi $1, $ac1
 0x00 0x01 0x70 0x7c # CHECK: mtlo $1, $ac1
+0x00 0x22 0xf1 0x3c # CHECK: raddu.w.qb $1, $2
+0x00 0x20 0x86 0x7c # CHECK: rddsp $1, 2
+0x02 0x00 0x08 0x3d # CHECK: repl.ph $1, 512
+0x00 0x30 0x05 0xfc # CHECK: repl.qb $1, 128
+0x00 0x22 0x03 0x3c # CHECK: replv.ph $1, $2
+0x00 0x22 0x13 0x3c # CHECK: replv.qb $1, $2
+0x00 0x01 0x82 0x7c # CHECK: mthlip $1, $ac2
diff --git a/test/MC/Disassembler/Mips/micromips-dspr2/valid.txt b/test/MC/Disassembler/Mips/micromips-dspr2/valid.txt
index 8049e0e3174f..096b5bda4a34 100644
--- a/test/MC/Disassembler/Mips/micromips-dspr2/valid.txt
+++ b/test/MC/Disassembler/Mips/micromips-dspr2/valid.txt
@@ -115,3 +115,4 @@
 0x00 0x62 0x08 0x95 # CHECK: muleu_s.ph.qbl $1, $2, $3
 0x00 0x62 0x08 0xd5 # CHECK: muleu_s.ph.qbr $1, $2, $3
 0x00,0x62,0x09,0x15 # CHECK: mulq_rs.ph $1, $2, $3
+0x00 0x22 0x1a 0x55 # CHECK: prepend $1, $2, 3
diff --git a/test/MC/Mips/micromips-dsp/valid.s b/test/MC/Mips/micromips-dsp/valid.s
index f2eae8f6cae4..85e3cd6f4b9d 100644
--- a/test/MC/Mips/micromips-dsp/valid.s
+++ b/test/MC/Mips/micromips-dsp/valid.s
@@ -88,3 +88,10 @@
   mflo $1, $ac1                # CHECK: mflo $1, $ac1                # encoding: [0x00,0x01,0x50,0x7c]
   mthi $1, $ac1                # CHECK: mthi $1, $ac1                # encoding: [0x00,0x01,0x60,0x7c]
   mtlo $1, $ac1                # CHECK: mtlo $1, $ac1                # encoding: [0x00,0x01,0x70,0x7c]
+  raddu.w.qb $1, $2            # CHECK: raddu.w.qb $1, $2       # encoding: [0x00,0x22,0xf1,0x3c]
+  rddsp $1, 2                  # CHECK: rddsp $1, 2             # encoding: [0x00,0x20,0x86,0x7c]
+  repl.ph $1, 512              # CHECK: repl.ph $1, 512         # encoding: [0x02,0x00,0x08,0x3d]
+  repl.qb $1, 128              # CHECK: repl.qb $1, 128         # encoding: [0x00,0x30,0x05,0xfc]
+  replv.ph $1, $2              # CHECK: replv.ph $1, $2         # encoding: [0x00,0x22,0x03,0x3c]
+  replv.qb $1, $2              # CHECK: replv.qb $1, $2         # encoding: [0x00,0x22,0x13,0x3c]
+  mthlip $1, $ac2              # CHECK: mthlip $1, $ac2         # encoding: [0x00,0x01,0x82,0x7c]
diff --git a/test/MC/Mips/micromips-dspr2/valid.s b/test/MC/Mips/micromips-dspr2/valid.s
index 3035ae5ba815..b236aea152a7 100644
--- a/test/MC/Mips/micromips-dspr2/valid.s
+++ b/test/MC/Mips/micromips-dspr2/valid.s
@@ -116,3 +116,4 @@
   muleu_s.ph.qbl $1, $2, $3    # CHECK: muleu_s.ph.qbl $1, $2, $3  # encoding: [0x00,0x62,0x08,0x95]
   muleu_s.ph.qbr $1, $2, $3    # CHECK: muleu_s.ph.qbr $1, $2, $3  # encoding: [0x00,0x62,0x08,0xd5]
   mulq_rs.ph $1, $2, $3        # CHECK: mulq_rs.ph $1, $2, $3      # encoding: [0x00,0x62,0x09,0x15]
+  prepend $1, $2, 3            # CHECK: prepend $1, $2, 3          # encoding: [0x00,0x22,0x1a,0x55]

From 8364ade5d93474e14943675f122e0f69c976ff55 Mon Sep 17 00:00:00 2001
From: Nemanja Ivanovic <nemanja.i.ibm@gmail.com>
Date: Wed, 2 Dec 2015 10:36:24 +0000
Subject: [PATCH 133/186] Patch to fix a crash in the PowerPC back end due to
 ISD::ROTL and ISD::ROTR not being expanded. Test case included.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254501 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/PowerPC/PPCISelLowering.cpp  |  2 ++
 test/CodeGen/PowerPC/rotl-rotr-crash.ll | 12 ++++++++++++
 2 files changed, 14 insertions(+)
 create mode 100644 test/CodeGen/PowerPC/rotl-rotr-crash.ll

diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 72a3fbe83e13..176a8b3ea59b 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -479,6 +479,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
       setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
       setOperationAction(ISD::VSELECT, VT, Expand);
       setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
+      setOperationAction(ISD::ROTL, VT, Expand);
+      setOperationAction(ISD::ROTR, VT, Expand);
 
       for (MVT InnerVT : MVT::vector_valuetypes()) {
         setTruncStoreAction(VT, InnerVT, Expand);
diff --git a/test/CodeGen/PowerPC/rotl-rotr-crash.ll b/test/CodeGen/PowerPC/rotl-rotr-crash.ll
new file mode 100644
index 000000000000..3fbb67ecf25e
--- /dev/null
+++ b/test/CodeGen/PowerPC/rotl-rotr-crash.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -mtriple=powerpc64le-unknown-unknown -mcpu=pwr8
+
+; Ensure this does not crash
+
+; Function Attrs: norecurse nounwind
+define <4 x i32> @func1 (<4 x i32> %a) {
+entry:
+  %0 = lshr <4 x i32> %a, <i32 16, i32 16, i32 16, i32 16>
+  %1 = shl <4 x i32> %a, <i32 16, i32 16, i32 16, i32 16>
+  %2 = or <4 x i32> %1, %0
+  ret <4 x i32> %2
+}

From 7f4f6f505925e7fa47cfde74e21bedb60faaa93e Mon Sep 17 00:00:00 2001
From: Christof Douma <Christof.Douma@arm.com>
Date: Wed, 2 Dec 2015 11:53:44 +0000
Subject: [PATCH 134/186] [AArch64]: Add support for Cortex-A35

Adds support for the new Cortex-A35 ARMv8-A core.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254503 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Support/ARMTargetParser.def |  1 +
 lib/Target/AArch64/AArch64.td            | 10 +++++++
 lib/Target/AArch64/AArch64Subtarget.h    |  2 +-
 lib/Target/ARM/ARM.td                    |  9 ++++++
 lib/Target/ARM/ARMSubtarget.h            |  4 +--
 test/CodeGen/AArch64/cpus.ll             |  1 +
 test/CodeGen/AArch64/remat.ll            |  1 +
 test/CodeGen/ARM/build-attributes.ll     | 35 ++++++++++++++++++++++++
 8 files changed, 60 insertions(+), 3 deletions(-)

diff --git a/include/llvm/Support/ARMTargetParser.def b/include/llvm/Support/ARMTargetParser.def
index bc007923b383..2f99b0717adf 100644
--- a/include/llvm/Support/ARMTargetParser.def
+++ b/include/llvm/Support/ARMTargetParser.def
@@ -208,6 +208,7 @@ ARM_CPU_NAME("sc300", AK_ARMV7M, FK_NONE, false, AEK_NONE)
 ARM_CPU_NAME("cortex-m3", AK_ARMV7M, FK_NONE, true, AEK_NONE)
 ARM_CPU_NAME("cortex-m4", AK_ARMV7EM, FK_FPV4_SP_D16, true, AEK_NONE)
 ARM_CPU_NAME("cortex-m7", AK_ARMV7EM, FK_FPV5_D16, false, AEK_NONE)
+ARM_CPU_NAME("cortex-a35", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, AEK_CRC)
 ARM_CPU_NAME("cortex-a53", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, true, AEK_CRC)
 ARM_CPU_NAME("cortex-a57", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, AEK_CRC)
 ARM_CPU_NAME("cortex-a72", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, AEK_CRC)
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index 5c19b3efdb11..0bff9b592c15 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -91,6 +91,14 @@ include "AArch64SchedA53.td"
 include "AArch64SchedA57.td"
 include "AArch64SchedCyclone.td"
 
+def ProcA35     : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
+                                   "Cortex-A35 ARM processors",
+                                   [FeatureFPARMv8,
+                                   FeatureNEON,
+                                   FeatureCrypto,
+                                   FeatureCRC,
+                                   FeaturePerfMon]>;
+
 def ProcA53     : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
                                    "Cortex-A53 ARM processors",
                                    [FeatureFPARMv8,
@@ -121,6 +129,8 @@ def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8,
                                               FeatureCRC,
                                               FeaturePerfMon]>;
 
+// FIXME: Cortex-A35 is currently modelled as a Cortex-A53
+def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>;
 def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>;
 def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
 // FIXME: Cortex-A72 is currently modelled as an Cortex-A57.
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index 73daf6051b70..cf94445a885c 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -33,7 +33,7 @@ class Triple;
 
 class AArch64Subtarget : public AArch64GenSubtargetInfo {
 protected:
-  enum ARMProcFamilyEnum {Others, CortexA53, CortexA57, Cyclone};
+  enum ARMProcFamilyEnum {Others, CortexA35, CortexA53, CortexA57, Cyclone};
 
   /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others.
   ARMProcFamilyEnum ARMProcFamily;
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index a0fc5f68eb25..dd33c3614b1a 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -238,6 +238,8 @@ def ProcA15     : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15",
                                    "Cortex-A15 ARM processors", []>;
 def ProcA17     : SubtargetFeature<"a17", "ARMProcFamily", "CortexA17",
                                    "Cortex-A17 ARM processors", []>;
+def ProcA35     : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
+                                   "Cortex-A35 ARM processors", []>;
 def ProcA53     : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
                                    "Cortex-A53 ARM processors", []>;
 def ProcA57     : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
@@ -603,6 +605,13 @@ def : ProcNoItin<"cortex-m7",                           [ARMv7em,
                                                          FeatureD16]>;
 
 
+def : ProcNoItin<"cortex-a35",                          [ARMv8a, ProcA35,
+                                                         FeatureHWDiv,
+                                                         FeatureHWDivARM,
+                                                         FeatureT2XtPk,
+                                                         FeatureCrypto,
+                                                         FeatureCRC]>;
+
 def : ProcNoItin<"cortex-a53",                          [ARMv8a, ProcA53,
                                                          FeatureHWDiv,
                                                          FeatureHWDivARM,
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index 3ad35d24ebab..a8b28018f1b2 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -43,8 +43,8 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
 protected:
   enum ARMProcFamilyEnum {
     Others, CortexA5, CortexA7, CortexA8, CortexA9, CortexA12, CortexA15,
-    CortexA17, CortexR4, CortexR4F, CortexR5, CortexR7, CortexA53, CortexA57,
-    CortexA72, Krait, Swift
+    CortexA17, CortexR4, CortexR4F, CortexR5, CortexR7, CortexA35, CortexA53,
+    CortexA57, CortexA72, Krait, Swift
   };
   enum ARMProcClassEnum {
     None, AClass, RClass, MClass
diff --git a/test/CodeGen/AArch64/cpus.ll b/test/CodeGen/AArch64/cpus.ll
index 1266842fcc6d..a8399f92ebe4 100644
--- a/test/CodeGen/AArch64/cpus.ll
+++ b/test/CodeGen/AArch64/cpus.ll
@@ -2,6 +2,7 @@
 
 
 ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=generic 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a35 2>&1 | FileCheck %s
 ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a53 2>&1 | FileCheck %s
 ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a57 2>&1 | FileCheck %s
 ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a72 2>&1 | FileCheck %s
diff --git a/test/CodeGen/AArch64/remat.ll b/test/CodeGen/AArch64/remat.ll
index 8b3e6dd5ad92..a397c339a2d7 100644
--- a/test/CodeGen/AArch64/remat.ll
+++ b/test/CodeGen/AArch64/remat.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a35 -o - %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a57 -o - %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a53 -o - %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a72 -o - %s | FileCheck %s
diff --git a/test/CodeGen/ARM/build-attributes.ll b/test/CodeGen/ARM/build-attributes.ll
index 11aa1950b879..b80191d76012 100644
--- a/test/CodeGen/ARM/build-attributes.ll
+++ b/test/CodeGen/ARM/build-attributes.ll
@@ -96,6 +96,9 @@
 ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r7 | FileCheck %s --check-prefix=CORTEX-R7
 ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r7  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-R7-FAST
 ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r7 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a35 | FileCheck %s --check-prefix=CORTEX-A35
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a35  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A35-FAST
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a35 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
 ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a53 | FileCheck %s --check-prefix=CORTEX-A53
 ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a53  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A53-FAST
 ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a53 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
@@ -129,6 +132,8 @@
 ; RUN: llc < %s -mtriple=armv8.1a-none-linux-gnueabi -mattr=+strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
 ; RUN: llc < %s -mtriple=armv8.1a-none-linux-gnueabi | FileCheck %s --check-prefix=NO-STRICT-ALIGN
 ; ARMv8a (AArch32)
+; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a35 | FileCheck %s --check-prefix=NO-STRICT-ALIGN
+; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a35 -mattr=+strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
 ; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a57 | FileCheck %s --check-prefix=NO-STRICT-ALIGN
 ; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a57 -mattr=+strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
 ; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a72 | FileCheck %s --check-prefix=NO-STRICT-ALIGN
@@ -1113,6 +1118,36 @@
 ; CORTEX-R7-FAST-NOT:  .eabi_attribute 22
 ; CORTEX-R7-FAST:  .eabi_attribute 23, 1
 
+; CORTEX-A35:  .cpu cortex-a35
+; CORTEX-A35:  .eabi_attribute 6, 14
+; CORTEX-A35:  .eabi_attribute 7, 65
+; CORTEX-A35:  .eabi_attribute 8, 1
+; CORTEX-A35:  .eabi_attribute 9, 2
+; CORTEX-A35:  .fpu crypto-neon-fp-armv8
+; CORTEX-A35:  .eabi_attribute 12, 3
+; CORTEX-A35-NOT:   .eabi_attribute 19
+;; We default to IEEE 754 compliance
+; CORTEX-A35:  .eabi_attribute 20, 1
+; CORTEX-A35:  .eabi_attribute 21, 1
+; CORTEX-A35-NOT:  .eabi_attribute 22
+; CORTEX-A35:  .eabi_attribute 23, 3
+; CORTEX-A35:  .eabi_attribute 24, 1
+; CORTEX-A35:  .eabi_attribute 25, 1
+; CORTEX-A35-NOT:  .eabi_attribute 27
+; CORTEX-A35-NOT:  .eabi_attribute 28
+; CORTEX-A35:  .eabi_attribute 36, 1
+; CORTEX-A35:  .eabi_attribute 38, 1
+; CORTEX-A35:  .eabi_attribute 42, 1
+; CORTEX-A35-NOT:  .eabi_attribute 44
+; CORTEX-A35:  .eabi_attribute 68, 3
+
+; CORTEX-A35-FAST-NOT:   .eabi_attribute 19
+;; The A35 has the ARMv8 FP unit, which always flushes preserving sign.
+; CORTEX-A35-FAST:  .eabi_attribute 20, 2
+; CORTEX-A35-FAST-NOT:  .eabi_attribute 21
+; CORTEX-A35-FAST-NOT:  .eabi_attribute 22
+; CORTEX-A35-FAST:  .eabi_attribute 23, 1
+
 ; CORTEX-A53:  .cpu cortex-a53
 ; CORTEX-A53:  .eabi_attribute 6, 14
 ; CORTEX-A53:  .eabi_attribute 7, 65

From 3304e8735eaed35b561a0b1c2819ad39e0c87387 Mon Sep 17 00:00:00 2001
From: Andy Gibbs <andyg1001@hotmail.co.uk>
Date: Wed, 2 Dec 2015 13:41:24 +0000
Subject: [PATCH 135/186] Fix class SCEVPredicate has virtual functions and
 accessible non-virtual destructor

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254508 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/ScalarEvolution.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h
index da8c68aa6832..0000b42391af 100644
--- a/include/llvm/Analysis/ScalarEvolution.h
+++ b/include/llvm/Analysis/ScalarEvolution.h
@@ -183,7 +183,7 @@ namespace llvm {
 
   protected:
     SCEVPredicateKind Kind;
-    ~SCEVPredicate() = default;
+    virtual ~SCEVPredicate() = default;
     SCEVPredicate(const SCEVPredicate&) = default;
     SCEVPredicate &operator=(const SCEVPredicate&) = default;
 

From 3fe37662ad60efa7faa432de8e7e86ca73d85d8d Mon Sep 17 00:00:00 2001
From: Andy Gibbs <andyg1001@hotmail.co.uk>
Date: Wed, 2 Dec 2015 14:22:18 +0000
Subject: [PATCH 136/186] Fix buildbots broken by r254508

g++ 4.7 does not allow an inline defaulted virtual destructor to be overridden,
giving the error "looser throw specifier for ... overridding ~SCEVPredicate()
noexcept (true)" (see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53613).

The work-around given in the bug report above has been utilised here.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254511 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/ScalarEvolution.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h
index 0000b42391af..3c7f1e052a94 100644
--- a/include/llvm/Analysis/ScalarEvolution.h
+++ b/include/llvm/Analysis/ScalarEvolution.h
@@ -183,7 +183,7 @@ namespace llvm {
 
   protected:
     SCEVPredicateKind Kind;
-    virtual ~SCEVPredicate() = default;
+    virtual ~SCEVPredicate();
     SCEVPredicate(const SCEVPredicate&) = default;
     SCEVPredicate &operator=(const SCEVPredicate&) = default;
 
@@ -211,6 +211,9 @@ namespace llvm {
     /// if this is a SCEVUnionPredicate.
     virtual const SCEV *getExpr() const = 0;
   };
+  
+  /// Default destructor must be defined outside class due to g++ PR53613.
+  SCEVPredicate::~SCEVPredicate() = default;
 
   inline raw_ostream &operator<<(raw_ostream &OS, const SCEVPredicate &P) {
     P.print(OS);

From a933a559f6e509c1a55cf3824ddf7635538ebf1d Mon Sep 17 00:00:00 2001
From: Michael Zuckerman <Michael.zuckerman@intel.com>
Date: Wed, 2 Dec 2015 14:34:34 +0000
Subject: [PATCH 137/186] By intel spec

|9B DD /7| FSTSW m2byte| Valid Valid Store FPU status word at m2byteafter checking for pending unmasked floating-point exceptions.|
|9B DF E0| FSTSW AX| Valid Valid Store FPU status word in AX register after checking for pending unmasked floating-point exceptions.|
|DD /7 |FNSTSW *m2byte| Valid Valid Store FPU status word at m2bytewithout checking for pending unmasked floating-point exceptions.|
|DF E0 |FNSTSW *AX| Valid Valid Store FPU status word in AX register without checking for pending unmasked floating-point exceptions|

m2byte is word register, and therefor instruction operand need to be change from f32mem to i16mem.

Differential Revision: http://reviews.llvm.org/D14953


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254512 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86InstrFPStack.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td
index 8fecc9b1a3fd..03ae21125b0e 100644
--- a/lib/Target/X86/X86InstrFPStack.td
+++ b/lib/Target/X86/X86InstrFPStack.td
@@ -336,7 +336,7 @@ def FCOMP64m : FPI<0xDC, MRM3m, (outs), (ins f64mem:$src), "fcomp{l}\t$src">;
 
 def FRSTORm  : FPI<0xDD, MRM4m, (outs f32mem:$dst), (ins), "frstor\t$dst">;
 def FSAVEm   : FPI<0xDD, MRM6m, (outs f32mem:$dst), (ins), "fnsave\t$dst">;
-def FNSTSWm  : FPI<0xDD, MRM7m, (outs f32mem:$dst), (ins), "fnstsw\t$dst">;
+def FNSTSWm  : FPI<0xDD, MRM7m, (outs i16mem:$dst), (ins), "fnstsw\t$dst">;
 
 def FICOM16m : FPI<0xDE, MRM2m, (outs), (ins i16mem:$src), "ficom{s}\t$src">;
 def FICOMP16m: FPI<0xDE, MRM3m, (outs), (ins i16mem:$src), "ficomp{s}\t$src">;

From f9aa57da75782f8956ecaf6d33d32d7daaa929a6 Mon Sep 17 00:00:00 2001
From: Andy Gibbs <andyg1001@hotmail.co.uk>
Date: Wed, 2 Dec 2015 14:36:48 +0000
Subject: [PATCH 138/186] Rollback r254508 and r254511 to fix buildbots

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254513 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/ScalarEvolution.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h
index 3c7f1e052a94..da8c68aa6832 100644
--- a/include/llvm/Analysis/ScalarEvolution.h
+++ b/include/llvm/Analysis/ScalarEvolution.h
@@ -183,7 +183,7 @@ namespace llvm {
 
   protected:
     SCEVPredicateKind Kind;
-    virtual ~SCEVPredicate();
+    ~SCEVPredicate() = default;
     SCEVPredicate(const SCEVPredicate&) = default;
     SCEVPredicate &operator=(const SCEVPredicate&) = default;
 
@@ -211,9 +211,6 @@ namespace llvm {
     /// if this is a SCEVUnionPredicate.
     virtual const SCEV *getExpr() const = 0;
   };
-  
-  /// Default destructor must be defined outside class due to g++ PR53613.
-  SCEVPredicate::~SCEVPredicate() = default;
 
   inline raw_ostream &operator<<(raw_ostream &OS, const SCEVPredicate &P) {
     P.print(OS);

From f43a53680ab84d603007b06ed2ff43573072673a Mon Sep 17 00:00:00 2001
From: Rafael Espindola <rafael.espindola@gmail.com>
Date: Wed, 2 Dec 2015 15:02:43 +0000
Subject: [PATCH 139/186] Add an interesting case we already get right.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254514 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Linker/ctors4.ll | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
 create mode 100644 test/Linker/ctors4.ll

diff --git a/test/Linker/ctors4.ll b/test/Linker/ctors4.ll
new file mode 100644
index 000000000000..c4500841f174
--- /dev/null
+++ b/test/Linker/ctors4.ll
@@ -0,0 +1,14 @@
+; RUN: llvm-link -S %s -o - | FileCheck %s
+
+define void @f() {
+  ret void
+}
+
+; We lazy link @v, which causes llvm.global_ctors to have the corresponding
+; entry.
+@v = linkonce global i8 42
+
+@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @f, i8* @v }]
+
+; CHECK: @llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @f, i8* @v }]
+

From 26a5db075f0b48f6e539ebff21c060264935ea01 Mon Sep 17 00:00:00 2001
From: David Majnemer <david.majnemer@gmail.com>
Date: Wed, 2 Dec 2015 16:15:07 +0000
Subject: [PATCH 140/186] Do (A == C1 || A == C2) -> (A & ~(C1 ^ C2)) == C1
 rather than (A == C1 || A == C2) -> (A | (C1 ^ C2)) == C2 when C1 ^ C2 is a
 power of 2.

Differential Revision: http://reviews.llvm.org/D14223

Patch by Amaury SECHET!

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254518 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/InstCombine/InstCombineAndOrXor.cpp | 8 ++++----
 test/Transforms/InstCombine/icmp.ll                | 4 ++--
 test/Transforms/InstCombine/load-cmp.ll            | 4 ++--
 test/Transforms/InstCombine/or.ll                  | 2 +-
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index f72089e6c8ef..2bf6faa47b93 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -1962,14 +1962,14 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
     case ICmpInst::ICMP_EQ:
       if (LHS->getOperand(0) == RHS->getOperand(0)) {
         // if LHSCst and RHSCst differ only by one bit:
-        // (A == C1 || A == C2) -> (A & ~(C1 ^ C2)) == C1
+        // (A == C1 || A == C2) -> (A | (C1 ^ C2)) == C2
         assert(LHSCst->getValue().ule(LHSCst->getValue()));
 
         APInt Xor = LHSCst->getValue() ^ RHSCst->getValue();
         if (Xor.isPowerOf2()) {
-          Value *NegCst = Builder->getInt(~Xor);
-          Value *And = Builder->CreateAnd(LHS->getOperand(0), NegCst);
-          return Builder->CreateICmp(ICmpInst::ICMP_EQ, And, LHSCst);
+          Value *Cst = Builder->getInt(Xor);
+          Value *Or = Builder->CreateOr(LHS->getOperand(0), Cst);
+          return Builder->CreateICmp(ICmpInst::ICMP_EQ, Or, RHSCst);
         }
       }
 
diff --git a/test/Transforms/InstCombine/icmp.ll b/test/Transforms/InstCombine/icmp.ll
index a85bee5c2c9b..7d6ec96b5328 100644
--- a/test/Transforms/InstCombine/icmp.ll
+++ b/test/Transforms/InstCombine/icmp.ll
@@ -819,8 +819,8 @@ define i1 @test68(i32 %x) nounwind uwtable {
 
 ; PR14708
 ; CHECK-LABEL: @test69(
-; CHECK: %1 = and i32 %c, -33
-; CHECK: %2 = icmp eq i32 %1, 65
+; CHECK: %1 = or i32 %c, 32
+; CHECK: %2 = icmp eq i32 %1, 97
 ; CHECK: ret i1 %2
 define i1 @test69(i32 %c) nounwind uwtable {
   %1 = icmp eq i32 %c, 97
diff --git a/test/Transforms/InstCombine/load-cmp.ll b/test/Transforms/InstCombine/load-cmp.ll
index 0bd9eb52191a..fe1bf1517539 100644
--- a/test/Transforms/InstCombine/load-cmp.ll
+++ b/test/Transforms/InstCombine/load-cmp.ll
@@ -148,8 +148,8 @@ define i1 @test8(i32 %X) {
   %S = icmp eq i16 %R, 0
   ret i1 %S
 ; CHECK-LABEL: @test8(
-; CHECK-NEXT: and i32 %X, -2
-; CHECK-NEXT: icmp eq i32 {{.*}}, 8
+; CHECK-NEXT: or i32 %X, 1
+; CHECK-NEXT: icmp eq i32 {{.*}}, 9
 ; CHECK-NEXT: ret i1
 }
 
diff --git a/test/Transforms/InstCombine/or.ll b/test/Transforms/InstCombine/or.ll
index b91a5954d97e..a2bc4e7d9832 100644
--- a/test/Transforms/InstCombine/or.ll
+++ b/test/Transforms/InstCombine/or.ll
@@ -182,7 +182,7 @@ define i1 @test19(i32 %A) {
         %D = or i1 %B, %C
         ret i1 %D
 ; CHECK-LABEL: @test19(
-; CHECK: and i32
+; CHECK: or i32
 ; CHECK: icmp eq 
 ; CHECK: ret i1
 }

From 26ecf8e5acc30aee067dc759723a33ec24858485 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 2 Dec 2015 17:00:42 +0000
Subject: [PATCH 141/186] AMDGPU/SI: Don't emit group segment global variables

Summary: Only global or readonly segment variables should appear in object files.

Reviewers: arsenm

Subscribers: arsenm, llvm-commits

Differential Revision: http://reviews.llvm.org/D15111

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254519 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp     |  7 +++++++
 lib/Target/AMDGPU/AMDGPUAsmPrinter.h       |  2 ++
 lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp |  6 ++++++
 lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h   |  3 +++
 test/CodeGen/AMDGPU/hsa-group-segment.ll   | 14 ++++++++++++++
 5 files changed, 32 insertions(+)
 create mode 100644 test/CodeGen/AMDGPU/hsa-group-segment.ll

diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 87a0cf20d176..84c71e0cc9cf 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -123,6 +123,13 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
   AsmPrinter::EmitFunctionEntryLabel();
 }
 
+void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
+
+  if (TM.getTargetTriple().getOS() != Triple::AMDHSA ||
+      !AMDGPU::isGroupSegment(GV))
+    return AsmPrinter::EmitGlobalVariable(GV);
+}
+
 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
 
   // The starting address of all shader programs must be 256 bytes aligned.
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 687f239ecab5..1aaef00a4dd0 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -103,6 +103,8 @@ class AMDGPUAsmPrinter : public AsmPrinter {
 
   void EmitFunctionEntryLabel() override;
 
+  void EmitGlobalVariable(const GlobalVariable *GV) override;
+
   bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                        unsigned AsmVariant, const char *ExtraCode,
                        raw_ostream &O) override;
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index e70f79d5a7be..81ade517254f 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -7,6 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 #include "AMDGPUBaseInfo.h"
+#include "AMDGPU.h"
+#include "llvm/IR/GlobalValue.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/SubtargetFeature.h"
@@ -66,5 +68,9 @@ MCSection *getHSATextSection(MCContext &Ctx) {
                            ELF::SHF_AMDGPU_HSA_CODE);
 }
 
+bool isGroupSegment(const GlobalValue *GV) {
+  return GV->getType()->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+}
+
 } // End namespace AMDGPU
 } // End namespace llvm
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 59a32a6b592d..bf9377ed86cf 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -15,6 +15,7 @@
 namespace llvm {
 
 class FeatureBitset;
+class GlobalValue;
 class MCContext;
 class MCSection;
 
@@ -31,6 +32,8 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
                                const FeatureBitset &Features);
 MCSection *getHSATextSection(MCContext &Ctx);
 
+bool isGroupSegment(const GlobalValue *GV);
+
 } // end namespace AMDGPU
 } // end namespace llvm
 
diff --git a/test/CodeGen/AMDGPU/hsa-group-segment.ll b/test/CodeGen/AMDGPU/hsa-group-segment.ll
new file mode 100644
index 000000000000..1999dc38a6b0
--- /dev/null
+++ b/test/CodeGen/AMDGPU/hsa-group-segment.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s
+
+@internal_group = internal addrspace(3) global i32 undef
+@external_group = addrspace(3) global i32 undef
+
+define void @test() {
+entry:
+  store i32 0, i32 addrspace(3)* @internal_group
+  store i32 0, i32 addrspace(3)* @external_group
+  ret void
+}
+
+; HSA-NOT: internal_group:
+; HSA-NOT: external_group:

From 454061bf3aeba2ac0b7b5dadb4f07e497139609f Mon Sep 17 00:00:00 2001
From: Dan Gohman <dan433584@gmail.com>
Date: Wed, 2 Dec 2015 18:08:49 +0000
Subject: [PATCH 142/186] [WebAssembly] Fix comments to say "LIFO" instead of
 "FIFO" when describing a stack.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254523 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp       | 2 +-
 lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h | 2 +-
 lib/Target/WebAssembly/WebAssemblyRegStackify.cpp       | 2 +-
 lib/Target/WebAssembly/WebAssemblyRegisterInfo.td       | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
index dbd00bc10b1c..110316ba57b2 100644
--- a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
@@ -341,7 +341,7 @@ bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) {
   PlaceMarkers(MF, MLI, TII, MDT);
 
 #ifndef NDEBUG
-  // Verify that block and loop beginnings and endings are in FIFO order, and
+  // Verify that block and loop beginnings and endings are in LIFO order, and
   // that all references to blocks are to blocks on the stack at the point of
   // the reference.
   SmallVector<std::pair<MachineBasicBlock *, bool>, 0> Stack;
diff --git a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
index 4760f0d576e4..62c5f33cfad7 100644
--- a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
@@ -37,7 +37,7 @@ class WebAssemblyFunctionInfo final : public MachineFunctionInfo {
   /// determined or made to meet the stack requirements:
   ///   - single use (per path)
   ///   - single def (per path)
-  ///   - defined and used in FIFO order with other stack registers
+  ///   - defined and used in LIFO order with other stack registers
   BitVector VRegStackified;
 
 public:
diff --git a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
index c3847dd9fcb4..bdccc8577c5e 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -117,7 +117,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
         break;
 
       // Iterate through the inputs in reverse order, since we'll be pulling
-      // operands off the stack in FIFO order.
+      // operands off the stack in LIFO order.
       bool AnyStackified = false;
       for (MachineOperand &Op : reverse(Insert->uses())) {
         // We're only interested in explicit virtual register operands.
diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
index 4057ff7a9b43..80a83fa76b57 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
+++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
@@ -40,7 +40,7 @@ def F32_0 : WebAssemblyReg<"%f32.0">;
 def F64_0 : WebAssemblyReg<"%f64.0">;
 
 // The expression stack "register". This is an opaque entity which serves to
-// order uses and defs that must remain in FIFO order.
+// order uses and defs that must remain in LIFO order.
 def EXPR_STACK : WebAssemblyReg<"STACK">;
 
 // The incoming arguments "register". This is an opaque entity which serves to

From 57b1a9599b8b3a3c571103ed481932c2768d8da9 Mon Sep 17 00:00:00 2001
From: Tim Northover <tnorthover@apple.com>
Date: Wed, 2 Dec 2015 18:12:57 +0000
Subject: [PATCH 143/186] AArch64: use ldxp/stxp pair to implement 128-bit
 atomic loads.

The ARM ARM is clear that 128-bit loads are only guaranteed to have been atomic
if there has been a corresponding successful stxp. It's less clear for AArch32, so
I'm leaving that alone for now.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254524 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Target/TargetLowering.h       | 10 ++--
 lib/CodeGen/AtomicExpandPass.cpp           | 68 ++++++++++++----------
 lib/Target/ARM/ARMISelLowering.cpp         |  2 +-
 lib/Target/Hexagon/HexagonISelLowering.cpp |  2 +-
 test/CodeGen/AArch64/arm64-atomic-128.ll   |  7 ++-
 test/CodeGen/ARM/atomic-64bit.ll           |  6 ++
 6 files changed, 57 insertions(+), 38 deletions(-)

diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index cb5a5796e983..819458dbb0f0 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -130,10 +130,12 @@ class TargetLoweringBase {
   /// support for these atomic instructions, and also have different options
   /// w.r.t. what they should expand to.
   enum class AtomicExpansionKind {
-    None,      // Don't expand the instruction.
-    LLSC,      // Expand the instruction into loadlinked/storeconditional; used
-               // by ARM/AArch64.
-    CmpXChg,   // Expand the instruction into cmpxchg; used by at least X86.
+    None,    // Don't expand the instruction.
+    LLSC,    // Expand the instruction into loadlinked/storeconditional; used
+             // by ARM/AArch64.
+    LLOnly,  // Expand the (load) instruction into just a load-linked, which has
+             // greater atomic guarantees than a normal load.
+    CmpXChg, // Expand the instruction into cmpxchg; used by at least X86.
   };
 
   static ISD::NodeType getExtendForContent(BooleanContent Content) {
diff --git a/lib/CodeGen/AtomicExpandPass.cpp b/lib/CodeGen/AtomicExpandPass.cpp
index e7998db4a7c1..e4b7c5a62780 100644
--- a/lib/CodeGen/AtomicExpandPass.cpp
+++ b/lib/CodeGen/AtomicExpandPass.cpp
@@ -51,7 +51,9 @@ namespace {
     bool expandAtomicLoadToCmpXchg(LoadInst *LI);
     bool expandAtomicStore(StoreInst *SI);
     bool tryExpandAtomicRMW(AtomicRMWInst *AI);
-    bool expandAtomicRMWToLLSC(AtomicRMWInst *AI);
+    bool expandAtomicOpToLLSC(
+        Instruction *I, Value *Addr, AtomicOrdering MemOpOrder,
+        std::function<Value *(IRBuilder<> &, Value *)> PerformOp);
     bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
     bool isIdempotentRMW(AtomicRMWInst *AI);
     bool simplifyIdempotentRMW(AtomicRMWInst *AI);
@@ -174,13 +176,15 @@ bool AtomicExpand::tryExpandAtomicLoad(LoadInst *LI) {
   switch (TLI->shouldExpandAtomicLoadInIR(LI)) {
   case TargetLoweringBase::AtomicExpansionKind::None:
     return false;
-  case TargetLoweringBase::AtomicExpansionKind::LLSC: {
+  case TargetLoweringBase::AtomicExpansionKind::LLSC:
+    return expandAtomicOpToLLSC(
+        LI, LI->getPointerOperand(), LI->getOrdering(),
+        [](IRBuilder<> &Builder, Value *Loaded) { return Loaded; });
+  case TargetLoweringBase::AtomicExpansionKind::LLOnly:
     return expandAtomicLoadToLL(LI);
-  }
-  case TargetLoweringBase::AtomicExpansionKind::CmpXChg: {
+  case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
     return expandAtomicLoadToCmpXchg(LI);
   }
-  }
   llvm_unreachable("Unhandled case in tryExpandAtomicLoad");
 }
 
@@ -192,6 +196,7 @@ bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) {
   // to be single-copy atomic by ARM is an ldrexd (A3.5.3).
   Value *Val =
       TLI->emitLoadLinked(Builder, LI->getPointerOperand(), LI->getOrdering());
+  TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
 
   LI->replaceAllUsesWith(Val);
   LI->eraseFromParent();
@@ -245,20 +250,6 @@ static void createCmpXchgInstFun(IRBuilder<> &Builder, Value *Addr,
   NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
 }
 
-bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) {
-  switch (TLI->shouldExpandAtomicRMWInIR(AI)) {
-  case TargetLoweringBase::AtomicExpansionKind::None:
-    return false;
-  case TargetLoweringBase::AtomicExpansionKind::LLSC: {
-    return expandAtomicRMWToLLSC(AI);
-  }
-  case TargetLoweringBase::AtomicExpansionKind::CmpXChg: {
-    return expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun);
-  }
-  }
-  llvm_unreachable("Unhandled case in tryExpandAtomicRMW");
-}
-
 /// Emit IR to implement the given atomicrmw operation on values in registers,
 /// returning the new value.
 static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder,
@@ -296,10 +287,28 @@ static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder,
   }
 }
 
-bool AtomicExpand::expandAtomicRMWToLLSC(AtomicRMWInst *AI) {
-  AtomicOrdering MemOpOrder = AI->getOrdering();
-  Value *Addr = AI->getPointerOperand();
-  BasicBlock *BB = AI->getParent();
+bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) {
+  switch (TLI->shouldExpandAtomicRMWInIR(AI)) {
+  case TargetLoweringBase::AtomicExpansionKind::None:
+    return false;
+  case TargetLoweringBase::AtomicExpansionKind::LLSC:
+    return expandAtomicOpToLLSC(AI, AI->getPointerOperand(), AI->getOrdering(),
+                                [&](IRBuilder<> &Builder, Value *Loaded) {
+                                  return performAtomicOp(AI->getOperation(),
+                                                         Builder, Loaded,
+                                                         AI->getValOperand());
+                                });
+  case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
+    return expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun);
+  default:
+    llvm_unreachable("Unhandled case in tryExpandAtomicRMW");
+  }
+}
+
+bool AtomicExpand::expandAtomicOpToLLSC(
+    Instruction *I, Value *Addr, AtomicOrdering MemOpOrder,
+    std::function<Value *(IRBuilder<> &, Value *)> PerformOp) {
+  BasicBlock *BB = I->getParent();
   Function *F = BB->getParent();
   LLVMContext &Ctx = F->getContext();
 
@@ -317,11 +326,11 @@ bool AtomicExpand::expandAtomicRMWToLLSC(AtomicRMWInst *AI) {
   // atomicrmw.end:
   //     fence?
   //     [...]
-  BasicBlock *ExitBB = BB->splitBasicBlock(AI->getIterator(), "atomicrmw.end");
+  BasicBlock *ExitBB = BB->splitBasicBlock(I->getIterator(), "atomicrmw.end");
   BasicBlock *LoopBB =  BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);
 
-  // This grabs the DebugLoc from AI.
-  IRBuilder<> Builder(AI);
+  // This grabs the DebugLoc from I.
+  IRBuilder<> Builder(I);
 
   // The split call above "helpfully" added a branch at the end of BB (to the
   // wrong place), but we might want a fence too. It's easiest to just remove
@@ -334,8 +343,7 @@ bool AtomicExpand::expandAtomicRMWToLLSC(AtomicRMWInst *AI) {
   Builder.SetInsertPoint(LoopBB);
   Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
 
-  Value *NewVal =
-      performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand());
+  Value *NewVal = PerformOp(Builder, Loaded);
 
   Value *StoreSuccess =
       TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder);
@@ -345,8 +353,8 @@ bool AtomicExpand::expandAtomicRMWToLLSC(AtomicRMWInst *AI) {
 
   Builder.SetInsertPoint(ExitBB, ExitBB->begin());
 
-  AI->replaceAllUsesWith(Loaded);
-  AI->eraseFromParent();
+  I->replaceAllUsesWith(Loaded);
+  I->eraseFromParent();
 
   return true;
 }
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index e8f3ab65bdbe..33f74a3ba9fd 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -11891,7 +11891,7 @@ bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
 TargetLowering::AtomicExpansionKind
 ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
   unsigned Size = LI->getType()->getPrimitiveSizeInBits();
-  return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLSC
+  return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly
                                                   : AtomicExpansionKind::None;
 }
 
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index 969edf6d5572..04f5b6649293 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -2863,7 +2863,7 @@ TargetLowering::AtomicExpansionKind
 HexagonTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
   // Do not expand loads and stores that don't exceed 64 bits.
   return LI->getType()->getPrimitiveSizeInBits() > 64
-             ? AtomicExpansionKind::LLSC
+             ? AtomicExpansionKind::LLOnly
              : AtomicExpansionKind::None;
 }
 
diff --git a/test/CodeGen/AArch64/arm64-atomic-128.ll b/test/CodeGen/AArch64/arm64-atomic-128.ll
index a76cf74a6d0c..44c24c51f0df 100644
--- a/test/CodeGen/AArch64/arm64-atomic-128.ll
+++ b/test/CodeGen/AArch64/arm64-atomic-128.ll
@@ -173,10 +173,13 @@ define i128 @atomic_load_seq_cst(i128* %p) {
    ret i128 %r
 }
 
-define i128 @atomic_load_relaxed(i128* %p) {
+define i128 @atomic_load_relaxed(i64, i64, i128* %p) {
 ; CHECK-LABEL: atomic_load_relaxed:
 ; CHECK-NOT: dmb
-; CHECK: ldxp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldxp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x2]
+; CHECK-NEXT: stxp [[SUCCESS:w[0-9]+]], [[LO]], [[HI]], [x2]
+; CHECK: cbnz [[SUCCESS]], [[LABEL]]
 ; CHECK-NOT: dmb
    %r = load atomic i128, i128* %p monotonic, align 16
    ret i128 %r
diff --git a/test/CodeGen/ARM/atomic-64bit.ll b/test/CodeGen/ARM/atomic-64bit.ll
index 7510d6ccdc33..573cd45c0825 100644
--- a/test/CodeGen/ARM/atomic-64bit.ll
+++ b/test/CodeGen/ARM/atomic-64bit.ll
@@ -208,10 +208,16 @@ define i64 @test7(i64* %ptr, i64 %val1, i64 %val2) {
 define i64 @test8(i64* %ptr) {
 ; CHECK-LABEL: test8:
 ; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
+; CHECK-NOT: strexd
+; CHECK: clrex
+; CHECK-NOT: strexd
 ; CHECK: dmb {{ish$}}
 
 ; CHECK-THUMB-LABEL: test8:
 ; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
+; CHECK-THUMB-NOT: strexd
+; CHECK-THUMB: clrex
+; CHECK-THUMB-NOT: strexd
 ; CHECK-THUMB: dmb {{ish$}}
 
   %r = load atomic i64, i64* %ptr seq_cst, align 8

From 1c2b998913d05c83a781da1a15738ac4c9379f6f Mon Sep 17 00:00:00 2001
From: Nathan Slingerland <slingn@gmail.com>
Date: Wed, 2 Dec 2015 18:19:24 +0000
Subject: [PATCH 144/186] [llvm-profdata] Change instr prof counter overflow to
 saturate rather than discard

Summary: This changes overflow handling during instrumentation profile merge. Rathar than throwing away records that would result in counter overflow, merged counts are instead clamped to the maximum representable value. A warning about counter overflow is still surfaced to the user as before.

Reviewers: dnovillo, davidxl, silvas

Subscribers: llvm-commits

Differential Revision: http://reviews.llvm.org/D14893

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254525 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ProfileData/InstrProf.h       | 17 ++++++++++-------
 lib/ProfileData/InstrProfWriter.cpp        | 11 ++++++-----
 test/tools/llvm-profdata/overflow.proftext | 16 ++++++++++++----
 unittests/ProfileData/InstrProfTest.cpp    |  5 ++++-
 4 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/include/llvm/ProfileData/InstrProf.h b/include/llvm/ProfileData/InstrProf.h
index 13f6c70b3e83..956485119102 100644
--- a/include/llvm/ProfileData/InstrProf.h
+++ b/include/llvm/ProfileData/InstrProf.h
@@ -428,19 +428,22 @@ instrprof_error InstrProfRecord::merge(InstrProfRecord &Other) {
   if (Counts.size() != Other.Counts.size())
     return instrprof_error::count_mismatch;
 
+  instrprof_error Result = instrprof_error::success;
+
   for (size_t I = 0, E = Other.Counts.size(); I < E; ++I) {
-    if (Counts[I] + Other.Counts[I] < Counts[I])
-      return instrprof_error::counter_overflow;
-    Counts[I] += Other.Counts[I];
+    bool ResultOverflowed;
+    Counts[I] = SaturatingAdd(Counts[I], Other.Counts[I], ResultOverflowed);
+    if (ResultOverflowed)
+      Result = instrprof_error::counter_overflow;
   }
 
   for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) {
-    instrprof_error result = mergeValueProfData(Kind, Other);
-    if (result != instrprof_error::success)
-      return result;
+    instrprof_error MergeValueResult = mergeValueProfData(Kind, Other);
+    if (MergeValueResult != instrprof_error::success)
+      Result = MergeValueResult;
   }
 
-  return instrprof_error::success;
+  return Result;
 }
 
 inline support::endianness getHostEndianness() {
diff --git a/lib/ProfileData/InstrProfWriter.cpp b/lib/ProfileData/InstrProfWriter.cpp
index f9cc2afe3da0..78bec012eeb2 100644
--- a/lib/ProfileData/InstrProfWriter.cpp
+++ b/lib/ProfileData/InstrProfWriter.cpp
@@ -107,22 +107,23 @@ std::error_code InstrProfWriter::addRecord(InstrProfRecord &&I) {
   std::tie(Where, NewFunc) =
       ProfileDataMap.insert(std::make_pair(I.Hash, InstrProfRecord()));
   InstrProfRecord &Dest = Where->second;
+
+  instrprof_error Result;
   if (NewFunc) {
     // We've never seen a function with this name and hash, add it.
     Dest = std::move(I);
+    Result = instrprof_error::success;
   } else {
     // We're updating a function we've seen before.
-    instrprof_error MergeResult = Dest.merge(I);
-    if (MergeResult != instrprof_error::success) {
-      return MergeResult;
-    }
+    Result = Dest.merge(I);
   }
 
   // We keep track of the max function count as we go for simplicity.
+  // Update this statistic no matter the result of the merge.
   if (Dest.Counts[0] > MaxFunctionCount)
     MaxFunctionCount = Dest.Counts[0];
 
-  return instrprof_error::success;
+  return Result;
 }
 
 std::pair<uint64_t, uint64_t> InstrProfWriter::writeImpl(raw_ostream &OS) {
diff --git a/test/tools/llvm-profdata/overflow.proftext b/test/tools/llvm-profdata/overflow.proftext
index cbf3bf161823..b8401ffd84df 100644
--- a/test/tools/llvm-profdata/overflow.proftext
+++ b/test/tools/llvm-profdata/overflow.proftext
@@ -1,12 +1,20 @@
-# RUN: llvm-profdata merge %s -o %t.out 2>&1 | FileCheck %s
-# CHECK: overflow.proftext: overflow: Counter overflow
+# RUN: llvm-profdata merge %s -o %t.out 2>&1 | FileCheck %s --check-prefix=MERGE
+# RUN: llvm-profdata show %t.out | FileCheck %s --check-prefix=SHOW
+# MERGE: overflow.proftext: overflow: Counter overflow
+# SHOW: Total functions: 1
+# SHOW: Maximum function count: 18446744073709551615
+# SHOW: Maximum internal block count: 18446744073709551615
 
 overflow
 1
-1
+3
+18446744073709551615
 9223372036854775808
+18446744073709551615
 
 overflow
 1
-1
+3
+9223372036854775808
 9223372036854775808
+0
diff --git a/unittests/ProfileData/InstrProfTest.cpp b/unittests/ProfileData/InstrProfTest.cpp
index 2f3adb65a0ee..635a5431a513 100644
--- a/unittests/ProfileData/InstrProfTest.cpp
+++ b/unittests/ProfileData/InstrProfTest.cpp
@@ -354,7 +354,7 @@ TEST_F(InstrProfTest, get_icall_data_merge1_saturation) {
   const uint64_t Max = std::numeric_limits<uint64_t>::max();
 
   InstrProfRecord Record1("caller", 0x1234, {1});
-  InstrProfRecord Record2("caller", 0x1234, {1});
+  InstrProfRecord Record2("caller", 0x1234, {Max});
   InstrProfRecord Record3("callee1", 0x1235, {3, 4});
 
   Record1.reserveSites(IPVK_IndirectCallTarget, 1);
@@ -375,6 +375,9 @@ TEST_F(InstrProfTest, get_icall_data_merge1_saturation) {
   // Verify saturation of counts.
   ErrorOr<InstrProfRecord> R = Reader->getInstrProfRecord("caller", 0x1234);
   ASSERT_TRUE(NoError(R.getError()));
+
+  ASSERT_EQ(Max, R.get().Counts[0]);
+
   ASSERT_EQ(1U, R.get().getNumValueSites(IPVK_IndirectCallTarget));
   std::unique_ptr<InstrProfValueData[]> VD =
           R.get().getValueForSite(IPVK_IndirectCallTarget, 0);

From 2b2fb7fbf2a02811f52ebc0a428837927b6f6340 Mon Sep 17 00:00:00 2001
From: Fiona Glaser <escha@apple.com>
Date: Wed, 2 Dec 2015 18:32:59 +0000
Subject: [PATCH 145/186] Scheduler / Regalloc: use unique_ptr[] instead of
 std::vector

vector.resize() is significantly slower than memset in many STLs
and the cost of initializing these vectors is significant on targets
with many registers. Since we don't need the overhead of a vector,
use a simple unique_ptr instead.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254526 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/MachineRegisterInfo.h    |  2 +-
 lib/CodeGen/MachineRegisterInfo.cpp           |  7 +++---
 .../SelectionDAG/ScheduleDAGRRList.cpp        | 24 ++++++++++---------
 3 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/include/llvm/CodeGen/MachineRegisterInfo.h b/include/llvm/CodeGen/MachineRegisterInfo.h
index cbfd8a37eaa6..0a1f62006327 100644
--- a/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -73,7 +73,7 @@ class MachineRegisterInfo {
 
   /// PhysRegUseDefLists - This is an array of the head of the use/def list for
   /// physical registers.
-  std::vector<MachineOperand *> PhysRegUseDefLists;
+  std::unique_ptr<MachineOperand *[]> PhysRegUseDefLists;
 
   /// getRegUseDefListHead - Return the head pointer for the register use/def
   /// list for the specified virtual or physical register.
diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp
index abf9b4d67696..03c82f46da63 100644
--- a/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/lib/CodeGen/MachineRegisterInfo.cpp
@@ -27,12 +27,11 @@ void MachineRegisterInfo::Delegate::anchor() {}
 MachineRegisterInfo::MachineRegisterInfo(const MachineFunction *MF)
   : MF(MF), TheDelegate(nullptr), IsSSA(true), TracksLiveness(true),
     TracksSubRegLiveness(false) {
+  unsigned NumRegs = getTargetRegisterInfo()->getNumRegs();
   VRegInfo.reserve(256);
   RegAllocHints.reserve(256);
-  UsedPhysRegMask.resize(getTargetRegisterInfo()->getNumRegs());
-
-  // Create the physreg use/def lists.
-  PhysRegUseDefLists.resize(getTargetRegisterInfo()->getNumRegs(), nullptr);
+  UsedPhysRegMask.resize(NumRegs);
+  PhysRegUseDefLists.reset(new MachineOperand*[NumRegs]());
 }
 
 /// setRegClass - Set the register class of the specified virtual register.
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index e9bd52034ffd..cb573f2effde 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -141,8 +141,8 @@ class ScheduleDAGRRList : public ScheduleDAGSDNodes {
   /// that are "live". These nodes must be scheduled before any other nodes that
   /// modifies the registers can be scheduled.
   unsigned NumLiveRegs;
-  std::vector<SUnit*> LiveRegDefs;
-  std::vector<SUnit*> LiveRegGens;
+  std::unique_ptr<SUnit*[]> LiveRegDefs;
+  std::unique_ptr<SUnit*[]> LiveRegGens;
 
   // Collect interferences between physical register use/defs.
   // Each interference is an SUnit and set of physical registers.
@@ -328,8 +328,8 @@ void ScheduleDAGRRList::Schedule() {
   NumLiveRegs = 0;
   // Allocate slots for each physical register, plus one for a special register
   // to track the virtual resource of a calling sequence.
-  LiveRegDefs.resize(TRI->getNumRegs() + 1, nullptr);
-  LiveRegGens.resize(TRI->getNumRegs() + 1, nullptr);
+  LiveRegDefs.reset(new SUnit*[TRI->getNumRegs() + 1]());
+  LiveRegGens.reset(new SUnit*[TRI->getNumRegs() + 1]());
   CallSeqEndForStart.clear();
   assert(Interferences.empty() && LRegsMap.empty() && "stale Interferences");
 
@@ -1218,7 +1218,7 @@ static MVT getPhysicalRegisterVT(SDNode *N, unsigned Reg,
 /// CheckForLiveRegDef - Return true and update live register vector if the
 /// specified register def of the specified SUnit clobbers any "live" registers.
 static void CheckForLiveRegDef(SUnit *SU, unsigned Reg,
-                               std::vector<SUnit*> &LiveRegDefs,
+                               SUnit **LiveRegDefs,
                                SmallSet<unsigned, 4> &RegAdded,
                                SmallVectorImpl<unsigned> &LRegs,
                                const TargetRegisterInfo *TRI) {
@@ -1240,11 +1240,11 @@ static void CheckForLiveRegDef(SUnit *SU, unsigned Reg,
 /// CheckForLiveRegDefMasked - Check for any live physregs that are clobbered
 /// by RegMask, and add them to LRegs.
 static void CheckForLiveRegDefMasked(SUnit *SU, const uint32_t *RegMask,
-                                     std::vector<SUnit*> &LiveRegDefs,
+                                     ArrayRef<SUnit*> LiveRegDefs,
                                      SmallSet<unsigned, 4> &RegAdded,
                                      SmallVectorImpl<unsigned> &LRegs) {
   // Look at all live registers. Skip Reg0 and the special CallResource.
-  for (unsigned i = 1, e = LiveRegDefs.size()-1; i != e; ++i) {
+  for (unsigned i = 1, e = LiveRegDefs.size(); i != e; ++i) {
     if (!LiveRegDefs[i]) continue;
     if (LiveRegDefs[i] == SU) continue;
     if (!MachineOperand::clobbersPhysReg(RegMask, i)) continue;
@@ -1278,7 +1278,7 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl<unsigned> &LRegs) {
   for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
        I != E; ++I) {
     if (I->isAssignedRegDep() && LiveRegDefs[I->getReg()] != SU)
-      CheckForLiveRegDef(I->getSUnit(), I->getReg(), LiveRegDefs,
+      CheckForLiveRegDef(I->getSUnit(), I->getReg(), LiveRegDefs.get(),
                          RegAdded, LRegs, TRI);
   }
 
@@ -1302,7 +1302,7 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl<unsigned> &LRegs) {
           for (; NumVals; --NumVals, ++i) {
             unsigned Reg = cast<RegisterSDNode>(Node->getOperand(i))->getReg();
             if (TargetRegisterInfo::isPhysicalRegister(Reg))
-              CheckForLiveRegDef(SU, Reg, LiveRegDefs, RegAdded, LRegs, TRI);
+              CheckForLiveRegDef(SU, Reg, LiveRegDefs.get(), RegAdded, LRegs, TRI);
           }
         } else
           i += NumVals;
@@ -1328,13 +1328,15 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl<unsigned> &LRegs) {
       }
     }
     if (const uint32_t *RegMask = getNodeRegMask(Node))
-      CheckForLiveRegDefMasked(SU, RegMask, LiveRegDefs, RegAdded, LRegs);
+      CheckForLiveRegDefMasked(SU, RegMask,
+                               makeArrayRef(LiveRegDefs.get(), TRI->getNumRegs()),
+                               RegAdded, LRegs);
 
     const MCInstrDesc &MCID = TII->get(Node->getMachineOpcode());
     if (!MCID.ImplicitDefs)
       continue;
     for (const uint16_t *Reg = MCID.getImplicitDefs(); *Reg; ++Reg)
-      CheckForLiveRegDef(SU, *Reg, LiveRegDefs, RegAdded, LRegs, TRI);
+      CheckForLiveRegDef(SU, *Reg, LiveRegDefs.get(), RegAdded, LRegs, TRI);
   }
 
   return !LRegs.empty();

From 6e3c74fe8dd92a64a6cd876dc552dcb214f353ba Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 2 Dec 2015 18:35:23 +0000
Subject: [PATCH 146/186] AMDGPU: Fix msan test failure

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254527 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/AMDGPUMachineFunction.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 21c7da663234..7c595d5a83e4 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -13,6 +13,7 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
   MachineFunctionInfo(),
   ShaderType(ShaderType::COMPUTE),
   LDSSize(0),
+  ABIArgOffset(0),
   ScratchSize(0),
   IsKernel(true) {
   Attribute A = MF.getFunction()->getFnAttribute(ShaderTypeAttribute);

From 8afe0eb7ca0ceaa5e813a63879a2b96aa696cd08 Mon Sep 17 00:00:00 2001
From: Fiona Glaser <escha@apple.com>
Date: Wed, 2 Dec 2015 18:46:23 +0000
Subject: [PATCH 147/186] Fix accidental off by one change

Didn't break any tests, but did unnecessary extra work.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254529 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index cb573f2effde..78985e01ef9a 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -1244,7 +1244,7 @@ static void CheckForLiveRegDefMasked(SUnit *SU, const uint32_t *RegMask,
                                      SmallSet<unsigned, 4> &RegAdded,
                                      SmallVectorImpl<unsigned> &LRegs) {
   // Look at all live registers. Skip Reg0 and the special CallResource.
-  for (unsigned i = 1, e = LiveRegDefs.size(); i != e; ++i) {
+  for (unsigned i = 1, e = LiveRegDefs.size()-1; i != e; ++i) {
     if (!LiveRegDefs[i]) continue;
     if (LiveRegDefs[i] == SU) continue;
     if (!MachineOperand::clobbersPhysReg(RegMask, i)) continue;

From b8f6117b7f770fb8c91507d25e448b288609f84e Mon Sep 17 00:00:00 2001
From: Kyle Butt <kyle+llvm@iteratee.net>
Date: Wed, 2 Dec 2015 18:53:33 +0000
Subject: [PATCH 148/186] Test Commit: iteratee

Remove whitespace from blank lines. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254531 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 438a2980f2ce..2261b71c5aa9 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -2780,7 +2780,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
     if (PPCSubTarget->hasVSX() && (N->getValueType(0) == MVT::v2f64 ||
                                   N->getValueType(0) == MVT::v2i64)) {
       ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
-      
+
       SDValue Op1 = N->getOperand(SVN->getMaskElt(0) < 2 ? 0 : 1),
               Op2 = N->getOperand(SVN->getMaskElt(1) < 2 ? 0 : 1);
       unsigned DM[2];
@@ -3106,7 +3106,7 @@ SDValue PPCDAGToDAGISel::combineToCMPB(SDNode *N) {
         if (!CurDAG->MaskedValueIsZero(Op0,
               APInt::getHighBitsSet(Bits, Bits - (b+1)*8)))
           return false;
-        
+
         LHS = Op0.getOperand(0);
         RHS = Op0.getOperand(1);
         return true;

From 325a79d6e98b9192276abe7421ba491fae199446 Mon Sep 17 00:00:00 2001
From: Kyle Butt <kyle+llvm@iteratee.net>
Date: Wed, 2 Dec 2015 18:58:51 +0000
Subject: [PATCH 149/186] [CodeGen]: Fix bad interaction with AntiDep breaking
 and inline asm.

AggressiveAntiDepBreaker was renaming registers specified by the user
for inline assembly. While this will work for compiler-specified
registers, it won't work for user-specified registers, and at the time
this runs, I don't currently see a way to distinguish them.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254532 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/AggressiveAntiDepBreaker.cpp      |   9 +-
 .../PowerPC/aantidep-inline-asm-use.ll        | 308 ++++++++++++++++++
 2 files changed, 314 insertions(+), 3 deletions(-)
 create mode 100644 test/CodeGen/PowerPC/aantidep-inline-asm-use.ll

diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
index 845408f15f5c..4060db74a9b7 100644
--- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp
+++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
@@ -364,9 +364,11 @@ void AggressiveAntiDepBreaker::PrescanInstruction(MachineInstr *MI,
 
     // If MI's defs have a special allocation requirement, don't allow
     // any def registers to be changed. Also assume all registers
-    // defined in a call must not be changed (ABI).
+    // defined in a call must not be changed (ABI). Inline assembly may
+    // reference either system calls or the register directly. Skip it until we
+    // can tell user specified registers from compiler-specified.
     if (MI->isCall() || MI->hasExtraDefRegAllocReq() ||
-        TII->isPredicated(MI)) {
+        TII->isPredicated(MI) || MI->isInlineAsm()) {
       DEBUG(if (State->GetGroup(Reg) != 0) dbgs() << "->g0(alloc-req)");
       State->UnionGroups(Reg, 0);
     }
@@ -428,6 +430,7 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr *MI,
   // If MI's uses have special allocation requirement, don't allow
   // any use registers to be changed. Also assume all registers
   // used in a call must not be changed (ABI).
+  // Inline Assembly register uses also cannot be safely changed.
   // FIXME: The issue with predicated instruction is more complex. We are being
   // conservatively here because the kill markers cannot be trusted after
   // if-conversion:
@@ -443,7 +446,7 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr *MI,
   // changed.
   bool Special = MI->isCall() ||
     MI->hasExtraSrcRegAllocReq() ||
-    TII->isPredicated(MI);
+    TII->isPredicated(MI) || MI->isInlineAsm();
 
   // Scan the register uses for this instruction and update
   // live-ranges, groups and RegRefs.
diff --git a/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll b/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll
new file mode 100644
index 000000000000..4e7e97750448
--- /dev/null
+++ b/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll
@@ -0,0 +1,308 @@
+; RUN: llc -O2 < %s | FileCheck %s
+; ModuleID = 'bugpoint-reduced-simplified.bc'
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-grtev4-linux-gnu"
+
+; Function Attrs: nounwind
+define void @_ZN10SubProcess19ScrubbedForkAndExecEiPiS0_PNS_7ResultsE() #0 align 2 {
+; CHECK: lis 3, 1234
+; CHECK-NOT: li 3
+; CHECK-NOT: ori 3
+; CHECK-NOT: addi 3
+; CHECK-NOT: addis 3
+; CHECK-NOT: lis 3
+; CHECK: sc
+  br i1 undef, label %1, label %2
+
+; <label>:1                                       ; preds = %0
+  br label %60
+
+; <label>:2                                       ; preds = %0
+  br i1 undef, label %3, label %4
+
+; <label>:3                                       ; preds = %2
+  unreachable
+
+; <label>:4                                       ; preds = %2
+  br i1 undef, label %.lr.ph111, label %_ZN10SubProcess12SafeSyscalls11sigprocmaskEiPKNS0_15kernel_sigset_tEPS1_.exit
+
+.lr.ph111:                                        ; preds = %4
+  br label %5
+
+_ZN10SubProcess12SafeSyscalls11sigprocmaskEiPKNS0_15kernel_sigset_tEPS1_.exit: ; preds = %12, %4
+  br i1 undef, label %_ZN10SubProcess12SafeSyscalls11sigprocmaskEiPKNS0_15kernel_sigset_tEPS1_.exit19.preheader, label %13
+
+; <label>:5                                       ; preds = %12, %.lr.ph111
+  br i1 undef, label %6, label %9
+
+; <label>:6                                       ; preds = %5
+  br i1 undef, label %7, label %8
+
+; <label>:7                                       ; preds = %6
+  unreachable
+
+; <label>:8                                       ; preds = %6
+  br label %12
+
+; <label>:9                                       ; preds = %5
+  br i1 undef, label %10, label %11
+
+; <label>:10                                      ; preds = %9
+  br label %12
+
+; <label>:11                                      ; preds = %9
+  br label %12
+
+; <label>:12                                      ; preds = %11, %10, %8
+  br i1 undef, label %5, label %_ZN10SubProcess12SafeSyscalls11sigprocmaskEiPKNS0_15kernel_sigset_tEPS1_.exit
+
+; <label>:13                                      ; preds = %_ZN10SubProcess12SafeSyscalls11sigprocmaskEiPKNS0_15kernel_sigset_tEPS1_.exit
+  br i1 undef, label %_ZN10SubProcess12SafeSyscalls11sigprocmaskEiPKNS0_15kernel_sigset_tEPS1_.exit19.preheader, label %14
+
+; <label>:14                                      ; preds = %13
+  br label %_ZN10SubProcess12SafeSyscalls11sigprocmaskEiPKNS0_15kernel_sigset_tEPS1_.exit19.preheader
+
+_ZN10SubProcess12SafeSyscalls11sigprocmaskEiPKNS0_15kernel_sigset_tEPS1_.exit19.preheader: ; preds = %14, %13, %_ZN10SubProcess12SafeSyscalls11sigprocmaskEiPKNS0_15kernel_sigset_tEPS1_.exit
+  br i1 undef, label %_ZN10SubProcess12SafeSyscalls5closeEi.exit.preheader, label %_ZN10SubProcess12SafeSyscalls11sigprocmaskEiPKNS0_15kernel_sigset_tEPS1_.exit19._crit_edge
+
+_ZN10SubProcess12SafeSyscalls5closeEi.exit.preheader: ; preds = %_ZN10SubProcess12SafeSyscalls11sigprocmaskEiPKNS0_15kernel_sigset_tEPS1_.exit19, %_ZN10SubProcess12SafeSyscalls11sigprocmaskEiPKNS0_15kernel_sigset_tEPS1_.exit19.preheader
+  br label %_ZN10SubProcess12SafeSyscalls5closeEi.exit
+
+_ZN10SubProcess12SafeSyscalls11sigprocmaskEiPKNS0_15kernel_sigset_tEPS1_.exit19._crit_edge: ; preds = %_ZN10SubProcess12SafeSyscalls11sigprocmaskEiPKNS0_15kernel_sigset_tEPS1_.exit19, %_ZN10SubProcess12SafeSyscalls11sigprocmaskEiPKNS0_15kernel_sigset_tEPS1_.exit19.preheader
+  br i1 undef, label %15, label %19
+
+_ZN10SubProcess12SafeSyscalls5closeEi.exit:       ; preds = %_ZN10SubProcess12SafeSyscalls5closeEi.exit, %_ZN10SubProcess12SafeSyscalls5closeEi.exit.preheader
+  br i1 undef, label %_ZN10SubProcess12SafeSyscalls5closeEi.exit, label %_ZN10SubProcess12SafeSyscalls11sigprocmaskEiPKNS0_15kernel_sigset_tEPS1_.exit19
+
+_ZN10SubProcess12SafeSyscalls11sigprocmaskEiPKNS0_15kernel_sigset_tEPS1_.exit19: ; preds = %_ZN10SubProcess12SafeSyscalls5closeEi.exit
+  br i1 undef, label %_ZN10SubProcess12SafeSyscalls11sigprocmaskEiPKNS0_15kernel_sigset_tEPS1_.exit19._crit_edge, label %_ZN10SubProcess12SafeSyscalls5closeEi.exit.preheader
+
+; <label>:15                                      ; preds = %_ZN10SubProcess12SafeSyscalls11sigprocmaskEiPKNS0_15kernel_sigset_tEPS1_.exit19._crit_edge
+  br label %16
+
+; <label>:16                                      ; preds = %17, %15
+  br i1 undef, label %17, label %.critedge.preheader
+
+; <label>:17                                      ; preds = %16
+  br i1 undef, label %16, label %.critedge.preheader
+
+.critedge.preheader:                              ; preds = %17, %16
+  br label %.critedge
+
+.critedge:                                        ; preds = %18, %.critedge.preheader
+  br i1 undef, label %18, label %.critedge8
+
+; <label>:18                                      ; preds = %.critedge
+  br i1 undef, label %.critedge, label %.critedge8
+
+.critedge8:                                       ; preds = %18, %.critedge
+  br label %59
+
+; <label>:19                                      ; preds = %_ZN10SubProcess12SafeSyscalls11sigprocmaskEiPKNS0_15kernel_sigset_tEPS1_.exit19._crit_edge
+  br label %_ZN10SubProcess12SafeSyscalls5closeEi.exit22
+
+_ZN10SubProcess12SafeSyscalls5closeEi.exit22:     ; preds = %_ZN10SubProcess12SafeSyscalls5closeEi.exit22, %19
+  br i1 undef, label %_ZN10SubProcess12SafeSyscalls5closeEi.exit22, label %20
+
+; <label>:20                                      ; preds = %_ZN10SubProcess12SafeSyscalls5closeEi.exit22
+  %21 = alloca i8, i64 undef, align 1
+  br label %.thread.outer
+
+.thread.outer:                                    ; preds = %._crit_edge, %20
+  br label %.thread
+
+.thread:                                          ; preds = %45, %.thread.outer
+  call void @llvm.memset.p0i8.i64(i8* undef, i8 0, i64 56, i32 8, i1 false)
+  store i8* %21, i8** undef, align 8
+  store i32 1073741824, i32* undef, align 8
+  %22 = call { i64, i64, i64, i64, i64, i64, i64 } asm sideeffect "sc\0A\09mfcr $0", "=&{r0},=&{r3},=&{r4},=&{r5},=&{r6},=&{r7},=&{r8},{r0},{r3},{r4},{r5},~{cr0},~{ctr},~{memory},~{r11},~{r12}"(i64 342, i64 80871424, i64 undef, i64 0) #2, !srcloc !1
+  br i1 undef, label %.lr.ph, label %.critedge15.preheader
+
+.critedge15.preheader:                            ; preds = %_ZN10SubProcess12SafeSyscalls7recvmsgEiPNS0_13kernel_msghdrEi.exit.backedge, %.thread
+  br i1 undef, label %.lr.ph93.preheader, label %.critedge15._crit_edge
+
+.lr.ph93.preheader:                               ; preds = %.critedge15.preheader
+  br label %.lr.ph93
+
+.lr.ph:                                           ; preds = %_ZN10SubProcess12SafeSyscalls7recvmsgEiPNS0_13kernel_msghdrEi.exit.backedge, %.thread
+  switch i32 undef, label %.critedge9 [
+    i32 11, label %_ZN10SubProcess12SafeSyscalls7recvmsgEiPNS0_13kernel_msghdrEi.exit.backedge
+    i32 4, label %_ZN10SubProcess12SafeSyscalls7recvmsgEiPNS0_13kernel_msghdrEi.exit.backedge
+  ]
+
+_ZN10SubProcess12SafeSyscalls7recvmsgEiPNS0_13kernel_msghdrEi.exit.backedge: ; preds = %.lr.ph, %.lr.ph
+  br i1 undef, label %.lr.ph, label %.critedge15.preheader
+
+.critedge9:                                       ; preds = %.lr.ph
+  unreachable
+
+.critedge15._crit_edge:                           ; preds = %.critedge15, %.critedge15.preheader
+  br i1 undef, label %35, label %34
+
+.lr.ph93:                                         ; preds = %.critedge15, %.lr.ph93.preheader
+  switch i32 undef, label %33 [
+    i32 0, label %23
+    i32 1, label %23
+    i32 2, label %23
+    i32 3, label %23
+    i32 4, label %23
+    i32 5, label %23
+    i32 6, label %23
+    i32 7, label %23
+    i32 8, label %27
+    i32 9, label %30
+  ]
+
+; <label>:23                                      ; preds = %.lr.ph93, %.lr.ph93, %.lr.ph93, %.lr.ph93, %.lr.ph93, %.lr.ph93, %.lr.ph93, %.lr.ph93
+  br i1 undef, label %24, label %.critedge15
+
+; <label>:24                                      ; preds = %23
+  br i1 undef, label %.critedge15, label %25
+
+; <label>:25                                      ; preds = %24
+  br i1 undef, label %.critedge15, label %26
+
+; <label>:26                                      ; preds = %25
+  unreachable
+
+; <label>:27                                      ; preds = %.lr.ph93
+  br i1 undef, label %.critedge15, label %28
+
+; <label>:28                                      ; preds = %27
+  br i1 undef, label %29, label %.critedge15
+
+; <label>:29                                      ; preds = %28
+  br label %.critedge15
+
+; <label>:30                                      ; preds = %.lr.ph93
+  br i1 undef, label %.critedge15, label %31
+
+; <label>:31                                      ; preds = %30
+  br i1 undef, label %32, label %.critedge15
+
+; <label>:32                                      ; preds = %31
+  br label %.critedge15
+
+; <label>:33                                      ; preds = %.lr.ph93
+  unreachable
+
+.critedge15:                                      ; preds = %32, %31, %30, %29, %28, %27, %25, %24, %23
+  br i1 undef, label %.lr.ph93, label %.critedge15._crit_edge
+
+; <label>:34                                      ; preds = %.critedge15._crit_edge
+  unreachable
+
+; <label>:35                                      ; preds = %.critedge15._crit_edge
+  br i1 undef, label %45, label %36
+
+; <label>:36                                      ; preds = %35
+  br i1 undef, label %37, label %38
+
+; <label>:37                                      ; preds = %36
+  br i1 undef, label %.preheader, label %38
+
+.preheader:                                       ; preds = %37
+  br i1 undef, label %.lr.ph101, label %._crit_edge
+
+.lr.ph101:                                        ; preds = %.preheader
+  br label %39
+
+; <label>:38                                      ; preds = %37, %36
+  unreachable
+
+; <label>:39                                      ; preds = %43, %.lr.ph101
+  br i1 undef, label %40, label %43
+
+; <label>:40                                      ; preds = %39
+  br i1 undef, label %_ZN10SubProcess12SafeSyscalls5fcntlEiil.exit17, label %41
+
+; <label>:41                                      ; preds = %40
+  unreachable
+
+_ZN10SubProcess12SafeSyscalls5fcntlEiil.exit17:   ; preds = %40
+  br i1 undef, label %42, label %_ZN10SubProcess12SafeSyscalls5fcntlEiil.exit
+
+; <label>:42                                      ; preds = %_ZN10SubProcess12SafeSyscalls5fcntlEiil.exit17
+  unreachable
+
+_ZN10SubProcess12SafeSyscalls5fcntlEiil.exit:     ; preds = %_ZN10SubProcess12SafeSyscalls5fcntlEiil.exit17
+  br i1 undef, label %.thread27, label %43
+
+; <label>:43                                      ; preds = %_ZN10SubProcess12SafeSyscalls5fcntlEiil.exit, %39
+  br i1 undef, label %39, label %._crit_edge
+
+.thread27:                                        ; preds = %_ZN10SubProcess12SafeSyscalls5fcntlEiil.exit
+  br label %58
+
+._crit_edge:                                      ; preds = %43, %.preheader
+  br i1 undef, label %.thread.outer, label %44
+
+; <label>:44                                      ; preds = %._crit_edge
+  unreachable
+
+; <label>:45                                      ; preds = %35
+  br i1 undef, label %46, label %.thread
+
+; <label>:46                                      ; preds = %45
+  br i1 undef, label %48, label %47
+
+; <label>:47                                      ; preds = %46
+  unreachable
+
+; <label>:48                                      ; preds = %46
+  br i1 undef, label %55, label %49
+
+; <label>:49                                      ; preds = %48
+  br i1 undef, label %50, label %51
+
+; <label>:50                                      ; preds = %49
+  br label %52
+
+; <label>:51                                      ; preds = %49
+  br label %52
+
+; <label>:52                                      ; preds = %51, %50
+  br label %53
+
+; <label>:53                                      ; preds = %54, %52
+  br i1 undef, label %54, label %.critedge13
+
+; <label>:54                                      ; preds = %53
+  br i1 undef, label %53, label %.critedge13
+
+.critedge13:                                      ; preds = %54, %53
+  br label %58
+
+; <label>:55                                      ; preds = %48
+  br label %56
+
+; <label>:56                                      ; preds = %57, %55
+  br i1 undef, label %57, label %.critedge14
+
+; <label>:57                                      ; preds = %56
+  br i1 undef, label %56, label %.critedge14
+
+.critedge14:                                      ; preds = %57, %56
+  br label %58
+
+; <label>:58                                      ; preds = %.critedge14, %.critedge13, %.thread27
+  br label %59
+
+; <label>:59                                      ; preds = %58, %.critedge8
+  br label %60
+
+; <label>:60                                      ; preds = %59, %1
+  ret void
+}
+
+; Function Attrs: nounwind argmemonly
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pwr8" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind argmemonly }
+attributes #2 = { nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version google3-trunk (trunk r251807)"}
+!1 = !{i32 -2140527538, i32 -2140527533}

From 21fca5ddcedd4c19303b08aaf1bf6b71399db7b4 Mon Sep 17 00:00:00 2001
From: Rafael Espindola <rafael.espindola@gmail.com>
Date: Wed, 2 Dec 2015 19:30:52 +0000
Subject: [PATCH 150/186] Fix linking when we copy over only a decl.

We were failing to copy the fact that the GV is weak and in the case of
an alias, producing invalid IR.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254538 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Linker/LinkModules.cpp     | 56 +++++++++++++++++++++-------------
 test/Linker/Inputs/comdat14.ll | 12 ++++++++
 test/Linker/comdat14.ll        |  9 ++++++
 3 files changed, 55 insertions(+), 22 deletions(-)
 create mode 100644 test/Linker/Inputs/comdat14.ll
 create mode 100644 test/Linker/comdat14.ll

diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index fba231100ee3..1a82bbc7c90e 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -453,7 +453,7 @@ class ModuleLinker {
   /// Handles cloning of a global values from the source module into
   /// the destination module, including setting the attributes and visibility.
   GlobalValue *copyGlobalValueProto(TypeMapTy &TypeMap, const GlobalValue *SGV,
-                                    const GlobalValue *DGV = nullptr);
+                                    const GlobalValue *DGV, bool ForDefinition);
 
   /// Check if we should promote the given local value to global scope.
   bool doPromoteLocalToGlobal(const GlobalValue *SGV);
@@ -594,14 +594,8 @@ static void forceRenaming(GlobalValue *GV, StringRef Name) {
 void ModuleLinker::copyGVAttributes(GlobalValue *NewGV,
                                     const GlobalValue *SrcGV) {
   auto *GA = dyn_cast<GlobalAlias>(SrcGV);
-  // Check for the special case of converting an alias (definition) to a
-  // non-alias (declaration). This can happen when we are importing and
-  // encounter a weak_any alias (weak_any defs may not be imported, see
-  // comments in ModuleLinker::getLinkage) or an alias whose base object is
-  // being imported as a declaration. In that case copy the attributes from the
-  // base object.
   if (GA && !dyn_cast<GlobalAlias>(NewGV)) {
-    assert(isPerformingImport() && !doImportAsDefinition(GA));
+    // FIXME: this is likelly bogus:
     NewGV->copyAttributesFrom(GA->getBaseObject());
   } else
     NewGV->copyAttributesFrom(SrcGV);
@@ -779,11 +773,12 @@ ModuleLinker::copyGlobalVariableProto(TypeMapTy &TypeMap,
   // No linking to be performed or linking from the source: simply create an
   // identical version of the symbol over in the dest module... the
   // initializer will be filled in later by LinkGlobalInits.
-  GlobalVariable *NewDGV = new GlobalVariable(
-      DstM, TypeMap.get(SGVar->getType()->getElementType()),
-      SGVar->isConstant(), getLinkage(SGVar), /*init*/ nullptr, getName(SGVar),
-      /*insertbefore*/ nullptr, SGVar->getThreadLocalMode(),
-      SGVar->getType()->getAddressSpace());
+  GlobalVariable *NewDGV =
+      new GlobalVariable(DstM, TypeMap.get(SGVar->getType()->getElementType()),
+                         SGVar->isConstant(), GlobalValue::ExternalLinkage,
+                         /*init*/ nullptr, getName(SGVar),
+                         /*insertbefore*/ nullptr, SGVar->getThreadLocalMode(),
+                         SGVar->getType()->getAddressSpace());
 
   return NewDGV;
 }
@@ -794,8 +789,8 @@ Function *ModuleLinker::copyFunctionProto(TypeMapTy &TypeMap,
                                           const Function *SF) {
   // If there is no linkage to be performed or we are linking from the source,
   // bring SF over.
-  return Function::Create(TypeMap.get(SF->getFunctionType()), getLinkage(SF),
-                          getName(SF), &DstM);
+  return Function::Create(TypeMap.get(SF->getFunctionType()),
+                          GlobalValue::ExternalLinkage, getName(SF), &DstM);
 }
 
 /// Set up prototypes for any aliases that come over from the source module.
@@ -829,7 +824,7 @@ GlobalValue *ModuleLinker::copyGlobalAliasProto(TypeMapTy &TypeMap,
   // bring over SGA.
   auto *Ty = TypeMap.get(SGA->getValueType());
   return GlobalAlias::create(Ty, SGA->getType()->getPointerAddressSpace(),
-                             getLinkage(SGA), getName(SGA), &DstM);
+                             GlobalValue::ExternalLinkage, getName(SGA), &DstM);
 }
 
 static GlobalValue::VisibilityTypes
@@ -857,14 +852,31 @@ void ModuleLinker::setVisibility(GlobalValue *NewGV, const GlobalValue *SGV,
 
 GlobalValue *ModuleLinker::copyGlobalValueProto(TypeMapTy &TypeMap,
                                                 const GlobalValue *SGV,
-                                                const GlobalValue *DGV) {
+                                                const GlobalValue *DGV,
+                                                bool ForDefinition) {
   GlobalValue *NewGV;
-  if (auto *SGVar = dyn_cast<GlobalVariable>(SGV))
+  if (auto *SGVar = dyn_cast<GlobalVariable>(SGV)) {
     NewGV = copyGlobalVariableProto(TypeMap, SGVar);
-  else if (auto *SF = dyn_cast<Function>(SGV))
+  } else if (auto *SF = dyn_cast<Function>(SGV)) {
     NewGV = copyFunctionProto(TypeMap, SF);
-  else
-    NewGV = copyGlobalAliasProto(TypeMap, cast<GlobalAlias>(SGV));
+  } else {
+    if (ForDefinition)
+      NewGV = copyGlobalAliasProto(TypeMap, cast<GlobalAlias>(SGV));
+    else
+      NewGV = new GlobalVariable(
+          DstM, TypeMap.get(SGV->getType()->getElementType()),
+          /*isConstant*/ false, GlobalValue::ExternalLinkage,
+          /*init*/ nullptr, getName(SGV),
+          /*insertbefore*/ nullptr, SGV->getThreadLocalMode(),
+          SGV->getType()->getAddressSpace());
+  }
+
+  if (ForDefinition)
+    NewGV->setLinkage(getLinkage(SGV));
+  else if (SGV->hasAvailableExternallyLinkage() || SGV->hasWeakLinkage() ||
+           SGV->hasLinkOnceLinkage())
+    NewGV->setLinkage(GlobalValue::ExternalWeakLinkage);
+
   copyGVAttributes(NewGV, SGV);
   setVisibility(NewGV, SGV, DGV);
   return NewGV;
@@ -1446,7 +1458,7 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) {
     // When linking from source we setVisibility from copyGlobalValueProto.
     setVisibility(NewGV, SGV, DGV);
   } else {
-    NewGV = copyGlobalValueProto(TypeMap, SGV, DGV);
+    NewGV = copyGlobalValueProto(TypeMap, SGV, DGV, LinkFromSrc);
 
     if (isPerformingImport() && !doImportAsDefinition(SGV))
       DoNotLinkFromSource.insert(SGV);
diff --git a/test/Linker/Inputs/comdat14.ll b/test/Linker/Inputs/comdat14.ll
new file mode 100644
index 000000000000..7243ff864441
--- /dev/null
+++ b/test/Linker/Inputs/comdat14.ll
@@ -0,0 +1,12 @@
+$c = comdat any
+
+@v2 = weak global i32 0, comdat ($c)
+define i32* @f2() {
+  ret i32* @v2
+}
+
+@v3 = weak alias i32, i32* @v2
+define i32* @f3() {
+  ret i32* @v3
+}
+
diff --git a/test/Linker/comdat14.ll b/test/Linker/comdat14.ll
new file mode 100644
index 000000000000..70d1142bdf92
--- /dev/null
+++ b/test/Linker/comdat14.ll
@@ -0,0 +1,9 @@
+; RUN: llvm-link -S %s %p/Inputs/comdat14.ll -o - | FileCheck %s
+
+$c = comdat any
+
+@v = global i32 0, comdat ($c)
+
+; CHECK: @v = global i32 0, comdat($c)
+; CHECK: @v2 = extern_weak global i32
+; CHECK: @v3 = extern_weak global i32

From 4855629115495ddb0fd5a528089115e13b734162 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <kparzysz@codeaurora.org>
Date: Wed, 2 Dec 2015 19:44:35 +0000
Subject: [PATCH 151/186] [Hexagon] Remove TFRI_V4 instruction, use existing
 A2_tfrsi instead

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254539 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Hexagon/HexagonInstrInfoV4.td | 20 --------------------
 1 file changed, 20 deletions(-)

diff --git a/lib/Target/Hexagon/HexagonInstrInfoV4.td b/lib/Target/Hexagon/HexagonInstrInfoV4.td
index 65612c590bfe..7389a40f4a4c 100644
--- a/lib/Target/Hexagon/HexagonInstrInfoV4.td
+++ b/lib/Target/Hexagon/HexagonInstrInfoV4.td
@@ -3864,26 +3864,6 @@ let AddedComplexity = 100 in {
   def: Stoream_pat<truncstorei32, I64, addrga, LoReg, S2_storeriabs>;
 }
 
-// Map from Pd = load(globaladdress) -> Rd = memb(globaladdress), Pd = Rd
-let AddedComplexity = 100 in
-def : Pat <(i1 (load (HexagonCONST32_GP tglobaladdr:$global))),
-           (i1 (C2_tfrrp (i32 (L2_loadrbgp tglobaladdr:$global))))>;
-
-// Transfer global address into a register
-let isExtended = 1, opExtendable = 1, AddedComplexity=50, isMoveImm = 1,
-isAsCheapAsAMove = 1, isReMaterializable = 1, isCodeGenOnly = 1 in
-def TFRI_V4 : ALU32_ri<(outs IntRegs:$dst), (ins s16Ext:$src1),
-           "$dst = #$src1",
-           [(set IntRegs:$dst, (HexagonCONST32 tglobaladdr:$src1))]>;
-
-// Transfer a block address into a register
-def : Pat<(HexagonCONST32_GP tblockaddress:$src1),
-          (TFRI_V4 tblockaddress:$src1)>;
-
-let AddedComplexity = 50 in
-def : Pat<(HexagonCONST32_GP tglobaladdr:$src1),
-           (TFRI_V4 tglobaladdr:$src1)>;
-
 // i8/i16/i32 -> i64 loads
 // We need a complexity of 120 here to override preceding handling of
 // zextload.

From 27cbe8f7172e782df3cee21f7df57b4bfa1b2330 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Wed, 2 Dec 2015 19:47:57 +0000
Subject: [PATCH 152/186] AMDGPU/SI: Correctly emit agent global segment
 variables when targeting HSA

Differential Revision: http://reviews.llvm.org/D14508

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254540 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp        |  39 ++++++-
 .../AMDGPU/AMDGPUHSATargetObjectFile.cpp      |  32 ++++++
 lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.h |   8 ++
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp      |  50 +++++++++
 .../AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp   |   3 +-
 .../MCTargetDesc/AMDGPUTargetStreamer.cpp     |  28 +++++
 .../MCTargetDesc/AMDGPUTargetStreamer.h       |  12 ++
 lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |  21 ++++
 lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h      |   6 +
 test/CodeGen/AMDGPU/hsa-globals.ll            | 105 ++++++++++++++++++
 10 files changed, 301 insertions(+), 3 deletions(-)
 create mode 100644 test/CodeGen/AMDGPU/hsa-globals.ll

diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 84c71e0cc9cf..1c438b24e6a4 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -123,11 +123,46 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
   AsmPrinter::EmitFunctionEntryLabel();
 }
 
+static bool isModuleLinkage(const GlobalValue *GV) {
+  switch (GV->getLinkage()) {
+  case GlobalValue::InternalLinkage:
+  case GlobalValue::CommonLinkage:
+   return true;
+  case GlobalValue::ExternalLinkage:
+   return false;
+  default: llvm_unreachable("unknown linkage type");
+  }
+}
+
 void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
 
   if (TM.getTargetTriple().getOS() != Triple::AMDHSA ||
-      !AMDGPU::isGroupSegment(GV))
-    return AsmPrinter::EmitGlobalVariable(GV);
+      GV->isDeclaration() || AMDGPU::isReadOnlySegment(GV)) {
+    AsmPrinter::EmitGlobalVariable(GV);
+    return;
+  }
+
+  // Group segment variables aren't emitted in HSA.
+  if (AMDGPU::isGroupSegment(GV))
+    return;
+
+  AMDGPUTargetStreamer *TS =
+      static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
+  if (isModuleLinkage(GV)) {
+    TS->EmitAMDGPUHsaModuleScopeGlobal(GV->getName());
+  } else {
+    TS->EmitAMDGPUHsaProgramScopeGlobal(GV->getName());
+  }
+
+  const DataLayout &DL = getDataLayout();
+  OutStreamer->PushSection();
+  OutStreamer->SwitchSection(
+      getObjFileLowering().SectionForGlobal(GV, *Mang, TM));
+  MCSymbol *GVSym = getSymbol(GV);
+  const Constant *C = GV->getInitializer();
+  OutStreamer->EmitLabel(GVSym);
+  EmitGlobalConstant(DL, C);
+  OutStreamer->PopSection();
 }
 
 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
diff --git a/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.cpp b/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.cpp
index fa54f4a017cb..ee42eaacf546 100644
--- a/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.cpp
+++ b/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.cpp
@@ -8,6 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUHSATargetObjectFile.h"
+#include "AMDGPU.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCSectionELF.h"
@@ -22,6 +23,29 @@ void AMDGPUHSATargetObjectFile::Initialize(MCContext &Ctx,
 
   TextSection = AMDGPU::getHSATextSection(Ctx);
 
+  DataGlobalAgentSection = AMDGPU::getHSADataGlobalAgentSection(Ctx);
+  DataGlobalProgramSection = AMDGPU::getHSADataGlobalProgramSection(Ctx);
+
+}
+
+bool AMDGPUHSATargetObjectFile::isAgentAllocationSection(
+    const char *SectionName) const {
+  return cast<MCSectionELF>(DataGlobalAgentSection)
+      ->getSectionName()
+      .equals(SectionName);
+}
+
+bool AMDGPUHSATargetObjectFile::isAgentAllocation(const GlobalValue *GV) const {
+  // Read-only segments can only have agent allocation.
+  return AMDGPU::isReadOnlySegment(GV) ||
+         (AMDGPU::isGlobalSegment(GV) && GV->hasSection() &&
+          isAgentAllocationSection(GV->getSection()));
+}
+
+bool AMDGPUHSATargetObjectFile::isProgramAllocation(
+    const GlobalValue *GV) const {
+  // The default for global segments is program allocation.
+  return AMDGPU::isGlobalSegment(GV) && !isAgentAllocation(GV);
 }
 
 MCSection *AMDGPUHSATargetObjectFile::SelectSectionForGlobal(
@@ -31,5 +55,13 @@ MCSection *AMDGPUHSATargetObjectFile::SelectSectionForGlobal(
   if (Kind.isText() && !GV->hasComdat())
     return getTextSection();
 
+  if (AMDGPU::isGlobalSegment(GV)) {
+    if (isAgentAllocation(GV))
+      return DataGlobalAgentSection;
+
+    if (isProgramAllocation(GV))
+      return DataGlobalProgramSection;
+  }
+
   return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang, TM);
 }
diff --git a/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.h b/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.h
index e1aca67b97c2..3697a96ffaf2 100644
--- a/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.h
+++ b/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.h
@@ -22,6 +22,14 @@
 namespace llvm {
 
 class AMDGPUHSATargetObjectFile final : public TargetLoweringObjectFileELF {
+private:
+  MCSection *DataGlobalAgentSection;
+  MCSection *DataGlobalProgramSection;
+
+  bool isAgentAllocationSection(const char *SectionName) const;
+  bool isAgentAllocation(const GlobalValue *GV) const;
+  bool isProgramAllocation(const GlobalValue *GV) const;
+
 public:
   void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
 
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 184cf0f65819..b360f2de0cb3 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -365,6 +365,10 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
   bool ParseSectionDirectiveHSAText();
   bool subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo) const;
   bool ParseDirectiveAMDGPUHsaKernel();
+  bool ParseDirectiveAMDGPUHsaModuleGlobal();
+  bool ParseDirectiveAMDGPUHsaProgramGlobal();
+  bool ParseSectionDirectiveHSADataGlobalAgent();
+  bool ParseSectionDirectiveHSADataGlobalProgram();
 
 public:
 public:
@@ -966,6 +970,40 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaKernel() {
   return false;
 }
 
+bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaModuleGlobal() {
+  if (getLexer().isNot(AsmToken::Identifier))
+    return TokError("expected symbol name");
+
+  StringRef GlobalName = Parser.getTok().getIdentifier();
+
+  getTargetStreamer().EmitAMDGPUHsaModuleScopeGlobal(GlobalName);
+  Lex();
+  return false;
+}
+
+bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaProgramGlobal() {
+  if (getLexer().isNot(AsmToken::Identifier))
+    return TokError("expected symbol name");
+
+  StringRef GlobalName = Parser.getTok().getIdentifier();
+
+  getTargetStreamer().EmitAMDGPUHsaProgramScopeGlobal(GlobalName);
+  Lex();
+  return false;
+}
+
+bool AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalAgent() {
+  getParser().getStreamer().SwitchSection(
+      AMDGPU::getHSADataGlobalAgentSection(getContext()));
+  return false;
+}
+
+bool AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalProgram() {
+  getParser().getStreamer().SwitchSection(
+      AMDGPU::getHSADataGlobalProgramSection(getContext()));
+  return false;
+}
+
 bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
   StringRef IDVal = DirectiveID.getString();
 
@@ -984,6 +1022,18 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
   if (IDVal == ".amdgpu_hsa_kernel")
     return ParseDirectiveAMDGPUHsaKernel();
 
+  if (IDVal == ".amdgpu_hsa_module_global")
+    return ParseDirectiveAMDGPUHsaModuleGlobal();
+
+  if (IDVal == ".amdgpu_hsa_program_global")
+    return ParseDirectiveAMDGPUHsaProgramGlobal();
+
+  if (IDVal == ".hsadata_global_agent")
+    return ParseSectionDirectiveHSADataGlobalAgent();
+
+  if (IDVal == ".hsadata_global_program")
+    return ParseSectionDirectiveHSADataGlobalProgram();
+
   return true;
 }
 
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 01548e50f7c5..89fa212de956 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -36,6 +36,7 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() {
 }
 
 bool AMDGPUMCAsmInfo::shouldOmitSectionDirective(StringRef SectionName) const {
-  return SectionName == ".hsatext" ||
+  return SectionName == ".hsatext" || SectionName == ".hsadata_global_agent" ||
+         SectionName == ".hsadata_global_program" ||
          MCAsmInfo::shouldOmitSectionDirective(SectionName);
 }
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index c876fa906a9f..b91134d2ee9b 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -231,6 +231,16 @@ void AMDGPUTargetAsmStreamer::EmitAMDGPUSymbolType(StringRef SymbolName,
   }
 }
 
+void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaModuleScopeGlobal(
+    StringRef GlobalName) {
+  OS << "\t.amdgpu_hsa_module_global " << GlobalName << '\n';
+}
+
+void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaProgramScopeGlobal(
+    StringRef GlobalName) {
+  OS << "\t.amdgpu_hsa_program_global " << GlobalName << '\n';
+}
+
 //===----------------------------------------------------------------------===//
 // AMDGPUTargetELFStreamer
 //===----------------------------------------------------------------------===//
@@ -316,3 +326,21 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName,
       getStreamer().getContext().getOrCreateSymbol(SymbolName));
   Symbol->setType(ELF::STT_AMDGPU_HSA_KERNEL);
 }
+
+void AMDGPUTargetELFStreamer::EmitAMDGPUHsaModuleScopeGlobal(
+    StringRef GlobalName) {
+
+  MCSymbolELF *Symbol = cast<MCSymbolELF>(
+      getStreamer().getContext().getOrCreateSymbol(GlobalName));
+  Symbol->setType(ELF::STT_OBJECT);
+  Symbol->setBinding(ELF::STB_LOCAL);
+}
+
+void AMDGPUTargetELFStreamer::EmitAMDGPUHsaProgramScopeGlobal(
+    StringRef GlobalName) {
+
+  MCSymbolELF *Symbol = cast<MCSymbolELF>(
+      getStreamer().getContext().getOrCreateSymbol(GlobalName));
+  Symbol->setType(ELF::STT_OBJECT);
+  Symbol->setBinding(ELF::STB_GLOBAL);
+}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index 30fb3014f9ee..83bb728f541c 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -32,6 +32,10 @@ class AMDGPUTargetStreamer : public MCTargetStreamer {
   virtual void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) = 0;
 
   virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) = 0;
+
+  virtual void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) = 0;
+
+  virtual void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) = 0;
 };
 
 class AMDGPUTargetAsmStreamer : public AMDGPUTargetStreamer {
@@ -48,6 +52,10 @@ class AMDGPUTargetAsmStreamer : public AMDGPUTargetStreamer {
   void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override;
 
   void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override;
+
+  void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override;
+
+  void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override;
 };
 
 class AMDGPUTargetELFStreamer : public AMDGPUTargetStreamer {
@@ -80,6 +88,10 @@ class AMDGPUTargetELFStreamer : public AMDGPUTargetStreamer {
   void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override;
 
   void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override;
+
+  void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override;
+
+  void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override;
 };
 
 }
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 81ade517254f..e3c93384bd8e 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -68,9 +68,30 @@ MCSection *getHSATextSection(MCContext &Ctx) {
                            ELF::SHF_AMDGPU_HSA_CODE);
 }
 
+MCSection *getHSADataGlobalAgentSection(MCContext &Ctx) {
+  return Ctx.getELFSection(".hsadata_global_agent", ELF::SHT_PROGBITS,
+                           ELF::SHF_ALLOC | ELF::SHF_WRITE |
+                           ELF::SHF_AMDGPU_HSA_GLOBAL |
+                           ELF::SHF_AMDGPU_HSA_AGENT);
+}
+
+MCSection *getHSADataGlobalProgramSection(MCContext &Ctx) {
+  return  Ctx.getELFSection(".hsadata_global_program", ELF::SHT_PROGBITS,
+                            ELF::SHF_ALLOC | ELF::SHF_WRITE |
+                            ELF::SHF_AMDGPU_HSA_GLOBAL);
+}
+
 bool isGroupSegment(const GlobalValue *GV) {
   return GV->getType()->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
 }
 
+bool isGlobalSegment(const GlobalValue *GV) {
+  return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
+}
+
+bool isReadOnlySegment(const GlobalValue *GV) {
+  return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
+}
+
 } // End namespace AMDGPU
 } // End namespace llvm
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index bf9377ed86cf..cc70064dbc0f 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -32,7 +32,13 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
                                const FeatureBitset &Features);
 MCSection *getHSATextSection(MCContext &Ctx);
 
+MCSection *getHSADataGlobalAgentSection(MCContext &Ctx);
+
+MCSection *getHSADataGlobalProgramSection(MCContext &Ctx);
+
 bool isGroupSegment(const GlobalValue *GV);
+bool isGlobalSegment(const GlobalValue *GV);
+bool isReadOnlySegment(const GlobalValue *GV);
 
 } // end namespace AMDGPU
 } // end namespace llvm
diff --git a/test/CodeGen/AMDGPU/hsa-globals.ll b/test/CodeGen/AMDGPU/hsa-globals.ll
new file mode 100644
index 000000000000..c73ca3b0079a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/hsa-globals.ll
@@ -0,0 +1,105 @@
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=ASM %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri | llvm-readobj -symbols -s | FileCheck %s --check-prefix=ELF
+
+@internal_global_program = internal addrspace(1) global i32 0
+@common_global_program = common addrspace(1) global i32 0
+@external_global_program = addrspace(1) global i32 0
+
+@internal_global_agent = internal addrspace(1) global i32 0, section ".hsadata_global_agent"
+@common_global_agent = common addrspace(1) global i32 0, section ".hsadata_global_agent"
+@external_global_agent = addrspace(1) global i32 0, section ".hsadata_global_agent"
+
+define void @test() {
+  ret void
+}
+
+; ASM: .amdgpu_hsa_module_global internal_global
+; ASM: .hsadata_global_program
+; ASM: internal_global_program:
+; ASM: .long 0
+
+; ASM: .amdgpu_hsa_module_global common_global
+; ASM: .hsadata_global_program
+; ASM: common_global_program:
+; ASM: .long 0
+
+; ASM: .amdgpu_hsa_program_global external_global
+; ASM: .hsadata_global_program
+; ASM: external_global_program:
+; ASM: .long 0
+
+; ASM: .amdgpu_hsa_module_global internal_global
+; ASM: .hsadata_global_agent
+; ASM: internal_global_agent:
+; ASM: .long 0
+
+; ASM: .amdgpu_hsa_module_global common_global
+; ASM: .hsadata_global_agent
+; ASM: common_global_agent:
+; ASM: .long 0
+
+; ASM: .amdgpu_hsa_program_global external_global
+; ASM: .hsadata_global_agent
+; ASM: external_global_agent:
+; ASM: .long 0
+
+; ELF: Section {
+; ELF: Name: .hsadata_global_program
+; ELF: Type: SHT_PROGBITS (0x1)
+; ELF: Flags [ (0x100003)
+; ELF: SHF_ALLOC (0x2)
+; ELF: SHF_AMDGPU_HSA_GLOBAL (0x100000)
+; ELF: SHF_WRITE (0x1)
+; ELF: ]
+; ELF: }
+
+; ELF: Section {
+; ELF: Name: .hsadata_global_agent
+; ELF: Type: SHT_PROGBITS (0x1)
+; ELF: Flags [ (0x900003)
+; ELF: SHF_ALLOC (0x2)
+; ELF: SHF_AMDGPU_HSA_AGENT (0x800000)
+; ELF: SHF_AMDGPU_HSA_GLOBAL (0x100000)
+; ELF: SHF_WRITE (0x1)
+; ELF: ]
+; ELF: }
+
+; ELF: Symbol {
+; ELF: Name: common_global_agent
+; ELF: Binding: Local
+; ELF: Section: .hsadata_global_agent
+; ELF: }
+
+; ELF: Symbol {
+; ELF: Name: common_global_program
+; ELF: Binding: Local
+; ELF: Section: .hsadata_global_program
+; ELF: }
+
+; ELF: Symbol {
+; ELF: Name: internal_global_agent
+; ELF: Binding: Local
+; ELF: Type: Object
+; ELF: Section: .hsadata_global_agent
+; ELF: }
+
+; ELF: Symbol {
+; ELF: Name: internal_global_program
+; ELF: Binding: Local
+; ELF: Type: Object
+; ELF: Section: .hsadata_global_program
+; ELF: }
+
+; ELF: Symbol {
+; ELF: Name: external_global_agent
+; ELF: Binding: Global
+; ELF: Type: Object
+; ELF: Section: .hsadata_global_agent
+; ELF: }
+
+; ELF: Symbol {
+; ELF: Name: external_global_program
+; ELF: Binding: Global
+; ELF: Type: Object
+; ELF: Section: .hsadata_global_program
+; ELF: }

From 8855924964c05ae46fe93905a974d4b4577e7461 Mon Sep 17 00:00:00 2001
From: Rafael Espindola <rafael.espindola@gmail.com>
Date: Wed, 2 Dec 2015 20:03:17 +0000
Subject: [PATCH 153/186] Don't copy information from aliasee to alias.

They are independent.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254541 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/IR/Function.cpp            | 10 ++++++----
 lib/IR/Globals.cpp             | 21 +++++++++++----------
 lib/Linker/LinkModules.cpp     |  7 +------
 test/Linker/Inputs/comdat14.ll |  2 +-
 test/Linker/comdat14.ll        |  2 +-
 5 files changed, 20 insertions(+), 22 deletions(-)

diff --git a/lib/IR/Function.cpp b/lib/IR/Function.cpp
index 97e2a84a5cef..5e4d2d2054eb 100644
--- a/lib/IR/Function.cpp
+++ b/lib/IR/Function.cpp
@@ -411,12 +411,14 @@ void Function::clearGC() {
   }
 }
 
-/// copyAttributesFrom - copy all additional attributes (those not needed to
-/// create a Function) from the Function Src to this one.
+/// Copy all additional attributes (those not needed to create a Function) from
+/// the Function Src to this one.
 void Function::copyAttributesFrom(const GlobalValue *Src) {
-  assert(isa<Function>(Src) && "Expected a Function!");
   GlobalObject::copyAttributesFrom(Src);
-  const Function *SrcF = cast<Function>(Src);
+  const Function *SrcF = dyn_cast<Function>(Src);
+  if (!SrcF)
+    return;
+
   setCallingConv(SrcF->getCallingConv());
   setAttributes(SrcF->getAttributes());
   if (SrcF->hasGC())
diff --git a/lib/IR/Globals.cpp b/lib/IR/Globals.cpp
index 8fde4b8e9d77..c538c7baa1fe 100644
--- a/lib/IR/Globals.cpp
+++ b/lib/IR/Globals.cpp
@@ -97,10 +97,11 @@ void GlobalObject::setGlobalObjectSubClassData(unsigned Val) {
 }
 
 void GlobalObject::copyAttributesFrom(const GlobalValue *Src) {
-  const auto *GV = cast<GlobalObject>(Src);
-  GlobalValue::copyAttributesFrom(GV);
-  setAlignment(GV->getAlignment());
-  setSection(GV->getSection());
+  GlobalValue::copyAttributesFrom(Src);
+  if (const auto *GV = dyn_cast<GlobalObject>(Src)) {
+    setAlignment(GV->getAlignment());
+    setSection(GV->getSection());
+  }
 }
 
 const char *GlobalValue::getSection() const {
@@ -216,14 +217,14 @@ void GlobalVariable::setInitializer(Constant *InitVal) {
   }
 }
 
-/// copyAttributesFrom - copy all additional attributes (those not needed to
-/// create a GlobalVariable) from the GlobalVariable Src to this one.
+/// Copy all additional attributes (those not needed to create a GlobalVariable)
+/// from the GlobalVariable Src to this one.
 void GlobalVariable::copyAttributesFrom(const GlobalValue *Src) {
-  assert(isa<GlobalVariable>(Src) && "Expected a GlobalVariable!");
   GlobalObject::copyAttributesFrom(Src);
-  const GlobalVariable *SrcVar = cast<GlobalVariable>(Src);
-  setThreadLocalMode(SrcVar->getThreadLocalMode());
-  setExternallyInitialized(SrcVar->isExternallyInitialized());
+  if (const GlobalVariable *SrcVar = dyn_cast<GlobalVariable>(Src)) {
+    setThreadLocalMode(SrcVar->getThreadLocalMode());
+    setExternallyInitialized(SrcVar->isExternallyInitialized());
+  }
 }
 
 
diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index 1a82bbc7c90e..3e54e279573e 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -593,12 +593,7 @@ static void forceRenaming(GlobalValue *GV, StringRef Name) {
 /// from the SrcGV to the DestGV.
 void ModuleLinker::copyGVAttributes(GlobalValue *NewGV,
                                     const GlobalValue *SrcGV) {
-  auto *GA = dyn_cast<GlobalAlias>(SrcGV);
-  if (GA && !dyn_cast<GlobalAlias>(NewGV)) {
-    // FIXME: this is likelly bogus:
-    NewGV->copyAttributesFrom(GA->getBaseObject());
-  } else
-    NewGV->copyAttributesFrom(SrcGV);
+  NewGV->copyAttributesFrom(SrcGV);
   forceRenaming(NewGV, getName(SrcGV));
 }
 
diff --git a/test/Linker/Inputs/comdat14.ll b/test/Linker/Inputs/comdat14.ll
index 7243ff864441..5e79fbcdacc1 100644
--- a/test/Linker/Inputs/comdat14.ll
+++ b/test/Linker/Inputs/comdat14.ll
@@ -1,6 +1,6 @@
 $c = comdat any
 
-@v2 = weak global i32 0, comdat ($c)
+@v2 = weak dllexport global i32 0, comdat ($c)
 define i32* @f2() {
   ret i32* @v2
 }
diff --git a/test/Linker/comdat14.ll b/test/Linker/comdat14.ll
index 70d1142bdf92..793f8573a1f5 100644
--- a/test/Linker/comdat14.ll
+++ b/test/Linker/comdat14.ll
@@ -5,5 +5,5 @@ $c = comdat any
 @v = global i32 0, comdat ($c)
 
 ; CHECK: @v = global i32 0, comdat($c)
-; CHECK: @v2 = extern_weak global i32
+; CHECK: @v2 = extern_weak dllexport global i32
 ; CHECK: @v3 = extern_weak global i32

From 78c3317f0f7bbe25dca1dc5cb65f750f060b3285 Mon Sep 17 00:00:00 2001
From: Alexey Samsonov <vonosmas@gmail.com>
Date: Wed, 2 Dec 2015 20:46:51 +0000
Subject: [PATCH 154/186] Re-enable UBSan tests for SystemZ: PR20980 was fixed.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254542 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/MC/SystemZ/lit.local.cfg | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/test/MC/SystemZ/lit.local.cfg b/test/MC/SystemZ/lit.local.cfg
index 78c5738c7acc..2f3cf7d3f043 100644
--- a/test/MC/SystemZ/lit.local.cfg
+++ b/test/MC/SystemZ/lit.local.cfg
@@ -1,6 +1,2 @@
 if not 'SystemZ' in config.root.targets:
     config.unsupported = True
-
-# http://llvm.org/bugs/show_bug.cgi?id=20980
-if 'ubsan' in config.available_features:
-  config.unsupported = True

From e39a7a693345e4633edf72e1251ddd5982658724 Mon Sep 17 00:00:00 2001
From: Rafael Espindola <rafael.espindola@gmail.com>
Date: Wed, 2 Dec 2015 20:57:33 +0000
Subject: [PATCH 155/186] Also copy private linkage globals when needed.

This was an omission when handling COFF style comdats with local keys.
Should fix the sanitizer-windows bot.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254543 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Linker/LinkModules.cpp     | 2 +-
 test/Linker/Inputs/comdat15.ll | 6 ++++++
 test/Linker/comdat15.ll        | 9 +++++++++
 3 files changed, 16 insertions(+), 1 deletion(-)
 create mode 100644 test/Linker/Inputs/comdat15.ll
 create mode 100644 test/Linker/comdat15.ll

diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index 3e54e279573e..e90ce166962e 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -1426,7 +1426,7 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) {
     std::tie(SK, LinkFromSrc) = ComdatsChosen[SC];
     C = DstM.getOrInsertComdat(SC->getName());
     C->setSelectionKind(SK);
-    if (SGV->hasInternalLinkage())
+    if (SGV->hasLocalLinkage())
       LinkFromSrc = true;
   } else if (DGV) {
     if (shouldLinkFromSource(LinkFromSrc, *DGV, *SGV))
diff --git a/test/Linker/Inputs/comdat15.ll b/test/Linker/Inputs/comdat15.ll
new file mode 100644
index 000000000000..5d2d41bba6aa
--- /dev/null
+++ b/test/Linker/Inputs/comdat15.ll
@@ -0,0 +1,6 @@
+$a1 = comdat any
+@baz = private global i32 42, comdat($a1)
+@a1 = internal alias i32, i32* @baz
+define i32* @abc() {
+  ret i32* @a1
+}
diff --git a/test/Linker/comdat15.ll b/test/Linker/comdat15.ll
new file mode 100644
index 000000000000..cf900263105a
--- /dev/null
+++ b/test/Linker/comdat15.ll
@@ -0,0 +1,9 @@
+; RUN: llvm-link -S %s %p/Inputs/comdat15.ll -o - | FileCheck %s
+
+$a1 = comdat any
+@bar = global i32 0, comdat($a1)
+
+; CHECK: @bar = global i32 0, comdat($a1)
+; CHECK: @baz = private global i32 42, comdat($a1)
+; CHECK: @a1 = internal alias i32, i32* @baz
+

From d3f55e582b38a074efbb3317fb27ec4e61a80a39 Mon Sep 17 00:00:00 2001
From: Kyle Butt <kyle+llvm@iteratee.net>
Date: Wed, 2 Dec 2015 21:08:03 +0000
Subject: [PATCH 156/186] Tests: PPC: remove unnecessary metadata. NFC

Remove unnecessary metadata from a test case.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254544 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/PowerPC/aantidep-inline-asm-use.ll | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll b/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll
index 4e7e97750448..f0c0deacf4dd 100644
--- a/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll
+++ b/test/CodeGen/PowerPC/aantidep-inline-asm-use.ll
@@ -302,7 +302,4 @@ attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="fa
 attributes #1 = { nounwind argmemonly }
 attributes #2 = { nounwind }
 
-!llvm.ident = !{!0}
-
-!0 = !{!"clang version google3-trunk (trunk r251807)"}
 !1 = !{i32 -2140527538, i32 -2140527533}

From 9343bd9aff1ce501c26cb3d111bc868731ca0d96 Mon Sep 17 00:00:00 2001
From: Alexey Samsonov <vonosmas@gmail.com>
Date: Wed, 2 Dec 2015 21:13:43 +0000
Subject: [PATCH 157/186] [Hexagon] Remove std::hex in favor of format().

std::hex is not used anywhere in LLVM code base except for this place,
and it has a known undefined behavior (at least in libstdc++ 4.9.3):
https://llvm.org/bugs/show_bug.cgi?id=18156, which fires in UBSan
bootstrap of LLVM.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254547 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
index b73ec0bfb191..2e780f035b2b 100644
--- a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
+++ b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
@@ -38,6 +38,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ELF.h"
+#include "llvm/Support/Format.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/TargetRegistry.h"
@@ -1511,14 +1512,15 @@ unsigned HexagonAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
 }
 
 void HexagonAsmParser::OutOfRange(SMLoc IDLoc, long long Val, long long Max) {
-  std::stringstream errStr;
-  errStr << "value " << Val << "(0x" << std::hex << Val << std::dec
+  std::string errStr;
+  raw_string_ostream ES(errStr);
+  ES << "value " << Val << "(" << format("0x%" PRIx64, Val)
          << ") out of range: ";
   if (Max >= 0)
-    errStr << "0-" << Max;
+    ES << "0-" << Max;
   else
-    errStr << Max << "-" << (-Max - 1);
-  Error(IDLoc, errStr.str().c_str());
+    ES << Max << "-" << (-Max - 1);
+  Error(IDLoc, ES.str().c_str());
 }
 
 int HexagonAsmParser::processInstruction(MCInst &Inst,

From dae9c830298e187bf984ce338d9961ac5c55e6e0 Mon Sep 17 00:00:00 2001
From: Alexey Samsonov <vonosmas@gmail.com>
Date: Wed, 2 Dec 2015 21:25:28 +0000
Subject: [PATCH 158/186] [PowerPC] Remove wild call to
 RegScavenger::initRegState().

This call should in fact be made by RegScavenger::enterBasicBlock()
called below. The first call does nothing except for triggering UB,
indicated by UBSan (passing nullptr to memset()).

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254548 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/RegisterScavenging.h | 7 +++----
 lib/Target/PowerPC/PPCFrameLowering.cpp   | 3 +--
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/include/llvm/CodeGen/RegisterScavenging.h b/include/llvm/CodeGen/RegisterScavenging.h
index 6517a58f5c07..122c78534253 100644
--- a/include/llvm/CodeGen/RegisterScavenging.h
+++ b/include/llvm/CodeGen/RegisterScavenging.h
@@ -74,10 +74,6 @@ class RegScavenger {
   /// Start tracking liveness from the begin of the specific basic block.
   void enterBasicBlock(MachineBasicBlock *mbb);
 
-  /// Allow resetting register state info for multiple
-  /// passes over/within the same function.
-  void initRegState();
-
   /// Move the internal MBB iterator and update register states.
   void forward();
 
@@ -180,6 +176,9 @@ class RegScavenger {
                            unsigned InstrLimit,
                            MachineBasicBlock::iterator &UseMI);
 
+  /// Allow resetting register state info for multiple
+  /// passes over/within the same function.
+  void initRegState();
 };
 
 } // End llvm namespace
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index 5a151eb6ab25..174deb88bc5c 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -569,8 +569,7 @@ bool PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB,
   if ((UseAtEnd && MBB->isReturnBlock()) ||
       (!UseAtEnd && (&MBB->getParent()->front() == MBB)))
     return true;
-    
-  RS.initRegState();
+
   RS.enterBasicBlock(MBB);
 
   if (UseAtEnd && !MBB->empty()) {

From aaaedd7f8f7d68b41ad3845d050365c03163942f Mon Sep 17 00:00:00 2001
From: Cong Hou <congh@google.com>
Date: Wed, 2 Dec 2015 21:33:47 +0000
Subject: [PATCH 159/186] Fix a typo in LoopVectorize.cpp. NFC.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254549 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index c956a55a1009..c5b8b5b073d6 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5188,7 +5188,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(
         continue;
       }
 
-      // Count the number of live interals.
+      // Count the number of live intevals.
       unsigned RegUsage = 0;
       for (auto Inst : OpenIntervals)
         RegUsage += GetRegUsage(Inst->getType(), VFs[j]);

From ed192380fc0f73545d2eac8f18d07b7fb344c0db Mon Sep 17 00:00:00 2001
From: Xinliang David Li <davidxl@google.com>
Date: Wed, 2 Dec 2015 21:47:43 +0000
Subject: [PATCH 160/186] [PGO] Allow input value node list to be null

This is to handle the case when vp node linked
list array is laziliy initialized at runtime


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254551 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ProfileData/InstrProfData.inc | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/include/llvm/ProfileData/InstrProfData.inc b/include/llvm/ProfileData/InstrProfData.inc
index a2f0a8ae684b..aefdbc1b3e47 100644
--- a/include/llvm/ProfileData/InstrProfData.inc
+++ b/include/llvm/ProfileData/InstrProfData.inc
@@ -538,12 +538,13 @@ int initializeValueProfRuntimeRecord(ValueProfRuntimeRecord *RuntimeRecord,
     }
     NumValueKinds++;
     RuntimeRecord->SiteCountArray[I] = (uint8_t *)calloc(N, 1);
-    RuntimeRecord->NodesKind[I] = &RuntimeRecord->Nodes[S];
-    if (!RuntimeRecord->NodesKind[I])
+    if (!RuntimeRecord->SiteCountArray[I])
       return 1;
+    RuntimeRecord->NodesKind[I] = Nodes ? &Nodes[S] : NULL;
     for (J = 0; J < N; J++) {
+      /* Compute value count for each site. */
       uint32_t C = 0;
-      ValueProfNode *Site = RuntimeRecord->Nodes[S + J];
+      ValueProfNode *Site = Nodes ? RuntimeRecord->NodesKind[I][J] : NULL;
       while (Site) {
         C++;
         Site = Site->Next;
@@ -596,6 +597,8 @@ void getValueForSiteRT(const void *R, InstrProfValueData *Dst, uint32_t VK,
   unsigned I, N = 0;
   const ValueProfRuntimeRecord *Record = (const ValueProfRuntimeRecord *)R;
   N = getNumValueDataForSiteRT(R, VK, S);
+  if (N == 0)
+    return;
   ValueProfNode *VNode = Record->NodesKind[VK][S];
   for (I = 0; I < N; I++) {
     Dst[I] = VNode->VData;

From dc5640824942c77b26fe92a9aa7a7ee6d150454d Mon Sep 17 00:00:00 2001
From: David Blaikie <dblaikie@gmail.com>
Date: Wed, 2 Dec 2015 22:01:56 +0000
Subject: [PATCH 161/186] [llvm-dwp] Include only the non-empty columns in the
 cu_index

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254555 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/tools/llvm-dwp/X86/simple.test |  6 +++++-
 tools/llvm-dwp/llvm-dwp.cpp         | 22 +++++++++++++++-------
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/test/tools/llvm-dwp/X86/simple.test b/test/tools/llvm-dwp/X86/simple.test
index aa5ae40dc2b3..6ee19697442d 100644
--- a/test/tools/llvm-dwp/X86/simple.test
+++ b/test/tools/llvm-dwp/X86/simple.test
@@ -42,7 +42,11 @@ CHECK:     DW_AT_name {{.*}} "b"
 CHECK:     DW_TAG_formal_parameter
 
 CHECK: .debug_cu_index contents:
-FIXME: Emit and verify the cu_index contents
+Ensure only the relevant/contained sections are included in the table:
+CHECK: Index Signature          INFO                     ABBREV                   STR_OFFSETS
+Don't bother checking the Signatures, they aren't correct yet.
+CHECK:                          [0x00000000, 0x00000029) [0x00000000, 0x00000031) [0x00000000, 0x00000010)
+CHECK:                          [0x00000029, 0x0000005e) [0x00000031, 0x00000075) [0x00000010, 0x00000024)
 
 CHECK: .debug_str.dwo contents:
 CHECK: "clang version
diff --git a/tools/llvm-dwp/llvm-dwp.cpp b/tools/llvm-dwp/llvm-dwp.cpp
index 6617b0b23aeb..e6a90cf8a3cf 100644
--- a/tools/llvm-dwp/llvm-dwp.cpp
+++ b/tools/llvm-dwp/llvm-dwp.cpp
@@ -157,9 +157,14 @@ static std::error_code write(MCStreamer &Out, ArrayRef<std::string> Inputs) {
       return Err;
   }
 
+  unsigned Columns = 0;
+  for (auto &C : ContributionOffsets)
+    if (C)
+      ++Columns;
+
   Out.SwitchSection(MCOFI.getDwarfCUIndexSection());
   Out.EmitIntValue(2, 4);                   // Version
-  Out.EmitIntValue(8, 4);                   // Columns
+  Out.EmitIntValue(Columns, 4);             // Columns
   Out.EmitIntValue(IndexEntries.size(), 4); // Num Units
   // FIXME: This is not the right number of buckets for a real hash.
   Out.EmitIntValue(IndexEntries.size(), 4); // Num Buckets
@@ -173,18 +178,21 @@ static std::error_code write(MCStreamer &Out, ArrayRef<std::string> Inputs) {
     Out.EmitIntValue(i + 1, 4);
 
   // Write the column headers (which sections will appear in the table)
-  for (size_t i = 1; i != 9; ++i)
-    Out.EmitIntValue(i, 4);
+  for (size_t i = 0; i != array_lengthof(ContributionOffsets); ++i)
+    if (ContributionOffsets[i])
+      Out.EmitIntValue(i + DW_SECT_INFO, 4);
 
   // Write the offsets.
   for (const auto &E : IndexEntries)
-    for (const auto &C : E.Contributions)
-      Out.EmitIntValue(C.Offset, 4);
+    for (size_t i = 0; i != array_lengthof(E.Contributions); ++i)
+      if (ContributionOffsets[i])
+        Out.EmitIntValue(E.Contributions[i].Offset, 4);
 
   // Write the lengths.
   for (const auto &E : IndexEntries)
-    for (const auto &C : E.Contributions)
-      Out.EmitIntValue(C.Length, 4);
+    for (size_t i = 0; i != array_lengthof(E.Contributions); ++i)
+      if (ContributionOffsets[i])
+        Out.EmitIntValue(E.Contributions[i].Length, 4);
 
   return std::error_code();
 }

From 44b99915504b69c98096d6b05f445b1fb58a848d Mon Sep 17 00:00:00 2001
From: Rafael Espindola <rafael.espindola@gmail.com>
Date: Wed, 2 Dec 2015 22:22:24 +0000
Subject: [PATCH 162/186] Delete what is now duplicated code.

Having to import an alias as declaration is not thinlto specific.

The test difference are because when we already have a decl and we are
not importing it, we just leave the decl alone.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254556 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Linker/LinkModules.cpp                   | 31 ++------------------
 test/Linker/funcimport.ll                    | 16 +++++-----
 test/Transforms/FunctionImport/funcimport.ll |  4 +--
 3 files changed, 13 insertions(+), 38 deletions(-)

diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index e90ce166962e..1a01b78cd0bc 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -791,30 +791,6 @@ Function *ModuleLinker::copyFunctionProto(TypeMapTy &TypeMap,
 /// Set up prototypes for any aliases that come over from the source module.
 GlobalValue *ModuleLinker::copyGlobalAliasProto(TypeMapTy &TypeMap,
                                                 const GlobalAlias *SGA) {
-  // If we are importing and encounter a weak_any alias, or an alias to
-  // an object being imported as a declaration, we must import the alias
-  // as a declaration as well, which involves converting it to a non-alias.
-  // See comments in ModuleLinker::getLinkage for why we cannot import
-  // weak_any defintions.
-  if (isPerformingImport() && !doImportAsDefinition(SGA)) {
-    // Need to convert to declaration. All aliases must be definitions.
-    const GlobalValue *GVal = SGA->getBaseObject();
-    GlobalValue *NewGV;
-    if (auto *GVar = dyn_cast<GlobalVariable>(GVal))
-      NewGV = copyGlobalVariableProto(TypeMap, GVar);
-    else {
-      auto *F = dyn_cast<Function>(GVal);
-      assert(F);
-      NewGV = copyFunctionProto(TypeMap, F);
-    }
-    // Set the linkage to External or ExternalWeak (see comments in
-    // ModuleLinker::getLinkage for why WeakAny is converted to ExternalWeak).
-    if (SGA->hasWeakAnyLinkage())
-      NewGV->setLinkage(GlobalValue::ExternalWeakLinkage);
-    else
-      NewGV->setLinkage(GlobalValue::ExternalLinkage);
-    return NewGV;
-  }
   // If there is no linkage to be performed or we're linking from the source,
   // bring over SGA.
   auto *Ty = TypeMap.get(SGA->getValueType());
@@ -1421,7 +1397,9 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) {
   Comdat *C = nullptr;
   bool HasUnnamedAddr = SGV->hasUnnamedAddr();
 
-  if (const Comdat *SC = SGV->getComdat()) {
+  if (isPerformingImport() && !doImportAsDefinition(SGV)) {
+    LinkFromSrc = false;
+  } else if (const Comdat *SC = SGV->getComdat()) {
     Comdat::SelectionKind SK;
     std::tie(SK, LinkFromSrc) = ComdatsChosen[SC];
     C = DstM.getOrInsertComdat(SC->getName());
@@ -1454,9 +1432,6 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) {
     setVisibility(NewGV, SGV, DGV);
   } else {
     NewGV = copyGlobalValueProto(TypeMap, SGV, DGV, LinkFromSrc);
-
-    if (isPerformingImport() && !doImportAsDefinition(SGV))
-      DoNotLinkFromSource.insert(SGV);
   }
 
   NewGV->setUnnamedAddr(HasUnnamedAddr);
diff --git a/test/Linker/funcimport.ll b/test/Linker/funcimport.ll
index 8a71e21d0ad1..1e6c0ec64846 100644
--- a/test/Linker/funcimport.ll
+++ b/test/Linker/funcimport.ll
@@ -27,36 +27,36 @@
 ; RUN: llvm-link %t2.bc -functionindex=%t3.thinlto.bc -import=globalfunc1:%t.bc -S | FileCheck %s --check-prefix=IMPORTGLOB1
 ; IMPORTGLOB1-DAG: define available_externally void @globalfunc1
 ; IMPORTGLOB1-DAG: declare void @globalfunc2
-; IMPORTGLOB1-DAG: declare extern_weak void @weakalias
+; IMPORTGLOB1-DAG: declare void @weakalias
 ; IMPORTGLOB1-DAG: declare void @analias
-; IMPORTGLOB1-DAG: declare void @linkoncealias
+; IMPORTGLOB1-DAG: @linkoncealias = external global
 ; IMPORTGLOB1-NOT: @linkoncefunc
 
 ; Ensure that weak alias to a non-imported function is correctly
 ; turned into a declaration, but that strong alias to an imported function
 ; is imported as alias.
 ; RUN: llvm-link %t2.bc -functionindex=%t3.thinlto.bc -import=globalfunc2:%t.bc -S | FileCheck %s --check-prefix=IMPORTGLOB2
-; IMPORTGLOB2-DAG: declare void @analias()
+; IMPORTGLOB2-DAG: declare void @analias
 ; IMPORTGLOB2-DAG: declare void @globalfunc1
 ; IMPORTGLOB2-DAG: define available_externally void @globalfunc2
-; IMPORTGLOB2-DAG: declare extern_weak void @weakalias
+; IMPORTGLOB2-DAG: declare void @weakalias
 
 ; Ensure that strong alias imported in second pass of importing ends up
 ; as an alias.
 ; RUN: llvm-link %t2.bc -functionindex=%t3.thinlto.bc -import=globalfunc1:%t.bc -import=globalfunc2:%t.bc -S | FileCheck %s --check-prefix=IMPORTGLOB3
-; IMPORTGLOB3-DAG: declare void @analias()
+; IMPORTGLOB3-DAG: declare void @analias
 ; IMPORTGLOB3-DAG: define available_externally void @globalfunc1
 ; IMPORTGLOB3-DAG: define available_externally void @globalfunc2
-; IMPORTGLOB3-DAG: declare extern_weak void @weakalias
+; IMPORTGLOB3-DAG: declare void @weakalias
 
 ; Ensure that strong alias imported in first pass of importing ends up
 ; as an alias, and that seeing the alias definition during a second inlining
 ; pass is handled correctly.
 ; RUN: llvm-link %t2.bc -functionindex=%t3.thinlto.bc -import=globalfunc2:%t.bc -import=globalfunc1:%t.bc -S | FileCheck %s --check-prefix=IMPORTGLOB4
-; IMPORTGLOB4-DAG: declare void @analias()
+; IMPORTGLOB4-DAG: declare void @analias
 ; IMPORTGLOB4-DAG: define available_externally void @globalfunc2
 ; IMPORTGLOB4-DAG: define available_externally void @globalfunc1
-; IMPORTGLOB4-DAG: declare extern_weak void @weakalias
+; IMPORTGLOB4-DAG: declare void @weakalias
 
 ; An alias to an imported function is imported as alias if the function is not
 ; available_externally.
diff --git a/test/Transforms/FunctionImport/funcimport.ll b/test/Transforms/FunctionImport/funcimport.ll
index 2c9164fdaa98..c099b9766477 100644
--- a/test/Transforms/FunctionImport/funcimport.ll
+++ b/test/Transforms/FunctionImport/funcimport.ll
@@ -25,11 +25,11 @@ entry:
 }
 
 ; Won't import weak alias
-; CHECK-DAG: declare extern_weak void @weakalias()
+; CHECK-DAG: declare void @weakalias
 declare void @weakalias(...) #1
 
 ; Cannot create an alias to available_externally
-; CHECK-DAG: declare void @analias()
+; CHECK-DAG: declare void @analias
 declare void @analias(...) #1
 
 ; Aliases import the aliasee function

From de603060b913c975e8c0ccee1e8afdf4933d29b3 Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Wed, 2 Dec 2015 22:34:30 +0000
Subject: [PATCH 163/186] Use std::string instead of strdup() and free() in
 WinCodeViewLineTables

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254557 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp | 11 +++++------
 lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h   | 10 +---------
 2 files changed, 6 insertions(+), 15 deletions(-)

diff --git a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp
index 8b1bea8049e4..c2c0f84e5c92 100644
--- a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp
+++ b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp
@@ -27,15 +27,15 @@ StringRef WinCodeViewLineTables::getFullFilepath(const MDNode *S) {
   auto *Scope = cast<DIScope>(S);
   StringRef Dir = Scope->getDirectory(),
             Filename = Scope->getFilename();
-  char *&Result = DirAndFilenameToFilepathMap[std::make_pair(Dir, Filename)];
-  if (Result)
-    return Result;
+  std::string &Filepath =
+      DirAndFilenameToFilepathMap[std::make_pair(Dir, Filename)];
+  if (!Filepath.empty())
+    return Filepath;
 
   // Clang emits directory and relative filename info into the IR, but CodeView
   // operates on full paths.  We could change Clang to emit full paths too, but
   // that would increase the IR size and probably not needed for other users.
   // For now, just concatenate and canonicalize the path here.
-  std::string Filepath;
   if (Filename.find(':') == 1)
     Filepath = Filename;
   else
@@ -74,8 +74,7 @@ StringRef WinCodeViewLineTables::getFullFilepath(const MDNode *S) {
   while ((Cursor = Filepath.find("\\\\", Cursor)) != std::string::npos)
     Filepath.erase(Cursor, 1);
 
-  Result = strdup(Filepath.c_str());
-  return StringRef(Result);
+  return Filepath;
 }
 
 void WinCodeViewLineTables::maybeRecordLocation(DebugLoc DL,
diff --git a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h
index 43d1a432712e..78068e07c16f 100644
--- a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h
+++ b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h
@@ -98,7 +98,7 @@ class LLVM_LIBRARY_VISIBILITY WinCodeViewLineTables : public AsmPrinterHandler {
     }
   } FileNameRegistry;
 
-  typedef std::map<std::pair<StringRef, StringRef>, char *>
+  typedef std::map<std::pair<StringRef, StringRef>, std::string>
       DirAndFilenameToFilepathMapTy;
   DirAndFilenameToFilepathMapTy DirAndFilenameToFilepathMap;
   StringRef getFullFilepath(const MDNode *S);
@@ -116,14 +116,6 @@ class LLVM_LIBRARY_VISIBILITY WinCodeViewLineTables : public AsmPrinterHandler {
 public:
   WinCodeViewLineTables(AsmPrinter *Asm);
 
-  ~WinCodeViewLineTables() override {
-    for (DirAndFilenameToFilepathMapTy::iterator
-             I = DirAndFilenameToFilepathMap.begin(),
-             E = DirAndFilenameToFilepathMap.end();
-         I != E; ++I)
-      free(I->second);
-  }
-
   void setSymbolSize(const llvm::MCSymbol *, uint64_t) override {}
 
   /// \brief Emit the COFF section that holds the line table information.

From ba2d199d493e3cb2a6f87ff38e2830a35142714f Mon Sep 17 00:00:00 2001
From: Mike Aizatsky <aizatsky@chromium.org>
Date: Wed, 2 Dec 2015 22:43:53 +0000
Subject: [PATCH 164/186] Libfuzzer: do not pass null into user function

Differential Revision: http://reviews.llvm.org/D15098

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254558 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Fuzzer/FuzzerLoop.cpp      | 6 +++++-
 lib/Fuzzer/test/SimpleTest.cpp | 2 ++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/lib/Fuzzer/FuzzerLoop.cpp b/lib/Fuzzer/FuzzerLoop.cpp
index 7aea169b7ae9..9c52a4dbe774 100644
--- a/lib/Fuzzer/FuzzerLoop.cpp
+++ b/lib/Fuzzer/FuzzerLoop.cpp
@@ -238,7 +238,11 @@ void Fuzzer::RunOneAndUpdateCorpus(Unit &U) {
 }
 
 void Fuzzer::ExecuteCallback(const Unit &U) {
-  int Res = USF.TargetFunction(U.data(), U.size());
+  const uint8_t *Data = U.data();
+  uint8_t EmptyData;
+  if (!Data) 
+    Data = &EmptyData;
+  int Res = USF.TargetFunction(Data, U.size());
   (void)Res;
   assert(Res == 0);
 }
diff --git a/lib/Fuzzer/test/SimpleTest.cpp b/lib/Fuzzer/test/SimpleTest.cpp
index 6811d115d960..04225a889f5d 100644
--- a/lib/Fuzzer/test/SimpleTest.cpp
+++ b/lib/Fuzzer/test/SimpleTest.cpp
@@ -1,4 +1,5 @@
 // Simple test for a fuzzer. The fuzzer must find the string "Hi!".
+#include <assert.h>
 #include <cstdint>
 #include <cstdlib>
 #include <cstddef>
@@ -7,6 +8,7 @@
 static volatile int Sink;
 
 extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+  assert(Data);
   if (Size > 0 && Data[0] == 'H') {
     Sink = 1;
     if (Size > 1 && Data[1] == 'i') {

From 4744e8bfd8f1eded048dc640661a40cd3e81140b Mon Sep 17 00:00:00 2001
From: Rafael Espindola <rafael.espindola@gmail.com>
Date: Wed, 2 Dec 2015 22:59:04 +0000
Subject: [PATCH 165/186] Switch the linker to having a whitelist of GVs.

This replaces DoNotLinkFromSource with ValuesToLink. It also moves the
computation of ValuesToLink earlier.

It is a bit simpler and an important step in slitting the linker into an
ir mover and a linker proper.

The test change is because we now avoid creating dead declarations.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254559 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Linker/LinkModules.cpp | 118 +++++++++++++++++++++----------------
 test/Linker/funcimport.ll  |   8 +--
 2 files changed, 71 insertions(+), 55 deletions(-)

diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index 1a01b78cd0bc..21df66863e56 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -392,8 +392,7 @@ class ModuleLinker {
   /// but this allows us to reuse the ValueMapper code.
   ValueToValueMapTy ValueMap;
 
-  // Set of items not to link in from source.
-  SmallPtrSet<const GlobalValue *, 16> DoNotLinkFromSource;
+  SetVector<GlobalValue *> ValuesToLink;
 
   DiagnosticHandlerFunction DiagnosticHandler;
 
@@ -862,18 +861,8 @@ Value *ModuleLinker::materializeDeclFor(Value *V) {
   if (!SGV)
     return nullptr;
 
-  // If we are done linking global value bodies (i.e. we are performing
-  // metadata linking), don't link in the global value due to this
-  // reference, simply map it to null.
-  if (DoneLinkingBodies)
-    return nullptr;
-
   linkGlobalValueProto(SGV);
-  if (HasError)
-    return nullptr;
-  Value *Ret = ValueMap[SGV];
-  assert(Ret);
-  return Ret;
+  return ValueMap[SGV];
 }
 
 void ValueMaterializerTy::materializeInitFor(GlobalValue *New,
@@ -881,6 +870,11 @@ void ValueMaterializerTy::materializeInitFor(GlobalValue *New,
   return ModLinker->materializeInitFor(New, Old);
 }
 
+static bool shouldLazyLink(const GlobalValue &GV) {
+  return GV.hasLocalLinkage() || GV.hasLinkOnceLinkage() ||
+         GV.hasAvailableExternallyLinkage();
+}
+
 void ModuleLinker::materializeInitFor(GlobalValue *New, GlobalValue *Old) {
   if (auto *F = dyn_cast<Function>(New)) {
     if (!F->isDeclaration())
@@ -900,7 +894,7 @@ void ModuleLinker::materializeInitFor(GlobalValue *New, GlobalValue *Old) {
   if (isPerformingImport() && !doImportAsDefinition(Old))
     return;
 
-  if (!New->hasLocalLinkage() && DoNotLinkFromSource.count(Old))
+  if (!ValuesToLink.count(Old) && !shouldLazyLink(*Old))
     return;
 
   linkGlobalValueBody(*New, *Old);
@@ -1342,7 +1336,8 @@ bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV,
                        [this](Constant *E) {
                          auto *Key = dyn_cast<GlobalValue>(
                              E->getAggregateElement(2)->stripPointerCasts());
-                         return DoNotLinkFromSource.count(Key);
+                         return Key && !ValuesToLink.count(Key) &&
+                                !shouldLazyLink(*Key);
                        }),
         SrcElements.end());
   uint64_t NewSize = DstElements.size() + SrcElements.size();
@@ -1381,14 +1376,6 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) {
 
   // Handle the ultra special appending linkage case first.
   assert(!DGV || SGV->hasAppendingLinkage() == DGV->hasAppendingLinkage());
-  if (SGV->hasAppendingLinkage() && isPerformingImport()) {
-    // Don't want to append to global_ctors list, for example, when we
-    // are importing for ThinLTO, otherwise the global ctors and dtors
-    // get executed multiple times for local variables (the latter causing
-    // double frees).
-    DoNotLinkFromSource.insert(SGV);
-    return false;
-  }
   if (SGV->hasAppendingLinkage())
     return linkAppendingVarProto(cast_or_null<GlobalVariable>(DGV),
                                  cast<GlobalVariable>(SGV));
@@ -1411,15 +1398,9 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) {
       return true;
   }
 
-  if (!LinkFromSrc) {
-    // Track the source global so that we don't attempt to copy it over when
-    // processing global initializers.
-    DoNotLinkFromSource.insert(SGV);
-
-    if (DGV)
-      // Make sure to remember this mapping.
-      ValueMap[SGV] =
-          ConstantExpr::getBitCast(DGV, TypeMap.get(SGV->getType()));
+  if (!LinkFromSrc && DGV) {
+    // Make sure to remember this mapping.
+    ValueMap[SGV] = ConstantExpr::getBitCast(DGV, TypeMap.get(SGV->getType()));
   }
 
   if (DGV)
@@ -1431,6 +1412,12 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) {
     // When linking from source we setVisibility from copyGlobalValueProto.
     setVisibility(NewGV, SGV, DGV);
   } else {
+    // If we are done linking global value bodies (i.e. we are performing
+    // metadata linking), don't link in the global value due to this
+    // reference, simply map it to null.
+    if (DoneLinkingBodies)
+      return false;
+
     NewGV = copyGlobalValueProto(TypeMap, SGV, DGV, LinkFromSrc);
   }
 
@@ -1774,30 +1761,62 @@ bool ModuleLinker::linkIfNeeded(GlobalValue &GV) {
   if (shouldLinkOnlyNeeded() && !(DGV && DGV->isDeclaration()))
     return false;
 
-  if (DGV && !GV.hasLocalLinkage()) {
+  if (DGV && !GV.hasLocalLinkage() && !GV.hasAppendingLinkage()) {
+    auto *DGVar = dyn_cast<GlobalVariable>(DGV);
+    auto *SGVar = dyn_cast<GlobalVariable>(&GV);
+    if (DGVar && SGVar) {
+      if (DGVar->isDeclaration() && SGVar->isDeclaration() &&
+          (!DGVar->isConstant() || !SGVar->isConstant())) {
+        DGVar->setConstant(false);
+        SGVar->setConstant(false);
+      }
+      if (DGVar->hasCommonLinkage() && SGVar->hasCommonLinkage()) {
+        unsigned Align = std::max(DGVar->getAlignment(), SGVar->getAlignment());
+        SGVar->setAlignment(Align);
+        DGVar->setAlignment(Align);
+      }
+    }
+
     GlobalValue::VisibilityTypes Visibility =
         getMinVisibility(DGV->getVisibility(), GV.getVisibility());
     DGV->setVisibility(Visibility);
     GV.setVisibility(Visibility);
+
+    bool HasUnnamedAddr = GV.hasUnnamedAddr() && DGV->hasUnnamedAddr();
+    DGV->setUnnamedAddr(HasUnnamedAddr);
+    GV.setUnnamedAddr(HasUnnamedAddr);
   }
 
+  // Don't want to append to global_ctors list, for example, when we
+  // are importing for ThinLTO, otherwise the global ctors and dtors
+  // get executed multiple times for local variables (the latter causing
+  // double frees).
+  if (GV.hasAppendingLinkage() && isPerformingImport())
+    return false;
+
+  if (isPerformingImport() && !doImportAsDefinition(&GV))
+    return false;
+
+  if (!DGV && !shouldOverrideFromSrc() &&
+      (GV.hasLocalLinkage() || GV.hasLinkOnceLinkage() ||
+       GV.hasAvailableExternallyLinkage()))
+    return false;
+
   if (const Comdat *SC = GV.getComdat()) {
     bool LinkFromSrc;
     Comdat::SelectionKind SK;
     std::tie(SK, LinkFromSrc) = ComdatsChosen[SC];
-    if (!LinkFromSrc) {
-      DoNotLinkFromSource.insert(&GV);
-      return false;
-    }
-  }
-
-  if (!DGV && !shouldOverrideFromSrc() &&
-      (GV.hasLocalLinkage() || GV.hasLinkOnceLinkage() ||
-       GV.hasAvailableExternallyLinkage())) {
+    if (LinkFromSrc)
+      ValuesToLink.insert(&GV);
     return false;
   }
-  MapValue(&GV, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer);
-  return HasError;
+
+  bool LinkFromSrc = true;
+  if (DGV && shouldLinkFromSource(LinkFromSrc, *DGV, GV))
+    return true;
+  if (LinkFromSrc)
+    ValuesToLink.insert(&GV);
+  return false;
 }
 
 bool ModuleLinker::run() {
@@ -1881,13 +1900,10 @@ bool ModuleLinker::run() {
     if (linkIfNeeded(GA))
       return true;
 
-  for (const auto &Entry : DstM.getComdatSymbolTable()) {
-    const Comdat &C = Entry.getValue();
-    if (C.getSelectionKind() == Comdat::Any)
-      continue;
-    const GlobalValue *GV = SrcM.getNamedValue(C.getName());
-    if (GV)
-      MapValue(GV, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer);
+  for (GlobalValue *GV : ValuesToLink) {
+    MapValue(GV, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer);
+    if (HasError)
+      return true;
   }
 
   // Note that we are done linking global value bodies. This prevents
diff --git a/test/Linker/funcimport.ll b/test/Linker/funcimport.ll
index 1e6c0ec64846..38deafd3e3f1 100644
--- a/test/Linker/funcimport.ll
+++ b/test/Linker/funcimport.ll
@@ -26,20 +26,20 @@
 ; lazily linked and never referenced/materialized.
 ; RUN: llvm-link %t2.bc -functionindex=%t3.thinlto.bc -import=globalfunc1:%t.bc -S | FileCheck %s --check-prefix=IMPORTGLOB1
 ; IMPORTGLOB1-DAG: define available_externally void @globalfunc1
-; IMPORTGLOB1-DAG: declare void @globalfunc2
 ; IMPORTGLOB1-DAG: declare void @weakalias
 ; IMPORTGLOB1-DAG: declare void @analias
-; IMPORTGLOB1-DAG: @linkoncealias = external global
+; IMPORTGLOB1-NOT: @linkoncealias
 ; IMPORTGLOB1-NOT: @linkoncefunc
+; IMPORTGLOB1-NOT: declare void @globalfunc2
 
 ; Ensure that weak alias to a non-imported function is correctly
 ; turned into a declaration, but that strong alias to an imported function
 ; is imported as alias.
 ; RUN: llvm-link %t2.bc -functionindex=%t3.thinlto.bc -import=globalfunc2:%t.bc -S | FileCheck %s --check-prefix=IMPORTGLOB2
 ; IMPORTGLOB2-DAG: declare void @analias
-; IMPORTGLOB2-DAG: declare void @globalfunc1
 ; IMPORTGLOB2-DAG: define available_externally void @globalfunc2
 ; IMPORTGLOB2-DAG: declare void @weakalias
+; IMPORTGLOB2-NOT: declare void @globalfunc1
 
 ; Ensure that strong alias imported in second pass of importing ends up
 ; as an alias.
@@ -98,9 +98,9 @@
 ; reference should turned into an external_weak declaration.
 ; RUN: llvm-link %t2.bc -functionindex=%t3.thinlto.bc -import=callweakfunc:%t.bc -import=weakfunc:%t.bc -S 2>&1 | FileCheck %s --check-prefix=IMPORTWEAKFUNC
 ; IMPORTWEAKFUNC-DAG: Ignoring import request for weak-any function weakfunc
-; IMPORTWEAKFUNC-DAG: @weakvar = extern_weak global i32, align 4
 ; IMPORTWEAKFUNC-DAG: declare extern_weak void @weakfunc
 ; IMPORTWEAKFUNC-DAG: define available_externally void @callweakfunc
+; IMPORTWEAKFUNC-NOT: @weakvar = extern_weak global i32, align 4
 
 @globalvar = global i32 1, align 4
 @staticvar = internal global i32 1, align 4

From 66c86b041704aa761755c031f10d4c344aef7cdc Mon Sep 17 00:00:00 2001
From: Alexey Samsonov <vonosmas@gmail.com>
Date: Wed, 2 Dec 2015 22:59:22 +0000
Subject: [PATCH 166/186] Fixup for r254547: use format_hex() to simplify code.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254560 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
index 2e780f035b2b..5e78762b994a 100644
--- a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
+++ b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
@@ -1514,8 +1514,7 @@ unsigned HexagonAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
 void HexagonAsmParser::OutOfRange(SMLoc IDLoc, long long Val, long long Max) {
   std::string errStr;
   raw_string_ostream ES(errStr);
-  ES << "value " << Val << "(" << format("0x%" PRIx64, Val)
-         << ") out of range: ";
+  ES << "value " << Val << "(" << format_hex(Val, 0) << ") out of range: ";
   if (Max >= 0)
     ES << "0-" << Max;
   else

From 1114aa2fc68b45b7cc2100585081ebeba0f3f694 Mon Sep 17 00:00:00 2001
From: David Majnemer <david.majnemer@gmail.com>
Date: Wed, 2 Dec 2015 23:06:39 +0000
Subject: [PATCH 167/186] Move EH-specific helper functions to a more
 appropriate place

No functionality change is intended.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254562 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/EHPersonalities.h       | 83 ++++++++++++++++++
 include/llvm/Analysis/LibCallSemantics.h      | 84 -------------------
 include/llvm/CodeGen/MachineModuleInfo.h      |  2 +-
 include/llvm/CodeGen/WinEHFuncInfo.h          |  1 +
 lib/Analysis/CMakeLists.txt                   |  2 +-
 lib/Analysis/LibCallSemantics.cpp             | 47 -----------
 lib/CodeGen/DwarfEHPrepare.cpp                |  2 +-
 lib/CodeGen/MachineFunction.cpp               |  2 +-
 lib/CodeGen/MachineModuleInfo.cpp             |  2 +-
 lib/CodeGen/MachineVerifier.cpp               |  2 +-
 lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp |  2 +-
 lib/CodeGen/WinEHPrepare.cpp                  |  2 +-
 lib/Target/X86/X86ExpandPseudo.cpp            |  2 +-
 lib/Target/X86/X86FrameLowering.cpp           |  2 +-
 lib/Target/X86/X86ISelLowering.cpp            |  2 +-
 lib/Target/X86/X86WinEHState.cpp              |  2 +-
 lib/Transforms/IPO/PruneEH.cpp                |  2 +-
 .../InstCombine/InstructionCombining.cpp      |  2 +-
 .../Instrumentation/SanitizerCoverage.cpp     |  2 +-
 lib/Transforms/Utils/Local.cpp                |  2 +-
 20 files changed, 100 insertions(+), 147 deletions(-)
 create mode 100644 include/llvm/Analysis/EHPersonalities.h
 delete mode 100644 include/llvm/Analysis/LibCallSemantics.h
 delete mode 100644 lib/Analysis/LibCallSemantics.cpp

diff --git a/include/llvm/Analysis/EHPersonalities.h b/include/llvm/Analysis/EHPersonalities.h
new file mode 100644
index 000000000000..8d5f0f203496
--- /dev/null
+++ b/include/llvm/Analysis/EHPersonalities.h
@@ -0,0 +1,83 @@
+//===- EHPersonalities.h - Compute EH-related information -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_LIBCALLSEMANTICS_H
+#define LLVM_ANALYSIS_LIBCALLSEMANTICS_H
+
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+class Function;
+class Value;
+
+enum class EHPersonality {
+  Unknown,
+  GNU_Ada,
+  GNU_C,
+  GNU_CXX,
+  GNU_ObjC,
+  MSVC_X86SEH,
+  MSVC_Win64SEH,
+  MSVC_CXX,
+  CoreCLR
+};
+
+/// \brief See if the given exception handling personality function is one
+/// that we understand.  If so, return a description of it; otherwise return
+/// Unknown.
+EHPersonality classifyEHPersonality(const Value *Pers);
+
+/// \brief Returns true if this personality function catches asynchronous
+/// exceptions.
+inline bool isAsynchronousEHPersonality(EHPersonality Pers) {
+  // The two SEH personality functions can catch asynch exceptions. We assume
+  // unknown personalities don't catch asynch exceptions.
+  switch (Pers) {
+  case EHPersonality::MSVC_X86SEH:
+  case EHPersonality::MSVC_Win64SEH:
+    return true;
+  default:
+    return false;
+  }
+  llvm_unreachable("invalid enum");
+}
+
+/// \brief Returns true if this is a personality function that invokes
+/// handler funclets (which must return to it).
+inline bool isFuncletEHPersonality(EHPersonality Pers) {
+  switch (Pers) {
+  case EHPersonality::MSVC_CXX:
+  case EHPersonality::MSVC_X86SEH:
+  case EHPersonality::MSVC_Win64SEH:
+  case EHPersonality::CoreCLR:
+    return true;
+  default:
+    return false;
+  }
+  llvm_unreachable("invalid enum");
+}
+
+/// \brief Return true if this personality may be safely removed if there
+/// are no invoke instructions remaining in the current function.
+inline bool isNoOpWithoutInvoke(EHPersonality Pers) {
+  switch (Pers) {
+  case EHPersonality::Unknown:
+    return false;
+  // All known personalities currently have this behavior
+  default:
+    return true;
+  }
+  llvm_unreachable("invalid enum");
+}
+
+bool canSimplifyInvokeNoUnwind(const Function *F);
+
+} // end namespace llvm
+
+#endif
diff --git a/include/llvm/Analysis/LibCallSemantics.h b/include/llvm/Analysis/LibCallSemantics.h
deleted file mode 100644
index 14ecb55f340b..000000000000
--- a/include/llvm/Analysis/LibCallSemantics.h
+++ /dev/null
@@ -1,84 +0,0 @@
-//===- LibCallSemantics.h - Describe library semantics --------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines interfaces that can be used to describe language specific
-// runtime library interfaces (e.g. libc, libm, etc) to LLVM optimizers.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_ANALYSIS_LIBCALLSEMANTICS_H
-#define LLVM_ANALYSIS_LIBCALLSEMANTICS_H
-
-#include "llvm/Analysis/AliasAnalysis.h"
-
-namespace llvm {
-class InvokeInst;
-
-  enum class EHPersonality {
-    Unknown,
-    GNU_Ada,
-    GNU_C,
-    GNU_CXX,
-    GNU_ObjC,
-    MSVC_X86SEH,
-    MSVC_Win64SEH,
-    MSVC_CXX,
-    CoreCLR
-  };
-
-  /// \brief See if the given exception handling personality function is one
-  /// that we understand.  If so, return a description of it; otherwise return
-  /// Unknown.
-  EHPersonality classifyEHPersonality(const Value *Pers);
-
-  /// \brief Returns true if this personality function catches asynchronous
-  /// exceptions.
-  inline bool isAsynchronousEHPersonality(EHPersonality Pers) {
-    // The two SEH personality functions can catch asynch exceptions. We assume
-    // unknown personalities don't catch asynch exceptions.
-    switch (Pers) {
-    case EHPersonality::MSVC_X86SEH:
-    case EHPersonality::MSVC_Win64SEH:
-      return true;
-    default: return false;
-    }
-    llvm_unreachable("invalid enum");
-  }
-
-  /// \brief Returns true if this is a personality function that invokes
-  /// handler funclets (which must return to it).
-  inline bool isFuncletEHPersonality(EHPersonality Pers) {
-    switch (Pers) {
-    case EHPersonality::MSVC_CXX:
-    case EHPersonality::MSVC_X86SEH:
-    case EHPersonality::MSVC_Win64SEH:
-    case EHPersonality::CoreCLR:
-      return true;
-    default: return false;
-    }
-    llvm_unreachable("invalid enum");
-  }
-
-  /// \brief Return true if this personality may be safely removed if there
-  /// are no invoke instructions remaining in the current function.
-  inline bool isNoOpWithoutInvoke(EHPersonality Pers) {
-    switch (Pers) {
-    case EHPersonality::Unknown:
-      return false;
-    // All known personalities currently have this behavior
-    default: return true;
-    }
-    llvm_unreachable("invalid enum");
-  }
-
-  bool canSimplifyInvokeNoUnwind(const Function *F);
-
-} // end namespace llvm
-
-#endif
diff --git a/include/llvm/CodeGen/MachineModuleInfo.h b/include/llvm/CodeGen/MachineModuleInfo.h
index fd42b46476c5..43b9f5203c50 100644
--- a/include/llvm/CodeGen/MachineModuleInfo.h
+++ b/include/llvm/CodeGen/MachineModuleInfo.h
@@ -35,7 +35,7 @@
 #include "llvm/ADT/PointerIntPair.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/LibCallSemantics.h"
+#include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/ValueHandle.h"
diff --git a/include/llvm/CodeGen/WinEHFuncInfo.h b/include/llvm/CodeGen/WinEHFuncInfo.h
index 5def70692ba5..5e8bb56eb617 100644
--- a/include/llvm/CodeGen/WinEHFuncInfo.h
+++ b/include/llvm/CodeGen/WinEHFuncInfo.h
@@ -22,6 +22,7 @@
 namespace llvm {
 class AllocaInst;
 class BasicBlock;
+class CatchReturnInst;
 class Constant;
 class Function;
 class GlobalVariable;
diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt
index cb5cd07493b6..69623619a8b0 100644
--- a/lib/Analysis/CMakeLists.txt
+++ b/lib/Analysis/CMakeLists.txt
@@ -25,6 +25,7 @@ add_llvm_library(LLVMAnalysis
   DivergenceAnalysis.cpp
   DomPrinter.cpp
   DominanceFrontier.cpp
+  EHPersonalities.cpp
   GlobalsModRef.cpp
   IVUsers.cpp
   InlineCost.cpp
@@ -35,7 +36,6 @@ add_llvm_library(LLVMAnalysis
   IteratedDominanceFrontier.cpp
   LazyCallGraph.cpp
   LazyValueInfo.cpp
-  LibCallSemantics.cpp
   Lint.cpp
   Loads.cpp
   LoopAccessAnalysis.cpp
diff --git a/lib/Analysis/LibCallSemantics.cpp b/lib/Analysis/LibCallSemantics.cpp
deleted file mode 100644
index b91ff20aee25..000000000000
--- a/lib/Analysis/LibCallSemantics.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-//===- LibCallSemantics.cpp - Describe library semantics ------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements interfaces that can be used to describe language
-// specific runtime library interfaces (e.g. libc, libm, etc) to LLVM
-// optimizers.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Analysis/LibCallSemantics.h"
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm/IR/Function.h"
-using namespace llvm;
-
-/// See if the given exception handling personality function is one that we
-/// understand.  If so, return a description of it; otherwise return Unknown.
-EHPersonality llvm::classifyEHPersonality(const Value *Pers) {
-  const Function *F =
-      Pers ? dyn_cast<Function>(Pers->stripPointerCasts()) : nullptr;
-  if (!F)
-    return EHPersonality::Unknown;
-  return StringSwitch<EHPersonality>(F->getName())
-    .Case("__gnat_eh_personality", EHPersonality::GNU_Ada)
-    .Case("__gxx_personality_v0",  EHPersonality::GNU_CXX)
-    .Case("__gcc_personality_v0",  EHPersonality::GNU_C)
-    .Case("__objc_personality_v0", EHPersonality::GNU_ObjC)
-    .Case("_except_handler3",      EHPersonality::MSVC_X86SEH)
-    .Case("_except_handler4",      EHPersonality::MSVC_X86SEH)
-    .Case("__C_specific_handler",  EHPersonality::MSVC_Win64SEH)
-    .Case("__CxxFrameHandler3",    EHPersonality::MSVC_CXX)
-    .Case("ProcessCLRException",   EHPersonality::CoreCLR)
-    .Default(EHPersonality::Unknown);
-}
-
-bool llvm::canSimplifyInvokeNoUnwind(const Function *F) {
-  EHPersonality Personality = classifyEHPersonality(F->getPersonalityFn());
-  // We can't simplify any invokes to nounwind functions if the personality
-  // function wants to catch asynch exceptions.  The nounwind attribute only
-  // implies that the function does not throw synchronous exceptions.
-  return !isAsynchronousEHPersonality(Personality);
-}
diff --git a/lib/CodeGen/DwarfEHPrepare.cpp b/lib/CodeGen/DwarfEHPrepare.cpp
index 0f6e1463f10f..eae78a950d9a 100644
--- a/lib/CodeGen/DwarfEHPrepare.cpp
+++ b/lib/CodeGen/DwarfEHPrepare.cpp
@@ -16,7 +16,7 @@
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/LibCallSemantics.h"
+#include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp
index 7e6af1c9c41f..80d30a5b131a 100644
--- a/lib/CodeGen/MachineFunction.cpp
+++ b/lib/CodeGen/MachineFunction.cpp
@@ -17,7 +17,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/LibCallSemantics.h"
+#include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunctionInitializer.h"
diff --git a/lib/CodeGen/MachineModuleInfo.cpp b/lib/CodeGen/MachineModuleInfo.cpp
index 18efcf39c453..1956a701d8e6 100644
--- a/lib/CodeGen/MachineModuleInfo.cpp
+++ b/lib/CodeGen/MachineModuleInfo.cpp
@@ -10,7 +10,7 @@
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/ADT/PointerUnion.h"
 #include "llvm/ADT/TinyPtrVector.h"
-#include "llvm/Analysis/LibCallSemantics.h"
+#include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index e1020772629c..cdcd8eb4fbdf 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -28,7 +28,7 @@
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/SetOperations.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/LibCallSemantics.h"
+#include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/LiveStackAnalysis.h"
 #include "llvm/CodeGen/LiveVariables.h"
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 3bbe5d4203bb..f6c5d90f47ae 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -19,7 +19,7 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/LibCallSemantics.h"
+#include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/FastISel.h"
diff --git a/lib/CodeGen/WinEHPrepare.cpp b/lib/CodeGen/WinEHPrepare.cpp
index f3f4e3be389e..dee4b870434e 100644
--- a/lib/CodeGen/WinEHPrepare.cpp
+++ b/lib/CodeGen/WinEHPrepare.cpp
@@ -19,7 +19,7 @@
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/LibCallSemantics.h"
+#include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/CodeGen/WinEHFuncInfo.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
diff --git a/lib/Target/X86/X86ExpandPseudo.cpp b/lib/Target/X86/X86ExpandPseudo.cpp
index 6015d3465de3..83a62b731b54 100644
--- a/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/lib/Target/X86/X86ExpandPseudo.cpp
@@ -19,7 +19,7 @@
 #include "X86InstrInfo.h"
 #include "X86MachineFunctionInfo.h"
 #include "X86Subtarget.h"
-#include "llvm/Analysis/LibCallSemantics.h"
+#include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/CodeGen/Passes.h" // For IDs of passes that are preserved.
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 52cf9fd44cbd..682f75c7f51c 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -18,7 +18,7 @@
 #include "X86Subtarget.h"
 #include "X86TargetMachine.h"
 #include "llvm/ADT/SmallSet.h"
-#include "llvm/Analysis/LibCallSemantics.h"
+#include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 0877d96a32c8..241bbfd331c1 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -25,7 +25,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSwitch.h"
-#include "llvm/Analysis/LibCallSemantics.h"
+#include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/CodeGen/IntrinsicLowering.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp
index 8a5aa40bc7f2..0276f3969b48 100644
--- a/lib/Target/X86/X86WinEHState.cpp
+++ b/lib/Target/X86/X86WinEHState.cpp
@@ -15,7 +15,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "X86.h"
-#include "llvm/Analysis/LibCallSemantics.h"
+#include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/WinEHFuncInfo.h"
diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp
index a0b740563693..714e1d6e42d2 100644
--- a/lib/Transforms/IPO/PruneEH.cpp
+++ b/lib/Transforms/IPO/PruneEH.cpp
@@ -21,7 +21,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/CallGraphSCCPass.h"
-#include "llvm/Analysis/LibCallSemantics.h"
+#include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index e95a65510ecc..74c5148f9f89 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -42,9 +42,9 @@
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LibCallSemantics.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index 719a436b05de..cbdcc5e6cfe1 100644
--- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -31,7 +31,7 @@
 #include "llvm/Transforms/Instrumentation.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/LibCallSemantics.h"
+#include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugInfo.h"
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index 623da675e05b..ba79b32ac3d5 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -20,8 +20,8 @@
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LibCallSemantics.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/CFG.h"

From 413d8cd25eeaa1d210d057aae1c43b2989aee117 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <kparzysz@codeaurora.org>
Date: Wed, 2 Dec 2015 23:08:29 +0000
Subject: [PATCH 168/186] [Hexagon] Improve lowering of instructions to the MC
 layer

- Add extenders when necessary.
- Handle some basic relocations.

This should fix the failure in tools/clang/test/CodeGenCXX/crash.cpp


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254564 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Hexagon/Hexagon.h              |  7 ---
 lib/Target/Hexagon/HexagonAsmPrinter.cpp  | 20 +++++----
 lib/Target/Hexagon/HexagonMCInstLower.cpp | 52 +++++++++++++++++++----
 3 files changed, 55 insertions(+), 24 deletions(-)

diff --git a/lib/Target/Hexagon/Hexagon.h b/lib/Target/Hexagon/Hexagon.h
index d360be2aa5b2..ed7d9578902e 100644
--- a/lib/Target/Hexagon/Hexagon.h
+++ b/lib/Target/Hexagon/Hexagon.h
@@ -47,15 +47,8 @@
 #include "llvm/Target/TargetMachine.h"
 
 namespace llvm {
-  class MachineInstr;
-  class MCInst;
-  class MCInstrInfo;
-  class HexagonAsmPrinter;
   class HexagonTargetMachine;
 
-  void HexagonLowerToMC(const MachineInstr *MI, MCInst &MCI,
-                        HexagonAsmPrinter &AP);
-
   /// \brief Creates a Hexagon-specific Target Transformation Info pass.
   ImmutablePass *createHexagonTargetTransformInfoPass(const HexagonTargetMachine *TM);
 } // end namespace llvm;
diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
index 530dab859d5f..19769258ee89 100644
--- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp
+++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
@@ -56,6 +56,11 @@
 
 using namespace llvm;
 
+namespace llvm {
+  void HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI,
+                        MCInst &MCB, HexagonAsmPrinter &AP);
+}
+
 #define DEBUG_TYPE "asm-printer"
 
 static cl::opt<bool> AlignCalls(
@@ -179,6 +184,7 @@ bool HexagonAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
 ///
 void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   MCInst MCB = HexagonMCInstrInfo::createBundle();
+  const MCInstrInfo &MCII = *Subtarget->getInstrInfo();
 
   if (MI->isBundle()) {
     const MachineBasicBlock* MBB = MI->getParent();
@@ -190,25 +196,23 @@ void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) {
           MII->getOpcode() == TargetOpcode::IMPLICIT_DEF)
         ++IgnoreCount;
       else {
-        HexagonLowerToMC(&*MII, MCB, *this);
+        HexagonLowerToMC(MCII, &*MII, MCB, *this);
       }
     }
   }
   else {
-    HexagonLowerToMC(MI, MCB, *this);
+    HexagonLowerToMC(MCII, MI, MCB, *this);
     HexagonMCInstrInfo::padEndloop(OutStreamer->getContext(), MCB);
   }
   // Examine the packet and try to find instructions that can be converted
   // to compounds.
-  HexagonMCInstrInfo::tryCompound(*Subtarget->getInstrInfo(),
-                                  OutStreamer->getContext(), MCB);
+  HexagonMCInstrInfo::tryCompound(MCII, OutStreamer->getContext(), MCB);
   // Examine the packet and convert pairs of instructions to duplex
   // instructions when possible.
   SmallVector<DuplexCandidate, 8> possibleDuplexes;
-  possibleDuplexes = HexagonMCInstrInfo::getDuplexPossibilties(
-      *Subtarget->getInstrInfo(), MCB);
-  HexagonMCShuffle(*Subtarget->getInstrInfo(), *Subtarget,
-                   OutStreamer->getContext(), MCB, possibleDuplexes);
+  possibleDuplexes = HexagonMCInstrInfo::getDuplexPossibilties(MCII, MCB);
+  HexagonMCShuffle(MCII, *Subtarget, OutStreamer->getContext(), MCB,
+                   possibleDuplexes);
   EmitToStreamer(*OutStreamer, MCB);
 }
 
diff --git a/lib/Target/Hexagon/HexagonMCInstLower.cpp b/lib/Target/Hexagon/HexagonMCInstLower.cpp
index 82a9b23149c4..86d9e19c0547 100644
--- a/lib/Target/Hexagon/HexagonMCInstLower.cpp
+++ b/lib/Target/Hexagon/HexagonMCInstLower.cpp
@@ -26,39 +26,71 @@
 
 using namespace llvm;
 
-static MCOperand GetSymbolRef(const MachineOperand& MO, const MCSymbol* Symbol,
-                              HexagonAsmPrinter& Printer) {
+namespace llvm {
+  void HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI,
+                        MCInst &MCB, HexagonAsmPrinter &AP);
+}
+
+static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
+                              HexagonAsmPrinter &Printer) {
   MCContext &MC = Printer.OutContext;
   const MCExpr *ME;
 
-  ME = MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None, MC);
+  // Populate the relocation type based on Hexagon target flags
+  // set on an operand
+  MCSymbolRefExpr::VariantKind RelocationType;
+  switch (MO.getTargetFlags()) {
+  default:
+    RelocationType = MCSymbolRefExpr::VK_None;
+    break;
+  case HexagonII::MO_PCREL:
+    RelocationType = MCSymbolRefExpr::VK_Hexagon_PCREL;
+    break;
+  case HexagonII::MO_GOT:
+    RelocationType = MCSymbolRefExpr::VK_GOT;
+    break;
+  case HexagonII::MO_LO16:
+    RelocationType = MCSymbolRefExpr::VK_Hexagon_LO16;
+    break;
+  case HexagonII::MO_HI16:
+    RelocationType = MCSymbolRefExpr::VK_Hexagon_HI16;
+    break;
+  case HexagonII::MO_GPREL:
+    RelocationType = MCSymbolRefExpr::VK_Hexagon_GPREL;
+    break;
+  }
+
+  ME = MCSymbolRefExpr::create(Symbol, RelocationType, MC);
 
   if (!MO.isJTI() && MO.getOffset())
     ME = MCBinaryExpr::createAdd(ME, MCConstantExpr::create(MO.getOffset(), MC),
                                  MC);
 
-  return (MCOperand::createExpr(ME));
+  return MCOperand::createExpr(ME);
 }
 
 // Create an MCInst from a MachineInstr
-void llvm::HexagonLowerToMC(MachineInstr const* MI, MCInst& MCB,
-                            HexagonAsmPrinter& AP) {
-  if(MI->getOpcode() == Hexagon::ENDLOOP0){
+void llvm::HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI,
+                            MCInst &MCB, HexagonAsmPrinter &AP) {
+  if (MI->getOpcode() == Hexagon::ENDLOOP0) {
     HexagonMCInstrInfo::setInnerLoop(MCB);
     return;
   }
-  if(MI->getOpcode() == Hexagon::ENDLOOP1){
+  if (MI->getOpcode() == Hexagon::ENDLOOP1) {
     HexagonMCInstrInfo::setOuterLoop(MCB);
     return;
   }
-  MCInst* MCI = new (AP.OutContext) MCInst;
+  MCInst *MCI = new (AP.OutContext) MCInst;
   MCI->setOpcode(MI->getOpcode());
   assert(MCI->getOpcode() == static_cast<unsigned>(MI->getOpcode()) &&
          "MCI opcode should have been set on construction");
+  bool MustExtend = false;
 
   for (unsigned i = 0, e = MI->getNumOperands(); i < e; i++) {
     const MachineOperand &MO = MI->getOperand(i);
     MCOperand MCO;
+    if (MO.getTargetFlags() & HexagonII::HMOTF_ConstExtended)
+      MustExtend = true;
 
     switch (MO.getType()) {
     default:
@@ -107,5 +139,7 @@ void llvm::HexagonLowerToMC(MachineInstr const* MI, MCInst& MCB,
 
     MCI->addOperand(MCO);
   }
+  HexagonMCInstrInfo::extendIfNeeded(AP.OutContext, MCII, MCB, *MCI,
+                                     MustExtend);
   MCB.addOperand(MCOperand::createInst(MCI));
 }

From ce27c66bb49a032fda3dd0f7e27b1669409ab33b Mon Sep 17 00:00:00 2001
From: David Majnemer <david.majnemer@gmail.com>
Date: Wed, 2 Dec 2015 23:09:05 +0000
Subject: [PATCH 169/186] Forgot to add this file with r254562.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254565 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Analysis/EHPersonalities.cpp | 41 ++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 lib/Analysis/EHPersonalities.cpp

diff --git a/lib/Analysis/EHPersonalities.cpp b/lib/Analysis/EHPersonalities.cpp
new file mode 100644
index 000000000000..1d1b5fe11f67
--- /dev/null
+++ b/lib/Analysis/EHPersonalities.cpp
@@ -0,0 +1,41 @@
+//===- EHPersonalities.cpp - Compute EH-related information ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/IR/Function.h"
+using namespace llvm;
+
+/// See if the given exception handling personality function is one that we
+/// understand.  If so, return a description of it; otherwise return Unknown.
+EHPersonality llvm::classifyEHPersonality(const Value *Pers) {
+  const Function *F =
+      Pers ? dyn_cast<Function>(Pers->stripPointerCasts()) : nullptr;
+  if (!F)
+    return EHPersonality::Unknown;
+  return StringSwitch<EHPersonality>(F->getName())
+    .Case("__gnat_eh_personality", EHPersonality::GNU_Ada)
+    .Case("__gxx_personality_v0",  EHPersonality::GNU_CXX)
+    .Case("__gcc_personality_v0",  EHPersonality::GNU_C)
+    .Case("__objc_personality_v0", EHPersonality::GNU_ObjC)
+    .Case("_except_handler3",      EHPersonality::MSVC_X86SEH)
+    .Case("_except_handler4",      EHPersonality::MSVC_X86SEH)
+    .Case("__C_specific_handler",  EHPersonality::MSVC_Win64SEH)
+    .Case("__CxxFrameHandler3",    EHPersonality::MSVC_CXX)
+    .Case("ProcessCLRException",   EHPersonality::CoreCLR)
+    .Default(EHPersonality::Unknown);
+}
+
+bool llvm::canSimplifyInvokeNoUnwind(const Function *F) {
+  EHPersonality Personality = classifyEHPersonality(F->getPersonalityFn());
+  // We can't simplify any invokes to nounwind functions if the personality
+  // function wants to catch asynch exceptions.  The nounwind attribute only
+  // implies that the function does not throw synchronous exceptions.
+  return !isAsynchronousEHPersonality(Personality);
+}

From ff6afa9836e7faebdfc3378acf9c9238714488c2 Mon Sep 17 00:00:00 2001
From: David Majnemer <david.majnemer@gmail.com>
Date: Wed, 2 Dec 2015 23:28:27 +0000
Subject: [PATCH 170/186] Rename a header guard to be more appropriate

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254566 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/EHPersonalities.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/llvm/Analysis/EHPersonalities.h b/include/llvm/Analysis/EHPersonalities.h
index 8d5f0f203496..4a56728fbb4a 100644
--- a/include/llvm/Analysis/EHPersonalities.h
+++ b/include/llvm/Analysis/EHPersonalities.h
@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_ANALYSIS_LIBCALLSEMANTICS_H
-#define LLVM_ANALYSIS_LIBCALLSEMANTICS_H
+#ifndef LLVM_ANALYSIS_EHPERSONALITIES_H
+#define LLVM_ANALYSIS_EHPERSONALITIES_H
 
 #include "llvm/Support/ErrorHandling.h"
 

From 66e256e46ec68ed425d0bf0b067a3bf003915a09 Mon Sep 17 00:00:00 2001
From: Dan Gohman <dan433584@gmail.com>
Date: Wed, 2 Dec 2015 23:40:03 +0000
Subject: [PATCH 171/186] [WebAssembly] Assert that byval and nest are not used
 for return types.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254567 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/WebAssembly/WebAssemblyISelLowering.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 8f1a06d46305..2485df1ab5d2 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -353,12 +353,10 @@ SDValue WebAssemblyTargetLowering::LowerReturn(
 
   // Record the number and types of the return values.
   for (const ISD::OutputArg &Out : Outs) {
-    if (Out.Flags.isByVal())
-      fail(DL, DAG, "WebAssembly hasn't implemented byval results");
+    assert(!Out.Flags.isByVal() && "byval is not valid for return values");
+    assert(!Out.Flags.isNest() && "nest is not valid for return values");
     if (Out.Flags.isInAlloca())
       fail(DL, DAG, "WebAssembly hasn't implemented inalloca results");
-    if (Out.Flags.isNest())
-      fail(DL, DAG, "WebAssembly hasn't implemented nest results");
     if (Out.Flags.isInConsecutiveRegs())
       fail(DL, DAG, "WebAssembly hasn't implemented cons regs results");
     if (Out.Flags.isInConsecutiveRegsLast())

From 4231469f1d30adf0e1722b7dbb015e8038568971 Mon Sep 17 00:00:00 2001
From: Derek Schuff <dschuff@google.com>
Date: Thu, 3 Dec 2015 00:50:30 +0000
Subject: [PATCH 172/186] [WebAssembly] Add a test for wasm-store-results pass

Differential Revision: http://reviews.llvm.org/D15167

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254570 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../WebAssembly/WebAssemblyStoreResults.cpp    |  6 +++++-
 test/CodeGen/WebAssembly/store-results.ll      | 18 ++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)
 create mode 100644 test/CodeGen/WebAssembly/store-results.ll

diff --git a/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp b/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp
index d0735b84de60..3a7f50e3b142 100644
--- a/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp
@@ -69,7 +69,8 @@ bool WebAssemblyStoreResults::runOnMachineFunction(MachineFunction &MF) {
   const MachineRegisterInfo &MRI = MF.getRegInfo();
   MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
 
-  for (auto &MBB : MF)
+  for (auto &MBB : MF) {
+    DEBUG(dbgs() << "Basic Block: " << MBB.getName() << '\n');
     for (auto &MI : MBB)
       switch (MI.getOpcode()) {
       default:
@@ -94,9 +95,12 @@ bool WebAssemblyStoreResults::runOnMachineFunction(MachineFunction &MF) {
                         ->getFirstTerminator();
           if (&MI == Where || !MDT.dominates(&MI, Where))
             continue;
+          DEBUG(dbgs() << "Setting operand " << O << " in " << *Where <<
+                " from " << MI <<"\n");
           O.setReg(ToReg);
         }
       }
+  }
 
   return true;
 }
diff --git a/test/CodeGen/WebAssembly/store-results.ll b/test/CodeGen/WebAssembly/store-results.ll
new file mode 100644
index 000000000000..1bcee5d31fb7
--- /dev/null
+++ b/test/CodeGen/WebAssembly/store-results.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -asm-verbose=false | FileCheck %s
+
+; Test that the wasm-store-results pass makes users of stored values use the
+; result of store expressions to reduce get_local/set_local traffic.
+
+target datalayout = "e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+; CHECK-LABEL: single_block:
+; CHECK-NOT: .local
+; CHECK: i32.const $push{{[0-9]+}}=, 0
+; CHECK: i32.store $push[[STORE:[0-9]+]]=, $0, $pop{{[0-9]+}}
+; CHECK: return $pop[[STORE]]{{$}}
+define i32 @single_block(i32* %p) {
+entry:
+  store i32 0, i32* %p
+  ret i32 0
+}

From 51b567dc4b8ed18fa45454da6b1e52b957f487d2 Mon Sep 17 00:00:00 2001
From: Justin Bogner <mail@justinbogner.com>
Date: Thu, 3 Dec 2015 00:52:20 +0000
Subject: [PATCH 173/186] MC: Make sure to clear *all* of MCMachOStreamer's
 state

The CreatedADWARFSection flag was added in r232842, but isn't cleared
properly when resetting the streamer's state. Fix that.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254571 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/MC/MCMachOStreamer.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/MC/MCMachOStreamer.cpp b/lib/MC/MCMachOStreamer.cpp
index 52ecf9fcfbf3..21f7571eec4a 100644
--- a/lib/MC/MCMachOStreamer.cpp
+++ b/lib/MC/MCMachOStreamer.cpp
@@ -60,6 +60,7 @@ class MCMachOStreamer : public MCObjectStreamer {
 
   /// state management
   void reset() override {
+    CreatedADWARFSection = false;
     HasSectionLabel.clear();
     MCObjectStreamer::reset();
   }

From aeaec5e3f929def8a7c1e1383c52dc56528d1a1a Mon Sep 17 00:00:00 2001
From: Xinliang David Li <davidxl@google.com>
Date: Thu, 3 Dec 2015 01:05:31 +0000
Subject: [PATCH 174/186] [PGO] Add v2 format compatibility test

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254572 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../llvm-profdata/Inputs/compat.profdata.v2   | Bin 0 -> 712 bytes
 test/tools/llvm-profdata/compat.proftext      |  20 ++++++++++++++++++
 2 files changed, 20 insertions(+)
 create mode 100644 test/tools/llvm-profdata/Inputs/compat.profdata.v2

diff --git a/test/tools/llvm-profdata/Inputs/compat.profdata.v2 b/test/tools/llvm-profdata/Inputs/compat.profdata.v2
new file mode 100644
index 0000000000000000000000000000000000000000..969867584a9934a408363896ef9ae250406b9e3f
GIT binary patch
literal 712
zcmeyLQ&5zjmf6U}00BDl*`X{5wFAmyWLUFT;<nGBA2G~O5d|onmY>fBWuhChXCFj_
zk>S_X-sfg1Q*X0CRT@C)+{Da0E=LFlCdUYsdjYk{!HFL%g5op>s3Mq^Q0^#2qY%)b
Ok-4~xO~oaFLmU7ea2Txs

literal 0
HcmV?d00001

diff --git a/test/tools/llvm-profdata/compat.proftext b/test/tools/llvm-profdata/compat.proftext
index 14da3374b5e9..139202d162e6 100644
--- a/test/tools/llvm-profdata/compat.proftext
+++ b/test/tools/llvm-profdata/compat.proftext
@@ -45,3 +45,23 @@ large_numbers
 # SUMMARY: Total functions: 3
 # SUMMARY: Maximum function count: 2305843009213693952
 # SUMMARY: Maximum internal block count: 1152921504606846976
+
+# RUN: llvm-profdata show %S/Inputs/compat.profdata.v2 -all-functions --counts | FileCheck %s -check-prefix=FORMATV2
+
+# FORMATV2: Counters:
+# FORMATV2-NEXT:   foo:
+# FORMATV2-NEXT:     Hash: 0x000000000000000a
+# FORMATV2-NEXT:     Counters: 2
+# FORMATV2-NEXT:     Function count: 499500
+# FORMATV2-NEXT:     Block counts: [179900]
+# FORMATV2-NEXT:   main:
+# FORMATV2-NEXT:     Hash: 0x000000000000410a
+# FORMATV2-NEXT:     Counters: 4
+# FORMATV2-NEXT:     Function count: 1
+# FORMATV2-NEXT:     Block counts: [1000, 1000000, 499500]
+# FORMATV2-NEXT: Functions shown: 2
+# FORMATV2-NEXT: Total functions: 2
+# FORMATV2-NEXT: Maximum function count: 499500
+# FORMATV2-NEXT: Maximum internal block count: 1000000
+
+

From 26ddca196954ccfa697102b46118956ad616073a Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Thu, 3 Dec 2015 01:44:45 +0000
Subject: [PATCH 175/186] RegisterPressure: Use range based for, fix else
 style; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254575 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/RegisterPressure.cpp | 69 +++++++++++++-------------------
 1 file changed, 28 insertions(+), 41 deletions(-)

diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp
index 8eba0edc8bfe..6e7feb5178ee 100644
--- a/lib/CodeGen/RegisterPressure.cpp
+++ b/lib/CodeGen/RegisterPressure.cpp
@@ -59,12 +59,12 @@ void RegisterPressure::dump(const TargetRegisterInfo *TRI) const {
   dbgs() << "Max Pressure: ";
   dumpRegSetPressure(MaxSetPressure, TRI);
   dbgs() << "Live In: ";
-  for (unsigned i = 0, e = LiveInRegs.size(); i < e; ++i)
-    dbgs() << PrintVRegOrUnit(LiveInRegs[i], TRI) << " ";
+  for (unsigned Reg : LiveInRegs)
+    dbgs() << PrintVRegOrUnit(Reg, TRI) << " ";
   dbgs() << '\n';
   dbgs() << "Live Out: ";
-  for (unsigned i = 0, e = LiveOutRegs.size(); i < e; ++i)
-    dbgs() << PrintVRegOrUnit(LiveOutRegs[i], TRI) << " ";
+  for (unsigned Reg : LiveOutRegs)
+    dbgs() << PrintVRegOrUnit(Reg, TRI) << " ";
   dbgs() << '\n';
 }
 
@@ -92,8 +92,8 @@ void PressureDiff::dump(const TargetRegisterInfo &TRI) const {
 /// Increase the current pressure as impacted by these registers and bump
 /// the high water mark if needed.
 void RegPressureTracker::increaseRegPressure(ArrayRef<unsigned> RegUnits) {
-  for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) {
-    PSetIterator PSetI = MRI->getPressureSets(RegUnits[i]);
+  for (unsigned RegUnit : RegUnits) {
+    PSetIterator PSetI = MRI->getPressureSets(RegUnit);
     unsigned Weight = PSetI.getWeight();
     for (; PSetI.isValid(); ++PSetI) {
       CurrSetPressure[*PSetI] += Weight;
@@ -106,8 +106,8 @@ void RegPressureTracker::increaseRegPressure(ArrayRef<unsigned> RegUnits) {
 
 /// Simply decrease the current pressure as impacted by these registers.
 void RegPressureTracker::decreaseRegPressure(ArrayRef<unsigned> RegUnits) {
-  for (unsigned I = 0, E = RegUnits.size(); I != E; ++I)
-    decreaseSetPressure(CurrSetPressure, MRI->getPressureSets(RegUnits[I]));
+  for (unsigned RegUnit : RegUnits)
+    decreaseSetPressure(CurrSetPressure, MRI->getPressureSets(RegUnit));
 }
 
 /// Clear the result so it can be used for another round of pressure tracking.
@@ -298,8 +298,7 @@ void RegPressureTracker::closeRegion() {
 void RegPressureTracker::initLiveThru(const RegPressureTracker &RPTracker) {
   LiveThruPressure.assign(TRI->getNumRegPressureSets(), 0);
   assert(isBottomClosed() && "need bottom-up tracking to intialize.");
-  for (unsigned i = 0, e = P.LiveOutRegs.size(); i < e; ++i) {
-    unsigned Reg = P.LiveOutRegs[i];
+  for (unsigned Reg : P.LiveOutRegs) {
     if (TargetRegisterInfo::isVirtualRegister(Reg)
         && !RPTracker.hasUntiedDef(Reg)) {
       increaseSetPressure(LiveThruPressure, MRI->getPressureSets(Reg));
@@ -449,18 +448,18 @@ static void collectPDiff(PressureDiff &PDiff, RegisterOperands &RegOpers,
                          const MachineRegisterInfo *MRI) {
   assert(!PDiff.begin()->isValid() && "stale PDiff");
 
-  for (unsigned i = 0, e = RegOpers.Defs.size(); i != e; ++i)
-    PDiff.addPressureChange(RegOpers.Defs[i], true, MRI);
+  for (unsigned Reg : RegOpers.Defs)
+    PDiff.addPressureChange(Reg, true, MRI);
 
-  for (unsigned i = 0, e = RegOpers.Uses.size(); i != e; ++i)
-    PDiff.addPressureChange(RegOpers.Uses[i], false, MRI);
+  for (unsigned Reg : RegOpers.Uses)
+    PDiff.addPressureChange(Reg, false, MRI);
 }
 
 /// Force liveness of registers.
 void RegPressureTracker::addLiveRegs(ArrayRef<unsigned> Regs) {
-  for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
-    if (LiveRegs.insert(Regs[i]))
-      increaseRegPressure(Regs[i]);
+  for (unsigned Reg : Regs) {
+    if (LiveRegs.insert(Reg))
+      increaseRegPressure(Reg);
   }
 }
 
@@ -527,8 +526,7 @@ void RegPressureTracker::recede(SmallVectorImpl<unsigned> *LiveUses,
 
   // Kill liveness at live defs.
   // TODO: consider earlyclobbers?
-  for (unsigned i = 0, e = RegOpers.Defs.size(); i < e; ++i) {
-    unsigned Reg = RegOpers.Defs[i];
+  for (unsigned Reg : RegOpers.Defs) {
     bool DeadDef = false;
     if (RequireIntervals) {
       const LiveRange *LR = getLiveRange(Reg);
@@ -552,8 +550,7 @@ void RegPressureTracker::recede(SmallVectorImpl<unsigned> *LiveUses,
   }
 
   // Generate liveness for uses.
-  for (unsigned i = 0, e = RegOpers.Uses.size(); i < e; ++i) {
-    unsigned Reg = RegOpers.Uses[i];
+  for (unsigned Reg : RegOpers.Uses) {
     if (!LiveRegs.contains(Reg)) {
       // Adjust liveouts if LiveIntervals are available.
       if (RequireIntervals) {
@@ -571,8 +568,7 @@ void RegPressureTracker::recede(SmallVectorImpl<unsigned> *LiveUses,
     }
   }
   if (TrackUntiedDefs) {
-    for (unsigned i = 0, e = RegOpers.Defs.size(); i < e; ++i) {
-      unsigned Reg = RegOpers.Defs[i];
+    for (unsigned Reg : RegOpers.Defs) {
       if (TargetRegisterInfo::isVirtualRegister(Reg) && !LiveRegs.contains(Reg))
         UntiedDefs.insert(Reg);
     }
@@ -602,8 +598,7 @@ void RegPressureTracker::advance() {
   RegisterOperands RegOpers;
   RegOpers.collect(*CurrPos, *TRI, *MRI);
 
-  for (unsigned i = 0, e = RegOpers.Uses.size(); i < e; ++i) {
-    unsigned Reg = RegOpers.Uses[i];
+  for (unsigned Reg : RegOpers.Uses) {
     // Discover live-ins.
     bool isLive = LiveRegs.contains(Reg);
     if (!isLive)
@@ -613,22 +608,19 @@ void RegPressureTracker::advance() {
     if (RequireIntervals) {
       const LiveRange *LR = getLiveRange(Reg);
       lastUse = LR && LR->Query(SlotIdx).isKill();
-    }
-    else {
+    } else {
       // Allocatable physregs are always single-use before register rewriting.
       lastUse = !TargetRegisterInfo::isVirtualRegister(Reg);
     }
     if (lastUse && isLive) {
       LiveRegs.erase(Reg);
       decreaseRegPressure(Reg);
-    }
-    else if (!lastUse && !isLive)
+    } else if (!lastUse && !isLive)
       increaseRegPressure(Reg);
   }
 
   // Generate liveness for defs.
-  for (unsigned i = 0, e = RegOpers.Defs.size(); i < e; ++i) {
-    unsigned Reg = RegOpers.Defs[i];
+  for (unsigned Reg : RegOpers.Defs) {
     if (LiveRegs.insert(Reg))
       increaseRegPressure(Reg);
   }
@@ -666,8 +658,7 @@ static void computeExcessPressureDelta(ArrayRef<unsigned> OldPressureVec,
         PDiff = 0;            // Under the limit
       else
         PDiff = PNew - Limit; // Just exceeded limit.
-    }
-    else if (Limit > PNew)
+    } else if (Limit > PNew)
       PDiff = Limit - POld;   // Just obeyed limit.
 
     if (PDiff) {
@@ -737,8 +728,7 @@ void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) {
   assert(RegOpers.DeadDefs.size() == 0);
 
   // Kill liveness at live defs.
-  for (unsigned i = 0, e = RegOpers.Defs.size(); i < e; ++i) {
-    unsigned Reg = RegOpers.Defs[i];
+  for (unsigned Reg : RegOpers.Defs) {
     bool DeadDef = false;
     if (RequireIntervals) {
       const LiveRange *LR = getLiveRange(Reg);
@@ -754,8 +744,7 @@ void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) {
     }
   }
   // Generate liveness for uses.
-  for (unsigned i = 0, e = RegOpers.Uses.size(); i < e; ++i) {
-    unsigned Reg = RegOpers.Uses[i];
+  for (unsigned Reg : RegOpers.Uses) {
     if (!LiveRegs.contains(Reg))
       increaseRegPressure(Reg);
   }
@@ -932,8 +921,7 @@ void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) {
   if (RequireIntervals)
     SlotIdx = LIS->getInstructionIndex(MI).getRegSlot();
 
-  for (unsigned i = 0, e = RegOpers.Uses.size(); i < e; ++i) {
-    unsigned Reg = RegOpers.Uses[i];
+  for (unsigned Reg : RegOpers.Uses) {
     if (RequireIntervals) {
       // FIXME: allow the caller to pass in the list of vreg uses that remain
       // to be bottom-scheduled to avoid searching uses at each query.
@@ -944,8 +932,7 @@ void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) {
         if (LRQ.isKill() && !findUseBetween(Reg, CurrIdx, SlotIdx, *MRI, LIS))
           decreaseRegPressure(Reg);
       }
-    }
-    else if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+    } else if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
       // Allocatable physregs are always single-use before register rewriting.
       decreaseRegPressure(Reg);
     }

From c0a189c3792865257c1383f176e5401373ed2270 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Thu, 3 Dec 2015 02:05:27 +0000
Subject: [PATCH 176/186] ScheduleDAGInstrs: Rework schedule graph builder.

The new algorithm remembers the uses encountered while walking backwards
until a matching def is found. Contrary to the previous version this:
- Works without LiveIntervals being available
- Allows to increase the precision to subregisters/lanemasks
  (not used for now)

The changes in the AMDGPU tests are necessary because the R600 scheduler
is not stable with respect to the order of nodes in the ready queues.

Differential Revision: http://reviews.llvm.org/D9068

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254577 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/ScheduleDAGInstrs.h      |  40 +++-
 lib/CodeGen/ScheduleDAGInstrs.cpp             | 225 +++++++++++++-----
 test/CodeGen/AMDGPU/image-attributes.ll       |  20 +-
 test/CodeGen/AMDGPU/literals.ll               |   8 +-
 .../AMDGPU/llvm.AMDGPU.read.workdim.ll        |   2 +-
 test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll      |   2 +-
 .../AMDGPU/llvm.r600.read.local.size.ll       |   6 +-
 test/CodeGen/AMDGPU/or.ll                     |   2 +-
 test/CodeGen/AMDGPU/set-dx10.ll               |  48 ++--
 test/CodeGen/AMDGPU/sext-in-reg.ll            |   4 +-
 test/CodeGen/AMDGPU/shl.ll                    |  12 +-
 test/CodeGen/AMDGPU/sra.ll                    |   8 +-
 test/CodeGen/AMDGPU/srl.ll                    |  10 +-
 test/CodeGen/AMDGPU/unsupported-cc.ll         |  32 +--
 test/CodeGen/AMDGPU/work-item-intrinsics.ll   |  12 +-
 test/CodeGen/AMDGPU/xor.ll                    |   2 +-
 16 files changed, 277 insertions(+), 156 deletions(-)

diff --git a/include/llvm/CodeGen/ScheduleDAGInstrs.h b/include/llvm/CodeGen/ScheduleDAGInstrs.h
index 1446f2ac082b..c715e0f79205 100644
--- a/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -33,15 +33,26 @@ namespace llvm {
   /// An individual mapping from virtual register number to SUnit.
   struct VReg2SUnit {
     unsigned VirtReg;
+    LaneBitmask LaneMask;
     SUnit *SU;
 
-    VReg2SUnit(unsigned reg, SUnit *su): VirtReg(reg), SU(su) {}
+    VReg2SUnit(unsigned VReg, LaneBitmask LaneMask, SUnit *SU)
+      : VirtReg(VReg), LaneMask(LaneMask), SU(SU) {}
 
     unsigned getSparseSetIndex() const {
       return TargetRegisterInfo::virtReg2Index(VirtReg);
     }
   };
 
+  /// Mapping from virtual register to SUnit including an operand index.
+  struct VReg2SUnitOperIdx : public VReg2SUnit {
+    unsigned OperandIndex;
+
+    VReg2SUnitOperIdx(unsigned VReg, LaneBitmask LaneMask,
+                      unsigned OperandIndex, SUnit *SU)
+      : VReg2SUnit(VReg, LaneMask, SU), OperandIndex(OperandIndex) {}
+  };
+
   /// Record a physical register access.
   /// For non-data-dependent uses, OpIdx == -1.
   struct PhysRegSUOper {
@@ -69,7 +80,10 @@ namespace llvm {
   /// Track local uses of virtual registers. These uses are gathered by the DAG
   /// builder and may be consulted by the scheduler to avoid iterating an entire
   /// vreg use list.
-  typedef SparseMultiSet<VReg2SUnit, VirtReg2IndexFunctor> VReg2UseMap;
+  typedef SparseMultiSet<VReg2SUnit, VirtReg2IndexFunctor> VReg2SUnitMultiMap;
+
+  typedef SparseMultiSet<VReg2SUnitOperIdx, VirtReg2IndexFunctor>
+    VReg2SUnitOperIdxMultiMap;
 
   /// ScheduleDAGInstrs - A ScheduleDAG subclass for scheduling lists of
   /// MachineInstrs.
@@ -95,6 +109,9 @@ namespace llvm {
     /// it has taken responsibility for scheduling the terminator correctly.
     bool CanHandleTerminators;
 
+    /// Whether lane masks should get tracked.
+    bool TrackLaneMasks;
+
     /// State specific to the current scheduling region.
     /// ------------------------------------------------
 
@@ -117,7 +134,7 @@ namespace llvm {
     /// After calling BuildSchedGraph, each vreg used in the scheduling region
     /// is mapped to a set of SUnits. These include all local vreg uses, not
     /// just the uses for a singly defined vreg.
-    VReg2UseMap VRegUses;
+    VReg2SUnitMultiMap VRegUses;
 
     /// State internal to DAG building.
     /// -------------------------------
@@ -129,8 +146,12 @@ namespace llvm {
     Reg2SUnitsMap Defs;
     Reg2SUnitsMap Uses;
 
-    /// Track the last instruction in this region defining each virtual register.
-    VReg2SUnitMap VRegDefs;
+    /// Tracks the last instruction(s) in this region defining each virtual
+    /// register. There may be multiple current definitions for a register with
+    /// disjunct lanemasks.
+    VReg2SUnitMultiMap CurrentVRegDefs;
+    /// Tracks the last instructions in this region using each virtual register.
+    VReg2SUnitOperIdxMultiMap CurrentVRegUses;
 
     /// PendingLoads - Remember where unknown loads are after the most recent
     /// unknown store, as we iterate. As with Defs and Uses, this is here
@@ -200,7 +221,8 @@ namespace llvm {
     /// input.
     void buildSchedGraph(AliasAnalysis *AA,
                          RegPressureTracker *RPTracker = nullptr,
-                         PressureDiffs *PDiffs = nullptr);
+                         PressureDiffs *PDiffs = nullptr,
+                         bool TrackLaneMasks = false);
 
     /// addSchedBarrierDeps - Add dependencies from instructions in the current
     /// list of instructions being scheduled to scheduling barrier. We want to
@@ -247,6 +269,12 @@ namespace llvm {
     /// Other adjustments may be made to the instruction if necessary. Return
     /// true if the operand has been deleted, false if not.
     bool toggleKillFlag(MachineInstr *MI, MachineOperand &MO);
+
+    /// Returns a mask for which lanes get read/written by the given (register)
+    /// machine operand.
+    LaneBitmask getLaneMaskForMO(const MachineOperand &MO) const;
+
+    void collectVRegUses(SUnit *SU);
   };
 
   /// newSUnit - Creates a new SUnit and return a ptr to it.
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index 12b2beb357b4..2ef02deebfbd 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -13,12 +13,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/ADT/IntEqClasses.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/ValueTracking.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -55,7 +55,7 @@ ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
                                      bool RemoveKillFlags)
     : ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()), LIS(LIS),
       RemoveKillFlags(RemoveKillFlags), CanHandleTerminators(false),
-      FirstDbgValue(nullptr) {
+      TrackLaneMasks(false), FirstDbgValue(nullptr) {
   DbgValues.clear();
 
   const TargetSubtargetInfo &ST = mf.getSubtarget();
@@ -363,6 +363,20 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
   }
 }
 
+LaneBitmask ScheduleDAGInstrs::getLaneMaskForMO(const MachineOperand &MO) const
+{
+  unsigned Reg = MO.getReg();
+  // No point in tracking lanemasks if we don't have interesting subregisters.
+  const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
+  if (!RC.HasDisjunctSubRegs)
+    return ~0u;
+
+  unsigned SubReg = MO.getSubReg();
+  if (SubReg == 0)
+    return RC.getLaneMask();
+  return TRI->getSubRegIndexLaneMask(SubReg);
+}
+
 /// addVRegDefDeps - Add register output and data dependencies from this SUnit
 /// to instructions that occur later in the same scheduling region if they read
 /// from or write to the virtual register defined at OperIdx.
@@ -370,35 +384,106 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
 /// TODO: Hoist loop induction variable increments. This has to be
 /// reevaluated. Generally, IV scheduling should be done before coalescing.
 void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {
-  const MachineInstr *MI = SU->getInstr();
-  unsigned Reg = MI->getOperand(OperIdx).getReg();
+  MachineInstr *MI = SU->getInstr();
+  MachineOperand &MO = MI->getOperand(OperIdx);
+  unsigned Reg = MO.getReg();
+
+  LaneBitmask DefLaneMask;
+  LaneBitmask KillLaneMask;
+  if (TrackLaneMasks) {
+    bool IsKill = MO.getSubReg() == 0 || MO.isUndef();
+    DefLaneMask = getLaneMaskForMO(MO);
+    // If we have a <read-undef> flag, none of the lane values comes from an
+    // earlier instruction.
+    KillLaneMask = IsKill ? ~0u : DefLaneMask;
+
+    // Clear undef flag, we'll re-add it later once we know which subregister
+    // Def is first.
+    MO.setIsUndef(false);
+  } else {
+    DefLaneMask = ~0u;
+    KillLaneMask = ~0u;
+  }
+
+  if (MO.isDead()) {
+    assert(CurrentVRegUses.find(Reg) == CurrentVRegUses.end() &&
+           "Dead defs should have no uses");
+  } else {
+    // Add data dependence to all uses we found so far.
+    const TargetSubtargetInfo &ST = MF.getSubtarget();
+    for (VReg2SUnitOperIdxMultiMap::iterator I = CurrentVRegUses.find(Reg),
+         E = CurrentVRegUses.end(); I != E; /*empty*/) {
+      LaneBitmask LaneMask = I->LaneMask;
+      // Ignore uses of other lanes.
+      if ((LaneMask & KillLaneMask) == 0) {
+        ++I;
+        continue;
+      }
 
-  // Singly defined vregs do not have output/anti dependencies.
-  // The current operand is a def, so we have at least one.
-  // Check here if there are any others...
+      if ((LaneMask & DefLaneMask) != 0) {
+        SUnit *UseSU = I->SU;
+        MachineInstr *Use = UseSU->getInstr();
+        SDep Dep(SU, SDep::Data, Reg);
+        Dep.setLatency(SchedModel.computeOperandLatency(MI, OperIdx, Use,
+                                                        I->OperandIndex));
+        ST.adjustSchedDependency(SU, UseSU, Dep);
+        UseSU->addPred(Dep);
+      }
+
+      LaneMask &= ~KillLaneMask;
+      // If we found a Def for all lanes of this use, remove it from the list.
+      if (LaneMask != 0) {
+        I->LaneMask = LaneMask;
+        ++I;
+      } else
+        I = CurrentVRegUses.erase(I);
+    }
+  }
+
+  // Shortcut: Singly defined vregs do not have output/anti dependencies.
   if (MRI.hasOneDef(Reg))
     return;
 
-  // Add output dependence to the next nearest def of this vreg.
+  // Add output dependence to the next nearest defs of this vreg.
   //
   // Unless this definition is dead, the output dependence should be
   // transitively redundant with antidependencies from this definition's
   // uses. We're conservative for now until we have a way to guarantee the uses
   // are not eliminated sometime during scheduling. The output dependence edge
   // is also useful if output latency exceeds def-use latency.
-  VReg2SUnitMap::iterator DefI = VRegDefs.find(Reg);
-  if (DefI == VRegDefs.end())
-    VRegDefs.insert(VReg2SUnit(Reg, SU));
-  else {
-    SUnit *DefSU = DefI->SU;
-    if (DefSU != SU && DefSU != &ExitSU) {
-      SDep Dep(SU, SDep::Output, Reg);
-      Dep.setLatency(
-        SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr()));
-      DefSU->addPred(Dep);
-    }
-    DefI->SU = SU;
+  LaneBitmask LaneMask = DefLaneMask;
+  for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg),
+                                     CurrentVRegDefs.end())) {
+    // Ignore defs for other lanes.
+    if ((V2SU.LaneMask & LaneMask) == 0)
+      continue;
+    // Add an output dependence.
+    SUnit *DefSU = V2SU.SU;
+    // Ignore additional defs of the same lanes in one instruction. This can
+    // happen because lanemasks are shared for targets with too many
+    // subregisters. We also use some representration tricks/hacks where we
+    // add super-register defs/uses, to imply that although we only access parts
+    // of the reg we care about the full one.
+    if (DefSU == SU)
+      continue;
+    SDep Dep(SU, SDep::Output, Reg);
+    Dep.setLatency(
+      SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr()));
+    DefSU->addPred(Dep);
+
+    // Update current definition. This can get tricky if the def was about a
+    // bigger lanemask before. We then have to shrink it and create a new
+    // VReg2SUnit for the non-overlapping part.
+    LaneBitmask OverlapMask = V2SU.LaneMask & LaneMask;
+    LaneBitmask NonOverlapMask = V2SU.LaneMask & ~LaneMask;
+    if (NonOverlapMask != 0)
+      CurrentVRegDefs.insert(VReg2SUnit(Reg, NonOverlapMask, V2SU.SU));
+    V2SU.SU = SU;
+    V2SU.LaneMask = OverlapMask;
   }
+  // If there was no CurrentVRegDefs entry for some lanes yet, create one.
+  if (LaneMask != 0)
+    CurrentVRegDefs.insert(VReg2SUnit(Reg, LaneMask, SU));
 }
 
 /// addVRegUseDeps - Add a register data dependency if the instruction that
@@ -408,49 +493,26 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {
 ///
 /// TODO: Handle ExitSU "uses" properly.
 void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
-  MachineInstr *MI = SU->getInstr();
-  unsigned Reg = MI->getOperand(OperIdx).getReg();
+  const MachineInstr *MI = SU->getInstr();
+  const MachineOperand &MO = MI->getOperand(OperIdx);
+  unsigned Reg = MO.getReg();
+
+  // Remember the use. Data dependencies will be added when we find the def.
+  LaneBitmask LaneMask = TrackLaneMasks ? getLaneMaskForMO(MO) : ~0u;
+  CurrentVRegUses.insert(VReg2SUnitOperIdx(Reg, LaneMask, OperIdx, SU));
+
+  // Add antidependences to the following defs of the vreg.
+  for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg),
+                                     CurrentVRegDefs.end())) {
+    // Ignore defs for unrelated lanes.
+    LaneBitmask PrevDefLaneMask = V2SU.LaneMask;
+    if ((PrevDefLaneMask & LaneMask) == 0)
+      continue;
+    if (V2SU.SU == SU)
+      continue;
 
-  // Record this local VReg use.
-  VReg2UseMap::iterator UI = VRegUses.find(Reg);
-  for (; UI != VRegUses.end(); ++UI) {
-    if (UI->SU == SU)
-      break;
-  }
-  if (UI == VRegUses.end())
-    VRegUses.insert(VReg2SUnit(Reg, SU));
-
-  // Lookup this operand's reaching definition.
-  assert(LIS && "vreg dependencies requires LiveIntervals");
-  LiveQueryResult LRQ
-    = LIS->getInterval(Reg).Query(LIS->getInstructionIndex(MI));
-  VNInfo *VNI = LRQ.valueIn();
-
-  // VNI will be valid because MachineOperand::readsReg() is checked by caller.
-  assert(VNI && "No value to read by operand");
-  MachineInstr *Def = LIS->getInstructionFromIndex(VNI->def);
-  // Phis and other noninstructions (after coalescing) have a NULL Def.
-  if (Def) {
-    SUnit *DefSU = getSUnit(Def);
-    if (DefSU) {
-      // The reaching Def lives within this scheduling region.
-      // Create a data dependence.
-      SDep dep(DefSU, SDep::Data, Reg);
-      // Adjust the dependence latency using operand def/use information, then
-      // allow the target to perform its own adjustments.
-      int DefOp = Def->findRegisterDefOperandIdx(Reg);
-      dep.setLatency(SchedModel.computeOperandLatency(Def, DefOp, MI, OperIdx));
-
-      const TargetSubtargetInfo &ST = MF.getSubtarget();
-      ST.adjustSchedDependency(DefSU, SU, const_cast<SDep &>(dep));
-      SU->addPred(dep);
-    }
+    V2SU.SU->addPred(SDep(SU, SDep::Anti, Reg));
   }
-
-  // Add antidependence to the following def of the vreg it uses.
-  VReg2SUnitMap::iterator DefI = VRegDefs.find(Reg);
-  if (DefI != VRegDefs.end() && DefI->SU != SU)
-    DefI->SU->addPred(SDep(SU, SDep::Anti, Reg));
 }
 
 /// Return true if MI is an instruction we are unable to reason about
@@ -733,17 +795,42 @@ void ScheduleDAGInstrs::initSUnits() {
   }
 }
 
+void ScheduleDAGInstrs::collectVRegUses(SUnit *SU) {
+  const MachineInstr *MI = SU->getInstr();
+  for (const MachineOperand &MO : MI->operands()) {
+    if (!MO.isReg())
+      continue;
+    if (!MO.isUse() && (MO.getSubReg() == 0 || !TrackLaneMasks))
+      continue;
+
+    unsigned Reg = MO.getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      continue;
+
+    // Record this local VReg use.
+    VReg2SUnitMultiMap::iterator UI = VRegUses.find(Reg);
+    for (; UI != VRegUses.end(); ++UI) {
+      if (UI->SU == SU)
+        break;
+    }
+    if (UI == VRegUses.end())
+      VRegUses.insert(VReg2SUnit(Reg, 0, SU));
+  }
+}
+
 /// If RegPressure is non-null, compute register pressure as a side effect. The
 /// DAG builder is an efficient place to do it because it already visits
 /// operands.
 void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
                                         RegPressureTracker *RPTracker,
-                                        PressureDiffs *PDiffs) {
+                                        PressureDiffs *PDiffs,
+                                        bool TrackLaneMasks) {
   const TargetSubtargetInfo &ST = MF.getSubtarget();
   bool UseAA = EnableAASchedMI.getNumOccurrences() > 0 ? EnableAASchedMI
                                                        : ST.useAA();
   AliasAnalysis *AAForDep = UseAA ? AA : nullptr;
 
+  this->TrackLaneMasks = TrackLaneMasks;
   MISUnitMap.clear();
   ScheduleDAG::clearDAG();
 
@@ -777,10 +864,14 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
   Defs.setUniverse(TRI->getNumRegs());
   Uses.setUniverse(TRI->getNumRegs());
 
-  assert(VRegDefs.empty() && "Only BuildSchedGraph may access VRegDefs");
+  assert(CurrentVRegDefs.empty() && "nobody else should use CurrentVRegDefs");
+  assert(CurrentVRegUses.empty() && "nobody else should use CurrentVRegUses");
+  unsigned NumVirtRegs = MRI.getNumVirtRegs();
+  CurrentVRegDefs.setUniverse(NumVirtRegs);
+  CurrentVRegUses.setUniverse(NumVirtRegs);
+
   VRegUses.clear();
-  VRegDefs.setUniverse(MRI.getNumVirtRegs());
-  VRegUses.setUniverse(MRI.getNumVirtRegs());
+  VRegUses.setUniverse(NumVirtRegs);
 
   // Model data dependencies between instructions being scheduled and the
   // ExitSU.
@@ -808,6 +899,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
       RPTracker->recede(/*LiveUses=*/nullptr, PDiff);
       assert(RPTracker->getPos() == std::prev(MII) &&
              "RPTracker can't find MI");
+      collectVRegUses(SU);
     }
 
     assert(
@@ -1057,7 +1149,8 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
 
   Defs.clear();
   Uses.clear();
-  VRegDefs.clear();
+  CurrentVRegDefs.clear();
+  CurrentVRegUses.clear();
   PendingLoads.clear();
 }
 
diff --git a/test/CodeGen/AMDGPU/image-attributes.ll b/test/CodeGen/AMDGPU/image-attributes.ll
index 7a5a7346865f..5906b2f15709 100644
--- a/test/CodeGen/AMDGPU/image-attributes.ll
+++ b/test/CodeGen/AMDGPU/image-attributes.ll
@@ -6,7 +6,7 @@
 
 ; FUNC-LABEL: {{^}}width_2d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV [[VAL]], KC0[2].Z
+; EG: MOV * [[VAL]], KC0[2].Z
 define void @width_2d (%opencl.image2d_t addrspace(1)* %in,
                        i32 addrspace(1)* %out) {
 entry:
@@ -19,7 +19,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}width_3d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV [[VAL]], KC0[2].Z
+; EG: MOV * [[VAL]], KC0[2].Z
 define void @width_3d (%opencl.image3d_t addrspace(1)* %in,
                        i32 addrspace(1)* %out) {
 entry:
@@ -36,7 +36,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}height_2d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV [[VAL]], KC0[2].W
+; EG: MOV * [[VAL]], KC0[2].W
 define void @height_2d (%opencl.image2d_t addrspace(1)* %in,
                         i32 addrspace(1)* %out) {
 entry:
@@ -49,7 +49,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}height_3d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV [[VAL]], KC0[2].W
+; EG: MOV * [[VAL]], KC0[2].W
 define void @height_3d (%opencl.image3d_t addrspace(1)* %in,
                         i32 addrspace(1)* %out) {
 entry:
@@ -66,7 +66,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}depth_3d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV [[VAL]], KC0[3].X
+; EG: MOV * [[VAL]], KC0[3].X
 define void @depth_3d (%opencl.image3d_t addrspace(1)* %in,
                        i32 addrspace(1)* %out) {
 entry:
@@ -83,7 +83,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}data_type_2d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV [[VAL]], KC0[3].Y
+; EG: MOV * [[VAL]], KC0[3].Y
 define void @data_type_2d (%opencl.image2d_t addrspace(1)* %in,
                            i32 addrspace(1)* %out) {
 entry:
@@ -96,7 +96,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}data_type_3d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV [[VAL]], KC0[3].Y
+; EG: MOV * [[VAL]], KC0[3].Y
 define void @data_type_3d (%opencl.image3d_t addrspace(1)* %in,
                                      i32 addrspace(1)* %out) {
 entry:
@@ -113,7 +113,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}channel_order_2d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV [[VAL]], KC0[3].Z
+; EG: MOV * [[VAL]], KC0[3].Z
 define void @channel_order_2d (%opencl.image2d_t addrspace(1)* %in,
                                i32 addrspace(1)* %out) {
 entry:
@@ -126,7 +126,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}channel_order_3d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV [[VAL]], KC0[3].Z
+; EG: MOV * [[VAL]], KC0[3].Z
 define void @channel_order_3d (%opencl.image3d_t addrspace(1)* %in,
                                          i32 addrspace(1)* %out) {
 entry:
@@ -145,7 +145,7 @@ entry:
 ;
 ; FUNC-LABEL: {{^}}image_arg_2nd:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV [[VAL]], KC0[4].Z
+; EG: MOV * [[VAL]], KC0[4].Z
 define void @image_arg_2nd (%opencl.image3d_t addrspace(1)* %in1,
                             i32 %x,
                             %opencl.image2d_t addrspace(1)* %in2,
diff --git a/test/CodeGen/AMDGPU/literals.ll b/test/CodeGen/AMDGPU/literals.ll
index cff1c24f89d6..9d2320cb2d19 100644
--- a/test/CodeGen/AMDGPU/literals.ll
+++ b/test/CodeGen/AMDGPU/literals.ll
@@ -7,8 +7,8 @@
 ; ADD_INT literal.x KC0[2].Z, 5
 
 ; CHECK: {{^}}i32_literal:
-; CHECK: ADD_INT {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
-; CHECK-NEXT: LSHR
+; CHECK: LSHR
+; CHECK-NEXT: ADD_INT * {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.y
 ; CHECK-NEXT: 5
 define void @i32_literal(i32 addrspace(1)* %out, i32 %in) {
 entry:
@@ -24,8 +24,8 @@ entry:
 ; ADD literal.x KC0[2].Z, 5.0
 
 ; CHECK: {{^}}float_literal:
-; CHECK: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
-; CHECK-NEXT: LSHR
+; CHECK: LSHR
+; CHECK-NEXT: ADD * {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.y
 ; CHECK-NEXT: 1084227584(5.0
 define void @float_literal(float addrspace(1)* %out, float %in) {
 entry:
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll
index 6dc9d050eee6..2e299e30b8c7 100644
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll
@@ -4,7 +4,7 @@
 
 ; FUNC-LABEL: {{^}}read_workdim:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV [[VAL]], KC0[2].Z
+; EG: MOV * [[VAL]], KC0[2].Z
 
 ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll
index 74792e50017f..a30a8e083eb6 100644
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll
@@ -3,7 +3,7 @@
 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s
 
 ; R600: {{^}}amdgpu_trunc:
-; R600: TRUNC T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; R600: TRUNC {{\*? *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
 ; SI: {{^}}amdgpu_trunc:
 ; SI: v_trunc_f32
 
diff --git a/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
index f2a7256e812d..13ebee41e844 100644
--- a/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
+++ b/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
@@ -5,7 +5,7 @@
 
 ; FUNC-LABEL: {{^}}local_size_x:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV [[VAL]], KC0[1].Z
+; EG: MOV * [[VAL]], KC0[1].Z
 
 ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6
 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18
@@ -23,7 +23,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}local_size_y:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV [[VAL]], KC0[1].W
+; EG: MOV * [[VAL]], KC0[1].W
 
 ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7
 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c
@@ -38,7 +38,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}local_size_z:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV [[VAL]], KC0[2].X
+; EG: MOV * [[VAL]], KC0[2].X
 
 ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
diff --git a/test/CodeGen/AMDGPU/or.ll b/test/CodeGen/AMDGPU/or.ll
index 1c04090b407f..e40f18f040b7 100644
--- a/test/CodeGen/AMDGPU/or.ll
+++ b/test/CodeGen/AMDGPU/or.ll
@@ -153,7 +153,7 @@ define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
 }
 
 ; FUNC-LABEL: {{^}}or_i1:
-; EG: OR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}}
+; EG: OR_INT * {{\** *}}T{{[0-9]+\.[XYZW], PS, PV\.[XYZW]}}
 
 ; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}]
 define void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
diff --git a/test/CodeGen/AMDGPU/set-dx10.ll b/test/CodeGen/AMDGPU/set-dx10.ll
index 53694dcffa66..57365a6e1fc3 100644
--- a/test/CodeGen/AMDGPU/set-dx10.ll
+++ b/test/CodeGen/AMDGPU/set-dx10.ll
@@ -5,8 +5,8 @@
 ; SET*DX10 instructions.
 
 ; CHECK: {{^}}fcmp_une_select_fptosi:
-; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
-; CHECK-NEXT: LSHR
+; CHECK: LSHR
+; CHECK-NEXT: SETNE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -19,8 +19,8 @@ entry:
 }
 
 ; CHECK: {{^}}fcmp_une_select_i32:
-; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
-; CHECK-NEXT: LSHR
+; CHECK: LSHR
+; CHECK-NEXT: SETNE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_une_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -31,8 +31,8 @@ entry:
 }
 
 ; CHECK: {{^}}fcmp_oeq_select_fptosi:
-; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
-; CHECK-NEXT: LSHR
+; CHECK: LSHR
+; CHECK-NEXT: SETE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_oeq_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -45,8 +45,8 @@ entry:
 }
 
 ; CHECK: {{^}}fcmp_oeq_select_i32:
-; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
-; CHECK-NEXT: LSHR
+; CHECK: LSHR
+; CHECK-NEXT: SETE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_oeq_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -57,8 +57,8 @@ entry:
 }
 
 ; CHECK: {{^}}fcmp_ogt_select_fptosi:
-; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
-; CHECK-NEXT: LSHR
+; CHECK: LSHR
+; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_ogt_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -71,8 +71,8 @@ entry:
 }
 
 ; CHECK: {{^}}fcmp_ogt_select_i32:
-; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
-; CHECK-NEXT: LSHR
+; CHECK: LSHR
+; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_ogt_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -83,8 +83,8 @@ entry:
 }
 
 ; CHECK: {{^}}fcmp_oge_select_fptosi:
-; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
-; CHECK-NEXT: LSHR
+; CHECK: LSHR
+; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_oge_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -97,8 +97,8 @@ entry:
 }
 
 ; CHECK: {{^}}fcmp_oge_select_i32:
-; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
-; CHECK-NEXT: LSHR
+; CHECK: LSHR
+; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_oge_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -109,8 +109,8 @@ entry:
 }
 
 ; CHECK: {{^}}fcmp_ole_select_fptosi:
-; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
-; CHECK-NEXT: LSHR
+; CHECK: LSHR
+; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_ole_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -123,8 +123,8 @@ entry:
 }
 
 ; CHECK: {{^}}fcmp_ole_select_i32:
-; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
-; CHECK-NEXT: LSHR
+; CHECK: LSHR
+; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_ole_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -135,8 +135,8 @@ entry:
 }
 
 ; CHECK: {{^}}fcmp_olt_select_fptosi:
-; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
-; CHECK-NEXT: LSHR
+; CHECK: LSHR
+; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_olt_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -149,8 +149,8 @@ entry:
 }
 
 ; CHECK: {{^}}fcmp_olt_select_i32:
-; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
-; CHECK-NEXT: LSHR
+; CHECK: LSHR
+; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_olt_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
diff --git a/test/CodeGen/AMDGPU/sext-in-reg.ll b/test/CodeGen/AMDGPU/sext-in-reg.ll
index 95fcfdbdecae..23ae3b967971 100644
--- a/test/CodeGen/AMDGPU/sext-in-reg.ll
+++ b/test/CodeGen/AMDGPU/sext-in-reg.ll
@@ -12,8 +12,8 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 ; SI: buffer_store_dword [[EXTRACT]],
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
-; EG: BFE_INT [[RES]], {{.*}}, 0.0, 1
-; EG-NEXT: LSHR * [[ADDR]]
+; EG: LSHR * [[ADDR]]
+; EG: BFE_INT * [[RES]], {{.*}}, 0.0, 1
 define void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) {
   %shl = shl i32 %in, 31
   %sext = ashr i32 %shl, 31
diff --git a/test/CodeGen/AMDGPU/shl.ll b/test/CodeGen/AMDGPU/shl.ll
index bf08e66f3304..55db80731c90 100644
--- a/test/CodeGen/AMDGPU/shl.ll
+++ b/test/CodeGen/AMDGPU/shl.ll
@@ -53,14 +53,14 @@ define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in
   ret void
 }
 
-;EG: {{^}}shl_i64:
+;EG-LABEL: {{^}}shl_i64:
 ;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
 ;EG: LSHR {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}
-;EG: LSHR {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
-;EG_CHECK-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
+;EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
+;EG-DAG: LSHR {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
 ;EG-DAG: LSHL {{\*? *}}[[HISMTMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], [[SHIFT]]
-;EG-DAG: OR_INT {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], {{[[HISMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}}
-;EG-DAG: LSHL {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], [[OPLO]], {{PS|[[SHIFT]]}}
+;EG-DAG: OR_INT {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], {{[[HISMTMP]]|PV.[XYZW]|PS}}, {{[[OVERF]]|PV.[XYZW]}}
+;EG-DAG: LSHL {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], [[OPLO]], {{PS|[[SHIFT]]|PV.[XYZW]}}
 ;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
 ;EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
 ;EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0
@@ -80,7 +80,7 @@ define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   ret void
 }
 
-;EG: {{^}}shl_v2i64:
+;EG-LABEL: {{^}}shl_v2i64:
 ;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
 ;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
 ;EG-DAG: LSHR {{\*? *}}[[COMPSHA]]
diff --git a/test/CodeGen/AMDGPU/sra.ll b/test/CodeGen/AMDGPU/sra.ll
index bcbc32f4c053..3b59bbfb18c0 100644
--- a/test/CodeGen/AMDGPU/sra.ll
+++ b/test/CodeGen/AMDGPU/sra.ll
@@ -70,11 +70,11 @@ entry:
 ;EG-LABEL: {{^}}ashr_i64_2:
 ;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
 ;EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}
-;EG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
-;EG_CHECK-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
+;EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
+;EG-DAG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
 ;EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]]
-;EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}}
-;EG-DAG: ASHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}}
+;EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]|PS}}, {{[[OVERF]]|PV.[XYZW]}}
+;EG-DAG: ASHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|PV.[XYZW]|[[SHIFT]]}}
 ;EG-DAG: ASHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal
 ;EG-DAG: ASHR {{\*? *}}[[HIBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal
 ;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
diff --git a/test/CodeGen/AMDGPU/srl.ll b/test/CodeGen/AMDGPU/srl.ll
index ebb2f2db252e..bbd954356322 100644
--- a/test/CodeGen/AMDGPU/srl.ll
+++ b/test/CodeGen/AMDGPU/srl.ll
@@ -65,14 +65,14 @@ define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i
 
 ; EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
 ; EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}
-; EG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
 ; EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
+; EG-DAG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
 ; EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]]
-; EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}}
-; EG-DAG: LSHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}}
-; EG-DAG: LSHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}}
+; EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]|PS}}, {{[[OVERF]]|PV.[XYZW]}}
+; EG-DAG: LSHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]|PV\.[XYZW]}}
 ; EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
-; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
+; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]|PS}}
+; EG-DAG: LSHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], [[SHIFT]]
 ; EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0
 define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
diff --git a/test/CodeGen/AMDGPU/unsupported-cc.ll b/test/CodeGen/AMDGPU/unsupported-cc.ll
index 8ab4faf2f145..d120111a71fb 100644
--- a/test/CodeGen/AMDGPU/unsupported-cc.ll
+++ b/test/CodeGen/AMDGPU/unsupported-cc.ll
@@ -3,8 +3,8 @@
 ; These tests are for condition codes that are not supported by the hardware
 
 ; CHECK-LABEL: {{^}}slt:
-; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
-; CHECK-NEXT: LSHR
+; CHECK: LSHR
+; CHECK-NEXT: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
 ; CHECK-NEXT: 5(7.006492e-45)
 define void @slt(i32 addrspace(1)* %out, i32 %in) {
 entry:
@@ -15,8 +15,8 @@ entry:
 }
 
 ; CHECK-LABEL: {{^}}ult_i32:
-; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
-; CHECK-NEXT: LSHR
+; CHECK: LSHR
+; CHECK-NEXT: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
 ; CHECK-NEXT: 5(7.006492e-45)
 define void @ult_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
@@ -40,8 +40,8 @@ entry:
 }
 
 ; CHECK-LABEL: {{^}}ult_float_native:
-; CHECK: SETGE T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
-; CHECK-NEXT: LSHR *
+; CHECK: LSHR
+; CHECK-NEXT: SETGE {{\*? *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, {{literal\.[xy]}}
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @ult_float_native(float addrspace(1)* %out, float %in) {
 entry:
@@ -52,8 +52,8 @@ entry:
 }
 
 ; CHECK-LABEL: {{^}}olt:
-; CHECK: SETGT T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
-; CHECK-NEXT: LSHR *
+; CHECK: LSHR
+; CHECK-NEXT: SETGT {{\*? *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @olt(float addrspace(1)* %out, float %in) {
 entry:
@@ -64,8 +64,8 @@ entry:
 }
 
 ; CHECK-LABEL: {{^}}sle:
-; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
-; CHECK-NEXT: LSHR
+; CHECK: LSHR
+; CHECK-NEXT: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
 ; CHECK-NEXT: 6(8.407791e-45)
 define void @sle(i32 addrspace(1)* %out, i32 %in) {
 entry:
@@ -76,8 +76,8 @@ entry:
 }
 
 ; CHECK-LABEL: {{^}}ule_i32:
-; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
-; CHECK-NEXT: LSHR
+; CHECK: LSHR
+; CHECK-NEXT: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
 ; CHECK-NEXT: 6(8.407791e-45)
 define void @ule_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
@@ -101,8 +101,8 @@ entry:
 }
 
 ; CHECK-LABEL: {{^}}ule_float_native:
-; CHECK: SETGT T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
-; CHECK-NEXT: LSHR *
+; CHECK: LSHR
+; CHECK-NEXT: SETGT {{\*? *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, {{literal\.[xy]}}
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @ule_float_native(float addrspace(1)* %out, float %in) {
 entry:
@@ -113,8 +113,8 @@ entry:
 }
 
 ; CHECK-LABEL: {{^}}ole:
-; CHECK: SETGE T{{[0-9]\.[XYZW]}}, literal.x, KC0[2].Z
-; CHECK-NEXT: LSHR *
+; CHECK: LSHR
+; CHECK-NEXT: SETGE {{\*? *}}T{{[0-9]\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
 ; CHECK-NEXT:1084227584(5.000000e+00)
 define void @ole(float addrspace(1)* %out, float %in) {
 entry:
diff --git a/test/CodeGen/AMDGPU/work-item-intrinsics.ll b/test/CodeGen/AMDGPU/work-item-intrinsics.ll
index a704a23b0f92..f420ec9c7d23 100644
--- a/test/CodeGen/AMDGPU/work-item-intrinsics.ll
+++ b/test/CodeGen/AMDGPU/work-item-intrinsics.ll
@@ -7,7 +7,7 @@
 
 ; FUNC-LABEL: {{^}}ngroups_x:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV [[VAL]], KC0[0].X
+; EG: MOV {{\*? *}}[[VAL]], KC0[0].X
 
 ; HSA: .amd_kernel_code_t
 
@@ -38,7 +38,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}ngroups_y:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV [[VAL]], KC0[0].Y
+; EG: MOV {{\*? *}}[[VAL]], KC0[0].Y
 
 ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1
 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
@@ -53,7 +53,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}ngroups_z:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV [[VAL]], KC0[0].Z
+; EG: MOV {{\*? *}}[[VAL]], KC0[0].Z
 
 ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2
 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
@@ -68,7 +68,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}global_size_x:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV [[VAL]], KC0[0].W
+; EG: MOV {{\*? *}}[[VAL]], KC0[0].W
 
 ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3
 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc
@@ -83,7 +83,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}global_size_y:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV [[VAL]], KC0[1].X
+; EG: MOV {{\*? *}}[[VAL]], KC0[1].X
 
 ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10
@@ -98,7 +98,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}global_size_z:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV [[VAL]], KC0[1].Y
+; EG: MOV {{\*? *}}[[VAL]], KC0[1].Y
 
 ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5
 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14
diff --git a/test/CodeGen/AMDGPU/xor.ll b/test/CodeGen/AMDGPU/xor.ll
index ddb920af29d8..655655d92f08 100644
--- a/test/CodeGen/AMDGPU/xor.ll
+++ b/test/CodeGen/AMDGPU/xor.ll
@@ -38,7 +38,7 @@ define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in
 }
 
 ; FUNC-LABEL: {{^}}xor_i1:
-; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}}
+; EG: XOR_INT {{\** *}}{{T[0-9]+\.[XYZW]}}, {{PS|PV\.[XYZW]}}, {{PS|PV\.[XYZW]}}
 
 ; SI-DAG: v_cmp_le_f32_e32 [[CMP0:vcc]], 0, {{v[0-9]+}}
 ; SI-DAG: v_cmp_le_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 1.0, {{v[0-9]+}}

From c0ebe8e8427c642b7bac0417fc5be3022f5b625d Mon Sep 17 00:00:00 2001
From: Joerg Sonnenberger <joerg@bec.de>
Date: Thu, 3 Dec 2015 02:35:24 +0000
Subject: [PATCH 177/186] Add a TODO item that the nop handling before FP
 conditional branches is not enough for SPARCv7.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254580 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Sparc/DelaySlotFiller.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp
index 38bff44e7542..c689b7f7201e 100644
--- a/lib/Target/Sparc/DelaySlotFiller.cpp
+++ b/lib/Target/Sparc/DelaySlotFiller.cpp
@@ -122,6 +122,8 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
       continue;
     }
 
+    // TODO: If we ever want to support v7, this needs to be extended
+    // to cover all floating point operations.
     if (!Subtarget->isV9() &&
         (MI->getOpcode() == SP::FCMPS || MI->getOpcode() == SP::FCMPD
          || MI->getOpcode() == SP::FCMPQ)) {

From d4c7e117aa0ff5868dee5ef18cfa5d3e56168361 Mon Sep 17 00:00:00 2001
From: Mehdi Amini <mehdi.amini@apple.com>
Date: Thu, 3 Dec 2015 02:37:23 +0000
Subject: [PATCH 178/186] Remove "ExportingModule" from ThinLTO Index (NFC)

There is no real reason the index has to have the concept of an
exporting Module. We should be able to have one single unique
instance of the Index, and it should be read-only after creation
for the whole ThinLTO processing.
The linker plugin should be able to process multiple modules (in
parallel or in sequence) with the same index.

The only reason the ExportingModule was present seems to be to
implement hasExportedFunctions() that is used by the Module linker
to decide what to do with the current Module.
For now I replaced it with a query to the map of Modules path to
see if this module was declared in the Index and consider that if
it is the case then it is probably exporting function.
On the long term the Linker interface needs to evolve and this
call should not be needed anymore.

From: Mehdi Amini <mehdi.amini@apple.com>

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254581 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Bitcode/ReaderWriter.h           |  4 +--
 include/llvm/IR/FunctionInfo.h                | 30 ++++---------------
 include/llvm/Object/FunctionIndexObjectFile.h |  5 ++--
 lib/Bitcode/Reader/BitcodeReader.cpp          |  5 ++--
 lib/Linker/LinkModules.cpp                    |  2 +-
 lib/Object/FunctionIndexObjectFile.cpp        | 10 +++----
 tools/llvm-link/llvm-link.cpp                 |  2 +-
 7 files changed, 16 insertions(+), 42 deletions(-)

diff --git a/include/llvm/Bitcode/ReaderWriter.h b/include/llvm/Bitcode/ReaderWriter.h
index 1c08ded1656a..3e127290f378 100644
--- a/include/llvm/Bitcode/ReaderWriter.h
+++ b/include/llvm/Bitcode/ReaderWriter.h
@@ -71,15 +71,13 @@ namespace llvm {
                           DiagnosticHandlerFunction DiagnosticHandler);
 
   /// Parse the specified bitcode buffer, returning the function info index.
-  /// If ExportingModule is true, check for functions in the index from this
-  /// module when the combined index is built during parsing and set flag.
   /// If IsLazy is true, parse the entire function summary into
   /// the index. Otherwise skip the function summary section, and only create
   /// an index object with a map from function name to function summary offset.
   /// The index is used to perform lazy function summary reading later.
   ErrorOr<std::unique_ptr<FunctionInfoIndex>> getFunctionInfoIndex(
       MemoryBufferRef Buffer, DiagnosticHandlerFunction DiagnosticHandler,
-      const Module *ExportingModule = nullptr, bool IsLazy = false);
+      bool IsLazy = false);
 
   /// This method supports lazy reading of function summary data from the
   /// combined index during function importing. When reading the combined index
diff --git a/include/llvm/IR/FunctionInfo.h b/include/llvm/IR/FunctionInfo.h
index b8801693ab51..eba088a61bc0 100644
--- a/include/llvm/IR/FunctionInfo.h
+++ b/include/llvm/IR/FunctionInfo.h
@@ -165,19 +165,8 @@ class FunctionInfoIndex {
   /// Holds strings for combined index, mapping to the corresponding module ID.
   ModulePathStringTableTy ModulePathStringTable;
 
-  /// The main module being compiled, that we are importing into, if applicable.
-  /// Used to check if any of its functions are in the index and therefore
-  /// potentially exported.
-  const Module *ExportingModule;
-
-  /// Flag indicating whether the exporting module has any functions in the
-  /// index and therefore potentially exported (imported into another module).
-  bool HasExportedFunctions;
-
 public:
-  FunctionInfoIndex(const Module *M = nullptr)
-      : ExportingModule(M), HasExportedFunctions(false){};
-  ~FunctionInfoIndex() = default;
+  FunctionInfoIndex() = default;
 
   // Disable the copy constructor and assignment operators, so
   // no unexpected copying/moving occurs.
@@ -201,14 +190,6 @@ class FunctionInfoIndex {
 
   /// Add a function info for a function of the given name.
   void addFunctionInfo(StringRef FuncName, std::unique_ptr<FunctionInfo> Info) {
-    // Update the HasExportedFunctions flag, but only if we had a function
-    // summary (i.e. we aren't parsing them lazily or have a bitcode file
-    // without a function summary section).
-    if (ExportingModule && Info->functionSummary()) {
-      if (ExportingModule->getModuleIdentifier() ==
-          Info->functionSummary()->modulePath())
-        HasExportedFunctions = true;
-    }
     FunctionMap[FuncName].push_back(std::move(Info));
   }
 
@@ -248,11 +229,10 @@ class FunctionInfoIndex {
   }
 
   /// Check if the given Module has any functions available for exporting
-  /// in the index.
-  bool hasExportedFunctions(const Module *M) const {
-    assert(M == ExportingModule &&
-           "Checking for exported functions on unexpected module");
-    return HasExportedFunctions;
+  /// in the index. We consider any module present in the ModulePathStringTable
+  /// to have exported functions.
+  bool hasExportedFunctions(const Module &M) const {
+    return ModulePathStringTable.count(M.getModuleIdentifier());
   }
 };
 
diff --git a/include/llvm/Object/FunctionIndexObjectFile.h b/include/llvm/Object/FunctionIndexObjectFile.h
index 511a237881ed..74b461dc7cc7 100644
--- a/include/llvm/Object/FunctionIndexObjectFile.h
+++ b/include/llvm/Object/FunctionIndexObjectFile.h
@@ -88,7 +88,7 @@ class FunctionIndexObjectFile : public SymbolicFile {
   /// summary/index.
   static ErrorOr<std::unique_ptr<FunctionIndexObjectFile>>
   create(MemoryBufferRef Object, DiagnosticHandlerFunction DiagnosticHandler,
-         const Module *ExportingModule = nullptr, bool IsLazy = false);
+         bool IsLazy = false);
 
   /// \brief Parse the function summary information for function with the
   /// given name out of the given buffer. Parsed information is
@@ -104,8 +104,7 @@ class FunctionIndexObjectFile : public SymbolicFile {
 /// index object if found, or nullptr if not.
 ErrorOr<std::unique_ptr<FunctionInfoIndex>>
 getFunctionIndexForFile(StringRef Path,
-                        DiagnosticHandlerFunction DiagnosticHandler,
-                        const Module *ExportingModule = nullptr);
+                        DiagnosticHandlerFunction DiagnosticHandler);
 }
 
 #endif
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index 11c9b131da70..e95aba771b9c 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -5991,12 +5991,11 @@ llvm::getBitcodeProducerString(MemoryBufferRef Buffer, LLVMContext &Context,
 ErrorOr<std::unique_ptr<FunctionInfoIndex>>
 llvm::getFunctionInfoIndex(MemoryBufferRef Buffer,
                            DiagnosticHandlerFunction DiagnosticHandler,
-                           const Module *ExportingModule, bool IsLazy) {
+                           bool IsLazy) {
   std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(Buffer, false);
   FunctionIndexBitcodeReader R(Buf.get(), DiagnosticHandler, IsLazy);
 
-  std::unique_ptr<FunctionInfoIndex> Index =
-      llvm::make_unique<FunctionInfoIndex>(ExportingModule);
+  auto Index = llvm::make_unique<FunctionInfoIndex>();
 
   auto cleanupOnError = [&](std::error_code EC) {
     R.releaseBuffer(); // Never take ownership on error.
diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index 21df66863e56..4f7405872726 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -435,7 +435,7 @@ class ModuleLinker {
     // backend compilation, and we need to see if it has functions that
     // may be exported to another backend compilation.
     if (ImportIndex && !ImportFunction)
-      HasExportedFunctions = ImportIndex->hasExportedFunctions(&SrcM);
+      HasExportedFunctions = ImportIndex->hasExportedFunctions(SrcM);
   }
 
   bool run();
diff --git a/lib/Object/FunctionIndexObjectFile.cpp b/lib/Object/FunctionIndexObjectFile.cpp
index 717c56bc9018..fe111de1a9c8 100644
--- a/lib/Object/FunctionIndexObjectFile.cpp
+++ b/lib/Object/FunctionIndexObjectFile.cpp
@@ -86,7 +86,7 @@ bool FunctionIndexObjectFile::hasFunctionSummaryInMemBuffer(
 ErrorOr<std::unique_ptr<FunctionIndexObjectFile>>
 FunctionIndexObjectFile::create(MemoryBufferRef Object,
                                 DiagnosticHandlerFunction DiagnosticHandler,
-                                const Module *ExportingModule, bool IsLazy) {
+                                bool IsLazy) {
   std::unique_ptr<FunctionInfoIndex> Index;
 
   ErrorOr<MemoryBufferRef> BCOrErr = findBitcodeInMemBuffer(Object);
@@ -94,7 +94,7 @@ FunctionIndexObjectFile::create(MemoryBufferRef Object,
     return BCOrErr.getError();
 
   ErrorOr<std::unique_ptr<FunctionInfoIndex>> IOrErr = getFunctionInfoIndex(
-      BCOrErr.get(), DiagnosticHandler, ExportingModule, IsLazy);
+      BCOrErr.get(), DiagnosticHandler, IsLazy);
 
   if (std::error_code EC = IOrErr.getError())
     return EC;
@@ -125,8 +125,7 @@ std::error_code FunctionIndexObjectFile::findFunctionSummaryInMemBuffer(
 // index object if found, or nullptr if not.
 ErrorOr<std::unique_ptr<FunctionInfoIndex>>
 llvm::getFunctionIndexForFile(StringRef Path,
-                              DiagnosticHandlerFunction DiagnosticHandler,
-                              const Module *ExportingModule) {
+                              DiagnosticHandlerFunction DiagnosticHandler) {
   ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
       MemoryBuffer::getFileOrSTDIN(Path);
   std::error_code EC = FileOrErr.getError();
@@ -134,8 +133,7 @@ llvm::getFunctionIndexForFile(StringRef Path,
     return EC;
   MemoryBufferRef BufferRef = (FileOrErr.get())->getMemBufferRef();
   ErrorOr<std::unique_ptr<object::FunctionIndexObjectFile>> ObjOrErr =
-      object::FunctionIndexObjectFile::create(BufferRef, DiagnosticHandler,
-                                              ExportingModule);
+      object::FunctionIndexObjectFile::create(BufferRef, DiagnosticHandler);
   EC = ObjOrErr.getError();
   if (EC)
     return EC;
diff --git a/tools/llvm-link/llvm-link.cpp b/tools/llvm-link/llvm-link.cpp
index 9a373c25cc5c..39034aaf672c 100644
--- a/tools/llvm-link/llvm-link.cpp
+++ b/tools/llvm-link/llvm-link.cpp
@@ -229,7 +229,7 @@ static bool linkFiles(const char *argv0, LLVMContext &Context, Linker &L,
     std::unique_ptr<FunctionInfoIndex> Index;
     if (!FunctionIndex.empty()) {
       ErrorOr<std::unique_ptr<FunctionInfoIndex>> IndexOrErr =
-          llvm::getFunctionIndexForFile(FunctionIndex, diagnosticHandler, &*M);
+          llvm::getFunctionIndexForFile(FunctionIndex, diagnosticHandler);
       std::error_code EC = IndexOrErr.getError();
       if (EC) {
         errs() << EC.message() << '\n';

From 8466beff264684e6b6c354a569a3c8a3ecb22490 Mon Sep 17 00:00:00 2001
From: Mehdi Amini <mehdi.amini@apple.com>
Date: Thu, 3 Dec 2015 02:37:30 +0000
Subject: [PATCH 179/186] Adapt comment and rename variable in ModuleLinker to
 describe more accurately the actual use.

Thanks Sean Silva for the suggestion.

From: Mehdi Amini <mehdi.amini@apple.com>

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254582 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Linker/Linker.h | 8 +++++---
 lib/Linker/LinkModules.cpp   | 8 ++++----
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/include/llvm/Linker/Linker.h b/include/llvm/Linker/Linker.h
index 7c24eef74111..0c7dc910a65c 100644
--- a/include/llvm/Linker/Linker.h
+++ b/include/llvm/Linker/Linker.h
@@ -74,15 +74,17 @@ class Linker {
   Module &getModule() const { return Composite; }
 
   /// \brief Link \p Src into the composite. The source is destroyed.
+  ///
   /// Passing OverrideSymbols as true will have symbols from Src
   /// shadow those in the Dest.
   /// For ThinLTO function importing/exporting the \p FunctionInfoIndex
-  /// is passed. If a \p FuncToImport is provided, only that single
-  /// function is imported from the source module.
+  /// is passed. If \p FunctionsToImport is provided, only the functions that
+  /// are part of the set will be imported from the source module.
+  ///
   /// Returns true on error.
   bool linkInModule(Module &Src, unsigned Flags = Flags::None,
                     const FunctionInfoIndex *Index = nullptr,
-                    DenseSet<const GlobalValue *> *FuncToImport = nullptr);
+                    DenseSet<const GlobalValue *> *FunctionsToImport = nullptr);
 
   static bool linkModules(Module &Dest, Module &Src,
                           DiagnosticHandlerFunction DiagnosticHandler,
diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index 4f7405872726..67613967f490 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -424,10 +424,10 @@ class ModuleLinker {
   ModuleLinker(Module &DstM, Linker::IdentifiedStructTypeSet &Set, Module &SrcM,
                DiagnosticHandlerFunction DiagnosticHandler, unsigned Flags,
                const FunctionInfoIndex *Index = nullptr,
-               DenseSet<const GlobalValue *> *FuncToImport = nullptr)
+               DenseSet<const GlobalValue *> *FunctionsToImport = nullptr)
       : DstM(DstM), SrcM(SrcM), TypeMap(Set), ValMaterializer(this),
         DiagnosticHandler(DiagnosticHandler), Flags(Flags), ImportIndex(Index),
-        ImportFunction(FuncToImport) {
+        ImportFunction(FunctionsToImport) {
     assert((ImportIndex || !ImportFunction) &&
            "Expect a FunctionInfoIndex when importing");
     // If we have a FunctionInfoIndex but no function to import,
@@ -2031,9 +2031,9 @@ Linker::Linker(Module &M)
 
 bool Linker::linkInModule(Module &Src, unsigned Flags,
                           const FunctionInfoIndex *Index,
-                          DenseSet<const GlobalValue *> *FuncToImport) {
+                          DenseSet<const GlobalValue *> *FunctionsToImport) {
   ModuleLinker TheLinker(Composite, IdentifiedStructTypes, Src,
-                         DiagnosticHandler, Flags, Index, FuncToImport);
+                         DiagnosticHandler, Flags, Index, FunctionsToImport);
   bool RetCode = TheLinker.run();
   Composite.dropTriviallyDeadConstantArrays();
   return RetCode;

From 690d7b2a89f76fd450f38564ce4e6732d853eadc Mon Sep 17 00:00:00 2001
From: Mehdi Amini <mehdi.amini@apple.com>
Date: Thu, 3 Dec 2015 02:37:33 +0000
Subject: [PATCH 180/186] Refactor FunctionImporter::importFunctions with a
 helper function to process the Worklist (NFC)

This precludes some more functional changes to perform bulk imports.

From: Mehdi Amini <mehdi.amini@apple.com>

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254583 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/IPO/FunctionImport.cpp | 74 ++++++++++++++++-----------
 1 file changed, 45 insertions(+), 29 deletions(-)

diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp
index 8230d64026c2..0187528153b5 100644
--- a/lib/Transforms/IPO/FunctionImport.cpp
+++ b/lib/Transforms/IPO/FunctionImport.cpp
@@ -66,7 +66,6 @@ static void findExternalCalls(const Function &F, StringSet<> &CalledFunctions,
   for (auto &BB : F) {
     for (auto &I : BB) {
       if (isa<CallInst>(I)) {
-        DEBUG(dbgs() << "Found a call: '" << I << "'\n");
         auto CalledFunction = cast<CallInst>(I).getCalledFunction();
         // Insert any new external calls that have not already been
         // added to set/worklist.
@@ -81,28 +80,14 @@ static void findExternalCalls(const Function &F, StringSet<> &CalledFunctions,
   }
 }
 
-// Automatically import functions in Module \p M based on the summaries index.
-//
-// The current implementation imports every called functions that exists in the
-// summaries index.
-bool FunctionImporter::importFunctions(Module &M) {
-
-  bool Changed = false;
-
-  /// First step is collecting the called external functions.
-  StringSet<> CalledFunctions;
-  SmallVector<StringRef, 64> Worklist;
-  for (auto &F : M) {
-    if (F.isDeclaration() || F.hasFnAttribute(Attribute::OptimizeNone))
-      continue;
-    findExternalCalls(F, CalledFunctions, Worklist);
-  }
-
-  /// Second step: for every call to an external function, try to import it.
-
-  // Linker that will be used for importing function
-  Linker L(M, DiagnosticHandler);
 
+// Helper function: given a worklist and an index, will process all the worklist
+// and import them based on the summary information
+static unsigned ProcessImportWorklist(Module &DestModule, SmallVector<StringRef, 64> &Worklist,
+                                      StringSet<> &CalledFunctions,
+                            Linker &TheLinker, const FunctionInfoIndex &Index,
+                            std::function<Module &(StringRef FileName)> &LazyModuleLoader) {
+  unsigned ImportCount = 0;
   while (!Worklist.empty()) {
     auto CalledFunctionName = Worklist.pop_back_val();
     DEBUG(dbgs() << "Process import for " << CalledFunctionName << "\n");
@@ -141,14 +126,14 @@ bool FunctionImporter::importFunctions(Module &M) {
                  << "\n");
 
     // Get the module for the import (potentially from the cache).
-    auto &Module = getLazyModule(FileName);
-    assert(&Module.getContext() == &M.getContext());
+    auto &Module = LazyModuleLoader(FileName);
+    assert(&Module.getContext() == &DestModule.getContext());
 
     // The function that we will import!
     GlobalValue *SGV = Module.getNamedValue(CalledFunctionName);
     StringRef ImportFunctionName = CalledFunctionName;
     if (!SGV) {
-      // Might be local in source Module, promoted/renamed in dest Module M.
+      // Might be local in source Module, promoted/renamed in DestModule.
       std::pair<StringRef, StringRef> Split =
           CalledFunctionName.split(".llvm.");
       SGV = Module.getNamedValue(Split.first);
@@ -184,21 +169,52 @@ bool FunctionImporter::importFunctions(Module &M) {
     // Link in the specified function.
     DenseSet<const GlobalValue *> FunctionsToImport;
     FunctionsToImport.insert(F);
-    if (L.linkInModule(Module, Linker::Flags::None, &Index,
+    if (TheLinker.linkInModule(Module, Linker::Flags::None, &Index,
                        &FunctionsToImport))
       report_fatal_error("Function Import: link error");
 
     // Process the newly imported function and add callees to the worklist.
-    GlobalValue *NewGV = M.getNamedValue(ImportFunctionName);
+    GlobalValue *NewGV = DestModule.getNamedValue(ImportFunctionName);
     assert(NewGV);
     Function *NewF = dyn_cast<Function>(NewGV);
     assert(NewF);
     findExternalCalls(*NewF, CalledFunctions, Worklist);
+    ++ImportCount;
+  }
+  return ImportCount;
+}
+
+// Automatically import functions in Module \p DestModule based on the summaries
+// index.
+//
+// The current implementation imports every called functions that exists in the
+// summaries index.
+bool FunctionImporter::importFunctions(Module &DestModule) {
+  DEBUG(errs() << "Starting import for Module " << DestModule.getModuleIdentifier()
+               << "\n");
+  unsigned ImportedCount = 0;
 
-    Changed = true;
+  /// First step is collecting the called external functions.
+  StringSet<> CalledFunctions;
+  SmallVector<StringRef, 64> Worklist;
+  for (auto &F : DestModule) {
+    if (F.isDeclaration() || F.hasFnAttribute(Attribute::OptimizeNone))
+      continue;
+    findExternalCalls(F, CalledFunctions, Worklist);
   }
+  if (Worklist.empty())
+    return false;
+
+  /// Second step: for every call to an external function, try to import it.
+
+  // Linker that will be used for importing function
+  Linker TheLinker(DestModule, DiagnosticHandler);
+
+  ImportedCount += ProcessImportWorklist(DestModule, Worklist, CalledFunctions, TheLinker, Index, getLazyModule );
 
-  return Changed;
+  DEBUG(errs() << "Imported " << ImportedCount << " functions for Module "
+               << DestModule.getModuleIdentifier() << "\n");
+  return ImportedCount;
 }
 
 /// Summary file to use for function importing when using -function-import from

From 2d8142d991ae58a7b3fc81c33ac1d18587676d96 Mon Sep 17 00:00:00 2001
From: Mehdi Amini <mehdi.amini@apple.com>
Date: Thu, 3 Dec 2015 02:40:39 +0000
Subject: [PATCH 181/186] Rename Set variable to be plural

Thanks Sean Silva for catching this.

From: Mehdi Amini <mehdi.amini@apple.com>

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254584 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-link/llvm-link.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/llvm-link/llvm-link.cpp b/tools/llvm-link/llvm-link.cpp
index 39034aaf672c..6f90f4056f70 100644
--- a/tools/llvm-link/llvm-link.cpp
+++ b/tools/llvm-link/llvm-link.cpp
@@ -198,10 +198,10 @@ static bool importFunctions(const char *argv0, LLVMContext &Context,
     }
 
     // Link in the specified function.
-    DenseSet<const GlobalValue *> FunctionToImport;
-    FunctionToImport.insert(F);
+    DenseSet<const GlobalValue *> FunctionsToImport;
+    FunctionsToImport.insert(F);
     if (L.linkInModule(*M, Linker::Flags::None, Index.get(),
-                       &FunctionToImport))
+                       &FunctionsToImport))
       return false;
   }
   return true;

From e3c21d3c167d31a4f94a64dff77624fdf1e4b984 Mon Sep 17 00:00:00 2001
From: Mehdi Amini <mehdi.amini@apple.com>
Date: Thu, 3 Dec 2015 02:58:14 +0000
Subject: [PATCH 182/186] clang-format FunctionImport after refactoring (NFC)

From: Mehdi Amini <mehdi.amini@apple.com>

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254585 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/IPO/FunctionImport.cpp | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp
index 0187528153b5..c2359a8a172e 100644
--- a/lib/Transforms/IPO/FunctionImport.cpp
+++ b/lib/Transforms/IPO/FunctionImport.cpp
@@ -80,13 +80,13 @@ static void findExternalCalls(const Function &F, StringSet<> &CalledFunctions,
   }
 }
 
-
 // Helper function: given a worklist and an index, will process all the worklist
 // and import them based on the summary information
-static unsigned ProcessImportWorklist(Module &DestModule, SmallVector<StringRef, 64> &Worklist,
-                                      StringSet<> &CalledFunctions,
-                            Linker &TheLinker, const FunctionInfoIndex &Index,
-                            std::function<Module &(StringRef FileName)> &LazyModuleLoader) {
+static unsigned ProcessImportWorklist(
+    Module &DestModule, SmallVector<StringRef, 64> &Worklist,
+    StringSet<> &CalledFunctions, Linker &TheLinker,
+    const FunctionInfoIndex &Index,
+    std::function<Module &(StringRef FileName)> &LazyModuleLoader) {
   unsigned ImportCount = 0;
   while (!Worklist.empty()) {
     auto CalledFunctionName = Worklist.pop_back_val();
@@ -170,7 +170,7 @@ static unsigned ProcessImportWorklist(Module &DestModule, SmallVector<StringRef,
     DenseSet<const GlobalValue *> FunctionsToImport;
     FunctionsToImport.insert(F);
     if (TheLinker.linkInModule(Module, Linker::Flags::None, &Index,
-                       &FunctionsToImport))
+                               &FunctionsToImport))
       report_fatal_error("Function Import: link error");
 
     // Process the newly imported function and add callees to the worklist.
@@ -190,8 +190,8 @@ static unsigned ProcessImportWorklist(Module &DestModule, SmallVector<StringRef,
 // The current implementation imports every called functions that exists in the
 // summaries index.
 bool FunctionImporter::importFunctions(Module &DestModule) {
-  DEBUG(errs() << "Starting import for Module " << DestModule.getModuleIdentifier()
-               << "\n");
+  DEBUG(errs() << "Starting import for Module "
+               << DestModule.getModuleIdentifier() << "\n");
   unsigned ImportedCount = 0;
 
   /// First step is collecting the called external functions.
@@ -210,7 +210,8 @@ bool FunctionImporter::importFunctions(Module &DestModule) {
   // Linker that will be used for importing function
   Linker TheLinker(DestModule, DiagnosticHandler);
 
-  ImportedCount += ProcessImportWorklist(DestModule, Worklist, CalledFunctions, TheLinker, Index, getLazyModule );
+  ImportedCount += ProcessImportWorklist(DestModule, Worklist, CalledFunctions,
+                                         TheLinker, Index, getLazyModule);
 
   DEBUG(errs() << "Imported " << ImportedCount << " functions for Module "
                << DestModule.getModuleIdentifier() << "\n");

From 033f8711011afab6663cdd5515027f8245f808c4 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Thu, 3 Dec 2015 03:01:10 +0000
Subject: [PATCH 183/186] Revert "ScheduleDAGInstrs: Rework schedule graph
 builder."

This works mostly fine but breaks some stage 1 builders when compiling
compiler-rt on i386. Revert for further investigation as I can't see an
obvious cause/fix.

This reverts commit r254577.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254586 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/ScheduleDAGInstrs.h      |  40 +---
 lib/CodeGen/ScheduleDAGInstrs.cpp             | 225 +++++-------------
 test/CodeGen/AMDGPU/image-attributes.ll       |  20 +-
 test/CodeGen/AMDGPU/literals.ll               |   8 +-
 .../AMDGPU/llvm.AMDGPU.read.workdim.ll        |   2 +-
 test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll      |   2 +-
 .../AMDGPU/llvm.r600.read.local.size.ll       |   6 +-
 test/CodeGen/AMDGPU/or.ll                     |   2 +-
 test/CodeGen/AMDGPU/set-dx10.ll               |  48 ++--
 test/CodeGen/AMDGPU/sext-in-reg.ll            |   4 +-
 test/CodeGen/AMDGPU/shl.ll                    |  12 +-
 test/CodeGen/AMDGPU/sra.ll                    |   8 +-
 test/CodeGen/AMDGPU/srl.ll                    |  10 +-
 test/CodeGen/AMDGPU/unsupported-cc.ll         |  32 +--
 test/CodeGen/AMDGPU/work-item-intrinsics.ll   |  12 +-
 test/CodeGen/AMDGPU/xor.ll                    |   2 +-
 16 files changed, 156 insertions(+), 277 deletions(-)

diff --git a/include/llvm/CodeGen/ScheduleDAGInstrs.h b/include/llvm/CodeGen/ScheduleDAGInstrs.h
index c715e0f79205..1446f2ac082b 100644
--- a/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -33,26 +33,15 @@ namespace llvm {
   /// An individual mapping from virtual register number to SUnit.
   struct VReg2SUnit {
     unsigned VirtReg;
-    LaneBitmask LaneMask;
     SUnit *SU;
 
-    VReg2SUnit(unsigned VReg, LaneBitmask LaneMask, SUnit *SU)
-      : VirtReg(VReg), LaneMask(LaneMask), SU(SU) {}
+    VReg2SUnit(unsigned reg, SUnit *su): VirtReg(reg), SU(su) {}
 
     unsigned getSparseSetIndex() const {
       return TargetRegisterInfo::virtReg2Index(VirtReg);
     }
   };
 
-  /// Mapping from virtual register to SUnit including an operand index.
-  struct VReg2SUnitOperIdx : public VReg2SUnit {
-    unsigned OperandIndex;
-
-    VReg2SUnitOperIdx(unsigned VReg, LaneBitmask LaneMask,
-                      unsigned OperandIndex, SUnit *SU)
-      : VReg2SUnit(VReg, LaneMask, SU), OperandIndex(OperandIndex) {}
-  };
-
   /// Record a physical register access.
   /// For non-data-dependent uses, OpIdx == -1.
   struct PhysRegSUOper {
@@ -80,10 +69,7 @@ namespace llvm {
   /// Track local uses of virtual registers. These uses are gathered by the DAG
   /// builder and may be consulted by the scheduler to avoid iterating an entire
   /// vreg use list.
-  typedef SparseMultiSet<VReg2SUnit, VirtReg2IndexFunctor> VReg2SUnitMultiMap;
-
-  typedef SparseMultiSet<VReg2SUnitOperIdx, VirtReg2IndexFunctor>
-    VReg2SUnitOperIdxMultiMap;
+  typedef SparseMultiSet<VReg2SUnit, VirtReg2IndexFunctor> VReg2UseMap;
 
   /// ScheduleDAGInstrs - A ScheduleDAG subclass for scheduling lists of
   /// MachineInstrs.
@@ -109,9 +95,6 @@ namespace llvm {
     /// it has taken responsibility for scheduling the terminator correctly.
     bool CanHandleTerminators;
 
-    /// Whether lane masks should get tracked.
-    bool TrackLaneMasks;
-
     /// State specific to the current scheduling region.
     /// ------------------------------------------------
 
@@ -134,7 +117,7 @@ namespace llvm {
     /// After calling BuildSchedGraph, each vreg used in the scheduling region
     /// is mapped to a set of SUnits. These include all local vreg uses, not
     /// just the uses for a singly defined vreg.
-    VReg2SUnitMultiMap VRegUses;
+    VReg2UseMap VRegUses;
 
     /// State internal to DAG building.
     /// -------------------------------
@@ -146,12 +129,8 @@ namespace llvm {
     Reg2SUnitsMap Defs;
     Reg2SUnitsMap Uses;
 
-    /// Tracks the last instruction(s) in this region defining each virtual
-    /// register. There may be multiple current definitions for a register with
-    /// disjunct lanemasks.
-    VReg2SUnitMultiMap CurrentVRegDefs;
-    /// Tracks the last instructions in this region using each virtual register.
-    VReg2SUnitOperIdxMultiMap CurrentVRegUses;
+    /// Track the last instruction in this region defining each virtual register.
+    VReg2SUnitMap VRegDefs;
 
     /// PendingLoads - Remember where unknown loads are after the most recent
     /// unknown store, as we iterate. As with Defs and Uses, this is here
@@ -221,8 +200,7 @@ namespace llvm {
     /// input.
     void buildSchedGraph(AliasAnalysis *AA,
                          RegPressureTracker *RPTracker = nullptr,
-                         PressureDiffs *PDiffs = nullptr,
-                         bool TrackLaneMasks = false);
+                         PressureDiffs *PDiffs = nullptr);
 
     /// addSchedBarrierDeps - Add dependencies from instructions in the current
     /// list of instructions being scheduled to scheduling barrier. We want to
@@ -269,12 +247,6 @@ namespace llvm {
     /// Other adjustments may be made to the instruction if necessary. Return
     /// true if the operand has been deleted, false if not.
     bool toggleKillFlag(MachineInstr *MI, MachineOperand &MO);
-
-    /// Returns a mask for which lanes get read/written by the given (register)
-    /// machine operand.
-    LaneBitmask getLaneMaskForMO(const MachineOperand &MO) const;
-
-    void collectVRegUses(SUnit *SU);
   };
 
   /// newSUnit - Creates a new SUnit and return a ptr to it.
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index 2ef02deebfbd..12b2beb357b4 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -13,12 +13,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
-#include "llvm/ADT/IntEqClasses.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -55,7 +55,7 @@ ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
                                      bool RemoveKillFlags)
     : ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()), LIS(LIS),
       RemoveKillFlags(RemoveKillFlags), CanHandleTerminators(false),
-      TrackLaneMasks(false), FirstDbgValue(nullptr) {
+      FirstDbgValue(nullptr) {
   DbgValues.clear();
 
   const TargetSubtargetInfo &ST = mf.getSubtarget();
@@ -363,20 +363,6 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
   }
 }
 
-LaneBitmask ScheduleDAGInstrs::getLaneMaskForMO(const MachineOperand &MO) const
-{
-  unsigned Reg = MO.getReg();
-  // No point in tracking lanemasks if we don't have interesting subregisters.
-  const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
-  if (!RC.HasDisjunctSubRegs)
-    return ~0u;
-
-  unsigned SubReg = MO.getSubReg();
-  if (SubReg == 0)
-    return RC.getLaneMask();
-  return TRI->getSubRegIndexLaneMask(SubReg);
-}
-
 /// addVRegDefDeps - Add register output and data dependencies from this SUnit
 /// to instructions that occur later in the same scheduling region if they read
 /// from or write to the virtual register defined at OperIdx.
@@ -384,106 +370,35 @@ LaneBitmask ScheduleDAGInstrs::getLaneMaskForMO(const MachineOperand &MO) const
 /// TODO: Hoist loop induction variable increments. This has to be
 /// reevaluated. Generally, IV scheduling should be done before coalescing.
 void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {
-  MachineInstr *MI = SU->getInstr();
-  MachineOperand &MO = MI->getOperand(OperIdx);
-  unsigned Reg = MO.getReg();
-
-  LaneBitmask DefLaneMask;
-  LaneBitmask KillLaneMask;
-  if (TrackLaneMasks) {
-    bool IsKill = MO.getSubReg() == 0 || MO.isUndef();
-    DefLaneMask = getLaneMaskForMO(MO);
-    // If we have a <read-undef> flag, none of the lane values comes from an
-    // earlier instruction.
-    KillLaneMask = IsKill ? ~0u : DefLaneMask;
-
-    // Clear undef flag, we'll re-add it later once we know which subregister
-    // Def is first.
-    MO.setIsUndef(false);
-  } else {
-    DefLaneMask = ~0u;
-    KillLaneMask = ~0u;
-  }
-
-  if (MO.isDead()) {
-    assert(CurrentVRegUses.find(Reg) == CurrentVRegUses.end() &&
-           "Dead defs should have no uses");
-  } else {
-    // Add data dependence to all uses we found so far.
-    const TargetSubtargetInfo &ST = MF.getSubtarget();
-    for (VReg2SUnitOperIdxMultiMap::iterator I = CurrentVRegUses.find(Reg),
-         E = CurrentVRegUses.end(); I != E; /*empty*/) {
-      LaneBitmask LaneMask = I->LaneMask;
-      // Ignore uses of other lanes.
-      if ((LaneMask & KillLaneMask) == 0) {
-        ++I;
-        continue;
-      }
-
-      if ((LaneMask & DefLaneMask) != 0) {
-        SUnit *UseSU = I->SU;
-        MachineInstr *Use = UseSU->getInstr();
-        SDep Dep(SU, SDep::Data, Reg);
-        Dep.setLatency(SchedModel.computeOperandLatency(MI, OperIdx, Use,
-                                                        I->OperandIndex));
-        ST.adjustSchedDependency(SU, UseSU, Dep);
-        UseSU->addPred(Dep);
-      }
-
-      LaneMask &= ~KillLaneMask;
-      // If we found a Def for all lanes of this use, remove it from the list.
-      if (LaneMask != 0) {
-        I->LaneMask = LaneMask;
-        ++I;
-      } else
-        I = CurrentVRegUses.erase(I);
-    }
-  }
+  const MachineInstr *MI = SU->getInstr();
+  unsigned Reg = MI->getOperand(OperIdx).getReg();
 
-  // Shortcut: Singly defined vregs do not have output/anti dependencies.
+  // Singly defined vregs do not have output/anti dependencies.
+  // The current operand is a def, so we have at least one.
+  // Check here if there are any others...
   if (MRI.hasOneDef(Reg))
     return;
 
-  // Add output dependence to the next nearest defs of this vreg.
+  // Add output dependence to the next nearest def of this vreg.
   //
   // Unless this definition is dead, the output dependence should be
   // transitively redundant with antidependencies from this definition's
   // uses. We're conservative for now until we have a way to guarantee the uses
   // are not eliminated sometime during scheduling. The output dependence edge
   // is also useful if output latency exceeds def-use latency.
-  LaneBitmask LaneMask = DefLaneMask;
-  for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg),
-                                     CurrentVRegDefs.end())) {
-    // Ignore defs for other lanes.
-    if ((V2SU.LaneMask & LaneMask) == 0)
-      continue;
-    // Add an output dependence.
-    SUnit *DefSU = V2SU.SU;
-    // Ignore additional defs of the same lanes in one instruction. This can
-    // happen because lanemasks are shared for targets with too many
-    // subregisters. We also use some representration tricks/hacks where we
-    // add super-register defs/uses, to imply that although we only access parts
-    // of the reg we care about the full one.
-    if (DefSU == SU)
-      continue;
-    SDep Dep(SU, SDep::Output, Reg);
-    Dep.setLatency(
-      SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr()));
-    DefSU->addPred(Dep);
-
-    // Update current definition. This can get tricky if the def was about a
-    // bigger lanemask before. We then have to shrink it and create a new
-    // VReg2SUnit for the non-overlapping part.
-    LaneBitmask OverlapMask = V2SU.LaneMask & LaneMask;
-    LaneBitmask NonOverlapMask = V2SU.LaneMask & ~LaneMask;
-    if (NonOverlapMask != 0)
-      CurrentVRegDefs.insert(VReg2SUnit(Reg, NonOverlapMask, V2SU.SU));
-    V2SU.SU = SU;
-    V2SU.LaneMask = OverlapMask;
+  VReg2SUnitMap::iterator DefI = VRegDefs.find(Reg);
+  if (DefI == VRegDefs.end())
+    VRegDefs.insert(VReg2SUnit(Reg, SU));
+  else {
+    SUnit *DefSU = DefI->SU;
+    if (DefSU != SU && DefSU != &ExitSU) {
+      SDep Dep(SU, SDep::Output, Reg);
+      Dep.setLatency(
+        SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr()));
+      DefSU->addPred(Dep);
+    }
+    DefI->SU = SU;
   }
-  // If there was no CurrentVRegDefs entry for some lanes yet, create one.
-  if (LaneMask != 0)
-    CurrentVRegDefs.insert(VReg2SUnit(Reg, LaneMask, SU));
 }
 
 /// addVRegUseDeps - Add a register data dependency if the instruction that
@@ -493,26 +408,49 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {
 ///
 /// TODO: Handle ExitSU "uses" properly.
 void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
-  const MachineInstr *MI = SU->getInstr();
-  const MachineOperand &MO = MI->getOperand(OperIdx);
-  unsigned Reg = MO.getReg();
-
-  // Remember the use. Data dependencies will be added when we find the def.
-  LaneBitmask LaneMask = TrackLaneMasks ? getLaneMaskForMO(MO) : ~0u;
-  CurrentVRegUses.insert(VReg2SUnitOperIdx(Reg, LaneMask, OperIdx, SU));
-
-  // Add antidependences to the following defs of the vreg.
-  for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg),
-                                     CurrentVRegDefs.end())) {
-    // Ignore defs for unrelated lanes.
-    LaneBitmask PrevDefLaneMask = V2SU.LaneMask;
-    if ((PrevDefLaneMask & LaneMask) == 0)
-      continue;
-    if (V2SU.SU == SU)
-      continue;
+  MachineInstr *MI = SU->getInstr();
+  unsigned Reg = MI->getOperand(OperIdx).getReg();
 
-    V2SU.SU->addPred(SDep(SU, SDep::Anti, Reg));
+  // Record this local VReg use.
+  VReg2UseMap::iterator UI = VRegUses.find(Reg);
+  for (; UI != VRegUses.end(); ++UI) {
+    if (UI->SU == SU)
+      break;
+  }
+  if (UI == VRegUses.end())
+    VRegUses.insert(VReg2SUnit(Reg, SU));
+
+  // Lookup this operand's reaching definition.
+  assert(LIS && "vreg dependencies requires LiveIntervals");
+  LiveQueryResult LRQ
+    = LIS->getInterval(Reg).Query(LIS->getInstructionIndex(MI));
+  VNInfo *VNI = LRQ.valueIn();
+
+  // VNI will be valid because MachineOperand::readsReg() is checked by caller.
+  assert(VNI && "No value to read by operand");
+  MachineInstr *Def = LIS->getInstructionFromIndex(VNI->def);
+  // Phis and other noninstructions (after coalescing) have a NULL Def.
+  if (Def) {
+    SUnit *DefSU = getSUnit(Def);
+    if (DefSU) {
+      // The reaching Def lives within this scheduling region.
+      // Create a data dependence.
+      SDep dep(DefSU, SDep::Data, Reg);
+      // Adjust the dependence latency using operand def/use information, then
+      // allow the target to perform its own adjustments.
+      int DefOp = Def->findRegisterDefOperandIdx(Reg);
+      dep.setLatency(SchedModel.computeOperandLatency(Def, DefOp, MI, OperIdx));
+
+      const TargetSubtargetInfo &ST = MF.getSubtarget();
+      ST.adjustSchedDependency(DefSU, SU, const_cast<SDep &>(dep));
+      SU->addPred(dep);
+    }
   }
+
+  // Add antidependence to the following def of the vreg it uses.
+  VReg2SUnitMap::iterator DefI = VRegDefs.find(Reg);
+  if (DefI != VRegDefs.end() && DefI->SU != SU)
+    DefI->SU->addPred(SDep(SU, SDep::Anti, Reg));
 }
 
 /// Return true if MI is an instruction we are unable to reason about
@@ -795,42 +733,17 @@ void ScheduleDAGInstrs::initSUnits() {
   }
 }
 
-void ScheduleDAGInstrs::collectVRegUses(SUnit *SU) {
-  const MachineInstr *MI = SU->getInstr();
-  for (const MachineOperand &MO : MI->operands()) {
-    if (!MO.isReg())
-      continue;
-    if (!MO.isUse() && (MO.getSubReg() == 0 || !TrackLaneMasks))
-      continue;
-
-    unsigned Reg = MO.getReg();
-    if (!TargetRegisterInfo::isVirtualRegister(Reg))
-      continue;
-
-    // Record this local VReg use.
-    VReg2SUnitMultiMap::iterator UI = VRegUses.find(Reg);
-    for (; UI != VRegUses.end(); ++UI) {
-      if (UI->SU == SU)
-        break;
-    }
-    if (UI == VRegUses.end())
-      VRegUses.insert(VReg2SUnit(Reg, 0, SU));
-  }
-}
-
 /// If RegPressure is non-null, compute register pressure as a side effect. The
 /// DAG builder is an efficient place to do it because it already visits
 /// operands.
 void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
                                         RegPressureTracker *RPTracker,
-                                        PressureDiffs *PDiffs,
-                                        bool TrackLaneMasks) {
+                                        PressureDiffs *PDiffs) {
   const TargetSubtargetInfo &ST = MF.getSubtarget();
   bool UseAA = EnableAASchedMI.getNumOccurrences() > 0 ? EnableAASchedMI
                                                        : ST.useAA();
   AliasAnalysis *AAForDep = UseAA ? AA : nullptr;
 
-  this->TrackLaneMasks = TrackLaneMasks;
   MISUnitMap.clear();
   ScheduleDAG::clearDAG();
 
@@ -864,14 +777,10 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
   Defs.setUniverse(TRI->getNumRegs());
   Uses.setUniverse(TRI->getNumRegs());
 
-  assert(CurrentVRegDefs.empty() && "nobody else should use CurrentVRegDefs");
-  assert(CurrentVRegUses.empty() && "nobody else should use CurrentVRegUses");
-  unsigned NumVirtRegs = MRI.getNumVirtRegs();
-  CurrentVRegDefs.setUniverse(NumVirtRegs);
-  CurrentVRegUses.setUniverse(NumVirtRegs);
-
+  assert(VRegDefs.empty() && "Only BuildSchedGraph may access VRegDefs");
   VRegUses.clear();
-  VRegUses.setUniverse(NumVirtRegs);
+  VRegDefs.setUniverse(MRI.getNumVirtRegs());
+  VRegUses.setUniverse(MRI.getNumVirtRegs());
 
   // Model data dependencies between instructions being scheduled and the
   // ExitSU.
@@ -899,7 +808,6 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
       RPTracker->recede(/*LiveUses=*/nullptr, PDiff);
       assert(RPTracker->getPos() == std::prev(MII) &&
              "RPTracker can't find MI");
-      collectVRegUses(SU);
     }
 
     assert(
@@ -1149,8 +1057,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
 
   Defs.clear();
   Uses.clear();
-  CurrentVRegDefs.clear();
-  CurrentVRegUses.clear();
+  VRegDefs.clear();
   PendingLoads.clear();
 }
 
diff --git a/test/CodeGen/AMDGPU/image-attributes.ll b/test/CodeGen/AMDGPU/image-attributes.ll
index 5906b2f15709..7a5a7346865f 100644
--- a/test/CodeGen/AMDGPU/image-attributes.ll
+++ b/test/CodeGen/AMDGPU/image-attributes.ll
@@ -6,7 +6,7 @@
 
 ; FUNC-LABEL: {{^}}width_2d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV * [[VAL]], KC0[2].Z
+; EG: MOV [[VAL]], KC0[2].Z
 define void @width_2d (%opencl.image2d_t addrspace(1)* %in,
                        i32 addrspace(1)* %out) {
 entry:
@@ -19,7 +19,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}width_3d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV * [[VAL]], KC0[2].Z
+; EG: MOV [[VAL]], KC0[2].Z
 define void @width_3d (%opencl.image3d_t addrspace(1)* %in,
                        i32 addrspace(1)* %out) {
 entry:
@@ -36,7 +36,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}height_2d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV * [[VAL]], KC0[2].W
+; EG: MOV [[VAL]], KC0[2].W
 define void @height_2d (%opencl.image2d_t addrspace(1)* %in,
                         i32 addrspace(1)* %out) {
 entry:
@@ -49,7 +49,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}height_3d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV * [[VAL]], KC0[2].W
+; EG: MOV [[VAL]], KC0[2].W
 define void @height_3d (%opencl.image3d_t addrspace(1)* %in,
                         i32 addrspace(1)* %out) {
 entry:
@@ -66,7 +66,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}depth_3d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV * [[VAL]], KC0[3].X
+; EG: MOV [[VAL]], KC0[3].X
 define void @depth_3d (%opencl.image3d_t addrspace(1)* %in,
                        i32 addrspace(1)* %out) {
 entry:
@@ -83,7 +83,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}data_type_2d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV * [[VAL]], KC0[3].Y
+; EG: MOV [[VAL]], KC0[3].Y
 define void @data_type_2d (%opencl.image2d_t addrspace(1)* %in,
                            i32 addrspace(1)* %out) {
 entry:
@@ -96,7 +96,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}data_type_3d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV * [[VAL]], KC0[3].Y
+; EG: MOV [[VAL]], KC0[3].Y
 define void @data_type_3d (%opencl.image3d_t addrspace(1)* %in,
                                      i32 addrspace(1)* %out) {
 entry:
@@ -113,7 +113,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}channel_order_2d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV * [[VAL]], KC0[3].Z
+; EG: MOV [[VAL]], KC0[3].Z
 define void @channel_order_2d (%opencl.image2d_t addrspace(1)* %in,
                                i32 addrspace(1)* %out) {
 entry:
@@ -126,7 +126,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}channel_order_3d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV * [[VAL]], KC0[3].Z
+; EG: MOV [[VAL]], KC0[3].Z
 define void @channel_order_3d (%opencl.image3d_t addrspace(1)* %in,
                                          i32 addrspace(1)* %out) {
 entry:
@@ -145,7 +145,7 @@ entry:
 ;
 ; FUNC-LABEL: {{^}}image_arg_2nd:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV * [[VAL]], KC0[4].Z
+; EG: MOV [[VAL]], KC0[4].Z
 define void @image_arg_2nd (%opencl.image3d_t addrspace(1)* %in1,
                             i32 %x,
                             %opencl.image2d_t addrspace(1)* %in2,
diff --git a/test/CodeGen/AMDGPU/literals.ll b/test/CodeGen/AMDGPU/literals.ll
index 9d2320cb2d19..cff1c24f89d6 100644
--- a/test/CodeGen/AMDGPU/literals.ll
+++ b/test/CodeGen/AMDGPU/literals.ll
@@ -7,8 +7,8 @@
 ; ADD_INT literal.x KC0[2].Z, 5
 
 ; CHECK: {{^}}i32_literal:
-; CHECK: LSHR
-; CHECK-NEXT: ADD_INT * {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.y
+; CHECK: ADD_INT {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 5
 define void @i32_literal(i32 addrspace(1)* %out, i32 %in) {
 entry:
@@ -24,8 +24,8 @@ entry:
 ; ADD literal.x KC0[2].Z, 5.0
 
 ; CHECK: {{^}}float_literal:
-; CHECK: LSHR
-; CHECK-NEXT: ADD * {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.y
+; CHECK: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.0
 define void @float_literal(float addrspace(1)* %out, float %in) {
 entry:
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll
index 2e299e30b8c7..6dc9d050eee6 100644
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll
@@ -4,7 +4,7 @@
 
 ; FUNC-LABEL: {{^}}read_workdim:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV * [[VAL]], KC0[2].Z
+; EG: MOV [[VAL]], KC0[2].Z
 
 ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll
index a30a8e083eb6..74792e50017f 100644
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll
@@ -3,7 +3,7 @@
 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s
 
 ; R600: {{^}}amdgpu_trunc:
-; R600: TRUNC {{\*? *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; R600: TRUNC T{{[0-9]+\.[XYZW]}}, KC0[2].Z
 ; SI: {{^}}amdgpu_trunc:
 ; SI: v_trunc_f32
 
diff --git a/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
index 13ebee41e844..f2a7256e812d 100644
--- a/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
+++ b/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
@@ -5,7 +5,7 @@
 
 ; FUNC-LABEL: {{^}}local_size_x:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV * [[VAL]], KC0[1].Z
+; EG: MOV [[VAL]], KC0[1].Z
 
 ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6
 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18
@@ -23,7 +23,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}local_size_y:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV * [[VAL]], KC0[1].W
+; EG: MOV [[VAL]], KC0[1].W
 
 ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7
 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c
@@ -38,7 +38,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}local_size_z:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV * [[VAL]], KC0[2].X
+; EG: MOV [[VAL]], KC0[2].X
 
 ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
diff --git a/test/CodeGen/AMDGPU/or.ll b/test/CodeGen/AMDGPU/or.ll
index e40f18f040b7..1c04090b407f 100644
--- a/test/CodeGen/AMDGPU/or.ll
+++ b/test/CodeGen/AMDGPU/or.ll
@@ -153,7 +153,7 @@ define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
 }
 
 ; FUNC-LABEL: {{^}}or_i1:
-; EG: OR_INT * {{\** *}}T{{[0-9]+\.[XYZW], PS, PV\.[XYZW]}}
+; EG: OR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}}
 
 ; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}]
 define void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
diff --git a/test/CodeGen/AMDGPU/set-dx10.ll b/test/CodeGen/AMDGPU/set-dx10.ll
index 57365a6e1fc3..53694dcffa66 100644
--- a/test/CodeGen/AMDGPU/set-dx10.ll
+++ b/test/CodeGen/AMDGPU/set-dx10.ll
@@ -5,8 +5,8 @@
 ; SET*DX10 instructions.
 
 ; CHECK: {{^}}fcmp_une_select_fptosi:
-; CHECK: LSHR
-; CHECK-NEXT: SETNE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
+; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -19,8 +19,8 @@ entry:
 }
 
 ; CHECK: {{^}}fcmp_une_select_i32:
-; CHECK: LSHR
-; CHECK-NEXT: SETNE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
+; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_une_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -31,8 +31,8 @@ entry:
 }
 
 ; CHECK: {{^}}fcmp_oeq_select_fptosi:
-; CHECK: LSHR
-; CHECK-NEXT: SETE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
+; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_oeq_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -45,8 +45,8 @@ entry:
 }
 
 ; CHECK: {{^}}fcmp_oeq_select_i32:
-; CHECK: LSHR
-; CHECK-NEXT: SETE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
+; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_oeq_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -57,8 +57,8 @@ entry:
 }
 
 ; CHECK: {{^}}fcmp_ogt_select_fptosi:
-; CHECK: LSHR
-; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
+; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_ogt_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -71,8 +71,8 @@ entry:
 }
 
 ; CHECK: {{^}}fcmp_ogt_select_i32:
-; CHECK: LSHR
-; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
+; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_ogt_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -83,8 +83,8 @@ entry:
 }
 
 ; CHECK: {{^}}fcmp_oge_select_fptosi:
-; CHECK: LSHR
-; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
+; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_oge_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -97,8 +97,8 @@ entry:
 }
 
 ; CHECK: {{^}}fcmp_oge_select_i32:
-; CHECK: LSHR
-; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
+; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_oge_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -109,8 +109,8 @@ entry:
 }
 
 ; CHECK: {{^}}fcmp_ole_select_fptosi:
-; CHECK: LSHR
-; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z,
+; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_ole_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -123,8 +123,8 @@ entry:
 }
 
 ; CHECK: {{^}}fcmp_ole_select_i32:
-; CHECK: LSHR
-; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z,
+; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_ole_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -135,8 +135,8 @@ entry:
 }
 
 ; CHECK: {{^}}fcmp_olt_select_fptosi:
-; CHECK: LSHR
-; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z,
+; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_olt_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -149,8 +149,8 @@ entry:
 }
 
 ; CHECK: {{^}}fcmp_olt_select_i32:
-; CHECK: LSHR
-; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z,
+; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_olt_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
diff --git a/test/CodeGen/AMDGPU/sext-in-reg.ll b/test/CodeGen/AMDGPU/sext-in-reg.ll
index 23ae3b967971..95fcfdbdecae 100644
--- a/test/CodeGen/AMDGPU/sext-in-reg.ll
+++ b/test/CodeGen/AMDGPU/sext-in-reg.ll
@@ -12,8 +12,8 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 ; SI: buffer_store_dword [[EXTRACT]],
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
-; EG: LSHR * [[ADDR]]
-; EG: BFE_INT * [[RES]], {{.*}}, 0.0, 1
+; EG: BFE_INT [[RES]], {{.*}}, 0.0, 1
+; EG-NEXT: LSHR * [[ADDR]]
 define void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) {
   %shl = shl i32 %in, 31
   %sext = ashr i32 %shl, 31
diff --git a/test/CodeGen/AMDGPU/shl.ll b/test/CodeGen/AMDGPU/shl.ll
index 55db80731c90..bf08e66f3304 100644
--- a/test/CodeGen/AMDGPU/shl.ll
+++ b/test/CodeGen/AMDGPU/shl.ll
@@ -53,14 +53,14 @@ define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in
   ret void
 }
 
-;EG-LABEL: {{^}}shl_i64:
+;EG: {{^}}shl_i64:
 ;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
 ;EG: LSHR {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}
-;EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
-;EG-DAG: LSHR {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
+;EG: LSHR {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
+;EG_CHECK-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
 ;EG-DAG: LSHL {{\*? *}}[[HISMTMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], [[SHIFT]]
-;EG-DAG: OR_INT {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], {{[[HISMTMP]]|PV.[XYZW]|PS}}, {{[[OVERF]]|PV.[XYZW]}}
-;EG-DAG: LSHL {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], [[OPLO]], {{PS|[[SHIFT]]|PV.[XYZW]}}
+;EG-DAG: OR_INT {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], {{[[HISMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}}
+;EG-DAG: LSHL {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], [[OPLO]], {{PS|[[SHIFT]]}}
 ;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
 ;EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
 ;EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0
@@ -80,7 +80,7 @@ define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   ret void
 }
 
-;EG-LABEL: {{^}}shl_v2i64:
+;EG: {{^}}shl_v2i64:
 ;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
 ;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
 ;EG-DAG: LSHR {{\*? *}}[[COMPSHA]]
diff --git a/test/CodeGen/AMDGPU/sra.ll b/test/CodeGen/AMDGPU/sra.ll
index 3b59bbfb18c0..bcbc32f4c053 100644
--- a/test/CodeGen/AMDGPU/sra.ll
+++ b/test/CodeGen/AMDGPU/sra.ll
@@ -70,11 +70,11 @@ entry:
 ;EG-LABEL: {{^}}ashr_i64_2:
 ;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
 ;EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}
-;EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
-;EG-DAG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
+;EG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
+;EG_CHECK-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
 ;EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]]
-;EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]|PS}}, {{[[OVERF]]|PV.[XYZW]}}
-;EG-DAG: ASHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|PV.[XYZW]|[[SHIFT]]}}
+;EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}}
+;EG-DAG: ASHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}}
 ;EG-DAG: ASHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal
 ;EG-DAG: ASHR {{\*? *}}[[HIBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal
 ;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
diff --git a/test/CodeGen/AMDGPU/srl.ll b/test/CodeGen/AMDGPU/srl.ll
index bbd954356322..ebb2f2db252e 100644
--- a/test/CodeGen/AMDGPU/srl.ll
+++ b/test/CodeGen/AMDGPU/srl.ll
@@ -65,14 +65,14 @@ define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i
 
 ; EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
 ; EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}
+; EG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
 ; EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
-; EG-DAG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
 ; EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]]
-; EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]|PS}}, {{[[OVERF]]|PV.[XYZW]}}
-; EG-DAG: LSHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]|PV\.[XYZW]}}
+; EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}}
+; EG-DAG: LSHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}}
+; EG-DAG: LSHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}}
 ; EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
-; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]|PS}}
-; EG-DAG: LSHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], [[SHIFT]]
+; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
 ; EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0
 define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
diff --git a/test/CodeGen/AMDGPU/unsupported-cc.ll b/test/CodeGen/AMDGPU/unsupported-cc.ll
index d120111a71fb..8ab4faf2f145 100644
--- a/test/CodeGen/AMDGPU/unsupported-cc.ll
+++ b/test/CodeGen/AMDGPU/unsupported-cc.ll
@@ -3,8 +3,8 @@
 ; These tests are for condition codes that are not supported by the hardware
 
 ; CHECK-LABEL: {{^}}slt:
-; CHECK: LSHR
-; CHECK-NEXT: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
+; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 5(7.006492e-45)
 define void @slt(i32 addrspace(1)* %out, i32 %in) {
 entry:
@@ -15,8 +15,8 @@ entry:
 }
 
 ; CHECK-LABEL: {{^}}ult_i32:
-; CHECK: LSHR
-; CHECK-NEXT: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
+; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 5(7.006492e-45)
 define void @ult_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
@@ -40,8 +40,8 @@ entry:
 }
 
 ; CHECK-LABEL: {{^}}ult_float_native:
-; CHECK: LSHR
-; CHECK-NEXT: SETGE {{\*? *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, {{literal\.[xy]}}
+; CHECK: SETGE T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
+; CHECK-NEXT: LSHR *
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @ult_float_native(float addrspace(1)* %out, float %in) {
 entry:
@@ -52,8 +52,8 @@ entry:
 }
 
 ; CHECK-LABEL: {{^}}olt:
-; CHECK: LSHR
-; CHECK-NEXT: SETGT {{\*? *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
+; CHECK: SETGT T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-NEXT: LSHR *
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @olt(float addrspace(1)* %out, float %in) {
 entry:
@@ -64,8 +64,8 @@ entry:
 }
 
 ; CHECK-LABEL: {{^}}sle:
-; CHECK: LSHR
-; CHECK-NEXT: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
+; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 6(8.407791e-45)
 define void @sle(i32 addrspace(1)* %out, i32 %in) {
 entry:
@@ -76,8 +76,8 @@ entry:
 }
 
 ; CHECK-LABEL: {{^}}ule_i32:
-; CHECK: LSHR
-; CHECK-NEXT: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
+; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 6(8.407791e-45)
 define void @ule_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
@@ -101,8 +101,8 @@ entry:
 }
 
 ; CHECK-LABEL: {{^}}ule_float_native:
-; CHECK: LSHR
-; CHECK-NEXT: SETGT {{\*? *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, {{literal\.[xy]}}
+; CHECK: SETGT T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
+; CHECK-NEXT: LSHR *
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @ule_float_native(float addrspace(1)* %out, float %in) {
 entry:
@@ -113,8 +113,8 @@ entry:
 }
 
 ; CHECK-LABEL: {{^}}ole:
-; CHECK: LSHR
-; CHECK-NEXT: SETGE {{\*? *}}T{{[0-9]\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
+; CHECK: SETGE T{{[0-9]\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-NEXT: LSHR *
 ; CHECK-NEXT:1084227584(5.000000e+00)
 define void @ole(float addrspace(1)* %out, float %in) {
 entry:
diff --git a/test/CodeGen/AMDGPU/work-item-intrinsics.ll b/test/CodeGen/AMDGPU/work-item-intrinsics.ll
index f420ec9c7d23..a704a23b0f92 100644
--- a/test/CodeGen/AMDGPU/work-item-intrinsics.ll
+++ b/test/CodeGen/AMDGPU/work-item-intrinsics.ll
@@ -7,7 +7,7 @@
 
 ; FUNC-LABEL: {{^}}ngroups_x:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV {{\*? *}}[[VAL]], KC0[0].X
+; EG: MOV [[VAL]], KC0[0].X
 
 ; HSA: .amd_kernel_code_t
 
@@ -38,7 +38,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}ngroups_y:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV {{\*? *}}[[VAL]], KC0[0].Y
+; EG: MOV [[VAL]], KC0[0].Y
 
 ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1
 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
@@ -53,7 +53,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}ngroups_z:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV {{\*? *}}[[VAL]], KC0[0].Z
+; EG: MOV [[VAL]], KC0[0].Z
 
 ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2
 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
@@ -68,7 +68,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}global_size_x:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV {{\*? *}}[[VAL]], KC0[0].W
+; EG: MOV [[VAL]], KC0[0].W
 
 ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3
 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc
@@ -83,7 +83,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}global_size_y:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV {{\*? *}}[[VAL]], KC0[1].X
+; EG: MOV [[VAL]], KC0[1].X
 
 ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10
@@ -98,7 +98,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}global_size_z:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV {{\*? *}}[[VAL]], KC0[1].Y
+; EG: MOV [[VAL]], KC0[1].Y
 
 ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5
 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14
diff --git a/test/CodeGen/AMDGPU/xor.ll b/test/CodeGen/AMDGPU/xor.ll
index 655655d92f08..ddb920af29d8 100644
--- a/test/CodeGen/AMDGPU/xor.ll
+++ b/test/CodeGen/AMDGPU/xor.ll
@@ -38,7 +38,7 @@ define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in
 }
 
 ; FUNC-LABEL: {{^}}xor_i1:
-; EG: XOR_INT {{\** *}}{{T[0-9]+\.[XYZW]}}, {{PS|PV\.[XYZW]}}, {{PS|PV\.[XYZW]}}
+; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}}
 
 ; SI-DAG: v_cmp_le_f32_e32 [[CMP0:vcc]], 0, {{v[0-9]+}}
 ; SI-DAG: v_cmp_le_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 1.0, {{v[0-9]+}}

From d26f9217962b4fbe435341aadfc3950edd32c9a3 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Thu, 3 Dec 2015 03:34:32 +0000
Subject: [PATCH 184/186] AMDGPU/SI: Emit constant arrays in the
 .hsrodata_readonly_agent section

Summary: This is done only when targeting HSA.

Reviewers: arsenm

Subscribers: arsenm, llvm-commits

Differential Revision: http://reviews.llvm.org/D13807

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254587 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp        |  2 +-
 .../AMDGPU/AMDGPUHSATargetObjectFile.cpp      |  4 +++
 lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.h |  1 +
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp      | 10 ++++++
 .../AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp   |  1 +
 lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |  6 ++++
 lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h      |  2 ++
 test/CodeGen/AMDGPU/hsa-globals.ll            | 36 +++++++++++++++++++
 8 files changed, 61 insertions(+), 1 deletion(-)

diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 1c438b24e6a4..b677caa6c2c6 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -137,7 +137,7 @@ static bool isModuleLinkage(const GlobalValue *GV) {
 void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
 
   if (TM.getTargetTriple().getOS() != Triple::AMDHSA ||
-      GV->isDeclaration() || AMDGPU::isReadOnlySegment(GV)) {
+      GV->isDeclaration()) {
     AsmPrinter::EmitGlobalVariable(GV);
     return;
   }
diff --git a/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.cpp b/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.cpp
index ee42eaacf546..32f53edeb770 100644
--- a/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.cpp
+++ b/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.cpp
@@ -26,6 +26,7 @@ void AMDGPUHSATargetObjectFile::Initialize(MCContext &Ctx,
   DataGlobalAgentSection = AMDGPU::getHSADataGlobalAgentSection(Ctx);
   DataGlobalProgramSection = AMDGPU::getHSADataGlobalProgramSection(Ctx);
 
+  RodataReadonlyAgentSection = AMDGPU::getHSARodataReadonlyAgentSection(Ctx);
 }
 
 bool AMDGPUHSATargetObjectFile::isAgentAllocationSection(
@@ -63,5 +64,8 @@ MCSection *AMDGPUHSATargetObjectFile::SelectSectionForGlobal(
       return DataGlobalProgramSection;
   }
 
+  if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GV))
+    return RodataReadonlyAgentSection;
+
   return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang, TM);
 }
diff --git a/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.h b/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.h
index 3697a96ffaf2..9ea51ec9b29e 100644
--- a/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.h
+++ b/lib/Target/AMDGPU/AMDGPUHSATargetObjectFile.h
@@ -25,6 +25,7 @@ class AMDGPUHSATargetObjectFile final : public TargetLoweringObjectFileELF {
 private:
   MCSection *DataGlobalAgentSection;
   MCSection *DataGlobalProgramSection;
+  MCSection *RodataReadonlyAgentSection;
 
   bool isAgentAllocationSection(const char *SectionName) const;
   bool isAgentAllocation(const GlobalValue *GV) const;
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index b360f2de0cb3..7359cfee7f27 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -369,6 +369,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
   bool ParseDirectiveAMDGPUHsaProgramGlobal();
   bool ParseSectionDirectiveHSADataGlobalAgent();
   bool ParseSectionDirectiveHSADataGlobalProgram();
+  bool ParseSectionDirectiveHSARodataReadonlyAgent();
 
 public:
 public:
@@ -1004,6 +1005,12 @@ bool AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalProgram() {
   return false;
 }
 
+bool AMDGPUAsmParser::ParseSectionDirectiveHSARodataReadonlyAgent() {
+  getParser().getStreamer().SwitchSection(
+      AMDGPU::getHSARodataReadonlyAgentSection(getContext()));
+  return false;
+}
+
 bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
   StringRef IDVal = DirectiveID.getString();
 
@@ -1034,6 +1041,9 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
   if (IDVal == ".hsadata_global_program")
     return ParseSectionDirectiveHSADataGlobalProgram();
 
+  if (IDVal == ".hsarodata_readonly_agent")
+    return ParseSectionDirectiveHSARodataReadonlyAgent();
+
   return true;
 }
 
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 89fa212de956..68b1d1ae83cc 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -38,5 +38,6 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() {
 bool AMDGPUMCAsmInfo::shouldOmitSectionDirective(StringRef SectionName) const {
   return SectionName == ".hsatext" || SectionName == ".hsadata_global_agent" ||
          SectionName == ".hsadata_global_program" ||
+         SectionName == ".hsarodata_readonly_agent" ||
          MCAsmInfo::shouldOmitSectionDirective(SectionName);
 }
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index e3c93384bd8e..441baed9b434 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -81,6 +81,12 @@ MCSection *getHSADataGlobalProgramSection(MCContext &Ctx) {
                             ELF::SHF_AMDGPU_HSA_GLOBAL);
 }
 
+MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx) {
+  return Ctx.getELFSection(".hsarodata_readonly_agent", ELF::SHT_PROGBITS,
+                           ELF::SHF_ALLOC | ELF::SHF_AMDGPU_HSA_READONLY |
+                           ELF::SHF_AMDGPU_HSA_AGENT);
+}
+
 bool isGroupSegment(const GlobalValue *GV) {
   return GV->getType()->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
 }
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index cc70064dbc0f..7b3c858e7c36 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -36,6 +36,8 @@ MCSection *getHSADataGlobalAgentSection(MCContext &Ctx);
 
 MCSection *getHSADataGlobalProgramSection(MCContext &Ctx);
 
+MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx);
+
 bool isGroupSegment(const GlobalValue *GV);
 bool isGlobalSegment(const GlobalValue *GV);
 bool isReadOnlySegment(const GlobalValue *GV);
diff --git a/test/CodeGen/AMDGPU/hsa-globals.ll b/test/CodeGen/AMDGPU/hsa-globals.ll
index c73ca3b0079a..b6483f0970b0 100644
--- a/test/CodeGen/AMDGPU/hsa-globals.ll
+++ b/test/CodeGen/AMDGPU/hsa-globals.ll
@@ -9,6 +9,9 @@
 @common_global_agent = common addrspace(1) global i32 0, section ".hsadata_global_agent"
 @external_global_agent = addrspace(1) global i32 0, section ".hsadata_global_agent"
 
+@internal_readonly = internal unnamed_addr addrspace(2) constant i32 0
+@external_readonly = unnamed_addr addrspace(2) constant i32 0
+
 define void @test() {
   ret void
 }
@@ -43,6 +46,16 @@ define void @test() {
 ; ASM: external_global_agent:
 ; ASM: .long 0
 
+; ASM: .amdgpu_hsa_module_global internal_readonly
+; ASM: .hsarodata_readonly_agent
+; ASM: internal_readonly:
+; ASM: .long 0
+
+; ASM: .amdgpu_hsa_program_global external_readonly
+; ASM: .hsarodata_readonly_agent
+; ASM: external_readonly:
+; ASM: .long 0
+
 ; ELF: Section {
 ; ELF: Name: .hsadata_global_program
 ; ELF: Type: SHT_PROGBITS (0x1)
@@ -64,6 +77,15 @@ define void @test() {
 ; ELF: ]
 ; ELF: }
 
+; ELF: Section {
+; ELF: Name: .hsarodata_readonly_agent
+; ELF: Type: SHT_PROGBITS (0x1)
+; ELF: Flags [ (0xA00002)
+; ELF: SHF_ALLOC (0x2)
+; ELF: SHF_AMDGPU_HSA_AGENT (0x800000)
+; ELF: SHF_AMDGPU_HSA_READONLY (0x200000)
+; ELF: ]
+
 ; ELF: Symbol {
 ; ELF: Name: common_global_agent
 ; ELF: Binding: Local
@@ -90,6 +112,13 @@ define void @test() {
 ; ELF: Section: .hsadata_global_program
 ; ELF: }
 
+; ELF: Symbol {
+; ELF: Name: internal_readonly
+; ELF: Binding: Local
+; ELF: Type: Object
+; ELF: Section: .hsarodata_readonly_agent
+; ELF: }
+
 ; ELF: Symbol {
 ; ELF: Name: external_global_agent
 ; ELF: Binding: Global
@@ -103,3 +132,10 @@ define void @test() {
 ; ELF: Type: Object
 ; ELF: Section: .hsadata_global_program
 ; ELF: }
+
+; ELF: Symbol {
+; ELF: Name: external_readonly
+; ELF: Binding: Global
+; ELF: Type: Object
+; ELF: Section: .hsarodata_readonly_agent
+; ELF: }

From 15766ec56623c80fa9771fc15e8f468fd4b77e6c Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Thu, 3 Dec 2015 05:57:37 +0000
Subject: [PATCH 185/186] [TableGen] Remove an assumption about the order of
 encodings in the MVT::SimpleValueType enum. Instead of assuming the types are
 sorted by size, scan the typeset arrays to find the smallest/largest type.
 NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254589 91177308-0d34-0410-b5e6-96231b3b80d8
---
 utils/TableGen/CodeGenDAGPatterns.cpp | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/utils/TableGen/CodeGenDAGPatterns.cpp b/utils/TableGen/CodeGenDAGPatterns.cpp
index 3f74a9999c98..3ebe51e05121 100644
--- a/utils/TableGen/CodeGenDAGPatterns.cpp
+++ b/utils/TableGen/CodeGenDAGPatterns.cpp
@@ -388,7 +388,13 @@ bool EEVT::TypeSet::EnforceSmallerThan(EEVT::TypeSet &Other, TreePattern &TP) {
   // the size of the smallest type.
   {
     TypeSet InputSet(Other);
-    MVT Smallest = TypeVec[0];
+    MVT Smallest = *std::min_element(TypeVec.begin(), TypeVec.end(),
+      [](MVT A, MVT B) {
+        return A.getScalarSizeInBits() < B.getScalarSizeInBits() ||
+               (A.getScalarSizeInBits() == B.getScalarSizeInBits() &&
+                A.getSizeInBits() < B.getSizeInBits());
+      });
+
     auto I = std::remove_if(Other.TypeVec.begin(), Other.TypeVec.end(),
       [Smallest](MVT OtherVT) {
         // Don't compare vector and non-vector types.
@@ -416,7 +422,12 @@ bool EEVT::TypeSet::EnforceSmallerThan(EEVT::TypeSet &Other, TreePattern &TP) {
   // the size of the largest type.
   {
     TypeSet InputSet(*this);
-    MVT Largest = Other.TypeVec[Other.TypeVec.size()-1];
+    MVT Largest = *std::max_element(Other.TypeVec.begin(), Other.TypeVec.end(),
+      [](MVT A, MVT B) {
+        return A.getScalarSizeInBits() < B.getScalarSizeInBits() ||
+               (A.getScalarSizeInBits() == B.getScalarSizeInBits() &&
+                A.getSizeInBits() < B.getSizeInBits());
+      });
     auto I = std::remove_if(TypeVec.begin(), TypeVec.end(),
       [Largest](MVT OtherVT) {
         // Don't compare vector and non-vector types.

From e95d304fd362e68e985c9624ce5503099f63a3ee Mon Sep 17 00:00:00 2001
From: Andy Gibbs <andyg1001@hotmail.co.uk>
Date: Thu, 3 Dec 2015 08:20:20 +0000
Subject: [PATCH 186/186] Fix class SCEVPredicate has virtual functions and
 accessible non-virtual destructor.

It is not enough to simply make the destructor virtual since there is a g++ 4.7
issue (see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53613) that throws the
error "looser throw specifier for ... overridding ~SCEVPredicate() noexcept".


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254592 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/ScalarEvolution.h | 2 +-
 lib/Analysis/ScalarEvolution.cpp        | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h
index da8c68aa6832..f674cc7ee56f 100644
--- a/include/llvm/Analysis/ScalarEvolution.h
+++ b/include/llvm/Analysis/ScalarEvolution.h
@@ -183,7 +183,7 @@ namespace llvm {
 
   protected:
     SCEVPredicateKind Kind;
-    ~SCEVPredicate() = default;
+    virtual ~SCEVPredicate();
     SCEVPredicate(const SCEVPredicate&) = default;
     SCEVPredicate &operator=(const SCEVPredicate&) = default;
 
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index d04028b15e2f..9a0570d47f02 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -9643,6 +9643,8 @@ SCEVPredicate::SCEVPredicate(const FoldingSetNodeIDRef ID,
                              SCEVPredicateKind Kind)
     : FastID(ID), Kind(Kind) {}
 
+SCEVPredicate::~SCEVPredicate() {}
+
 SCEVEqualPredicate::SCEVEqualPredicate(const FoldingSetNodeIDRef ID,
                                        const SCEVUnknown *LHS,
                                        const SCEVConstant *RHS)