From 7ee142e3f6143d7c613ce75ca3f797b76032c39e Mon Sep 17 00:00:00 2001 From: Marina Yatsina Date: Thu, 3 Dec 2015 08:55:33 +0000 Subject: [PATCH 001/364] [X86] Add support for fcomip, fucomip for Intel syntax According to x86 spec, fcomip and fucomip should be supported for Intel syntax. Differential Revision: http://reviews.llvm.org/D15104 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254595 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86InstrInfo.td | 4 ++-- test/MC/X86/intel-syntax.s | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 0571b07d2f8b..1e66739026e2 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -2651,14 +2651,14 @@ def : MnemonicAlias<"fcmova", "fcmovnbe", "att">; def : MnemonicAlias<"fcmovnae", "fcmovb", "att">; def : MnemonicAlias<"fcmovna", "fcmovbe", "att">; def : MnemonicAlias<"fcmovae", "fcmovnb", "att">; -def : MnemonicAlias<"fcomip", "fcompi", "att">; +def : MnemonicAlias<"fcomip", "fcompi">; def : MnemonicAlias<"fildq", "fildll", "att">; def : MnemonicAlias<"fistpq", "fistpll", "att">; def : MnemonicAlias<"fisttpq", "fisttpll", "att">; def : MnemonicAlias<"fldcww", "fldcw", "att">; def : MnemonicAlias<"fnstcww", "fnstcw", "att">; def : MnemonicAlias<"fnstsww", "fnstsw", "att">; -def : MnemonicAlias<"fucomip", "fucompi", "att">; +def : MnemonicAlias<"fucomip", "fucompi">; def : MnemonicAlias<"fwait", "wait">; def : MnemonicAlias<"fxsaveq", "fxsave64", "att">; diff --git a/test/MC/X86/intel-syntax.s b/test/MC/X86/intel-syntax.s index b79b21dc9691..71bf6cc0ffdb 100644 --- a/test/MC/X86/intel-syntax.s +++ b/test/MC/X86/intel-syntax.s @@ -736,3 +736,8 @@ fbld tbyte ptr [eax] fbstp tbyte ptr [eax] // CHECK: fbld (%eax) // CHECK: fbstp (%eax) + +fcomip st, st(2) +fucomip st, st(2) +// CHECK: fcompi %st(2) +// CHECK: fucompi %st(2) From 1051eae13a0f6397361617b1efab909cc7ba7c19 Mon Sep 17 00:00:00 2001 From: Zlatko Buljan Date: Thu, 3 Dec 2015 09:56:39 +0000 Subject: [PATCH 002/364] [mips][DSP] Add DSPr1 and DSPr2 tests for the standard encodings Differential Revision: http://reviews.llvm.org/D15141 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254598 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/MC/Disassembler/Mips/dsp/valid.txt | 125 +++++++++++++ test/MC/Disassembler/Mips/dspr2/valid.txt | 173 +++++++++++++++++ test/MC/Mips/dsp/valid.s | 127 +++++++++++++ test/MC/Mips/dspr2/valid.s | 217 +++++++++++++++++----- 4 files changed, 597 insertions(+), 45 deletions(-) create mode 100644 test/MC/Disassembler/Mips/dsp/valid.txt create mode 100644 test/MC/Disassembler/Mips/dspr2/valid.txt create mode 100644 test/MC/Mips/dsp/valid.s diff --git a/test/MC/Disassembler/Mips/dsp/valid.txt b/test/MC/Disassembler/Mips/dsp/valid.txt new file mode 100644 index 000000000000..e6ca900dde55 --- /dev/null +++ b/test/MC/Disassembler/Mips/dsp/valid.txt @@ -0,0 +1,125 @@ +# RUN: llvm-mc --disassemble %s -triple=mips-unknown-linux -mattr=dsp | FileCheck %s + + 0x7c 0x02 0x0a 0x52 # CHECK: absq_s.ph $1, $2 + 0x7c 0x06 0x2c 0x52 # CHECK: absq_s.w $5, $6 + 0x7d 0x09 0x3a 0x90 # CHECK: addq.ph $7, $8, $9 + 0x7d 0x6c 0x53 0x90 # CHECK: addq_s.ph $10, $11, $12 + 0x7d 0xcf 0x6d 0x90 # CHECK: addq_s.w $13, $14, $15 + 0x7f 0xbe 0xe4 0x10 # CHECK: addsc $gp, $sp, $fp + 0x7c 0xe8 0x30 0x10 # CHECK: addu.qb $6, $7, $8 + 0x7d 0x4b 0x49 0x10 # CHECK: addu_s.qb $9, $10, $11 + 0x7d 0xae 0x64 0x50 # CHECK: addwc $12, $13, $14 + 0x7c 0x1a 0xce 0xd2 # CHECK: 
bitrev $25, $26 + 0x04 0x1c 0x14 0x9b # CHECK: bposge32 21104 + 0x7f 0x7c 0x02 0x11 # CHECK: cmp.eq.ph $27, $gp + 0x7f 0xbe 0x02 0x51 # CHECK: cmp.lt.ph $sp, $fp + 0x7f 0xe1 0x02 0x91 # CHECK: cmp.le.ph $ra, $1 + 0x7d 0x8d 0x59 0x11 # CHECK: cmpgu.eq.qb $11, $12, $13 + 0x7d 0xf0 0x71 0x51 # CHECK: cmpgu.lt.qb $14, $15, $16 + 0x7e 0x53 0x89 0x91 # CHECK: cmpgu.le.qb $17, $18, $19 + 0x7e 0x95 0x00 0x11 # CHECK: cmpu.eq.qb $20, $21 + 0x7e 0xd7 0x00 0x51 # CHECK: cmpu.lt.qb $22, $23 + 0x7f 0x19 0x00 0x91 # CHECK: cmpu.le.qb $24, $25 + 0x7c 0x22 0x09 0x30 # CHECK: dpaq_s.w.ph $ac1, $1, $2 + 0x7c 0x64 0x13 0x30 # CHECK: dpaq_sa.l.w $ac2, $3, $4 + 0x7d 0x2a 0x08 0xf0 # CHECK: dpau.h.qbl $ac1, $9, $10 + 0x7d 0x6c 0x09 0xf0 # CHECK: dpau.h.qbr $ac1, $11, $12 + 0x7e 0x32 0x01 0x70 # CHECK: dpsq_s.w.ph $ac0, $17, $18 + 0x7e 0x74 0x0b 0x70 # CHECK: dpsq_sa.l.w $ac1, $19, $20 + 0x7c 0xa6 0x02 0xf0 # CHECK: dpsu.h.qbl $ac0, $5, $6 + 0x7c 0xe8 0x0b 0xf0 # CHECK: dpsu.h.qbr $ac1, $7, $8 + 0x7f 0xe1 0x00 0xb8 # CHECK: extp $1, $ac0, 31 + 0x7c 0x02 0x0a 0xb8 # CHECK: extpdp $2, $ac1, 0 + 0x7c 0x83 0x12 0xf8 # CHECK: extpdpv $3, $ac2, $4 + 0x7c 0xc5 0x18 0xf8 # CHECK: extpv $5, $ac3, $6 + 0x7f 0xe7 0x00 0x38 # CHECK: extr.w $7, $ac0, 31 + 0x7d 0xe8 0x09 0x38 # CHECK: extr_r.w $8, $ac1, 15 + 0x7c 0xe9 0x11 0xb8 # CHECK: extr_rs.w $9, $ac2, 7 + 0x7c 0x6a 0x1b 0xb8 # CHECK: extr_s.h $10, $ac3, 3 + 0x7d 0x8b 0x00 0x78 # CHECK: extrv.w $11, $ac0, $12 + 0x7d 0xcd 0x09 0x78 # CHECK: extrv_r.w $13, $ac1, $14 + 0x7e 0x0f 0x11 0xf8 # CHECK: extrv_rs.w $15, $ac2, $16 + 0x7e 0x51 0x1b 0xf8 # CHECK: extrv_s.h $17, $ac3, $18 + 0x7e 0x93 0x00 0x0c # CHECK: insv $19, $20 + 0x7f 0x54 0x51 0x8a # CHECK: lbux $10, $20($26) + 0x7f 0x75 0x59 0x0a # CHECK: lhx $11, $21($27) + 0x7f 0x96 0x60 0x0a # CHECK: lwx $12, $22($gp) + 0x70 0xc7 0x08 0x00 # CHECK: madd $ac1, $6, $7 + 0x71 0x09 0x08 0x01 # CHECK: maddu $ac1, $8, $9 + 0x70 0xc7 0x00 0x00 # CHECK: madd $6, $7 + 0x71 0x09 0x00 0x01 # CHECK: maddu $8, $9 + 0x7c 0x64 0x15 0x30 # CHECK: maq_s.w.phl $ac2, $3, $4 + 0x7c 0xa6 0x1c 0x30 # CHECK: maq_sa.w.phl $ac3, $5, $6 + 0x7c 0xe8 0x05 0xb0 # CHECK: maq_s.w.phr $ac0, $7, $8 + 0x7d 0x2a 0x0c 0xb0 # CHECK: maq_sa.w.phr $ac1, $9, $10 + 0x00 0x20 0x70 0x10 # CHECK: mfhi $14, $ac1 + 0x00 0x20 0x78 0x12 # CHECK: mflo $15, $ac1 + 0x00 0x00 0x70 0x10 # CHECK: mfhi $14 + 0x00 0x00 0x78 0x12 # CHECK: mflo $15 + 0x7d 0x8d 0x5c 0x90 # CHECK: modsub $11, $12, $13 + 0x71 0x4b 0x18 0x04 # CHECK: msub $ac3, $10, $11 + 0x71 0x8d 0x10 0x05 # CHECK: msubu $ac2, $12, $13 + 0x71 0x4b 0x00 0x04 # CHECK: msub $10, $11 + 0x71 0x8d 0x00 0x05 # CHECK: msubu $12, $13 + 0x02 0x00 0x18 0x11 # CHECK: mthi $16, $ac3 + 0x02 0x00 0x00 0x11 # CHECK: mthi $16 + 0x7d 0xc0 0x17 0xf8 # CHECK: mthlip $14, $ac2 + 0x02 0x20 0x10 0x13 # CHECK: mtlo $17, $ac2 + 0x02 0x20 0x00 0x13 # CHECK: mtlo $17 + 0x7e 0xd7 0xaf 0x10 # CHECK: muleq_s.w.phl $21, $22, $23 + 0x7f 0x3a 0xc7 0x50 # CHECK: muleq_s.w.phr $24, $25, $26 + 0x7f 0x9d 0xd9 0x90 # CHECK: muleu_s.ph.qbl $27, $gp, $sp + 0x7f 0xe1 0xf1 0xd0 # CHECK: muleu_s.ph.qbr $fp, $ra, $1 + 0x7c 0x64 0x17 0xd0 # CHECK: mulq_rs.ph $2, $3, $4 + 0x7e 0x11 0x01 0xb0 # CHECK: mulsaq_s.w.ph $ac0, $16, $17 + 0x00 0x43 0x18 0x18 # CHECK: mult $ac3, $2, $3 + 0x00 0x85 0x10 0x19 # CHECK: multu $ac2, $4, $5 + 0x00 0x43 0x00 0x18 # CHECK: mult $2, $3 + 0x00 0x85 0x00 0x19 # CHECK: multu $4, $5 + 0x7e 0x74 0x93 0x91 # CHECK: packrl.ph $18, $19, $20 + 0x7d 0xe3 0x3a 0xd1 # CHECK: pick.ph $7, $15, $3 + 0x7c 0x88 0x10 0xd1 # CHECK: pick.qb $2, $4, 
$8 + 0x7c 0x15 0xa3 0x12 # CHECK: preceq.w.phl $20, $21 + 0x7c 0x16 0xab 0x52 # CHECK: preceq.w.phr $21, $22 + 0x7c 0x17 0xb1 0x12 # CHECK: precequ.ph.qbl $22, $23 + 0x7c 0x19 0xc1 0x92 # CHECK: precequ.ph.qbla $24, $25 + 0x7c 0x18 0xb9 0x52 # CHECK: precequ.ph.qbr $23, $24 + 0x7c 0x1a 0xc9 0xd2 # CHECK: precequ.ph.qbra $25, $26 + 0x7c 0x1b 0xd7 0x12 # CHECK: preceu.ph.qbl $26, $27 + 0x7c 0x1d 0xe7 0x92 # CHECK: preceu.ph.qbla $gp, $sp + 0x7c 0x1c 0xdf 0x52 # CHECK: preceu.ph.qbr $27, $gp + 0x7c 0x1e 0xef 0xd2 # CHECK: preceu.ph.qbra $sp, $fp + 0x7e 0x53 0x8d 0x11 # CHECK: precrq.ph.w $17, $18, $19 + 0x7e 0x32 0x83 0x11 # CHECK: precrq.qb.ph $16, $17, $18 + 0x7e 0x95 0x9b 0xd1 # CHECK: precrqu_s.qb.ph $19, $20, $21 + 0x7e 0x74 0x95 0x51 # CHECK: precrq_rs.ph.w $18, $19, $20 + 0x7c 0x40 0x0d 0x10 # CHECK: raddu.w.qb $1, $2 + 0x7d 0x00 0x2c 0xb8 # CHECK: rddsp $5, 256 + 0x7c 0x0c 0x12 0x92 # CHECK: repl.ph $2, 12 + 0x7c 0x55 0x08 0x92 # CHECK: repl.qb $1, 85 + 0x7c 0x02 0x0a 0xd2 # CHECK: replv.ph $1, $2 + 0x7c 0x02 0x08 0xd2 # CHECK: replv.qb $1, $2 + 0x7d 0x00 0x0e 0xb8 # CHECK: shilo $ac1, 16 + 0x7c 0x40 0x0e 0xf8 # CHECK: shilov $ac1, $2 + 0x7c 0x62 0x0a 0x13 # CHECK: shll.ph $1, $2, 3 + 0x7c 0x62 0x0b 0x13 # CHECK: shll_s.ph $1, $2, 3 + 0x7c 0x62 0x08 0x13 # CHECK: shll.qb $1, $2, 3 + 0x7c 0x62 0x0a 0x93 # CHECK: shllv.ph $1, $2, $3 + 0x7c 0x62 0x0b 0x93 # CHECK: shllv_s.ph $1, $2, $3 + 0x7c 0x62 0x08 0x93 # CHECK: shllv.qb $1, $2, $3 + 0x7c 0x62 0x0d 0x93 # CHECK: shllv_s.w $1, $2, $3 + 0x7c 0x62 0x0d 0x13 # CHECK: shll_s.w $1, $2, 3 + 0x7c 0x22 0x2a 0x53 # CHECK: shra.ph $5, $2, 1 + 0x7c 0x22 0x2b 0x53 # CHECK: shra_r.ph $5, $2, 1 + 0x7c 0x62 0x0a 0xd3 # CHECK: shrav.ph $1, $2, $3 + 0x7c 0x62 0x0b 0xd3 # CHECK: shrav_r.ph $1, $2, $3 + 0x7c 0x62 0x0d 0xd3 # CHECK: shrav_r.w $1, $2, $3 + 0x7c 0x22 0x0d 0x53 # CHECK: shra_r.w $1, $2, 1 + 0x7c 0x42 0x08 0x53 # CHECK: shrl.qb $1, $2, 2 + 0x7c 0x62 0x08 0xd3 # CHECK: shrlv.qb $1, $2, $3 + 0x7c 0x43 0x0a 0xd0 # CHECK: subq.ph $1, $2, $3 + 0x7c 0x43 0x0b 0xd0 # CHECK: subq_s.ph $1, $2, $3 + 0x7c 0x43 0x0d 0xd0 # CHECK: subq_s.w $1, $2, $3 + 0x7c 0x43 0x08 0x50 # CHECK: subu.qb $1, $2, $3 + 0x7c 0x43 0x09 0x50 # CHECK: subu_s.qb $1, $2, $3 + 0x7c 0x20 0x04 0xf8 # CHECK: wrdsp $1, 0 diff --git a/test/MC/Disassembler/Mips/dspr2/valid.txt b/test/MC/Disassembler/Mips/dspr2/valid.txt new file mode 100644 index 000000000000..b1b5a332dc56 --- /dev/null +++ b/test/MC/Disassembler/Mips/dspr2/valid.txt @@ -0,0 +1,173 @@ +# RUN: llvm-mc --disassemble %s -triple=mips-unknown-linux -mattr=dspr2 | FileCheck %s + + 0x7c 0x02 0x0a 0x52 # CHECK: absq_s.ph $1, $2 + 0x7c 0x04 0x18 0x52 # CHECK: absq_s.qb $3, $4 + 0x7c 0x06 0x2c 0x52 # CHECK: absq_s.w $5, $6 + 0x7d 0x09 0x3a 0x90 # CHECK: addq.ph $7, $8, $9 + 0x7d 0x6c 0x53 0x90 # CHECK: addq_s.ph $10, $11, $12 + 0x7d 0xcf 0x6d 0x90 # CHECK: addq_s.w $13, $14, $15 + 0x7e 0x32 0x82 0x18 # CHECK: addqh.ph $16, $17, $18 + 0x7e 0x95 0x9a 0x98 # CHECK: addqh_r.ph $19, $20, $21 + 0x7e 0xf8 0xb4 0x18 # CHECK: addqh.w $22, $23, $24 + 0x7f 0x5b 0xcc 0x98 # CHECK: addqh_r.w $25, $26, $27 + 0x7f 0xbe 0xe4 0x10 # CHECK: addsc $gp, $sp, $fp + 0x7c 0x22 0xfa 0x10 # CHECK: addu.ph $ra, $1, $2 + 0x7c 0x85 0x1b 0x10 # CHECK: addu_s.ph $3, $4, $5 + 0x7c 0xe8 0x30 0x10 # CHECK: addu.qb $6, $7, $8 + 0x7d 0x4b 0x49 0x10 # CHECK: addu_s.qb $9, $10, $11 + 0x7d 0xae 0x64 0x50 # CHECK: addwc $12, $13, $14 + 0x7e 0x11 0x78 0x18 # CHECK: adduh.qb $15, $16, $17 + 0x7e 0x74 0x90 0x98 # CHECK: adduh_r.qb $18, $19, $20 + 0x7e 0xd5 0x00 0x31 # 
CHECK: append $21, $22, 0 + 0x7f 0x17 0x1c 0x31 # CHECK: balign $23, $24, 3 + 0x7c 0x1a 0xce 0xd2 # CHECK: bitrev $25, $26 + 0x04 0x1c 0x14 0x9b # CHECK: bposge32 21104 + 0x7f 0x7c 0x02 0x11 # CHECK: cmp.eq.ph $27, $gp + 0x7f 0xbe 0x02 0x51 # CHECK: cmp.lt.ph $sp, $fp + 0x7f 0xe1 0x02 0x91 # CHECK: cmp.le.ph $ra, $1 + 0x7c 0x64 0x16 0x11 # CHECK: cmpgdu.eq.qb $2, $3, $4 + 0x7c 0xc7 0x2e 0x51 # CHECK: cmpgdu.lt.qb $5, $6, $7 + 0x7d 0x2a 0x46 0x91 # CHECK: cmpgdu.le.qb $8, $9, $10 + 0x7d 0x8d 0x59 0x11 # CHECK: cmpgu.eq.qb $11, $12, $13 + 0x7d 0xf0 0x71 0x51 # CHECK: cmpgu.lt.qb $14, $15, $16 + 0x7e 0x53 0x89 0x91 # CHECK: cmpgu.le.qb $17, $18, $19 + 0x7e 0x95 0x00 0x11 # CHECK: cmpu.eq.qb $20, $21 + 0x7e 0xd7 0x00 0x51 # CHECK: cmpu.lt.qb $22, $23 + 0x7f 0x19 0x00 0x91 # CHECK: cmpu.le.qb $24, $25 + 0x7f 0x5b 0x00 0x30 # CHECK: dpa.w.ph $ac0, $26, $27 + 0x7c 0x22 0x09 0x30 # CHECK: dpaq_s.w.ph $ac1, $1, $2 + 0x7c 0x64 0x13 0x30 # CHECK: dpaq_sa.l.w $ac2, $3, $4 + 0x7c 0xa6 0x1e 0x30 # CHECK: dpaqx_s.w.ph $ac3, $5, $6 + 0x7c 0xe8 0x06 0xb0 # CHECK: dpaqx_sa.w.ph $ac0, $7, $8 + 0x7d 0x2a 0x08 0xf0 # CHECK: dpau.h.qbl $ac1, $9, $10 + 0x7d 0x6c 0x09 0xf0 # CHECK: dpau.h.qbr $ac1, $11, $12 + 0x7d 0xae 0x12 0x30 # CHECK: dpax.w.ph $ac2, $13, $14 + 0x7d 0xf0 0x18 0x70 # CHECK: dps.w.ph $ac3, $15, $16 + 0x7e 0x32 0x01 0x70 # CHECK: dpsq_s.w.ph $ac0, $17, $18 + 0x7e 0x74 0x0b 0x70 # CHECK: dpsq_sa.l.w $ac1, $19, $20 + 0x7c 0x22 0x16 0x70 # CHECK: dpsqx_s.w.ph $ac2, $1, $2 + 0x7c 0x64 0x1e 0xf0 # CHECK: dpsqx_sa.w.ph $ac3, $3, $4 + 0x7c 0xa6 0x02 0xf0 # CHECK: dpsu.h.qbl $ac0, $5, $6 + 0x7c 0xe8 0x0b 0xf0 # CHECK: dpsu.h.qbr $ac1, $7, $8 + 0x7d 0x2a 0x12 0x70 # CHECK: dpsx.w.ph $ac2, $9, $10 + 0x7f 0xe1 0x00 0xb8 # CHECK: extp $1, $ac0, 31 + 0x7c 0x02 0x0a 0xb8 # CHECK: extpdp $2, $ac1, 0 + 0x7c 0x83 0x12 0xf8 # CHECK: extpdpv $3, $ac2, $4 + 0x7c 0xc5 0x18 0xf8 # CHECK: extpv $5, $ac3, $6 + 0x7f 0xe7 0x00 0x38 # CHECK: extr.w $7, $ac0, 31 + 0x7d 0xe8 0x09 0x38 # CHECK: extr_r.w $8, $ac1, 15 + 0x7c 0xe9 0x11 0xb8 # CHECK: extr_rs.w $9, $ac2, 7 + 0x7c 0x6a 0x1b 0xb8 # CHECK: extr_s.h $10, $ac3, 3 + 0x7d 0x8b 0x00 0x78 # CHECK: extrv.w $11, $ac0, $12 + 0x7d 0xcd 0x09 0x78 # CHECK: extrv_r.w $13, $ac1, $14 + 0x7e 0x0f 0x11 0xf8 # CHECK: extrv_rs.w $15, $ac2, $16 + 0x7e 0x51 0x1b 0xf8 # CHECK: extrv_s.h $17, $ac3, $18 + 0x7e 0x93 0x00 0x0c # CHECK: insv $19, $20 + 0x7f 0x54 0x51 0x8a # CHECK: lbux $10, $20($26) + 0x7f 0x75 0x59 0x0a # CHECK: lhx $11, $21($27) + 0x7f 0x96 0x60 0x0a # CHECK: lwx $12, $22($gp) + 0x70 0xc7 0x08 0x00 # CHECK: madd $ac1, $6, $7 + 0x71 0x09 0x08 0x01 # CHECK: maddu $ac1, $8, $9 + 0x70 0xc7 0x00 0x00 # CHECK: madd $6, $7 + 0x71 0x09 0x00 0x01 # CHECK: maddu $8, $9 + 0x7c 0x64 0x15 0x30 # CHECK: maq_s.w.phl $ac2, $3, $4 + 0x7c 0xa6 0x1c 0x30 # CHECK: maq_sa.w.phl $ac3, $5, $6 + 0x7c 0xe8 0x05 0xb0 # CHECK: maq_s.w.phr $ac0, $7, $8 + 0x7d 0x2a 0x0c 0xb0 # CHECK: maq_sa.w.phr $ac1, $9, $10 + 0x00 0x20 0x70 0x10 # CHECK: mfhi $14, $ac1 + 0x00 0x20 0x78 0x12 # CHECK: mflo $15, $ac1 + 0x00 0x00 0x70 0x10 # CHECK: mfhi $14 + 0x00 0x00 0x78 0x12 # CHECK: mflo $15 + 0x7d 0x8d 0x5c 0x90 # CHECK: modsub $11, $12, $13 + 0x71 0x4b 0x18 0x04 # CHECK: msub $ac3, $10, $11 + 0x71 0x8d 0x10 0x05 # CHECK: msubu $ac2, $12, $13 + 0x71 0x4b 0x00 0x04 # CHECK: msub $10, $11 + 0x71 0x8d 0x00 0x05 # CHECK: msubu $12, $13 + 0x02 0x00 0x18 0x11 # CHECK: mthi $16, $ac3 + 0x02 0x00 0x00 0x11 # CHECK: mthi $16 + 0x7d 0xc0 0x17 0xf8 # CHECK: mthlip $14, $ac2 + 0x02 0x20 0x10 0x13 # CHECK: mtlo $17, $ac2 + 0x02 
0x20 0x00 0x13 # CHECK: mtlo $17 + 0x7e 0x11 0x7b 0x18 # CHECK: mul.ph $15, $16, $17 + 0x7e 0x74 0x93 0x98 # CHECK: mul_s.ph $18, $19, $20 + 0x7e 0xd7 0xaf 0x10 # CHECK: muleq_s.w.phl $21, $22, $23 + 0x7f 0x3a 0xc7 0x50 # CHECK: muleq_s.w.phr $24, $25, $26 + 0x7f 0x9d 0xd9 0x90 # CHECK: muleu_s.ph.qbl $27, $gp, $sp + 0x7f 0xe1 0xf1 0xd0 # CHECK: muleu_s.ph.qbr $fp, $ra, $1 + 0x7c 0x64 0x17 0xd0 # CHECK: mulq_rs.ph $2, $3, $4 + 0x7c 0xc7 0x2d 0xd8 # CHECK: mulq_rs.w $5, $6, $7 + 0x7d 0x2a 0x47 0x90 # CHECK: mulq_s.ph $8, $9, $10 + 0x7d 0x8d 0x5d 0x98 # CHECK: mulq_s.w $11, $12, $13 + 0x7d 0xcf 0x18 0xb0 # CHECK: mulsa.w.ph $ac3, $14, $15 + 0x7e 0x11 0x01 0xb0 # CHECK: mulsaq_s.w.ph $ac0, $16, $17 + 0x00 0x43 0x18 0x18 # CHECK: mult $ac3, $2, $3 + 0x00 0x85 0x10 0x19 # CHECK: multu $ac2, $4, $5 + 0x00 0x43 0x00 0x18 # CHECK: mult $2, $3 + 0x00 0x85 0x00 0x19 # CHECK: multu $4, $5 + 0x7e 0x74 0x93 0x91 # CHECK: packrl.ph $18, $19, $20 + 0x7d 0xe3 0x3a 0xd1 # CHECK: pick.ph $7, $15, $3 + 0x7c 0x88 0x10 0xd1 # CHECK: pick.qb $2, $4, $8 + 0x7c 0x15 0xa3 0x12 # CHECK: preceq.w.phl $20, $21 + 0x7c 0x16 0xab 0x52 # CHECK: preceq.w.phr $21, $22 + 0x7c 0x17 0xb1 0x12 # CHECK: precequ.ph.qbl $22, $23 + 0x7c 0x19 0xc1 0x92 # CHECK: precequ.ph.qbla $24, $25 + 0x7c 0x18 0xb9 0x52 # CHECK: precequ.ph.qbr $23, $24 + 0x7c 0x1a 0xc9 0xd2 # CHECK: precequ.ph.qbra $25, $26 + 0x7c 0x1b 0xd7 0x12 # CHECK: preceu.ph.qbl $26, $27 + 0x7c 0x1d 0xe7 0x92 # CHECK: preceu.ph.qbla $gp, $sp + 0x7c 0x1c 0xdf 0x52 # CHECK: preceu.ph.qbr $27, $gp + 0x7c 0x1e 0xef 0xd2 # CHECK: preceu.ph.qbra $sp, $fp + 0x7f 0x19 0xbb 0x51 # CHECK: precr.qb.ph $23, $24, $25 + 0x7f 0x38 0x07 0x91 # CHECK: precr_sra.ph.w $24, $25, 0 + 0x7f 0x38 0xff 0x91 # CHECK: precr_sra.ph.w $24, $25, 31 + 0x7f 0x59 0x07 0xd1 # CHECK: precr_sra_r.ph.w $25, $26, 0 + 0x7f 0x59 0xff 0xd1 # CHECK: precr_sra_r.ph.w $25, $26, 31 + 0x7e 0x53 0x8d 0x11 # CHECK: precrq.ph.w $17, $18, $19 + 0x7e 0x32 0x83 0x11 # CHECK: precrq.qb.ph $16, $17, $18 + 0x7e 0x95 0x9b 0xd1 # CHECK: precrqu_s.qb.ph $19, $20, $21 + 0x7e 0x74 0x95 0x51 # CHECK: precrq_rs.ph.w $18, $19, $20 + 0x7c 0x41 0x18 0x71 # CHECK: prepend $1, $2, 3 + 0x7c 0x40 0x0d 0x10 # CHECK: raddu.w.qb $1, $2 + 0x7d 0x00 0x2c 0xb8 # CHECK: rddsp $5, 256 + 0x7c 0x0c 0x12 0x92 # CHECK: repl.ph $2, 12 + 0x7c 0x55 0x08 0x92 # CHECK: repl.qb $1, 85 + 0x7c 0x02 0x0a 0xd2 # CHECK: replv.ph $1, $2 + 0x7c 0x02 0x08 0xd2 # CHECK: replv.qb $1, $2 + 0x7d 0x00 0x0e 0xb8 # CHECK: shilo $ac1, 16 + 0x7c 0x40 0x0e 0xf8 # CHECK: shilov $ac1, $2 + 0x7c 0x62 0x0a 0x13 # CHECK: shll.ph $1, $2, 3 + 0x7c 0x62 0x0b 0x13 # CHECK: shll_s.ph $1, $2, 3 + 0x7c 0x62 0x08 0x13 # CHECK: shll.qb $1, $2, 3 + 0x7c 0x62 0x0a 0x93 # CHECK: shllv.ph $1, $2, $3 + 0x7c 0x62 0x0b 0x93 # CHECK: shllv_s.ph $1, $2, $3 + 0x7c 0x62 0x08 0x93 # CHECK: shllv.qb $1, $2, $3 + 0x7c 0x62 0x0d 0x93 # CHECK: shllv_s.w $1, $2, $3 + 0x7c 0x62 0x0d 0x13 # CHECK: shll_s.w $1, $2, 3 + 0x7c 0x50 0x11 0x13 # CHECK: shra.qb $2, $16, 2 + 0x7c 0x50 0x11 0x53 # CHECK: shra_r.qb $2, $16, 2 + 0x7c 0x22 0x2a 0x53 # CHECK: shra.ph $5, $2, 1 + 0x7c 0x22 0x2b 0x53 # CHECK: shra_r.ph $5, $2, 1 + 0x7c 0x62 0x0a 0xd3 # CHECK: shrav.ph $1, $2, $3 + 0x7c 0x62 0x0b 0xd3 # CHECK: shrav_r.ph $1, $2, $3 + 0x7c 0x62 0x09 0x93 # CHECK: shrav.qb $1, $2, $3 + 0x7c 0x62 0x09 0xd3 # CHECK: shrav_r.qb $1, $2, $3 + 0x7c 0x62 0x0d 0xd3 # CHECK: shrav_r.w $1, $2, $3 + 0x7c 0x22 0x0d 0x53 # CHECK: shra_r.w $1, $2, 1 + 0x7c 0x42 0x0e 0x53 # CHECK: shrl.ph $1, $2, 2 + 0x7c 0x42 0x08 0x53 # CHECK: shrl.qb 
$1, $2, 2 + 0x7c 0x62 0x0e 0xd3 # CHECK: shrlv.ph $1, $2, $3 + 0x7c 0x62 0x08 0xd3 # CHECK: shrlv.qb $1, $2, $3 + 0x7c 0x43 0x0a 0xd0 # CHECK: subq.ph $1, $2, $3 + 0x7c 0x43 0x0b 0xd0 # CHECK: subq_s.ph $1, $2, $3 + 0x7c 0x43 0x0d 0xd0 # CHECK: subq_s.w $1, $2, $3 + 0x7c 0x43 0x0a 0x58 # CHECK: subqh.ph $1, $2, $3 + 0x7c 0x43 0x0a 0xd8 # CHECK: subqh_r.ph $1, $2, $3 + 0x7c 0x43 0x0c 0x58 # CHECK: subqh.w $1, $2, $3 + 0x7c 0x43 0x0c 0xd8 # CHECK: subqh_r.w $1, $2, $3 + 0x7c 0x49 0x32 0x50 # CHECK: subu.ph $6, $2, $9 + 0x7c 0x64 0x13 0x50 # CHECK: subu_s.ph $2, $3, $4 + 0x7c 0x43 0x08 0x50 # CHECK: subu.qb $1, $2, $3 + 0x7c 0x43 0x09 0x50 # CHECK: subu_s.qb $1, $2, $3 + 0x7c 0x43 0x08 0x58 # CHECK: subuh.qb $1, $2, $3 + 0x7c 0x43 0x08 0xd8 # CHECK: subuh_r.qb $1, $2, $3 + 0x7c 0x20 0x04 0xf8 # CHECK: wrdsp $1, 0 diff --git a/test/MC/Mips/dsp/valid.s b/test/MC/Mips/dsp/valid.s new file mode 100644 index 000000000000..804669c5e464 --- /dev/null +++ b/test/MC/Mips/dsp/valid.s @@ -0,0 +1,127 @@ +# RUN: llvm-mc -show-encoding -triple=mips-unknown-unknown -mattr=dsp %s | FileCheck %s +# +# CHECK: .text + .set noat + absq_s.ph $1, $2 # CHECK: absq_s.ph $1, $2 # encoding: [0x7c,0x02,0x0a,0x52] + absq_s.w $5, $6 # CHECK: absq_s.w $5, $6 # encoding: [0x7c,0x06,0x2c,0x52] + addq.ph $7, $8, $9 # CHECK: addq.ph $7, $8, $9 # encoding: [0x7d,0x09,0x3a,0x90] + addq_s.ph $10, $11, $12 # CHECK: addq_s.ph $10, $11, $12 # encoding: [0x7d,0x6c,0x53,0x90] + addq_s.w $13, $14, $15 # CHECK: addq_s.w $13, $14, $15 # encoding: [0x7d,0xcf,0x6d,0x90] + addsc $gp, $sp, $fp # CHECK: addsc $gp, $sp, $fp # encoding: [0x7f,0xbe,0xe4,0x10] + addu.qb $6, $7, $8 # CHECK: addu.qb $6, $7, $8 # encoding: [0x7c,0xe8,0x30,0x10] + addu_s.qb $9, $10, $11 # CHECK: addu_s.qb $9, $10, $11 # encoding: [0x7d,0x4b,0x49,0x10] + addwc $12, $13, $14 # CHECK: addwc $12, $13, $14 # encoding: [0x7d,0xae,0x64,0x50] + bitrev $25, $26 # CHECK: bitrev $25, $26 # encoding: [0x7c,0x1a,0xce,0xd2] + bposge32 21100 # CHECK: bposge32 21100 # encoding: [0x04,0x1c,0x14,0x9b] + cmp.eq.ph $27, $gp # CHECK: cmp.eq.ph $27, $gp # encoding: [0x7f,0x7c,0x02,0x11] + cmp.lt.ph $sp, $fp # CHECK: cmp.lt.ph $sp, $fp # encoding: [0x7f,0xbe,0x02,0x51] + cmp.le.ph $ra, $1 # CHECK: cmp.le.ph $ra, $1 # encoding: [0x7f,0xe1,0x02,0x91] + cmpgu.eq.qb $11, $12, $13 # CHECK: cmpgu.eq.qb $11, $12, $13 # encoding: [0x7d,0x8d,0x59,0x11] + cmpgu.lt.qb $14, $15, $16 # CHECK: cmpgu.lt.qb $14, $15, $16 # encoding: [0x7d,0xf0,0x71,0x51] + cmpgu.le.qb $17, $18, $19 # CHECK: cmpgu.le.qb $17, $18, $19 # encoding: [0x7e,0x53,0x89,0x91] + cmpu.eq.qb $20, $21 # CHECK: cmpu.eq.qb $20, $21 # encoding: [0x7e,0x95,0x00,0x11] + cmpu.lt.qb $22, $23 # CHECK: cmpu.lt.qb $22, $23 # encoding: [0x7e,0xd7,0x00,0x51] + cmpu.le.qb $24, $25 # CHECK: cmpu.le.qb $24, $25 # encoding: [0x7f,0x19,0x00,0x91] + dpaq_s.w.ph $ac1, $1, $2 # CHECK: dpaq_s.w.ph $ac1, $1, $2 # encoding: [0x7c,0x22,0x09,0x30] + dpaq_sa.l.w $ac2, $3, $4 # CHECK: dpaq_sa.l.w $ac2, $3, $4 # encoding: [0x7c,0x64,0x13,0x30] + dpau.h.qbl $ac1, $9, $10 # CHECK: dpau.h.qbl $ac1, $9, $10 # encoding: [0x7d,0x2a,0x08,0xf0] + dpau.h.qbr $ac1, $11, $12 # CHECK: dpau.h.qbr $ac1, $11, $12 # encoding: [0x7d,0x6c,0x09,0xf0] + dpsq_s.w.ph $ac0, $17, $18 # CHECK: dpsq_s.w.ph $ac0, $17, $18 # encoding: [0x7e,0x32,0x01,0x70] + dpsq_sa.l.w $ac1, $19, $20 # CHECK: dpsq_sa.l.w $ac1, $19, $20 # encoding: [0x7e,0x74,0x0b,0x70] + dpsu.h.qbl $ac0, $5, $6 # CHECK: dpsu.h.qbl $ac0, $5, $6 # encoding: [0x7c,0xa6,0x02,0xf0] + dpsu.h.qbr $ac1, $7, $8 # CHECK: dpsu.h.qbr 
$ac1, $7, $8 # encoding: [0x7c,0xe8,0x0b,0xf0] + extp $1, $ac0, 31 # CHECK: extp $1, $ac0, 31 # encoding: [0x7f,0xe1,0x00,0xb8] + extpdp $2, $ac1, 0 # CHECK: extpdp $2, $ac1, 0 # encoding: [0x7c,0x02,0x0a,0xb8] + extpdpv $3, $ac2, $4 # CHECK: extpdpv $3, $ac2, $4 # encoding: [0x7c,0x83,0x12,0xf8] + extpv $5, $ac3, $6 # CHECK: extpv $5, $ac3, $6 # encoding: [0x7c,0xc5,0x18,0xf8] + extr.w $7, $ac0, 31 # CHECK: extr.w $7, $ac0, 31 # encoding: [0x7f,0xe7,0x00,0x38] + extr_r.w $8, $ac1, 15 # CHECK: extr_r.w $8, $ac1, 15 # encoding: [0x7d,0xe8,0x09,0x38] + extr_rs.w $9, $ac2, 7 # CHECK: extr_rs.w $9, $ac2, 7 # encoding: [0x7c,0xe9,0x11,0xb8] + extr_s.h $10, $ac3, 3 # CHECK: extr_s.h $10, $ac3, 3 # encoding: [0x7c,0x6a,0x1b,0xb8] + extrv.w $11, $ac0, $12 # CHECK: extrv.w $11, $ac0, $12 # encoding: [0x7d,0x8b,0x00,0x78] + extrv_r.w $13, $ac1, $14 # CHECK: extrv_r.w $13, $ac1, $14 # encoding: [0x7d,0xcd,0x09,0x78] + extrv_rs.w $15, $ac2, $16 # CHECK: extrv_rs.w $15, $ac2, $16 # encoding: [0x7e,0x0f,0x11,0xf8] + extrv_s.h $17, $ac3, $18 # CHECK: extrv_s.h $17, $ac3, $18 # encoding: [0x7e,0x51,0x1b,0xf8] + insv $19, $20 # CHECK: insv $19, $20 # encoding: [0x7e,0x93,0x00,0x0c] + lbux $10, $20($26) # CHECK: lbux $10, $20($26) # encoding: [0x7f,0x54,0x51,0x8a] + lhx $11, $21($27) # CHECK: lhx $11, $21($27) # encoding: [0x7f,0x75,0x59,0x0a] + lwx $12, $22($gp) # CHECK: lwx $12, $22($gp) # encoding: [0x7f,0x96,0x60,0x0a] + madd $ac1, $6, $7 # CHECK: madd $ac1, $6, $7 # encoding: [0x70,0xc7,0x08,0x00] + maddu $ac0, $8, $9 # CHECK: maddu $ac0, $8, $9 # encoding: [0x71,0x09,0x00,0x01] + madd $6, $7 # CHECK: madd $6, $7 # encoding: [0x70,0xc7,0x00,0x00] + maddu $8, $9 # CHECK: maddu $8, $9 # encoding: [0x71,0x09,0x00,0x01] + maq_s.w.phl $ac2, $3, $4 # CHECK: maq_s.w.phl $ac2, $3, $4 # encoding: [0x7c,0x64,0x15,0x30] + maq_sa.w.phl $ac3, $5, $6 # CHECK: maq_sa.w.phl $ac3, $5, $6 # encoding: [0x7c,0xa6,0x1c,0x30] + maq_s.w.phr $ac0, $7, $8 # CHECK: maq_s.w.phr $ac0, $7, $8 # encoding: [0x7c,0xe8,0x05,0xb0] + maq_sa.w.phr $ac1, $9, $10 # CHECK: maq_sa.w.phr $ac1, $9, $10 # encoding: [0x7d,0x2a,0x0c,0xb0] + mfhi $14, $ac1 # CHECK: mfhi $14, $ac1 # encoding: [0x00,0x20,0x70,0x10] + mflo $15, $ac0 # CHECK: mflo $15, $ac0 # encoding: [0x00,0x00,0x78,0x12] + mfhi $14 # CHECK: mfhi $14 # encoding: [0x00,0x00,0x70,0x10] + mflo $15 # CHECK: mflo $15 # encoding: [0x00,0x00,0x78,0x12] + modsub $11, $12, $13 # CHECK: modsub $11, $12, $13 # encoding: [0x7d,0x8d,0x5c,0x90] + msub $ac3, $10, $11 # CHECK: msub $ac3, $10, $11 # encoding: [0x71,0x4b,0x18,0x04] + msubu $ac2, $12, $13 # CHECK: msubu $ac2, $12, $13 # encoding: [0x71,0x8d,0x10,0x05] + msub $10, $11 # CHECK: msub $10, $11 # encoding: [0x71,0x4b,0x00,0x04] + msubu $12, $13 # CHECK: msubu $12, $13 # encoding: [0x71,0x8d,0x00,0x05] + mthi $16, $ac3 # CHECK: mthi $16, $ac3 # encoding: [0x02,0x00,0x18,0x11] + mthi $16 # CHECK: mthi $16 # encoding: [0x02,0x00,0x00,0x11] + mthlip $14, $ac2 # CHECK: mthlip $14, $ac2 # encoding: [0x7d,0xc0,0x17,0xf8] + mtlo $17, $ac2 # CHECK: mtlo $17, $ac2 # encoding: [0x02,0x20,0x10,0x13] + mtlo $17 # CHECK: mtlo $17 # encoding: [0x02,0x20,0x00,0x13] + muleq_s.w.phl $21, $22, $23 # CHECK: muleq_s.w.phl $21, $22, $23 # encoding: [0x7e,0xd7,0xaf,0x10] + muleq_s.w.phr $24, $25, $26 # CHECK: muleq_s.w.phr $24, $25, $26 # encoding: [0x7f,0x3a,0xc7,0x50] + muleu_s.ph.qbl $27, $gp, $sp # CHECK: muleu_s.ph.qbl $27, $gp, $sp # encoding: [0x7f,0x9d,0xd9,0x90] + muleu_s.ph.qbr $fp, $ra, $1 # CHECK: muleu_s.ph.qbr $fp, $ra, $1 # encoding: 
[0x7f,0xe1,0xf1,0xd0] + mulq_rs.ph $2, $3, $4 # CHECK: mulq_rs.ph $2, $3, $4 # encoding: [0x7c,0x64,0x17,0xd0] + mulsaq_s.w.ph $ac0, $16, $17 # CHECK: mulsaq_s.w.ph $ac0, $16, $17 # encoding: [0x7e,0x11,0x01,0xb0] + mult $ac3, $2, $3 # CHECK: mult $ac3, $2, $3 # encoding: [0x00,0x43,0x18,0x18] + multu $ac2, $4, $5 # CHECK: multu $ac2, $4, $5 # encoding: [0x00,0x85,0x10,0x19] + mult $2, $3 # CHECK: mult $2, $3 # encoding: [0x00,0x43,0x00,0x18] + multu $4, $5 # CHECK: multu $4, $5 # encoding: [0x00,0x85,0x00,0x19] + packrl.ph $18, $19, $20 # CHECK: packrl.ph $18, $19, $20 # encoding: [0x7e,0x74,0x93,0x91] + pick.ph $7, $15, $3 # CHECK: pick.ph $7, $15, $3 # encoding: [0x7d,0xe3,0x3a,0xd1] + pick.qb $2, $4, $8 # CHECK: pick.qb $2, $4, $8 # encoding: [0x7c,0x88,0x10,0xd1] + preceq.w.phl $20, $21 # CHECK: preceq.w.phl $20, $21 # encoding: [0x7c,0x15,0xa3,0x12] + preceq.w.phr $21, $22 # CHECK: preceq.w.phr $21, $22 # encoding: [0x7c,0x16,0xab,0x52] + precequ.ph.qbl $22, $23 # CHECK: precequ.ph.qbl $22, $23 # encoding: [0x7c,0x17,0xb1,0x12] + precequ.ph.qbla $24, $25 # CHECK: precequ.ph.qbla $24, $25 # encoding: [0x7c,0x19,0xc1,0x92] + precequ.ph.qbr $23, $24 # CHECK: precequ.ph.qbr $23, $24 # encoding: [0x7c,0x18,0xb9,0x52] + precequ.ph.qbra $25, $26 # CHECK: precequ.ph.qbra $25, $26 # encoding: [0x7c,0x1a,0xc9,0xd2] + preceu.ph.qbl $26, $27 # CHECK: preceu.ph.qbl $26, $27 # encoding: [0x7c,0x1b,0xd7,0x12] + preceu.ph.qbla $gp, $sp # CHECK: preceu.ph.qbla $gp, $sp # encoding: [0x7c,0x1d,0xe7,0x92] + preceu.ph.qbr $27, $gp # CHECK: preceu.ph.qbr $27, $gp # encoding: [0x7c,0x1c,0xdf,0x52] + preceu.ph.qbra $sp, $fp # CHECK: preceu.ph.qbra $sp, $fp # encoding: [0x7c,0x1e,0xef,0xd2] + precrq.ph.w $17, $18, $19 # CHECK: precrq.ph.w $17, $18, $19 # encoding: [0x7e,0x53,0x8d,0x11] + precrq.qb.ph $16, $17, $18 # CHECK: precrq.qb.ph $16, $17, $18 # encoding: [0x7e,0x32,0x83,0x11] + precrqu_s.qb.ph $19, $20, $21 # CHECK: precrqu_s.qb.ph $19, $20, $21 # encoding: [0x7e,0x95,0x9b,0xd1] + precrq_rs.ph.w $18, $19, $20 # CHECK: precrq_rs.ph.w $18, $19, $20 # encoding: [0x7e,0x74,0x95,0x51] + raddu.w.qb $1, $2 # CHECK: raddu.w.qb $1, $2 # encoding: [0x7c,0x40,0x0d,0x10] + rddsp $5, 256 # CHECK: rddsp $5, 256 # encoding: [0x7d,0x00,0x2c,0xb8] + repl.ph $2, 12 # CHECK: repl.ph $2, 12 # encoding: [0x7c,0x0c,0x12,0x92] + repl.qb $1, 85 # CHECK: repl.qb $1, 85 # encoding: [0x7c,0x55,0x08,0x92] + replv.ph $1, $2 # CHECK: replv.ph $1, $2 # encoding: [0x7c,0x02,0x0a,0xd2] + replv.qb $1, $2 # CHECK: replv.qb $1, $2 # encoding: [0x7c,0x02,0x08,0xd2] + shilo $ac1, 16 # CHECK: shilo $ac1, 16 # encoding: [0x7d,0x00,0x0e,0xb8] + shilov $ac1, $2 # CHECK: shilov $ac1, $2 # encoding: [0x7c,0x40,0x0e,0xf8] + shll.ph $1, $2, 3 # CHECK: shll.ph $1, $2, 3 # encoding: [0x7c,0x62,0x0a,0x13] + shll_s.ph $1, $2, 3 # CHECK: shll_s.ph $1, $2, 3 # encoding: [0x7c,0x62,0x0b,0x13] + shll.qb $1, $2, 3 # CHECK: shll.qb $1, $2, 3 # encoding: [0x7c,0x62,0x08,0x13] + shllv.ph $1, $2, $3 # CHECK: shllv.ph $1, $2, $3 # encoding: [0x7c,0x62,0x0a,0x93] + shllv_s.ph $1, $2, $3 # CHECK: shllv_s.ph $1, $2, $3 # encoding: [0x7c,0x62,0x0b,0x93] + shllv.qb $1, $2, $3 # CHECK: shllv.qb $1, $2, $3 # encoding: [0x7c,0x62,0x08,0x93] + shllv_s.w $1, $2, $3 # CHECK: shllv_s.w $1, $2, $3 # encoding: [0x7c,0x62,0x0d,0x93] + shll_s.w $1, $2, 3 # CHECK: shll_s.w $1, $2, 3 # encoding: [0x7c,0x62,0x0d,0x13] + shra.ph $5, $2, 1 # CHECK: shra.ph $5, $2, 1 # encoding: [0x7c,0x22,0x2a,0x53] + shra_r.ph $5, $2, 1 # CHECK: shra_r.ph $5, $2, 1 # encoding: 
[0x7c,0x22,0x2b,0x53] + shrav.ph $1, $2, $3 # CHECK: shrav.ph $1, $2, $3 # encoding: [0x7c,0x62,0x0a,0xd3] + shrav_r.ph $1, $2, $3 # CHECK: shrav_r.ph $1, $2, $3 # encoding: [0x7c,0x62,0x0b,0xd3] + shrav_r.w $1, $2, $3 # CHECK: shrav_r.w $1, $2, $3 # encoding: [0x7c,0x62,0x0d,0xd3] + shra_r.w $1, $2, 1 # CHECK: shra_r.w $1, $2, 1 # encoding: [0x7c,0x22,0x0d,0x53] + shrl.qb $1, $2, 2 # CHECK: shrl.qb $1, $2, 2 # encoding: [0x7c,0x42,0x08,0x53] + shrlv.qb $1, $2, $3 # CHECK: shrlv.qb $1, $2, $3 # encoding: [0x7c,0x62,0x08,0xd3] + subq.ph $1, $2, $3 # CHECK: subq.ph $1, $2, $3 # encoding: [0x7c,0x43,0x0a,0xd0] + subq_s.ph $1, $2, $3 # CHECK: subq_s.ph $1, $2, $3 # encoding: [0x7c,0x43,0x0b,0xd0] + subq_s.w $1, $2, $3 # CHECK: subq_s.w $1, $2, $3 # encoding: [0x7c,0x43,0x0d,0xd0] + subu.qb $1, $2, $3 # CHECK: subu.qb $1, $2, $3 # encoding: [0x7c,0x43,0x08,0x50] + subu_s.qb $1, $2, $3 # CHECK: subu_s.qb $1, $2, $3 # encoding: [0x7c,0x43,0x09,0x50] + wrdsp $1, 0 # CHECK: wrdsp $1, 0 # encoding: [0x7c,0x20,0x04,0xf8] diff --git a/test/MC/Mips/dspr2/valid.s b/test/MC/Mips/dspr2/valid.s index d86081ec464c..ce9bd7309d7f 100644 --- a/test/MC/Mips/dspr2/valid.s +++ b/test/MC/Mips/dspr2/valid.s @@ -1,48 +1,175 @@ # RUN: llvm-mc -show-encoding -triple=mips-unknown-unknown -mattr=dspr2 %s | FileCheck %s # # CHECK: .text - precrq.qb.ph $16,$17,$18 # CHECK: precrq.qb.ph $16, $17, $18 # encoding: [0x7e,0x32,0x83,0x11] - precrq.ph.w $17,$18,$19 # CHECK: precrq.ph.w $17, $18, $19 # encoding: [0x7e,0x53,0x8d,0x11] - precrq_rs.ph.w $18,$19,$20 # CHECK: precrq_rs.ph.w $18, $19, $20 # encoding: [0x7e,0x74,0x95,0x51] - precrqu_s.qb.ph $19,$20,$21 # CHECK: precrqu_s.qb.ph $19, $20, $21 # encoding: [0x7e,0x95,0x9b,0xd1] - preceq.w.phl $20,$21 # CHECK: preceq.w.phl $20, $21 # encoding: [0x7c,0x15,0xa3,0x12] - preceq.w.phr $21,$22 # CHECK: preceq.w.phr $21, $22 # encoding: [0x7c,0x16,0xab,0x52] - precequ.ph.qbl $22,$23 # CHECK: precequ.ph.qbl $22, $23 # encoding: [0x7c,0x17,0xb1,0x12] - precequ.ph.qbr $23,$24 # CHECK: precequ.ph.qbr $23, $24 # encoding: [0x7c,0x18,0xb9,0x52] - precequ.ph.qbla $24,$25 # CHECK: precequ.ph.qbla $24, $25 # encoding: [0x7c,0x19,0xc1,0x92] - precequ.ph.qbra $25,$26 # CHECK: precequ.ph.qbra $25, $26 # encoding: [0x7c,0x1a,0xc9,0xd2] - preceu.ph.qbl $26,$27 # CHECK: preceu.ph.qbl $26, $27 # encoding: [0x7c,0x1b,0xd7,0x12] - preceu.ph.qbr $27,$28 # CHECK: preceu.ph.qbr $27, $gp # encoding: [0x7c,0x1c,0xdf,0x52] - preceu.ph.qbla $28,$29 # CHECK: preceu.ph.qbla $gp, $sp # encoding: [0x7c,0x1d,0xe7,0x92] - preceu.ph.qbra $29,$30 # CHECK: preceu.ph.qbra $sp, $fp # encoding: [0x7c,0x1e,0xef,0xd2] - precr.qb.ph $23,$24,$25 # CHECK: precr.qb.ph $23, $24, $25 # encoding: [0x7f,0x19,0xbb,0x51] - precr_sra.ph.w $24,$25,0 # CHECK: precr_sra.ph.w $24, $25, 0 # encoding: [0x7f,0x38,0x07,0x91] - precr_sra.ph.w $24,$25,31 # CHECK: precr_sra.ph.w $24, $25, 31 # encoding: [0x7f,0x38,0xff,0x91] - precr_sra_r.ph.w $25,$26,0 # CHECK: precr_sra_r.ph.w $25, $26, 0 # encoding: [0x7f,0x59,0x07,0xd1] - precr_sra_r.ph.w $25,$26,31 # CHECK: precr_sra_r.ph.w $25, $26, 31 # encoding: [0x7f,0x59,0xff,0xd1] - lbux $10, $s4($26) # CHECK: lbux $10, $20($26) # encoding: [0x7f,0x54,0x51,0x8a] - lhx $11, $s5($27) # CHECK: lhx $11, $21($27) # encoding: [0x7f,0x75,0x59,0x0a] - lwx $12, $s6($28) # CHECK: lwx $12, $22($gp) # encoding: [0x7f,0x96,0x60,0x0a] - mult $ac3, $2, $3 # CHECK: mult $ac3, $2, $3 # encoding: [0x00,0x43,0x18,0x18] - multu $ac2, $4, $5 # CHECK: multu $ac2, $4, $5 # encoding: [0x00,0x85,0x10,0x19] - madd $ac1, 
$6, $7 # CHECK: madd $ac1, $6, $7 # encoding: [0x70,0xc7,0x08,0x00] - maddu $ac0, $8, $9 # CHECK: maddu $ac0, $8, $9 # encoding: [0x71,0x09,0x00,0x01] - msub $ac3, $10, $11 # CHECK: msub $ac3, $10, $11 # encoding: [0x71,0x4b,0x18,0x04] - msubu $ac2, $12, $13 # CHECK: msubu $ac2, $12, $13 # encoding: [0x71,0x8d,0x10,0x05] - mfhi $14, $ac1 # CHECK: mfhi $14, $ac1 # encoding: [0x00,0x20,0x70,0x10] - mflo $15, $ac0 # CHECK: mflo $15, $ac0 # encoding: [0x00,0x00,0x78,0x12] - mthi $16, $ac3 # CHECK: mthi $16, $ac3 # encoding: [0x02,0x00,0x18,0x11] - mtlo $17, $ac2 # CHECK: mtlo $17, $ac2 # encoding: [0x02,0x20,0x10,0x13] - mult $2, $3 # CHECK: mult $2, $3 # encoding: [0x00,0x43,0x00,0x18] - multu $4, $5 # CHECK: multu $4, $5 # encoding: [0x00,0x85,0x00,0x19] - madd $6, $7 # CHECK: madd $6, $7 # encoding: [0x70,0xc7,0x00,0x00] - maddu $8, $9 # CHECK: maddu $8, $9 # encoding: [0x71,0x09,0x00,0x01] - msub $10, $11 # CHECK: msub $10, $11 # encoding: [0x71,0x4b,0x00,0x04] - msubu $12, $13 # CHECK: msubu $12, $13 # encoding: [0x71,0x8d,0x00,0x05] - mfhi $14 # CHECK: mfhi $14 # encoding: [0x00,0x00,0x70,0x10] - mflo $15 # CHECK: mflo $15 # encoding: [0x00,0x00,0x78,0x12] - mthi $16 # CHECK: mthi $16 # encoding: [0x02,0x00,0x00,0x11] - mtlo $17 # CHECK: mtlo $17 # encoding: [0x02,0x20,0x00,0x13] - append $2, $3, 3 # CHECK: append $2, $3, 3 # encoding: [0x7c,0x62,0x18,0x31] - balign $4, $5, 1 # CHECK: balign $4, $5, 1 # encoding: [0x7c,0xa4,0x0c,0x31] - prepend $6, $7, 4 # CHECK: prepend $6, $7, 4 # encoding: [0x7c,0xe6,0x20,0x71] + .set noat + absq_s.ph $1, $2 # CHECK: absq_s.ph $1, $2 # encoding: [0x7c,0x02,0x0a,0x52] + absq_s.qb $3, $4 # CHECK: absq_s.qb $3, $4 # encoding: [0x7c,0x04,0x18,0x52] + absq_s.w $5, $6 # CHECK: absq_s.w $5, $6 # encoding: [0x7c,0x06,0x2c,0x52] + addq.ph $7, $8, $9 # CHECK: addq.ph $7, $8, $9 # encoding: [0x7d,0x09,0x3a,0x90] + addq_s.ph $10, $11, $12 # CHECK: addq_s.ph $10, $11, $12 # encoding: [0x7d,0x6c,0x53,0x90] + addq_s.w $13, $14, $15 # CHECK: addq_s.w $13, $14, $15 # encoding: [0x7d,0xcf,0x6d,0x90] + addqh.ph $16, $17, $18 # CHECK: addqh.ph $16, $17, $18 # encoding: [0x7e,0x32,0x82,0x18] + addqh_r.ph $19, $20, $21 # CHECK: addqh_r.ph $19, $20, $21 # encoding: [0x7e,0x95,0x9a,0x98] + addqh.w $22, $23, $24 # CHECK: addqh.w $22, $23, $24 # encoding: [0x7e,0xf8,0xb4,0x18] + addqh_r.w $25, $26, $27 # CHECK: addqh_r.w $25, $26, $27 # encoding: [0x7f,0x5b,0xcc,0x98] + addsc $gp, $sp, $fp # CHECK: addsc $gp, $sp, $fp # encoding: [0x7f,0xbe,0xe4,0x10] + addu.ph $ra, $1, $2 # CHECK: addu.ph $ra, $1, $2 # encoding: [0x7c,0x22,0xfa,0x10] + addu_s.ph $3, $4, $5 # CHECK: addu_s.ph $3, $4, $5 # encoding: [0x7c,0x85,0x1b,0x10] + addu.qb $6, $7, $8 # CHECK: addu.qb $6, $7, $8 # encoding: [0x7c,0xe8,0x30,0x10] + addu_s.qb $9, $10, $11 # CHECK: addu_s.qb $9, $10, $11 # encoding: [0x7d,0x4b,0x49,0x10] + addwc $12, $13, $14 # CHECK: addwc $12, $13, $14 # encoding: [0x7d,0xae,0x64,0x50] + adduh.qb $15, $16, $17 # CHECK: adduh.qb $15, $16, $17 # encoding: [0x7e,0x11,0x78,0x18] + adduh_r.qb $18, $19, $20 # CHECK: adduh_r.qb $18, $19, $20 # encoding: [0x7e,0x74,0x90,0x98] + append $21, $22, 0 # CHECK: append $21, $22, 0 # encoding: [0x7e,0xd5,0x00,0x31] + balign $23, $24, 3 # CHECK: balign $23, $24, 3 # encoding: [0x7f,0x17,0x1c,0x31] + bitrev $25, $26 # CHECK: bitrev $25, $26 # encoding: [0x7c,0x1a,0xce,0xd2] + bposge32 21100 # CHECK: bposge32 21100 # encoding: [0x04,0x1c,0x14,0x9b] + cmp.eq.ph $27, $gp # CHECK: cmp.eq.ph $27, $gp # encoding: [0x7f,0x7c,0x02,0x11] + cmp.lt.ph $sp, $fp # 
CHECK: cmp.lt.ph $sp, $fp # encoding: [0x7f,0xbe,0x02,0x51] + cmp.le.ph $ra, $1 # CHECK: cmp.le.ph $ra, $1 # encoding: [0x7f,0xe1,0x02,0x91] + cmpgdu.eq.qb $2, $3, $4 # CHECK: cmpgdu.eq.qb $2, $3, $4 # encoding: [0x7c,0x64,0x16,0x11] + cmpgdu.lt.qb $5, $6, $7 # CHECK: cmpgdu.lt.qb $5, $6, $7 # encoding: [0x7c,0xc7,0x2e,0x51] + cmpgdu.le.qb $8, $9, $10 # CHECK: cmpgdu.le.qb $8, $9, $10 # encoding: [0x7d,0x2a,0x46,0x91] + cmpgu.eq.qb $11, $12, $13 # CHECK: cmpgu.eq.qb $11, $12, $13 # encoding: [0x7d,0x8d,0x59,0x11] + cmpgu.lt.qb $14, $15, $16 # CHECK: cmpgu.lt.qb $14, $15, $16 # encoding: [0x7d,0xf0,0x71,0x51] + cmpgu.le.qb $17, $18, $19 # CHECK: cmpgu.le.qb $17, $18, $19 # encoding: [0x7e,0x53,0x89,0x91] + cmpu.eq.qb $20, $21 # CHECK: cmpu.eq.qb $20, $21 # encoding: [0x7e,0x95,0x00,0x11] + cmpu.lt.qb $22, $23 # CHECK: cmpu.lt.qb $22, $23 # encoding: [0x7e,0xd7,0x00,0x51] + cmpu.le.qb $24, $25 # CHECK: cmpu.le.qb $24, $25 # encoding: [0x7f,0x19,0x00,0x91] + dpa.w.ph $ac0, $26, $27 # CHECK: dpa.w.ph $ac0, $26, $27 # encoding: [0x7f,0x5b,0x00,0x30] + dpaq_s.w.ph $ac1, $1, $2 # CHECK: dpaq_s.w.ph $ac1, $1, $2 # encoding: [0x7c,0x22,0x09,0x30] + dpaq_sa.l.w $ac2, $3, $4 # CHECK: dpaq_sa.l.w $ac2, $3, $4 # encoding: [0x7c,0x64,0x13,0x30] + dpaqx_s.w.ph $ac3, $5, $6 # CHECK: dpaqx_s.w.ph $ac3, $5, $6 # encoding: [0x7c,0xa6,0x1e,0x30] + dpaqx_sa.w.ph $ac0, $7, $8 # CHECK: dpaqx_sa.w.ph $ac0, $7, $8 # encoding: [0x7c,0xe8,0x06,0xb0] + dpau.h.qbl $ac1, $9, $10 # CHECK: dpau.h.qbl $ac1, $9, $10 # encoding: [0x7d,0x2a,0x08,0xf0] + dpau.h.qbr $ac1, $11, $12 # CHECK: dpau.h.qbr $ac1, $11, $12 # encoding: [0x7d,0x6c,0x09,0xf0] + dpax.w.ph $ac2, $13, $14 # CHECK: dpax.w.ph $ac2, $13, $14 # encoding: [0x7d,0xae,0x12,0x30] + dps.w.ph $ac3, $15, $16 # CHECK: dps.w.ph $ac3, $15, $16 # encoding: [0x7d,0xf0,0x18,0x70] + dpsq_s.w.ph $ac0, $17, $18 # CHECK: dpsq_s.w.ph $ac0, $17, $18 # encoding: [0x7e,0x32,0x01,0x70] + dpsq_sa.l.w $ac1, $19, $20 # CHECK: dpsq_sa.l.w $ac1, $19, $20 # encoding: [0x7e,0x74,0x0b,0x70] + dpsqx_s.w.ph $ac2, $1, $2 # CHECK: dpsqx_s.w.ph $ac2, $1, $2 # encoding: [0x7c,0x22,0x16,0x70] + dpsqx_sa.w.ph $ac3, $3, $4 # CHECK: dpsqx_sa.w.ph $ac3, $3, $4 # encoding: [0x7c,0x64,0x1e,0xf0] + dpsu.h.qbl $ac0, $5, $6 # CHECK: dpsu.h.qbl $ac0, $5, $6 # encoding: [0x7c,0xa6,0x02,0xf0] + dpsu.h.qbr $ac1, $7, $8 # CHECK: dpsu.h.qbr $ac1, $7, $8 # encoding: [0x7c,0xe8,0x0b,0xf0] + dpsx.w.ph $ac2, $9, $10 # CHECK: dpsx.w.ph $ac2, $9, $10 # encoding: [0x7d,0x2a,0x12,0x70] + extp $1, $ac0, 31 # CHECK: extp $1, $ac0, 31 # encoding: [0x7f,0xe1,0x00,0xb8] + extpdp $2, $ac1, 0 # CHECK: extpdp $2, $ac1, 0 # encoding: [0x7c,0x02,0x0a,0xb8] + extpdpv $3, $ac2, $4 # CHECK: extpdpv $3, $ac2, $4 # encoding: [0x7c,0x83,0x12,0xf8] + extpv $5, $ac3, $6 # CHECK: extpv $5, $ac3, $6 # encoding: [0x7c,0xc5,0x18,0xf8] + extr.w $7, $ac0, 31 # CHECK: extr.w $7, $ac0, 31 # encoding: [0x7f,0xe7,0x00,0x38] + extr_r.w $8, $ac1, 15 # CHECK: extr_r.w $8, $ac1, 15 # encoding: [0x7d,0xe8,0x09,0x38] + extr_rs.w $9, $ac2, 7 # CHECK: extr_rs.w $9, $ac2, 7 # encoding: [0x7c,0xe9,0x11,0xb8] + extr_s.h $10, $ac3, 3 # CHECK: extr_s.h $10, $ac3, 3 # encoding: [0x7c,0x6a,0x1b,0xb8] + extrv.w $11, $ac0, $12 # CHECK: extrv.w $11, $ac0, $12 # encoding: [0x7d,0x8b,0x00,0x78] + extrv_r.w $13, $ac1, $14 # CHECK: extrv_r.w $13, $ac1, $14 # encoding: [0x7d,0xcd,0x09,0x78] + extrv_rs.w $15, $ac2, $16 # CHECK: extrv_rs.w $15, $ac2, $16 # encoding: [0x7e,0x0f,0x11,0xf8] + extrv_s.h $17, $ac3, $18 # CHECK: extrv_s.h $17, $ac3, $18 # encoding: 
[0x7e,0x51,0x1b,0xf8] + insv $19, $20 # CHECK: insv $19, $20 # encoding: [0x7e,0x93,0x00,0x0c] + lbux $10, $20($26) # CHECK: lbux $10, $20($26) # encoding: [0x7f,0x54,0x51,0x8a] + lhx $11, $21($27) # CHECK: lhx $11, $21($27) # encoding: [0x7f,0x75,0x59,0x0a] + lwx $12, $22($gp) # CHECK: lwx $12, $22($gp) # encoding: [0x7f,0x96,0x60,0x0a] + madd $ac1, $6, $7 # CHECK: madd $ac1, $6, $7 # encoding: [0x70,0xc7,0x08,0x00] + maddu $ac0, $8, $9 # CHECK: maddu $ac0, $8, $9 # encoding: [0x71,0x09,0x00,0x01] + madd $6, $7 # CHECK: madd $6, $7 # encoding: [0x70,0xc7,0x00,0x00] + maddu $8, $9 # CHECK: maddu $8, $9 # encoding: [0x71,0x09,0x00,0x01] + maq_s.w.phl $ac2, $3, $4 # CHECK: maq_s.w.phl $ac2, $3, $4 # encoding: [0x7c,0x64,0x15,0x30] + maq_sa.w.phl $ac3, $5, $6 # CHECK: maq_sa.w.phl $ac3, $5, $6 # encoding: [0x7c,0xa6,0x1c,0x30] + maq_s.w.phr $ac0, $7, $8 # CHECK: maq_s.w.phr $ac0, $7, $8 # encoding: [0x7c,0xe8,0x05,0xb0] + maq_sa.w.phr $ac1, $9, $10 # CHECK: maq_sa.w.phr $ac1, $9, $10 # encoding: [0x7d,0x2a,0x0c,0xb0] + mfhi $14, $ac1 # CHECK: mfhi $14, $ac1 # encoding: [0x00,0x20,0x70,0x10] + mflo $15, $ac0 # CHECK: mflo $15, $ac0 # encoding: [0x00,0x00,0x78,0x12] + mfhi $14 # CHECK: mfhi $14 # encoding: [0x00,0x00,0x70,0x10] + mflo $15 # CHECK: mflo $15 # encoding: [0x00,0x00,0x78,0x12] + modsub $11, $12, $13 # CHECK: modsub $11, $12, $13 # encoding: [0x7d,0x8d,0x5c,0x90] + msub $ac3, $10, $11 # CHECK: msub $ac3, $10, $11 # encoding: [0x71,0x4b,0x18,0x04] + msubu $ac2, $12, $13 # CHECK: msubu $ac2, $12, $13 # encoding: [0x71,0x8d,0x10,0x05] + msub $10, $11 # CHECK: msub $10, $11 # encoding: [0x71,0x4b,0x00,0x04] + msubu $12, $13 # CHECK: msubu $12, $13 # encoding: [0x71,0x8d,0x00,0x05] + mthi $16, $ac3 # CHECK: mthi $16, $ac3 # encoding: [0x02,0x00,0x18,0x11] + mthi $16 # CHECK: mthi $16 # encoding: [0x02,0x00,0x00,0x11] + mthlip $14, $ac2 # CHECK: mthlip $14, $ac2 # encoding: [0x7d,0xc0,0x17,0xf8] + mtlo $17, $ac2 # CHECK: mtlo $17, $ac2 # encoding: [0x02,0x20,0x10,0x13] + mtlo $17 # CHECK: mtlo $17 # encoding: [0x02,0x20,0x00,0x13] + mul.ph $15, $16, $17 # CHECK: mul.ph $15, $16, $17 # encoding: [0x7e,0x11,0x7b,0x18] + mul_s.ph $18, $19, $20 # CHECK: mul_s.ph $18, $19, $20 # encoding: [0x7e,0x74,0x93,0x98] + muleq_s.w.phl $21, $22, $23 # CHECK: muleq_s.w.phl $21, $22, $23 # encoding: [0x7e,0xd7,0xaf,0x10] + muleq_s.w.phr $24, $25, $26 # CHECK: muleq_s.w.phr $24, $25, $26 # encoding: [0x7f,0x3a,0xc7,0x50] + muleu_s.ph.qbl $27, $gp, $sp # CHECK: muleu_s.ph.qbl $27, $gp, $sp # encoding: [0x7f,0x9d,0xd9,0x90] + muleu_s.ph.qbr $fp, $ra, $1 # CHECK: muleu_s.ph.qbr $fp, $ra, $1 # encoding: [0x7f,0xe1,0xf1,0xd0] + mulq_rs.ph $2, $3, $4 # CHECK: mulq_rs.ph $2, $3, $4 # encoding: [0x7c,0x64,0x17,0xd0] + mulq_rs.w $5, $6, $7 # CHECK: mulq_rs.w $5, $6, $7 # encoding: [0x7c,0xc7,0x2d,0xd8] + mulq_s.ph $8, $9, $10 # CHECK: mulq_s.ph $8, $9, $10 # encoding: [0x7d,0x2a,0x47,0x90] + mulq_s.w $11, $12, $13 # CHECK: mulq_s.w $11, $12, $13 # encoding: [0x7d,0x8d,0x5d,0x98] + mulsa.w.ph $ac3, $14, $15 # CHECK: mulsa.w.ph $ac3, $14, $15 # encoding: [0x7d,0xcf,0x18,0xb0] + mulsaq_s.w.ph $ac0, $16, $17 # CHECK: mulsaq_s.w.ph $ac0, $16, $17 # encoding: [0x7e,0x11,0x01,0xb0] + mult $ac3, $2, $3 # CHECK: mult $ac3, $2, $3 # encoding: [0x00,0x43,0x18,0x18] + multu $ac2, $4, $5 # CHECK: multu $ac2, $4, $5 # encoding: [0x00,0x85,0x10,0x19] + mult $2, $3 # CHECK: mult $2, $3 # encoding: [0x00,0x43,0x00,0x18] + multu $4, $5 # CHECK: multu $4, $5 # encoding: [0x00,0x85,0x00,0x19] + packrl.ph $18, $19, $20 # CHECK: 
packrl.ph $18, $19, $20 # encoding: [0x7e,0x74,0x93,0x91] + pick.ph $7, $15, $3 # CHECK: pick.ph $7, $15, $3 # encoding: [0x7d,0xe3,0x3a,0xd1] + pick.qb $2, $4, $8 # CHECK: pick.qb $2, $4, $8 # encoding: [0x7c,0x88,0x10,0xd1] + preceq.w.phl $20,$21 # CHECK: preceq.w.phl $20, $21 # encoding: [0x7c,0x15,0xa3,0x12] + preceq.w.phr $21,$22 # CHECK: preceq.w.phr $21, $22 # encoding: [0x7c,0x16,0xab,0x52] + precequ.ph.qbl $22,$23 # CHECK: precequ.ph.qbl $22, $23 # encoding: [0x7c,0x17,0xb1,0x12] + precequ.ph.qbla $24,$25 # CHECK: precequ.ph.qbla $24, $25 # encoding: [0x7c,0x19,0xc1,0x92] + precequ.ph.qbr $23,$24 # CHECK: precequ.ph.qbr $23, $24 # encoding: [0x7c,0x18,0xb9,0x52] + precequ.ph.qbra $25,$26 # CHECK: precequ.ph.qbra $25, $26 # encoding: [0x7c,0x1a,0xc9,0xd2] + preceu.ph.qbl $26,$27 # CHECK: preceu.ph.qbl $26, $27 # encoding: [0x7c,0x1b,0xd7,0x12] + preceu.ph.qbla $28,$29 # CHECK: preceu.ph.qbla $gp, $sp # encoding: [0x7c,0x1d,0xe7,0x92] + preceu.ph.qbr $27,$28 # CHECK: preceu.ph.qbr $27, $gp # encoding: [0x7c,0x1c,0xdf,0x52] + preceu.ph.qbra $29,$30 # CHECK: preceu.ph.qbra $sp, $fp # encoding: [0x7c,0x1e,0xef,0xd2] + precr.qb.ph $23,$24,$25 # CHECK: precr.qb.ph $23, $24, $25 # encoding: [0x7f,0x19,0xbb,0x51] + precr_sra.ph.w $24,$25,0 # CHECK: precr_sra.ph.w $24, $25, 0 # encoding: [0x7f,0x38,0x07,0x91] + precr_sra.ph.w $24,$25,31 # CHECK: precr_sra.ph.w $24, $25, 31 # encoding: [0x7f,0x38,0xff,0x91] + precr_sra_r.ph.w $25,$26,0 # CHECK: precr_sra_r.ph.w $25, $26, 0 # encoding: [0x7f,0x59,0x07,0xd1] + precr_sra_r.ph.w $25,$26,31 # CHECK: precr_sra_r.ph.w $25, $26, 31 # encoding: [0x7f,0x59,0xff,0xd1] + precrq.ph.w $17,$18,$19 # CHECK: precrq.ph.w $17, $18, $19 # encoding: [0x7e,0x53,0x8d,0x11] + precrq.qb.ph $16,$17,$18 # CHECK: precrq.qb.ph $16, $17, $18 # encoding: [0x7e,0x32,0x83,0x11] + precrqu_s.qb.ph $19,$20,$21 # CHECK: precrqu_s.qb.ph $19, $20, $21 # encoding: [0x7e,0x95,0x9b,0xd1] + precrq_rs.ph.w $18,$19,$20 # CHECK: precrq_rs.ph.w $18, $19, $20 # encoding: [0x7e,0x74,0x95,0x51] + prepend $1, $2, 3 # CHECK: prepend $1, $2, 3 # encoding: [0x7c,0x41,0x18,0x71] + raddu.w.qb $1, $2 # CHECK: raddu.w.qb $1, $2 # encoding: [0x7c,0x40,0x0d,0x10] + rddsp $5, 256 # CHECK: rddsp $5, 256 # encoding: [0x7d,0x00,0x2c,0xb8] + repl.ph $2, 12 # CHECK: repl.ph $2, 12 # encoding: [0x7c,0x0c,0x12,0x92] + repl.qb $1, 85 # CHECK: repl.qb $1, 85 # encoding: [0x7c,0x55,0x08,0x92] + replv.ph $1, $2 # CHECK: replv.ph $1, $2 # encoding: [0x7c,0x02,0x0a,0xd2] + replv.qb $1, $2 # CHECK: replv.qb $1, $2 # encoding: [0x7c,0x02,0x08,0xd2] + shilo $ac1, 16 # CHECK: shilo $ac1, 16 # encoding: [0x7d,0x00,0x0e,0xb8] + shilov $ac1, $2 # CHECK: shilov $ac1, $2 # encoding: [0x7c,0x40,0x0e,0xf8] + shll.ph $1, $2, 3 # CHECK: shll.ph $1, $2, 3 # encoding: [0x7c,0x62,0x0a,0x13] + shll_s.ph $1, $2, 3 # CHECK: shll_s.ph $1, $2, 3 # encoding: [0x7c,0x62,0x0b,0x13] + shll.qb $1, $2, 3 # CHECK: shll.qb $1, $2, 3 # encoding: [0x7c,0x62,0x08,0x13] + shllv.ph $1, $2, $3 # CHECK: shllv.ph $1, $2, $3 # encoding: [0x7c,0x62,0x0a,0x93] + shllv_s.ph $1, $2, $3 # CHECK: shllv_s.ph $1, $2, $3 # encoding: [0x7c,0x62,0x0b,0x93] + shllv.qb $1, $2, $3 # CHECK: shllv.qb $1, $2, $3 # encoding: [0x7c,0x62,0x08,0x93] + shllv_s.w $1, $2, $3 # CHECK: shllv_s.w $1, $2, $3 # encoding: [0x7c,0x62,0x0d,0x93] + shll_s.w $1, $2, 3 # CHECK: shll_s.w $1, $2, 3 # encoding: [0x7c,0x62,0x0d,0x13] + shra.qb $2, $16, 2 # CHECK: shra.qb $2, $16, 2 # encoding: [0x7c,0x50,0x11,0x13] + shra_r.qb $2, $16, 2 # CHECK: shra_r.qb $2, $16, 2 # encoding: 
[0x7c,0x50,0x11,0x53]
+  shra.ph $5, $2, 1 # CHECK: shra.ph $5, $2, 1 # encoding: [0x7c,0x22,0x2a,0x53]
+  shra_r.ph $5, $2, 1 # CHECK: shra_r.ph $5, $2, 1 # encoding: [0x7c,0x22,0x2b,0x53]
+  shrav.ph $1, $2, $3 # CHECK: shrav.ph $1, $2, $3 # encoding: [0x7c,0x62,0x0a,0xd3]
+  shrav_r.ph $1, $2, $3 # CHECK: shrav_r.ph $1, $2, $3 # encoding: [0x7c,0x62,0x0b,0xd3]
+  shrav.qb $1, $2, $3 # CHECK: shrav.qb $1, $2, $3 # encoding: [0x7c,0x62,0x09,0x93]
+  shrav_r.qb $1, $2, $3 # CHECK: shrav_r.qb $1, $2, $3 # encoding: [0x7c,0x62,0x09,0xd3]
+  shrav_r.w $1, $2, $3 # CHECK: shrav_r.w $1, $2, $3 # encoding: [0x7c,0x62,0x0d,0xd3]
+  shra_r.w $1, $2, 1 # CHECK: shra_r.w $1, $2, 1 # encoding: [0x7c,0x22,0x0d,0x53]
+  shrl.ph $1, $2, 2 # CHECK: shrl.ph $1, $2, 2 # encoding: [0x7c,0x42,0x0e,0x53]
+  shrl.qb $1, $2, 2 # CHECK: shrl.qb $1, $2, 2 # encoding: [0x7c,0x42,0x08,0x53]
+  shrlv.ph $1, $2, $3 # CHECK: shrlv.ph $1, $2, $3 # encoding: [0x7c,0x62,0x0e,0xd3]
+  shrlv.qb $1, $2, $3 # CHECK: shrlv.qb $1, $2, $3 # encoding: [0x7c,0x62,0x08,0xd3]
+  subq.ph $1, $2, $3 # CHECK: subq.ph $1, $2, $3 # encoding: [0x7c,0x43,0x0a,0xd0]
+  subq_s.ph $1, $2, $3 # CHECK: subq_s.ph $1, $2, $3 # encoding: [0x7c,0x43,0x0b,0xd0]
+  subq_s.w $1, $2, $3 # CHECK: subq_s.w $1, $2, $3 # encoding: [0x7c,0x43,0x0d,0xd0]
+  subqh.ph $1, $2, $3 # CHECK: subqh.ph $1, $2, $3 # encoding: [0x7c,0x43,0x0a,0x58]
+  subqh_r.ph $1, $2, $3 # CHECK: subqh_r.ph $1, $2, $3 # encoding: [0x7c,0x43,0x0a,0xd8]
+  subqh.w $1, $2, $3 # CHECK: subqh.w $1, $2, $3 # encoding: [0x7c,0x43,0x0c,0x58]
+  subqh_r.w $1, $2, $3 # CHECK: subqh_r.w $1, $2, $3 # encoding: [0x7c,0x43,0x0c,0xd8]
+  subu.ph $6, $2, $9 # CHECK: subu.ph $6, $2, $9 # encoding: [0x7c,0x49,0x32,0x50]
+  subu_s.ph $2, $3, $4 # CHECK: subu_s.ph $2, $3, $4 # encoding: [0x7c,0x64,0x13,0x50]
+  subu.qb $1, $2, $3 # CHECK: subu.qb $1, $2, $3 # encoding: [0x7c,0x43,0x08,0x50]
+  subu_s.qb $1, $2, $3 # CHECK: subu_s.qb $1, $2, $3 # encoding: [0x7c,0x43,0x09,0x50]
+  subuh.qb $1, $2, $3 # CHECK: subuh.qb $1, $2, $3 # encoding: [0x7c,0x43,0x08,0x58]
+  subuh_r.qb $1, $2, $3 # CHECK: subuh_r.qb $1, $2, $3 # encoding: [0x7c,0x43,0x08,0xd8]
+  wrdsp $1, 0 # CHECK: wrdsp $1, 0 # encoding: [0x7c,0x20,0x04,0xf8]

From 3a64f1988f06b903d52d74eebceb77ff31581956 Mon Sep 17 00:00:00 2001
From: Marina Yatsina
Date: Thu, 3 Dec 2015 12:17:03 +0000
Subject: [PATCH 003/364] [X86] MS inline asm: produce error when encountering
 "<size> ptr <reg>"

Currently "<size> ptr <reg>" is treated as "<reg>" in MS inline asm, ignoring
the "<size> ptr" completely and possibly ignoring the intention of the user.
Fixed llvm to produce an error when encountering "<size> ptr <reg>" operands.
For example: andpd xmm1,xmmword ptr xmm1 --> andpd xmm1, xmm1
though andpd has 2 possible matching formats - andpd xmm, xmm/m128

Patch by: ziv.izhar@intel.com
Differential Revision: http://reviews.llvm.org/D14607

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254607 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/AsmParser/X86AsmParser.cpp | 13 +++++++++++--
 test/MC/X86/intel-syntax-ambiguous.s | 12 ++++++++++++
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 7089c1f7592b..d53ab71f3d5a 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -1693,12 +1693,14 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
     return ParseIntelOperator(IOK_TYPE);
   }

+  bool PtrInOperand = false;
   unsigned Size = getIntelMemOperandSize(Tok.getString());
   if (Size) {
     Parser.Lex(); // Eat operand size (e.g., byte, word).
     if (Tok.getString() != "PTR" && Tok.getString() != "ptr")
       return ErrorOperand(Tok.getLoc(), "Expected 'PTR' or 'ptr' token!");
     Parser.Lex(); // Eat ptr.
+    PtrInOperand = true;
   }
   Start = Tok.getLoc();

@@ -1754,9 +1756,16 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
   if (!ParseRegister(RegNo, Start, End)) {
     // If this is a segment register followed by a ':', then this is the start
     // of a segment override, otherwise this is a normal register reference.
-    if (getLexer().isNot(AsmToken::Colon))
+    // In case it is a normal register and there is ptr in the operand this
+    // is an error
+    if (getLexer().isNot(AsmToken::Colon)){
+      if (PtrInOperand){
+        return ErrorOperand(Start, "expected memory operand after "
+                            "'ptr', found register operand instead");
+      }
       return X86Operand::CreateReg(RegNo, Start, End);
-
+    }
+
     return ParseIntelSegmentOverride(/*SegReg=*/RegNo, Start, Size);
   }

diff --git a/test/MC/X86/intel-syntax-ambiguous.s b/test/MC/X86/intel-syntax-ambiguous.s
index fe1fe5023902..e90cca820043 100644
--- a/test/MC/X86/intel-syntax-ambiguous.s
+++ b/test/MC/X86/intel-syntax-ambiguous.s
@@ -45,3 +45,15 @@ add rax, 3
 fadd "?half@?0??bar@@YAXXZ@4NA"
 // CHECK: error: ambiguous operand size for instruction 'fadd'
+
+// Instruction line with PTR inside check that they don't accept register as memory.
+
+// CHECK: error: expected memory operand after 'ptr', found register operand instead
+// CHECK: andps xmm1, xmmword ptr xmm1
+andps xmm1, xmmword ptr xmm1
+// CHECK: error: expected memory operand after 'ptr', found register operand instead
+// CHECK: andps xmmword ptr xmm1, xmm1
+andps xmmword ptr xmm1, xmm1
+// CHECK: error: expected memory operand after 'ptr', found register operand instead
+// CHECK: mov dword ptr eax, ebx
+mov dword ptr eax, ebx

From 927e4ae257f323569b8f8ecbc06e7d8cc198ea51 Mon Sep 17 00:00:00 2001
From: Rafael Espindola
Date: Thu, 3 Dec 2015 14:35:15 +0000
Subject: [PATCH 004/364] Delete dead code.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254609 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Linker/Linker.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/llvm/Linker/Linker.h b/include/llvm/Linker/Linker.h
index 0c7dc910a65c..f9890935126e 100644
--- a/include/llvm/Linker/Linker.h
+++ b/include/llvm/Linker/Linker.h
@@ -71,8 +71,6 @@ class Linker {
   Linker(Module &M, DiagnosticHandlerFunction DiagnosticHandler);
   Linker(Module &M);

-  Module &getModule() const { return Composite; }
-
   /// \brief Link \p Src into the composite. The source is destroyed.
/// /// Passing OverrideSymbols as true will have symbols from Src From 90ebc9e32048eec171225cd26e72001a2b973035 Mon Sep 17 00:00:00 2001 From: Rafael Espindola Date: Thu, 3 Dec 2015 14:48:20 +0000 Subject: [PATCH 005/364] Don't pass member variables to member functions. NFC. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254610 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Linker/LinkModules.cpp | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp index 67613967f490..a6a26be6a44f 100644 --- a/lib/Linker/LinkModules.cpp +++ b/lib/Linker/LinkModules.cpp @@ -451,7 +451,7 @@ class ModuleLinker { /// Handles cloning of a global values from the source module into /// the destination module, including setting the attributes and visibility. - GlobalValue *copyGlobalValueProto(TypeMapTy &TypeMap, const GlobalValue *SGV, + GlobalValue *copyGlobalValueProto(const GlobalValue *SGV, const GlobalValue *DGV, bool ForDefinition); /// Check if we should promote the given local value to global scope. @@ -526,10 +526,9 @@ class ModuleLinker { /// Functions that take care of cloning a specific global value type /// into the destination module. - GlobalVariable *copyGlobalVariableProto(TypeMapTy &TypeMap, - const GlobalVariable *SGVar); - Function *copyFunctionProto(TypeMapTy &TypeMap, const Function *SF); - GlobalValue *copyGlobalAliasProto(TypeMapTy &TypeMap, const GlobalAlias *SGA); + GlobalVariable *copyGlobalVariableProto(const GlobalVariable *SGVar); + Function *copyFunctionProto(const Function *SF); + GlobalValue *copyGlobalAliasProto(const GlobalAlias *SGA); /// Helper methods to check if we are importing from or potentially /// exporting from the current source module. @@ -762,8 +761,7 @@ GlobalValue::LinkageTypes ModuleLinker::getLinkage(const GlobalValue *SGV) { /// Loop through the global variables in the src module and merge them into the /// dest module. GlobalVariable * -ModuleLinker::copyGlobalVariableProto(TypeMapTy &TypeMap, - const GlobalVariable *SGVar) { +ModuleLinker::copyGlobalVariableProto(const GlobalVariable *SGVar) { // No linking to be performed or linking from the source: simply create an // identical version of the symbol over in the dest module... the // initializer will be filled in later by LinkGlobalInits. @@ -779,8 +777,7 @@ ModuleLinker::copyGlobalVariableProto(TypeMapTy &TypeMap, /// Link the function in the source module into the destination module if /// needed, setting up mapping information. -Function *ModuleLinker::copyFunctionProto(TypeMapTy &TypeMap, - const Function *SF) { +Function *ModuleLinker::copyFunctionProto(const Function *SF) { // If there is no linkage to be performed or we are linking from the source, // bring SF over. return Function::Create(TypeMap.get(SF->getFunctionType()), @@ -788,8 +785,7 @@ Function *ModuleLinker::copyFunctionProto(TypeMapTy &TypeMap, } /// Set up prototypes for any aliases that come over from the source module. -GlobalValue *ModuleLinker::copyGlobalAliasProto(TypeMapTy &TypeMap, - const GlobalAlias *SGA) { +GlobalValue *ModuleLinker::copyGlobalAliasProto(const GlobalAlias *SGA) { // If there is no linkage to be performed or we're linking from the source, // bring over SGA. 
  auto *Ty = TypeMap.get(SGA->getValueType());
@@ -820,18 +816,17 @@ void ModuleLinker::setVisibility(GlobalValue *NewGV, const GlobalValue *SGV,
   NewGV->setVisibility(Visibility);
 }

-GlobalValue *ModuleLinker::copyGlobalValueProto(TypeMapTy &TypeMap,
-                                                const GlobalValue *SGV,
+GlobalValue *ModuleLinker::copyGlobalValueProto(const GlobalValue *SGV,
                                                 const GlobalValue *DGV,
                                                 bool ForDefinition) {
   GlobalValue *NewGV;
   if (auto *SGVar = dyn_cast<GlobalVariable>(SGV)) {
-    NewGV = copyGlobalVariableProto(TypeMap, SGVar);
+    NewGV = copyGlobalVariableProto(SGVar);
   } else if (auto *SF = dyn_cast<Function>(SGV)) {
-    NewGV = copyFunctionProto(TypeMap, SF);
+    NewGV = copyFunctionProto(SF);
   } else {
     if (ForDefinition)
-      NewGV = copyGlobalAliasProto(TypeMap, cast<GlobalAlias>(SGV));
+      NewGV = copyGlobalAliasProto(cast<GlobalAlias>(SGV));
     else
       NewGV = new GlobalVariable(
           DstM, TypeMap.get(SGV->getType()->getElementType()),
@@ -1418,7 +1413,7 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) {
     if (DoneLinkingBodies)
       return false;

-    NewGV = copyGlobalValueProto(TypeMap, SGV, DGV, LinkFromSrc);
+    NewGV = copyGlobalValueProto(SGV, DGV, LinkFromSrc);
   }

   NewGV->setUnnamedAddr(HasUnnamedAddr);

From 695e5cca23d23b10b128ecccfa4399a7f5b306d7 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek
Date: Thu, 3 Dec 2015 15:41:33 +0000
Subject: [PATCH 006/364] [Hexagon] Fix instruction descriptor flags for
 memory access size

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254613 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
index af67481e4f47..47a6f8636276 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
@@ -90,12 +90,16 @@ namespace HexagonII {
     PostInc = 6   // Post increment addressing mode
   };

+  // MemAccessSize is represented as 1+log2(N) where N is size in bits.
   enum class MemAccessSize {
     NoMemAccess = 0,       // Not a memory acces instruction.
     ByteAccess = 1,        // Byte access instruction (memb).
     HalfWordAccess = 2,    // Half word access instruction (memh).
     WordAccess = 3,        // Word access instruction (memw).
-    DoubleWordAccess = 4   // Double word access instruction (memd)
+    DoubleWordAccess = 4,  // Double word access instruction (memd)
+                  // 5,    // We do not have a 16 byte vector access.
+    Vector64Access = 7,    // 64 Byte vector access instruction (vmem).
+    Vector128Access = 8    // 128 Byte vector access instruction (vmem).
   };

   // MCInstrDesc TSFlags
@@ -175,7 +179,7 @@ namespace HexagonII {
     AddrModeMask = 0x7,
     // Access size for load/store instructions.
     MemAccessSizePos = 43,
-    MemAccesSizeMask = 0x7,
+    MemAccesSizeMask = 0xf,

     // Branch predicted taken.
     TakenPos = 47,

From 7fc4331080c9daadec246a43dad5d7095d81ecd9 Mon Sep 17 00:00:00 2001
From: Rafael Espindola
Date: Thu, 3 Dec 2015 16:36:16 +0000
Subject: [PATCH 007/364] Simplify ValueMap handling.

We now just return values and let ValueMap handle the map.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254615 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Linker/LinkModules.cpp | 91 ++++++++++++++++++++------------------ 1 file changed, 49 insertions(+), 42 deletions(-) diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp index a6a26be6a44f..4bc0ad039ac7 100644 --- a/lib/Linker/LinkModules.cpp +++ b/lib/Linker/LinkModules.cpp @@ -513,10 +513,10 @@ class ModuleLinker { void upgradeMismatchedGlobals(); bool linkIfNeeded(GlobalValue &GV); - bool linkAppendingVarProto(GlobalVariable *DstGV, - const GlobalVariable *SrcGV); + Constant *linkAppendingVarProto(GlobalVariable *DstGV, + const GlobalVariable *SrcGV); - bool linkGlobalValueProto(GlobalValue *GV); + Constant *linkGlobalValueProto(GlobalValue *GV); bool linkModuleFlagsMetadata(); void linkGlobalInit(GlobalVariable &Dst, GlobalVariable &Src); @@ -856,8 +856,7 @@ Value *ModuleLinker::materializeDeclFor(Value *V) { if (!SGV) return nullptr; - linkGlobalValueProto(SGV); - return ValueMap[SGV]; + return linkGlobalValueProto(SGV); } void ValueMaterializerTy::materializeInitFor(GlobalValue *New, @@ -1277,8 +1276,8 @@ static void getArrayElements(const Constant *C, /// If there were any appending global variables, link them together now. /// Return true on error. -bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV, - const GlobalVariable *SrcGV) { +Constant *ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV, + const GlobalVariable *SrcGV) { ArrayType *SrcTy = cast(TypeMap.get(SrcGV->getType()->getElementType())); Type *EltTy = SrcTy->getElementType(); @@ -1286,32 +1285,46 @@ bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV, if (DstGV) { ArrayType *DstTy = cast(DstGV->getType()->getElementType()); - if (!SrcGV->hasAppendingLinkage() || !DstGV->hasAppendingLinkage()) - return emitError( + if (!SrcGV->hasAppendingLinkage() || !DstGV->hasAppendingLinkage()) { + emitError( "Linking globals named '" + SrcGV->getName() + "': can only link appending global with another appending global!"); + return nullptr; + } // Check to see that they two arrays agree on type. 
- if (EltTy != DstTy->getElementType()) - return emitError("Appending variables with different element types!"); - if (DstGV->isConstant() != SrcGV->isConstant()) - return emitError("Appending variables linked with different const'ness!"); + if (EltTy != DstTy->getElementType()) { + emitError("Appending variables with different element types!"); + return nullptr; + } + if (DstGV->isConstant() != SrcGV->isConstant()) { + emitError("Appending variables linked with different const'ness!"); + return nullptr; + } - if (DstGV->getAlignment() != SrcGV->getAlignment()) - return emitError( + if (DstGV->getAlignment() != SrcGV->getAlignment()) { + emitError( "Appending variables with different alignment need to be linked!"); + return nullptr; + } - if (DstGV->getVisibility() != SrcGV->getVisibility()) - return emitError( + if (DstGV->getVisibility() != SrcGV->getVisibility()) { + emitError( "Appending variables with different visibility need to be linked!"); + return nullptr; + } - if (DstGV->hasUnnamedAddr() != SrcGV->hasUnnamedAddr()) - return emitError( + if (DstGV->hasUnnamedAddr() != SrcGV->hasUnnamedAddr()) { + emitError( "Appending variables with different unnamed_addr need to be linked!"); + return nullptr; + } - if (StringRef(DstGV->getSection()) != SrcGV->getSection()) - return emitError( + if (StringRef(DstGV->getSection()) != SrcGV->getSection()) { + emitError( "Appending variables with different section name need to be linked!"); + return nullptr; + } } SmallVector DstElements; @@ -1347,9 +1360,10 @@ bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV, // Propagate alignment, visibility and section info. copyGVAttributes(NG, SrcGV); - // Replace any uses of the two global variables with uses of the new - // global. - ValueMap[SrcGV] = ConstantExpr::getBitCast(NG, TypeMap.get(SrcGV->getType())); + Constant *Ret = ConstantExpr::getBitCast(NG, TypeMap.get(SrcGV->getType())); + + // Stop recursion. + ValueMap[SrcGV] = Ret; for (auto *V : SrcElements) { DstElements.push_back( @@ -1358,15 +1372,17 @@ bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV, NG->setInitializer(ConstantArray::get(NewType, DstElements)); + // Replace any uses of the two global variables with uses of the new + // global. if (DstGV) { DstGV->replaceAllUsesWith(ConstantExpr::getBitCast(NG, DstGV->getType())); DstGV->eraseFromParent(); } - return false; + return Ret; } -bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) { +Constant *ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) { GlobalValue *DGV = getLinkedToGlobal(SGV); // Handle the ultra special appending linkage case first. @@ -1390,12 +1406,7 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) { LinkFromSrc = true; } else if (DGV) { if (shouldLinkFromSource(LinkFromSrc, *DGV, *SGV)) - return true; - } - - if (!LinkFromSrc && DGV) { - // Make sure to remember this mapping. - ValueMap[SGV] = ConstantExpr::getBitCast(DGV, TypeMap.get(SGV->getType())); + return nullptr; } if (DGV) @@ -1411,7 +1422,7 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) { // metadata linking), don't link in the global value due to this // reference, simply map it to null. if (DoneLinkingBodies) - return false; + return nullptr; NewGV = copyGlobalValueProto(SGV, DGV, LinkFromSrc); } @@ -1434,16 +1445,12 @@ bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) { NewGVar->setConstant(false); } - // Make sure to remember this mapping. 
- if (NewGV != DGV) { - if (DGV) { - DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewGV, DGV->getType())); - DGV->eraseFromParent(); - } - ValueMap[SGV] = NewGV; + if (NewGV != DGV && DGV) { + DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewGV, DGV->getType())); + DGV->eraseFromParent(); } - return false; + return ConstantExpr::getBitCast(NewGV, TypeMap.get(SGV->getType())); } /// Update the initializers in the Dest module now that all globals that may be @@ -1534,7 +1541,7 @@ bool ModuleLinker::linkGlobalValueBody(GlobalValue &Dst, GlobalValue &Src) { // are linked in. Otherwise, linkonce and other lazy linked GVs will // not be materialized if they aren't referenced. for (auto *SGV : ComdatMembers[SC]) { - auto *DGV = cast_or_null(ValueMap[SGV]); + auto *DGV = cast_or_null(ValueMap.lookup(SGV)); if (DGV && !DGV->isDeclaration()) continue; MapValue(SGV, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer); From 44386caaacbb8f931bd2048da6a00db6beb17169 Mon Sep 17 00:00:00 2001 From: Colin LeMahieu Date: Thu, 3 Dec 2015 16:37:21 +0000 Subject: [PATCH 008/364] [Hexagon] NFC Using canonicalizePacket to compound/duplex/pad packets rather than doing it separately. This also ensures the integrated assembler path matches the assembly parser path. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254616 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/Hexagon/HexagonAsmPrinter.cpp | 28 ++++++++++-------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp index 19769258ee89..8a6d6555d901 100644 --- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp +++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp @@ -191,29 +191,23 @@ void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) { MachineBasicBlock::const_instr_iterator MII = MI->getIterator(); unsigned IgnoreCount = 0; - for (++MII; MII != MBB->instr_end() && MII->isInsideBundle(); ++MII) { + for (++MII; MII != MBB->instr_end() && MII->isInsideBundle(); ++MII) if (MII->getOpcode() == TargetOpcode::DBG_VALUE || MII->getOpcode() == TargetOpcode::IMPLICIT_DEF) ++IgnoreCount; - else { + else HexagonLowerToMC(MCII, &*MII, MCB, *this); - } - } } - else { + else HexagonLowerToMC(MCII, MI, MCB, *this); - HexagonMCInstrInfo::padEndloop(OutStreamer->getContext(), MCB); - } - // Examine the packet and try to find instructions that can be converted - // to compounds. - HexagonMCInstrInfo::tryCompound(MCII, OutStreamer->getContext(), MCB); - // Examine the packet and convert pairs of instructions to duplex - // instructions when possible. 
- SmallVector possibleDuplexes; - possibleDuplexes = HexagonMCInstrInfo::getDuplexPossibilties(MCII, MCB); - HexagonMCShuffle(MCII, *Subtarget, OutStreamer->getContext(), MCB, - possibleDuplexes); - EmitToStreamer(*OutStreamer, MCB); + + bool Ok = HexagonMCInstrInfo::canonicalizePacket( + MCII, *Subtarget, OutStreamer->getContext(), MCB, nullptr); + assert(Ok); + (void)Ok; + if(HexagonMCInstrInfo::bundleSize(MCB) == 0) + return; + OutStreamer->EmitInstruction(MCB, getSubtargetInfo()); } extern "C" void LLVMInitializeHexagonAsmPrinter() { From 55c790e29c334af76790f6ee01df6b0575f668ec Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Thu, 3 Dec 2015 16:47:20 +0000 Subject: [PATCH 009/364] [Hexagon] Implement CONCAT_VECTORS for HVX using V6_vcombine git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254617 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/Hexagon/HexagonISelLowering.cpp | 10 ++++++++++ lib/Target/Hexagon/HexagonISelLowering.h | 1 + lib/Target/Hexagon/HexagonInstrInfoV60.td | 16 +++++++++++++++- 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index 04f5b6649293..a75f391a4eea 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -2042,6 +2042,7 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { case HexagonISD::VCMPWEQ: return "HexagonISD::VCMPWEQ"; case HexagonISD::VCMPWGT: return "HexagonISD::VCMPWGT"; case HexagonISD::VCMPWGTU: return "HexagonISD::VCMPWGTU"; + case HexagonISD::VCOMBINE: return "HexagonISD::VCOMBINE"; case HexagonISD::VSHLH: return "HexagonISD::VSHLH"; case HexagonISD::VSHLW: return "HexagonISD::VSHLW"; case HexagonISD::VSPLATB: return "HexagonISD::VSPLTB"; @@ -2346,6 +2347,7 @@ SDValue HexagonTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); + bool UseHVX = Subtarget.useHVXOps(); EVT VT = Op.getValueType(); unsigned NElts = Op.getNumOperands(); SDValue Vec = Op.getOperand(0); @@ -2376,6 +2378,14 @@ HexagonTargetLowering::LowerCONCAT_VECTORS(SDValue Op, } } + if (UseHVX) { + SDValue Vec0 = Op.getOperand(1); + uint64_t VS = VecVT.getSizeInBits(); + assert((VS == 64*8 && Subtarget.useHVXSglOps()) || + (VS == 128*8 && Subtarget.useHVXDblOps())); + SDValue Combined = DAG.getNode(HexagonISD::VCOMBINE, dl, VT, Vec0, Vec); + return Combined; + } for (unsigned i = 0, e = NElts; i != e; ++i) { unsigned OpIdx = NElts - i - 1; SDValue Operand = Op.getOperand(OpIdx); diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h index 64033d95ee3c..b6d39fe91728 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.h +++ b/lib/Target/Hexagon/HexagonISelLowering.h @@ -80,6 +80,7 @@ bool isPositiveHalfWord(SDNode *N); INSERTRP, EXTRACTU, EXTRACTURP, + VCOMBINE, TC_RETURN, EH_RETURN, DCFETCH, diff --git a/lib/Target/Hexagon/HexagonInstrInfoV60.td b/lib/Target/Hexagon/HexagonInstrInfoV60.td index 394df0fdd6e3..897ada081534 100644 --- a/lib/Target/Hexagon/HexagonInstrInfoV60.td +++ b/lib/Target/Hexagon/HexagonInstrInfoV60.td @@ -1535,6 +1535,20 @@ let isRegSequence = 1, Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV in defm V6_vcombine : T_HVX_alu_WV <"$dst = vcombine($src1,$src2)">, V6_vcombine_enc; +def SDTHexagonVCOMBINE: SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>, + SDTCisSubVecOfVec<1, 0>]>; + +def HexagonVCOMBINE: SDNode<"HexagonISD::VCOMBINE", SDTHexagonVCOMBINE>; + +def: Pat<(v32i32 (HexagonVCOMBINE (v16i32 
VectorRegs:$Vs), + (v16i32 VectorRegs:$Vt))), + (V6_vcombine VectorRegs:$Vs, VectorRegs:$Vt)>, + Requires<[UseHVXSgl]>; +def: Pat<(v64i32 (HexagonVCOMBINE (v32i32 VecDblRegs:$Vs), + (v32i32 VecDblRegs:$Vt))), + (V6_vcombine_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>, + Requires<[UseHVXDbl]>; + let Itinerary = CVI_VINLANESAT, Type = TypeCVI_VINLANESAT in { defm V6_vsathub : T_HVX_alu_VV <"$dst.ub = vsat($src1.h,$src2.h)">, V6_vsathub_enc; @@ -1872,7 +1886,7 @@ defm V6_vasrhbrndsat : V6_vasrhbrndsat_enc; } -// Assemlber mapped -- alias? +// Assembler mapped -- alias? //defm V6_vtran2x2vdd : T_HVX_shift_VV <"">, V6_vtran2x2vdd_enc; let Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS in { defm V6_vshuffvdd : From 5d94a27cae83d67fa8890f74a03ff22979adaec3 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Thu, 3 Dec 2015 17:07:12 +0000 Subject: [PATCH 010/364] Friendly takeover of the Hexagon backend git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254620 91177308-0d34-0410-b5e6-96231b3b80d8 --- CODE_OWNERS.TXT | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CODE_OWNERS.TXT b/CODE_OWNERS.TXT index 0a6f057341e8..cdd0f8dc933c 100644 --- a/CODE_OWNERS.TXT +++ b/CODE_OWNERS.TXT @@ -53,10 +53,6 @@ N: Quentin Colombet E: qcolombet@apple.com D: Register allocators -N: Anshuman Dasgupta -E: adasgupt@codeaurora.org -D: Hexagon Backend - N: Duncan P. N. Exon Smith E: dexonsmith@apple.com D: Branch weights and BlockFrequencyInfo @@ -138,6 +134,10 @@ N: Richard Osborne E: richard@xmos.com D: XCore Backend +N: Krzysztof Parzyszek +E: kparzysz@codeaurora.org +D: Hexagon Backend + N: Chad Rosier E: mcrosier@codeaurora.org D: Fast-Isel From 23a903a5175908226b6aecc1832d32e820b3f091 Mon Sep 17 00:00:00 2001 From: Matthias Braun Date: Thu, 3 Dec 2015 17:19:58 +0000 Subject: [PATCH 011/364] AArch64FastISel: Use cbz/cbnz to branch on i1 In the case of a conditional branch without a preceding cmp we used to emit a "and; cmp; b.eq/b.ne" sequence, use tbz/tbnz instead. Differential Revision: http://reviews.llvm.org/D15122 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254621 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AArch64/AArch64FastISel.cpp | 86 ++++++------------- test/CodeGen/AArch64/arm64-fast-isel-br.ll | 15 +--- .../AArch64/fast-isel-branch-cond-mask.ll | 3 +- .../AArch64/fast-isel-branch-cond-split.ll | 8 +- 4 files changed, 32 insertions(+), 80 deletions(-) diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp index b7849d5bbc26..cae2d5276296 100644 --- a/lib/Target/AArch64/AArch64FastISel.cpp +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -2275,7 +2275,6 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; - AArch64CC::CondCode CC = AArch64CC::NE; if (const CmpInst *CI = dyn_cast(BI->getCondition())) { if (CI->hasOneUse() && isValueAvailable(CI)) { // Try to optimize or fold the cmp. @@ -2307,7 +2306,7 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { // FCMP_UEQ and FCMP_ONE cannot be checked with a single branch // instruction. 
- CC = getCompareCC(Predicate); + AArch64CC::CondCode CC = getCompareCC(Predicate); AArch64CC::CondCode ExtraCC = AArch64CC::AL; switch (Predicate) { default: @@ -2335,37 +2334,6 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { .addImm(CC) .addMBB(TBB); - finishCondBranch(BI->getParent(), TBB, FBB); - return true; - } - } else if (TruncInst *TI = dyn_cast(BI->getCondition())) { - MVT SrcVT; - if (TI->hasOneUse() && isValueAvailable(TI) && - isTypeSupported(TI->getOperand(0)->getType(), SrcVT)) { - unsigned CondReg = getRegForValue(TI->getOperand(0)); - if (!CondReg) - return false; - bool CondIsKill = hasTrivialKill(TI->getOperand(0)); - - // Issue an extract_subreg to get the lower 32-bits. - if (SrcVT == MVT::i64) { - CondReg = fastEmitInst_extractsubreg(MVT::i32, CondReg, CondIsKill, - AArch64::sub_32); - CondIsKill = true; - } - - unsigned ANDReg = emitAnd_ri(MVT::i32, CondReg, CondIsKill, 1); - assert(ANDReg && "Unexpected AND instruction emission failure."); - emitICmp_ri(MVT::i32, ANDReg, /*IsKill=*/true, 0); - - if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { - std::swap(TBB, FBB); - CC = AArch64CC::EQ; - } - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) - .addImm(CC) - .addMBB(TBB); - finishCondBranch(BI->getParent(), TBB, FBB); return true; } @@ -2383,20 +2351,23 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { } else FuncInfo.MBB->addSuccessorWithoutProb(Target); return true; - } else if (foldXALUIntrinsic(CC, I, BI->getCondition())) { - // Fake request the condition, otherwise the intrinsic might be completely - // optimized away. - unsigned CondReg = getRegForValue(BI->getCondition()); - if (!CondReg) - return false; + } else { + AArch64CC::CondCode CC = AArch64CC::NE; + if (foldXALUIntrinsic(CC, I, BI->getCondition())) { + // Fake request the condition, otherwise the intrinsic might be completely + // optimized away. + unsigned CondReg = getRegForValue(BI->getCondition()); + if (!CondReg) + return false; - // Emit the branch. - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) - .addImm(CC) - .addMBB(TBB); + // Emit the branch. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) + .addImm(CC) + .addMBB(TBB); - finishCondBranch(BI->getParent(), TBB, FBB); - return true; + finishCondBranch(BI->getParent(), TBB, FBB); + return true; + } } unsigned CondReg = getRegForValue(BI->getCondition()); @@ -2404,26 +2375,19 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { return false; bool CondRegIsKill = hasTrivialKill(BI->getCondition()); - // We've been divorced from our compare! Our block was split, and - // now our compare lives in a predecessor block. We musn't - // re-compare here, as the children of the compare aren't guaranteed - // live across the block boundary (we *could* check for this). - // Regardless, the compare has been done in the predecessor block, - // and it left a value for us in a virtual register. Ergo, we test - // the one-bit value left in the virtual register. - // - // FIXME: Optimize this with TBZW/TBZNW. - unsigned ANDReg = emitAnd_ri(MVT::i32, CondReg, CondRegIsKill, 1); - assert(ANDReg && "Unexpected AND instruction emission failure."); - emitICmp_ri(MVT::i32, ANDReg, /*IsKill=*/true, 0); - + // i1 conditions come as i32 values, test the lowest bit with tb(n)z. 
+ unsigned Opcode = AArch64::TBNZW; if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { std::swap(TBB, FBB); - CC = AArch64CC::EQ; + Opcode = AArch64::TBZW; } - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) - .addImm(CC) + const MCInstrDesc &II = TII.get(Opcode); + unsigned ConstrainedCondReg + = constrainOperandRegClass(II, CondReg, II.getNumDefs()); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) + .addReg(ConstrainedCondReg, getKillRegState(CondRegIsKill)) + .addImm(0) .addMBB(TBB); finishCondBranch(BI->getParent(), TBB, FBB); diff --git a/test/CodeGen/AArch64/arm64-fast-isel-br.ll b/test/CodeGen/AArch64/arm64-fast-isel-br.ll index 0ef7b143df80..55c9c6036ed5 100644 --- a/test/CodeGen/AArch64/arm64-fast-isel-br.ll +++ b/test/CodeGen/AArch64/arm64-fast-isel-br.ll @@ -94,9 +94,7 @@ entry: store i32 %c, i32* %c.addr, align 4 store i64 %d, i64* %d.addr, align 8 %0 = load i16, i16* %b.addr, align 2 -; CHECK: and w0, w0, #0x1 -; CHECK: cmp w0, #0 -; CHECK: b.eq LBB4_2 +; CHECK: tbz w0, #0, LBB4_2 %conv = trunc i16 %0 to i1 br i1 %conv, label %if.then, label %if.end @@ -106,9 +104,7 @@ if.then: ; preds = %entry if.end: ; preds = %if.then, %entry %1 = load i32, i32* %c.addr, align 4 -; CHECK: and w[[REG:[0-9]+]], w{{[0-9]+}}, #0x1 -; CHECK: cmp w[[REG]], #0 -; CHECK: b.eq LBB4_4 +; CHECK: tbz w{{[0-9]+}}, #0, LBB4_4 %conv1 = trunc i32 %1 to i1 br i1 %conv1, label %if.then3, label %if.end4 @@ -118,8 +114,7 @@ if.then3: ; preds = %if.end if.end4: ; preds = %if.then3, %if.end %2 = load i64, i64* %d.addr, align 8 -; CHECK: cmp w{{[0-9]+}}, #0 -; CHECK: b.eq LBB4_6 +; CHECK: tbz w{{[0-9]+}}, #0, LBB4_6 %conv5 = trunc i64 %2 to i1 br i1 %conv5, label %if.then7, label %if.end8 @@ -139,9 +134,7 @@ define i32 @trunc64(i64 %foo) nounwind { ; CHECK: trunc64 ; CHECK: and [[REG1:x[0-9]+]], x0, #0x1 ; CHECK: mov x[[REG2:[0-9]+]], [[REG1]] -; CHECK: and [[REG3:w[0-9]+]], w[[REG2]], #0x1 -; CHECK: cmp [[REG3]], #0 -; CHECK: b.eq LBB5_2 +; CHECK: tbz w[[REG2]], #0, LBB5_2 %a = and i64 %foo, 1 %b = trunc i64 %a to i1 br i1 %b, label %if.then, label %if.else diff --git a/test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll b/test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll index c018b2778b04..55fbf63319ee 100644 --- a/test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll +++ b/test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll @@ -4,8 +4,7 @@ define void @test(i64 %a, i64 %b, i2* %c) { ; CHECK-LABEL: test ; CHECK: and [[REG1:w[0-9]+]], w8, #0x3 ; CHECK-NEXT: strb [[REG1]], {{\[}}x2{{\]}} -; CHECK: and [[REG2:w[0-9]+]], w8, #0x1 -; CHECK-NEXT: cmp [[REG2]], #0 +; CHECK-NEXT: tbz w9, #0, %1 = trunc i64 %a to i2 %2 = trunc i64 %b to i1 ; Force fast-isel to fall back to SDAG. 
diff --git a/test/CodeGen/AArch64/fast-isel-branch-cond-split.ll b/test/CodeGen/AArch64/fast-isel-branch-cond-split.ll index 5248b9253e7a..e04a62b85c8e 100644 --- a/test/CodeGen/AArch64/fast-isel-branch-cond-split.ll +++ b/test/CodeGen/AArch64/fast-isel-branch-cond-split.ll @@ -44,9 +44,7 @@ bb4: ; CHECK-NEXT: cmp w1, #0 ; CHECK-NEXT: cset w9, eq ; CHECK-NEXT: orr w8, w8, w9 -; CHECK-NEXT: and w8, w8, #0x1 -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: b.ne +; CHECK-NEXT: tbnz w8, #0, define i64 @test_or_unpredictable(i32 %a, i32 %b) { bb1: %0 = icmp eq i32 %a, 0 @@ -68,9 +66,7 @@ bb4: ; CHECK-NEXT: cmp w1, #0 ; CHECK-NEXT: cset w9, ne ; CHECK-NEXT: and w8, w8, w9 -; CHECK-NEXT: and w8, w8, #0x1 -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: b.eq +; CHECK-NEXT: tbz w8, #0, define i64 @test_and_unpredictable(i32 %a, i32 %b) { bb1: %0 = icmp ne i32 %a, 0 From 83c34652ffe9b368401aea98d9a4cc1ef6fa32d3 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Thu, 3 Dec 2015 17:53:34 +0000 Subject: [PATCH 012/364] [Hexagon] Remove variable unused in NDEBUG build git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254623 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/Hexagon/HexagonISelLowering.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index a75f391a4eea..b59fe6b67044 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -2380,9 +2380,8 @@ HexagonTargetLowering::LowerCONCAT_VECTORS(SDValue Op, if (UseHVX) { SDValue Vec0 = Op.getOperand(1); - uint64_t VS = VecVT.getSizeInBits(); - assert((VS == 64*8 && Subtarget.useHVXSglOps()) || - (VS == 128*8 && Subtarget.useHVXDblOps())); + assert((VecVT.getSizeInBits() == 64*8 && Subtarget.useHVXSglOps()) || + (VecVT.getSizeInBits() == 128*8 && Subtarget.useHVXDblOps())); SDValue Combined = DAG.getNode(HexagonISD::VCOMBINE, dl, VT, Vec0, Vec); return Combined; } From 1c14f2864d7e7b87609c9d4c9b8295d0140ec978 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Thu, 3 Dec 2015 18:20:05 +0000 Subject: [PATCH 013/364] [ThinLTO] Appending linkage fixes Summary: Fix import from module with appending var, which cannot be imported. The first fix is to remove an overly-aggressive error check. The second fix is to deal with restructuring introduced to the module linker yesterday in r254418 (actually, this fix was included already in r254559, just added some additional cleanup). Test by Mehdi Amini. Reviewers: joker.eph, rafael Subscribers: joker.eph, llvm-commits Differential Revision: http://reviews.llvm.org/D15156 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254624 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Linker/LinkModules.cpp | 14 ++++++++----- .../Inputs/funcimport_appending_global.ll | 6 ++++++ test/Linker/funcimport_appending_global.ll | 20 +++++++++++++++++++ 3 files changed, 35 insertions(+), 5 deletions(-) create mode 100644 test/Linker/Inputs/funcimport_appending_global.ll create mode 100644 test/Linker/funcimport_appending_global.ll diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp index 4bc0ad039ac7..55ab1824740b 100644 --- a/lib/Linker/LinkModules.cpp +++ b/lib/Linker/LinkModules.cpp @@ -726,8 +726,10 @@ GlobalValue::LinkageTypes ModuleLinker::getLinkage(const GlobalValue *SGV) { // It would be incorrect to import an appending linkage variable, // since it would cause global constructors/destructors to be // executed multiple times. 
This should have already been handled - // by linkGlobalValueProto. - llvm_unreachable("Cannot import appending linkage variable"); + // by linkIfNeeded, and we will assert in shouldLinkFromSource + // if we try to import, so we simply return AppendingLinkage here + // as this helper is called more widely in getLinkedToGlobal. + return GlobalValue::AppendingLinkage; case GlobalValue::InternalLinkage: case GlobalValue::PrivateLinkage: @@ -1015,8 +1017,7 @@ bool ModuleLinker::shouldLinkFromSource(bool &LinkFromSrc, // We always have to add Src if it has appending linkage. if (Src.hasAppendingLinkage()) { - // Caller should have already determined that we can't link from source - // when importing (see comments in linkGlobalValueProto). + // Should have prevented importing for appending linkage in linkIfNeeded. assert(!isPerformingImport()); LinkFromSrc = true; return false; @@ -1387,9 +1388,12 @@ Constant *ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) { // Handle the ultra special appending linkage case first. assert(!DGV || SGV->hasAppendingLinkage() == DGV->hasAppendingLinkage()); - if (SGV->hasAppendingLinkage()) + if (SGV->hasAppendingLinkage()) { + // Should have prevented importing for appending linkage in linkIfNeeded. + assert(!isPerformingImport()); return linkAppendingVarProto(cast_or_null(DGV), cast(SGV)); + } bool LinkFromSrc = true; Comdat *C = nullptr; diff --git a/test/Linker/Inputs/funcimport_appending_global.ll b/test/Linker/Inputs/funcimport_appending_global.ll new file mode 100644 index 000000000000..413b890b02ad --- /dev/null +++ b/test/Linker/Inputs/funcimport_appending_global.ll @@ -0,0 +1,6 @@ +@v = weak global i8 1 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @foo, i8* @v}] + +define void @foo() { + ret void +} diff --git a/test/Linker/funcimport_appending_global.ll b/test/Linker/funcimport_appending_global.ll new file mode 100644 index 000000000000..190d31ee8c7f --- /dev/null +++ b/test/Linker/funcimport_appending_global.ll @@ -0,0 +1,20 @@ +; RUN: llvm-as -function-summary %s -o %t.bc +; RUN: llvm-as -function-summary %p/Inputs/funcimport_appending_global.ll -o %t2.bc +; RUN: llvm-lto -thinlto -o %t3 %t.bc %t2.bc + +; Do the import now +; RUN: llvm-link %t.bc -functionindex=%t3.thinlto.bc -import=foo:%t2.bc -S | FileCheck %s + +; Ensure that global constructor (appending linkage) is not imported +; CHECK-NOT: @llvm.global_ctors = {{.*}}@foo + +declare void @f() +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @f, i8* null}] + +define i32 @main() { +entry: + call void @foo() + ret i32 0 +} + +declare void @foo() From d61481245dc307bd0c15b7a681fc0b196884589c Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Thu, 3 Dec 2015 18:41:59 +0000 Subject: [PATCH 014/364] dwarfdump: Correctly identify the indices for DWP records The indices are one-based, not zero-based, per the spec.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254626 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/DebugInfo/DWARF/DWARFUnitIndex.cpp | 2 +- test/DebugInfo/dwarfdump-dwp.test | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp b/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp index 1f1921649b57..96b316957dfd 100644 --- a/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp +++ b/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp @@ -133,7 +133,7 @@ void DWARFUnitIndex::dump(raw_ostream &OS) const { for (unsigned i = 0; i != Header.NumBuckets; ++i) { auto &Row = Rows[i]; if (auto *Contribs = Row.Contributions.get()) { - OS << format("%5u 0x%016" PRIx64 " ", i, Row.Signature); + OS << format("%5u 0x%016" PRIx64 " ", i + 1, Row.Signature); for (unsigned i = 0; i != Header.NumColumns; ++i) { auto &Contrib = Contribs[i]; OS << format("[0x%08x, 0x%08x) ", Contrib.Offset, diff --git a/test/DebugInfo/dwarfdump-dwp.test b/test/DebugInfo/dwarfdump-dwp.test index af5de5067e69..8aef636d4d9a 100644 --- a/test/DebugInfo/dwarfdump-dwp.test +++ b/test/DebugInfo/dwarfdump-dwp.test @@ -39,15 +39,15 @@ RUN: llvm-dwarfdump %p/Inputs/dwarfdump-dwp.x86_64.o | FileCheck %s ; CHECK-NEXT: version = 2 slots = 16 ; CHECK: Index Signature INFO ABBREV LINE STR_OFFSETS ; CHECK-NEXT: ----- ------------------ ------------------------ ------------------------ ------------------------ ------------------------ -; CHECK-NEXT: 2 0xfef104c25502f092 [0x0000002d, 0x0000005f) [0x00000043, 0x0000008e) [0x0000001a, 0x00000034) [0x00000010, 0x00000024) -; CHECK-NEXT: 8 0x03c30756e2d45008 [0x00000000, 0x0000002d) [0x00000000, 0x00000043) [0x00000000, 0x0000001a) [0x00000000, 0x00000010) +; CHECK-NEXT: 3 0xfef104c25502f092 [0x0000002d, 0x0000005f) [0x00000043, 0x0000008e) [0x0000001a, 0x00000034) [0x00000010, 0x00000024) +; CHECK-NEXT: 9 0x03c30756e2d45008 [0x00000000, 0x0000002d) [0x00000000, 0x00000043) [0x00000000, 0x0000001a) [0x00000000, 0x00000010) ; CHECK: .debug_tu_index contents: ; CHECK-NEXT: version = 2 slots = 16 ; CHECK: Index Signature TYPES ABBREV LINE STR_OFFSETS ; CHECK-NEXT: ----- ------------------ ------------------------ ------------------------ ------------------------ ------------------------ -; CHECK-NEXT: 8 0x1d02f3be30cc5688 [0x00000024, 0x00000048) [0x00000043, 0x0000008e) [0x0000001a, 0x00000034) [0x00000010, 0x00000024) -; CHECK-NEXT: 12 0x3875c0e21cda63fc [0x00000000, 0x00000024) [0x00000000, 0x00000043) [0x00000000, 0x0000001a) [0x00000000, 0x00000010) +; CHECK-NEXT: 9 0x1d02f3be30cc5688 [0x00000024, 0x00000048) [0x00000043, 0x0000008e) [0x0000001a, 0x00000034) [0x00000010, 0x00000024) +; CHECK-NEXT: 13 0x3875c0e21cda63fc [0x00000000, 0x00000024) [0x00000000, 0x00000043) [0x00000000, 0x0000001a) [0x00000000, 0x00000010) ; TODO: use the index section offset info to correctly dump strings in debug info ; TODO: use the index section offset info to correctly dump file names in debug info From 0123bc6beac1ca9a06450e98d8e829d32ce52498 Mon Sep 17 00:00:00 2001 From: Chris Bieneman Date: Thu, 3 Dec 2015 18:45:39 +0000 Subject: [PATCH 015/364] [CMake] Add option LLVM_EXTERNALIZE_DEBUGINFO Summary: This adds support for generating dSYM files and stripping debug info from executables and dylibs. It also supports passing -object_path_lto to the linker to generate dSYMs for LTO builds. 
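As a rough usage sketch (illustrative only, not part of the patch itself): on an Apple host the option is meant to be switched on at configure time, either with -DLLVM_EXTERNALIZE_DEBUGINFO=ON on the cmake command line or from a cache script:

  # Opt in to dSYM generation and post-build stripping (Darwin-only here).
  set(LLVM_EXTERNALIZE_DEBUGINFO ON CACHE BOOL "")

Each executable and shared library built afterwards gets a dsymutil-produced .dSYM companion and is then stripped.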
Reviewers: bogner, friss Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D15133 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254627 91177308-0d34-0410-b5e6-96231b3b80d8 --- CMakeLists.txt | 3 +++ cmake/modules/AddLLVM.cmake | 27 +++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index fc46413640c5..c4ff8f3cd28b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -301,6 +301,9 @@ endif( LLVM_USE_INTEL_JITEVENTS ) option(LLVM_USE_OPROFILE "Use opagent JIT interface to inform OProfile about JIT code" OFF) +option(LLVM_EXTERNALIZE_DEBUGINFO + "Generate dSYM files and strip executables and libraries (Darwin Only)" OFF) + # If enabled, verify we are on a platform that supports oprofile. if( LLVM_USE_OPROFILE ) if( NOT CMAKE_SYSTEM_NAME MATCHES "Linux" ) diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake index b5517d0b893d..97ac96ed4281 100644 --- a/cmake/modules/AddLLVM.cmake +++ b/cmake/modules/AddLLVM.cmake @@ -512,6 +512,10 @@ function(llvm_add_library name) add_dependencies(${objlib} ${LLVM_COMMON_DEPENDS}) endforeach() endif() + + if(ARG_SHARED OR ARG_MODULE) + llvm_externalize_debuginfo(${name}) + endif() endfunction() macro(add_llvm_library name) @@ -655,6 +659,8 @@ macro(add_llvm_executable name) if( LLVM_COMMON_DEPENDS ) add_dependencies( ${name} ${LLVM_COMMON_DEPENDS} ) endif( LLVM_COMMON_DEPENDS ) + + llvm_externalize_debuginfo(${name}) endmacro(add_llvm_executable name) function(export_executable_symbols target) @@ -1168,3 +1174,24 @@ function(add_llvm_tool_symlink name dest) endif() endif() endfunction() + +function(llvm_externalize_debuginfo name) + if(NOT LLVM_EXTERNALIZE_DEBUGINFO) + return() + endif() + + if(APPLE) + if(CMAKE_CXX_FLAGS MATCHES "-flto" + OR CMAKE_CXX_FLAGS_${uppercase_CMAKE_BUILD_TYPE} MATCHES "-flto") + + set(lto_object ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/${name}-lto.o) + set_target_properties(${name} PROPERTIES + LINK_FLAGS "-Wl,-object_path_lto -Wl,${lto_object}") + endif() + add_custom_command(TARGET ${name} POST_BUILD + COMMAND xcrun dsymutil $ + COMMAND xcrun strip -Sl $) + else() + message(FATAL_ERROR "LLVM_EXTERNALIZE_DEBUGINFO isn't implemented for non-darwin platforms!") + endif() +endfunction() From c4b843ccb787bfd31dc0ce3d01c4c61b5c86ca58 Mon Sep 17 00:00:00 2001 From: Andrew Kaylor Date: Thu, 3 Dec 2015 18:55:28 +0000 Subject: [PATCH 016/364] [WinEH] Avoid infinite loop in BranchFolding for multiple single block funclets Differential Revision: http://reviews.llvm.org/D14996 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254629 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/BranchFolding.cpp | 8 ++ .../BranchFolding/single-block-funclets.ll | 110 ++++++++++++++++++ 2 files changed, 118 insertions(+) create mode 100644 test/Transforms/BranchFolding/single-block-funclets.ll diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp index 54d92ad67a97..c6a6476747e6 100644 --- a/lib/CodeGen/BranchFolding.cpp +++ b/lib/CodeGen/BranchFolding.cpp @@ -1564,6 +1564,14 @@ bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) { // removed, move this block to the end of the function. 
MachineBasicBlock *PrevTBB = nullptr, *PrevFBB = nullptr; SmallVector PrevCond; + // We're looking for cases where PrevBB could possibly fall through to + // FallThrough, but if FallThrough is an EH pad that wouldn't be useful + // so here we skip over any EH pads so we might have a chance to find + // a branch target from PrevBB. + while (FallThrough != MF.end() && FallThrough->isEHPad()) + ++FallThrough; + // Now check to see if the current block is sitting between PrevBB and + // a block to which it could fall through. if (FallThrough != MF.end() && !TII->AnalyzeBranch(PrevBB, PrevTBB, PrevFBB, PrevCond, true) && PrevBB.isSuccessor(&*FallThrough)) { diff --git a/test/Transforms/BranchFolding/single-block-funclets.ll b/test/Transforms/BranchFolding/single-block-funclets.ll new file mode 100644 index 000000000000..21c7818e5195 --- /dev/null +++ b/test/Transforms/BranchFolding/single-block-funclets.ll @@ -0,0 +1,110 @@ +; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s + +declare i32 @__CxxFrameHandler3(...) + +declare void @throw() +declare i16 @f() + +define i16 @test1(i16 %a, i8* %b) personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) { +entry: + %cmp = icmp eq i16 %a, 10 + br i1 %cmp, label %if.then, label %if.else + +if.then: + %call1 = invoke i16 @f() + to label %cleanup unwind label %catch.dispatch + +if.else: + %call2 = invoke i16 @f() + to label %cleanup unwind label %catch.dispatch + +catch.dispatch: + catchpad [i8* null, i32 8, i8* null] + to label %catch unwind label %catch.dispatch.2 + +catch: + invoke void @throw() noreturn + to label %unreachable unwind label %catchendblock + +catch.dispatch.2: + catchpad [i8* null, i32 64, i8* null] + to label %catch.2 unwind label %catchendblock + +catch.2: + store i8 1, i8* %b + invoke void @throw() noreturn + to label %unreachable unwind label %catchendblock + +catchendblock: + catchendpad unwind to caller + +cleanup: + %retval = phi i16 [ %call1, %if.then ], [ %call2, %if.else ] + ret i16 %retval + +unreachable: + unreachable +} + +; This test verifies the case where two funclet blocks meet the old criteria +; to be placed at the end. The order of the blocks is not important for the +; purposes of this test. The failure mode is an infinite loop during +; compilation. 
+; +; CHECK-LABEL: .def test1; + +define i16 @test2(i16 %a, i8* %b) personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) { +entry: + %cmp = icmp eq i16 %a, 10 + br i1 %cmp, label %if.then, label %if.else + +if.then: + %call1 = invoke i16 @f() + to label %cleanup unwind label %catch.dispatch + +if.else: + %call2 = invoke i16 @f() + to label %cleanup unwind label %catch.dispatch + +catch.dispatch: + catchpad [i8* null, i32 8, i8* null] + to label %catch unwind label %catch.dispatch.2 + +catch: + invoke void @throw() noreturn + to label %unreachable unwind label %catchendblock + +catch.dispatch.2: + %c2 = catchpad [i8* null, i32 32, i8* null] + to label %catch.2 unwind label %catch.dispatch.3 + +catch.2: + store i8 1, i8* %b + catchret %c2 to label %cleanup + +catch.dispatch.3: + %c3 = catchpad [i8* null, i32 64, i8* null] + to label %catch.3 unwind label %catchendblock + +catch.3: + store i8 2, i8* %b + catchret %c3 to label %cleanup + +catchendblock: + catchendpad unwind to caller + +cleanup: + %retval = phi i16 [ %call1, %if.then ], [ %call2, %if.else ], [ -1, %catch.2 ], [ -1, %catch.3 ] + ret i16 %retval + +unreachable: + unreachable +} + +; This test verifies the case where three funclet blocks all meet the old +; criteria to be placed at the end. The order of the blocks is not important +; for the purposes of this test. The failure mode is an infinite loop during +; compilation. +; +; CHECK-LABEL: .def test2; + From fd1c9c504318bbe41d43169d3be415bfa978caad Mon Sep 17 00:00:00 2001 From: Easwaran Raman Date: Thu, 3 Dec 2015 19:03:20 +0000 Subject: [PATCH 017/364] Test commit. Remove blank spaces at the end of comments git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254630 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Analysis/InlineCost.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp index 26f2e7ff504a..6d7d74999061 100644 --- a/lib/Analysis/InlineCost.cpp +++ b/lib/Analysis/InlineCost.cpp @@ -115,11 +115,11 @@ class CallAnalyzer : public InstVisitor { /// inlining has the given attribute set either at the call site or the /// function declaration. Primarily used to inspect call site specific /// attributes since these can be more precise than the ones on the callee - /// itself. + /// itself. bool paramHasAttr(Argument *A, Attribute::AttrKind Attr); /// Return true if the given value is known non null within the callee if - /// inlined through this particular callsite. + /// inlined through this particular callsite. bool isKnownNonNullInCallee(Value *V); // Custom analysis routines. From 2272eac9491a7be71409e3e76ec09eccb678aee5 Mon Sep 17 00:00:00 2001 From: Rafael Espindola Date: Thu, 3 Dec 2015 19:10:55 +0000 Subject: [PATCH 018/364] Simplify test. NFC. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254631 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/Linker/weakextern.ll | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/test/Linker/weakextern.ll b/test/Linker/weakextern.ll index 8d479a0d39b3..e1754e60547e 100644 --- a/test/Linker/weakextern.ll +++ b/test/Linker/weakextern.ll @@ -1,12 +1,8 @@ -; RUN: llvm-as < %s > %t.bc -; RUN: llvm-as < %p/testlink.ll > %t2.bc -; RUN: llvm-link %t.bc %t.bc %t2.bc -o %t1.bc -; RUN: llvm-dis < %t1.bc | FileCheck %s +; RUN: llvm-link %s %s %p/testlink.ll -S | FileCheck %s ; CHECK: kallsyms_names = extern_weak ; CHECK: Inte = global i32 ; CHECK: MyVar = external global i32 -@kallsyms_names = extern_weak global [0 x i8] ; <[0 x i8]*> [#uses=0] -@MyVar = extern_weak global i32 ; [#uses=0] -@Inte = extern_weak global i32 ; [#uses=0] - +@kallsyms_names = extern_weak global [0 x i8] +@MyVar = extern_weak global i32 +@Inte = extern_weak global i32 From 44a7fca4326b5bd76d584b488cacd74f37965f13 Mon Sep 17 00:00:00 2001 From: Andrew Kaylor Date: Thu, 3 Dec 2015 19:30:38 +0000 Subject: [PATCH 019/364] Fix buildbot failures git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254636 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/Transforms/BranchFolding/single-block-funclets.ll | 1 + 1 file changed, 1 insertion(+) diff --git a/test/Transforms/BranchFolding/single-block-funclets.ll b/test/Transforms/BranchFolding/single-block-funclets.ll index 21c7818e5195..b2286ac33597 100644 --- a/test/Transforms/BranchFolding/single-block-funclets.ll +++ b/test/Transforms/BranchFolding/single-block-funclets.ll @@ -1,4 +1,5 @@ ; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s +; REQUIRES: X86 declare i32 @__CxxFrameHandler3(...) From 5779341a6d68c4f4971c4f0b0033b41f6cb10662 Mon Sep 17 00:00:00 2001 From: Andrew Kaylor Date: Thu, 3 Dec 2015 19:41:25 +0000 Subject: [PATCH 020/364] Move branch folding test to a better location. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254640 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../X86/branchfolding-catchpads.ll} | 1 - 1 file changed, 1 deletion(-) rename test/{Transforms/BranchFolding/single-block-funclets.ll => CodeGen/X86/branchfolding-catchpads.ll} (99%) diff --git a/test/Transforms/BranchFolding/single-block-funclets.ll b/test/CodeGen/X86/branchfolding-catchpads.ll similarity index 99% rename from test/Transforms/BranchFolding/single-block-funclets.ll rename to test/CodeGen/X86/branchfolding-catchpads.ll index b2286ac33597..21c7818e5195 100644 --- a/test/Transforms/BranchFolding/single-block-funclets.ll +++ b/test/CodeGen/X86/branchfolding-catchpads.ll @@ -1,5 +1,4 @@ ; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s -; REQUIRES: X86 declare i32 @__CxxFrameHandler3(...) From 208ed9b5fb3eb00c3cc52e647df6cfce49168449 Mon Sep 17 00:00:00 2001 From: Chris Bieneman Date: Thu, 3 Dec 2015 19:47:04 +0000 Subject: [PATCH 021/364] [CMake] Removing an unnecessary layer of variable indirection This prevents passthrough variables from having values. 
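With the extra ${${value}} expansion, CMake re-expanded the already-expanded value as if it were another variable name, so the forwarded -D definitions usually came through empty. A minimal sketch of the difference (hypothetical variable names, not taken from the real build):

  set(CLANG_FOO "bar")                            # value the sub-build should receive
  string(REPLACE ";" "\;" value "${CLANG_FOO}")   # value == "bar"
  # before: -DCLANG_FOO=${${value}}  expands a variable literally named "bar" -> usually empty
  # after:  -DCLANG_FOO=${value}     forwards "bar" as intended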
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254641 91177308-0d34-0410-b5e6-96231b3b80d8 --- cmake/modules/LLVMExternalProjectUtils.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/modules/LLVMExternalProjectUtils.cmake b/cmake/modules/LLVMExternalProjectUtils.cmake index 5d8fb71e08b3..c2d9f530c200 100644 --- a/cmake/modules/LLVMExternalProjectUtils.cmake +++ b/cmake/modules/LLVMExternalProjectUtils.cmake @@ -95,7 +95,7 @@ function(llvm_ExternalProject_Add name source_dir) if(variableName MATCHES "^${nameCanon}") string(REPLACE ";" "\;" value "${${variableName}}") list(APPEND PASSTHROUGH_VARIABLES - -D${variableName}=${${value}}) + -D${variableName}=${value}) endif() endforeach() From 3a8af93eb7ae93e59adec0ae6e30352816ea200d Mon Sep 17 00:00:00 2001 From: Reid Kleckner Date: Thu, 3 Dec 2015 20:46:59 +0000 Subject: [PATCH 022/364] [X86] Put no-op ADJCALLSTACK markers around all dynamic lowerings Summary: These ADJCALLSTACK markers don't generate code, but they keep dynamic alloca code that calls chkstk out of the prologue. This slightly pessimizes inalloca calls by preventing some register copy coalescing, but I can live with that. Reviewers: qcolombet Subscribers: hans, llvm-commits Differential Revision: http://reviews.llvm.org/D15200 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254645 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 69 +++++++++++--------------- test/CodeGen/X86/inalloca-stdcall.ll | 5 +- test/CodeGen/X86/inalloca.ll | 15 +++--- test/CodeGen/X86/shrink-wrap-chkstk.ll | 37 ++++++++++++++ 4 files changed, 78 insertions(+), 48 deletions(-) create mode 100644 test/CodeGen/X86/shrink-wrap-chkstk.ll diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 241bbfd331c1..2cf1d4ba30ee 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -15622,54 +15622,40 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SplitStack; SDLoc dl(Op); + // Get the inputs. + SDNode *Node = Op.getNode(); + SDValue Chain = Op.getOperand(0); + SDValue Size = Op.getOperand(1); + unsigned Align = cast(Op.getOperand(2))->getZExtValue(); + EVT VT = Node->getValueType(0); + + // Chain the dynamic stack allocation so that it doesn't modify the stack + // pointer when other instructions are using the stack. + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl); + + bool Is64Bit = Subtarget->is64Bit(); + MVT SPTy = getPointerTy(DAG.getDataLayout()); + + SDValue Result; if (!Lower) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDNode* Node = Op.getNode(); - unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore(); assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and" - " not tell us which reg is the stack pointer!"); + " not tell us which reg is the stack pointer!"); EVT VT = Node->getValueType(0); - SDValue Tmp1 = SDValue(Node, 0); - SDValue Tmp2 = SDValue(Node, 1); SDValue Tmp3 = Node->getOperand(2); - SDValue Chain = Tmp1.getOperand(0); - - // Chain the dynamic stack allocation so that it doesn't modify the stack - // pointer when other instructions are using the stack. 
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), - SDLoc(Node)); - SDValue Size = Tmp2.getOperand(1); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); Chain = SP.getValue(1); unsigned Align = cast(Tmp3)->getZExtValue(); const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); unsigned StackAlign = TFI.getStackAlignment(); - Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value + Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value if (Align > StackAlign) - Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1, - DAG.getConstant(-(uint64_t)Align, dl, VT)); - Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain - - Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), - DAG.getIntPtrConstant(0, dl, true), SDValue(), - SDLoc(Node)); - - SDValue Ops[2] = { Tmp1, Tmp2 }; - return DAG.getMergeValues(Ops, dl); - } - - // Get the inputs. - SDValue Chain = Op.getOperand(0); - SDValue Size = Op.getOperand(1); - unsigned Align = cast(Op.getOperand(2))->getZExtValue(); - EVT VT = Op.getNode()->getValueType(0); - - bool Is64Bit = Subtarget->is64Bit(); - MVT SPTy = getPointerTy(DAG.getDataLayout()); - - if (SplitStack) { + Result = DAG.getNode(ISD::AND, dl, VT, Result, + DAG.getConstant(-(uint64_t)Align, dl, VT)); + Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain + } else if (SplitStack) { MachineRegisterInfo &MRI = MF.getRegInfo(); if (Is64Bit) { @@ -15687,10 +15673,8 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); - SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, + Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, DAG.getRegister(Vreg, SPTy)); - SDValue Ops1[2] = { Value, Chain }; - return DAG.getMergeValues(Ops1, dl); } else { SDValue Flag; const unsigned Reg = (Subtarget->isTarget64BitLP64() ? 
X86::RAX : X86::EAX); @@ -15712,9 +15696,14 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP); } - SDValue Ops1[2] = { SP, Chain }; - return DAG.getMergeValues(Ops1, dl); + Result = SP; } + + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), + DAG.getIntPtrConstant(0, dl, true), SDValue(), dl); + + SDValue Ops[2] = {Result, Chain}; + return DAG.getMergeValues(Ops, dl); } SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { diff --git a/test/CodeGen/X86/inalloca-stdcall.ll b/test/CodeGen/X86/inalloca-stdcall.ll index e5f6ea70e9cb..4f7e4092a99c 100644 --- a/test/CodeGen/X86/inalloca-stdcall.ll +++ b/test/CodeGen/X86/inalloca-stdcall.ll @@ -14,8 +14,9 @@ define void @g() { %f2 = getelementptr %Foo, %Foo* %b, i32 0, i32 1 store i32 13, i32* %f1 store i32 42, i32* %f2 -; CHECK: movl $13, (%esp) -; CHECK: movl $42, 4(%esp) +; CHECK: movl %esp, %eax +; CHECK: movl $13, (%eax) +; CHECK: movl $42, 4(%eax) call x86_stdcallcc void @f(%Foo* inalloca %b) ; CHECK: calll _f@8 ; CHECK-NOT: %esp diff --git a/test/CodeGen/X86/inalloca.ll b/test/CodeGen/X86/inalloca.ll index 904366219ab7..e523c945a69f 100644 --- a/test/CodeGen/X86/inalloca.ll +++ b/test/CodeGen/X86/inalloca.ll @@ -14,8 +14,9 @@ entry: %f2 = getelementptr %Foo, %Foo* %b, i32 0, i32 1 store i32 13, i32* %f1 store i32 42, i32* %f2 -; CHECK: movl $13, (%esp) -; CHECK: movl $42, 4(%esp) +; CHECK: movl %esp, %eax +; CHECK: movl $13, (%eax) +; CHECK: movl $42, 4(%eax) call void @f(%Foo* inalloca %b) ; CHECK: calll _f ret void @@ -33,8 +34,9 @@ entry: %f2 = getelementptr %Foo, %Foo* %b, i32 0, i32 1 store i32 13, i32* %f1 store i32 42, i32* %f2 -; CHECK: movl $13, (%esp) -; CHECK: movl $42, 4(%esp) +; CHECK: movl %esp, %eax +; CHECK: movl $13, (%eax) +; CHECK: movl $42, 4(%eax) call void @inreg_with_inalloca(i32 inreg 1, %Foo* inalloca %b) ; CHECK: movl $1, %eax ; CHECK: calll _inreg_with_inalloca @@ -53,8 +55,9 @@ entry: %f2 = getelementptr %Foo, %Foo* %b, i32 0, i32 1 store i32 13, i32* %f1 store i32 42, i32* %f2 -; CHECK-DAG: movl $13, (%esp) -; CHECK-DAG: movl $42, 4(%esp) +; CHECK: movl %esp, %eax +; CHECK-DAG: movl $13, (%eax) +; CHECK-DAG: movl $42, 4(%eax) call x86_thiscallcc void @thiscall_with_inalloca(i8* null, %Foo* inalloca %b) ; CHECK-DAG: xorl %ecx, %ecx ; CHECK: calll _thiscall_with_inalloca diff --git a/test/CodeGen/X86/shrink-wrap-chkstk.ll b/test/CodeGen/X86/shrink-wrap-chkstk.ll new file mode 100644 index 000000000000..c0b2b45e676f --- /dev/null +++ b/test/CodeGen/X86/shrink-wrap-chkstk.ll @@ -0,0 +1,37 @@ +; RUN: llc < %s -enable-shrink-wrap=true | FileCheck %s + +; chkstk cannot come before the usual prologue, since it adjusts ESP. 
+ +target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32" +target triple = "i686-pc-windows-msvc18.0.0" + +%struct.S = type { [12 x i8] } + +define x86_thiscallcc void @call_inalloca(i1 %x) { +entry: + %argmem = alloca inalloca <{ %struct.S }>, align 4 + %argidx1 = getelementptr inbounds <{ %struct.S }>, <{ %struct.S }>* %argmem, i32 0, i32 0, i32 0, i32 0 + %argidx2 = getelementptr inbounds <{ %struct.S }>, <{ %struct.S }>* %argmem, i32 0, i32 0, i32 0, i32 1 + store i8 42, i8* %argidx2, align 4 + br i1 %x, label %bb1, label %bb2 + +bb1: + store i8 42, i8* %argidx1, align 4 + br label %bb2 + +bb2: + call void @inalloca_params(<{ %struct.S }>* inalloca nonnull %argmem) + ret void +} + +; CHECK-LABEL: _call_inalloca: # @call_inalloca +; CHECK: pushl %ebp +; CHECK: movl %esp, %ebp +; CHECK: movl $12, %eax +; CHECK: calll __chkstk +; CHECK: calll _inalloca_params +; CHECK: movl %ebp, %esp +; CHECK: popl %ebp +; CHECK: retl + +declare void @inalloca_params(<{ %struct.S }>* inalloca) From 52c4f7de0934eef2019b77a14dc94415459d3fc1 Mon Sep 17 00:00:00 2001 From: Easwaran Raman Date: Thu, 3 Dec 2015 20:57:37 +0000 Subject: [PATCH 023/364] Interface to attach maximum function count from PGO to module as module flags. This provides interface to get and set maximum function counts to Module. This would allow things like determination of function hotness. The actual setting of this max function count will have to be done in the frontend. Differential Revision: http://reviews.llvm.org/D15003 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254647 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/Module.h | 11 +++++++++++ lib/IR/Module.cpp | 12 ++++++++++++ 2 files changed, 23 insertions(+) diff --git a/include/llvm/IR/Module.h b/include/llvm/IR/Module.h index 4e99c4256730..6cf75e747e06 100644 --- a/include/llvm/IR/Module.h +++ b/include/llvm/IR/Module.h @@ -15,6 +15,7 @@ #ifndef LLVM_IR_MODULE_H #define LLVM_IR_MODULE_H +#include "llvm/ADT/Optional.h" #include "llvm/ADT/iterator_range.h" #include "llvm/IR/Comdat.h" #include "llvm/IR/DataLayout.h" @@ -639,6 +640,16 @@ class Module { /// \brief Set the PIC level (small or large model) void setPICLevel(PICLevel::Level PL); /// @} + + /// @name Utility functions for querying and setting PGO counts + /// @{ + + /// \brief Set maximum function count in PGO mode + void setMaximumFunctionCount(uint64_t); + + /// \brief Returns maximum function count in PGO mode + Optional getMaximumFunctionCount(); + /// @} }; /// An raw_ostream inserter for modules. diff --git a/lib/IR/Module.cpp b/lib/IR/Module.cpp index 2b9adad44ba7..2acd9db210db 100644 --- a/lib/IR/Module.cpp +++ b/lib/IR/Module.cpp @@ -491,3 +491,15 @@ PICLevel::Level Module::getPICLevel() const { void Module::setPICLevel(PICLevel::Level PL) { addModuleFlag(ModFlagBehavior::Error, "PIC Level", PL); } + +void Module::setMaximumFunctionCount(uint64_t Count) { + addModuleFlag(ModFlagBehavior::Error, "MaxFunctionCount", Count); +} + +Optional Module::getMaximumFunctionCount() { + auto *Val = + cast_or_null(getModuleFlag("MaxFunctionCount")); + if (!Val) + return None; + return cast(Val->getValue())->getZExtValue(); +} From 03212a0ad9e68062010028bb737c69312a6ad80f Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Thu, 3 Dec 2015 21:27:59 +0000 Subject: [PATCH 024/364] [RuntimeDyld] DenseMap -> std::unordered_map DenseMap is most applicable when both keys and values are small. In this case, the value violates that assumption, causing quite significant memory overhead. 
A std::unordered_map is more appropriate in this case (or at least fixed the memory problems I was seeing). Differential Revision: http://reviews.llvm.org/D14910 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254651 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp | 4 ++-- lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp index dd02ece3a9f1..a95f3bbe4179 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp @@ -97,11 +97,11 @@ void RuntimeDyldImpl::resolveRelocations() { // The Section here (Sections[i]) refers to the section in which the // symbol for the relocation is located. The SectionID in the relocation // entry provides the section to which the relocation will be applied. - int Idx = it->getFirst(); + int Idx = it->first; uint64_t Addr = Sections[Idx].getLoadAddress(); DEBUG(dbgs() << "Resolving relocations Section #" << Idx << "\t" << format("%p", (uintptr_t)Addr) << "\n"); - resolveRelocationList(it->getSecond(), Addr); + resolveRelocationList(it->second, Addr); } Relocations.clear(); diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h index 914efd24660a..dafd3c8793c3 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h @@ -30,6 +30,7 @@ #include "llvm/Support/SwapByteOrder.h" #include "llvm/Support/raw_ostream.h" #include +#include #include using namespace llvm; @@ -264,7 +265,7 @@ class RuntimeDyldImpl { // Relocations to sections already loaded. Indexed by SectionID which is the // source of the address. The target where the address will be written is // SectionID/Offset in the relocation itself. - DenseMap Relocations; + std::unordered_map Relocations; // Relocations to external symbols that are not yet resolved. Symbols are // external when they aren't found in the global symbol table of all loaded From 70e1c7be4416f3db4a0ca9688879203428e67ac1 Mon Sep 17 00:00:00 2001 From: Colin LeMahieu Date: Thu, 3 Dec 2015 21:44:28 +0000 Subject: [PATCH 025/364] [Hexagon] Adding shuffling resources for HVX instructions and tests for instruction encodings. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254652 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../Hexagon/MCTargetDesc/HexagonShuffler.cpp | 88 +++- .../Hexagon/MCTargetDesc/HexagonShuffler.h | 50 ++- test/MC/Hexagon/test.s | 4 + test/MC/Hexagon/v60-alu.s | 312 +++++++++++++ test/MC/Hexagon/v60-permute.s | 51 +++ test/MC/Hexagon/v60-shift.s | 39 ++ test/MC/Hexagon/v60-vcmp.s | 84 ++++ test/MC/Hexagon/v60-vmem.s | 424 ++++++++++++++++++ test/MC/Hexagon/v60-vmpy-acc.s | 123 +++++ test/MC/Hexagon/v60-vmpy1.s | 138 ++++++ test/MC/Hexagon/v60lookup.s | 14 + 11 files changed, 1320 insertions(+), 7 deletions(-) create mode 100644 test/MC/Hexagon/test.s create mode 100644 test/MC/Hexagon/v60-alu.s create mode 100644 test/MC/Hexagon/v60-permute.s create mode 100644 test/MC/Hexagon/v60-shift.s create mode 100644 test/MC/Hexagon/v60-vcmp.s create mode 100644 test/MC/Hexagon/v60-vmem.s create mode 100644 test/MC/Hexagon/v60-vmpy-acc.s create mode 100644 test/MC/Hexagon/v60-vmpy1.s create mode 100644 test/MC/Hexagon/v60lookup.s diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp index 45e1909ede5a..6ceb848ba20c 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp @@ -95,6 +95,60 @@ unsigned HexagonResource::setWeight(unsigned s) { return (Weight); } +HexagonCVIResource::TypeUnitsAndLanes *HexagonCVIResource::TUL; + +bool HexagonCVIResource::SetUp = HexagonCVIResource::setup(); + +bool HexagonCVIResource::setup() { + assert(!TUL); + TUL = new (TypeUnitsAndLanes); + + (*TUL)[HexagonII::TypeCVI_VA] = + UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1); + (*TUL)[HexagonII::TypeCVI_VA_DV] = UnitsAndLanes(CVI_XLANE | CVI_MPY0, 2); + (*TUL)[HexagonII::TypeCVI_VX] = UnitsAndLanes(CVI_MPY0 | CVI_MPY1, 1); + (*TUL)[HexagonII::TypeCVI_VX_DV] = UnitsAndLanes(CVI_MPY0, 2); + (*TUL)[HexagonII::TypeCVI_VP] = UnitsAndLanes(CVI_XLANE, 1); + (*TUL)[HexagonII::TypeCVI_VP_VS] = UnitsAndLanes(CVI_XLANE, 2); + (*TUL)[HexagonII::TypeCVI_VS] = UnitsAndLanes(CVI_SHIFT, 1); + (*TUL)[HexagonII::TypeCVI_VINLANESAT] = UnitsAndLanes(CVI_SHIFT, 1); + (*TUL)[HexagonII::TypeCVI_VM_LD] = + UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1); + (*TUL)[HexagonII::TypeCVI_VM_TMP_LD] = UnitsAndLanes(CVI_NONE, 0); + (*TUL)[HexagonII::TypeCVI_VM_CUR_LD] = + UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1); + (*TUL)[HexagonII::TypeCVI_VM_VP_LDU] = UnitsAndLanes(CVI_XLANE, 1); + (*TUL)[HexagonII::TypeCVI_VM_ST] = + UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1); + (*TUL)[HexagonII::TypeCVI_VM_NEW_ST] = UnitsAndLanes(CVI_NONE, 0); + (*TUL)[HexagonII::TypeCVI_VM_STU] = UnitsAndLanes(CVI_XLANE, 1); + (*TUL)[HexagonII::TypeCVI_HIST] = UnitsAndLanes(CVI_XLANE, 4); + + return true; +} + +HexagonCVIResource::HexagonCVIResource(MCInstrInfo const &MCII, unsigned s, + MCInst const *id) + : HexagonResource(s) { + unsigned T = HexagonMCInstrInfo::getType(MCII, *id); + + if (TUL->count(T)) { + // For an HVX insn. + Valid = true; + setUnits((*TUL)[T].first); + setLanes((*TUL)[T].second); + setLoad(HexagonMCInstrInfo::getDesc(MCII, *id).mayLoad()); + setStore(HexagonMCInstrInfo::getDesc(MCII, *id).mayStore()); + } else { + // For core insns. 
+ Valid = false; + setUnits(0); + setLanes(0); + setLoad(false); + setStore(false); + } +} + HexagonShuffler::HexagonShuffler(MCInstrInfo const &MCII, MCSubtargetInfo const &STI) : MCII(MCII), STI(STI) { @@ -109,7 +163,7 @@ void HexagonShuffler::reset() { void HexagonShuffler::append(MCInst const *ID, MCInst const *Extender, unsigned S, bool X) { - HexagonInstr PI(ID, Extender, S, X); + HexagonInstr PI(MCII, ID, Extender, S, X); Packet.push_back(PI); } @@ -128,6 +182,8 @@ bool HexagonShuffler::check() { // Number of memory operations, loads, solo loads, stores, solo stores, single // stores. unsigned memory = 0, loads = 0, load0 = 0, stores = 0, store0 = 0, store1 = 0; + // Number of HVX loads, HVX stores. + unsigned CVIloads = 0, CVIstores = 0; // Number of duplex insns, solo insns. unsigned duplex = 0, solo = 0; // Number of insns restricting other insns in the packet to A and X types, @@ -170,6 +226,12 @@ bool HexagonShuffler::check() { case HexagonII::TypeJ: ++jumps; break; + case HexagonII::TypeCVI_VM_VP_LDU: + ++onlyNo1; + case HexagonII::TypeCVI_VM_LD: + case HexagonII::TypeCVI_VM_TMP_LD: + case HexagonII::TypeCVI_VM_CUR_LD: + ++CVIloads; case HexagonII::TypeLD: ++loads; ++memory; @@ -178,6 +240,11 @@ bool HexagonShuffler::check() { if (HexagonMCInstrInfo::getDesc(MCII, *ID).isReturn()) ++jumps, ++jump1; // DEALLOC_RETURN is of type LD. break; + case HexagonII::TypeCVI_VM_STU: + ++onlyNo1; + case HexagonII::TypeCVI_VM_ST: + case HexagonII::TypeCVI_VM_NEW_ST: + ++CVIstores; case HexagonII::TypeST: ++stores; ++memory; @@ -205,9 +272,9 @@ bool HexagonShuffler::check() { } // Check if the packet is legal. - if ((load0 > 1 || store0 > 1) || (duplex > 1 || (duplex && memory)) || - (solo && size() > 1) || (onlyAX && neitherAnorX > 1) || - (onlyAX && xtypeFloat)) { + if ((load0 > 1 || store0 > 1 || CVIloads > 1 || CVIstores > 1) || + (duplex > 1 || (duplex && memory)) || (solo && size() > 1) || + (onlyAX && neitherAnorX > 1) || (onlyAX && xtypeFloat)) { Error = SHUFFLE_ERROR_INVALID; return false; } @@ -338,6 +405,19 @@ bool HexagonShuffler::check() { return false; } } + // Verify the CVI slot subscriptions. + { + HexagonUnitAuction AuctionCVI; + + std::sort(begin(), end(), HexagonInstr::lessCVI); + + for (iterator I = begin(); I != end(); ++I) + for (unsigned i = 0; i < I->CVI.getLanes(); ++i) // TODO: I->CVI.isValid? + if (!AuctionCVI.bid(I->CVI.getUnits() << i)) { + Error = SHUFFLE_ERROR_SLOTS; + return false; + } + } Error = SHUFFLE_SUCCESS; return true; diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h index 6355c3275a38..174f10fb2580 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h @@ -51,6 +51,44 @@ class HexagonResource { }; }; +// HVX insn resources. +class HexagonCVIResource : public HexagonResource { + typedef std::pair UnitsAndLanes; + typedef llvm::DenseMap TypeUnitsAndLanes; + + // Available HVX slots. + enum { + CVI_NONE = 0, + CVI_XLANE = 1 << 0, + CVI_SHIFT = 1 << 1, + CVI_MPY0 = 1 << 2, + CVI_MPY1 = 1 << 3 + }; + + static bool SetUp; + static bool setup(); + static TypeUnitsAndLanes *TUL; + + // Count of adjacent slots that the insn requires to be executed. + unsigned Lanes; + // Flag whether the insn is a load or a store. + bool Load, Store; + // Flag whether the HVX resources are valid. 
+ bool Valid; + + void setLanes(unsigned l) { Lanes = l; }; + void setLoad(bool f = true) { Load = f; }; + void setStore(bool f = true) { Store = f; }; + +public: + HexagonCVIResource(MCInstrInfo const &MCII, unsigned s, MCInst const *id); + + bool isValid() const { return (Valid); }; + unsigned getLanes() const { return (Lanes); }; + bool mayLoad() const { return (Load); }; + bool mayStore() const { return (Store); }; +}; + // Handle to an insn used by the shuffling algorithm. class HexagonInstr { friend class HexagonShuffler; @@ -58,12 +96,14 @@ class HexagonInstr { MCInst const *ID; MCInst const *Extender; HexagonResource Core; + HexagonCVIResource CVI; bool SoloException; public: - HexagonInstr(MCInst const *id, MCInst const *Extender, unsigned s, - bool x = false) - : ID(id), Extender(Extender), Core(s), SoloException(x){}; + HexagonInstr(MCInstrInfo const &MCII, MCInst const *id, + MCInst const *Extender, unsigned s, bool x = false) + : ID(id), Extender(Extender), Core(s), CVI(MCII, s, id), + SoloException(x){}; MCInst const *getDesc() const { return (ID); }; @@ -79,6 +119,10 @@ class HexagonInstr { static bool lessCore(const HexagonInstr &A, const HexagonInstr &B) { return (HexagonResource::lessUnits(A.Core, B.Core)); }; + // Check if the handles are in ascending order by HVX slots. + static bool lessCVI(const HexagonInstr &A, const HexagonInstr &B) { + return (HexagonResource::lessUnits(A.CVI, B.CVI)); + }; }; // Bundle shuffler. diff --git a/test/MC/Hexagon/test.s b/test/MC/Hexagon/test.s new file mode 100644 index 000000000000..e60578e65930 --- /dev/null +++ b/test/MC/Hexagon/test.s @@ -0,0 +1,4 @@ +#RUN: llvm-mc -filetype=obj -triple=hexagon -mcpu=hexagonv60 %s + +{ vmem (r0 + #0) = v0 + r0 = memw(r0) } \ No newline at end of file diff --git a/test/MC/Hexagon/v60-alu.s b/test/MC/Hexagon/v60-alu.s new file mode 100644 index 000000000000..1583c3da2cb7 --- /dev/null +++ b/test/MC/Hexagon/v60-alu.s @@ -0,0 +1,312 @@ +#RUN: llvm-mc -triple=hexagon -mcpu=hexagonv60 -filetype=obj %s | \ +#RUN: llvm-objdump -triple=hexagon -mcpu=hexagonv60 -d - | \ +#RUN: FileCheck %s + +#CHECK: 1ce2cbd7 { v23.w = vavg(v11.w,{{ *}}v2.w):rnd } +v23.w=vavg(v11.w,v2.w):rnd + +#CHECK: 1cf4d323 { v3.h = vnavg(v19.h,{{ *}}v20.h) } +v3.h=vnavg(v19.h,v20.h) + +#CHECK: 1cffce9a { v26.uh = vavg(v14.uh,{{ *}}v31.uh):rnd } +v26.uh=vavg(v14.uh,v31.uh):rnd + +#CHECK: 1ce5cba1 { v1.h = vavg(v11.h,{{ *}}v5.h):rnd } +v1.h=vavg(v11.h,v5.h):rnd + +#CHECK: 1cc0d012 { v18.ub = vabsdiff(v16.ub,{{ *}}v0.ub) } +v18.ub=vabsdiff(v16.ub,v0.ub) + +#CHECK: 1cc2de29 { v9.uh = vabsdiff(v30.h,{{ *}}v2.h) } +v9.uh=vabsdiff(v30.h,v2.h) + +#CHECK: 1ce9ca06 { v6.b = vnavg(v10.ub,{{ *}}v9.ub) } +v6.b=vnavg(v10.ub,v9.ub) + +#CHECK: 1caacf90 { v17:16.w = vadd(v15.h,{{ *}}v10.h) } +v17:16.w=vadd(v15.h,v10.h) + +#CHECK: 1cb4cabe { v31:30.h = vsub(v10.ub,{{ *}}v20.ub) } +v31:30.h=vsub(v10.ub,v20.ub) + +#CHECK: 1cb8cada { v27:26.w = vsub(v10.uh,{{ *}}v24.uh) } +v27:26.w=vsub(v10.uh,v24.uh) + +#CHECK: 1cbcdbe8 { v9:8.w = vsub(v27.h,{{ *}}v28.h) } +v9:8.w=vsub(v27.h,v28.h) + +#CHECK: 1caeca00 { v1:0.h = vsub(v11:10.h,{{ *}}v15:14.h):sat } +v1:0.h=vsub(v11:10.h,v15:14.h):sat + +#CHECK: 1ca8c43e { v31:30.w = vsub(v5:4.w,{{ *}}v9:8.w):sat } +v31:30.w=vsub(v5:4.w,v9:8.w):sat + +#CHECK: 1cbad95c { v29:28.h = vadd(v25.ub,{{ *}}v26.ub) } +v29:28.h=vadd(v25.ub,v26.ub) + +#CHECK: 1ca1dc64 { v5:4.w = vadd(v28.uh,{{ *}}v1.uh) } +v5:4.w=vadd(v28.uh,v1.uh) + +#CHECK: 1c79c350 { v16.h = vsub(v3.h,{{ *}}v25.h):sat } +v16.h=vsub(v3.h,v25.h):sat + +#CHECK: 
1c7fd364 { v4.w = vsub(v19.w,{{ *}}v31.w):sat } +v4.w=vsub(v19.w,v31.w):sat + +#CHECK: 1c67d816 { v22.ub = vsub(v24.ub,{{ *}}v7.ub):sat } +v22.ub=vsub(v24.ub,v7.ub):sat + +#CHECK: 1c7ddc2f { v15.uh = vsub(v28.uh,{{ *}}v29.uh):sat } +v15.uh=vsub(v28.uh,v29.uh):sat + +#CHECK: 1c5cc6d7 { v23.h = vsub(v6.h,{{ *}}v28.h) } +v23.h=vsub(v6.h,v28.h) + +#CHECK: 1c54cae4 { v4.w = vsub(v10.w,{{ *}}v20.w) } +v4.w=vsub(v10.w,v20.w) + +#CHECK: 1c4dc78b { v11.w = vadd(v7.w,{{ *}}v13.w):sat } +v11.w=vadd(v7.w,v13.w):sat + +#CHECK: 1c48c7a4 { v4.b = vsub(v7.b,{{ *}}v8.b) } +v4.b=vsub(v7.b,v8.b) + +#CHECK: 1cdec3b0 { v16.uh = vavg(v3.uh,{{ *}}v30.uh) } +v16.uh=vavg(v3.uh,v30.uh) + +#CHECK: 1c76dc98 { v25:24.b = vadd(v29:28.b,{{ *}}v23:22.b) } +v25:24.b=vadd(v29:28.b,v23:22.b) + +#CHECK: 1c7ad4a6 { v7:6.h = vadd(v21:20.h,{{ *}}v27:26.h) } +v7:6.h=vadd(v21:20.h,v27:26.h) + +#CHECK: 1cc7c564 { v4.uw = vabsdiff(v5.w,{{ *}}v7.w) } +v4.uw=vabsdiff(v5.w,v7.w) + +#CHECK: 1cd2cdc1 { v1.h = vavg(v13.h,{{ *}}v18.h) } +v1.h=vavg(v13.h,v18.h) + +#CHECK: 1cd5d246 { v6.uh = vabsdiff(v18.uh,{{ *}}v21.uh) } +v6.uh=vabsdiff(v18.uh,v21.uh) + +#CHECK: 1cdcd987 { v7.ub = vavg(v25.ub,{{ *}}v28.ub) } +v7.ub=vavg(v25.ub,v28.ub) + +#CHECK: 1c92c6e4 { v5:4.uh = vsub(v7:6.uh,{{ *}}v19:18.uh):sat } +v5:4.uh=vsub(v7:6.uh,v19:18.uh):sat + +#CHECK: 1c86dace { v15:14.ub = vsub(v27:26.ub,{{ *}}v7:6.ub):sat } +v15:14.ub=vsub(v27:26.ub,v7:6.ub):sat + +#CHECK: 1cffc07c { v28.ub = vavg(v0.ub,{{ *}}v31.ub):rnd } +v28.ub=vavg(v0.ub,v31.ub):rnd + +#CHECK: 1cf8d851 { v17.w = vnavg(v24.w,{{ *}}v24.w) } +v17.w=vnavg(v24.w,v24.w) + +#CHECK: 1c70d2e6 { v7:6.ub = vadd(v19:18.ub,{{ *}}v17:16.ub):sat } +v7:6.ub=vadd(v19:18.ub,v17:16.ub):sat + +#CHECK: 1c72dec6 { v7:6.w = vadd(v31:30.w,{{ *}}v19:18.w) } +v7:6.w=vadd(v31:30.w,v19:18.w) + +#CHECK: 1c92d23e { v31:30.h = vadd(v19:18.h,{{ *}}v19:18.h):sat } +v31:30.h=vadd(v19:18.h,v19:18.h):sat + +#CHECK: 1c94de1e { v31:30.uh = vadd(v31:30.uh,{{ *}}v21:20.uh):sat } +v31:30.uh=vadd(v31:30.uh,v21:20.uh):sat + +#CHECK: 1c9ec07c { v29:28.b = vsub(v1:0.b,{{ *}}v31:30.b) } +v29:28.b=vsub(v1:0.b,v31:30.b) + +#CHECK: 1c88da56 { v23:22.w = vadd(v27:26.w,{{ *}}v9:8.w):sat } +v23:22.w=vadd(v27:26.w,v9:8.w):sat + +#CHECK: 1c9acab8 { v25:24.w = vsub(v11:10.w,{{ *}}v27:26.w) } +v25:24.w=vsub(v11:10.w,v27:26.w) + +#CHECK: 1c82d282 { v3:2.h = vsub(v19:18.h,{{ *}}v3:2.h) } +v3:2.h=vsub(v19:18.h,v3:2.h) + +#CHECK: 1c2bd9a6 { v6 = vand(v25,{{ *}}v11) } +v6=vand(v25,v11) + +#CHECK: 1c43c22d { v13.ub = vadd(v2.ub,{{ *}}v3.ub):sat } +v13.ub=vadd(v2.ub,v3.ub):sat + +#CHECK: 1c59d707 { v7.w = vadd(v23.w,{{ *}}v25.w) } +v7.w=vadd(v23.w,v25.w) + +#CHECK: 1c3fc9e1 { v1 = vxor(v9,{{ *}}v31) } +v1=vxor(v9,v31) + +#CHECK: 1c2acbdf { v31 = vor(v11,{{ *}}v10) } +v31=vor(v11,v10) + +#CHECK: 1cdaccf6 { v22.w = vavg(v12.w,{{ *}}v26.w) } +v22.w=vavg(v12.w,v26.w) + +#CHECK: 1c5ac767 { v7.h = vadd(v7.h,{{ *}}v26.h):sat } +v7.h=vadd(v7.h,v26.h):sat + +#CHECK: 1c40d956 { v22.uh = vadd(v25.uh,{{ *}}v0.uh):sat } +v22.uh=vadd(v25.uh,v0.uh):sat + +#CHECK: 1fbbd611 { v17.w = vasr(v22.w{{ *}},{{ *}}v27.w) } +v17.w=vasr(v22.w,v27.w) + +#CHECK: 1fbad835 { v21.w = vlsr(v24.w{{ *}},{{ *}}v26.w) } +v21.w=vlsr(v24.w,v26.w) + +#CHECK: 1f79cedc { v28.b = vround(v14.h{{ *}},{{ *}}v25.h):sat } +v28.b=vround(v14.h,v25.h):sat + +#CHECK: 1f69c4e0 { v0.ub = vround(v4.h{{ *}},{{ *}}v9.h):sat } +v0.ub=vround(v4.h,v9.h):sat + +#CHECK: 1f72c485 { v5.h = vround(v4.w{{ *}},{{ *}}v18.w):sat } +v5.h=vround(v4.w,v18.w):sat + +#CHECK: 1f6bc8b1 { v17.uh = vround(v8.w{{ *}},{{ 
*}}v11.w):sat } +v17.uh=vround(v8.w,v11.w):sat + +#CHECK: 1f71c25b { v27.ub = vsat(v2.h{{ *}},{{ *}}v17.h) } +v27.ub=vsat(v2.h,v17.h) + +#CHECK: 1f66c560 { v0.h = vsat(v5.w{{ *}},{{ *}}v6.w) } +v0.h=vsat(v5.w,v6.w) + +#CHECK: 1fb3d148 { v8.h = vlsr(v17.h{{ *}},{{ *}}v19.h) } +v8.h=vlsr(v17.h,v19.h) + +#CHECK: 1fbec56e { v14.h = vasr(v5.h{{ *}},{{ *}}v30.h) } +v14.h=vasr(v5.h,v30.h) + +#CHECK: 1fb2d2a2 { v2.h = vasl(v18.h{{ *}},{{ *}}v18.h) } +v2.h=vasl(v18.h,v18.h) + +#CHECK: 1faccc95 { v21.w = vasl(v12.w{{ *}},{{ *}}v12.w) } +v21.w=vasl(v12.w,v12.w) + +#CHECK: 1fb9c1e2 { v2.h = vadd(v1.h{{ *}},{{ *}}v25.h) } +v2.h=vadd(v1.h,v25.h) + +#CHECK: 1fbbd5df { v31.b = vadd(v21.b{{ *}},{{ *}}v27.b) } +v31.b=vadd(v21.b,v27.b) + +#CHECK: 1f25c578 { v24 = vrdelta(v5{{ *}},{{ *}}v5) } +v24=vrdelta(v5,v5) + +#CHECK: 1f22c62a { v10 = vdelta(v6{{ *}},{{ *}}v2) } +v10=vdelta(v6,v2) + +#CHECK: 1f20d102 { v2.w = vmax(v17.w{{ *}},{{ *}}v0.w) } +v2.w=vmax(v17.w,v0.w) + +#CHECK: 1f1ed6fc { v28.h = vmax(v22.h{{ *}},{{ *}}v30.h) } +v28.h=vmax(v22.h,v30.h) + +#CHECK: 1f0cc8d8 { v24.uh = vmax(v8.uh{{ *}},{{ *}}v12.uh) } +v24.uh=vmax(v8.uh,v12.uh) + +#CHECK: 1f00c1b0 { v16.ub = vmax(v1.ub{{ *}},{{ *}}v0.ub) } +v16.ub=vmax(v1.ub,v0.ub) + +#CHECK: 1f12d08e { v14.w = vmin(v16.w{{ *}},{{ *}}v18.w) } +v14.w=vmin(v16.w,v18.w) + +#CHECK: 1f1ad466 { v6.h = vmin(v20.h{{ *}},{{ *}}v26.h) } +v6.h=vmin(v20.h,v26.h) + +#CHECK: 1f13df5d { v29.uh = vmin(v31.uh{{ *}},{{ *}}v19.uh) } +v29.uh=vmin(v31.uh,v19.uh) + +#CHECK: 1f09c226 { v6.ub = vmin(v2.ub{{ *}},{{ *}}v9.ub) } +v6.ub=vmin(v2.ub,v9.ub) + +#CHECK: 1f41d34f { v15.b = vshuffo(v19.b{{ *}},{{ *}}v1.b) } +v15.b=vshuffo(v19.b,v1.b) + +#CHECK: 1f5fc72e { v14.b = vshuffe(v7.b{{ *}},{{ *}}v31.b) } +v14.b=vshuffe(v7.b,v31.b) + +#CHECK: 1f34d0f7 { v23.b = vdeale(v16.b{{ *}},{{ *}}v20.b) } +v23.b=vdeale(v16.b,v20.b) + +#CHECK: 1f4bd6c4 { v5:4.b = vshuffoe(v22.b{{ *}},{{ *}}v11.b) } +v5:4.b=vshuffoe(v22.b,v11.b) + +#CHECK: 1f5dcea2 { v3:2.h = vshuffoe(v14.h{{ *}},{{ *}}v29.h) } +v3:2.h=vshuffoe(v14.h,v29.h) + +#CHECK: 1f4fd186 { v6.h = vshuffo(v17.h{{ *}},{{ *}}v15.h) } +v6.h=vshuffo(v17.h,v15.h) + +#CHECK: 1f5bda79 { v25.h = vshuffe(v26.h{{ *}},{{ *}}v27.h) } +v25.h=vshuffe(v26.h,v27.h) + +#CHECK: 1f41d1f2 { v19:18 = vcombine(v17{{ *}},{{ *}}v1) } +v19:18=vcombine(v17,v1) + +#CHECK: 1e82f432 { if (!q2) v18.b -= v20.b } +if (!q2) v18.b-=v20.b + +#CHECK: 1ec2fd13 { if (q3) v19.w -= v29.w } +if (q3) v19.w-=v29.w + +#CHECK: 1e81fef9 { if (q2) v25.h -= v30.h } +if (q2) v25.h-=v30.h + +#CHECK: 1e81e2d3 { if (q2) v19.b -= v2.b } +if (q2) v19.b-=v2.b + +#CHECK: 1e41ecad { if (!q1) v13.w += v12.w } +if (!q1) v13.w+=v12.w + +#CHECK: 1e41e789 { if (!q1) v9.h += v7.h } +if (!q1) v9.h+=v7.h + +#CHECK: 1e81e967 { if (!q2) v7.b += v9.b } +if (!q2) v7.b+=v9.b + +#CHECK: 1e41f04f { if (q1) v15.w += v16.w } +if (q1) v15.w+=v16.w + +#CHECK: 1e01e838 { if (q0) v24.h += v8.h } +if (q0) v24.h+=v8.h + +#CHECK: 1ec1f112 { if (q3) v18.b += v17.b } +if (q3) v18.b+=v17.b + +#CHECK: 1e42f67b { if (!q1) v27.w -= v22.w } +if (!q1) v27.w-=v22.w + +#CHECK: 1e82ea5b { if (!q2) v27.h -= v10.h } +if (!q2) v27.h-=v10.h + +#CHECK: 1e00c586 { v6 = vnot(v5) } +v6=vnot(v5) + +#CHECK: 1e00df70 { v16.w = vabs(v31.w):sat } +v16.w=vabs(v31.w):sat + +#CHECK: 1e00d45f { v31.w = vabs(v20.w) } +v31.w=vabs(v20.w) + +#CHECK: 1e00db2f { v15.h = vabs(v27.h):sat } +v15.h=vabs(v27.h):sat + +#CHECK: 1e00d001 { v1.h = vabs(v16.h) } +v1.h=vabs(v16.h) + +#CHECK: 1e02c832 { v19:18.uh = vzxt(v8.ub) } +v19:18.uh=vzxt(v8.ub) + +#CHECK: 1e02c98a 
{ v11:10.w = vsxt(v9.h) } +v11:10.w=vsxt(v9.h) + +#CHECK: 1e02cf76 { v23:22.h = vsxt(v15.b) } +v23:22.h=vsxt(v15.b) + +#CHECK: 1e02c258 { v25:24.uw = vzxt(v2.uh) } +v25:24.uw=vzxt(v2.uh) diff --git a/test/MC/Hexagon/v60-permute.s b/test/MC/Hexagon/v60-permute.s new file mode 100644 index 000000000000..b3544bd0a57b --- /dev/null +++ b/test/MC/Hexagon/v60-permute.s @@ -0,0 +1,51 @@ +#RUN: llvm-mc -triple=hexagon -mcpu=hexagonv60 -filetype=obj %s | \ +#RUN: llvm-objdump -triple=hexagon -mcpu=hexagonv60 -d - | \ +#RUN: FileCheck %s + +#CHECK: 1fd2d5cf { v15.b = vpack(v21.h{{ *}},{{ *}}v18.h):sat } +v15.b=vpack(v21.h,v18.h):sat + +#CHECK: 1fd7d7a2 { v2.ub = vpack(v23.h{{ *}},{{ *}}v23.h):sat } +v2.ub=vpack(v23.h,v23.h):sat + +#CHECK: 1fc7d464 { v4.h = vpacke(v20.w{{ *}},{{ *}}v7.w) } +v4.h=vpacke(v20.w,v7.w) + +#CHECK: 1fc2c75b { v27.b = vpacke(v7.h{{ *}},{{ *}}v2.h) } +v27.b=vpacke(v7.h,v2.h) + +#CHECK: 1fc9c5ed { v13.uh = vpack(v5.w{{ *}},{{ *}}v9.w):sat } +v13.uh=vpack(v5.w,v9.w):sat + +#CHECK: 1ff1d81f { v31.h = vpack(v24.w{{ *}},{{ *}}v17.w):sat } +v31.h=vpack(v24.w,v17.w):sat + +#CHECK: 1fe6c435 { v21.b = vpacko(v4.h{{ *}},{{ *}}v6.h) } +v21.b=vpacko(v4.h,v6.h) + +#CHECK: 1febc140 { v0.h = vpacko(v1.w{{ *}},{{ *}}v11.w) } +v0.h=vpacko(v1.w,v11.w) + +#CHECK: 1e01d256 { v23:22.h = vunpack(v18.b) } +v23:22.h=vunpack(v18.b) + +#CHECK: 1e01cc38 { v25:24.uw = vunpack(v12.uh) } +v25:24.uw=vunpack(v12.uh) + +#CHECK: 1e01c61e { v31:30.uh = vunpack(v6.ub) } +v31:30.uh=vunpack(v6.ub) + +#CHECK: 1e01d778 { v25:24.w = vunpack(v23.h) } +v25:24.w=vunpack(v23.h) + +#CHECK: 1e00c0e0 { v0.b = vdeal(v0.b) } +v0.b=vdeal(v0.b) + +#CHECK: 1e00d5c9 { v9.h = vdeal(v21.h) } +v9.h=vdeal(v21.h) + +#CHECK: 1e02cb1c { v28.b = vshuff(v11.b) } +v28.b=vshuff(v11.b) + +#CHECK: 1e01d8fe { v30.h = vshuff(v24.h) } +v30.h=vshuff(v24.h) diff --git a/test/MC/Hexagon/v60-shift.s b/test/MC/Hexagon/v60-shift.s new file mode 100644 index 000000000000..3d0c334debb9 --- /dev/null +++ b/test/MC/Hexagon/v60-shift.s @@ -0,0 +1,39 @@ +#RUN: llvm-mc -triple=hexagon -mcpu=hexagonv60 -filetype=obj %s | \ +#RUN: llvm-objdump -triple=hexagon -mcpu=hexagonv60 -d - | \ +#RUN: FileCheck %s + +#CHECK: 198fd829 { v9.uw = vlsr(v24.uw,{{ *}}r15) } +v9.uw=vlsr(v24.uw,r15) + +#CHECK: 1999d645 { v5.uh = vlsr(v22.uh,{{ *}}r25) } +v5.uh=vlsr(v22.uh,r25) + +#CHECK: 198cc303 { v3.h = vasl(v3.h,{{ *}}r12) } +v3.h=vasl(v3.h,r12) + +#CHECK: 1965d7ac { v12.w = vasr(v23.w,{{ *}}r5) } +v12.w=vasr(v23.w,r5) + +#CHECK: 197dddc3 { v3.h = vasr(v29.h,{{ *}}r29) } +v3.h=vasr(v29.h,r29) + +#CHECK: 197adde8 { v8.w = vasl(v29.w,{{ *}}r26) } +v8.w=vasl(v29.w,r26) + +#CHECK: 1977cc26 { v6 = vror(v12,{{ *}}r23) } +v6=vror(v12,r23) + +#CHECK: 1e02cfad { v13.uw = vcl0(v15.uw) } +v13.uw=vcl0(v15.uw) + +#CHECK: 1e02defb { v27.uh = vcl0(v30.uh) } +v27.uh=vcl0(v30.uh) + +#CHECK: 1e03de90 { v16.w = vnormamt(v30.w) } +v16.w=vnormamt(v30.w) + +#CHECK: 1e03d4a3 { v3.h = vnormamt(v20.h) } +v3.h=vnormamt(v20.h) + +#CHECK: 1e02c2d8 { v24.h = vpopcount(v2.h) } +v24.h=vpopcount(v2.h) diff --git a/test/MC/Hexagon/v60-vcmp.s b/test/MC/Hexagon/v60-vcmp.s new file mode 100644 index 000000000000..c7f4e128be63 --- /dev/null +++ b/test/MC/Hexagon/v60-vcmp.s @@ -0,0 +1,84 @@ +#RUN: llvm-mc -triple=hexagon -mcpu=hexagonv60 -filetype=obj %s | \ +#RUN: llvm-objdump -triple=hexagon -mcpu=hexagonv60 -d - | \ +#RUN: FileCheck %s + +#CHECK: 1c81f142 { q2 |= vcmp.eq(v17.b{{ *}},{{ *}}v1.b) } +q2|=vcmp.eq(v17.b,v1.b) + +#CHECK: 1c84fb2a { q2 &= vcmp.gt(v27.uw{{ *}},{{ *}}v4.uw) } 
+q2&=vcmp.gt(v27.uw,v4.uw) + +#CHECK: 1c8cf826 { q2 &= vcmp.gt(v24.uh{{ *}},{{ *}}v12.uh) } +q2&=vcmp.gt(v24.uh,v12.uh) + +#CHECK: 1c80e720 { q0 &= vcmp.gt(v7.ub{{ *}},{{ *}}v0.ub) } +q0&=vcmp.gt(v7.ub,v0.ub) + +#CHECK: 1c9aed1a { q2 &= vcmp.gt(v13.w{{ *}},{{ *}}v26.w) } +q2&=vcmp.gt(v13.w,v26.w) + +#CHECK: 1c8de516 { q2 &= vcmp.gt(v5.h{{ *}},{{ *}}v13.h) } +q2&=vcmp.gt(v5.h,v13.h) + +#CHECK: 1c8dfc11 { q1 &= vcmp.gt(v28.b{{ *}},{{ *}}v13.b) } +q1&=vcmp.gt(v28.b,v13.b) + +#CHECK: 1c94fa0b { q3 &= vcmp.eq(v26.w{{ *}},{{ *}}v20.w) } +q3&=vcmp.eq(v26.w,v20.w) + +#CHECK: 1c83e206 { q2 &= vcmp.eq(v2.h{{ *}},{{ *}}v3.h) } +q2&=vcmp.eq(v2.h,v3.h) + +#CHECK: 1c85e900 { q0 &= vcmp.eq(v9.b{{ *}},{{ *}}v5.b) } +q0&=vcmp.eq(v9.b,v5.b) + +#CHECK: 1c9cfca8 { q0 ^= vcmp.gt(v28.uw{{ *}},{{ *}}v28.uw) } +q0^=vcmp.gt(v28.uw,v28.uw) + +#CHECK: 1c81faa0 { q0 ^= vcmp.gt(v26.ub{{ *}},{{ *}}v1.ub) } +q0^=vcmp.gt(v26.ub,v1.ub) + +#CHECK: 1c96f0a4 { q0 ^= vcmp.gt(v16.uh{{ *}},{{ *}}v22.uh) } +q0^=vcmp.gt(v16.uh,v22.uh) + +#CHECK: 1c9bf795 { q1 ^= vcmp.gt(v23.h{{ *}},{{ *}}v27.h) } +q1^=vcmp.gt(v23.h,v27.h) + +#CHECK: 1c9de698 { q0 ^= vcmp.gt(v6.w{{ *}},{{ *}}v29.w) } +q0^=vcmp.gt(v6.w,v29.w) + +#CHECK: 1c82ef8a { q2 ^= vcmp.eq(v15.w{{ *}},{{ *}}v2.w) } +q2^=vcmp.eq(v15.w,v2.w) + +#CHECK: 1c99e891 { q1 ^= vcmp.gt(v8.b{{ *}},{{ *}}v25.b) } +q1^=vcmp.gt(v8.b,v25.b) + +#CHECK: 1c8afe55 { q1 |= vcmp.gt(v30.h{{ *}},{{ *}}v10.h) } +q1|=vcmp.gt(v30.h,v10.h) + +#CHECK: 1c92ef50 { q0 |= vcmp.gt(v15.b{{ *}},{{ *}}v18.b) } +q0|=vcmp.gt(v15.b,v18.b) + +#CHECK: 1c9ffb4b { q3 |= vcmp.eq(v27.w{{ *}},{{ *}}v31.w) } +q3|=vcmp.eq(v27.w,v31.w) + +#CHECK: 1c87e944 { q0 |= vcmp.eq(v9.h{{ *}},{{ *}}v7.h) } +q0|=vcmp.eq(v9.h,v7.h) + +#CHECK: 1c8ee768 { q0 |= vcmp.gt(v7.uw{{ *}},{{ *}}v14.uw) } +q0|=vcmp.gt(v7.uw,v14.uw) + +#CHECK: 1c92e265 { q1 |= vcmp.gt(v2.uh{{ *}},{{ *}}v18.uh) } +q1|=vcmp.gt(v2.uh,v18.uh) + +#CHECK: 1c80f062 { q2 |= vcmp.gt(v16.ub{{ *}},{{ *}}v0.ub) } +q2|=vcmp.gt(v16.ub,v0.ub) + +#CHECK: 1c91f75a { q2 |= vcmp.gt(v23.w{{ *}},{{ *}}v17.w) } +q2|=vcmp.gt(v23.w,v17.w) + +#CHECK: 1c86fe84 { q0 ^= vcmp.eq(v30.h{{ *}},{{ *}}v6.h) } +q0^=vcmp.eq(v30.h,v6.h) + +#CHECK: 1c86ec82 { q2 ^= vcmp.eq(v12.b{{ *}},{{ *}}v6.b) } +q2^=vcmp.eq(v12.b,v6.b) diff --git a/test/MC/Hexagon/v60-vmem.s b/test/MC/Hexagon/v60-vmem.s new file mode 100644 index 000000000000..fe202251ec4b --- /dev/null +++ b/test/MC/Hexagon/v60-vmem.s @@ -0,0 +1,424 @@ +#RUN: llvm-mc -triple=hexagon -mcpu=hexagonv60 -filetype=obj %s | \ +#RUN: llvm-objdump -triple=hexagon -mcpu=hexagonv60 -d - | \ +#RUN: FileCheck %s + +#CHECK: 292cc11b { vmem(r12++#1) = v27 } +{ + vmem(r12++#1)=v27 +} + +#CHECK: 294dc319 { v25 = vmem(r13++#3):nt } +{ + v25=vmem(r13++#3):nt +} + +#CHECK: 2904c1fb { v27 = vmemu(r4++#1) } +{ + v27=vmemu(r4++#1) +} + +#CHECK: 291dc01f { v31 = vmem(r29++#0) } +{ + v31=vmem(r29++#0) +} + +#CHECK: 293ec0ff { vmemu(r30++#0) = v31 } +{ + vmemu(r30++#0)=v31 +} + +#CHECK: 296ec411 { vmem(r14++#-4):nt = v17 } +{ + vmem(r14++#-4):nt=v17 +} + +#CHECK: 29fec62f { if (!p0) vmem(r30++#-2):nt = v15 } +{ + if (!p0) vmem(r30++#-2):nt=v15 +} + +#CHECK: 29f9c914 { if (p1) vmem(r25++#1):nt = v20 } +{ + if (p1) vmem(r25++#1):nt=v20 +} + +#CHECK: 2984de30 { if (!q3) vmem(r4++#-2) = v16 } +{ + if (!q3) vmem(r4++#-2)=v16 +} + +#CHECK: 2992dd1f { if (q3) vmem(r18++#-3) = v31 } +{ + if (q3) vmem(r18++#-3)=v31 +} + +#CHECK: 29c9c425 { if (!q0) vmem(r9++#-4):nt = v5 } +{ + if (!q0) vmem(r9++#-4):nt=v5 +} + +#CHECK: 29d1cf11 { if (q1) vmem(r17++#-1):nt = v17 } +{ + if (q1) 
vmem(r17++#-1):nt=v17 +} + +#CHECK: 29a7c328 { if (!p0) vmem(r7++#3) = v8 } +{ + if (!p0) vmem(r7++#3)=v8 +} + +#CHECK: 29b6cc1d { if (p1) vmem(r22++#-4) = v29 } +{ + if (p1) vmem(r22++#-4)=v29 +} + +#CHECK: 29abc5fe { if (!p0) vmemu(r11++#-3) = v30 } +{ + if (!p0) vmemu(r11++#-3)=v30 +} + +#CHECK: 29b8d5c4 { if (p2) vmemu(r24++#-3) = v4 } +{ + if (p2) vmemu(r24++#-3)=v4 +} + +#CHECK: 2860e407 { vmem(r0+#-4):nt = v7 } +{ + vmem(r0+#-4):nt=v7 +} + +#CHECK: 2830e2e7 { vmemu(r16+#-6) = v7 } +{ + vmemu(r16+#-6)=v7 +} + +#CHECK: 2839c316 { vmem(r25+#3) = v22 } +{ + vmem(r25+#3)=v22 +} +#CHECK: 284be316 { v22 = vmem(r11+#-5):nt } +{ + v22=vmem(r11+#-5):nt +} + +#CHECK: 280ec1e6 { v6 = vmemu(r14+#1) } +{ + v6=vmemu(r14+#1) +} + +#CHECK: 280ae50c { v12 = vmem(r10+#-3) } +{ + v12=vmem(r10+#-3) +} + +#CHECK: 2b62e005 { vmem(r2++m1):nt = v5 } +{ + vmem(r2++m1):nt=v5 +} + +#CHECK: 2b28e0f2 { vmemu(r8++m1) = v18 } +{ + vmemu(r8++m1)=v18 +} + +#CHECK: 2b42e019 { v25 = vmem(r2++m1):nt } +{ + v25=vmem(r2++m1):nt +} + +#CHECK: 2b2ce009 { vmem(r12++m1) = v9 } +{ + vmem(r12++m1)=v9 +} + +#CHECK: 2b03c005 { v5 = vmem(r3++m0) } +{ + v5=vmem(r3++m0) +} + + +#CHECK: 2b0ec0f5 { v21 = vmemu(r14++m0) } +{ + v21=vmemu(r14++m0) +} + +#CHECK: 2be8c022 { if (!p0) vmem(r8++m0):nt = v2 } +{ + if (!p0) vmem(r8++m0):nt=v2 +} + +#CHECK: 2bebd813 { if (p3) vmem(r11++m0):nt = v19 } +{ + if (p3) vmem(r11++m0):nt=v19 +} + +#CHECK: 2ba5e0e7 { if (!p0) vmemu(r5++m1) = v7 } +{ + if (!p0) vmemu(r5++m1)=v7 +} + +#CHECK: 2ba4f0dd { if (p2) vmemu(r4++m1) = v29 } +{ + if (p2) vmemu(r4++m1)=v29 +} + +#CHECK: 2ba4e828 { if (!p1) vmem(r4++m1) = v8 } +{ + if (!p1) vmem(r4++m1)=v8 +} + +#CHECK: 2bbae803 { if (p1) vmem(r26++m1) = v3 } +{ + if (p1) vmem(r26++m1)=v3 +} + +#CHECK: 2bc9c027 { if (!q0) vmem(r9++m0):nt = v7 } +{ + if (!q0) vmem(r9++m0):nt=v7 +} + +#CHECK: 2bcfc001 { if (q0) vmem(r15++m0):nt = v1 } +{ + if (q0) vmem(r15++m0):nt=v1 +} + +#CHECK: 2b97f031 { if (!q2) vmem(r23++m1) = v17 } +{ + if (!q2) vmem(r23++m1)=v17 +} + +#CHECK: 2b8ad809 { if (q3) vmem(r10++m0) = v9 } +{ + if (q3) vmem(r10++m0)=v9 +} + +#CHECK: 28c7f438 { if (!q2) vmem(r7+#-4):nt = v24 } +{ + if (!q2) vmem(r7+#-4):nt=v24 +} + +#CHECK: 28d1eb15 { if (q1) vmem(r17+#-5):nt = v21 } +{ + if (q1) vmem(r17+#-5):nt=v21 +} + +#CHECK: 289cfe2b { if (!q3) vmem(r28+#-2) = v11 } +{ + if (!q3) vmem(r28+#-2)=v11 +} + +#CHECK: 288eef0f { if (q1) vmem(r14+#-1) = v15 } +{ + if (q1) vmem(r14+#-1)=v15 +} + +#CHECK: 28a2d1e1 { if (!p2) vmemu(r2+#1) = v1 } +{ + if (!p2) vmemu(r2+#1)=v1 +} + +#CHECK: 28bcf4db { if (p2) vmemu(r28+#-4) = v27 } +{ + if (p2) vmemu(r28+#-4)=v27 +} + +#CHECK: 28b2c925 { if (!p1) vmem(r18+#1) = v5 } +{ + if (!p1) vmem(r18+#1)=v5 +} + +#CHECK: 28afe41a { if (p0) vmem(r15+#-4) = v26 } +{ + if (p0) vmem(r15+#-4)=v26 +} + +#CHECK: 28f7fd3a { if (!p3) vmem(r23+#-3):nt = v26 } +{ + if (!p3) vmem(r23+#-3):nt=v26 +} + +#CHECK: 28f5fd10 { if (p3) vmem(r21+#-3):nt = v16 } +{ + if (p3) vmem(r21+#-3):nt=v16 +} + +#CHECK: 2945c440 v0.tmp = vmem(r5++#-4):nt } +{ + v0.tmp=vmem(r5++#-4):nt + v26=v0 +} + +#CHECK: 2942c338 v24.cur = vmem(r2++#3):nt } +{ + v24.cur=vmem(r2++#3):nt + v6=v24 +} + +#CHECK: 2908c157 v23.tmp = vmem(r8++#1) } +{ + v25=v23 + v23.tmp=vmem(r8++#1) +} + +#CHECK: 2903c72d v13.cur = vmem(r3++#-1) } +{ + v13.cur=vmem(r3++#-1) + v21=v13 +} + +#CHECK: 2855c743 v3.tmp = vmem(r21+#7):nt } +{ + v3.tmp=vmem(r21+#7):nt + v21=v3 +} + +#CHECK: 2856e025 v5.cur = vmem(r22+#-8):nt } +{ + v5.cur=vmem(r22+#-8):nt + v29=v5 +} + +#CHECK: 2802c555 v21.tmp = vmem(r2+#5) } +{ 
+ v31=v21 + v21.tmp=vmem(r2+#5) +} + +#CHECK: 2814e12a v10.cur = vmem(r20+#-7) } +{ + v9=v10 + v10.cur=vmem(r20+#-7) +} + + +#CHECK: 2b52c02c v12.cur = vmem(r18++m0):nt } +{ + v12.cur=vmem(r18++m0):nt + v25=v12 +} + +#CHECK: 2b4ae043 v3.tmp = vmem(r10++m1):nt } +{ + v25=v3 + v3.tmp=vmem(r10++m1):nt +} + +#CHECK: 2b06c025 v5.cur = vmem(r6++m0) } +{ + v5.cur=vmem(r6++m0) + v10=v5 +} + +#CHECK: 2b17e048 v8.tmp = vmem(r23++m1) } +{ + v8.tmp=vmem(r23++m1) + v28=v8 +} + +#CHECK: 282ee422 vmem(r14+#-4) = v14.new } +{ + v14 = v14 + vmem(r14+#-4)=v14.new +} + +#CHECK: 2866e222 vmem(r6+#-6):nt = v16.new } +{ + v16 = v8 + vmem(r6+#-6):nt=v16.new +} + +#CHECK: 28b1cd42 if(p1) vmem(r17+#5) = v17.new } +{ + v17 = v25 + if(p1)vmem(r17+#5)=v17.new +} + +#CHECK: 28bbeb6a if(!p1) vmem(r27+#-5) = v17.new } +{ + v17 = v15 + if(!p1)vmem(r27+#-5)=v17.new +} + +#CHECK: 28e4d252 if(p2) vmem(r4+#2):nt = v24.new } +{ + v24 = v10 + if(p2)vmem(r4+#2):nt=v24.new +} + +#CHECK: 28f8d17a if(!p2) vmem(r24+#1):nt = v4.new } +{ + v4 = v8 + if(!p2)vmem(r24+#1):nt=v4.new +} + +#CHECK: 2924c322 vmem(r4++#3) = v4.new } +{ + v4 = v3 + vmem(r4++#3)=v4.new +} + +#CHECK: 2961c122 vmem(r1++#1):nt = v7.new } +{ + v7 = v8 + vmem(r1++#1):nt=v7.new +} + +#CHECK: 29a6d042 if(p2) vmem(r6++#0) = v11.new } +{ + v11 = v13 + if(p2)vmem(r6++#0)=v11.new +} + +#CHECK: 29a2cb6a if(!p1) vmem(r2++#3) = v25.new } +{ + v25 = v17 + if(!p1)vmem(r2++#3)=v25.new +} + +#CHECK: 29f5c952 if(p1) vmem(r21++#1):nt = v14.new } +{ + v14 = v13 + if(p1)vmem(r21++#1):nt=v14.new +} + +#CHECK: 29f7cd7a if(!p1) vmem(r23++#-3):nt = v1.new } +{ + v1 = v0 + if(!p1)vmem(r23++#-3):nt=v1.new +} + +#CHECK: 2b3ec022 vmem(r30++m0) = v10.new } +{ + v10 = v23 + vmem(r30++m0)=v10.new +} + +#CHECK: 2b6fc022 vmem(r15++m0):nt = v19.new } +{ + v19 = v20 + vmem(r15++m0):nt=v19.new +} + +#CHECK: 2bb7f042 if(p2) vmem(r23++m1) = v6.new } +{ + v6 = v30 + if(p2)vmem(r23++m1)=v6.new +} + +#CHECK: 2ba2f06a if(!p2) vmem(r2++m1) = v12.new } +{ + v12 = v9 + if(!p2)vmem(r2++m1)=v12.new +} + +#CHECK: 2be7e852 if(p1) vmem(r7++m1):nt = v3.new } +{ + v3 = v13 + if(p1)vmem(r7++m1):nt=v3.new +} + +#CHECK: 2bfdd07a if(!p2) vmem(r29++m0):nt = v29.new } +{ + v29 = v9 + if(!p2)vmem(r29++m0):nt=v29.new +} diff --git a/test/MC/Hexagon/v60-vmpy-acc.s b/test/MC/Hexagon/v60-vmpy-acc.s new file mode 100644 index 000000000000..c39a9252b563 --- /dev/null +++ b/test/MC/Hexagon/v60-vmpy-acc.s @@ -0,0 +1,123 @@ +#RUN: llvm-mc -triple=hexagon -mcpu=hexagonv60 -filetype=obj %s | \ +#RUN: llvm-objdump -triple=hexagon -mcpu=hexagonv60 -d - | \ +#RUN: FileCheck %s + +#CHECK: 1936ee37 { v23.w += vdmpy(v15:14.h,r22.uh,#1):sat } +v23.w += vdmpy(v15:14.h,r22.uh,#1):sat + +#CHECK: 193bf90f { v15.w += vdmpy(v25.h,r27.uh):sat } +v15.w += vdmpy(v25.h,r27.uh):sat + +#CHECK: 1902fcf0 { v17:16.h += vdmpy(v29:28.ub,r2.b) } +v17:16.h += vdmpy(v29:28.ub,r2.b) + +#CHECK: 190cffd1 { v17.h += vdmpy(v31.ub,r12.b) } +v17.h += vdmpy(v31.ub,r12.b) + +#CHECK: 1900f5ac { v12.w += vrmpy(v21.ub,r0.b) } +v12.w += vrmpy(v21.ub,r0.b) + +#CHECK: 1905fb86 { v6.uw += vrmpy(v27.ub,r5.ub) } +v6.uw += vrmpy(v27.ub,r5.ub) + +#CHECK: 191de570 { v16.w += vdmpy(v5.h,r29.b) } +v16.w += vdmpy(v5.h,r29.b) + +#CHECK: 191de846 { v7:6.w += vtmpy(v9:8.h,r29.b) } +v7:6.w += vtmpy(v9:8.h,r29.b) + +#CHECK: 190bfa22 { v3:2.h += vtmpy(v27:26.ub,r11.b) } +v3:2.h += vtmpy(v27:26.ub,r11.b) + +#CHECK: 1915e408 { v9:8.h += vtmpy(v5:4.b,r21.b) } +v9:8.h += vtmpy(v5:4.b,r21.b) + +#CHECK: 1987f71e { v31:30.uh += vmpy(v23.ub,r7.ub) } +v31:30.uh += vmpy(v23.ub,r7.ub) + +#CHECK: 
1969ff47 { v7.w += vasl(v31.w,r9) } +v7.w += vasl(v31.w,r9) + +#CHECK: 196de3b0 { v16.w += vasr(v3.w,r13) } +v16.w += vasr(v3.w,r13) + +#CHECK: 1977fe0a { v11:10.uw += vdsad(v31:30.uh,r23.uh) } +v11:10.uw += vdsad(v31:30.uh,r23.uh) + +#CHECK: 196eee36 { v22.h += vmpyi(v14.h,r14.b) } +v22.h += vmpyi(v14.h,r14.b) + +#CHECK: 1931faac { v13:12.h += vmpy(v26.ub,r17.b) } +v13:12.h += vmpy(v26.ub,r17.b) + +#CHECK: 193cfc94 { v21:20.w += vdmpy(v29:28.h,r28.b) } +v21:20.w += vdmpy(v29:28.h,r28.b) + +#CHECK: 1934fc62 { v2.w += vdmpy(v28.h,r20.h):sat } +v2.w += vdmpy(v28.h,r20.h):sat + +#CHECK: 1925fe5f { v31.w += vdmpy(v31:30.h,r5.h):sat } +v31.w += vdmpy(v31:30.h,r5.h):sat + +#CHECK: 194efe36 { v23:22.uw += vmpy(v30.uh,r14.uh) } +v23:22.uw += vmpy(v30.uh,r14.uh) + +#CHECK: 1948e306 { v7:6.w += vmpy(v3.h,r8.h):sat } +v7:6.w += vmpy(v3.h,r8.h):sat + +#CHECK: 192af2f8 { v25:24.w += vmpa(v19:18.h,r10.b) } +v25:24.w += vmpa(v19:18.h,r10.b) + +#CHECK: 1926e4da { v27:26.h += vmpa(v5:4.ub,r6.b) } +v27:26.h += vmpa(v5:4.ub,r6.b) + +#CHECK: 194ff078 { v24.w += vmpyi(v16.w,r15.h) } +v24.w += vmpyi(v16.w,r15.h) + +#CHECK: 1946e247 { v7.w += vmpyi(v2.w,r6.b) } +v7.w += vmpyi(v2.w,r6.b) + +#CHECK: 1c3fead5 { v21.w += vmpyo(v10.w,v31.h):<<1:sat:shift } +v21.w += vmpyo(v10.w,v31.h):<<1:sat:shift + +#CHECK: 1c30e1fa { v26.w += vmpyo(v1.w,v16.h):<<1:rnd:sat:shift } +v26.w += vmpyo(v1.w,v16.h):<<1:rnd:sat:shift + +#CHECK: 1c34f690 { v16.h += vmpyi(v22.h,v20.h) } +v16.h += vmpyi(v22.h,v20.h) + +#CHECK: 1c34f4b5 { v21.w += vmpyie(v20.w,v20.uh) } +v21.w += vmpyie(v20.w,v20.uh) + +#CHECK: 1c54f804 { v4.w += vmpyie(v24.w,v20.h) } +v4.w += vmpyie(v24.w,v20.h) + +#CHECK: 1c1ff6f4 { v21:20.w += vmpy(v22.h,v31.h) } +v21:20.w += vmpy(v22.h,v31.h) + +#CHECK: 1c31f026 { v7:6.w += vmpy(v16.h,v17.uh) } +v7:6.w += vmpy(v16.h,v17.uh) + +#CHECK: 1c12fb98 { v25:24.h += vmpy(v27.b,v18.b) } +v25:24.h += vmpy(v27.b,v18.b) + +#CHECK: 1c17fcc0 { v1:0.h += vmpy(v28.ub,v23.b) } +v1:0.h += vmpy(v28.ub,v23.b) + +#CHECK: 1c16f26f { v15.w += vdmpy(v18.h,v22.h):sat } +v15.w += vdmpy(v18.h,v22.h):sat + +#CHECK: 1c0bea3a { v26.w += vrmpy(v10.b,v11.b) } +v26.w += vrmpy(v10.b,v11.b) + +#CHECK: 1c15eb47 { v7.w += vrmpy(v11.ub,v21.b) } +v7.w += vrmpy(v11.ub,v21.b) + +#CHECK: 1c26e40e { v15:14.uw += vmpy(v4.uh,v6.uh) } +v15:14.uw += vmpy(v4.uh,v6.uh) + +#CHECK: 1c0df9a8 { v9:8.uh += vmpy(v25.ub,v13.ub) } +v9:8.uh += vmpy(v25.ub,v13.ub) + +#CHECK: 1c0afc15 { v21.uw += vrmpy(v28.ub,v10.ub) } +v21.uw += vrmpy(v28.ub,v10.ub) diff --git a/test/MC/Hexagon/v60-vmpy1.s b/test/MC/Hexagon/v60-vmpy1.s new file mode 100644 index 000000000000..1f36a5e95ddb --- /dev/null +++ b/test/MC/Hexagon/v60-vmpy1.s @@ -0,0 +1,138 @@ +#RUN: llvm-mc -triple=hexagon -mcpu=hexagonv60 -filetype=obj %s | \ +#RUN: llvm-objdump -triple=hexagon -mcpu=hexagonv60 -d - | \ +#RUN: FileCheck %s + +#CHECK: 1939c223 { v3.w = vdmpy(v3:2.h,{{ *}}r25.uh,{{ *}}#1):sat } +v3.w=vdmpy(v3:2.h,r25.uh,#1):sat + +#CHECK: 1936de0d { v13.w = vdmpy(v30.h,{{ *}}r22.uh):sat } +v13.w=vdmpy(v30.h,r22.uh):sat + +#CHECK: 1919ccea { v11:10.h = vdmpy(v13:12.ub,{{ *}}r25.b) } +v11:10.h=vdmpy(v13:12.ub,r25.b) + +#CHECK: 1918ced6 { v22.h = vdmpy(v14.ub,{{ *}}r24.b) } +v22.h=vdmpy(v14.ub,r24.b) + +#CHECK: 1911deba { v27:26.uw = vdsad(v31:30.uh,{{ *}}r17.uh) } +v27:26.uw=vdsad(v31:30.uh,r17.uh) + +#CHECK: 1908da97 { v23.w = vrmpy(v26.ub,{{ *}}r8.b) } +v23.w=vrmpy(v26.ub,r8.b) + +#CHECK: 1915c974 { v20.uw = vrmpy(v9.ub,{{ *}}r21.ub) } +v20.uw=vrmpy(v9.ub,r21.ub) + +#CHECK: 190dd446 { v6.w = vdmpy(v20.h,{{ *}}r13.b) } 
+v6.w=vdmpy(v20.h,r13.b) + +#CHECK: 190ec030 { v17:16.h = vtmpy(v1:0.ub,{{ *}}r14.b) } +v17:16.h=vtmpy(v1:0.ub,r14.b) + +#CHECK: 1918de1c { v29:28.h = vtmpy(v31:30.b,{{ *}}r24.b) } +v29:28.h=vtmpy(v31:30.b,r24.b) + +#CHECK: 198dddf1 { v17.w = vmpyi(v29.w,{{ *}}r13.h) } +v17.w=vmpyi(v29.w,r13.h) + +#CHECK: 19bccb13 { v19.w = vmpyi(v11.w,{{ *}}r28.b) } +v19.w=vmpyi(v11.w,r28.b) + +#CHECK: 19c8cb0a { v11:10.uh = vmpy(v11.ub,{{ *}}r8.ub) } +v11:10.uh=vmpy(v11.ub,r8.ub) + +#CHECK: 1973d012 { v18.h = vmpyi(v16.h,{{ *}}r19.b) } +v18.h=vmpyi(v16.h,r19.b) + +#CHECK: 1922d1aa { v11:10.h = vmpy(v17.ub,{{ *}}r2.b) } +v11:10.h=vmpy(v17.ub,r2.b) + +#CHECK: 1936ce9c { v29:28.w = vdmpy(v15:14.h,{{ *}}r22.b) } +v29:28.w=vdmpy(v15:14.h,r22.b) + +#CHECK: 1925d86b { v11.w = vdmpy(v25:24.h,{{ *}}r5.h):sat } +v11.w=vdmpy(v25:24.h,r5.h):sat + +#CHECK: 1925c255 { v21.w = vdmpy(v2.h,{{ *}}r5.h):sat } +v21.w=vdmpy(v2.h,r5.h):sat + +#CHECK: 1941d424 { v4.h = vmpy(v20.h,{{ *}}r1.h):<<1:sat } +v4.h=vmpy(v20.h,r1.h):<<1:sat + +#CHECK: 1943cf0a { v11:10.w = vmpy(v15.h,{{ *}}r3.h) } +v11:10.w=vmpy(v15.h,r3.h) + +#CHECK: 193ec2f0 { v17:16.w = vmpa(v3:2.h,{{ *}}r30.b) } +v17:16.w=vmpa(v3:2.h,r30.b) + +#CHECK: 193ddcde { v31:30.h = vmpa(v29:28.ub,{{ *}}r29.b) } +v31:30.h=vmpa(v29:28.ub,r29.b) + +#CHECK: 1946de76 { v23:22.uw = vmpy(v30.uh,{{ *}}r6.uh) } +v23:22.uw=vmpy(v30.uh,r6.uh) + +#CHECK: 1945c945 { v5.h = vmpy(v9.h,{{ *}}r5.h):<<1:rnd:sat } +v5.h=vmpy(v9.h,r5.h):<<1:rnd:sat + +#CHECK: 19b0c280 { v1:0.w = vtmpy(v3:2.h,{{ *}}r16.b) } +v1:0.w=vtmpy(v3:2.h,r16.b) + +#CHECK: 1c34d937 { v23.h = vmpy(v25.h,{{ *}}v20.h):<<1:rnd:sat } +v23.h=vmpy(v25.h,v20.h):<<1:rnd:sat + +#CHECK: 1c36c90a { v11:10.uw = vmpy(v9.uh,{{ *}}v22.uh) } +v11:10.uw=vmpy(v9.uh,v22.uh) + +#CHECK: 1c09c3ec { v13:12.w = vmpy(v3.h,{{ *}}v9.h) } +v13:12.w=vmpy(v3.h,v9.h) + +#CHECK: 1c0dd1d8 { v25:24.h = vmpy(v17.ub,{{ *}}v13.b) } +v25:24.h=vmpy(v17.ub,v13.b) + +#CHECK: 1c0dc0a4 { v5:4.uh = vmpy(v0.ub,{{ *}}v13.ub) } +v5:4.uh=vmpy(v0.ub,v13.ub) + +#CHECK: 1c14df84 { v5:4.h = vmpy(v31.b,{{ *}}v20.b) } +v5:4.h=vmpy(v31.b,v20.b) + +#CHECK: 1c16d77c { v28.w = vdmpy(v23.h,{{ *}}v22.h):sat } +v28.w=vdmpy(v23.h,v22.h):sat + +#CHECK: 1c08d84f { v15.w = vrmpy(v24.ub,{{ *}}v8.b) } +v15.w=vrmpy(v24.ub,v8.b) + +#CHECK: 1c06da29 { v9.w = vrmpy(v26.b,{{ *}}v6.b) } +v9.w=vrmpy(v26.b,v6.b) + +#CHECK: 1c1ac805 { v5.uw = vrmpy(v8.ub,{{ *}}v26.ub) } +v5.uw=vrmpy(v8.ub,v26.ub) + +#CHECK: 1c39d089 { v9.h = vmpyi(v16.h,{{ *}}v25.h) } +v9.h=vmpyi(v16.h,v25.h) + +#CHECK: 1c3ecc64 { v5:4.h = vmpa(v13:12.ub,{{ *}}v31:30.b) } +v5:4.h=vmpa(v13:12.ub,v31:30.b) + +#CHECK: 1c21ce54 { v21:20.w = vmpy(v14.h,{{ *}}v1.uh) } +v21:20.w=vmpy(v14.h,v1.uh) + +#CHECK: 1cf2c6f0 { v17:16.h = vmpa(v7:6.ub,{{ *}}v19:18.ub) } +v17:16.h=vmpa(v7:6.ub,v19:18.ub) + +#CHECK: 1fcdc82b { v11.w = vmpyio(v8.w{{ *}},{{ *}}v13.h) } +v11.w=vmpyio(v8.w,v13.h) + +#CHECK: 1fdeda10 { v16.w = vmpyie(v26.w{{ *}},{{ *}}v30.uh) } +v16.w=vmpyie(v26.w,v30.uh) + +#CHECK: 1ff2c2a6 { v6.w = vmpye(v2.w{{ *}},{{ *}}v18.uh) } +v6.w=vmpye(v2.w,v18.uh) + +#CHECK: 1ff7cbfa { v26.w = vmpyo(v11.w{{ *}},{{ *}}v23.h):<<1:sat } +v26.w=vmpyo(v11.w,v23.h):<<1:sat + +#CHECK: 1f5cd411 { v17.w = vmpyo(v20.w{{ *}},{{ *}}v28.h):<<1:rnd:sat } +v17.w=vmpyo(v20.w,v28.h):<<1:rnd:sat + +#CHECK: 1f71cf1d { v29.w = vmpyieo(v15.h{{ *}},{{ *}}v17.h) } +v29.w=vmpyieo(v15.h,v17.h) diff --git a/test/MC/Hexagon/v60lookup.s b/test/MC/Hexagon/v60lookup.s new file mode 100644 index 000000000000..b92a2d3c6eb1 --- /dev/null +++ b/test/MC/Hexagon/v60lookup.s @@ 
-0,0 +1,14 @@ +#RUN: llvm-mc -triple=hexagon -mcpu=hexagonv60 -filetype=obj %s | \ +#RUN: llvm-objdump -triple=hexagon -mcpu=hexagonv60 -d - | \ +#RUN: FileCheck %s + + V31.b = vlut32(V29.b, V15.b, R1) +# CHECK: 1b79fd3f { v31.b = vlut32(v29.b,v15.b,r1) } + V31.b |= vlut32(V29.b, V15.b, R2) +# CHECK: 1b7afdbf { v31.b |= vlut32(v29.b,v15.b,r2) } + V31:30.h = vlut16(V29.b, V15.h, R3) +# CHECK: 1b7bfdde { v31:30.h = vlut16(v29.b,v15.h,r3) } + v31:30.h |= vlut16(v2.b, v9.h, r4) +# CHECK: 1b4ce2fe { v31:30.h |= vlut16(v2.b,v9.h,r4) } + v31.w = vinsert(r4) +# CHECK: 19a4e03f { v31.w = vinsert(r4) } From 9f51f8f7e70fb7fbaff887224ac4ebec3125ae45 Mon Sep 17 00:00:00 2001 From: Chih-Hung Hsieh Date: Thu, 3 Dec 2015 22:02:40 +0000 Subject: [PATCH 026/364] [X86] Part 1 to fix x86-64 fp128 calling convention. Almost all these changes are conditioned and only apply to the new x86-64 f128 type configuration, which will be enabled in a follow up patch. They are required together to make new f128 work. If there is any error, we should fix or revert them as a whole. These changes should have no impact to current configurations. * Relax type legalization checks to accept new f128 type configuration, whose TypeAction is TypeSoftenFloat, not TypeLegal, but also has TLI.isTypeLegal true. * Relax GetSoftenedFloat to return in some cases f128 type SDValue, which is TLI.isTypeLegal but not "softened" to i128 node. * Allow customized FABS, FNEG, FCOPYSIGN on new f128 type configuration, to generate optimized bitwise operators for libm functions. * Enhance related Lower* functions to handle f128 type. * Enhance DAGTypeLegalizer::run, SoftenFloatResult, and related functions to keep new f128 type in register, and convert f128 operators to library calls. * Fix Combiner, Emitter, Legalizer routines that did not handle f128 type. * Add ExpandConstant to handle i128 constants, ExpandNode to handle ISD::Constant node. * Add one more parameter to getCommonSubClass and firstCommonClass, to guarantee that returned common sub class will contain the specified simple value type. This extra parameter is used by EmitCopyFromReg in InstrEmitter.cpp. * Fix infinite loop in getTypeLegalizationCost when f128 is the value type. * Fix printOperand to handle null operand. * Enhance ISD::BITCAST node to handle f128 constant. * Expand new f128 type for BR_CC, SELECT_CC, SELECT, SETCC nodes. * Enhance X86AsmPrinter to emit f128 values in comments. 
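As a rough illustration of why FABS/FNEG on f128 can be turned into bitwise operators as the list above mentions (this is not the patch's SelectionDAG lowering, only a sketch in plain C; GCC/Clang's __float128 on little-endian x86-64 is assumed, and the helper names f128_fabs/f128_fneg are invented for the example): IEEE binary128 keeps its sign in bit 127, so clearing or flipping the top bit of the value's bit pattern implements fabs and negation without any floating-point arithmetic.

#include <string.h>

/* Clear the sign bit (bit 127) of an IEEE binary128 value. */
static __float128 f128_fabs(__float128 x) {
  unsigned char b[sizeof x];
  memcpy(b, &x, sizeof x);
  b[15] &= 0x7f;              /* top byte holds the sign on little-endian x86-64 */
  memcpy(&x, b, sizeof x);
  return x;
}

/* Flip the sign bit to negate. */
static __float128 f128_fneg(__float128 x) {
  unsigned char b[sizeof x];
  memcpy(b, &x, sizeof x);
  b[15] ^= 0x80;              /* toggle the sign bit */
  memcpy(&x, b, sizeof x);
  return x;
}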
Differential Revision: http://reviews.llvm.org/D15134 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254653 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Target/TargetLowering.h | 4 +- include/llvm/Target/TargetRegisterInfo.h | 6 +- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 18 ++- lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 2 +- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 34 ++++- .../SelectionDAG/LegalizeFloatTypes.cpp | 141 ++++++++++++++---- lib/CodeGen/SelectionDAG/LegalizeTypes.cpp | 43 ++++-- lib/CodeGen/SelectionDAG/LegalizeTypes.h | 67 +++++++-- .../SelectionDAG/LegalizeTypesGeneric.cpp | 11 +- lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 4 +- .../SelectionDAG/SelectionDAGDumper.cpp | 5 +- lib/CodeGen/SelectionDAG/TargetLowering.cpp | 4 +- lib/CodeGen/TargetLoweringBase.cpp | 4 + lib/CodeGen/TargetRegisterInfo.cpp | 18 ++- lib/Target/X86/X86MCInstLower.cpp | 14 +- 15 files changed, 298 insertions(+), 77 deletions(-) diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index 819458dbb0f0..e247abcb2f75 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -97,7 +97,8 @@ class TargetLoweringBase { TypeLegal, // The target natively supports this type. TypePromoteInteger, // Replace this integer with a larger one. TypeExpandInteger, // Split this integer into two of half the size. - TypeSoftenFloat, // Convert this float to a same size integer type. + TypeSoftenFloat, // Convert this float to a same size integer type, + // if an operation is not supported in target HW. TypeExpandFloat, // Split this float into two of half the size. TypeScalarizeVector, // Replace this one-element vector with its element. TypeSplitVector, // Split this vector into two of half the size. @@ -1913,6 +1914,7 @@ class TargetLoweringBase { /// up the MVT::LAST_VALUETYPE value to the next multiple of 8. uint32_t CondCodeActions[ISD::SETCC_INVALID][(MVT::LAST_VALUETYPE + 7) / 8]; +protected: ValueTypeActionImpl ValueTypeActions; private: diff --git a/include/llvm/Target/TargetRegisterInfo.h b/include/llvm/Target/TargetRegisterInfo.h index e8926f788156..7d293fe82a6b 100644 --- a/include/llvm/Target/TargetRegisterInfo.h +++ b/include/llvm/Target/TargetRegisterInfo.h @@ -614,9 +614,13 @@ class TargetRegisterInfo : public MCRegisterInfo { /// Find the largest common subclass of A and B. /// Return NULL if there is no common subclass. + /// The common subclass should contain + /// simple value type SVT if it is not the Any type. const TargetRegisterClass * getCommonSubClass(const TargetRegisterClass *A, - const TargetRegisterClass *B) const; + const TargetRegisterClass *B, + const MVT::SimpleValueType SVT = + MVT::SimpleValueType::Any) const; /// Returns a TargetRegisterClass used for pointer values. /// If a target supports multiple different pointer register classes, diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 2941a7eb3aed..10fb334c4c60 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -8722,6 +8722,22 @@ SDValue DAGCombiner::visitFSQRT(SDNode *N) { ZeroCmp, Zero, RV); } +static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(SDNode *N) { + // copysign(x, fp_extend(y)) -> copysign(x, y) + // copysign(x, fp_round(y)) -> copysign(x, y) + // Do not optimize out type conversion of f128 type yet. 
+ // For some target like x86_64, configuration is changed + // to keep one f128 value in one SSE register, but + // instruction selection cannot handle FCOPYSIGN on + // SSE registers yet. + SDValue N1 = N->getOperand(1); + EVT N1VT = N1->getValueType(0); + EVT N1Op0VT = N1->getOperand(0)->getValueType(0); + return (N1.getOpcode() == ISD::FP_EXTEND || + N1.getOpcode() == ISD::FP_ROUND) && + (N1VT == N1Op0VT || N1Op0VT != MVT::f128); +} + SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -8765,7 +8781,7 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) { // copysign(x, fp_extend(y)) -> copysign(x, y) // copysign(x, fp_round(y)) -> copysign(x, y) - if (N1.getOpcode() == ISD::FP_EXTEND || N1.getOpcode() == ISD::FP_ROUND) + if (CanCombineFCOPYSIGN_EXTEND_ROUND(N)) return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(0)); diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 5ec10308dc28..a1e2d410ab00 100644 --- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -139,7 +139,7 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned, UseRC = RC; else if (RC) { const TargetRegisterClass *ComRC = - TRI->getCommonSubClass(UseRC, RC); + TRI->getCommonSubClass(UseRC, RC, VT.SimpleTy); // If multiple uses expect disjoint register classes, we emit // copies in AddRegisterOperand. if (ComRC) diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 8238cdeb59ca..3393e17b8e09 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -154,6 +154,7 @@ class SelectionDAGLegalize { SDValue ExpandVectorBuildThroughStack(SDNode* Node); SDValue ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP); + SDValue ExpandConstant(ConstantSDNode *CP); // if ExpandNode returns false, LegalizeOp falls back to ConvertNodeToLibcall bool ExpandNode(SDNode *Node); @@ -294,6 +295,20 @@ SelectionDAGLegalize::ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP) { return Result; } +/// Expands the Constant node to a load from the constant pool. +SDValue SelectionDAGLegalize::ExpandConstant(ConstantSDNode *CP) { + SDLoc dl(CP); + EVT VT = CP->getValueType(0); + SDValue CPIdx = DAG.getConstantPool(CP->getConstantIntValue(), + TLI.getPointerTy(DAG.getDataLayout())); + unsigned Alignment = cast(CPIdx)->getAlignment(); + SDValue Result = + DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, Alignment); + return Result; +} + /// Expands an unaligned store to 2 half-size stores. 
static void ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG, const TargetLowering &TLI, @@ -1192,15 +1207,17 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { #ifndef NDEBUG for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) - assert(TLI.getTypeAction(*DAG.getContext(), Node->getValueType(i)) == - TargetLowering::TypeLegal && + assert((TLI.getTypeAction(*DAG.getContext(), Node->getValueType(i)) == + TargetLowering::TypeLegal || + TLI.isTypeLegal(Node->getValueType(i))) && "Unexpected illegal type!"); for (const SDValue &Op : Node->op_values()) - assert((TLI.getTypeAction(*DAG.getContext(), - Op.getValueType()) == TargetLowering::TypeLegal || - Op.getOpcode() == ISD::TargetConstant) && - "Unexpected illegal type!"); + assert((TLI.getTypeAction(*DAG.getContext(), Op.getValueType()) == + TargetLowering::TypeLegal || + TLI.isTypeLegal(Op.getValueType()) || + Op.getOpcode() == ISD::TargetConstant) && + "Unexpected illegal type!"); #endif // Figure out the correct action; the way to query this varies by opcode @@ -3390,6 +3407,11 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { Results.push_back(ExpandConstantFP(CFP, true)); break; } + case ISD::Constant: { + ConstantSDNode *CP = cast(Node); + Results.push_back(ExpandConstant(CP)); + break; + } case ISD::FSUB: { EVT VT = Node->getValueType(0); if (TLI.isOperationLegalOrCustom(ISD::FADD, VT) && diff --git a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 97e88bf84a70..bb150f726c23 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -43,10 +43,10 @@ static RTLIB::Libcall GetFPLibCall(EVT VT, } //===----------------------------------------------------------------------===// -// Result Float to Integer Conversion. +// Convert Float Results to Integer for Non-HW-supported Operations. //===----------------------------------------------------------------------===// -void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { +bool DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { DEBUG(dbgs() << "Soften float result " << ResNo << ": "; N->dump(&DAG); dbgs() << "\n"); SDValue R = SDValue(); @@ -59,20 +59,26 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { #endif llvm_unreachable("Do not know how to soften the result of this operator!"); + case ISD::Register: + case ISD::CopyFromReg: + case ISD::CopyToReg: + assert(isLegalInHWReg(N->getValueType(ResNo)) && + "Unsupported SoftenFloatRes opcode!"); + // Only when isLegalInHWReg, we can skip check of the operands. 
+ R = SDValue(N, ResNo); + break; case ISD::MERGE_VALUES:R = SoftenFloatRes_MERGE_VALUES(N, ResNo); break; - case ISD::BITCAST: R = SoftenFloatRes_BITCAST(N); break; + case ISD::BITCAST: R = SoftenFloatRes_BITCAST(N, ResNo); break; case ISD::BUILD_PAIR: R = SoftenFloatRes_BUILD_PAIR(N); break; - case ISD::ConstantFP: - R = SoftenFloatRes_ConstantFP(cast(N)); - break; + case ISD::ConstantFP: R = SoftenFloatRes_ConstantFP(N, ResNo); break; case ISD::EXTRACT_VECTOR_ELT: R = SoftenFloatRes_EXTRACT_VECTOR_ELT(N); break; - case ISD::FABS: R = SoftenFloatRes_FABS(N); break; + case ISD::FABS: R = SoftenFloatRes_FABS(N, ResNo); break; case ISD::FMINNUM: R = SoftenFloatRes_FMINNUM(N); break; case ISD::FMAXNUM: R = SoftenFloatRes_FMAXNUM(N); break; case ISD::FADD: R = SoftenFloatRes_FADD(N); break; case ISD::FCEIL: R = SoftenFloatRes_FCEIL(N); break; - case ISD::FCOPYSIGN: R = SoftenFloatRes_FCOPYSIGN(N); break; + case ISD::FCOPYSIGN: R = SoftenFloatRes_FCOPYSIGN(N, ResNo); break; case ISD::FCOS: R = SoftenFloatRes_FCOS(N); break; case ISD::FDIV: R = SoftenFloatRes_FDIV(N); break; case ISD::FEXP: R = SoftenFloatRes_FEXP(N); break; @@ -84,7 +90,7 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { case ISD::FMA: R = SoftenFloatRes_FMA(N); break; case ISD::FMUL: R = SoftenFloatRes_FMUL(N); break; case ISD::FNEARBYINT: R = SoftenFloatRes_FNEARBYINT(N); break; - case ISD::FNEG: R = SoftenFloatRes_FNEG(N); break; + case ISD::FNEG: R = SoftenFloatRes_FNEG(N, ResNo); break; case ISD::FP_EXTEND: R = SoftenFloatRes_FP_EXTEND(N); break; case ISD::FP_ROUND: R = SoftenFloatRes_FP_ROUND(N); break; case ISD::FP16_TO_FP: R = SoftenFloatRes_FP16_TO_FP(N); break; @@ -97,9 +103,9 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { case ISD::FSQRT: R = SoftenFloatRes_FSQRT(N); break; case ISD::FSUB: R = SoftenFloatRes_FSUB(N); break; case ISD::FTRUNC: R = SoftenFloatRes_FTRUNC(N); break; - case ISD::LOAD: R = SoftenFloatRes_LOAD(N); break; - case ISD::SELECT: R = SoftenFloatRes_SELECT(N); break; - case ISD::SELECT_CC: R = SoftenFloatRes_SELECT_CC(N); break; + case ISD::LOAD: R = SoftenFloatRes_LOAD(N, ResNo); break; + case ISD::SELECT: R = SoftenFloatRes_SELECT(N, ResNo); break; + case ISD::SELECT_CC: R = SoftenFloatRes_SELECT_CC(N, ResNo); break; case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: R = SoftenFloatRes_XINT_TO_FP(N); break; case ISD::UNDEF: R = SoftenFloatRes_UNDEF(N); break; @@ -107,11 +113,19 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { } // If R is null, the sub-method took care of registering the result. - if (R.getNode()) + if (R.getNode()) { SetSoftenedFloat(SDValue(N, ResNo), R); + ReplaceSoftenFloatResult(N, ResNo, R); + } + // Return true only if the node is changed, + // assuming that the operands are also converted when necessary. + // Otherwise, return false to tell caller to scan operands. 
+ return R.getNode() && R.getNode() != N; } -SDValue DAGTypeLegalizer::SoftenFloatRes_BITCAST(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatRes_BITCAST(SDNode *N, unsigned ResNo) { + if (isLegalInHWReg(N->getValueType(ResNo))) + return SDValue(N, ResNo); return BitConvertToInteger(N->getOperand(0)); } @@ -130,10 +144,14 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_BUILD_PAIR(SDNode *N) { BitConvertToInteger(N->getOperand(1))); } -SDValue DAGTypeLegalizer::SoftenFloatRes_ConstantFP(ConstantFPSDNode *N) { - return DAG.getConstant(N->getValueAPF().bitcastToAPInt(), SDLoc(N), +SDValue DAGTypeLegalizer::SoftenFloatRes_ConstantFP(SDNode *N, unsigned ResNo) { + // When LegalInHWReg, we can load better from the constant pool. + if (isLegalInHWReg(N->getValueType(ResNo))) + return SDValue(N, ResNo); + ConstantFPSDNode *CN = cast(N); + return DAG.getConstant(CN->getValueAPF().bitcastToAPInt(), SDLoc(CN), TLI.getTypeToTransformTo(*DAG.getContext(), - N->getValueType(0))); + CN->getValueType(0))); } SDValue DAGTypeLegalizer::SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N) { @@ -143,7 +161,10 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N) { NewOp, N->getOperand(1)); } -SDValue DAGTypeLegalizer::SoftenFloatRes_FABS(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatRes_FABS(SDNode *N, unsigned ResNo) { + // When LegalInHWReg, FABS can be implemented as native bitwise operations. + if (isLegalInHWReg(N->getValueType(ResNo))) + return SDValue(N, ResNo); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); unsigned Size = NVT.getSizeInBits(); @@ -206,7 +227,10 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FCEIL(SDNode *N) { NVT, Op, false, SDLoc(N)).first; } -SDValue DAGTypeLegalizer::SoftenFloatRes_FCOPYSIGN(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatRes_FCOPYSIGN(SDNode *N, unsigned ResNo) { + // When LegalInHWReg, FCOPYSIGN can be implemented as native bitwise operations. + if (isLegalInHWReg(N->getValueType(ResNo))) + return SDValue(N, ResNo); SDValue LHS = GetSoftenedFloat(N->getOperand(0)); SDValue RHS = BitConvertToInteger(N->getOperand(1)); SDLoc dl(N); @@ -390,7 +414,10 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FNEARBYINT(SDNode *N) { NVT, Op, false, SDLoc(N)).first; } -SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N, unsigned ResNo) { + // When LegalInHWReg, FNEG can be implemented as native bitwise operations. + if (isLegalInHWReg(N->getValueType(ResNo))) + return SDValue(N, ResNo); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); // Expand Y = FNEG(X) -> Y = SUB -0.0, X @@ -580,7 +607,8 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FTRUNC(SDNode *N) { NVT, Op, false, SDLoc(N)).first; } -SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N, unsigned ResNo) { + bool LegalInHWReg = isLegalInHWReg(N->getValueType(ResNo)); LoadSDNode *L = cast(N); EVT VT = N->getValueType(0); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); @@ -595,7 +623,8 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) { L->getAAInfo()); // Legalized the chain result - switch anything that used the old chain to // use the new one. 
- ReplaceValueWith(SDValue(N, 1), NewL.getValue(1)); + if (N != NewL.getValue(1).getNode()) + ReplaceValueWith(SDValue(N, 1), NewL.getValue(1)); return NewL; } @@ -609,17 +638,24 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) { // Legalized the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), NewL.getValue(1)); - return BitConvertToInteger(DAG.getNode(ISD::FP_EXTEND, dl, VT, NewL)); + auto ExtendNode = DAG.getNode(ISD::FP_EXTEND, dl, VT, NewL); + if (LegalInHWReg) + return ExtendNode; + return BitConvertToInteger(ExtendNode); } -SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT(SDNode *N, unsigned ResNo) { + if (isLegalInHWReg(N->getValueType(ResNo))) + return SDValue(N, ResNo); SDValue LHS = GetSoftenedFloat(N->getOperand(1)); SDValue RHS = GetSoftenedFloat(N->getOperand(2)); return DAG.getSelect(SDLoc(N), LHS.getValueType(), N->getOperand(0), LHS, RHS); } -SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT_CC(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT_CC(SDNode *N, unsigned ResNo) { + if (isLegalInHWReg(N->getValueType(ResNo))) + return SDValue(N, ResNo); SDValue LHS = GetSoftenedFloat(N->getOperand(2)); SDValue RHS = GetSoftenedFloat(N->getOperand(3)); return DAG.getNode(ISD::SELECT_CC, SDLoc(N), @@ -645,7 +681,8 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_VAARG(SDNode *N) { // Legalized the chain result - switch anything that used the old chain to // use the new one. - ReplaceValueWith(SDValue(N, 1), NewVAARG.getValue(1)); + if (N != NewVAARG.getValue(1).getNode()) + ReplaceValueWith(SDValue(N, 1), NewVAARG.getValue(1)); return NewVAARG; } @@ -679,7 +716,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_XINT_TO_FP(SDNode *N) { //===----------------------------------------------------------------------===// -// Operand Float to Integer Conversion.. +// Convert Float Operand to Integer for Non-HW-supported Operations. //===----------------------------------------------------------------------===// bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) { @@ -689,6 +726,8 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) { switch (N->getOpcode()) { default: + if (CanSkipSoftenFloatOperand(N, OpNo)) + return false; #ifndef NDEBUG dbgs() << "SoftenFloatOperand Op #" << OpNo << ": "; N->dump(&DAG); dbgs() << "\n"; @@ -704,14 +743,23 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) { case ISD::FP_TO_UINT: Res = SoftenFloatOp_FP_TO_UINT(N); break; case ISD::SELECT_CC: Res = SoftenFloatOp_SELECT_CC(N); break; case ISD::SETCC: Res = SoftenFloatOp_SETCC(N); break; - case ISD::STORE: Res = SoftenFloatOp_STORE(N, OpNo); break; + case ISD::STORE: + Res = SoftenFloatOp_STORE(N, OpNo); + // Do not try to analyze or soften this node again if the value is + // or can be held in a register. In that case, Res.getNode() should + // be equal to N. + if (Res.getNode() == N && + isLegalInHWReg(N->getOperand(OpNo).getValueType())) + return false; + // Otherwise, we need to reanalyze and lower the new Res nodes. + break; } // If the result is null, the sub-method took care of registering results etc. if (!Res.getNode()) return false; // If the result is N, the sub-method updated N in place. Tell the legalizer - // core about this. + // core about this to re-analyze. 
if (Res.getNode() == N) return true; @@ -722,6 +770,41 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) { return false; } +bool DAGTypeLegalizer::CanSkipSoftenFloatOperand(SDNode *N, unsigned OpNo) { + if (!isLegalInHWReg(N->getOperand(OpNo).getValueType())) + return false; + // When the operand type can be kept in registers, SoftenFloatResult + // will call ReplaceValueWith to replace all references and we can + // skip softening this operand. + switch (N->getOperand(OpNo).getOpcode()) { + case ISD::BITCAST: + case ISD::ConstantFP: + case ISD::CopyFromReg: + case ISD::CopyToReg: + case ISD::FABS: + case ISD::FCOPYSIGN: + case ISD::FNEG: + case ISD::Register: + case ISD::SELECT: + case ISD::SELECT_CC: + return true; + } + // For some opcodes, SoftenFloatResult handles all conversion of softening + // and replacing operands, so that there is no need to soften operands + // again, although such opcode could be scanned for other illegal operands. + switch (N->getOpcode()) { + case ISD::ConstantFP: + case ISD::CopyFromReg: + case ISD::CopyToReg: + case ISD::FABS: + case ISD::FCOPYSIGN: + case ISD::FNEG: + case ISD::Register: + return true; + } + return false; +} + SDValue DAGTypeLegalizer::SoftenFloatOp_BITCAST(SDNode *N) { return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), GetSoftenedFloat(N->getOperand(0))); diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp index 337cbe7fc598..d6b4f7921f2b 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -238,9 +238,13 @@ bool DAGTypeLegalizer::run() { Changed = true; goto NodeDone; case TargetLowering::TypeSoftenFloat: - SoftenFloatResult(N, i); - Changed = true; - goto NodeDone; + Changed = SoftenFloatResult(N, i); + if (Changed) + goto NodeDone; + // If not changed, the result type should be legally in register. + assert(isLegalInHWReg(ResultVT) && + "Unchanged SoftenFloatResult should be legal in register!"); + goto ScanOperands; case TargetLowering::TypeExpandFloat: ExpandFloatResult(N, i); Changed = true; @@ -411,18 +415,27 @@ bool DAGTypeLegalizer::run() { bool Failed = false; // Check that all result types are legal. + // A value type is illegal if its TypeAction is not TypeLegal, + // and TLI.RegClassForVT does not have a register class for this type. + // For example, the x86_64 target has f128 that is not TypeLegal, + // to have softened operators, but it also has FR128 register class to + // pass and return f128 values. Hence a legalized node can have f128 type. if (!IgnoreNodeResults(&Node)) for (unsigned i = 0, NumVals = Node.getNumValues(); i < NumVals; ++i) - if (!isTypeLegal(Node.getValueType(i))) { - dbgs() << "Result type " << i << " illegal!\n"; + if (!isTypeLegal(Node.getValueType(i)) && + !TLI.isTypeLegal(Node.getValueType(i))) { + dbgs() << "Result type " << i << " illegal: "; + Node.dump(); Failed = true; } // Check that all operand types are legal. 
for (unsigned i = 0, NumOps = Node.getNumOperands(); i < NumOps; ++i) if (!IgnoreNodeResults(Node.getOperand(i).getNode()) && - !isTypeLegal(Node.getOperand(i).getValueType())) { - dbgs() << "Operand type " << i << " illegal!\n"; + !isTypeLegal(Node.getOperand(i).getValueType()) && + !TLI.isTypeLegal(Node.getOperand(i).getValueType())) { + dbgs() << "Operand type " << i << " illegal: "; + Node.getOperand(i).dump(); Failed = true; } @@ -748,13 +761,23 @@ void DAGTypeLegalizer::SetPromotedInteger(SDValue Op, SDValue Result) { } void DAGTypeLegalizer::SetSoftenedFloat(SDValue Op, SDValue Result) { - assert(Result.getValueType() == - TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) && + // f128 of x86_64 could be kept in SSE registers, + // but sometimes softened to i128. + assert((Result.getValueType() == + TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) || + Op.getValueType() == + TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType())) && "Invalid type for softened float"); AnalyzeNewValue(Result); SDValue &OpEntry = SoftenedFloats[Op]; - assert(!OpEntry.getNode() && "Node is already converted to integer!"); + // Allow repeated calls to save f128 type nodes + // or any node with type that transforms to itself. + // Many operations on these types are not softened. + assert((!OpEntry.getNode()|| + Op.getValueType() == + TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType())) && + "Node is already converted to integer!"); OpEntry = Result; } diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 4e4740f1f9cb..84ea374345e9 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -72,6 +72,20 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { return TLI.getTypeAction(*DAG.getContext(), VT) == TargetLowering::TypeLegal; } + /// isSimpleLegalType - Return true if this is a simple legal type. + bool isSimpleLegalType(EVT VT) const { + return VT.isSimple() && TLI.isTypeLegal(VT); + } + + /// isLegalInHWReg - Return true if this type can be passed in registers. + /// For example, x86_64's f128, should to be legally in registers + /// and only some operations converted to library calls or integer + /// bitwise operations. + bool isLegalInHWReg(EVT VT) const { + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + return VT == NVT && isSimpleLegalType(VT); + } + EVT getSetCCResultType(EVT VT) const { return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); } @@ -372,32 +386,48 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { // Float to Integer Conversion Support: LegalizeFloatTypes.cpp //===--------------------------------------------------------------------===// - /// GetSoftenedFloat - Given a processed operand Op which was converted to an - /// integer of the same size, this returns the integer. The integer contains - /// exactly the same bits as Op - only the type changed. For example, if Op - /// is an f32 which was softened to an i32, then this method returns an i32, - /// the bits of which coincide with those of Op. + /// GetSoftenedFloat - Given an operand Op of Float type, returns the integer + /// if the Op is not supported in target HW and converted to the integer. + /// The integer contains exactly the same bits as Op - only the type changed. + /// For example, if Op is an f32 which was softened to an i32, then this method + /// returns an i32, the bits of which coincide with those of Op. 
+ /// If the Op can be efficiently supported in target HW or the operand must + /// stay in a register, the Op is not converted to an integer. + /// In that case, the given op is returned. SDValue GetSoftenedFloat(SDValue Op) { SDValue &SoftenedOp = SoftenedFloats[Op]; + if (!SoftenedOp.getNode() && + isSimpleLegalType(Op.getValueType())) + return Op; RemapValue(SoftenedOp); assert(SoftenedOp.getNode() && "Operand wasn't converted to integer?"); return SoftenedOp; } void SetSoftenedFloat(SDValue Op, SDValue Result); - // Result Float to Integer Conversion. - void SoftenFloatResult(SDNode *N, unsigned OpNo); + // Call ReplaceValueWith(SDValue(N, ResNo), Res) if necessary. + void ReplaceSoftenFloatResult(SDNode *N, unsigned ResNo, SDValue &NewRes) { + // When the result type can be kept in HW registers, the converted + // NewRes node could have the same type. We can save the effort in + // cloning every user of N in SoftenFloatOperand or other legalization functions, + // by calling ReplaceValueWith here to update all users. + if (NewRes.getNode() != N && isLegalInHWReg(N->getValueType(ResNo))) + ReplaceValueWith(SDValue(N, ResNo), NewRes); + } + + // Convert Float Results to Integer for Non-HW-supported Operations. + bool SoftenFloatResult(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_MERGE_VALUES(SDNode *N, unsigned ResNo); - SDValue SoftenFloatRes_BITCAST(SDNode *N); + SDValue SoftenFloatRes_BITCAST(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_BUILD_PAIR(SDNode *N); - SDValue SoftenFloatRes_ConstantFP(ConstantFPSDNode *N); + SDValue SoftenFloatRes_ConstantFP(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N); - SDValue SoftenFloatRes_FABS(SDNode *N); + SDValue SoftenFloatRes_FABS(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_FMINNUM(SDNode *N); SDValue SoftenFloatRes_FMAXNUM(SDNode *N); SDValue SoftenFloatRes_FADD(SDNode *N); SDValue SoftenFloatRes_FCEIL(SDNode *N); - SDValue SoftenFloatRes_FCOPYSIGN(SDNode *N); + SDValue SoftenFloatRes_FCOPYSIGN(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_FCOS(SDNode *N); SDValue SoftenFloatRes_FDIV(SDNode *N); SDValue SoftenFloatRes_FEXP(SDNode *N); @@ -409,7 +439,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue SoftenFloatRes_FMA(SDNode *N); SDValue SoftenFloatRes_FMUL(SDNode *N); SDValue SoftenFloatRes_FNEARBYINT(SDNode *N); - SDValue SoftenFloatRes_FNEG(SDNode *N); + SDValue SoftenFloatRes_FNEG(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_FP_EXTEND(SDNode *N); SDValue SoftenFloatRes_FP16_TO_FP(SDNode *N); SDValue SoftenFloatRes_FP_ROUND(SDNode *N); @@ -422,14 +452,19 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue SoftenFloatRes_FSQRT(SDNode *N); SDValue SoftenFloatRes_FSUB(SDNode *N); SDValue SoftenFloatRes_FTRUNC(SDNode *N); - SDValue SoftenFloatRes_LOAD(SDNode *N); - SDValue SoftenFloatRes_SELECT(SDNode *N); - SDValue SoftenFloatRes_SELECT_CC(SDNode *N); + SDValue SoftenFloatRes_LOAD(SDNode *N, unsigned ResNo); + SDValue SoftenFloatRes_SELECT(SDNode *N, unsigned ResNo); + SDValue SoftenFloatRes_SELECT_CC(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_UNDEF(SDNode *N); SDValue SoftenFloatRes_VAARG(SDNode *N); SDValue SoftenFloatRes_XINT_TO_FP(SDNode *N); - // Operand Float to Integer Conversion. + // Return true if we can skip softening the given operand or SDNode because + // it was soften before by SoftenFloatResult and references to the operand + // were replaced by ReplaceValueWith. 
+ bool CanSkipSoftenFloatOperand(SDNode *N, unsigned OpNo); + + // Convert Float Operand to Integer for Non-HW-supported Operations. bool SoftenFloatOperand(SDNode *N, unsigned OpNo); SDValue SoftenFloatOp_BITCAST(SDNode *N); SDValue SoftenFloatOp_BR_CC(SDNode *N); diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index eb545982ed02..593c346df770 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -53,12 +53,17 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) { case TargetLowering::TypePromoteFloat: llvm_unreachable("Bitcast of a promotion-needing float should never need" "expansion"); - case TargetLowering::TypeSoftenFloat: - // Convert the integer operand instead. - SplitInteger(GetSoftenedFloat(InOp), Lo, Hi); + case TargetLowering::TypeSoftenFloat: { + // Expand the floating point operand only if it was converted to integers. + // Otherwise, it is a legal type like f128 that can be saved in a register. + auto SoftenedOp = GetSoftenedFloat(InOp); + if (SoftenedOp == InOp) + break; + SplitInteger(SoftenedOp, Lo, Hi); Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo); Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi); return; + } case TargetLowering::TypeExpandInteger: case TargetLowering::TypeExpandFloat: { auto &DL = DAG.getDataLayout(); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 4b7887b26afe..771bb00d86ac 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2893,8 +2893,10 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, return getConstantFP(APFloat(APFloat::IEEEhalf, Val), DL, VT); if (VT == MVT::f32 && C->getValueType(0) == MVT::i32) return getConstantFP(APFloat(APFloat::IEEEsingle, Val), DL, VT); - else if (VT == MVT::f64 && C->getValueType(0) == MVT::i64) + if (VT == MVT::f64 && C->getValueType(0) == MVT::i64) return getConstantFP(APFloat(APFloat::IEEEdouble, Val), DL, VT); + if (VT == MVT::f128 && C->getValueType(0) == MVT::i128) + return getConstantFP(APFloat(APFloat::IEEEquad, Val), DL, VT); break; case ISD::BSWAP: return getConstant(Val.byteSwap(), DL, VT, C->isTargetOpcode(), diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index a6f9699bb29c..d362f98d6464 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -626,7 +626,10 @@ void SDNode::printr(raw_ostream &OS, const SelectionDAG *G) const { static bool printOperand(raw_ostream &OS, const SelectionDAG *G, const SDValue Value) { - if (shouldPrintInline(*Value.getNode())) { + if (!Value.getNode()) { + OS << ""; + return false; + } else if (shouldPrintInline(*Value.getNode())) { OS << Value->getOperationName(G) << ':'; Value->print_types(OS, G); Value->print_details(OS, G); diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 21935cdd4699..bb31231f4e1a 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1072,7 +1072,9 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, Op.getOperand(0).getValueType().isFloatingPoint()) { bool OpVTLegal = isOperationLegalOrCustom(ISD::FGETSIGN, Op.getValueType()); bool i32Legal = isOperationLegalOrCustom(ISD::FGETSIGN, MVT::i32); - if ((OpVTLegal || i32Legal) && 
Op.getValueType().isSimple()) { + if ((OpVTLegal || i32Legal) && Op.getValueType().isSimple() && + Op.getOperand(0).getValueType() != MVT::f128) { + // Cannot eliminate/lower SHL for f128 yet. EVT Ty = OpVTLegal ? Op.getValueType() : MVT::i32; // Make a FGETSIGN + SHL to move the sign bit into the appropriate // place. We expect the SHL to be eliminated by other optimizations. diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp index 69c130809bb8..68bca2e70369 100644 --- a/lib/CodeGen/TargetLoweringBase.cpp +++ b/lib/CodeGen/TargetLoweringBase.cpp @@ -1654,6 +1654,10 @@ TargetLoweringBase::getTypeLegalizationCost(const DataLayout &DL, if (LK.first == TypeSplitVector || LK.first == TypeExpandInteger) Cost *= 2; + // Do not loop with f128 type. + if (MTy == LK.second) + return std::make_pair(Cost, MTy.getSimpleVT()); + // Keep legalizing the type. MTy = LK.second; } diff --git a/lib/CodeGen/TargetRegisterInfo.cpp b/lib/CodeGen/TargetRegisterInfo.cpp index fe91c86b71f8..0c4a3dcb226e 100644 --- a/lib/CodeGen/TargetRegisterInfo.cpp +++ b/lib/CodeGen/TargetRegisterInfo.cpp @@ -171,16 +171,24 @@ BitVector TargetRegisterInfo::getAllocatableSet(const MachineFunction &MF, static inline const TargetRegisterClass *firstCommonClass(const uint32_t *A, const uint32_t *B, - const TargetRegisterInfo *TRI) { + const TargetRegisterInfo *TRI, + const MVT::SimpleValueType SVT = + MVT::SimpleValueType::Any) { + const MVT VT(SVT); for (unsigned I = 0, E = TRI->getNumRegClasses(); I < E; I += 32) - if (unsigned Common = *A++ & *B++) - return TRI->getRegClass(I + countTrailingZeros(Common)); + if (unsigned Common = *A++ & *B++) { + const TargetRegisterClass *RC = + TRI->getRegClass(I + countTrailingZeros(Common)); + if (SVT == MVT::SimpleValueType::Any || RC->hasType(VT)) + return RC; + } return nullptr; } const TargetRegisterClass * TargetRegisterInfo::getCommonSubClass(const TargetRegisterClass *A, - const TargetRegisterClass *B) const { + const TargetRegisterClass *B, + const MVT::SimpleValueType SVT) const { // First take care of the trivial cases. if (A == B) return A; @@ -189,7 +197,7 @@ TargetRegisterInfo::getCommonSubClass(const TargetRegisterClass *A, // Register classes are ordered topologically, so the largest common // sub-class it the common sub-class with the smallest ID. - return firstCommonClass(A->getSubClassMask(), B->getSubClassMask(), this); + return firstCommonClass(A->getSubClassMask(), B->getSubClassMask(), this, SVT); } const TargetRegisterClass * diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index 6887f2e4c04a..8878c9f169b5 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -1373,7 +1373,19 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { if (isa(COp)) { CS << "u"; } else if (auto *CI = dyn_cast(COp)) { - CS << CI->getZExtValue(); + if (CI->getBitWidth() <= 64) { + CS << CI->getZExtValue(); + } else { + // print multi-word constant as (w0,w1) + auto Val = CI->getValue(); + CS << "("; + for (int i = 0, N = Val.getNumWords(); i < N; ++i) { + if (i > 0) + CS << ","; + CS << Val.getRawData()[i]; + } + CS << ")"; + } } else if (auto *CF = dyn_cast(COp)) { SmallString<32> Str; CF->getValueAPF().toString(Str); From 0956a120f5a443670cf86e59a6ad9dcff7135a9e Mon Sep 17 00:00:00 2001 From: Davide Italiano Date: Thu, 3 Dec 2015 22:13:40 +0000 Subject: [PATCH 027/364] [llvm-objdump] Use report_fatal_error() if we can't find a target. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254654 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/llvm-objdump/llvm-objdump.cpp | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp index 54f24d7a00ff..069425429d16 100644 --- a/tools/llvm-objdump/llvm-objdump.cpp +++ b/tools/llvm-objdump/llvm-objdump.cpp @@ -282,10 +282,8 @@ static const Target *getTarget(const ObjectFile *Obj = nullptr) { std::string Error; const Target *TheTarget = TargetRegistry::lookupTarget(ArchName, TheTriple, Error); - if (!TheTarget) { - errs() << ToolName << ": " << Error; - return nullptr; - } + if (!TheTarget) + report_fatal_error("can't find target: " + Error); // Update the triple name and return the found target. TripleName = TheTriple.getTriple(); @@ -805,10 +803,6 @@ static bool getHidden(RelocationRef RelRef) { static void DisassembleObject(const ObjectFile *Obj, bool InlineRelocs) { const Target *TheTarget = getTarget(Obj); - // getTarget() will have already issued a diagnostic if necessary, so - // just bail here if it failed. - if (!TheTarget) - return; // Package up features to be passed to target/subtarget std::string FeaturesStr; From bb300c512008269c1caabff9613a6dbc49f83fa4 Mon Sep 17 00:00:00 2001 From: Matthias Braun Date: Thu, 3 Dec 2015 22:17:26 +0000 Subject: [PATCH 028/364] raw_ostream: << operator for callables with raw_stream argument This allows easier construction of print helpers. Example: Printable PrintLaneMask(unsigned LaneMask) { return Printable([LaneMask](raw_ostream &OS) { OS << format("%08X", LaneMask); }); } // Usage: OS << PrintLaneMask(Mask); Differential Revision: http://reviews.llvm.org/D14348 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254655 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Support/raw_ostream.h | 14 +++ include/llvm/Target/TargetRegisterInfo.h | 73 ++------------ lib/CodeGen/RegAllocPBQP.cpp | 26 +---- .../SelectionDAG/SelectionDAGDumper.cpp | 17 +--- lib/CodeGen/TargetRegisterInfo.cpp | 95 +++++++++++-------- lib/Support/raw_ostream.cpp | 4 + 6 files changed, 89 insertions(+), 140 deletions(-) diff --git a/include/llvm/Support/raw_ostream.h b/include/llvm/Support/raw_ostream.h index e5cc40e7d6b2..38a96fa6ab74 100644 --- a/include/llvm/Support/raw_ostream.h +++ b/include/llvm/Support/raw_ostream.h @@ -17,12 +17,14 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/DataTypes.h" +#include #include namespace llvm { class format_object_base; class FormattedString; class FormattedNumber; +class raw_ostream; template class SmallVectorImpl; namespace sys { @@ -31,6 +33,15 @@ enum OpenFlags : unsigned; } } +/// Type of function that prints to raw_ostream. +/// +/// Typical usage: +/// Printable PrintFoo(Foo x) { +/// return [] (raw_ostream &os) { os << /* ... */; }; +/// } +/// os << "Foo: " << PrintFoo(foo) << '\n'; +typedef std::function Printable; + /// This class implements an extremely fast bulk output stream that can *only* /// output to a stream. It does not support seeking, reopening, rewinding, line /// buffered disciplines etc. It is a simple buffer that outputs @@ -203,6 +214,9 @@ class raw_ostream { raw_ostream &operator<<(double N); + /// IO manipulator, \see Printable. + raw_ostream &operator<<(Printable P); + /// Output \p N in hexadecimal, without any prefix or padding. 
raw_ostream &write_hex(unsigned long long N); diff --git a/include/llvm/Target/TargetRegisterInfo.h b/include/llvm/Target/TargetRegisterInfo.h index 7d293fe82a6b..de6f46eba013 100644 --- a/include/llvm/Target/TargetRegisterInfo.h +++ b/include/llvm/Target/TargetRegisterInfo.h @@ -932,7 +932,6 @@ struct VirtReg2IndexFunctor : public std::unary_function { } }; -/// Helper class for printing registers on a raw_ostream. /// Prints virtual and physical registers with or without a TRI instance. /// /// The format is: @@ -943,24 +942,10 @@ struct VirtReg2IndexFunctor : public std::unary_function { /// %physreg17 - a physical register when no TRI instance given. /// /// Usage: OS << PrintReg(Reg, TRI) << '\n'; -/// -class PrintReg { - const TargetRegisterInfo *TRI; - unsigned Reg; - unsigned SubIdx; -public: - explicit PrintReg(unsigned reg, const TargetRegisterInfo *tri = nullptr, - unsigned subidx = 0) - : TRI(tri), Reg(reg), SubIdx(subidx) {} - void print(raw_ostream&) const; -}; +Printable PrintReg(unsigned Reg, const TargetRegisterInfo *TRI = nullptr, + unsigned SubRegIdx = 0); -static inline raw_ostream &operator<<(raw_ostream &OS, const PrintReg &PR) { - PR.print(OS); - return OS; -} - -/// Helper class for printing register units on a raw_ostream. +/// Create Printable object to print register units on a \ref raw_ostream. /// /// Register units are named after their root registers: /// @@ -968,54 +953,14 @@ static inline raw_ostream &operator<<(raw_ostream &OS, const PrintReg &PR) { /// FP0~ST7 - Dual roots. /// /// Usage: OS << PrintRegUnit(Unit, TRI) << '\n'; -/// -class PrintRegUnit { -protected: - const TargetRegisterInfo *TRI; - unsigned Unit; -public: - PrintRegUnit(unsigned unit, const TargetRegisterInfo *tri) - : TRI(tri), Unit(unit) {} - void print(raw_ostream&) const; -}; - -static inline raw_ostream &operator<<(raw_ostream &OS, const PrintRegUnit &PR) { - PR.print(OS); - return OS; -} +Printable PrintRegUnit(unsigned Unit, const TargetRegisterInfo *TRI); -/// It is often convenient to track virtual registers and -/// physical register units in the same list. -class PrintVRegOrUnit : protected PrintRegUnit { -public: - PrintVRegOrUnit(unsigned VRegOrUnit, const TargetRegisterInfo *tri) - : PrintRegUnit(VRegOrUnit, tri) {} - void print(raw_ostream&) const; -}; - -static inline raw_ostream &operator<<(raw_ostream &OS, - const PrintVRegOrUnit &PR) { - PR.print(OS); - return OS; -} - -/// Helper class for printing lane masks. -/// -/// They are currently printed out as hexadecimal numbers. -/// Usage: OS << PrintLaneMask(Mask); -class PrintLaneMask { -protected: - LaneBitmask LaneMask; -public: - PrintLaneMask(LaneBitmask LaneMask) - : LaneMask(LaneMask) {} - void print(raw_ostream&) const; -}; +/// \brief Create Printable object to print virtual registers and physical +/// registers on a \ref raw_ostream. +Printable PrintVRegOrUnit(unsigned VRegOrUnit, const TargetRegisterInfo *TRI); -static inline raw_ostream &operator<<(raw_ostream &OS, const PrintLaneMask &P) { - P.print(OS); - return OS; -} +/// Create Printable object to print LaneBitmasks on a \ref raw_ostream. 
+Printable PrintLaneMask(LaneBitmask LaneMask); } // End llvm namespace diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp index afa98b26d727..f08d616e6812 100644 --- a/lib/CodeGen/RegAllocPBQP.cpp +++ b/lib/CodeGen/RegAllocPBQP.cpp @@ -805,33 +805,17 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) { return true; } -namespace { -// A helper class for printing node and register info in a consistent way -class PrintNodeInfo { -public: - typedef PBQP::RegAlloc::PBQPRAGraph Graph; - typedef PBQP::RegAlloc::PBQPRAGraph::NodeId NodeId; - - PrintNodeInfo(NodeId NId, const Graph &G) : G(G), NId(NId) {} - - void print(raw_ostream &OS) const { +/// Create Printable object for node and register info. +static Printable PrintNodeInfo(PBQP::RegAlloc::PBQPRAGraph::NodeId NId, + const PBQP::RegAlloc::PBQPRAGraph &G) { + return [NId, &G](raw_ostream &OS) { const MachineRegisterInfo &MRI = G.getMetadata().MF.getRegInfo(); const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); unsigned VReg = G.getNodeMetadata(NId).getVReg(); const char *RegClassName = TRI->getRegClassName(MRI.getRegClass(VReg)); OS << NId << " (" << RegClassName << ':' << PrintReg(VReg, TRI) << ')'; - } - -private: - const Graph &G; - NodeId NId; -}; - -inline raw_ostream &operator<<(raw_ostream &OS, const PrintNodeInfo &PR) { - PR.print(OS); - return OS; + }; } -} // anonymous namespace void PBQP::RegAlloc::PBQPRAGraph::dump(raw_ostream &OS) const { for (auto NId : nodeIds()) { diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index d362f98d6464..8c3a0f2d81ec 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -369,25 +369,14 @@ const char *SDNode::getIndexedModeName(ISD::MemIndexedMode AM) { } } -namespace { -class PrintNodeId { - const SDNode &Node; -public: - explicit PrintNodeId(const SDNode &Node) - : Node(Node) {} - void print(raw_ostream &OS) const { +static Printable PrintNodeId(const SDNode &Node) { + return [&Node](raw_ostream &OS) { #ifndef NDEBUG OS << 't' << Node.PersistentId; #else OS << (const void*)&Node; #endif - } -}; - -static inline raw_ostream &operator<<(raw_ostream &OS, const PrintNodeId &P) { - P.print(OS); - return OS; -} + }; } void SDNode::dump() const { dump(nullptr); } diff --git a/lib/CodeGen/TargetRegisterInfo.cpp b/lib/CodeGen/TargetRegisterInfo.cpp index 0c4a3dcb226e..839d9ef31ad0 100644 --- a/lib/CodeGen/TargetRegisterInfo.cpp +++ b/lib/CodeGen/TargetRegisterInfo.cpp @@ -40,58 +40,71 @@ TargetRegisterInfo::TargetRegisterInfo(const TargetRegisterInfoDesc *ID, TargetRegisterInfo::~TargetRegisterInfo() {} -void PrintReg::print(raw_ostream &OS) const { - if (!Reg) - OS << "%noreg"; - else if (TargetRegisterInfo::isStackSlot(Reg)) - OS << "SS#" << TargetRegisterInfo::stackSlot2Index(Reg); - else if (TargetRegisterInfo::isVirtualRegister(Reg)) - OS << "%vreg" << TargetRegisterInfo::virtReg2Index(Reg); - else if (TRI && Reg < TRI->getNumRegs()) - OS << '%' << TRI->getName(Reg); - else - OS << "%physreg" << Reg; - if (SubIdx) { - if (TRI) - OS << ':' << TRI->getSubRegIndexName(SubIdx); +namespace llvm { + +Printable PrintReg(unsigned Reg, const TargetRegisterInfo *TRI, + unsigned SubIdx) { + return [Reg, TRI, SubIdx](raw_ostream &OS) { + if (!Reg) + OS << "%noreg"; + else if (TargetRegisterInfo::isStackSlot(Reg)) + OS << "SS#" << TargetRegisterInfo::stackSlot2Index(Reg); + else if (TargetRegisterInfo::isVirtualRegister(Reg)) + OS << "%vreg" << 
TargetRegisterInfo::virtReg2Index(Reg); + else if (TRI && Reg < TRI->getNumRegs()) + OS << '%' << TRI->getName(Reg); else - OS << ":sub(" << SubIdx << ')'; - } + OS << "%physreg" << Reg; + if (SubIdx) { + if (TRI) + OS << ':' << TRI->getSubRegIndexName(SubIdx); + else + OS << ":sub(" << SubIdx << ')'; + } + }; } -void PrintRegUnit::print(raw_ostream &OS) const { - // Generic printout when TRI is missing. - if (!TRI) { - OS << "Unit~" << Unit; - return; - } +Printable PrintRegUnit(unsigned Unit, const TargetRegisterInfo *TRI) { + return [Unit, TRI](raw_ostream &OS) { + // Generic printout when TRI is missing. + if (!TRI) { + OS << "Unit~" << Unit; + return; + } - // Check for invalid register units. - if (Unit >= TRI->getNumRegUnits()) { - OS << "BadUnit~" << Unit; - return; - } + // Check for invalid register units. + if (Unit >= TRI->getNumRegUnits()) { + OS << "BadUnit~" << Unit; + return; + } - // Normal units have at least one root. - MCRegUnitRootIterator Roots(Unit, TRI); - assert(Roots.isValid() && "Unit has no roots."); - OS << TRI->getName(*Roots); - for (++Roots; Roots.isValid(); ++Roots) - OS << '~' << TRI->getName(*Roots); + // Normal units have at least one root. + MCRegUnitRootIterator Roots(Unit, TRI); + assert(Roots.isValid() && "Unit has no roots."); + OS << TRI->getName(*Roots); + for (++Roots; Roots.isValid(); ++Roots) + OS << '~' << TRI->getName(*Roots); + }; } -void PrintVRegOrUnit::print(raw_ostream &OS) const { - if (TRI && TRI->isVirtualRegister(Unit)) { - OS << "%vreg" << TargetRegisterInfo::virtReg2Index(Unit); - return; - } - PrintRegUnit::print(OS); +Printable PrintVRegOrUnit(unsigned Unit, const TargetRegisterInfo *TRI) { + return [Unit, TRI](raw_ostream &OS) { + if (TRI && TRI->isVirtualRegister(Unit)) { + OS << "%vreg" << TargetRegisterInfo::virtReg2Index(Unit); + } else { + OS << PrintRegUnit(Unit, TRI); + } + }; } -void PrintLaneMask::print(raw_ostream &OS) const { - OS << format("%08X", LaneMask); +Printable PrintLaneMask(LaneBitmask LaneMask) { + return [LaneMask](raw_ostream &OS) { + OS << format("%08X", LaneMask); + }; } +} // End of llvm namespace + /// getAllocatableClass - Return the maximal subclass of the given register /// class that is alloctable, or NULL. const TargetRegisterClass * diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp index 49ef400c5f2d..5b1dceca0bfc 100644 --- a/lib/Support/raw_ostream.cpp +++ b/lib/Support/raw_ostream.cpp @@ -264,6 +264,10 @@ raw_ostream &raw_ostream::operator<<(double N) { return this->operator<<(format("%e", N)); } +raw_ostream &raw_ostream::operator<<(Printable P) { + P(*this); + return *this; +} void raw_ostream::flush_nonempty() { From 68c9f5ec88b3d7679f09d6634c64e6f1d209f097 Mon Sep 17 00:00:00 2001 From: David Majnemer Date: Thu, 3 Dec 2015 22:45:19 +0000 Subject: [PATCH 029/364] [Analysis] Become aware of MSVC's new/delete functions The compiler can take advantage of the allocation/deallocation function's properties. We knew how to do this for Itanium but had no support for MSVC-style functions. 
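As a rough source-level illustration (hypothetical code, not drawn from the patch): on a 64-bit MSVC target a dead allocation pair like the one below typically mangles to ??2@YAPEAX_K@Z (operator new(unsigned long long)) and ??3@YAXPEAX@Z (operator delete(void*)), both of which are now recognized, so the pair can be removed, mirroring the new test9 in malloc-free-delete.ll.

  // Hypothetical example; the mangled names are the ones added to
  // TargetLibraryInfo.def, and the calls are dead once paired up.
  void no_op() {
    int *p = new int; // lowers to a call to ??2@YAPEAX_K@Z
    delete p;         // lowers to a call to ??3@YAXPEAX@Z
  }
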
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254656 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/TargetLibraryInfo.def | 81 +++++++++++++++++++ lib/Analysis/MemoryBuiltins.cpp | 24 +++++- .../InstCombine/malloc-free-delete.ll | 11 +++ 3 files changed, 114 insertions(+), 2 deletions(-) diff --git a/include/llvm/Analysis/TargetLibraryInfo.def b/include/llvm/Analysis/TargetLibraryInfo.def index 393e9d6695f8..7798e3c88248 100644 --- a/include/llvm/Analysis/TargetLibraryInfo.def +++ b/include/llvm/Analysis/TargetLibraryInfo.def @@ -27,6 +27,86 @@ #define TLI_DEFINE_STRING_INTERNAL(string_repr) string_repr, #endif +/// void *new(unsigned int); +TLI_DEFINE_ENUM_INTERNAL(msvc_new_int) +TLI_DEFINE_STRING_INTERNAL("??2@YAPAXI@Z") + +/// void *new(unsigned int, nothrow); +TLI_DEFINE_ENUM_INTERNAL(msvc_new_int_nothrow) +TLI_DEFINE_STRING_INTERNAL("??2@YAPAXIABUnothrow_t@std@@@Z") + +/// void *new(unsigned long long); +TLI_DEFINE_ENUM_INTERNAL(msvc_new_longlong) +TLI_DEFINE_STRING_INTERNAL("??2@YAPEAX_K@Z") + +/// void *new(unsigned long long, nothrow); +TLI_DEFINE_ENUM_INTERNAL(msvc_new_longlong_nothrow) +TLI_DEFINE_STRING_INTERNAL("??2@YAPEAX_KAEBUnothrow_t@std@@@Z") + +/// void operator delete(void*); +TLI_DEFINE_ENUM_INTERNAL(msvc_delete_ptr32) +TLI_DEFINE_STRING_INTERNAL("??3@YAXPAX@Z") + +/// void operator delete(void*, nothrow); +TLI_DEFINE_ENUM_INTERNAL(msvc_delete_ptr32_nothrow) +TLI_DEFINE_STRING_INTERNAL("??3@YAXPAXABUnothrow_t@std@@@Z") + +/// void operator delete(void*, unsigned int); +TLI_DEFINE_ENUM_INTERNAL(msvc_delete_ptr32_int) +TLI_DEFINE_STRING_INTERNAL("??3@YAXPAXI@Z") + +/// void operator delete(void*); +TLI_DEFINE_ENUM_INTERNAL(msvc_delete_ptr64) +TLI_DEFINE_STRING_INTERNAL("??3@YAXPEAX@Z") + +/// void operator delete(void*, nothrow); +TLI_DEFINE_ENUM_INTERNAL(msvc_delete_ptr64_nothrow) +TLI_DEFINE_STRING_INTERNAL("??3@YAXPEAXAEBUnothrow_t@std@@@Z") + +/// void operator delete(void*, unsigned long long); +TLI_DEFINE_ENUM_INTERNAL(msvc_delete_ptr64_longlong) +TLI_DEFINE_STRING_INTERNAL("??3@YAXPEAX_K@Z") + +/// void *new[](unsigned int); +TLI_DEFINE_ENUM_INTERNAL(msvc_new_array_int) +TLI_DEFINE_STRING_INTERNAL("??_U@YAPAXI@Z") + +/// void *new[](unsigned int, nothrow); +TLI_DEFINE_ENUM_INTERNAL(msvc_new_array_int_nothrow) +TLI_DEFINE_STRING_INTERNAL("??_U@YAPAXIABUnothrow_t@std@@@Z") + +/// void *new[](unsigned long long); +TLI_DEFINE_ENUM_INTERNAL(msvc_new_array_longlong) +TLI_DEFINE_STRING_INTERNAL("??_U@YAPEAX_K@Z") + +/// void *new[](unsigned long long, nothrow); +TLI_DEFINE_ENUM_INTERNAL(msvc_new_array_longlong_nothrow) +TLI_DEFINE_STRING_INTERNAL("??_U@YAPEAX_KAEBUnothrow_t@std@@@Z") + +/// void operator delete[](void*); +TLI_DEFINE_ENUM_INTERNAL(msvc_delete_array_ptr32) +TLI_DEFINE_STRING_INTERNAL("??_V@YAXPAX@Z") + +/// void operator delete[](void*, nothrow); +TLI_DEFINE_ENUM_INTERNAL(msvc_delete_array_ptr32_nothrow) +TLI_DEFINE_STRING_INTERNAL("??_V@YAXPAXABUnothrow_t@std@@@Z") + +/// void operator delete[](void*, unsigned int); +TLI_DEFINE_ENUM_INTERNAL(msvc_delete_array_ptr32_int) +TLI_DEFINE_STRING_INTERNAL("??_V@YAXPAXI@Z") + +/// void operator delete[](void*); +TLI_DEFINE_ENUM_INTERNAL(msvc_delete_array_ptr64) +TLI_DEFINE_STRING_INTERNAL("??_V@YAXPEAX@Z") + +/// void operator delete[](void*, nothrow); +TLI_DEFINE_ENUM_INTERNAL(msvc_delete_array_ptr64_nothrow) +TLI_DEFINE_STRING_INTERNAL("??_V@YAXPEAXAEBUnothrow_t@std@@@Z") + +/// void operator delete[](void*, unsigned long long); 
+TLI_DEFINE_ENUM_INTERNAL(msvc_delete_array_ptr64_longlong) +TLI_DEFINE_STRING_INTERNAL("??_V@YAXPEAX_K@Z") + /// int _IO_getc(_IO_FILE * __fp); TLI_DEFINE_ENUM_INTERNAL(under_IO_getc) TLI_DEFINE_STRING_INTERNAL("_IO_getc") @@ -673,6 +753,7 @@ TLI_DEFINE_STRING_INTERNAL("modff") /// long double modfl(long double value, long double *iptr); TLI_DEFINE_ENUM_INTERNAL(modfl) TLI_DEFINE_STRING_INTERNAL("modfl") + /// double nearbyint(double x); TLI_DEFINE_ENUM_INTERNAL(nearbyint) TLI_DEFINE_STRING_INTERNAL("nearbyint") diff --git a/lib/Analysis/MemoryBuiltins.cpp b/lib/Analysis/MemoryBuiltins.cpp index b4dce4941538..c64be771f1f0 100644 --- a/lib/Analysis/MemoryBuiltins.cpp +++ b/lib/Analysis/MemoryBuiltins.cpp @@ -62,6 +62,14 @@ static const AllocFnsTy AllocationFnData[] = { {LibFunc::ZnajRKSt9nothrow_t, MallocLike, 2, 0, -1}, // new[](unsigned int, nothrow) {LibFunc::Znam, OpNewLike, 1, 0, -1}, // new[](unsigned long) {LibFunc::ZnamRKSt9nothrow_t, MallocLike, 2, 0, -1}, // new[](unsigned long, nothrow) + {LibFunc::msvc_new_int, OpNewLike, 1, 0, -1}, // new(unsigned int) + {LibFunc::msvc_new_int_nothrow, MallocLike, 2, 0, -1}, // new(unsigned int, nothrow) + {LibFunc::msvc_new_longlong, OpNewLike, 1, 0, -1}, // new(unsigned long long) + {LibFunc::msvc_new_longlong_nothrow, MallocLike, 2, 0, -1}, // new(unsigned long long, nothrow) + {LibFunc::msvc_new_array_int, OpNewLike, 1, 0, -1}, // new[](unsigned int) + {LibFunc::msvc_new_array_int_nothrow, MallocLike, 2, 0, -1}, // new[](unsigned int, nothrow) + {LibFunc::msvc_new_array_longlong, OpNewLike, 1, 0, -1}, // new[](unsigned long long) + {LibFunc::msvc_new_array_longlong_nothrow, MallocLike, 2, 0, -1}, // new[](unsigned long long, nothrow) {LibFunc::calloc, CallocLike, 2, 0, 1}, {LibFunc::realloc, ReallocLike, 2, 1, -1}, {LibFunc::reallocf, ReallocLike, 2, 1, -1}, @@ -308,14 +316,26 @@ const CallInst *llvm::isFreeCall(const Value *I, const TargetLibraryInfo *TLI) { unsigned ExpectedNumParams; if (TLIFn == LibFunc::free || TLIFn == LibFunc::ZdlPv || // operator delete(void*) - TLIFn == LibFunc::ZdaPv) // operator delete[](void*) + TLIFn == LibFunc::ZdaPv || // operator delete[](void*) + TLIFn == LibFunc::msvc_delete_ptr32 || // operator delete(void*) + TLIFn == LibFunc::msvc_delete_ptr64 || // operator delete(void*) + TLIFn == LibFunc::msvc_delete_array_ptr32 || // operator delete[](void*) + TLIFn == LibFunc::msvc_delete_array_ptr64) // operator delete[](void*) ExpectedNumParams = 1; else if (TLIFn == LibFunc::ZdlPvj || // delete(void*, uint) TLIFn == LibFunc::ZdlPvm || // delete(void*, ulong) TLIFn == LibFunc::ZdlPvRKSt9nothrow_t || // delete(void*, nothrow) TLIFn == LibFunc::ZdaPvj || // delete[](void*, uint) TLIFn == LibFunc::ZdaPvm || // delete[](void*, ulong) - TLIFn == LibFunc::ZdaPvRKSt9nothrow_t) // delete[](void*, nothrow) + TLIFn == LibFunc::ZdaPvRKSt9nothrow_t || // delete[](void*, nothrow) + TLIFn == LibFunc::msvc_delete_ptr32_int || // delete(void*, uint) + TLIFn == LibFunc::msvc_delete_ptr64_longlong || // delete(void*, ulonglong) + TLIFn == LibFunc::msvc_delete_ptr32_nothrow || // delete(void*, nothrow) + TLIFn == LibFunc::msvc_delete_ptr64_nothrow || // delete(void*, nothrow) + TLIFn == LibFunc::msvc_delete_array_ptr32_int || // delete[](void*, uint) + TLIFn == LibFunc::msvc_delete_array_ptr64_longlong || // delete[](void*, ulonglong) + TLIFn == LibFunc::msvc_delete_array_ptr32_nothrow || // delete[](void*, nothrow) + TLIFn == LibFunc::msvc_delete_array_ptr64_nothrow) // delete[](void*, nothrow) ExpectedNumParams = 2; else return 
nullptr; diff --git a/test/Transforms/InstCombine/malloc-free-delete.ll b/test/Transforms/InstCombine/malloc-free-delete.ll index 138001ace951..8fcb8214360d 100644 --- a/test/Transforms/InstCombine/malloc-free-delete.ll +++ b/test/Transforms/InstCombine/malloc-free-delete.ll @@ -186,3 +186,14 @@ define void @test8() { call void @_ZdaPvj(i8* %naj, i32 32) builtin ret void } + +declare noalias i8* @"\01??2@YAPEAX_K@Z"(i64) nobuiltin +declare void @"\01??3@YAXPEAX@Z"(i8*) nobuiltin + +; CHECK-LABEL: @test9( +define void @test9() { + ; CHECK-NOT: call + %new_long_long = call noalias i8* @"\01??2@YAPEAX_K@Z"(i64 32) builtin + call void @"\01??3@YAXPEAX@Z"(i8* %new_long_long) builtin + ret void +} From 2ffa666beca8a55cafb5f0bac88671fec7645626 Mon Sep 17 00:00:00 2001 From: Chris Bieneman Date: Thu, 3 Dec 2015 22:51:08 +0000 Subject: [PATCH 030/364] [CMake] set_target_properties doesn't append link flags This fixes a bug introduced in r254627, and another occurance of the same bug in this file. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254657 91177308-0d34-0410-b5e6-96231b3b80d8 --- cmake/modules/AddLLVM.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake index 97ac96ed4281..e121da6786d4 100644 --- a/cmake/modules/AddLLVM.cmake +++ b/cmake/modules/AddLLVM.cmake @@ -667,7 +667,7 @@ function(export_executable_symbols target) if (NOT MSVC) # MSVC's linker doesn't support exporting all symbols. set_target_properties(${target} PROPERTIES ENABLE_EXPORTS 1) if (APPLE) - set_target_properties(${target} PROPERTIES + set_property(TARGET ${target} APPEND_STRING PROPERTY LINK_FLAGS "-rdynamic") endif() endif() @@ -1185,8 +1185,8 @@ function(llvm_externalize_debuginfo name) OR CMAKE_CXX_FLAGS_${uppercase_CMAKE_BUILD_TYPE} MATCHES "-flto") set(lto_object ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/${name}-lto.o) - set_target_properties(${name} PROPERTIES - LINK_FLAGS "-Wl,-object_path_lto -Wl,${lto_object}") + set_property(TARGET ${name} APPEND_STRING PROPERTY + LINK_FLAGS " -Wl,-object_path_lto,${lto_object}") endif() add_custom_command(TARGET ${name} POST_BUILD COMMAND xcrun dsymutil $ From d8b19eb290bffe27ffcb92950162a15024ed6572 Mon Sep 17 00:00:00 2001 From: Chris Bieneman Date: Thu, 3 Dec 2015 22:55:36 +0000 Subject: [PATCH 031/364] [CMake] Fixing bots CMake calls to set_property with APPEND string need to have a leading space. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254659 91177308-0d34-0410-b5e6-96231b3b80d8 --- cmake/modules/AddLLVM.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake index e121da6786d4..8cc8abbbc7de 100644 --- a/cmake/modules/AddLLVM.cmake +++ b/cmake/modules/AddLLVM.cmake @@ -668,7 +668,7 @@ function(export_executable_symbols target) set_target_properties(${target} PROPERTIES ENABLE_EXPORTS 1) if (APPLE) set_property(TARGET ${target} APPEND_STRING PROPERTY - LINK_FLAGS "-rdynamic") + LINK_FLAGS " -rdynamic") endif() endif() endfunction() From add5c9809ba804ed03c7f289a29fccb158d27ae9 Mon Sep 17 00:00:00 2001 From: Matthias Braun Date: Thu, 3 Dec 2015 23:00:28 +0000 Subject: [PATCH 032/364] Revert "raw_ostream: << operator for callables with raw_stream argument" This commit provoked "error C2593: 'operator <<' is ambiguous" on MSVC. This reverts commit r254655. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254661 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Support/raw_ostream.h | 14 --- include/llvm/Target/TargetRegisterInfo.h | 73 ++++++++++++-- lib/CodeGen/RegAllocPBQP.cpp | 26 ++++- .../SelectionDAG/SelectionDAGDumper.cpp | 17 +++- lib/CodeGen/TargetRegisterInfo.cpp | 95 ++++++++----------- lib/Support/raw_ostream.cpp | 4 - 6 files changed, 140 insertions(+), 89 deletions(-) diff --git a/include/llvm/Support/raw_ostream.h b/include/llvm/Support/raw_ostream.h index 38a96fa6ab74..e5cc40e7d6b2 100644 --- a/include/llvm/Support/raw_ostream.h +++ b/include/llvm/Support/raw_ostream.h @@ -17,14 +17,12 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/DataTypes.h" -#include #include namespace llvm { class format_object_base; class FormattedString; class FormattedNumber; -class raw_ostream; template class SmallVectorImpl; namespace sys { @@ -33,15 +31,6 @@ enum OpenFlags : unsigned; } } -/// Type of function that prints to raw_ostream. -/// -/// Typical usage: -/// Printable PrintFoo(Foo x) { -/// return [] (raw_ostream &os) { os << /* ... */; }; -/// } -/// os << "Foo: " << PrintFoo(foo) << '\n'; -typedef std::function Printable; - /// This class implements an extremely fast bulk output stream that can *only* /// output to a stream. It does not support seeking, reopening, rewinding, line /// buffered disciplines etc. It is a simple buffer that outputs @@ -214,9 +203,6 @@ class raw_ostream { raw_ostream &operator<<(double N); - /// IO manipulator, \see Printable. - raw_ostream &operator<<(Printable P); - /// Output \p N in hexadecimal, without any prefix or padding. raw_ostream &write_hex(unsigned long long N); diff --git a/include/llvm/Target/TargetRegisterInfo.h b/include/llvm/Target/TargetRegisterInfo.h index de6f46eba013..7d293fe82a6b 100644 --- a/include/llvm/Target/TargetRegisterInfo.h +++ b/include/llvm/Target/TargetRegisterInfo.h @@ -932,6 +932,7 @@ struct VirtReg2IndexFunctor : public std::unary_function { } }; +/// Helper class for printing registers on a raw_ostream. /// Prints virtual and physical registers with or without a TRI instance. /// /// The format is: @@ -942,10 +943,24 @@ struct VirtReg2IndexFunctor : public std::unary_function { /// %physreg17 - a physical register when no TRI instance given. /// /// Usage: OS << PrintReg(Reg, TRI) << '\n'; -Printable PrintReg(unsigned Reg, const TargetRegisterInfo *TRI = nullptr, - unsigned SubRegIdx = 0); +/// +class PrintReg { + const TargetRegisterInfo *TRI; + unsigned Reg; + unsigned SubIdx; +public: + explicit PrintReg(unsigned reg, const TargetRegisterInfo *tri = nullptr, + unsigned subidx = 0) + : TRI(tri), Reg(reg), SubIdx(subidx) {} + void print(raw_ostream&) const; +}; -/// Create Printable object to print register units on a \ref raw_ostream. +static inline raw_ostream &operator<<(raw_ostream &OS, const PrintReg &PR) { + PR.print(OS); + return OS; +} + +/// Helper class for printing register units on a raw_ostream. /// /// Register units are named after their root registers: /// @@ -953,14 +968,54 @@ Printable PrintReg(unsigned Reg, const TargetRegisterInfo *TRI = nullptr, /// FP0~ST7 - Dual roots. 
/// /// Usage: OS << PrintRegUnit(Unit, TRI) << '\n'; -Printable PrintRegUnit(unsigned Unit, const TargetRegisterInfo *TRI); +/// +class PrintRegUnit { +protected: + const TargetRegisterInfo *TRI; + unsigned Unit; +public: + PrintRegUnit(unsigned unit, const TargetRegisterInfo *tri) + : TRI(tri), Unit(unit) {} + void print(raw_ostream&) const; +}; + +static inline raw_ostream &operator<<(raw_ostream &OS, const PrintRegUnit &PR) { + PR.print(OS); + return OS; +} -/// \brief Create Printable object to print virtual registers and physical -/// registers on a \ref raw_ostream. -Printable PrintVRegOrUnit(unsigned VRegOrUnit, const TargetRegisterInfo *TRI); +/// It is often convenient to track virtual registers and +/// physical register units in the same list. +class PrintVRegOrUnit : protected PrintRegUnit { +public: + PrintVRegOrUnit(unsigned VRegOrUnit, const TargetRegisterInfo *tri) + : PrintRegUnit(VRegOrUnit, tri) {} + void print(raw_ostream&) const; +}; + +static inline raw_ostream &operator<<(raw_ostream &OS, + const PrintVRegOrUnit &PR) { + PR.print(OS); + return OS; +} + +/// Helper class for printing lane masks. +/// +/// They are currently printed out as hexadecimal numbers. +/// Usage: OS << PrintLaneMask(Mask); +class PrintLaneMask { +protected: + LaneBitmask LaneMask; +public: + PrintLaneMask(LaneBitmask LaneMask) + : LaneMask(LaneMask) {} + void print(raw_ostream&) const; +}; -/// Create Printable object to print LaneBitmasks on a \ref raw_ostream. -Printable PrintLaneMask(LaneBitmask LaneMask); +static inline raw_ostream &operator<<(raw_ostream &OS, const PrintLaneMask &P) { + P.print(OS); + return OS; +} } // End llvm namespace diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp index f08d616e6812..afa98b26d727 100644 --- a/lib/CodeGen/RegAllocPBQP.cpp +++ b/lib/CodeGen/RegAllocPBQP.cpp @@ -805,17 +805,33 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) { return true; } -/// Create Printable object for node and register info. 
-static Printable PrintNodeInfo(PBQP::RegAlloc::PBQPRAGraph::NodeId NId, - const PBQP::RegAlloc::PBQPRAGraph &G) { - return [NId, &G](raw_ostream &OS) { +namespace { +// A helper class for printing node and register info in a consistent way +class PrintNodeInfo { +public: + typedef PBQP::RegAlloc::PBQPRAGraph Graph; + typedef PBQP::RegAlloc::PBQPRAGraph::NodeId NodeId; + + PrintNodeInfo(NodeId NId, const Graph &G) : G(G), NId(NId) {} + + void print(raw_ostream &OS) const { const MachineRegisterInfo &MRI = G.getMetadata().MF.getRegInfo(); const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); unsigned VReg = G.getNodeMetadata(NId).getVReg(); const char *RegClassName = TRI->getRegClassName(MRI.getRegClass(VReg)); OS << NId << " (" << RegClassName << ':' << PrintReg(VReg, TRI) << ')'; - }; + } + +private: + const Graph &G; + NodeId NId; +}; + +inline raw_ostream &operator<<(raw_ostream &OS, const PrintNodeInfo &PR) { + PR.print(OS); + return OS; } +} // anonymous namespace void PBQP::RegAlloc::PBQPRAGraph::dump(raw_ostream &OS) const { for (auto NId : nodeIds()) { diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 8c3a0f2d81ec..d362f98d6464 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -369,14 +369,25 @@ const char *SDNode::getIndexedModeName(ISD::MemIndexedMode AM) { } } -static Printable PrintNodeId(const SDNode &Node) { - return [&Node](raw_ostream &OS) { +namespace { +class PrintNodeId { + const SDNode &Node; +public: + explicit PrintNodeId(const SDNode &Node) + : Node(Node) {} + void print(raw_ostream &OS) const { #ifndef NDEBUG OS << 't' << Node.PersistentId; #else OS << (const void*)&Node; #endif - }; + } +}; + +static inline raw_ostream &operator<<(raw_ostream &OS, const PrintNodeId &P) { + P.print(OS); + return OS; +} } void SDNode::dump() const { dump(nullptr); } diff --git a/lib/CodeGen/TargetRegisterInfo.cpp b/lib/CodeGen/TargetRegisterInfo.cpp index 839d9ef31ad0..0c4a3dcb226e 100644 --- a/lib/CodeGen/TargetRegisterInfo.cpp +++ b/lib/CodeGen/TargetRegisterInfo.cpp @@ -40,71 +40,58 @@ TargetRegisterInfo::TargetRegisterInfo(const TargetRegisterInfoDesc *ID, TargetRegisterInfo::~TargetRegisterInfo() {} -namespace llvm { - -Printable PrintReg(unsigned Reg, const TargetRegisterInfo *TRI, - unsigned SubIdx) { - return [Reg, TRI, SubIdx](raw_ostream &OS) { - if (!Reg) - OS << "%noreg"; - else if (TargetRegisterInfo::isStackSlot(Reg)) - OS << "SS#" << TargetRegisterInfo::stackSlot2Index(Reg); - else if (TargetRegisterInfo::isVirtualRegister(Reg)) - OS << "%vreg" << TargetRegisterInfo::virtReg2Index(Reg); - else if (TRI && Reg < TRI->getNumRegs()) - OS << '%' << TRI->getName(Reg); +void PrintReg::print(raw_ostream &OS) const { + if (!Reg) + OS << "%noreg"; + else if (TargetRegisterInfo::isStackSlot(Reg)) + OS << "SS#" << TargetRegisterInfo::stackSlot2Index(Reg); + else if (TargetRegisterInfo::isVirtualRegister(Reg)) + OS << "%vreg" << TargetRegisterInfo::virtReg2Index(Reg); + else if (TRI && Reg < TRI->getNumRegs()) + OS << '%' << TRI->getName(Reg); + else + OS << "%physreg" << Reg; + if (SubIdx) { + if (TRI) + OS << ':' << TRI->getSubRegIndexName(SubIdx); else - OS << "%physreg" << Reg; - if (SubIdx) { - if (TRI) - OS << ':' << TRI->getSubRegIndexName(SubIdx); - else - OS << ":sub(" << SubIdx << ')'; - } - }; + OS << ":sub(" << SubIdx << ')'; + } } -Printable PrintRegUnit(unsigned Unit, const TargetRegisterInfo *TRI) { - return [Unit, 
TRI](raw_ostream &OS) { - // Generic printout when TRI is missing. - if (!TRI) { - OS << "Unit~" << Unit; - return; - } +void PrintRegUnit::print(raw_ostream &OS) const { + // Generic printout when TRI is missing. + if (!TRI) { + OS << "Unit~" << Unit; + return; + } - // Check for invalid register units. - if (Unit >= TRI->getNumRegUnits()) { - OS << "BadUnit~" << Unit; - return; - } + // Check for invalid register units. + if (Unit >= TRI->getNumRegUnits()) { + OS << "BadUnit~" << Unit; + return; + } - // Normal units have at least one root. - MCRegUnitRootIterator Roots(Unit, TRI); - assert(Roots.isValid() && "Unit has no roots."); - OS << TRI->getName(*Roots); - for (++Roots; Roots.isValid(); ++Roots) - OS << '~' << TRI->getName(*Roots); - }; + // Normal units have at least one root. + MCRegUnitRootIterator Roots(Unit, TRI); + assert(Roots.isValid() && "Unit has no roots."); + OS << TRI->getName(*Roots); + for (++Roots; Roots.isValid(); ++Roots) + OS << '~' << TRI->getName(*Roots); } -Printable PrintVRegOrUnit(unsigned Unit, const TargetRegisterInfo *TRI) { - return [Unit, TRI](raw_ostream &OS) { - if (TRI && TRI->isVirtualRegister(Unit)) { - OS << "%vreg" << TargetRegisterInfo::virtReg2Index(Unit); - } else { - OS << PrintRegUnit(Unit, TRI); - } - }; +void PrintVRegOrUnit::print(raw_ostream &OS) const { + if (TRI && TRI->isVirtualRegister(Unit)) { + OS << "%vreg" << TargetRegisterInfo::virtReg2Index(Unit); + return; + } + PrintRegUnit::print(OS); } -Printable PrintLaneMask(LaneBitmask LaneMask) { - return [LaneMask](raw_ostream &OS) { - OS << format("%08X", LaneMask); - }; +void PrintLaneMask::print(raw_ostream &OS) const { + OS << format("%08X", LaneMask); } -} // End of llvm namespace - /// getAllocatableClass - Return the maximal subclass of the given register /// class that is alloctable, or NULL. const TargetRegisterClass * diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp index 5b1dceca0bfc..49ef400c5f2d 100644 --- a/lib/Support/raw_ostream.cpp +++ b/lib/Support/raw_ostream.cpp @@ -264,10 +264,6 @@ raw_ostream &raw_ostream::operator<<(double N) { return this->operator<<(format("%e", N)); } -raw_ostream &raw_ostream::operator<<(Printable P) { - P(*this); - return *this; -} void raw_ostream::flush_nonempty() { From e94b2105e93b551b1b1bc9a009eb7c3c622ad5b7 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Thu, 3 Dec 2015 23:07:03 +0000 Subject: [PATCH 033/364] [WebAssembly] Fix dominance check for PHIs in the StoreResult pass When a block has no terminator instructions, getFirstTerminator() returns end(), which can't be used in dominance checks. Check dominance for phi operands separately. Also, remove some bits from WebAssemblyRegStackify.cpp that were causing trouble on the same testcase; they were left behind from an earlier experiment. 
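The shape of the fix, condensed from the WebAssemblyStoreResults.cpp hunk below: a PHI uses its operand on the incoming edge, so dominance is tested against the paired predecessor block rather than against an instruction fished out of that block.

  // Old: map the PHI use to a terminator of the incoming block. If that
  // block has no terminator, getFirstTerminator() is end() and the
  // MDT.dominates(&MI, Where) query is invalid.
  Where = Where->getOperand(&O - &Where->getOperand(0) + 1)
              .getMBB()->getFirstTerminator();

  // New: test block-level dominance for the PHI's incoming edge.
  MachineBasicBlock *Pred =
      Where->getOperand(&O - &Where->getOperand(0) + 1).getMBB();
  if (!MDT.dominates(&MBB, Pred))
    continue;
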
Differential Revision: http://reviews.llvm.org/D15210 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254662 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../WebAssembly/WebAssemblyRegStackify.cpp | 9 ++-- .../WebAssembly/WebAssemblyStoreResults.cpp | 33 +++++++++----- test/CodeGen/WebAssembly/store-results.ll | 43 +++++++++++++++++++ 3 files changed, 69 insertions(+), 16 deletions(-) diff --git a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp index bdccc8577c5e..ecbbc5c72243 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -81,6 +81,7 @@ static void ImposeStackOrdering(MachineInstr *MI) { // more precise. static bool IsSafeToMove(const MachineInstr *Def, const MachineInstr *Insert, AliasAnalysis &AA) { + assert(Def->getParent() == Insert->getParent()); bool SawStore = false, SawSideEffects = false; MachineBasicBlock::const_iterator D(Def), I(Insert); for (--I; I != D; --I) @@ -155,17 +156,15 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { Def->getOpcode() == WebAssembly::ARGUMENT_F64) continue; - // Single-use expression trees require defs that have one use, or that - // they be trivially clonable. + // Single-use expression trees require defs that have one use. // TODO: Eventually we'll relax this, to take advantage of set_local // returning its result. if (!MRI.hasOneUse(Reg)) continue; - // For now, be conservative and don't look across block boundaries, - // unless we have something trivially clonable. + // For now, be conservative and don't look across block boundaries. // TODO: Be more aggressive. - if (Def->getParent() != &MBB && !Def->isMoveImmediate()) + if (Def->getParent() != &MBB) continue; // Don't move instructions that have side effects or memory dependencies diff --git a/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp b/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp index 3a7f50e3b142..4a8fc09878c4 100644 --- a/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp +++ b/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp @@ -10,9 +10,12 @@ /// \file /// \brief This file implements an optimization pass using store result values. /// -/// WebAssembly's store instructions return the stored value, specifically to -/// enable the optimization of reducing get_local/set_local traffic, which is -/// what we're doing here. +/// WebAssembly's store instructions return the stored value. This is to enable +/// an optimization wherein uses of the stored value can be replaced by uses of +/// the store's result value, making the stored value register more likely to +/// be single-use, thus more likely to be useful to register stackifying, and +/// potentially also exposing the store to register stackifying. These both can +/// reduce get_local/set_local traffic. 
/// //===----------------------------------------------------------------------===// @@ -89,14 +92,22 @@ bool WebAssemblyStoreResults::runOnMachineFunction(MachineFunction &MF) { for (auto I = MRI.use_begin(FromReg), E = MRI.use_end(); I != E;) { MachineOperand &O = *I++; MachineInstr *Where = O.getParent(); - if (Where->getOpcode() == TargetOpcode::PHI) - Where = Where->getOperand(&O - &Where->getOperand(0) + 1) - .getMBB() - ->getFirstTerminator(); - if (&MI == Where || !MDT.dominates(&MI, Where)) - continue; - DEBUG(dbgs() << "Setting operand " << O << " in " << *Where << - " from " << MI <<"\n"); + if (Where->getOpcode() == TargetOpcode::PHI) { + // PHIs use their operands on their incoming CFG edges rather than + // in their parent blocks. Get the basic block paired with this use + // of FromReg and check that MI's block dominates it. + MachineBasicBlock *Pred = + Where->getOperand(&O - &Where->getOperand(0) + 1).getMBB(); + if (!MDT.dominates(&MBB, Pred)) + continue; + } else { + // For a non-PHI, check that MI dominates the instruction in the + // normal way. + if (&MI == Where || !MDT.dominates(&MI, Where)) + continue; + } + DEBUG(dbgs() << "Setting operand " << O << " in " << *Where + << " from " << MI << "\n"); O.setReg(ToReg); } } diff --git a/test/CodeGen/WebAssembly/store-results.ll b/test/CodeGen/WebAssembly/store-results.ll index 1bcee5d31fb7..c05ed3a04be3 100644 --- a/test/CodeGen/WebAssembly/store-results.ll +++ b/test/CodeGen/WebAssembly/store-results.ll @@ -16,3 +16,46 @@ entry: store i32 0, i32* %p ret i32 0 } + +; Test interesting corner cases for wasm-store-results, in which the operand of +; a store ends up getting used by a phi, which needs special handling in the +; dominance test, since phis use their operands on their incoming edges. + +%class.Vec3 = type { float, float, float } + +@pos = global %class.Vec3 zeroinitializer, align 4 + +; CHECK-LABEL: foo: +; CHECK: i32.store $discard=, $pop0, $0 +define void @foo() { +for.body.i: + br label %for.body5.i + +for.body5.i: + %i.0168.i = phi i32 [ 0, %for.body.i ], [ %inc.i, %for.body5.i ] + %conv6.i = sitofp i32 %i.0168.i to float + store volatile float 0.0, float* getelementptr inbounds (%class.Vec3, %class.Vec3* @pos, i32 0, i32 0) + %inc.i = add nuw nsw i32 %i.0168.i, 1 + %exitcond.i = icmp eq i32 %inc.i, 256 + br i1 %exitcond.i, label %for.cond.cleanup4.i, label %for.body5.i + +for.cond.cleanup4.i: + ret void +} + +; CHECK-LABEL: bar: +; CHECK: i32.store $discard=, $0, $pop0 +define void @bar() { +for.body.i: + br label %for.body5.i + +for.body5.i: + %i.0168.i = phi float [ 0.0, %for.body.i ], [ %inc.i, %for.body5.i ] + store volatile float 0.0, float* getelementptr inbounds (%class.Vec3, %class.Vec3* @pos, i32 0, i32 0) + %inc.i = fadd float %i.0168.i, 1.0 + %exitcond.i = fcmp oeq float %inc.i, 256.0 + br i1 %exitcond.i, label %for.cond.cleanup4.i, label %for.body5.i + +for.cond.cleanup4.i: + ret void +} From c5aa4e71ff52b61762ef2dfdab5ccba6a49fffe5 Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Thu, 3 Dec 2015 23:28:35 +0000 Subject: [PATCH 034/364] AsmPrinter: Simplify emitting FP elements in sequential data. NFC Use APFloat APIs here Rather than manually type-punning through unions. 
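Condensed, the element loop goes from per-width union punning to a single width-agnostic APFloat path:

  // Before: read the element through a union, duplicated for f32/f64.
  union { float F; uint32_t I; };
  F = CDS->getElementAsFloat(i);
  AP.OutStreamer->EmitIntValue(I, 4);

  // After: one path for either element width.
  APFloat Num = CDS->getElementAsAPFloat(I);
  AP.OutStreamer->EmitIntValue(Num.bitcastToAPInt().getLimitedValue(),
                               ElementByteSize);
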
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254664 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 41 ++++++++++----------------- 1 file changed, 15 insertions(+), 26 deletions(-) diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 9ffd830a9f58..2cfea650872a 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1945,33 +1945,22 @@ static void emitGlobalConstantDataSequential(const DataLayout &DL, AP.OutStreamer->EmitIntValue(CDS->getElementAsInteger(i), ElementByteSize); } - } else if (ElementByteSize == 4) { - // FP Constants are printed as integer constants to avoid losing - // precision. - assert(CDS->getElementType()->isFloatTy()); - for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) { - union { - float F; - uint32_t I; - }; - - F = CDS->getElementAsFloat(i); - if (AP.isVerbose()) - AP.OutStreamer->GetCommentOS() << "float " << F << '\n'; - AP.OutStreamer->EmitIntValue(I, 4); - } } else { - assert(CDS->getElementType()->isDoubleTy()); - for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) { - union { - double F; - uint64_t I; - }; - - F = CDS->getElementAsDouble(i); - if (AP.isVerbose()) - AP.OutStreamer->GetCommentOS() << "double " << F << '\n'; - AP.OutStreamer->EmitIntValue(I, 8); + // FP Constants are printed as integer constants to avoid losing precision. + for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I) { + APFloat Num = CDS->getElementAsAPFloat(I); + if (AP.isVerbose()) { + if (ElementByteSize == 4) + AP.OutStreamer->GetCommentOS() << "float " << Num.convertToFloat() + << '\n'; + else if (ElementByteSize == 8) + AP.OutStreamer->GetCommentOS() << "double " << Num.convertToDouble() + << '\n'; + else + llvm_unreachable("Unexpected float width"); + } + AP.OutStreamer->EmitIntValue(Num.bitcastToAPInt().getLimitedValue(), + ElementByteSize); } } From 922e9787e69e0af586b7f5b370969f99d2f8ea99 Mon Sep 17 00:00:00 2001 From: JF Bastien Date: Thu, 3 Dec 2015 23:43:56 +0000 Subject: [PATCH 035/364] CodeGen peephole: fold redundant phys reg copies Code generation often exposes redundant physical register copies through virtual registers such as: %vreg = COPY %PHYSREG ... %PHYSREG = COPY %vreg There are cases where no intervening clobber of %PHYSREG occurs, and the later copy could therefore be removed. In some cases this further allows us to remove the initial copy. This patch contains a motivating example which comes from the x86 build of Chrome, specifically cc::ResourceProvider::UnlockForRead uses libstdc++'s implementation of hash_map. That example has two tests live at the same time, and after machine sinking LLVM has confused itself enough and things spilling EFLAGS is a great idea even though it's never restored and the comparison results are both live. Before this patch we have: DEC32m %RIP, 1, %noreg, , %noreg, %EFLAGS %vreg1 = COPY %EFLAGS; GR64:%vreg1 %EFLAGS = COPY %vreg1; GR64:%vreg1 JNE_1 , %EFLAGS Both copies are useless. This patch tries to eliminate the later copy in a generic manner. dec is especially confusing to LLVM when compared with sub. I wrote this patch to treat all physical registers generically, but only remove redundant copies of non-allocatable physical registers because the allocatable ones caused issues (e.g. when calling conventions weren't properly modeled) and should be handled later by the register allocator anyways. 
The following tests used to failed when the patch also replaced allocatable registers: CodeGen/X86/StackColoring.ll CodeGen/X86/avx512-calling-conv.ll CodeGen/X86/copy-propagation.ll CodeGen/X86/inline-asm-fpstack.ll CodeGen/X86/musttail-varargs.ll CodeGen/X86/pop-stack-cleanup.ll CodeGen/X86/preserve_mostcc64.ll CodeGen/X86/tailcallstack64.ll CodeGen/X86/this-return-64.ll This happens because COPY has other special meaning for e.g. dependency breakage and x87 FP stack. Note that all other backends' tests pass. Reviewers: qcolombet Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D15157 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254665 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/PeepholeOptimizer.cpp | 144 +++++++++++-- .../X86/peephole-na-phys-copy-folding.ll | 190 ++++++++++++++++++ 2 files changed, 322 insertions(+), 12 deletions(-) create mode 100644 test/CodeGen/X86/peephole-na-phys-copy-folding.ll diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp index 4fd1c4bda433..0fabc40b64e0 100644 --- a/lib/CodeGen/PeepholeOptimizer.cpp +++ b/lib/CodeGen/PeepholeOptimizer.cpp @@ -98,6 +98,10 @@ static cl::opt DisableAdvCopyOpt("disable-adv-copy-opt", cl::Hidden, cl::init(false), cl::desc("Disable advanced copy optimization")); +static cl::opt DisableNAPhysCopyOpt( + "disable-non-allocatable-phys-copy-opt", cl::Hidden, cl::init(false), + cl::desc("Disable non-allocatable physical register copy optimization")); + // Limit the number of PHI instructions to process // in PeepholeOptimizer::getNextSource. static cl::opt RewritePHILimit( @@ -111,6 +115,7 @@ STATISTIC(NumLoadFold, "Number of loads folded"); STATISTIC(NumSelects, "Number of selects optimized"); STATISTIC(NumUncoalescableCopies, "Number of uncoalescable copies optimized"); STATISTIC(NumRewrittenCopies, "Number of copies rewritten"); +STATISTIC(NumNAPhysCopies, "Number of non-allocatable physical copies removed"); namespace { class ValueTrackerResult; @@ -162,12 +167,24 @@ namespace { DenseMap &ImmDefMIs); /// \brief If copy instruction \p MI is a virtual register copy, track it in - /// the set \p CopiedFromRegs and \p CopyMIs. If this virtual register was + /// the set \p CopySrcRegs and \p CopyMIs. If this virtual register was /// previously seen as a copy, replace the uses of this copy with the /// previously seen copy's destination register. bool foldRedundantCopy(MachineInstr *MI, - SmallSet &CopiedFromRegs, - DenseMap &CopyMIs); + SmallSet &CopySrcRegs, + DenseMap &CopyMIs); + + /// \brief Is the register \p Reg a non-allocatable physical register? + bool isNAPhysCopy(unsigned Reg); + + /// \brief If copy instruction \p MI is a non-allocatable virtual<->physical + /// register copy, track it in the \p NAPhysToVirtMIs map. If this + /// non-allocatable physical register was previously copied to a virtual + /// registered and hasn't been clobbered, the virt->phys copy can be + /// deleted. 
+ bool foldRedundantNAPhysCopy( + MachineInstr *MI, + DenseMap &NAPhysToVirtMIs); bool isLoadFoldable(MachineInstr *MI, SmallSet &FoldAsLoadDefCandidates); @@ -1332,7 +1349,7 @@ bool PeepholeOptimizer::foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB, if (ImmDefRegs.count(Reg) == 0) continue; DenseMap::iterator II = ImmDefMIs.find(Reg); - assert(II != ImmDefMIs.end()); + assert(II != ImmDefMIs.end() && "couldn't find immediate definition"); if (TII->FoldImmediate(MI, II->second, Reg, MRI)) { ++NumImmFold; return true; @@ -1356,10 +1373,10 @@ bool PeepholeOptimizer::foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB, // // Should replace %vreg2 uses with %vreg1:sub1 bool PeepholeOptimizer::foldRedundantCopy( - MachineInstr *MI, - SmallSet &CopySrcRegs, - DenseMap &CopyMIs) { - assert(MI->isCopy()); + MachineInstr *MI, + SmallSet &CopySrcRegs, + DenseMap &CopyMIs) { + assert(MI->isCopy() && "expected a COPY machine instruction"); unsigned SrcReg = MI->getOperand(1).getReg(); if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) @@ -1400,6 +1417,59 @@ bool PeepholeOptimizer::foldRedundantCopy( return true; } +bool PeepholeOptimizer::isNAPhysCopy(unsigned Reg) { + return TargetRegisterInfo::isPhysicalRegister(Reg) && + !MRI->isAllocatable(Reg); +} + +bool PeepholeOptimizer::foldRedundantNAPhysCopy( + MachineInstr *MI, DenseMap &NAPhysToVirtMIs) { + assert(MI->isCopy() && "expected a COPY machine instruction"); + + if (DisableNAPhysCopyOpt) + return false; + + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned SrcReg = MI->getOperand(1).getReg(); + if (isNAPhysCopy(SrcReg) && TargetRegisterInfo::isVirtualRegister(DstReg)) { + // %vreg = COPY %PHYSREG + // Avoid using a datastructure which can track multiple live non-allocatable + // phys->virt copies since LLVM doesn't seem to do this. + NAPhysToVirtMIs.insert({SrcReg, MI}); + return false; + } + + if (!(TargetRegisterInfo::isVirtualRegister(SrcReg) && isNAPhysCopy(DstReg))) + return false; + + // %PHYSREG = COPY %vreg + auto PrevCopy = NAPhysToVirtMIs.find(DstReg); + if (PrevCopy == NAPhysToVirtMIs.end()) { + // We can't remove the copy: there was an intervening clobber of the + // non-allocatable physical register after the copy to virtual. + DEBUG(dbgs() << "NAPhysCopy: intervening clobber forbids erasing " << *MI + << '\n'); + return false; + } + + unsigned PrevDstReg = PrevCopy->second->getOperand(0).getReg(); + if (PrevDstReg == SrcReg) { + // Remove the virt->phys copy: we saw the virtual register definition, and + // the non-allocatable physical register's state hasn't changed since then. + DEBUG(dbgs() << "NAPhysCopy: erasing " << *MI << '\n'); + ++NumNAPhysCopies; + return true; + } + + // Potential missed optimization opportunity: we saw a different virtual + // register get a copy of the non-allocatable physical register, and we only + // track one such copy. Avoid getting confused by this new non-allocatable + // physical register definition, and remove it from the tracked copies. + DEBUG(dbgs() << "NAPhysCopy: missed opportunity " << *MI << '\n'); + NAPhysToVirtMIs.erase(PrevCopy); + return false; +} + bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { if (skipOptnoneFunction(*MF.getFunction())) return false; @@ -1433,6 +1503,13 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { DenseMap ImmDefMIs; SmallSet FoldAsLoadDefCandidates; + // Track when a non-allocatable physical register is copied to a virtual + // register so that useless moves can be removed. 
+ // + // %PHYSREG is the map index; MI is the last valid `%vreg = COPY %PHYSREG` + // without any intervening re-definition of %PHYSREG. + DenseMap NAPhysToVirtMIs; + // Set of virtual registers that are copied from. SmallSet CopySrcRegs; DenseMap CopySrcMIs; @@ -1453,10 +1530,51 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { if (MI->isLoadFoldBarrier()) FoldAsLoadDefCandidates.clear(); - if (MI->isPosition() || MI->isPHI() || MI->isImplicitDef() || - MI->isKill() || MI->isInlineAsm() || - MI->hasUnmodeledSideEffects()) + if (MI->isPosition() || MI->isPHI()) + continue; + + if (!MI->isCopy()) { + for (const auto &Op : MI->operands()) { + // Visit all operands: definitions can be implicit or explicit. + if (Op.isReg()) { + unsigned Reg = Op.getReg(); + if (Op.isDef() && isNAPhysCopy(Reg)) { + const auto &Def = NAPhysToVirtMIs.find(Reg); + if (Def != NAPhysToVirtMIs.end()) { + // A new definition of the non-allocatable physical register + // invalidates previous copies. + DEBUG(dbgs() << "NAPhysCopy: invalidating because of " << *MI + << '\n'); + NAPhysToVirtMIs.erase(Def); + } + } + } else if (Op.isRegMask()) { + const uint32_t *RegMask = Op.getRegMask(); + for (auto &RegMI : NAPhysToVirtMIs) { + unsigned Def = RegMI.first; + if (MachineOperand::clobbersPhysReg(RegMask, Def)) { + DEBUG(dbgs() << "NAPhysCopy: invalidating because of " << *MI + << '\n'); + NAPhysToVirtMIs.erase(Def); + } + } + } + } + } + + if (MI->isImplicitDef() || MI->isKill()) + continue; + + if (MI->isInlineAsm() || MI->hasUnmodeledSideEffects()) { + // Blow away all non-allocatable physical registers knowledge since we + // don't know what's correct anymore. + // + // FIXME: handle explicit asm clobbers. + DEBUG(dbgs() << "NAPhysCopy: blowing away all info due to " << *MI + << '\n'); + NAPhysToVirtMIs.clear(); continue; + } if ((isUncoalescableCopy(*MI) && optimizeUncoalescableCopy(MI, LocalMIs)) || @@ -1479,7 +1597,9 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { continue; } - if (MI->isCopy() && foldRedundantCopy(MI, CopySrcRegs, CopySrcMIs)) { + if (MI->isCopy() && + (foldRedundantCopy(MI, CopySrcRegs, CopySrcMIs) || + foldRedundantNAPhysCopy(MI, NAPhysToVirtMIs))) { LocalMIs.erase(MI); MI->eraseFromParent(); Changed = true; diff --git a/test/CodeGen/X86/peephole-na-phys-copy-folding.ll b/test/CodeGen/X86/peephole-na-phys-copy-folding.ll new file mode 100644 index 000000000000..438bf8ddf4c7 --- /dev/null +++ b/test/CodeGen/X86/peephole-na-phys-copy-folding.ll @@ -0,0 +1,190 @@ +; RUN: llc -verify-machineinstrs -mtriple=i386-linux-gnu %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s + +; The peephole optimizer can elide some physical register copies such as +; EFLAGS. Make sure the flags are used directly, instead of needlessly using +; lahf, when possible. + +@L = external global i32 +@M = external global i8 +declare i32 @bar(i64) + +; CHECK-LABEL: plus_one +; CHECK-NOT: seto +; CHECK-NOT: lahf +; CHECK-NOT: sahf +; CHECK-NOT: pushf +; CHECK-NOT: popf +; CHECK: incl L +define i1 @plus_one() { +entry: + %loaded_L = load i32, i32* @L + %val = add nsw i32 %loaded_L, 1 ; N.B. will emit inc. 
+ store i32 %val, i32* @L + %loaded_M = load i8, i8* @M + %masked = and i8 %loaded_M, 8 + %M_is_true = icmp ne i8 %masked, 0 + %L_is_false = icmp eq i32 %val, 0 + %cond = and i1 %L_is_false, %M_is_true + br i1 %cond, label %exit2, label %exit + +exit: + ret i1 true + +exit2: + ret i1 false +} + +; CHECK-LABEL: plus_forty_two +; CHECK-NOT: seto +; CHECK-NOT: lahf +; CHECK-NOT: sahf +; CHECK-NOT: pushf +; CHECK-NOT: popf +; CHECK: addl $42, +define i1 @plus_forty_two() { +entry: + %loaded_L = load i32, i32* @L + %val = add nsw i32 %loaded_L, 42 ; N.B. won't emit inc. + store i32 %val, i32* @L + %loaded_M = load i8, i8* @M + %masked = and i8 %loaded_M, 8 + %M_is_true = icmp ne i8 %masked, 0 + %L_is_false = icmp eq i32 %val, 0 + %cond = and i1 %L_is_false, %M_is_true + br i1 %cond, label %exit2, label %exit + +exit: + ret i1 true + +exit2: + ret i1 false +} + +; CHECK-LABEL: minus_one +; CHECK-NOT: seto +; CHECK-NOT: lahf +; CHECK-NOT: sahf +; CHECK-NOT: pushf +; CHECK-NOT: popf +; CHECK: decl L +define i1 @minus_one() { +entry: + %loaded_L = load i32, i32* @L + %val = add nsw i32 %loaded_L, -1 ; N.B. will emit dec. + store i32 %val, i32* @L + %loaded_M = load i8, i8* @M + %masked = and i8 %loaded_M, 8 + %M_is_true = icmp ne i8 %masked, 0 + %L_is_false = icmp eq i32 %val, 0 + %cond = and i1 %L_is_false, %M_is_true + br i1 %cond, label %exit2, label %exit + +exit: + ret i1 true + +exit2: + ret i1 false +} + +; CHECK-LABEL: minus_forty_two +; CHECK-NOT: seto +; CHECK-NOT: lahf +; CHECK-NOT: sahf +; CHECK-NOT: pushf +; CHECK-NOT: popf +; CHECK: addl $-42, +define i1 @minus_forty_two() { +entry: + %loaded_L = load i32, i32* @L + %val = add nsw i32 %loaded_L, -42 ; N.B. won't emit dec. + store i32 %val, i32* @L + %loaded_M = load i8, i8* @M + %masked = and i8 %loaded_M, 8 + %M_is_true = icmp ne i8 %masked, 0 + %L_is_false = icmp eq i32 %val, 0 + %cond = and i1 %L_is_false, %M_is_true + br i1 %cond, label %exit2, label %exit + +exit: + ret i1 true + +exit2: + ret i1 false +} + +; CHECK-LABEL: test_intervening_call: +; CHECK: cmpxchg +; CHECK: seto %al +; CHECK-NEXT: lahf +; CHECK: call{{[lq]}} bar +; CHECK: addb $127, %al +; CHECK-NEXT: sahf +define i64 @test_intervening_call(i64* %foo, i64 %bar, i64 %baz) { + ; cmpxchg sets EFLAGS, call clobbers it, then br uses EFLAGS. + %cx = cmpxchg i64* %foo, i64 %bar, i64 %baz seq_cst seq_cst + %v = extractvalue { i64, i1 } %cx, 0 + %p = extractvalue { i64, i1 } %cx, 1 + call i32 @bar(i64 %v) + br i1 %p, label %t, label %f + +t: + ret i64 42 + +f: + ret i64 0 +} + +; CHECK-LABEL: test_two_live_flags: +; CHECK: cmpxchg +; CHECK-NEXT: seto %al +; CHECK-NEXT: lahf +; Save result of the first cmpxchg into D. +; CHECK-NEXT: mov{{[lq]}} %[[AX:[er]ax]], %[[D:[re]d[xi]]] +; CHECK: cmpxchg +; CHECK-NEXT: sete %al +; Save result of the second cmpxchg onto the stack. +; CHECK-NEXT: push{{[lq]}} %[[AX]] +; Restore result of the first cmpxchg from D, put it back in EFLAGS. +; CHECK-NEXT: mov{{[lq]}} %[[D]], %[[AX]] +; CHECK-NEXT: addb $127, %al +; CHECK-NEXT: sahf +; Restore result of the second cmpxchg from the stack. +; CHECK-NEXT: pop{{[lq]}} %[[AX]] +; Test from EFLAGS restored from first cmpxchg, jump if that fails. +; CHECK-NEXT: jne +; Fallthrough to test the second cmpxchg's result. 
+; CHECK: testb %al, %al +; CHECK-NEXT: je +define i64 @test_two_live_flags( + i64* %foo0, i64 %bar0, i64 %baz0, + i64* %foo1, i64 %bar1, i64 %baz1) { + %cx0 = cmpxchg i64* %foo0, i64 %bar0, i64 %baz0 seq_cst seq_cst + %p0 = extractvalue { i64, i1 } %cx0, 1 + %cx1 = cmpxchg i64* %foo1, i64 %bar1, i64 %baz1 seq_cst seq_cst + %p1 = extractvalue { i64, i1 } %cx1, 1 + %flag = and i1 %p0, %p1 + br i1 %flag, label %t, label %f + +t: + ret i64 42 + +f: + ret i64 0 +} + +; CHECK-LABEL: asm_clobbering_flags: +; CHECK: test +; CHECK-NEXT: setg +; CHECK-NEXT: #APP +; CHECK-NEXT: bsfl +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl +; CHECK-NEXT: ret +define i1 @asm_clobbering_flags(i32* %mem) { + %val = load i32, i32* %mem, align 4 + %cmp = icmp sgt i32 %val, 0 + %res = tail call i32 asm "bsfl $1,$0", "=r,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i32 %val) + store i32 %res, i32* %mem, align 4 + ret i1 %cmp +} From 34254063b545bbbd722e3a7fba029b3e3653762a Mon Sep 17 00:00:00 2001 From: Rafael Espindola Date: Thu, 3 Dec 2015 23:56:42 +0000 Subject: [PATCH 036/364] Simplify since this function never fails. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254667 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/LTO/LTOModule.h | 2 +- lib/LTO/LTOModule.cpp | 12 ++---------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/include/llvm/LTO/LTOModule.h b/include/llvm/LTO/LTOModule.h index c3e86afe1d82..83a523613a76 100644 --- a/include/llvm/LTO/LTOModule.h +++ b/include/llvm/LTO/LTOModule.h @@ -170,7 +170,7 @@ struct LTOModule { /// Parse the symbols from the module and model-level ASM and add them to /// either the defined or undefined lists. - bool parseSymbols(std::string &errMsg); + void parseSymbols(); /// Add a symbol which isn't defined just yet to a list to be resolved later. void addPotentialUndefinedSymbol(const object::BasicSymbolRef &Sym, diff --git a/lib/LTO/LTOModule.cpp b/lib/LTO/LTOModule.cpp index d28563c23b80..42a568b54c7b 100644 --- a/lib/LTO/LTOModule.cpp +++ b/lib/LTO/LTOModule.cpp @@ -252,11 +252,7 @@ LTOModule *LTOModule::makeLTOModule(MemoryBufferRef Buffer, else Ret = new LTOModule(std::move(IRObj), target); - if (Ret->parseSymbols(errMsg)) { - delete Ret; - return nullptr; - } - + Ret->parseSymbols(); Ret->parseMetadata(); return Ret; @@ -592,9 +588,7 @@ void LTOModule::addPotentialUndefinedSymbol(const object::BasicSymbolRef &Sym, info.symbol = decl; } -/// parseSymbols - Parse the symbols from the module and model-level ASM and add -/// them to either the defined or undefined lists. -bool LTOModule::parseSymbols(std::string &errMsg) { +void LTOModule::parseSymbols() { for (auto &Sym : IRFile->symbols()) { const GlobalValue *GV = IRFile->getSymbolGV(Sym.getRawDataRefImpl()); uint32_t Flags = Sym.getFlags(); @@ -649,8 +643,6 @@ bool LTOModule::parseSymbols(std::string &errMsg) { NameAndAttributes info = u->getValue(); _symbols.push_back(info); } - - return false; } /// parseMetadata - Parse metadata from the module From 7008dd751d0660fee014bb2fcc31c92eb540faa4 Mon Sep 17 00:00:00 2001 From: "Kevin B. Smith" Date: Fri, 4 Dec 2015 00:00:10 +0000 Subject: [PATCH 037/364] [CodeGen] Minor correction to comment on PhysRegInfo. 
Differential revision: http://reviews.llvm.org/D15216 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254668 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/CodeGen/MachineInstrBundle.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/llvm/CodeGen/MachineInstrBundle.h b/include/llvm/CodeGen/MachineInstrBundle.h index 0ce42dda55bd..4ec3c189ae03 100644 --- a/include/llvm/CodeGen/MachineInstrBundle.h +++ b/include/llvm/CodeGen/MachineInstrBundle.h @@ -174,7 +174,7 @@ class MachineOperandIteratorBase { /// Defines - Reg or a super-register is defined. bool Defines; - /// Reads - Read or a super-register is read. + /// Reads - Reg or a super-register is read. bool Reads; /// ReadsOverlap - Reg or an overlapping register is read. From b7250858d96b8ce567681214273ac0e62713c661 Mon Sep 17 00:00:00 2001 From: Nathan Slingerland Date: Fri, 4 Dec 2015 00:00:20 +0000 Subject: [PATCH 038/364] [llvm-profdata] Add support for weighted merge of profile data This change adds support for an optional weight when merging profile data with the llvm-profdata tool. Weights are specified by adding an option ':' suffix to the input file names. Adding support for arbitrary weighting of input profile data allows for relative importance to be placed on the input data from multiple training runs. Both sampled and instrumented profiles are supported. Reviewers: dnovillo, bogner, davidxl Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D14547 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254669 91177308-0d34-0410-b5e6-96231b3b80d8 --- docs/CommandGuide/llvm-profdata.rst | 6 +- include/llvm/ProfileData/InstrProf.h | 31 ++++++-- include/llvm/ProfileData/InstrProfWriter.h | 4 +- include/llvm/ProfileData/SampleProf.h | 54 +++++++++----- lib/ProfileData/InstrProfWriter.cpp | 14 +++- .../Inputs/weight-instr-bar.profdata | Bin 0 -> 1320 bytes .../Inputs/weight-instr-foo.profdata | Bin 0 -> 1320 bytes .../Inputs/weight-sample-bar.proftext | 8 ++ .../Inputs/weight-sample-foo.proftext | 8 ++ test/tools/llvm-profdata/weight-instr.test | 55 ++++++++++++++ test/tools/llvm-profdata/weight-sample.test | 43 +++++++++++ tools/llvm-profdata/llvm-profdata.cpp | 70 ++++++++++++++---- unittests/ProfileData/InstrProfTest.cpp | 20 +++++ 13 files changed, 266 insertions(+), 47 deletions(-) create mode 100644 test/tools/llvm-profdata/Inputs/weight-instr-bar.profdata create mode 100644 test/tools/llvm-profdata/Inputs/weight-instr-foo.profdata create mode 100644 test/tools/llvm-profdata/Inputs/weight-sample-bar.proftext create mode 100644 test/tools/llvm-profdata/Inputs/weight-sample-foo.proftext create mode 100644 test/tools/llvm-profdata/weight-instr.test create mode 100644 test/tools/llvm-profdata/weight-sample.test diff --git a/docs/CommandGuide/llvm-profdata.rst b/docs/CommandGuide/llvm-profdata.rst index 210826a7babc..a4b18f301e42 100644 --- a/docs/CommandGuide/llvm-profdata.rst +++ b/docs/CommandGuide/llvm-profdata.rst @@ -28,7 +28,7 @@ MERGE SYNOPSIS ^^^^^^^^ -:program:`llvm-profdata merge` [*options*] [*filenames...*] +:program:`llvm-profdata merge` [*options*] [*filename[:weight]...*] DESCRIPTION ^^^^^^^^^^^ @@ -37,6 +37,10 @@ DESCRIPTION generated by PGO instrumentation and merges them together into a single indexed profile data file. +The profile counts in each input file can be scaled (multiplied) by specifying +``:``, where `` is a decimal integer >= 1. +A default weight of 1 is assumed if only `` is given. 
+ OPTIONS ^^^^^^^ diff --git a/include/llvm/ProfileData/InstrProf.h b/include/llvm/ProfileData/InstrProf.h index 956485119102..e1ed2e9ce48c 100644 --- a/include/llvm/ProfileData/InstrProf.h +++ b/include/llvm/ProfileData/InstrProf.h @@ -218,7 +218,8 @@ struct InstrProfValueSiteRecord { } /// Merge data from another InstrProfValueSiteRecord - void mergeValueData(InstrProfValueSiteRecord &Input) { + /// Optionally scale merged counts by \p Weight. + void mergeValueData(InstrProfValueSiteRecord &Input, uint64_t Weight = 1) { this->sortByTargetValues(); Input.sortByTargetValues(); auto I = ValueData.begin(); @@ -228,7 +229,11 @@ struct InstrProfValueSiteRecord { while (I != IE && I->Value < J->Value) ++I; if (I != IE && I->Value == J->Value) { - I->Count = SaturatingAdd(I->Count, J->Count); + // TODO: Check for counter overflow and return error if it occurs. + uint64_t JCount = J->Count; + if (Weight > 1) + JCount = SaturatingMultiply(JCount, Weight); + I->Count = SaturatingAdd(I->Count, JCount); ++I; continue; } @@ -274,7 +279,8 @@ struct InstrProfRecord { ValueMapType *HashKeys); /// Merge the counts in \p Other into this one. - inline instrprof_error merge(InstrProfRecord &Other); + /// Optionally scale merged counts by \p Weight. + inline instrprof_error merge(InstrProfRecord &Other, uint64_t Weight = 1); /// Used by InstrProfWriter: update the value strings to commoned strings in /// the writer instance. @@ -326,7 +332,9 @@ struct InstrProfRecord { } // Merge Value Profile data from Src record to this record for ValueKind. - instrprof_error mergeValueProfData(uint32_t ValueKind, InstrProfRecord &Src) { + // Scale merged value counts by \p Weight. + instrprof_error mergeValueProfData(uint32_t ValueKind, InstrProfRecord &Src, + uint64_t Weight) { uint32_t ThisNumValueSites = getNumValueSites(ValueKind); uint32_t OtherNumValueSites = Src.getNumValueSites(ValueKind); if (ThisNumValueSites != OtherNumValueSites) @@ -336,7 +344,7 @@ struct InstrProfRecord { std::vector &OtherSiteRecords = Src.getValueSitesForKind(ValueKind); for (uint32_t I = 0; I < ThisNumValueSites; I++) - ThisSiteRecords[I].mergeValueData(OtherSiteRecords[I]); + ThisSiteRecords[I].mergeValueData(OtherSiteRecords[I], Weight); return instrprof_error::success; } }; @@ -422,7 +430,8 @@ void InstrProfRecord::updateStrings(InstrProfStringTable *StrTab) { VData.Value = (uint64_t)StrTab->insertString((const char *)VData.Value); } -instrprof_error InstrProfRecord::merge(InstrProfRecord &Other) { +instrprof_error InstrProfRecord::merge(InstrProfRecord &Other, + uint64_t Weight) { // If the number of counters doesn't match we either have bad data // or a hash collision. 
if (Counts.size() != Other.Counts.size()) @@ -432,13 +441,19 @@ instrprof_error InstrProfRecord::merge(InstrProfRecord &Other) { for (size_t I = 0, E = Other.Counts.size(); I < E; ++I) { bool ResultOverflowed; - Counts[I] = SaturatingAdd(Counts[I], Other.Counts[I], ResultOverflowed); + uint64_t OtherCount = Other.Counts[I]; + if (Weight > 1) { + OtherCount = SaturatingMultiply(OtherCount, Weight, ResultOverflowed); + if (ResultOverflowed) + Result = instrprof_error::counter_overflow; + } + Counts[I] = SaturatingAdd(Counts[I], OtherCount, ResultOverflowed); if (ResultOverflowed) Result = instrprof_error::counter_overflow; } for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) { - instrprof_error MergeValueResult = mergeValueProfData(Kind, Other); + instrprof_error MergeValueResult = mergeValueProfData(Kind, Other, Weight); if (MergeValueResult != instrprof_error::success) Result = MergeValueResult; } diff --git a/include/llvm/ProfileData/InstrProfWriter.h b/include/llvm/ProfileData/InstrProfWriter.h index d026e08ec861..1958d5f232e7 100644 --- a/include/llvm/ProfileData/InstrProfWriter.h +++ b/include/llvm/ProfileData/InstrProfWriter.h @@ -39,8 +39,8 @@ class InstrProfWriter { void updateStringTableReferences(InstrProfRecord &I); /// Add function counts for the given function. If there are already counts /// for this function and the hash and number of counts match, each counter is - /// summed. - std::error_code addRecord(InstrProfRecord &&I); + /// summed. Optionally scale counts by \p Weight. + std::error_code addRecord(InstrProfRecord &&I, uint64_t Weight = 1); /// Write the profile to \c OS void write(raw_fd_ostream &OS); /// Write the profile in text format to \c OS diff --git a/include/llvm/ProfileData/SampleProf.h b/include/llvm/ProfileData/SampleProf.h index a7b22c735480..3337f4d7df5c 100644 --- a/include/llvm/ProfileData/SampleProf.h +++ b/include/llvm/ProfileData/SampleProf.h @@ -173,19 +173,25 @@ class SampleRecord { SampleRecord() : NumSamples(0), CallTargets() {} /// Increment the number of samples for this record by \p S. + /// Optionally scale sample count \p S by \p Weight. /// /// Sample counts accumulate using saturating arithmetic, to avoid wrapping /// around unsigned integers. - void addSamples(uint64_t S) { + void addSamples(uint64_t S, uint64_t Weight = 1) { + if (Weight > 1) + S = SaturatingMultiply(S, Weight); NumSamples = SaturatingAdd(NumSamples, S); } /// Add called function \p F with samples \p S. + /// Optionally scale sample count \p S by \p Weight. /// /// Sample counts accumulate using saturating arithmetic, to avoid wrapping /// around unsigned integers. - void addCalledTarget(StringRef F, uint64_t S) { + void addCalledTarget(StringRef F, uint64_t S, uint64_t Weight = 1) { uint64_t &TargetSamples = CallTargets[F]; + if (Weight > 1) + S = SaturatingMultiply(S, Weight); TargetSamples = SaturatingAdd(TargetSamples, S); } @@ -196,10 +202,11 @@ class SampleRecord { const CallTargetMap &getCallTargets() const { return CallTargets; } /// Merge the samples in \p Other into this record. - void merge(const SampleRecord &Other) { - addSamples(Other.getSamples()); + /// Optionally scale sample counts by \p Weight. 
+ void merge(const SampleRecord &Other, uint64_t Weight = 1) { + addSamples(Other.getSamples(), Weight); for (const auto &I : Other.getCallTargets()) - addCalledTarget(I.first(), I.second); + addCalledTarget(I.first(), I.second, Weight); } void print(raw_ostream &OS, unsigned Indent) const; @@ -226,16 +233,26 @@ class FunctionSamples { FunctionSamples() : TotalSamples(0), TotalHeadSamples(0) {} void print(raw_ostream &OS = dbgs(), unsigned Indent = 0) const; void dump() const; - void addTotalSamples(uint64_t Num) { TotalSamples += Num; } - void addHeadSamples(uint64_t Num) { TotalHeadSamples += Num; } - void addBodySamples(uint32_t LineOffset, uint32_t Discriminator, - uint64_t Num) { - BodySamples[LineLocation(LineOffset, Discriminator)].addSamples(Num); + void addTotalSamples(uint64_t Num, uint64_t Weight = 1) { + if (Weight > 1) + Num = SaturatingMultiply(Num, Weight); + TotalSamples += Num; + } + void addHeadSamples(uint64_t Num, uint64_t Weight = 1) { + if (Weight > 1) + Num = SaturatingMultiply(Num, Weight); + TotalHeadSamples += Num; + } + void addBodySamples(uint32_t LineOffset, uint32_t Discriminator, uint64_t Num, + uint64_t Weight = 1) { + BodySamples[LineLocation(LineOffset, Discriminator)].addSamples(Num, + Weight); } void addCalledTargetSamples(uint32_t LineOffset, uint32_t Discriminator, - std::string FName, uint64_t Num) { - BodySamples[LineLocation(LineOffset, Discriminator)].addCalledTarget(FName, - Num); + std::string FName, uint64_t Num, + uint64_t Weight = 1) { + BodySamples[LineLocation(LineOffset, Discriminator)].addCalledTarget( + FName, Num, Weight); } /// Return the number of samples collected at the given location. @@ -284,18 +301,19 @@ class FunctionSamples { } /// Merge the samples in \p Other into this one. - void merge(const FunctionSamples &Other) { - addTotalSamples(Other.getTotalSamples()); - addHeadSamples(Other.getHeadSamples()); + /// Optionally scale samples by \p Weight. + void merge(const FunctionSamples &Other, uint64_t Weight = 1) { + addTotalSamples(Other.getTotalSamples(), Weight); + addHeadSamples(Other.getHeadSamples(), Weight); for (const auto &I : Other.getBodySamples()) { const LineLocation &Loc = I.first; const SampleRecord &Rec = I.second; - BodySamples[Loc].merge(Rec); + BodySamples[Loc].merge(Rec, Weight); } for (const auto &I : Other.getCallsiteSamples()) { const CallsiteLocation &Loc = I.first; const FunctionSamples &Rec = I.second; - functionSamplesAt(Loc).merge(Rec); + functionSamplesAt(Loc).merge(Rec, Weight); } } diff --git a/lib/ProfileData/InstrProfWriter.cpp b/lib/ProfileData/InstrProfWriter.cpp index 78bec012eeb2..2261c92f03a9 100644 --- a/lib/ProfileData/InstrProfWriter.cpp +++ b/lib/ProfileData/InstrProfWriter.cpp @@ -98,7 +98,8 @@ void InstrProfWriter::updateStringTableReferences(InstrProfRecord &I) { I.updateStrings(&StringTable); } -std::error_code InstrProfWriter::addRecord(InstrProfRecord &&I) { +std::error_code InstrProfWriter::addRecord(InstrProfRecord &&I, + uint64_t Weight) { updateStringTableReferences(I); auto &ProfileDataMap = FunctionData[I.Name]; @@ -113,9 +114,18 @@ std::error_code InstrProfWriter::addRecord(InstrProfRecord &&I) { // We've never seen a function with this name and hash, add it. 
Dest = std::move(I); Result = instrprof_error::success; + if (Weight > 1) { + for (auto &Count : Dest.Counts) { + bool Overflowed; + Count = SaturatingMultiply(Count, Weight, Overflowed); + if (Overflowed && Result == instrprof_error::success) { + Result = instrprof_error::counter_overflow; + } + } + } } else { // We're updating a function we've seen before. - Result = Dest.merge(I); + Result = Dest.merge(I, Weight); } // We keep track of the max function count as we go for simplicity. diff --git a/test/tools/llvm-profdata/Inputs/weight-instr-bar.profdata b/test/tools/llvm-profdata/Inputs/weight-instr-bar.profdata new file mode 100644 index 0000000000000000000000000000000000000000..4ed07660f654090e750b19be4e0af609bc1c61db GIT binary patch literal 1320 zcmeyLQ&5zjmf6V600ExHYmK2yFeL$%U}Tt_rlBWzFff!ADy;yeON$fJQ=x1IMi>K1 zb3kcEhBbR7Zu=bi5d*Wx04kG~pWnpK1 zb3kcEhBbR7Zu=bi5d*Wx04kG~pWnpjI*ICuRlWs>LPl7M$03g{ F3;+VlD0u(? literal 0 HcmV?d00001 diff --git a/test/tools/llvm-profdata/Inputs/weight-sample-bar.proftext b/test/tools/llvm-profdata/Inputs/weight-sample-bar.proftext new file mode 100644 index 000000000000..a910f745e6c7 --- /dev/null +++ b/test/tools/llvm-profdata/Inputs/weight-sample-bar.proftext @@ -0,0 +1,8 @@ +bar:1772037:35370 + 17: 35370 + 18: 35370 + 19: 7005 + 20: 29407 + 21: 12170 + 23: 18150 bar:19829 + 25: 36666 diff --git a/test/tools/llvm-profdata/Inputs/weight-sample-foo.proftext b/test/tools/llvm-profdata/Inputs/weight-sample-foo.proftext new file mode 100644 index 000000000000..155ec5d00315 --- /dev/null +++ b/test/tools/llvm-profdata/Inputs/weight-sample-foo.proftext @@ -0,0 +1,8 @@ +foo:1763288:35327 + 7: 35327 + 8: 35327 + 9: 6930 + 10: 29341 + 11: 11906 + 13: 18185 foo:19531 + 15: 36458 diff --git a/test/tools/llvm-profdata/weight-instr.test b/test/tools/llvm-profdata/weight-instr.test new file mode 100644 index 000000000000..bc0b5061647f --- /dev/null +++ b/test/tools/llvm-profdata/weight-instr.test @@ -0,0 +1,55 @@ +Tests for weighted merge of instrumented profiles. 
+ +1- Merge the foo and bar profiles with unity weight and verify the combined output +RUN: llvm-profdata merge --instr %p/Inputs/weight-instr-bar.profdata:1 %p/Inputs/weight-instr-foo.profdata:1 -o %t +RUN: llvm-profdata show --instr -all-functions %t | FileCheck %s --check-prefix=WEIGHT1 +WEIGHT1: Counters: +WEIGHT1: usage: +WEIGHT1: Hash: 0x0000000000000000 +WEIGHT1: Counters: 1 +WEIGHT1: Function count: 0 +WEIGHT1: foo: +WEIGHT1: Hash: 0x000000000000028a +WEIGHT1: Counters: 3 +WEIGHT1: Function count: 866988873 +WEIGHT1: bar: +WEIGHT1: Hash: 0x000000000000028a +WEIGHT1: Counters: 3 +WEIGHT1: Function count: 866988873 +WEIGHT1: main: +WEIGHT1: Hash: 0x7d31c47ea98f8248 +WEIGHT1: Counters: 60 +WEIGHT1: Function count: 2 +WEIGHT1: Functions shown: 4 +WEIGHT1: Total functions: 4 +WEIGHT1: Maximum function count: 866988873 +WEIGHT1: Maximum internal block count: 267914296 + +2- Merge the foo and bar profiles with weight 3x and 5x respectively and verify the combined output +RUN: llvm-profdata merge --instr %p/Inputs/weight-instr-bar.profdata:3 %p/Inputs/weight-instr-foo.profdata:5 -o %t +RUN: llvm-profdata show --instr -all-functions %t | FileCheck %s --check-prefix=WEIGHT2 +WEIGHT2: Counters: +WEIGHT2: usage: +WEIGHT2: Hash: 0x0000000000000000 +WEIGHT2: Counters: 1 +WEIGHT2: Function count: 0 +WEIGHT2: foo: +WEIGHT2: Hash: 0x000000000000028a +WEIGHT2: Counters: 3 +WEIGHT2: Function count: 4334944365 +WEIGHT2: bar: +WEIGHT2: Hash: 0x000000000000028a +WEIGHT2: Counters: 3 +WEIGHT2: Function count: 2600966619 +WEIGHT2: main: +WEIGHT2: Hash: 0x7d31c47ea98f8248 +WEIGHT2: Counters: 60 +WEIGHT2: Function count: 8 +WEIGHT2: Functions shown: 4 +WEIGHT2: Total functions: 4 +WEIGHT2: Maximum function count: 4334944365 +WEIGHT2: Maximum internal block count: 1339571480 + +3- Bad merge: foo and bar profiles with invalid weights +RUN: not llvm-profdata merge --instr %p/Inputs/weight-instr-bar.profdata:3 %p/Inputs/weight-instr-foo.profdata:-5 -o %t.out 2>&1 | FileCheck %s --check-prefix=ERROR3 +ERROR3: error: Input weight must be a positive integer. diff --git a/test/tools/llvm-profdata/weight-sample.test b/test/tools/llvm-profdata/weight-sample.test new file mode 100644 index 000000000000..a1fe1df1b6de --- /dev/null +++ b/test/tools/llvm-profdata/weight-sample.test @@ -0,0 +1,43 @@ +Tests for weighted merge of sample profiles. 
+ +1- Merge the foo and bar profiles with unity weight and verify the combined output +RUN: llvm-profdata merge --sample --text %p/Inputs/weight-sample-bar.proftext:1 %p/Inputs/weight-sample-foo.proftext:1 -o - | FileCheck %s --check-prefix=WEIGHT1 +WEIGHT1: foo:1763288:35327 +WEIGHT1: 7: 35327 +WEIGHT1: 8: 35327 +WEIGHT1: 9: 6930 +WEIGHT1: 10: 29341 +WEIGHT1: 11: 11906 +WEIGHT1: 13: 18185 foo:19531 +WEIGHT1: 15: 36458 +WEIGHT1: bar:1772037:35370 +WEIGHT1: 17: 35370 +WEIGHT1: 18: 35370 +WEIGHT1: 19: 7005 +WEIGHT1: 20: 29407 +WEIGHT1: 21: 12170 +WEIGHT1: 23: 18150 bar:19829 +WEIGHT1: 25: 36666 + +2- Merge the foo and bar profiles with weight 3x and 5x respectively and verify the combined output +RUN: llvm-profdata merge --sample --text %p/Inputs/weight-sample-bar.proftext:3 %p/Inputs/weight-sample-foo.proftext:5 -o - | FileCheck %s --check-prefix=WEIGHT2 +WEIGHT2: foo:8816440:176635 +WEIGHT2: 7: 176635 +WEIGHT2: 8: 176635 +WEIGHT2: 9: 34650 +WEIGHT2: 10: 146705 +WEIGHT2: 11: 59530 +WEIGHT2: 13: 90925 foo:97655 +WEIGHT2: 15: 182290 +WEIGHT2: bar:5316111:106110 +WEIGHT2: 17: 106110 +WEIGHT2: 18: 106110 +WEIGHT2: 19: 21015 +WEIGHT2: 20: 88221 +WEIGHT2: 21: 36510 +WEIGHT2: 23: 54450 bar:59487 +WEIGHT2: 25: 109998 + +3- Bad merge: foo and bar profiles with invalid weights +RUN: not llvm-profdata merge --sample --text %p/Inputs/weight-sample-bar.proftext:3 %p/Inputs/weight-sample-foo.proftext:-5 -o %t.out 2>&1 | FileCheck %s --check-prefix=ERROR3 +ERROR3: error: Input weight must be a positive integer. diff --git a/tools/llvm-profdata/llvm-profdata.cpp b/tools/llvm-profdata/llvm-profdata.cpp index 10b6855233d5..56c80f518ec4 100644 --- a/tools/llvm-profdata/llvm-profdata.cpp +++ b/tools/llvm-profdata/llvm-profdata.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/LLVMContext.h" #include "llvm/ProfileData/InstrProfReader.h" @@ -27,6 +28,7 @@ #include "llvm/Support/PrettyStackTrace.h" #include "llvm/Support/Signals.h" #include "llvm/Support/raw_ostream.h" +#include using namespace llvm; @@ -93,7 +95,17 @@ static void handleMergeWriterError(std::error_code &Error, } } -static void mergeInstrProfile(const cl::list &Inputs, +struct WeightedFile { + StringRef Filename; + uint64_t Weight; + + WeightedFile() {} + + WeightedFile(StringRef F, uint64_t W) : Filename{F}, Weight{W} {} +}; +typedef SmallVector WeightedFileVector; + +static void mergeInstrProfile(const WeightedFileVector &Inputs, StringRef OutputFilename, ProfileFormat OutputFormat) { if (OutputFilename.compare("-") == 0) @@ -109,21 +121,21 @@ static void mergeInstrProfile(const cl::list &Inputs, InstrProfWriter Writer; SmallSet WriterErrorCodes; - for (const auto &Filename : Inputs) { - auto ReaderOrErr = InstrProfReader::create(Filename); + for (const auto &Input : Inputs) { + auto ReaderOrErr = InstrProfReader::create(Input.Filename); if (std::error_code ec = ReaderOrErr.getError()) - exitWithErrorCode(ec, Filename); + exitWithErrorCode(ec, Input.Filename); auto Reader = std::move(ReaderOrErr.get()); for (auto &I : *Reader) { - if (std::error_code EC = Writer.addRecord(std::move(I))) { + if (std::error_code EC = Writer.addRecord(std::move(I), Input.Weight)) { // Only show hint the first time an error occurs. 
bool firstTime = WriterErrorCodes.insert(EC).second; - handleMergeWriterError(EC, Filename, I.Name, firstTime); + handleMergeWriterError(EC, Input.Filename, I.Name, firstTime); } } if (Reader->hasError()) - exitWithErrorCode(Reader->getError(), Filename); + exitWithErrorCode(Reader->getError(), Input.Filename); } if (OutputFormat == PF_Text) Writer.writeText(Output); @@ -135,7 +147,7 @@ static sampleprof::SampleProfileFormat FormatMap[] = { sampleprof::SPF_None, sampleprof::SPF_Text, sampleprof::SPF_Binary, sampleprof::SPF_GCC}; -static void mergeSampleProfile(const cl::list &Inputs, +static void mergeSampleProfile(const WeightedFileVector &Inputs, StringRef OutputFilename, ProfileFormat OutputFormat) { using namespace sampleprof; @@ -147,11 +159,11 @@ static void mergeSampleProfile(const cl::list &Inputs, auto Writer = std::move(WriterOrErr.get()); StringMap ProfileMap; SmallVector, 5> Readers; - for (const auto &Filename : Inputs) { + for (const auto &Input : Inputs) { auto ReaderOrErr = - SampleProfileReader::create(Filename, getGlobalContext()); + SampleProfileReader::create(Input.Filename, getGlobalContext()); if (std::error_code EC = ReaderOrErr.getError()) - exitWithErrorCode(EC, Filename); + exitWithErrorCode(EC, Input.Filename); // We need to keep the readers around until after all the files are // read so that we do not lose the function names stored in each @@ -160,7 +172,7 @@ static void mergeSampleProfile(const cl::list &Inputs, Readers.push_back(std::move(ReaderOrErr.get())); const auto Reader = Readers.back().get(); if (std::error_code EC = Reader->read()) - exitWithErrorCode(EC, Filename); + exitWithErrorCode(EC, Input.Filename); StringMap &Profiles = Reader->getProfiles(); for (StringMap::iterator I = Profiles.begin(), @@ -168,15 +180,38 @@ static void mergeSampleProfile(const cl::list &Inputs, I != E; ++I) { StringRef FName = I->first(); FunctionSamples &Samples = I->second; - ProfileMap[FName].merge(Samples); + ProfileMap[FName].merge(Samples, Input.Weight); } } Writer->write(ProfileMap); } +static void parseInputFiles(const cl::list &Inputs, + WeightedFileVector &WeightedInputs) { + WeightedInputs.reserve(Inputs.size()); + + for (StringRef Input : Inputs) { + StringRef FileName; + StringRef WeightStr; + std::tie(FileName, WeightStr) = Input.rsplit(':'); + if (WeightStr.empty() || sys::fs::exists(Input)) { + // No weight specified or valid path containing delimiter. + WeightedInputs.push_back(WeightedFile(Input, 1)); + } else { + // Input weight specified. + uint64_t Weight; + if (WeightStr.getAsInteger(10, Weight) || Weight < 1) { + // Invalid input weight. 
+ exitWithError("Input weight must be a positive integer."); + } + WeightedInputs.push_back(WeightedFile(FileName, Weight)); + } + } +} + static int merge_main(int argc, const char *argv[]) { cl::list Inputs(cl::Positional, cl::Required, cl::OneOrMore, - cl::desc("")); + cl::desc("")); cl::opt OutputFilename("output", cl::value_desc("output"), cl::init("-"), cl::Required, @@ -198,10 +233,13 @@ static int merge_main(int argc, const char *argv[]) { cl::ParseCommandLineOptions(argc, argv, "LLVM profile data merger\n"); + WeightedFileVector WeightedInputs; + parseInputFiles(Inputs, WeightedInputs); + if (ProfileKind == instr) - mergeInstrProfile(Inputs, OutputFilename, OutputFormat); + mergeInstrProfile(WeightedInputs, OutputFilename, OutputFormat); else - mergeSampleProfile(Inputs, OutputFilename, OutputFormat); + mergeSampleProfile(WeightedInputs, OutputFilename, OutputFormat); return 0; } diff --git a/unittests/ProfileData/InstrProfTest.cpp b/unittests/ProfileData/InstrProfTest.cpp index 635a5431a513..946afdadba93 100644 --- a/unittests/ProfileData/InstrProfTest.cpp +++ b/unittests/ProfileData/InstrProfTest.cpp @@ -490,4 +490,24 @@ TEST_F(InstrProfTest, get_max_function_count) { ASSERT_EQ(1ULL << 63, Reader->getMaximumFunctionCount()); } +TEST_F(InstrProfTest, get_weighted_function_counts) { + InstrProfRecord Record1("foo", 0x1234, {1, 2}); + InstrProfRecord Record2("foo", 0x1235, {3, 4}); + Writer.addRecord(std::move(Record1), 3); + Writer.addRecord(std::move(Record2), 5); + auto Profile = Writer.writeBuffer(); + readProfile(std::move(Profile)); + + std::vector Counts; + ASSERT_TRUE(NoError(Reader->getFunctionCounts("foo", 0x1234, Counts))); + ASSERT_EQ(2U, Counts.size()); + ASSERT_EQ(3U, Counts[0]); + ASSERT_EQ(6U, Counts[1]); + + ASSERT_TRUE(NoError(Reader->getFunctionCounts("foo", 0x1235, Counts))); + ASSERT_EQ(2U, Counts.size()); + ASSERT_EQ(15U, Counts[0]); + ASSERT_EQ(20U, Counts[1]); +} + } // end anonymous namespace From 21aabdad38b4f6284e44df5456b8f8f4a844c5c8 Mon Sep 17 00:00:00 2001 From: Cong Hou Date: Fri, 4 Dec 2015 00:36:58 +0000 Subject: [PATCH 039/364] Don't punish vectorized arithmetic instruction whose type will be split to multiple registers Currently in LLVM's cost model, a vectorized arithmetic instruction will have high cost if its type is split into multiple registers. However, this punishment is too heavy and unnecessary. The overhead of the split should not be on arithmetic instructions but instructions that implement the split. Note that during vectorization we have calculated the register pressure, and we only choose proper interleaving factor (and also vectorization factor) so that we don't use more registers than the maximum number. Here is a very simple example: if a vadd has the cost 1, and if we double VF so that we need two registers to perform it, then its cost will become 4 with the current implementation, which will prevent us to use larger VF. 
Differential revision: http://reviews.llvm.org/D15159 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254671 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/CodeGen/BasicTTIImpl.h | 6 +----- test/Analysis/CostModel/X86/reduction.ll | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h index e2245e9984b8..ec311a093869 100644 --- a/include/llvm/CodeGen/BasicTTIImpl.h +++ b/include/llvm/CodeGen/BasicTTIImpl.h @@ -302,12 +302,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { // The operation is legal. Assume it costs 1. - // If the type is split to multiple registers, assume that there is some - // overhead to this. // TODO: Once we have extract/insert subvector cost we need to use them. - if (LT.first > 1) - return LT.first * 2 * OpCost; - return LT.first * 1 * OpCost; + return LT.first * OpCost; } if (!TLI->isOperationExpand(ISD, LT.second)) { diff --git a/test/Analysis/CostModel/X86/reduction.ll b/test/Analysis/CostModel/X86/reduction.ll index 78e65aee1460..aaafe07c1eb8 100644 --- a/test/Analysis/CostModel/X86/reduction.ll +++ b/test/Analysis/CostModel/X86/reduction.ll @@ -33,7 +33,7 @@ define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) { %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 ; CHECK-LABEL: reduction_cost_int -; CHECK: cost of 23 {{.*}} extractelement +; CHECK: cost of 17 {{.*}} extractelement %r = extractelement <8 x i32> %bin.rdx.3, i32 0 ret i32 %r From cb1cb45c602578eeb8c6894e86206bade019d044 Mon Sep 17 00:00:00 2001 From: Evgeniy Stepanov Date: Fri, 4 Dec 2015 00:45:43 +0000 Subject: [PATCH 040/364] Emit function alias to data as a function symbol. CFI emits jump slots for indirect functions as a byte array constant, and declares function-typed aliases to these constants. This change fixes AsmPrinter to emit these aliases as function symbols and not data symbols. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254674 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 5 +++++ test/CodeGen/Generic/function-alias.ll | 12 ++++++++++++ 2 files changed, 17 insertions(+) create mode 100644 test/CodeGen/Generic/function-alias.ll diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 2cfea650872a..b8604240b5d9 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1187,6 +1187,11 @@ bool AsmPrinter::doFinalization(Module &M) { else assert(Alias.hasLocalLinkage() && "Invalid alias linkage"); + // Set the symbol type to function if the alias has a function type. + // This affects codegen when the aliasee is not a function. 
+ if (Alias.getType()->getPointerElementType()->isFunctionTy()) + OutStreamer->EmitSymbolAttribute(Name, MCSA_ELF_TypeFunction); + EmitVisibility(Name, Alias.getVisibility()); // Emit the directives as assignments aka .set: diff --git a/test/CodeGen/Generic/function-alias.ll b/test/CodeGen/Generic/function-alias.ll new file mode 100644 index 000000000000..d68d75d5578a --- /dev/null +++ b/test/CodeGen/Generic/function-alias.ll @@ -0,0 +1,12 @@ +; RUN: llc < %s | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; "data" constant +@0 = private constant <{ i8, i8 }> <{i8 15, i8 11}>, section ".text" + +; function-typed alias +@ud2 = alias void (), bitcast (<{ i8, i8 }>* @0 to void ()*) + +; Check that "ud2" is emitted as a function symbol. +; CHECK: .type{{.*}}ud2,@function From f015928ee8d3b4b918ed5c1d3ea3327417a82be0 Mon Sep 17 00:00:00 2001 From: Rafael Espindola Date: Fri, 4 Dec 2015 00:45:57 +0000 Subject: [PATCH 041/364] Simplify the error handling in llvm-lto a bit. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254675 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/llvm-lto/llvm-lto.cpp | 73 +++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/tools/llvm-lto/llvm-lto.cpp b/tools/llvm-lto/llvm-lto.cpp index 64e0ae31d06a..86b95577b307 100644 --- a/tools/llvm-lto/llvm-lto.cpp +++ b/tools/llvm-lto/llvm-lto.cpp @@ -150,18 +150,34 @@ static void diagnosticHandler(const DiagnosticInfo &DI) { exit(1); } +static void error(const Twine &Msg) { + errs() << "llvm-lto: " << Msg << '\n'; + exit(1); +} + +static void error(std::error_code EC, const Twine &Prefix) { + if (EC) + error(Prefix + ": " + EC.message()); +} + +template +static void error(const ErrorOr &V, const Twine &Prefix) { + error(V.getError(), Prefix); +} + static std::unique_ptr getLocalLTOModule(StringRef Path, std::unique_ptr &Buffer, - const TargetOptions &Options, std::string &Error) { + const TargetOptions &Options) { ErrorOr> BufferOrErr = MemoryBuffer::getFile(Path); - if (std::error_code EC = BufferOrErr.getError()) { - Error = EC.message(); - return nullptr; - } + error(BufferOrErr, "error loading file '" + Path + "'"); Buffer = std::move(BufferOrErr.get()); - return std::unique_ptr(LTOModule::createInLocalContext( + std::string Error; + std::unique_ptr Ret(LTOModule::createInLocalContext( Buffer->getBufferStart(), Buffer->getBufferSize(), Options, Error, Path)); + if (!Error.empty()) + error("error loading file '" + Path + "' " + Error); + return Ret; } /// \brief List symbols in each IR file. @@ -170,42 +186,30 @@ getLocalLTOModule(StringRef Path, std::unique_ptr &Buffer, /// functionality that's exposed by the C API to list symbols. Moreover, this /// provides testing coverage for modules that have been created in their own /// contexts. -static int listSymbols(StringRef Command, const TargetOptions &Options) { +static void listSymbols(const TargetOptions &Options) { for (auto &Filename : InputFilenames) { - std::string Error; std::unique_ptr Buffer; std::unique_ptr Module = - getLocalLTOModule(Filename, Buffer, Options, Error); - if (!Module) { - errs() << Command << ": error loading file '" << Filename - << "': " << Error << "\n"; - return 1; - } + getLocalLTOModule(Filename, Buffer, Options); // List the symbols. 
outs() << Filename << ":\n"; for (int I = 0, E = Module->getSymbolCount(); I != E; ++I) outs() << Module->getSymbolName(I) << "\n"; } - return 0; } /// Create a combined index file from the input IR files and write it. /// /// This is meant to enable testing of ThinLTO combined index generation, /// currently available via the gold plugin via -thinlto. -static int createCombinedFunctionIndex(StringRef Command) { +static void createCombinedFunctionIndex() { FunctionInfoIndex CombinedIndex; uint64_t NextModuleId = 0; for (auto &Filename : InputFilenames) { ErrorOr> IndexOrErr = llvm::getFunctionIndexForFile(Filename, diagnosticHandler); - if (std::error_code EC = IndexOrErr.getError()) { - std::string Error = EC.message(); - errs() << Command << ": error loading file '" << Filename - << "': " << Error << "\n"; - return 1; - } + error(IndexOrErr, "error loading file '" + Filename + "'"); std::unique_ptr Index = std::move(IndexOrErr.get()); // Skip files without a function summary. if (!Index) @@ -216,14 +220,9 @@ static int createCombinedFunctionIndex(StringRef Command) { assert(!OutputFilename.empty()); raw_fd_ostream OS(OutputFilename + ".thinlto.bc", EC, sys::fs::OpenFlags::F_None); - if (EC) { - errs() << Command << ": error opening the file '" << OutputFilename - << ".thinlto.bc': " << EC.message() << "\n"; - return 1; - } + error(EC, "error opening the file '" + OutputFilename + ".thinlto.bc'"); WriteFunctionSummaryToFile(CombinedIndex, OS); OS.close(); - return 0; } int main(int argc, char **argv) { @@ -234,10 +233,8 @@ int main(int argc, char **argv) { llvm_shutdown_obj Y; // Call llvm_shutdown() on exit. cl::ParseCommandLineOptions(argc, argv, "llvm LTO linker\n"); - if (OptLevel < '0' || OptLevel > '3') { - errs() << argv[0] << ": optimization level must be between 0 and 3\n"; - return 1; - } + if (OptLevel < '0' || OptLevel > '3') + error("optimization level must be between 0 and 3"); // Initialize the configured targets. InitializeAllTargets(); @@ -248,11 +245,15 @@ int main(int argc, char **argv) { // set up the TargetOptions for the machine TargetOptions Options = InitTargetOptionsFromCodeGenFlags(); - if (ListSymbolsOnly) - return listSymbols(argv[0], Options); + if (ListSymbolsOnly) { + listSymbols(Options); + return 0; + } - if (ThinLTO) - return createCombinedFunctionIndex(argv[0]); + if (ThinLTO) { + createCombinedFunctionIndex(); + return 0; + } unsigned BaseArg = 0; From cd7c95d7e887ecda0b795690623ea9567f0659e7 Mon Sep 17 00:00:00 2001 From: Evgeniy Stepanov Date: Fri, 4 Dec 2015 00:57:25 +0000 Subject: [PATCH 042/364] Fix function-alias.ll test on non-X86 targets. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254676 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/Generic/function-alias.ll | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/CodeGen/Generic/function-alias.ll b/test/CodeGen/Generic/function-alias.ll index d68d75d5578a..7eec5be198b0 100644 --- a/test/CodeGen/Generic/function-alias.ll +++ b/test/CodeGen/Generic/function-alias.ll @@ -1,6 +1,4 @@ ; RUN: llc < %s | FileCheck %s -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" ; "data" constant @0 = private constant <{ i8, i8 }> <{i8 15, i8 11}>, section ".text" From 7579d3aaed060dd8002e11f9c64f27508ec32f3c Mon Sep 17 00:00:00 2001 From: Xinliang David Li Date: Fri, 4 Dec 2015 01:02:10 +0000 Subject: [PATCH 043/364] [PGO] Unify VP data format between raw and indexed profile (Reader) With the latest refactoring and code sharing patches landed, it is possible to unify the value profile implementation between raw and indexed profile. This is the patch in raw profile reader that uses the common interface. Differential Revision: http://reviews.llvm.org/D15056 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254677 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/ProfileData/InstrProfReader.h | 13 +++++- lib/ProfileData/InstrProfReader.cpp | 49 +++++----------------- 2 files changed, 22 insertions(+), 40 deletions(-) diff --git a/include/llvm/ProfileData/InstrProfReader.h b/include/llvm/ProfileData/InstrProfReader.h index 49233366e164..318981f75e18 100644 --- a/include/llvm/ProfileData/InstrProfReader.h +++ b/include/llvm/ProfileData/InstrProfReader.h @@ -162,10 +162,19 @@ class RawInstrProfReader : public InstrProfReader { private: std::error_code readNextHeader(const char *CurrentPos); std::error_code readHeader(const RawInstrProf::Header &Header); - template - IntT swap(IntT Int) const { + template IntT swap(IntT Int) const { return ShouldSwapBytes ? sys::getSwappedBytes(Int) : Int; } + support::endianness getDataEndianness() const { + support::endianness HostEndian = getHostEndianness(); + if (!ShouldSwapBytes) + return HostEndian; + if (HostEndian == support::little) + return support::big; + else + return support::little; + } + inline uint8_t getNumPaddingBytes(uint64_t SizeInBytes) { return 7 & (sizeof(uint64_t) - SizeInBytes % sizeof(uint64_t)); } diff --git a/lib/ProfileData/InstrProfReader.cpp b/lib/ProfileData/InstrProfReader.cpp index cfc968739806..7683cad6ede4 100644 --- a/lib/ProfileData/InstrProfReader.cpp +++ b/lib/ProfileData/InstrProfReader.cpp @@ -296,55 +296,28 @@ std::error_code RawInstrProfReader::readRawCounts( } template -std::error_code RawInstrProfReader::readValueProfilingData( - InstrProfRecord &Record) { +std::error_code +RawInstrProfReader::readValueProfilingData(InstrProfRecord &Record) { Record.clearValueData(); if (!Data->Values || (ValueDataDelta == 0)) return success(); - // Read value data. - uint64_t NumVSites = 0; - for (uint32_t Kind = IPVK_First; Kind <= ValueKindLast; ++Kind) - NumVSites += swap(Data->NumValueSites[Kind]); - NumVSites += getNumPaddingBytes(NumVSites); + ErrorOr> VDataPtrOrErr = + ValueProfData::getValueProfData(getValueDataCounts(Data->Values), + (const unsigned char *)ProfileEnd, + getDataEndianness()); - auto VDataCounts = makeArrayRef(getValueDataCounts(Data->Values), NumVSites); - // Check bounds. 
- if (VDataCounts.data() < ValueDataStart || - VDataCounts.data() + VDataCounts.size() > - reinterpret_cast(ProfileEnd)) - return error(instrprof_error::malformed); + if (VDataPtrOrErr.getError()) + return VDataPtrOrErr.getError(); - const InstrProfValueData *VDataPtr = - getValueData(swap(Data->Values) + NumVSites); - for (uint32_t Kind = IPVK_First; Kind <= ValueKindLast; ++Kind) { - NumVSites = swap(Data->NumValueSites[Kind]); - Record.reserveSites(Kind, NumVSites); - for (uint32_t VSite = 0; VSite < NumVSites; ++VSite) { - - uint32_t VDataCount = VDataCounts[VSite]; - if ((const char *)(VDataPtr + VDataCount) > ProfileEnd) - return error(instrprof_error::malformed); - - std::vector CurrentValues; - CurrentValues.reserve(VDataCount); - for (uint32_t VIndex = 0; VIndex < VDataCount; ++VIndex) { - uint64_t TargetValue = swap(VDataPtr->Value); - uint64_t Count = swap(VDataPtr->Count); - CurrentValues.push_back({TargetValue, Count}); - ++VDataPtr; - } - Record.addValueData(Kind, VSite, CurrentValues.data(), - VDataCount, &FunctionPtrToNameMap); - } - } + VDataPtrOrErr.get()->deserializeTo(Record, &FunctionPtrToNameMap); return success(); } template -std::error_code RawInstrProfReader::readNextRecord( - InstrProfRecord &Record) { +std::error_code +RawInstrProfReader::readNextRecord(InstrProfRecord &Record) { if (atEnd()) if (std::error_code EC = readNextHeader(ProfileEnd)) return EC; From 2cb46213b7277fc92db5d443c8e217140e5ac2c8 Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Fri, 4 Dec 2015 01:14:24 +0000 Subject: [PATCH 044/364] IR: Update a comment and a bool that've been out of date since 2012 It became impossible to get here with a half in r157393, over 3 years ago. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254679 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/IR/AsmWriter.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp index 759c5a8001c0..cb9a792c598b 100644 --- a/lib/IR/AsmWriter.cpp +++ b/lib/IR/AsmWriter.cpp @@ -1108,11 +1108,10 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, // the value back and get the same value. // bool ignored; - bool isHalf = &CFP->getValueAPF().getSemantics()==&APFloat::IEEEhalf; bool isDouble = &CFP->getValueAPF().getSemantics()==&APFloat::IEEEdouble; bool isInf = CFP->getValueAPF().isInfinity(); bool isNaN = CFP->getValueAPF().isNaN(); - if (!isHalf && !isInf && !isNaN) { + if (!isInf && !isNaN) { double Val = isDouble ? CFP->getValueAPF().convertToDouble() : CFP->getValueAPF().convertToFloat(); SmallString<128> StrVal; @@ -1140,7 +1139,7 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, "assuming that double is 64 bits!"); char Buffer[40]; APFloat apf = CFP->getValueAPF(); - // Halves and floats are represented in ASCII IR as double, convert. + // Floats are represented in ASCII IR as double, convert. if (!isDouble) apf.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &ignored); From 2012083871b75ca032fefaec739ce1f280a8f3de Mon Sep 17 00:00:00 2001 From: JF Bastien Date: Fri, 4 Dec 2015 01:18:17 +0000 Subject: [PATCH 045/364] X86InstrInfo::copyPhysReg: workaround reg liveness Summary: computeRegisterLiveness and analyzePhysReg are currently getting confused about liveness in some cases, breaking copyPhysReg's calculation of whether AX is dead in some cases. Work around this issue temporarily by assuming that AX is always live. 
See detail in: https://llvm.org/bugs/show_bug.cgi?id=25033#c7 And associated bugs PR24535 PR25033 PR24991 PR24992 PR25201. This workaround makes the code correct but slightly inefficient, but it seems to confuse the machine instr verifier which now things EAX was undefined in some cases where it's being conservatively saved / restored. Reviewers: majnemer, sanjoy Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D15198 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254680 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86InstrInfo.cpp | 16 +++++-- test/CodeGen/X86/cmpxchg-clobber-flags.ll | 43 +++++++++++++++++-- .../X86/peephole-na-phys-copy-folding.ll | 8 ++-- 3 files changed, 57 insertions(+), 10 deletions(-) diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 12da3a9319e6..e9d36f8ce2f1 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -4412,9 +4412,19 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, int Pop = is64 ? X86::POP64r : X86::POP32r; int AX = is64 ? X86::RAX : X86::EAX; - bool AXDead = (Reg == AX) || - (MachineBasicBlock::LQR_Dead == - MBB.computeRegisterLiveness(&getRegisterInfo(), AX, MI)); + bool AXDead = (Reg == AX); + // FIXME: The above could figure out that AX is dead in more cases with: + // || (MachineBasicBlock::LQR_Dead == + // MBB.computeRegisterLiveness(&getRegisterInfo(), AX, MI)); + // + // Unfortunately this is slightly broken, see PR24535 and the likely + // related PR25033 PR24991 PR24992 PR25201. These issues seem to + // showcase sub-register / super-register confusion: a previous kill + // of AH but no kill of AL leads computeRegisterLiveness to + // erroneously conclude that AX is dead. + // + // Once fixed, also update cmpxchg-clobber-flags.ll and + // peephole-na-phys-copy-folding.ll. if (!AXDead) BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true)); diff --git a/test/CodeGen/X86/cmpxchg-clobber-flags.ll b/test/CodeGen/X86/cmpxchg-clobber-flags.ll index c129128b5fa7..791edba89c44 100644 --- a/test/CodeGen/X86/cmpxchg-clobber-flags.ll +++ b/test/CodeGen/X86/cmpxchg-clobber-flags.ll @@ -1,7 +1,14 @@ -; RUN: llc -verify-machineinstrs -mtriple=i386-linux-gnu %s -o - | FileCheck %s -check-prefix=i386 -; RUN: llc -verify-machineinstrs -mtriple=i386-linux-gnu -pre-RA-sched=fast %s -o - | FileCheck %s -check-prefix=i386f -; RUN: llc -verify-machineinstrs -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s -check-prefix=x8664 -; RUN: llc -verify-machineinstrs -mtriple=x86_64-linux-gnu -pre-RA-sched=fast %s -o - | FileCheck %s -check-prefix=x8664 +; RUN: llc -mtriple=i386-linux-gnu %s -o - | FileCheck %s -check-prefix=i386 +; RUN: llc -mtriple=i386-linux-gnu -pre-RA-sched=fast %s -o - | FileCheck %s -check-prefix=i386f +; RUN: llc -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s -check-prefix=x8664 +; RUN: llc -mtriple=x86_64-linux-gnu -pre-RA-sched=fast %s -o - | FileCheck %s -check-prefix=x8664 + +; FIXME: X86InstrInfo::copyPhysReg had code which figured out whether AX was +; live or not to avoid save / restore when it's not needed. See FIXME in +; that function for more details on which the code is currently +; disabled. The extra push/pop are marked below and can be removed once +; the issue is fixed. +; -verify-machineinstrs should also be added back in the RUN lines above. 
declare i32 @foo() declare i32 @bar(i64) @@ -17,22 +24,34 @@ define i64 @test_intervening_call(i64* %foo, i64 %bar, i64 %baz) { ; i386-NEXT: movl %edx, 4(%esp) ; i386-NEXT: movl %eax, (%esp) ; i386-NEXT: calll bar +; ** FIXME Next line isn't actually necessary. ** +; i386-NEXT: pushl %eax ; i386-NEXT: movl [[FLAGS]], %eax ; i386-NEXT: addb $127, %al ; i386-NEXT: sahf +; ** FIXME Next line isn't actually necessary. ** +; i386-NEXT: popl %eax ; i386-NEXT: jne ; i386f-LABEL: test_intervening_call: ; i386f: cmpxchg8b ; i386f-NEXT: movl %eax, (%esp) ; i386f-NEXT: movl %edx, 4(%esp) +; ** FIXME Next line isn't actually necessary. ** +; i386f-NEXT: pushl %eax ; i386f-NEXT: seto %al ; i386f-NEXT: lahf ; i386f-NEXT: movl %eax, [[FLAGS:%.*]] +; ** FIXME Next line isn't actually necessary. ** +; i386f-NEXT: popl %eax ; i386f-NEXT: calll bar +; ** FIXME Next line isn't actually necessary. ** +; i386f-NEXT: pushl %eax ; i386f-NEXT: movl [[FLAGS]], %eax ; i386f-NEXT: addb $127, %al ; i386f-NEXT: sahf +; ** FIXME Next line isn't actually necessary. ** +; i386f-NEXT: popl %eax ; i386f-NEXT: jne ; x8664-LABEL: test_intervening_call: @@ -44,9 +63,13 @@ define i64 @test_intervening_call(i64* %foo, i64 %bar, i64 %baz) { ; x8664-NEXT: popq %rax ; x8664-NEXT: movq %rax, %rdi ; x8664-NEXT: callq bar +; ** FIXME Next line isn't actually necessary. ** +; x8664-NEXT: pushq %rax ; x8664-NEXT: movq [[FLAGS]], %rax ; x8664-NEXT: addb $127, %al ; x8664-NEXT: sahf +; ** FIXME Next line isn't actually necessary. ** +; x8664-NEXT: popq %rax ; x8664-NEXT: jne %cx = cmpxchg i64* %foo, i64 %bar, i64 %baz seq_cst seq_cst @@ -111,9 +134,13 @@ cond.end: define i32 @test_feed_cmov(i32* %addr, i32 %desired, i32 %new) { ; i386-LABEL: test_feed_cmov: ; i386: cmpxchgl +; ** FIXME Next line isn't actually necessary. ** +; i386-NEXT: pushl %eax ; i386-NEXT: seto %al ; i386-NEXT: lahf ; i386-NEXT: movl %eax, [[FLAGS:%.*]] +; ** FIXME Next line isn't actually necessary. ** +; i386-NEXT: popl %eax ; i386-NEXT: calll foo ; i386-NEXT: pushl %eax ; i386-NEXT: movl [[FLAGS]], %eax @@ -123,9 +150,13 @@ define i32 @test_feed_cmov(i32* %addr, i32 %desired, i32 %new) { ; i386f-LABEL: test_feed_cmov: ; i386f: cmpxchgl +; ** FIXME Next line isn't actually necessary. ** +; i386f-NEXT: pushl %eax ; i386f-NEXT: seto %al ; i386f-NEXT: lahf ; i386f-NEXT: movl %eax, [[FLAGS:%.*]] +; ** FIXME Next line isn't actually necessary. ** +; i386f-NEXT: popl %eax ; i386f-NEXT: calll foo ; i386f-NEXT: pushl %eax ; i386f-NEXT: movl [[FLAGS]], %eax @@ -135,9 +166,13 @@ define i32 @test_feed_cmov(i32* %addr, i32 %desired, i32 %new) { ; x8664-LABEL: test_feed_cmov: ; x8664: cmpxchgl +; ** FIXME Next line isn't actually necessary. ** +; x8664: pushq %rax ; x8664: seto %al ; x8664-NEXT: lahf ; x8664-NEXT: movq %rax, [[FLAGS:%.*]] +; ** FIXME Next line isn't actually necessary. 
** +; x8664-NEXT: popq %rax ; x8664-NEXT: callq foo ; x8664-NEXT: pushq %rax ; x8664-NEXT: movq [[FLAGS]], %rax diff --git a/test/CodeGen/X86/peephole-na-phys-copy-folding.ll b/test/CodeGen/X86/peephole-na-phys-copy-folding.ll index 438bf8ddf4c7..891a925611cf 100644 --- a/test/CodeGen/X86/peephole-na-phys-copy-folding.ll +++ b/test/CodeGen/X86/peephole-na-phys-copy-folding.ll @@ -1,5 +1,7 @@ -; RUN: llc -verify-machineinstrs -mtriple=i386-linux-gnu %s -o - | FileCheck %s -; RUN: llc -verify-machineinstrs -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s +; RUN: llc -mtriple=i386-linux-gnu %s -o - | FileCheck %s +; RUN: llc -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s + +; FIXME Add -verify-machineinstrs back when PR24535 is fixed. ; The peephole optimizer can elide some physical register copies such as ; EFLAGS. Make sure the flags are used directly, instead of needlessly using @@ -137,7 +139,7 @@ f: ; CHECK-LABEL: test_two_live_flags: ; CHECK: cmpxchg -; CHECK-NEXT: seto %al +; CHECK: seto %al ; CHECK-NEXT: lahf ; Save result of the first cmpxchg into D. ; CHECK-NEXT: mov{{[lq]}} %[[AX:[er]ax]], %[[D:[re]d[xi]]] From ae4aa8b8d233366d231a10dafae68d7287512927 Mon Sep 17 00:00:00 2001 From: Matthias Braun Date: Fri, 4 Dec 2015 01:31:59 +0000 Subject: [PATCH 046/364] raw_ostream: << operator for callables with raw_ostream argument This is a revised version of r254655 which uses a Printable wrapper class to avoid ambiguous overload problems. Differential Revision: http://reviews.llvm.org/D14348 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254681 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Support/Printable.h | 52 ++++++++++ include/llvm/Target/TargetRegisterInfo.h | 74 ++------------- lib/CodeGen/RegAllocPBQP.cpp | 27 ++---- .../SelectionDAG/SelectionDAGDumper.cpp | 18 +--- lib/CodeGen/TargetRegisterInfo.cpp | 95 +++++++++++-------- 5 files changed, 126 insertions(+), 140 deletions(-) create mode 100644 include/llvm/Support/Printable.h diff --git a/include/llvm/Support/Printable.h b/include/llvm/Support/Printable.h new file mode 100644 index 000000000000..5c1b8d5070d4 --- /dev/null +++ b/include/llvm/Support/Printable.h @@ -0,0 +1,52 @@ +//===--- Printable.h - Print function helpers -------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the Printable struct. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_PRINTABLE_H +#define LLVM_SUPPORT_PRINTABLE_H + +#include + +namespace llvm { + +class raw_ostream; + +/// Simple wrapper around std::function. +/// This class is usefull to construct print helpers for raw_ostream. +/// +/// Example: +/// Printable PrintRegister(unsigned Register) { +/// return Printable([Register](raw_ostream &OS) { +/// OS << getRegisterName(Register); +/// } +/// } +/// ... OS << PrintRegister(Register); ... +/// +/// Implementation note: Ideally this would just be a typedef, but doing so +/// leads to operator << being ambiguous as function has matching constructors +/// in some STL versions. I have seen the problem on gcc 4.6 libstdc++ and +/// microsoft STL. 
+class Printable { +public: + std::function Print; + Printable(const std::function Print) + : Print(Print) {} +}; + +static inline raw_ostream &operator<<(raw_ostream &OS, const Printable &P) { + P.Print(OS); + return OS; +} + +} + +#endif diff --git a/include/llvm/Target/TargetRegisterInfo.h b/include/llvm/Target/TargetRegisterInfo.h index 7d293fe82a6b..414255edb23e 100644 --- a/include/llvm/Target/TargetRegisterInfo.h +++ b/include/llvm/Target/TargetRegisterInfo.h @@ -22,6 +22,7 @@ #include "llvm/IR/CallingConv.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Printable.h" #include #include @@ -932,7 +933,6 @@ struct VirtReg2IndexFunctor : public std::unary_function { } }; -/// Helper class for printing registers on a raw_ostream. /// Prints virtual and physical registers with or without a TRI instance. /// /// The format is: @@ -943,24 +943,10 @@ struct VirtReg2IndexFunctor : public std::unary_function { /// %physreg17 - a physical register when no TRI instance given. /// /// Usage: OS << PrintReg(Reg, TRI) << '\n'; -/// -class PrintReg { - const TargetRegisterInfo *TRI; - unsigned Reg; - unsigned SubIdx; -public: - explicit PrintReg(unsigned reg, const TargetRegisterInfo *tri = nullptr, - unsigned subidx = 0) - : TRI(tri), Reg(reg), SubIdx(subidx) {} - void print(raw_ostream&) const; -}; +Printable PrintReg(unsigned Reg, const TargetRegisterInfo *TRI = nullptr, + unsigned SubRegIdx = 0); -static inline raw_ostream &operator<<(raw_ostream &OS, const PrintReg &PR) { - PR.print(OS); - return OS; -} - -/// Helper class for printing register units on a raw_ostream. +/// Create Printable object to print register units on a \ref raw_ostream. /// /// Register units are named after their root registers: /// @@ -968,54 +954,14 @@ static inline raw_ostream &operator<<(raw_ostream &OS, const PrintReg &PR) { /// FP0~ST7 - Dual roots. /// /// Usage: OS << PrintRegUnit(Unit, TRI) << '\n'; -/// -class PrintRegUnit { -protected: - const TargetRegisterInfo *TRI; - unsigned Unit; -public: - PrintRegUnit(unsigned unit, const TargetRegisterInfo *tri) - : TRI(tri), Unit(unit) {} - void print(raw_ostream&) const; -}; - -static inline raw_ostream &operator<<(raw_ostream &OS, const PrintRegUnit &PR) { - PR.print(OS); - return OS; -} +Printable PrintRegUnit(unsigned Unit, const TargetRegisterInfo *TRI); -/// It is often convenient to track virtual registers and -/// physical register units in the same list. -class PrintVRegOrUnit : protected PrintRegUnit { -public: - PrintVRegOrUnit(unsigned VRegOrUnit, const TargetRegisterInfo *tri) - : PrintRegUnit(VRegOrUnit, tri) {} - void print(raw_ostream&) const; -}; - -static inline raw_ostream &operator<<(raw_ostream &OS, - const PrintVRegOrUnit &PR) { - PR.print(OS); - return OS; -} - -/// Helper class for printing lane masks. -/// -/// They are currently printed out as hexadecimal numbers. -/// Usage: OS << PrintLaneMask(Mask); -class PrintLaneMask { -protected: - LaneBitmask LaneMask; -public: - PrintLaneMask(LaneBitmask LaneMask) - : LaneMask(LaneMask) {} - void print(raw_ostream&) const; -}; +/// \brief Create Printable object to print virtual registers and physical +/// registers on a \ref raw_ostream. +Printable PrintVRegOrUnit(unsigned VRegOrUnit, const TargetRegisterInfo *TRI); -static inline raw_ostream &operator<<(raw_ostream &OS, const PrintLaneMask &P) { - P.print(OS); - return OS; -} +/// Create Printable object to print LaneBitmasks on a \ref raw_ostream. 
+Printable PrintLaneMask(LaneBitmask LaneMask); } // End llvm namespace diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp index afa98b26d727..fd28b05ed80a 100644 --- a/lib/CodeGen/RegAllocPBQP.cpp +++ b/lib/CodeGen/RegAllocPBQP.cpp @@ -47,6 +47,7 @@ #include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/Printable.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -805,33 +806,17 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) { return true; } -namespace { -// A helper class for printing node and register info in a consistent way -class PrintNodeInfo { -public: - typedef PBQP::RegAlloc::PBQPRAGraph Graph; - typedef PBQP::RegAlloc::PBQPRAGraph::NodeId NodeId; - - PrintNodeInfo(NodeId NId, const Graph &G) : G(G), NId(NId) {} - - void print(raw_ostream &OS) const { +/// Create Printable object for node and register info. +static Printable PrintNodeInfo(PBQP::RegAlloc::PBQPRAGraph::NodeId NId, + const PBQP::RegAlloc::PBQPRAGraph &G) { + return Printable([NId, &G](raw_ostream &OS) { const MachineRegisterInfo &MRI = G.getMetadata().MF.getRegInfo(); const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); unsigned VReg = G.getNodeMetadata(NId).getVReg(); const char *RegClassName = TRI->getRegClassName(MRI.getRegClass(VReg)); OS << NId << " (" << RegClassName << ':' << PrintReg(VReg, TRI) << ')'; - } - -private: - const Graph &G; - NodeId NId; -}; - -inline raw_ostream &operator<<(raw_ostream &OS, const PrintNodeInfo &PR) { - PR.print(OS); - return OS; + }); } -} // anonymous namespace void PBQP::RegAlloc::PBQPRAGraph::dump(raw_ostream &OS) const { for (auto NId : nodeIds()) { diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index d362f98d6464..7c5492b554c2 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -22,6 +22,7 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/Support/Debug.h" #include "llvm/Support/GraphWriter.h" +#include "llvm/Support/Printable.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetIntrinsicInfo.h" @@ -369,25 +370,14 @@ const char *SDNode::getIndexedModeName(ISD::MemIndexedMode AM) { } } -namespace { -class PrintNodeId { - const SDNode &Node; -public: - explicit PrintNodeId(const SDNode &Node) - : Node(Node) {} - void print(raw_ostream &OS) const { +static Printable PrintNodeId(const SDNode &Node) { + return Printable([&Node](raw_ostream &OS) { #ifndef NDEBUG OS << 't' << Node.PersistentId; #else OS << (const void*)&Node; #endif - } -}; - -static inline raw_ostream &operator<<(raw_ostream &OS, const PrintNodeId &P) { - P.print(OS); - return OS; -} + }); } void SDNode::dump() const { dump(nullptr); } diff --git a/lib/CodeGen/TargetRegisterInfo.cpp b/lib/CodeGen/TargetRegisterInfo.cpp index 0c4a3dcb226e..0a7042ac3db5 100644 --- a/lib/CodeGen/TargetRegisterInfo.cpp +++ b/lib/CodeGen/TargetRegisterInfo.cpp @@ -40,58 +40,71 @@ TargetRegisterInfo::TargetRegisterInfo(const TargetRegisterInfoDesc *ID, TargetRegisterInfo::~TargetRegisterInfo() {} -void PrintReg::print(raw_ostream &OS) const { - if (!Reg) - OS << "%noreg"; - else if (TargetRegisterInfo::isStackSlot(Reg)) - OS << "SS#" << TargetRegisterInfo::stackSlot2Index(Reg); - else if (TargetRegisterInfo::isVirtualRegister(Reg)) - OS << "%vreg" << 
TargetRegisterInfo::virtReg2Index(Reg); - else if (TRI && Reg < TRI->getNumRegs()) - OS << '%' << TRI->getName(Reg); - else - OS << "%physreg" << Reg; - if (SubIdx) { - if (TRI) - OS << ':' << TRI->getSubRegIndexName(SubIdx); +namespace llvm { + +Printable PrintReg(unsigned Reg, const TargetRegisterInfo *TRI, + unsigned SubIdx) { + return Printable([Reg, TRI, SubIdx](raw_ostream &OS) { + if (!Reg) + OS << "%noreg"; + else if (TargetRegisterInfo::isStackSlot(Reg)) + OS << "SS#" << TargetRegisterInfo::stackSlot2Index(Reg); + else if (TargetRegisterInfo::isVirtualRegister(Reg)) + OS << "%vreg" << TargetRegisterInfo::virtReg2Index(Reg); + else if (TRI && Reg < TRI->getNumRegs()) + OS << '%' << TRI->getName(Reg); else - OS << ":sub(" << SubIdx << ')'; - } + OS << "%physreg" << Reg; + if (SubIdx) { + if (TRI) + OS << ':' << TRI->getSubRegIndexName(SubIdx); + else + OS << ":sub(" << SubIdx << ')'; + } + }); } -void PrintRegUnit::print(raw_ostream &OS) const { - // Generic printout when TRI is missing. - if (!TRI) { - OS << "Unit~" << Unit; - return; - } +Printable PrintRegUnit(unsigned Unit, const TargetRegisterInfo *TRI) { + return Printable([Unit, TRI](raw_ostream &OS) { + // Generic printout when TRI is missing. + if (!TRI) { + OS << "Unit~" << Unit; + return; + } - // Check for invalid register units. - if (Unit >= TRI->getNumRegUnits()) { - OS << "BadUnit~" << Unit; - return; - } + // Check for invalid register units. + if (Unit >= TRI->getNumRegUnits()) { + OS << "BadUnit~" << Unit; + return; + } - // Normal units have at least one root. - MCRegUnitRootIterator Roots(Unit, TRI); - assert(Roots.isValid() && "Unit has no roots."); - OS << TRI->getName(*Roots); - for (++Roots; Roots.isValid(); ++Roots) - OS << '~' << TRI->getName(*Roots); + // Normal units have at least one root. + MCRegUnitRootIterator Roots(Unit, TRI); + assert(Roots.isValid() && "Unit has no roots."); + OS << TRI->getName(*Roots); + for (++Roots; Roots.isValid(); ++Roots) + OS << '~' << TRI->getName(*Roots); + }); } -void PrintVRegOrUnit::print(raw_ostream &OS) const { - if (TRI && TRI->isVirtualRegister(Unit)) { - OS << "%vreg" << TargetRegisterInfo::virtReg2Index(Unit); - return; - } - PrintRegUnit::print(OS); +Printable PrintVRegOrUnit(unsigned Unit, const TargetRegisterInfo *TRI) { + return Printable([Unit, TRI](raw_ostream &OS) { + if (TRI && TRI->isVirtualRegister(Unit)) { + OS << "%vreg" << TargetRegisterInfo::virtReg2Index(Unit); + } else { + OS << PrintRegUnit(Unit, TRI); + } + }); } -void PrintLaneMask::print(raw_ostream &OS) const { - OS << format("%08X", LaneMask); +Printable PrintLaneMask(LaneBitmask LaneMask) { + return Printable([LaneMask](raw_ostream &OS) { + OS << format("%08X", LaneMask); + }); } +} // End of llvm namespace + /// getAllocatableClass - Return the maximal subclass of the given register /// class that is alloctable, or NULL. const TargetRegisterClass * From 7adbf112c7e68bbd8cda3315eb3d831426401987 Mon Sep 17 00:00:00 2001 From: Matthias Braun Date: Fri, 4 Dec 2015 01:51:19 +0000 Subject: [PATCH 047/364] ScheduleDAGInstrs: Rework schedule graph builder. Re-comitting with a change that avoids undefined uses getting put into the VRegUses list. The new algorithm remembers the uses encountered while walking backwards until a matching def is found. 
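As a minimal, self-contained sketch with hypothetical names and containers (the actual ScheduleDAGInstrs implementation follows in the diff below), the idea is: walk the region bottom-up, record every register use encountered, and when a def of that register appears, wire the recorded uses to it and clear them.

#include <cstdio>
#include <map>
#include <vector>

// Illustrative only: each instruction defines at most one register
// (-1 means no def) and may use several.
struct Insn {
  int Def;
  std::vector<int> Uses;
};

int main() {
  // A small region in program order; the graph builder scans it backwards.
  std::vector<Insn> Region = {{/*Def=*/1, {}}, {/*Def=*/2, {1}}, {/*Def=*/-1, {1, 2}}};

  // Uses already seen while walking backwards, still waiting for their def.
  std::map<int, std::vector<size_t>> PendingUses;

  for (size_t I = Region.size(); I-- > 0;) {
    // A def satisfies every use of the same register recorded from later
    // instructions: add a data dependence and drop those pending uses.
    if (Region[I].Def != -1) {
      for (size_t UseIdx : PendingUses[Region[I].Def])
        std::printf("data dep: insn %zu -> insn %zu (reg %d)\n", I, UseIdx,
                    Region[I].Def);
      PendingUses.erase(Region[I].Def);
    }
    // This instruction's own uses still await an earlier (not yet seen) def.
    for (int R : Region[I].Uses)
      PendingUses[R].push_back(I);
  }
  return 0;
}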
Contrary to the previous version this: - Works without LiveIntervals being available - Allows to increase the precision to subregisters/lanemasks (not used for now) The changes in the AMDGPU tests are necessary because the R600 scheduler is not stable with respect to the order of nodes in the ready queues. Differential Revision: http://reviews.llvm.org/D9068 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254683 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/CodeGen/ScheduleDAGInstrs.h | 40 ++- lib/CodeGen/ScheduleDAGInstrs.cpp | 227 +++++++++++++----- test/CodeGen/AMDGPU/image-attributes.ll | 20 +- test/CodeGen/AMDGPU/literals.ll | 8 +- .../AMDGPU/llvm.AMDGPU.read.workdim.ll | 2 +- test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll | 2 +- .../AMDGPU/llvm.r600.read.local.size.ll | 6 +- test/CodeGen/AMDGPU/or.ll | 2 +- test/CodeGen/AMDGPU/set-dx10.ll | 48 ++-- test/CodeGen/AMDGPU/sext-in-reg.ll | 4 +- test/CodeGen/AMDGPU/shl.ll | 12 +- test/CodeGen/AMDGPU/sra.ll | 8 +- test/CodeGen/AMDGPU/srl.ll | 10 +- test/CodeGen/AMDGPU/unsupported-cc.ll | 32 +-- test/CodeGen/AMDGPU/work-item-intrinsics.ll | 12 +- test/CodeGen/AMDGPU/xor.ll | 2 +- 16 files changed, 279 insertions(+), 156 deletions(-) diff --git a/include/llvm/CodeGen/ScheduleDAGInstrs.h b/include/llvm/CodeGen/ScheduleDAGInstrs.h index 1446f2ac082b..c715e0f79205 100644 --- a/include/llvm/CodeGen/ScheduleDAGInstrs.h +++ b/include/llvm/CodeGen/ScheduleDAGInstrs.h @@ -33,15 +33,26 @@ namespace llvm { /// An individual mapping from virtual register number to SUnit. struct VReg2SUnit { unsigned VirtReg; + LaneBitmask LaneMask; SUnit *SU; - VReg2SUnit(unsigned reg, SUnit *su): VirtReg(reg), SU(su) {} + VReg2SUnit(unsigned VReg, LaneBitmask LaneMask, SUnit *SU) + : VirtReg(VReg), LaneMask(LaneMask), SU(SU) {} unsigned getSparseSetIndex() const { return TargetRegisterInfo::virtReg2Index(VirtReg); } }; + /// Mapping from virtual register to SUnit including an operand index. + struct VReg2SUnitOperIdx : public VReg2SUnit { + unsigned OperandIndex; + + VReg2SUnitOperIdx(unsigned VReg, LaneBitmask LaneMask, + unsigned OperandIndex, SUnit *SU) + : VReg2SUnit(VReg, LaneMask, SU), OperandIndex(OperandIndex) {} + }; + /// Record a physical register access. /// For non-data-dependent uses, OpIdx == -1. struct PhysRegSUOper { @@ -69,7 +80,10 @@ namespace llvm { /// Track local uses of virtual registers. These uses are gathered by the DAG /// builder and may be consulted by the scheduler to avoid iterating an entire /// vreg use list. - typedef SparseMultiSet VReg2UseMap; + typedef SparseMultiSet VReg2SUnitMultiMap; + + typedef SparseMultiSet + VReg2SUnitOperIdxMultiMap; /// ScheduleDAGInstrs - A ScheduleDAG subclass for scheduling lists of /// MachineInstrs. @@ -95,6 +109,9 @@ namespace llvm { /// it has taken responsibility for scheduling the terminator correctly. bool CanHandleTerminators; + /// Whether lane masks should get tracked. + bool TrackLaneMasks; + /// State specific to the current scheduling region. /// ------------------------------------------------ @@ -117,7 +134,7 @@ namespace llvm { /// After calling BuildSchedGraph, each vreg used in the scheduling region /// is mapped to a set of SUnits. These include all local vreg uses, not /// just the uses for a singly defined vreg. - VReg2UseMap VRegUses; + VReg2SUnitMultiMap VRegUses; /// State internal to DAG building. 
/// ------------------------------- @@ -129,8 +146,12 @@ namespace llvm { Reg2SUnitsMap Defs; Reg2SUnitsMap Uses; - /// Track the last instruction in this region defining each virtual register. - VReg2SUnitMap VRegDefs; + /// Tracks the last instruction(s) in this region defining each virtual + /// register. There may be multiple current definitions for a register with + /// disjunct lanemasks. + VReg2SUnitMultiMap CurrentVRegDefs; + /// Tracks the last instructions in this region using each virtual register. + VReg2SUnitOperIdxMultiMap CurrentVRegUses; /// PendingLoads - Remember where unknown loads are after the most recent /// unknown store, as we iterate. As with Defs and Uses, this is here @@ -200,7 +221,8 @@ namespace llvm { /// input. void buildSchedGraph(AliasAnalysis *AA, RegPressureTracker *RPTracker = nullptr, - PressureDiffs *PDiffs = nullptr); + PressureDiffs *PDiffs = nullptr, + bool TrackLaneMasks = false); /// addSchedBarrierDeps - Add dependencies from instructions in the current /// list of instructions being scheduled to scheduling barrier. We want to @@ -247,6 +269,12 @@ namespace llvm { /// Other adjustments may be made to the instruction if necessary. Return /// true if the operand has been deleted, false if not. bool toggleKillFlag(MachineInstr *MI, MachineOperand &MO); + + /// Returns a mask for which lanes get read/written by the given (register) + /// machine operand. + LaneBitmask getLaneMaskForMO(const MachineOperand &MO) const; + + void collectVRegUses(SUnit *SU); }; /// newSUnit - Creates a new SUnit and return a ptr to it. diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp index 12b2beb357b4..9d588ff24f61 100644 --- a/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -13,12 +13,12 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/ScheduleDAGInstrs.h" +#include "llvm/ADT/IntEqClasses.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -55,7 +55,7 @@ ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf, bool RemoveKillFlags) : ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()), LIS(LIS), RemoveKillFlags(RemoveKillFlags), CanHandleTerminators(false), - FirstDbgValue(nullptr) { + TrackLaneMasks(false), FirstDbgValue(nullptr) { DbgValues.clear(); const TargetSubtargetInfo &ST = mf.getSubtarget(); @@ -363,6 +363,20 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) { } } +LaneBitmask ScheduleDAGInstrs::getLaneMaskForMO(const MachineOperand &MO) const +{ + unsigned Reg = MO.getReg(); + // No point in tracking lanemasks if we don't have interesting subregisters. + const TargetRegisterClass &RC = *MRI.getRegClass(Reg); + if (!RC.HasDisjunctSubRegs) + return ~0u; + + unsigned SubReg = MO.getSubReg(); + if (SubReg == 0) + return RC.getLaneMask(); + return TRI->getSubRegIndexLaneMask(SubReg); +} + /// addVRegDefDeps - Add register output and data dependencies from this SUnit /// to instructions that occur later in the same scheduling region if they read /// from or write to the virtual register defined at OperIdx. 
@@ -370,35 +384,106 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) { /// TODO: Hoist loop induction variable increments. This has to be /// reevaluated. Generally, IV scheduling should be done before coalescing. void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) { - const MachineInstr *MI = SU->getInstr(); - unsigned Reg = MI->getOperand(OperIdx).getReg(); + MachineInstr *MI = SU->getInstr(); + MachineOperand &MO = MI->getOperand(OperIdx); + unsigned Reg = MO.getReg(); + + LaneBitmask DefLaneMask; + LaneBitmask KillLaneMask; + if (TrackLaneMasks) { + bool IsKill = MO.getSubReg() == 0 || MO.isUndef(); + DefLaneMask = getLaneMaskForMO(MO); + // If we have a flag, none of the lane values comes from an + // earlier instruction. + KillLaneMask = IsKill ? ~0u : DefLaneMask; + + // Clear undef flag, we'll re-add it later once we know which subregister + // Def is first. + MO.setIsUndef(false); + } else { + DefLaneMask = ~0u; + KillLaneMask = ~0u; + } + + if (MO.isDead()) { + assert(CurrentVRegUses.find(Reg) == CurrentVRegUses.end() && + "Dead defs should have no uses"); + } else { + // Add data dependence to all uses we found so far. + const TargetSubtargetInfo &ST = MF.getSubtarget(); + for (VReg2SUnitOperIdxMultiMap::iterator I = CurrentVRegUses.find(Reg), + E = CurrentVRegUses.end(); I != E; /*empty*/) { + LaneBitmask LaneMask = I->LaneMask; + // Ignore uses of other lanes. + if ((LaneMask & KillLaneMask) == 0) { + ++I; + continue; + } - // Singly defined vregs do not have output/anti dependencies. - // The current operand is a def, so we have at least one. - // Check here if there are any others... + if ((LaneMask & DefLaneMask) != 0) { + SUnit *UseSU = I->SU; + MachineInstr *Use = UseSU->getInstr(); + SDep Dep(SU, SDep::Data, Reg); + Dep.setLatency(SchedModel.computeOperandLatency(MI, OperIdx, Use, + I->OperandIndex)); + ST.adjustSchedDependency(SU, UseSU, Dep); + UseSU->addPred(Dep); + } + + LaneMask &= ~KillLaneMask; + // If we found a Def for all lanes of this use, remove it from the list. + if (LaneMask != 0) { + I->LaneMask = LaneMask; + ++I; + } else + I = CurrentVRegUses.erase(I); + } + } + + // Shortcut: Singly defined vregs do not have output/anti dependencies. if (MRI.hasOneDef(Reg)) return; - // Add output dependence to the next nearest def of this vreg. + // Add output dependence to the next nearest defs of this vreg. // // Unless this definition is dead, the output dependence should be // transitively redundant with antidependencies from this definition's // uses. We're conservative for now until we have a way to guarantee the uses // are not eliminated sometime during scheduling. The output dependence edge // is also useful if output latency exceeds def-use latency. - VReg2SUnitMap::iterator DefI = VRegDefs.find(Reg); - if (DefI == VRegDefs.end()) - VRegDefs.insert(VReg2SUnit(Reg, SU)); - else { - SUnit *DefSU = DefI->SU; - if (DefSU != SU && DefSU != &ExitSU) { - SDep Dep(SU, SDep::Output, Reg); - Dep.setLatency( - SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr())); - DefSU->addPred(Dep); - } - DefI->SU = SU; + LaneBitmask LaneMask = DefLaneMask; + for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg), + CurrentVRegDefs.end())) { + // Ignore defs for other lanes. + if ((V2SU.LaneMask & LaneMask) == 0) + continue; + // Add an output dependence. + SUnit *DefSU = V2SU.SU; + // Ignore additional defs of the same lanes in one instruction. 
This can + // happen because lanemasks are shared for targets with too many + // subregisters. We also use some representration tricks/hacks where we + // add super-register defs/uses, to imply that although we only access parts + // of the reg we care about the full one. + if (DefSU == SU) + continue; + SDep Dep(SU, SDep::Output, Reg); + Dep.setLatency( + SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr())); + DefSU->addPred(Dep); + + // Update current definition. This can get tricky if the def was about a + // bigger lanemask before. We then have to shrink it and create a new + // VReg2SUnit for the non-overlapping part. + LaneBitmask OverlapMask = V2SU.LaneMask & LaneMask; + LaneBitmask NonOverlapMask = V2SU.LaneMask & ~LaneMask; + if (NonOverlapMask != 0) + CurrentVRegDefs.insert(VReg2SUnit(Reg, NonOverlapMask, V2SU.SU)); + V2SU.SU = SU; + V2SU.LaneMask = OverlapMask; } + // If there was no CurrentVRegDefs entry for some lanes yet, create one. + if (LaneMask != 0) + CurrentVRegDefs.insert(VReg2SUnit(Reg, LaneMask, SU)); } /// addVRegUseDeps - Add a register data dependency if the instruction that @@ -408,49 +493,26 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) { /// /// TODO: Handle ExitSU "uses" properly. void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) { - MachineInstr *MI = SU->getInstr(); - unsigned Reg = MI->getOperand(OperIdx).getReg(); + const MachineInstr *MI = SU->getInstr(); + const MachineOperand &MO = MI->getOperand(OperIdx); + unsigned Reg = MO.getReg(); + + // Remember the use. Data dependencies will be added when we find the def. + LaneBitmask LaneMask = TrackLaneMasks ? getLaneMaskForMO(MO) : ~0u; + CurrentVRegUses.insert(VReg2SUnitOperIdx(Reg, LaneMask, OperIdx, SU)); + + // Add antidependences to the following defs of the vreg. + for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg), + CurrentVRegDefs.end())) { + // Ignore defs for unrelated lanes. + LaneBitmask PrevDefLaneMask = V2SU.LaneMask; + if ((PrevDefLaneMask & LaneMask) == 0) + continue; + if (V2SU.SU == SU) + continue; - // Record this local VReg use. - VReg2UseMap::iterator UI = VRegUses.find(Reg); - for (; UI != VRegUses.end(); ++UI) { - if (UI->SU == SU) - break; - } - if (UI == VRegUses.end()) - VRegUses.insert(VReg2SUnit(Reg, SU)); - - // Lookup this operand's reaching definition. - assert(LIS && "vreg dependencies requires LiveIntervals"); - LiveQueryResult LRQ - = LIS->getInterval(Reg).Query(LIS->getInstructionIndex(MI)); - VNInfo *VNI = LRQ.valueIn(); - - // VNI will be valid because MachineOperand::readsReg() is checked by caller. - assert(VNI && "No value to read by operand"); - MachineInstr *Def = LIS->getInstructionFromIndex(VNI->def); - // Phis and other noninstructions (after coalescing) have a NULL Def. - if (Def) { - SUnit *DefSU = getSUnit(Def); - if (DefSU) { - // The reaching Def lives within this scheduling region. - // Create a data dependence. - SDep dep(DefSU, SDep::Data, Reg); - // Adjust the dependence latency using operand def/use information, then - // allow the target to perform its own adjustments. - int DefOp = Def->findRegisterDefOperandIdx(Reg); - dep.setLatency(SchedModel.computeOperandLatency(Def, DefOp, MI, OperIdx)); - - const TargetSubtargetInfo &ST = MF.getSubtarget(); - ST.adjustSchedDependency(DefSU, SU, const_cast(dep)); - SU->addPred(dep); - } + V2SU.SU->addPred(SDep(SU, SDep::Anti, Reg)); } - - // Add antidependence to the following def of the vreg it uses. 
- VReg2SUnitMap::iterator DefI = VRegDefs.find(Reg); - if (DefI != VRegDefs.end() && DefI->SU != SU) - DefI->SU->addPred(SDep(SU, SDep::Anti, Reg)); } /// Return true if MI is an instruction we are unable to reason about @@ -733,17 +795,44 @@ void ScheduleDAGInstrs::initSUnits() { } } +void ScheduleDAGInstrs::collectVRegUses(SUnit *SU) { + const MachineInstr *MI = SU->getInstr(); + for (const MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + if (!MO.readsReg()) + continue; + if (TrackLaneMasks && !MO.isUse()) + continue; + + unsigned Reg = MO.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + + // Record this local VReg use. + VReg2SUnitMultiMap::iterator UI = VRegUses.find(Reg); + for (; UI != VRegUses.end(); ++UI) { + if (UI->SU == SU) + break; + } + if (UI == VRegUses.end()) + VRegUses.insert(VReg2SUnit(Reg, 0, SU)); + } +} + /// If RegPressure is non-null, compute register pressure as a side effect. The /// DAG builder is an efficient place to do it because it already visits /// operands. void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, RegPressureTracker *RPTracker, - PressureDiffs *PDiffs) { + PressureDiffs *PDiffs, + bool TrackLaneMasks) { const TargetSubtargetInfo &ST = MF.getSubtarget(); bool UseAA = EnableAASchedMI.getNumOccurrences() > 0 ? EnableAASchedMI : ST.useAA(); AliasAnalysis *AAForDep = UseAA ? AA : nullptr; + this->TrackLaneMasks = TrackLaneMasks; MISUnitMap.clear(); ScheduleDAG::clearDAG(); @@ -777,10 +866,14 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, Defs.setUniverse(TRI->getNumRegs()); Uses.setUniverse(TRI->getNumRegs()); - assert(VRegDefs.empty() && "Only BuildSchedGraph may access VRegDefs"); + assert(CurrentVRegDefs.empty() && "nobody else should use CurrentVRegDefs"); + assert(CurrentVRegUses.empty() && "nobody else should use CurrentVRegUses"); + unsigned NumVirtRegs = MRI.getNumVirtRegs(); + CurrentVRegDefs.setUniverse(NumVirtRegs); + CurrentVRegUses.setUniverse(NumVirtRegs); + VRegUses.clear(); - VRegDefs.setUniverse(MRI.getNumVirtRegs()); - VRegUses.setUniverse(MRI.getNumVirtRegs()); + VRegUses.setUniverse(NumVirtRegs); // Model data dependencies between instructions being scheduled and the // ExitSU. 
@@ -808,6 +901,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, RPTracker->recede(/*LiveUses=*/nullptr, PDiff); assert(RPTracker->getPos() == std::prev(MII) && "RPTracker can't find MI"); + collectVRegUses(SU); } assert( @@ -1057,7 +1151,8 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, Defs.clear(); Uses.clear(); - VRegDefs.clear(); + CurrentVRegDefs.clear(); + CurrentVRegUses.clear(); PendingLoads.clear(); } diff --git a/test/CodeGen/AMDGPU/image-attributes.ll b/test/CodeGen/AMDGPU/image-attributes.ll index 7a5a7346865f..5906b2f15709 100644 --- a/test/CodeGen/AMDGPU/image-attributes.ll +++ b/test/CodeGen/AMDGPU/image-attributes.ll @@ -6,7 +6,7 @@ ; FUNC-LABEL: {{^}}width_2d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[2].Z +; EG: MOV * [[VAL]], KC0[2].Z define void @width_2d (%opencl.image2d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -19,7 +19,7 @@ entry: ; FUNC-LABEL: {{^}}width_3d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[2].Z +; EG: MOV * [[VAL]], KC0[2].Z define void @width_3d (%opencl.image3d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -36,7 +36,7 @@ entry: ; FUNC-LABEL: {{^}}height_2d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[2].W +; EG: MOV * [[VAL]], KC0[2].W define void @height_2d (%opencl.image2d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -49,7 +49,7 @@ entry: ; FUNC-LABEL: {{^}}height_3d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[2].W +; EG: MOV * [[VAL]], KC0[2].W define void @height_3d (%opencl.image3d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -66,7 +66,7 @@ entry: ; FUNC-LABEL: {{^}}depth_3d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[3].X +; EG: MOV * [[VAL]], KC0[3].X define void @depth_3d (%opencl.image3d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -83,7 +83,7 @@ entry: ; FUNC-LABEL: {{^}}data_type_2d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[3].Y +; EG: MOV * [[VAL]], KC0[3].Y define void @data_type_2d (%opencl.image2d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -96,7 +96,7 @@ entry: ; FUNC-LABEL: {{^}}data_type_3d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[3].Y +; EG: MOV * [[VAL]], KC0[3].Y define void @data_type_3d (%opencl.image3d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -113,7 +113,7 @@ entry: ; FUNC-LABEL: {{^}}channel_order_2d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[3].Z +; EG: MOV * [[VAL]], KC0[3].Z define void @channel_order_2d (%opencl.image2d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -126,7 +126,7 @@ entry: ; FUNC-LABEL: {{^}}channel_order_3d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[3].Z +; EG: MOV * [[VAL]], KC0[3].Z define void @channel_order_3d (%opencl.image3d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -145,7 +145,7 @@ entry: ; ; FUNC-LABEL: {{^}}image_arg_2nd: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[4].Z +; EG: MOV * [[VAL]], KC0[4].Z define void @image_arg_2nd (%opencl.image3d_t addrspace(1)* %in1, i32 %x, %opencl.image2d_t addrspace(1)* %in2, diff --git a/test/CodeGen/AMDGPU/literals.ll b/test/CodeGen/AMDGPU/literals.ll index cff1c24f89d6..9d2320cb2d19 100644 --- a/test/CodeGen/AMDGPU/literals.ll +++ b/test/CodeGen/AMDGPU/literals.ll @@ -7,8 +7,8 
@@ ; ADD_INT literal.x KC0[2].Z, 5 ; CHECK: {{^}}i32_literal: -; CHECK: ADD_INT {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: ADD_INT * {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.y ; CHECK-NEXT: 5 define void @i32_literal(i32 addrspace(1)* %out, i32 %in) { entry: @@ -24,8 +24,8 @@ entry: ; ADD literal.x KC0[2].Z, 5.0 ; CHECK: {{^}}float_literal: -; CHECK: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: ADD * {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.y ; CHECK-NEXT: 1084227584(5.0 define void @float_literal(float addrspace(1)* %out, float %in) { entry: diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll index 6dc9d050eee6..2e299e30b8c7 100644 --- a/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll @@ -4,7 +4,7 @@ ; FUNC-LABEL: {{^}}read_workdim: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[2].Z +; EG: MOV * [[VAL]], KC0[2].Z ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll index 74792e50017f..a30a8e083eb6 100644 --- a/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll @@ -3,7 +3,7 @@ ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s ; R600: {{^}}amdgpu_trunc: -; R600: TRUNC T{{[0-9]+\.[XYZW]}}, KC0[2].Z +; R600: TRUNC {{\*? *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z ; SI: {{^}}amdgpu_trunc: ; SI: v_trunc_f32 diff --git a/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll index f2a7256e812d..13ebee41e844 100644 --- a/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll +++ b/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll @@ -5,7 +5,7 @@ ; FUNC-LABEL: {{^}}local_size_x: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].Z +; EG: MOV * [[VAL]], KC0[1].Z ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 @@ -23,7 +23,7 @@ entry: ; FUNC-LABEL: {{^}}local_size_y: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].W +; EG: MOV * [[VAL]], KC0[1].W ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c @@ -38,7 +38,7 @@ entry: ; FUNC-LABEL: {{^}}local_size_z: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[2].X +; EG: MOV * [[VAL]], KC0[2].X ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 diff --git a/test/CodeGen/AMDGPU/or.ll b/test/CodeGen/AMDGPU/or.ll index 1c04090b407f..e40f18f040b7 100644 --- a/test/CodeGen/AMDGPU/or.ll +++ b/test/CodeGen/AMDGPU/or.ll @@ -153,7 +153,7 @@ define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { } ; FUNC-LABEL: {{^}}or_i1: -; EG: OR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}} +; EG: OR_INT * {{\** *}}T{{[0-9]+\.[XYZW], PS, PV\.[XYZW]}} ; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}] define void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) { diff --git a/test/CodeGen/AMDGPU/set-dx10.ll b/test/CodeGen/AMDGPU/set-dx10.ll index 53694dcffa66..57365a6e1fc3 100644 --- a/test/CodeGen/AMDGPU/set-dx10.ll +++ 
b/test/CodeGen/AMDGPU/set-dx10.ll @@ -5,8 +5,8 @@ ; SET*DX10 instructions. ; CHECK: {{^}}fcmp_une_select_fptosi: -; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETNE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: @@ -19,8 +19,8 @@ entry: } ; CHECK: {{^}}fcmp_une_select_i32: -; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETNE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_une_select_i32(i32 addrspace(1)* %out, float %in) { entry: @@ -31,8 +31,8 @@ entry: } ; CHECK: {{^}}fcmp_oeq_select_fptosi: -; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_oeq_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: @@ -45,8 +45,8 @@ entry: } ; CHECK: {{^}}fcmp_oeq_select_i32: -; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_oeq_select_i32(i32 addrspace(1)* %out, float %in) { entry: @@ -57,8 +57,8 @@ entry: } ; CHECK: {{^}}fcmp_ogt_select_fptosi: -; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_ogt_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: @@ -71,8 +71,8 @@ entry: } ; CHECK: {{^}}fcmp_ogt_select_i32: -; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_ogt_select_i32(i32 addrspace(1)* %out, float %in) { entry: @@ -83,8 +83,8 @@ entry: } ; CHECK: {{^}}fcmp_oge_select_fptosi: -; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_oge_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: @@ -97,8 +97,8 @@ entry: } ; CHECK: {{^}}fcmp_oge_select_i32: -; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_oge_select_i32(i32 addrspace(1)* %out, float %in) { entry: @@ -109,8 +109,8 @@ entry: } ; CHECK: {{^}}fcmp_ole_select_fptosi: -; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_ole_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: @@ -123,8 +123,8 @@ entry: } ; CHECK: {{^}}fcmp_ole_select_i32: -; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGE_DX10 * 
{{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_ole_select_i32(i32 addrspace(1)* %out, float %in) { entry: @@ -135,8 +135,8 @@ entry: } ; CHECK: {{^}}fcmp_olt_select_fptosi: -; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_olt_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: @@ -149,8 +149,8 @@ entry: } ; CHECK: {{^}}fcmp_olt_select_i32: -; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z, ; CHECK-NEXT: 1084227584(5.000000e+00) define void @fcmp_olt_select_i32(i32 addrspace(1)* %out, float %in) { entry: diff --git a/test/CodeGen/AMDGPU/sext-in-reg.ll b/test/CodeGen/AMDGPU/sext-in-reg.ll index 95fcfdbdecae..23ae3b967971 100644 --- a/test/CodeGen/AMDGPU/sext-in-reg.ll +++ b/test/CodeGen/AMDGPU/sext-in-reg.ll @@ -12,8 +12,8 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; SI: buffer_store_dword [[EXTRACT]], ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] -; EG: BFE_INT [[RES]], {{.*}}, 0.0, 1 -; EG-NEXT: LSHR * [[ADDR]] +; EG: LSHR * [[ADDR]] +; EG: BFE_INT * [[RES]], {{.*}}, 0.0, 1 define void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) { %shl = shl i32 %in, 31 %sext = ashr i32 %shl, 31 diff --git a/test/CodeGen/AMDGPU/shl.ll b/test/CodeGen/AMDGPU/shl.ll index bf08e66f3304..55db80731c90 100644 --- a/test/CodeGen/AMDGPU/shl.ll +++ b/test/CodeGen/AMDGPU/shl.ll @@ -53,14 +53,14 @@ define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in ret void } -;EG: {{^}}shl_i64: +;EG-LABEL: {{^}}shl_i64: ;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] ;EG: LSHR {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} -;EG: LSHR {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 -;EG_CHECK-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +;EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +;EG-DAG: LSHR {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 ;EG-DAG: LSHL {{\*? *}}[[HISMTMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], [[SHIFT]] -;EG-DAG: OR_INT {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], {{[[HISMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}} -;EG-DAG: LSHL {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], [[OPLO]], {{PS|[[SHIFT]]}} +;EG-DAG: OR_INT {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], {{[[HISMTMP]]|PV.[XYZW]|PS}}, {{[[OVERF]]|PV.[XYZW]}} +;EG-DAG: LSHL {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], [[OPLO]], {{PS|[[SHIFT]]|PV.[XYZW]}} ;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal ;EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} ;EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0 @@ -80,7 +80,7 @@ define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ret void } -;EG: {{^}}shl_v2i64: +;EG-LABEL: {{^}}shl_v2i64: ;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] ;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] ;EG-DAG: LSHR {{\*? 
*}}[[COMPSHA]] diff --git a/test/CodeGen/AMDGPU/sra.ll b/test/CodeGen/AMDGPU/sra.ll index bcbc32f4c053..3b59bbfb18c0 100644 --- a/test/CodeGen/AMDGPU/sra.ll +++ b/test/CodeGen/AMDGPU/sra.ll @@ -70,11 +70,11 @@ entry: ;EG-LABEL: {{^}}ashr_i64_2: ;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] ;EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} -;EG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 -;EG_CHECK-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +;EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +;EG-DAG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 ;EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]] -;EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}} -;EG-DAG: ASHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}} +;EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]|PS}}, {{[[OVERF]]|PV.[XYZW]}} +;EG-DAG: ASHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|PV.[XYZW]|[[SHIFT]]}} ;EG-DAG: ASHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal ;EG-DAG: ASHR {{\*? *}}[[HIBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal ;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal diff --git a/test/CodeGen/AMDGPU/srl.ll b/test/CodeGen/AMDGPU/srl.ll index ebb2f2db252e..bbd954356322 100644 --- a/test/CodeGen/AMDGPU/srl.ll +++ b/test/CodeGen/AMDGPU/srl.ll @@ -65,14 +65,14 @@ define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i ; EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] ; EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} -; EG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 ; EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +; EG-DAG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 ; EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]] -; EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}} -; EG-DAG: LSHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}} -; EG-DAG: LSHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}} +; EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]|PS}}, {{[[OVERF]]|PV.[XYZW]}} +; EG-DAG: LSHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]|PV\.[XYZW]}} ; EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal -; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} +; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]|PS}} +; EG-DAG: LSHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], [[SHIFT]] ; EG-DAG: CNDE_INT {{\*? 
*}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0 define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 diff --git a/test/CodeGen/AMDGPU/unsupported-cc.ll b/test/CodeGen/AMDGPU/unsupported-cc.ll index 8ab4faf2f145..d120111a71fb 100644 --- a/test/CodeGen/AMDGPU/unsupported-cc.ll +++ b/test/CodeGen/AMDGPU/unsupported-cc.ll @@ -3,8 +3,8 @@ ; These tests are for condition codes that are not supported by the hardware ; CHECK-LABEL: {{^}}slt: -; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 5(7.006492e-45) define void @slt(i32 addrspace(1)* %out, i32 %in) { entry: @@ -15,8 +15,8 @@ entry: } ; CHECK-LABEL: {{^}}ult_i32: -; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 5(7.006492e-45) define void @ult_i32(i32 addrspace(1)* %out, i32 %in) { entry: @@ -40,8 +40,8 @@ entry: } ; CHECK-LABEL: {{^}}ult_float_native: -; CHECK: SETGE T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x -; CHECK-NEXT: LSHR * +; CHECK: LSHR +; CHECK-NEXT: SETGE {{\*? *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, {{literal\.[xy]}} ; CHECK-NEXT: 1084227584(5.000000e+00) define void @ult_float_native(float addrspace(1)* %out, float %in) { entry: @@ -52,8 +52,8 @@ entry: } ; CHECK-LABEL: {{^}}olt: -; CHECK: SETGT T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR * +; CHECK: LSHR +; CHECK-NEXT: SETGT {{\*? *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 1084227584(5.000000e+00) define void @olt(float addrspace(1)* %out, float %in) { entry: @@ -64,8 +64,8 @@ entry: } ; CHECK-LABEL: {{^}}sle: -; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 6(8.407791e-45) define void @sle(i32 addrspace(1)* %out, i32 %in) { entry: @@ -76,8 +76,8 @@ entry: } ; CHECK-LABEL: {{^}}ule_i32: -; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR +; CHECK: LSHR +; CHECK-NEXT: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 6(8.407791e-45) define void @ule_i32(i32 addrspace(1)* %out, i32 %in) { entry: @@ -101,8 +101,8 @@ entry: } ; CHECK-LABEL: {{^}}ule_float_native: -; CHECK: SETGT T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x -; CHECK-NEXT: LSHR * +; CHECK: LSHR +; CHECK-NEXT: SETGT {{\*? *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, {{literal\.[xy]}} ; CHECK-NEXT: 1084227584(5.000000e+00) define void @ule_float_native(float addrspace(1)* %out, float %in) { entry: @@ -113,8 +113,8 @@ entry: } ; CHECK-LABEL: {{^}}ole: -; CHECK: SETGE T{{[0-9]\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR * +; CHECK: LSHR +; CHECK-NEXT: SETGE {{\*? *}}T{{[0-9]\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT:1084227584(5.000000e+00) define void @ole(float addrspace(1)* %out, float %in) { entry: diff --git a/test/CodeGen/AMDGPU/work-item-intrinsics.ll b/test/CodeGen/AMDGPU/work-item-intrinsics.ll index a704a23b0f92..f420ec9c7d23 100644 --- a/test/CodeGen/AMDGPU/work-item-intrinsics.ll +++ b/test/CodeGen/AMDGPU/work-item-intrinsics.ll @@ -7,7 +7,7 @@ ; FUNC-LABEL: {{^}}ngroups_x: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[0].X +; EG: MOV {{\*? 
*}}[[VAL]], KC0[0].X ; HSA: .amd_kernel_code_t @@ -38,7 +38,7 @@ entry: ; FUNC-LABEL: {{^}}ngroups_y: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[0].Y +; EG: MOV {{\*? *}}[[VAL]], KC0[0].Y ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 @@ -53,7 +53,7 @@ entry: ; FUNC-LABEL: {{^}}ngroups_z: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[0].Z +; EG: MOV {{\*? *}}[[VAL]], KC0[0].Z ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 @@ -68,7 +68,7 @@ entry: ; FUNC-LABEL: {{^}}global_size_x: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[0].W +; EG: MOV {{\*? *}}[[VAL]], KC0[0].W ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc @@ -83,7 +83,7 @@ entry: ; FUNC-LABEL: {{^}}global_size_y: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].X +; EG: MOV {{\*? *}}[[VAL]], KC0[1].X ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10 @@ -98,7 +98,7 @@ entry: ; FUNC-LABEL: {{^}}global_size_z: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].Y +; EG: MOV {{\*? *}}[[VAL]], KC0[1].Y ; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14 diff --git a/test/CodeGen/AMDGPU/xor.ll b/test/CodeGen/AMDGPU/xor.ll index ddb920af29d8..655655d92f08 100644 --- a/test/CodeGen/AMDGPU/xor.ll +++ b/test/CodeGen/AMDGPU/xor.ll @@ -38,7 +38,7 @@ define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in } ; FUNC-LABEL: {{^}}xor_i1: -; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}} +; EG: XOR_INT {{\** *}}{{T[0-9]+\.[XYZW]}}, {{PS|PV\.[XYZW]}}, {{PS|PV\.[XYZW]}} ; SI-DAG: v_cmp_le_f32_e32 [[CMP0:vcc]], 0, {{v[0-9]+}} ; SI-DAG: v_cmp_le_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 1.0, {{v[0-9]+}} From c445f0fb72e9028e9ec92924025317c70b667359 Mon Sep 17 00:00:00 2001 From: Quentin Colombet Date: Fri, 4 Dec 2015 01:53:14 +0000 Subject: [PATCH 048/364] [ARM] When a bitcast is about to be turned into a VMOVDRR, try to combine it with its source instead of forcing the values on GPRs. This improves the lowering of vector code when such bitcasts happen in the middle of vector computations. rdar://problem/23691584 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254684 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/ARMISelLowering.cpp | 55 ++++++++++++++++++++++ test/CodeGen/ARM/combine-vmovdrr.ll | 72 +++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+) create mode 100644 test/CodeGen/ARM/combine-vmovdrr.ll diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 33f74a3ba9fd..23f7bd0f4c8b 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -4139,6 +4139,56 @@ static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl &Results, Results.push_back(Read.getOperand(0)); } +/// \p BC is a bitcast that is about to be turned into a VMOVDRR. +/// When \p DstVT, the destination type of \p BC, is on the vector +/// register bank and the source of bitcast, \p Op, operates on the same bank, +/// it might be possible to combine them, such that everything stays on the +/// vector register bank. +/// \p return The node that would replace \p BT, if the combine +/// is possible. 
+static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, + SelectionDAG &DAG) { + SDValue Op = BC->getOperand(0); + EVT DstVT = BC->getValueType(0); + + // The only vector instruction that can produce a scalar (remember, + // since the bitcast was about to be turned into VMOVDRR, the source + // type is i64) from a vector is EXTRACT_VECTOR_ELT. + // Moreover, we can do this combine only if there is one use. + // Finally, if the destination type is not a vector, there is not + // much point on forcing everything on the vector bank. + if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !Op.hasOneUse()) + return SDValue(); + + // If the index is not constant, we will introduce an additional + // multiply that will stick. + // Give up in that case. + ConstantSDNode *Index = dyn_cast(Op.getOperand(1)); + if (!Index) + return SDValue(); + unsigned DstNumElt = DstVT.getVectorNumElements(); + + // Compute the new index. + const APInt &APIntIndex = Index->getAPIntValue(); + APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt); + NewIndex *= APIntIndex; + // Check if the new constant index fits into i32. + if (NewIndex.getBitWidth() > 32) + return SDValue(); + + // vMTy bitcast(i64 extractelt vNi64 src, i32 index) -> + // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M) + SDLoc dl(Op); + SDValue ExtractSrc = Op.getOperand(0); + EVT VecVT = EVT::getVectorVT( + *DAG.getContext(), DstVT.getScalarType(), + ExtractSrc.getValueType().getVectorNumElements() * DstNumElt); + SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast, + DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32)); +} + /// ExpandBITCAST - If the target supports VFP, this function is called to /// expand a bit convert where either the source or destination type is i64 to /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 @@ -4158,6 +4208,11 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) { // Turn i64->f64 into VMOVDRR. if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) { + // Do not force values to GPRs (this is what VMOVDRR does for the inputs) + // if we can combine the bitcast with its source. + if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG)) + return Val; + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, DAG.getConstant(0, dl, MVT::i32)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, diff --git a/test/CodeGen/ARM/combine-vmovdrr.ll b/test/CodeGen/ARM/combine-vmovdrr.ll new file mode 100644 index 000000000000..358f7e3a983e --- /dev/null +++ b/test/CodeGen/ARM/combine-vmovdrr.ll @@ -0,0 +1,72 @@ +; RUN: llc %s -o - | FileCheck %s + +target triple = "thumbv7s-apple-ios" + +declare <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %shuffle.i.i307, <8 x i8> %shuffle.i27.i308, <8 x i8> %vtbl2.i25.i) + +; Check that we get the motivating example: +; The bitcasts force the values to go through the GPRs, whereas +; they are defined on VPRs and used on VPRs. 
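+; A rough sketch of the combine (names here are illustrative, not part of
+; the test): for
+;   %hi = extractelement <2 x i64> %v, i32 1
+;   %lanes = bitcast i64 %hi to <8 x i8>
+; the DAG combine bitcasts the whole <2 x i64> source to <16 x i8> and
+; extracts the <8 x i8> subvector at index 1 * 8 = 8, so the value never
+; has to leave the vector register bank.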
+; +; CHECK-LABEL: motivatingExample: +; CHECK: vldr [[ARG2_VAL:d[0-9]+]], [r1] +; CHECK-NEXT: vld1.32 {[[ARG1_VALlo:d[0-9]+]], [[ARG1_VALhi:d[0-9]+]]}, [r0] +; CHECK-NEXT: vtbl.8 [[RES:d[0-9]+]], {[[ARG1_VALlo]], [[ARG1_VALhi]]}, [[ARG2_VAL]] +; CHECK-NEXT: vstr [[RES]], [r1] +; CHECK-NEXT: bx lr +define void @motivatingExample(<2 x i64>* %addr, <8 x i8>* %addr2) { + %shuffle.i.bc.i309 = load <2 x i64>, <2 x i64>* %addr + %vtbl2.i25.i = load <8 x i8>, <8 x i8>* %addr2 + %shuffle.i.extract.i310 = extractelement <2 x i64> %shuffle.i.bc.i309, i32 0 + %shuffle.i27.extract.i311 = extractelement <2 x i64> %shuffle.i.bc.i309, i32 1 + %tmp45 = bitcast i64 %shuffle.i.extract.i310 to <8 x i8> + %tmp46 = bitcast i64 %shuffle.i27.extract.i311 to <8 x i8> + %vtbl2.i25.i313 = tail call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %tmp45, <8 x i8> %tmp46, <8 x i8> %vtbl2.i25.i) + store <8 x i8> %vtbl2.i25.i313, <8 x i8>* %addr2 + ret void +} + +; Check that we do not perform the transformation for dynamic index. +; CHECK-LABEL: dynamicIndex: +; CHECK-NOT: mul +; CHECK: pop +define void @dynamicIndex(<2 x i64>* %addr, <8 x i8>* %addr2, i32 %index) { + %shuffle.i.bc.i309 = load <2 x i64>, <2 x i64>* %addr + %vtbl2.i25.i = load <8 x i8>, <8 x i8>* %addr2 + %shuffle.i.extract.i310 = extractelement <2 x i64> %shuffle.i.bc.i309, i32 %index + %shuffle.i27.extract.i311 = extractelement <2 x i64> %shuffle.i.bc.i309, i32 1 + %tmp45 = bitcast i64 %shuffle.i.extract.i310 to <8 x i8> + %tmp46 = bitcast i64 %shuffle.i27.extract.i311 to <8 x i8> + %vtbl2.i25.i313 = tail call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %tmp45, <8 x i8> %tmp46, <8 x i8> %vtbl2.i25.i) + store <8 x i8> %vtbl2.i25.i313, <8 x i8>* %addr2 + ret void +} + +; Check that we do not perform the transformation when there are several uses +; of the result of the bitcast. +; CHECK-LABEL: severalUses: +; ARG1_VALlo is hard coded because we need to access the high part of d0, +; i.e., s1, and we can't express that with filecheck. +; CHECK: vld1.32 {[[ARG1_VALlo:d0]], [[ARG1_VALhi:d[0-9]+]]}, [r0] +; CHECK-NEXT: vldr [[ARG2_VAL:d[0-9]+]], [r1] +; s1 is actually 2 * ARG1_VALlo + 1, but we cannot express that with filecheck. +; CHECK-NEXT: vmov [[REThi:r[0-9]+]], s1 +; We build the return value here. s0 is 2 * ARG1_VALlo. +; CHECK-NEXT: vmov r0, s0 +; This copy is correct but actually useless. We should be able to clean it up. +; CHECK-NEXT: vmov [[ARG1_VALloCPY:d[0-9]+]], r0, [[REThi]] +; CHECK-NEXT: vtbl.8 [[RES:d[0-9]+]], {[[ARG1_VALloCPY]], [[ARG1_VALhi]]}, [[ARG2_VAL]] +; CHECK-NEXT: vstr [[RES]], [r1] +; CHECK-NEXT: mov r1, [[REThi]] +; CHECK-NEXT: bx lr +define i64 @severalUses(<2 x i64>* %addr, <8 x i8>* %addr2) { + %shuffle.i.bc.i309 = load <2 x i64>, <2 x i64>* %addr + %vtbl2.i25.i = load <8 x i8>, <8 x i8>* %addr2 + %shuffle.i.extract.i310 = extractelement <2 x i64> %shuffle.i.bc.i309, i32 0 + %shuffle.i27.extract.i311 = extractelement <2 x i64> %shuffle.i.bc.i309, i32 1 + %tmp45 = bitcast i64 %shuffle.i.extract.i310 to <8 x i8> + %tmp46 = bitcast i64 %shuffle.i27.extract.i311 to <8 x i8> + %vtbl2.i25.i313 = tail call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %tmp45, <8 x i8> %tmp46, <8 x i8> %vtbl2.i25.i) + store <8 x i8> %vtbl2.i25.i313, <8 x i8>* %addr2 + ret i64 %shuffle.i.extract.i310 +} From a9a96c1f37545082343ccb7cfee7f97f8f7a76be Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Fri, 4 Dec 2015 02:00:12 +0000 Subject: [PATCH 049/364] Move llvm/test/CodeGen/Generic/function-alias.ll to X86. It is incompatible to PECOFF. 
FIXME: It may be ELF-generic. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254685 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/{Generic => X86}/function-alias.ll | 2 ++ 1 file changed, 2 insertions(+) rename test/CodeGen/{Generic => X86}/function-alias.ll (73%) diff --git a/test/CodeGen/Generic/function-alias.ll b/test/CodeGen/X86/function-alias.ll similarity index 73% rename from test/CodeGen/Generic/function-alias.ll rename to test/CodeGen/X86/function-alias.ll index 7eec5be198b0..d68d75d5578a 100644 --- a/test/CodeGen/Generic/function-alias.ll +++ b/test/CodeGen/X86/function-alias.ll @@ -1,4 +1,6 @@ ; RUN: llc < %s | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" ; "data" constant @0 = private constant <{ i8, i8 }> <{i8 15, i8 11}>, section ".text" From d8ba62ab992b9e142e002aee220354c9f09d92b7 Mon Sep 17 00:00:00 2001 From: Junmo Park Date: Fri, 4 Dec 2015 02:06:59 +0000 Subject: [PATCH 050/364] git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254686 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/BranchFolding.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp index c6a6476747e6..e41926a819c2 100644 --- a/lib/CodeGen/BranchFolding.cpp +++ b/lib/CodeGen/BranchFolding.cpp @@ -371,7 +371,7 @@ static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1, } // Back past possible debugging pseudos at beginning of block. This matters // when one block differs from the other only by whether debugging pseudos - // are present at the beginning. (This way, the various checks later for + // are present at the beginning. (This way, the various checks later for // I1==MBB1->begin() work as expected.) if (I1 == MBB1->begin() && I2 != MBB2->begin()) { --I2; From 8061fe5c678f809e69acc199fe68dc5794601503 Mon Sep 17 00:00:00 2001 From: Nathan Slingerland Date: Fri, 4 Dec 2015 02:13:58 +0000 Subject: [PATCH 051/364] Revert "[llvm-profdata] Add support for weighted merge of profile data" This reverts commit b7250858d96b8ce567681214273ac0e62713c661. Reverting in order to investigate Windows test failure. 
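For reference, the reverted change let each input profile be scaled by a
positive integer weight appended to its file name (the file names below are
illustrative; the syntax is the one removed from the docs and tests in this
revert):

  llvm-profdata merge --instr foo.profdata:3 bar.profdata:5 -o merged.profdata
  llvm-profdata merge --sample --text foo.proftext:1 bar.proftext:1 -o -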
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254687 91177308-0d34-0410-b5e6-96231b3b80d8 --- docs/CommandGuide/llvm-profdata.rst | 6 +- include/llvm/ProfileData/InstrProf.h | 31 ++------ include/llvm/ProfileData/InstrProfWriter.h | 4 +- include/llvm/ProfileData/SampleProf.h | 54 +++++--------- lib/ProfileData/InstrProfWriter.cpp | 14 +--- .../Inputs/weight-instr-bar.profdata | Bin 1320 -> 0 bytes .../Inputs/weight-instr-foo.profdata | Bin 1320 -> 0 bytes .../Inputs/weight-sample-bar.proftext | 8 -- .../Inputs/weight-sample-foo.proftext | 8 -- test/tools/llvm-profdata/weight-instr.test | 55 -------------- test/tools/llvm-profdata/weight-sample.test | 43 ----------- tools/llvm-profdata/llvm-profdata.cpp | 70 ++++-------------- unittests/ProfileData/InstrProfTest.cpp | 20 ----- 13 files changed, 47 insertions(+), 266 deletions(-) delete mode 100644 test/tools/llvm-profdata/Inputs/weight-instr-bar.profdata delete mode 100644 test/tools/llvm-profdata/Inputs/weight-instr-foo.profdata delete mode 100644 test/tools/llvm-profdata/Inputs/weight-sample-bar.proftext delete mode 100644 test/tools/llvm-profdata/Inputs/weight-sample-foo.proftext delete mode 100644 test/tools/llvm-profdata/weight-instr.test delete mode 100644 test/tools/llvm-profdata/weight-sample.test diff --git a/docs/CommandGuide/llvm-profdata.rst b/docs/CommandGuide/llvm-profdata.rst index a4b18f301e42..210826a7babc 100644 --- a/docs/CommandGuide/llvm-profdata.rst +++ b/docs/CommandGuide/llvm-profdata.rst @@ -28,7 +28,7 @@ MERGE SYNOPSIS ^^^^^^^^ -:program:`llvm-profdata merge` [*options*] [*filename[:weight]...*] +:program:`llvm-profdata merge` [*options*] [*filenames...*] DESCRIPTION ^^^^^^^^^^^ @@ -37,10 +37,6 @@ DESCRIPTION generated by PGO instrumentation and merges them together into a single indexed profile data file. -The profile counts in each input file can be scaled (multiplied) by specifying -``:``, where `` is a decimal integer >= 1. -A default weight of 1 is assumed if only `` is given. - OPTIONS ^^^^^^^ diff --git a/include/llvm/ProfileData/InstrProf.h b/include/llvm/ProfileData/InstrProf.h index e1ed2e9ce48c..956485119102 100644 --- a/include/llvm/ProfileData/InstrProf.h +++ b/include/llvm/ProfileData/InstrProf.h @@ -218,8 +218,7 @@ struct InstrProfValueSiteRecord { } /// Merge data from another InstrProfValueSiteRecord - /// Optionally scale merged counts by \p Weight. - void mergeValueData(InstrProfValueSiteRecord &Input, uint64_t Weight = 1) { + void mergeValueData(InstrProfValueSiteRecord &Input) { this->sortByTargetValues(); Input.sortByTargetValues(); auto I = ValueData.begin(); @@ -229,11 +228,7 @@ struct InstrProfValueSiteRecord { while (I != IE && I->Value < J->Value) ++I; if (I != IE && I->Value == J->Value) { - // TODO: Check for counter overflow and return error if it occurs. - uint64_t JCount = J->Count; - if (Weight > 1) - JCount = SaturatingMultiply(JCount, Weight); - I->Count = SaturatingAdd(I->Count, JCount); + I->Count = SaturatingAdd(I->Count, J->Count); ++I; continue; } @@ -279,8 +274,7 @@ struct InstrProfRecord { ValueMapType *HashKeys); /// Merge the counts in \p Other into this one. - /// Optionally scale merged counts by \p Weight. - inline instrprof_error merge(InstrProfRecord &Other, uint64_t Weight = 1); + inline instrprof_error merge(InstrProfRecord &Other); /// Used by InstrProfWriter: update the value strings to commoned strings in /// the writer instance. @@ -332,9 +326,7 @@ struct InstrProfRecord { } // Merge Value Profile data from Src record to this record for ValueKind. 
- // Scale merged value counts by \p Weight. - instrprof_error mergeValueProfData(uint32_t ValueKind, InstrProfRecord &Src, - uint64_t Weight) { + instrprof_error mergeValueProfData(uint32_t ValueKind, InstrProfRecord &Src) { uint32_t ThisNumValueSites = getNumValueSites(ValueKind); uint32_t OtherNumValueSites = Src.getNumValueSites(ValueKind); if (ThisNumValueSites != OtherNumValueSites) @@ -344,7 +336,7 @@ struct InstrProfRecord { std::vector &OtherSiteRecords = Src.getValueSitesForKind(ValueKind); for (uint32_t I = 0; I < ThisNumValueSites; I++) - ThisSiteRecords[I].mergeValueData(OtherSiteRecords[I], Weight); + ThisSiteRecords[I].mergeValueData(OtherSiteRecords[I]); return instrprof_error::success; } }; @@ -430,8 +422,7 @@ void InstrProfRecord::updateStrings(InstrProfStringTable *StrTab) { VData.Value = (uint64_t)StrTab->insertString((const char *)VData.Value); } -instrprof_error InstrProfRecord::merge(InstrProfRecord &Other, - uint64_t Weight) { +instrprof_error InstrProfRecord::merge(InstrProfRecord &Other) { // If the number of counters doesn't match we either have bad data // or a hash collision. if (Counts.size() != Other.Counts.size()) @@ -441,19 +432,13 @@ instrprof_error InstrProfRecord::merge(InstrProfRecord &Other, for (size_t I = 0, E = Other.Counts.size(); I < E; ++I) { bool ResultOverflowed; - uint64_t OtherCount = Other.Counts[I]; - if (Weight > 1) { - OtherCount = SaturatingMultiply(OtherCount, Weight, ResultOverflowed); - if (ResultOverflowed) - Result = instrprof_error::counter_overflow; - } - Counts[I] = SaturatingAdd(Counts[I], OtherCount, ResultOverflowed); + Counts[I] = SaturatingAdd(Counts[I], Other.Counts[I], ResultOverflowed); if (ResultOverflowed) Result = instrprof_error::counter_overflow; } for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) { - instrprof_error MergeValueResult = mergeValueProfData(Kind, Other, Weight); + instrprof_error MergeValueResult = mergeValueProfData(Kind, Other); if (MergeValueResult != instrprof_error::success) Result = MergeValueResult; } diff --git a/include/llvm/ProfileData/InstrProfWriter.h b/include/llvm/ProfileData/InstrProfWriter.h index 1958d5f232e7..d026e08ec861 100644 --- a/include/llvm/ProfileData/InstrProfWriter.h +++ b/include/llvm/ProfileData/InstrProfWriter.h @@ -39,8 +39,8 @@ class InstrProfWriter { void updateStringTableReferences(InstrProfRecord &I); /// Add function counts for the given function. If there are already counts /// for this function and the hash and number of counts match, each counter is - /// summed. Optionally scale counts by \p Weight. - std::error_code addRecord(InstrProfRecord &&I, uint64_t Weight = 1); + /// summed. + std::error_code addRecord(InstrProfRecord &&I); /// Write the profile to \c OS void write(raw_fd_ostream &OS); /// Write the profile in text format to \c OS diff --git a/include/llvm/ProfileData/SampleProf.h b/include/llvm/ProfileData/SampleProf.h index 3337f4d7df5c..a7b22c735480 100644 --- a/include/llvm/ProfileData/SampleProf.h +++ b/include/llvm/ProfileData/SampleProf.h @@ -173,25 +173,19 @@ class SampleRecord { SampleRecord() : NumSamples(0), CallTargets() {} /// Increment the number of samples for this record by \p S. - /// Optionally scale sample count \p S by \p Weight. /// /// Sample counts accumulate using saturating arithmetic, to avoid wrapping /// around unsigned integers. 
- void addSamples(uint64_t S, uint64_t Weight = 1) { - if (Weight > 1) - S = SaturatingMultiply(S, Weight); + void addSamples(uint64_t S) { NumSamples = SaturatingAdd(NumSamples, S); } /// Add called function \p F with samples \p S. - /// Optionally scale sample count \p S by \p Weight. /// /// Sample counts accumulate using saturating arithmetic, to avoid wrapping /// around unsigned integers. - void addCalledTarget(StringRef F, uint64_t S, uint64_t Weight = 1) { + void addCalledTarget(StringRef F, uint64_t S) { uint64_t &TargetSamples = CallTargets[F]; - if (Weight > 1) - S = SaturatingMultiply(S, Weight); TargetSamples = SaturatingAdd(TargetSamples, S); } @@ -202,11 +196,10 @@ class SampleRecord { const CallTargetMap &getCallTargets() const { return CallTargets; } /// Merge the samples in \p Other into this record. - /// Optionally scale sample counts by \p Weight. - void merge(const SampleRecord &Other, uint64_t Weight = 1) { - addSamples(Other.getSamples(), Weight); + void merge(const SampleRecord &Other) { + addSamples(Other.getSamples()); for (const auto &I : Other.getCallTargets()) - addCalledTarget(I.first(), I.second, Weight); + addCalledTarget(I.first(), I.second); } void print(raw_ostream &OS, unsigned Indent) const; @@ -233,26 +226,16 @@ class FunctionSamples { FunctionSamples() : TotalSamples(0), TotalHeadSamples(0) {} void print(raw_ostream &OS = dbgs(), unsigned Indent = 0) const; void dump() const; - void addTotalSamples(uint64_t Num, uint64_t Weight = 1) { - if (Weight > 1) - Num = SaturatingMultiply(Num, Weight); - TotalSamples += Num; - } - void addHeadSamples(uint64_t Num, uint64_t Weight = 1) { - if (Weight > 1) - Num = SaturatingMultiply(Num, Weight); - TotalHeadSamples += Num; - } - void addBodySamples(uint32_t LineOffset, uint32_t Discriminator, uint64_t Num, - uint64_t Weight = 1) { - BodySamples[LineLocation(LineOffset, Discriminator)].addSamples(Num, - Weight); + void addTotalSamples(uint64_t Num) { TotalSamples += Num; } + void addHeadSamples(uint64_t Num) { TotalHeadSamples += Num; } + void addBodySamples(uint32_t LineOffset, uint32_t Discriminator, + uint64_t Num) { + BodySamples[LineLocation(LineOffset, Discriminator)].addSamples(Num); } void addCalledTargetSamples(uint32_t LineOffset, uint32_t Discriminator, - std::string FName, uint64_t Num, - uint64_t Weight = 1) { - BodySamples[LineLocation(LineOffset, Discriminator)].addCalledTarget( - FName, Num, Weight); + std::string FName, uint64_t Num) { + BodySamples[LineLocation(LineOffset, Discriminator)].addCalledTarget(FName, + Num); } /// Return the number of samples collected at the given location. @@ -301,19 +284,18 @@ class FunctionSamples { } /// Merge the samples in \p Other into this one. - /// Optionally scale samples by \p Weight. 
- void merge(const FunctionSamples &Other, uint64_t Weight = 1) { - addTotalSamples(Other.getTotalSamples(), Weight); - addHeadSamples(Other.getHeadSamples(), Weight); + void merge(const FunctionSamples &Other) { + addTotalSamples(Other.getTotalSamples()); + addHeadSamples(Other.getHeadSamples()); for (const auto &I : Other.getBodySamples()) { const LineLocation &Loc = I.first; const SampleRecord &Rec = I.second; - BodySamples[Loc].merge(Rec, Weight); + BodySamples[Loc].merge(Rec); } for (const auto &I : Other.getCallsiteSamples()) { const CallsiteLocation &Loc = I.first; const FunctionSamples &Rec = I.second; - functionSamplesAt(Loc).merge(Rec, Weight); + functionSamplesAt(Loc).merge(Rec); } } diff --git a/lib/ProfileData/InstrProfWriter.cpp b/lib/ProfileData/InstrProfWriter.cpp index 2261c92f03a9..78bec012eeb2 100644 --- a/lib/ProfileData/InstrProfWriter.cpp +++ b/lib/ProfileData/InstrProfWriter.cpp @@ -98,8 +98,7 @@ void InstrProfWriter::updateStringTableReferences(InstrProfRecord &I) { I.updateStrings(&StringTable); } -std::error_code InstrProfWriter::addRecord(InstrProfRecord &&I, - uint64_t Weight) { +std::error_code InstrProfWriter::addRecord(InstrProfRecord &&I) { updateStringTableReferences(I); auto &ProfileDataMap = FunctionData[I.Name]; @@ -114,18 +113,9 @@ std::error_code InstrProfWriter::addRecord(InstrProfRecord &&I, // We've never seen a function with this name and hash, add it. Dest = std::move(I); Result = instrprof_error::success; - if (Weight > 1) { - for (auto &Count : Dest.Counts) { - bool Overflowed; - Count = SaturatingMultiply(Count, Weight, Overflowed); - if (Overflowed && Result == instrprof_error::success) { - Result = instrprof_error::counter_overflow; - } - } - } } else { // We're updating a function we've seen before. - Result = Dest.merge(I, Weight); + Result = Dest.merge(I); } // We keep track of the max function count as we go for simplicity. diff --git a/test/tools/llvm-profdata/Inputs/weight-instr-bar.profdata b/test/tools/llvm-profdata/Inputs/weight-instr-bar.profdata deleted file mode 100644 index 4ed07660f654090e750b19be4e0af609bc1c61db..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1320 zcmeyLQ&5zjmf6V600ExHYmK2yFeL$%U}Tt_rlBWzFff!ADy;yeON$fJQ=x1IMi>K1 zb3kcEhBbR7Zu=bi5d*Wx04kG~pWnpK1 zb3kcEhBbR7Zu=bi5d*Wx04kG~pWnpjI*ICuRlWs>LPl7M$03g{ F3;+VlD0u(? diff --git a/test/tools/llvm-profdata/Inputs/weight-sample-bar.proftext b/test/tools/llvm-profdata/Inputs/weight-sample-bar.proftext deleted file mode 100644 index a910f745e6c7..000000000000 --- a/test/tools/llvm-profdata/Inputs/weight-sample-bar.proftext +++ /dev/null @@ -1,8 +0,0 @@ -bar:1772037:35370 - 17: 35370 - 18: 35370 - 19: 7005 - 20: 29407 - 21: 12170 - 23: 18150 bar:19829 - 25: 36666 diff --git a/test/tools/llvm-profdata/Inputs/weight-sample-foo.proftext b/test/tools/llvm-profdata/Inputs/weight-sample-foo.proftext deleted file mode 100644 index 155ec5d00315..000000000000 --- a/test/tools/llvm-profdata/Inputs/weight-sample-foo.proftext +++ /dev/null @@ -1,8 +0,0 @@ -foo:1763288:35327 - 7: 35327 - 8: 35327 - 9: 6930 - 10: 29341 - 11: 11906 - 13: 18185 foo:19531 - 15: 36458 diff --git a/test/tools/llvm-profdata/weight-instr.test b/test/tools/llvm-profdata/weight-instr.test deleted file mode 100644 index bc0b5061647f..000000000000 --- a/test/tools/llvm-profdata/weight-instr.test +++ /dev/null @@ -1,55 +0,0 @@ -Tests for weighted merge of instrumented profiles. 
- -1- Merge the foo and bar profiles with unity weight and verify the combined output -RUN: llvm-profdata merge --instr %p/Inputs/weight-instr-bar.profdata:1 %p/Inputs/weight-instr-foo.profdata:1 -o %t -RUN: llvm-profdata show --instr -all-functions %t | FileCheck %s --check-prefix=WEIGHT1 -WEIGHT1: Counters: -WEIGHT1: usage: -WEIGHT1: Hash: 0x0000000000000000 -WEIGHT1: Counters: 1 -WEIGHT1: Function count: 0 -WEIGHT1: foo: -WEIGHT1: Hash: 0x000000000000028a -WEIGHT1: Counters: 3 -WEIGHT1: Function count: 866988873 -WEIGHT1: bar: -WEIGHT1: Hash: 0x000000000000028a -WEIGHT1: Counters: 3 -WEIGHT1: Function count: 866988873 -WEIGHT1: main: -WEIGHT1: Hash: 0x7d31c47ea98f8248 -WEIGHT1: Counters: 60 -WEIGHT1: Function count: 2 -WEIGHT1: Functions shown: 4 -WEIGHT1: Total functions: 4 -WEIGHT1: Maximum function count: 866988873 -WEIGHT1: Maximum internal block count: 267914296 - -2- Merge the foo and bar profiles with weight 3x and 5x respectively and verify the combined output -RUN: llvm-profdata merge --instr %p/Inputs/weight-instr-bar.profdata:3 %p/Inputs/weight-instr-foo.profdata:5 -o %t -RUN: llvm-profdata show --instr -all-functions %t | FileCheck %s --check-prefix=WEIGHT2 -WEIGHT2: Counters: -WEIGHT2: usage: -WEIGHT2: Hash: 0x0000000000000000 -WEIGHT2: Counters: 1 -WEIGHT2: Function count: 0 -WEIGHT2: foo: -WEIGHT2: Hash: 0x000000000000028a -WEIGHT2: Counters: 3 -WEIGHT2: Function count: 4334944365 -WEIGHT2: bar: -WEIGHT2: Hash: 0x000000000000028a -WEIGHT2: Counters: 3 -WEIGHT2: Function count: 2600966619 -WEIGHT2: main: -WEIGHT2: Hash: 0x7d31c47ea98f8248 -WEIGHT2: Counters: 60 -WEIGHT2: Function count: 8 -WEIGHT2: Functions shown: 4 -WEIGHT2: Total functions: 4 -WEIGHT2: Maximum function count: 4334944365 -WEIGHT2: Maximum internal block count: 1339571480 - -3- Bad merge: foo and bar profiles with invalid weights -RUN: not llvm-profdata merge --instr %p/Inputs/weight-instr-bar.profdata:3 %p/Inputs/weight-instr-foo.profdata:-5 -o %t.out 2>&1 | FileCheck %s --check-prefix=ERROR3 -ERROR3: error: Input weight must be a positive integer. diff --git a/test/tools/llvm-profdata/weight-sample.test b/test/tools/llvm-profdata/weight-sample.test deleted file mode 100644 index a1fe1df1b6de..000000000000 --- a/test/tools/llvm-profdata/weight-sample.test +++ /dev/null @@ -1,43 +0,0 @@ -Tests for weighted merge of sample profiles. 
- -1- Merge the foo and bar profiles with unity weight and verify the combined output -RUN: llvm-profdata merge --sample --text %p/Inputs/weight-sample-bar.proftext:1 %p/Inputs/weight-sample-foo.proftext:1 -o - | FileCheck %s --check-prefix=WEIGHT1 -WEIGHT1: foo:1763288:35327 -WEIGHT1: 7: 35327 -WEIGHT1: 8: 35327 -WEIGHT1: 9: 6930 -WEIGHT1: 10: 29341 -WEIGHT1: 11: 11906 -WEIGHT1: 13: 18185 foo:19531 -WEIGHT1: 15: 36458 -WEIGHT1: bar:1772037:35370 -WEIGHT1: 17: 35370 -WEIGHT1: 18: 35370 -WEIGHT1: 19: 7005 -WEIGHT1: 20: 29407 -WEIGHT1: 21: 12170 -WEIGHT1: 23: 18150 bar:19829 -WEIGHT1: 25: 36666 - -2- Merge the foo and bar profiles with weight 3x and 5x respectively and verify the combined output -RUN: llvm-profdata merge --sample --text %p/Inputs/weight-sample-bar.proftext:3 %p/Inputs/weight-sample-foo.proftext:5 -o - | FileCheck %s --check-prefix=WEIGHT2 -WEIGHT2: foo:8816440:176635 -WEIGHT2: 7: 176635 -WEIGHT2: 8: 176635 -WEIGHT2: 9: 34650 -WEIGHT2: 10: 146705 -WEIGHT2: 11: 59530 -WEIGHT2: 13: 90925 foo:97655 -WEIGHT2: 15: 182290 -WEIGHT2: bar:5316111:106110 -WEIGHT2: 17: 106110 -WEIGHT2: 18: 106110 -WEIGHT2: 19: 21015 -WEIGHT2: 20: 88221 -WEIGHT2: 21: 36510 -WEIGHT2: 23: 54450 bar:59487 -WEIGHT2: 25: 109998 - -3- Bad merge: foo and bar profiles with invalid weights -RUN: not llvm-profdata merge --sample --text %p/Inputs/weight-sample-bar.proftext:3 %p/Inputs/weight-sample-foo.proftext:-5 -o %t.out 2>&1 | FileCheck %s --check-prefix=ERROR3 -ERROR3: error: Input weight must be a positive integer. diff --git a/tools/llvm-profdata/llvm-profdata.cpp b/tools/llvm-profdata/llvm-profdata.cpp index 56c80f518ec4..10b6855233d5 100644 --- a/tools/llvm-profdata/llvm-profdata.cpp +++ b/tools/llvm-profdata/llvm-profdata.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/LLVMContext.h" #include "llvm/ProfileData/InstrProfReader.h" @@ -28,7 +27,6 @@ #include "llvm/Support/PrettyStackTrace.h" #include "llvm/Support/Signals.h" #include "llvm/Support/raw_ostream.h" -#include using namespace llvm; @@ -95,17 +93,7 @@ static void handleMergeWriterError(std::error_code &Error, } } -struct WeightedFile { - StringRef Filename; - uint64_t Weight; - - WeightedFile() {} - - WeightedFile(StringRef F, uint64_t W) : Filename{F}, Weight{W} {} -}; -typedef SmallVector WeightedFileVector; - -static void mergeInstrProfile(const WeightedFileVector &Inputs, +static void mergeInstrProfile(const cl::list &Inputs, StringRef OutputFilename, ProfileFormat OutputFormat) { if (OutputFilename.compare("-") == 0) @@ -121,21 +109,21 @@ static void mergeInstrProfile(const WeightedFileVector &Inputs, InstrProfWriter Writer; SmallSet WriterErrorCodes; - for (const auto &Input : Inputs) { - auto ReaderOrErr = InstrProfReader::create(Input.Filename); + for (const auto &Filename : Inputs) { + auto ReaderOrErr = InstrProfReader::create(Filename); if (std::error_code ec = ReaderOrErr.getError()) - exitWithErrorCode(ec, Input.Filename); + exitWithErrorCode(ec, Filename); auto Reader = std::move(ReaderOrErr.get()); for (auto &I : *Reader) { - if (std::error_code EC = Writer.addRecord(std::move(I), Input.Weight)) { + if (std::error_code EC = Writer.addRecord(std::move(I))) { // Only show hint the first time an error occurs. 
bool firstTime = WriterErrorCodes.insert(EC).second; - handleMergeWriterError(EC, Input.Filename, I.Name, firstTime); + handleMergeWriterError(EC, Filename, I.Name, firstTime); } } if (Reader->hasError()) - exitWithErrorCode(Reader->getError(), Input.Filename); + exitWithErrorCode(Reader->getError(), Filename); } if (OutputFormat == PF_Text) Writer.writeText(Output); @@ -147,7 +135,7 @@ static sampleprof::SampleProfileFormat FormatMap[] = { sampleprof::SPF_None, sampleprof::SPF_Text, sampleprof::SPF_Binary, sampleprof::SPF_GCC}; -static void mergeSampleProfile(const WeightedFileVector &Inputs, +static void mergeSampleProfile(const cl::list &Inputs, StringRef OutputFilename, ProfileFormat OutputFormat) { using namespace sampleprof; @@ -159,11 +147,11 @@ static void mergeSampleProfile(const WeightedFileVector &Inputs, auto Writer = std::move(WriterOrErr.get()); StringMap ProfileMap; SmallVector, 5> Readers; - for (const auto &Input : Inputs) { + for (const auto &Filename : Inputs) { auto ReaderOrErr = - SampleProfileReader::create(Input.Filename, getGlobalContext()); + SampleProfileReader::create(Filename, getGlobalContext()); if (std::error_code EC = ReaderOrErr.getError()) - exitWithErrorCode(EC, Input.Filename); + exitWithErrorCode(EC, Filename); // We need to keep the readers around until after all the files are // read so that we do not lose the function names stored in each @@ -172,7 +160,7 @@ static void mergeSampleProfile(const WeightedFileVector &Inputs, Readers.push_back(std::move(ReaderOrErr.get())); const auto Reader = Readers.back().get(); if (std::error_code EC = Reader->read()) - exitWithErrorCode(EC, Input.Filename); + exitWithErrorCode(EC, Filename); StringMap &Profiles = Reader->getProfiles(); for (StringMap::iterator I = Profiles.begin(), @@ -180,38 +168,15 @@ static void mergeSampleProfile(const WeightedFileVector &Inputs, I != E; ++I) { StringRef FName = I->first(); FunctionSamples &Samples = I->second; - ProfileMap[FName].merge(Samples, Input.Weight); + ProfileMap[FName].merge(Samples); } } Writer->write(ProfileMap); } -static void parseInputFiles(const cl::list &Inputs, - WeightedFileVector &WeightedInputs) { - WeightedInputs.reserve(Inputs.size()); - - for (StringRef Input : Inputs) { - StringRef FileName; - StringRef WeightStr; - std::tie(FileName, WeightStr) = Input.rsplit(':'); - if (WeightStr.empty() || sys::fs::exists(Input)) { - // No weight specified or valid path containing delimiter. - WeightedInputs.push_back(WeightedFile(Input, 1)); - } else { - // Input weight specified. - uint64_t Weight; - if (WeightStr.getAsInteger(10, Weight) || Weight < 1) { - // Invalid input weight. 
- exitWithError("Input weight must be a positive integer."); - } - WeightedInputs.push_back(WeightedFile(FileName, Weight)); - } - } -} - static int merge_main(int argc, const char *argv[]) { cl::list Inputs(cl::Positional, cl::Required, cl::OneOrMore, - cl::desc("")); + cl::desc("")); cl::opt OutputFilename("output", cl::value_desc("output"), cl::init("-"), cl::Required, @@ -233,13 +198,10 @@ static int merge_main(int argc, const char *argv[]) { cl::ParseCommandLineOptions(argc, argv, "LLVM profile data merger\n"); - WeightedFileVector WeightedInputs; - parseInputFiles(Inputs, WeightedInputs); - if (ProfileKind == instr) - mergeInstrProfile(WeightedInputs, OutputFilename, OutputFormat); + mergeInstrProfile(Inputs, OutputFilename, OutputFormat); else - mergeSampleProfile(WeightedInputs, OutputFilename, OutputFormat); + mergeSampleProfile(Inputs, OutputFilename, OutputFormat); return 0; } diff --git a/unittests/ProfileData/InstrProfTest.cpp b/unittests/ProfileData/InstrProfTest.cpp index 946afdadba93..635a5431a513 100644 --- a/unittests/ProfileData/InstrProfTest.cpp +++ b/unittests/ProfileData/InstrProfTest.cpp @@ -490,24 +490,4 @@ TEST_F(InstrProfTest, get_max_function_count) { ASSERT_EQ(1ULL << 63, Reader->getMaximumFunctionCount()); } -TEST_F(InstrProfTest, get_weighted_function_counts) { - InstrProfRecord Record1("foo", 0x1234, {1, 2}); - InstrProfRecord Record2("foo", 0x1235, {3, 4}); - Writer.addRecord(std::move(Record1), 3); - Writer.addRecord(std::move(Record2), 5); - auto Profile = Writer.writeBuffer(); - readProfile(std::move(Profile)); - - std::vector Counts; - ASSERT_TRUE(NoError(Reader->getFunctionCounts("foo", 0x1234, Counts))); - ASSERT_EQ(2U, Counts.size()); - ASSERT_EQ(3U, Counts[0]); - ASSERT_EQ(6U, Counts[1]); - - ASSERT_TRUE(NoError(Reader->getFunctionCounts("foo", 0x1235, Counts))); - ASSERT_EQ(2U, Counts.size()); - ASSERT_EQ(15U, Counts[0]); - ASSERT_EQ(20U, Counts[1]); -} - } // end anonymous namespace From d8def4abd185c430d7649be9347778612c296871 Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Fri, 4 Dec 2015 02:14:34 +0000 Subject: [PATCH 052/364] IR: Use format_hex instead of handrolling the conversion. NFC Cleans up some very old code in AsmWriter's WriteConstantInternal. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254688 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/IR/AsmWriter.cpp | 64 ++++++++++++-------------------------------- 1 file changed, 17 insertions(+), 47 deletions(-) diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp index cb9a792c598b..fae1ebee5f2a 100644 --- a/lib/IR/AsmWriter.cpp +++ b/lib/IR/AsmWriter.cpp @@ -39,6 +39,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/Dwarf.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" @@ -1137,15 +1138,12 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, // x86, so we must not use these types. static_assert(sizeof(double) == sizeof(uint64_t), "assuming that double is 64 bits!"); - char Buffer[40]; APFloat apf = CFP->getValueAPF(); // Floats are represented in ASCII IR as double, convert. 
if (!isDouble) apf.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &ignored); - Out << "0x" << - utohex_buffer(uint64_t(apf.bitcastToAPInt().getZExtValue()), - Buffer+40); + Out << format_hex(apf.bitcastToAPInt().getZExtValue(), 0, /*Upper=*/true); return; } @@ -1153,60 +1151,32 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, // These appear as a magic letter identifying the type, then a // fixed number of hex digits. Out << "0x"; - // Bit position, in the current word, of the next nibble to print. - int shiftcount; - + APInt API = CFP->getValueAPF().bitcastToAPInt(); if (&CFP->getValueAPF().getSemantics() == &APFloat::x87DoubleExtended) { Out << 'K'; - // api needed to prevent premature destruction - APInt api = CFP->getValueAPF().bitcastToAPInt(); - const uint64_t* p = api.getRawData(); - uint64_t word = p[1]; - shiftcount = 12; - int width = api.getBitWidth(); - for (int j=0; j>shiftcount) & 15; - if (nibble < 10) - Out << (unsigned char)(nibble + '0'); - else - Out << (unsigned char)(nibble - 10 + 'A'); - if (shiftcount == 0 && j+4 < width) { - word = *p; - shiftcount = 64; - if (width-j-4 < 64) - shiftcount = width-j-4; - } - } + Out << format_hex_no_prefix(API.getHiBits(16).getZExtValue(), 4, + /*Upper=*/true); + Out << format_hex_no_prefix(API.getLoBits(64).getZExtValue(), 16, + /*Upper=*/true); return; } else if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEquad) { - shiftcount = 60; Out << 'L'; + Out << format_hex_no_prefix(API.getLoBits(64).getZExtValue(), 16, + /*Upper=*/true); + Out << format_hex_no_prefix(API.getHiBits(64).getZExtValue(), 16, + /*Upper=*/true); } else if (&CFP->getValueAPF().getSemantics() == &APFloat::PPCDoubleDouble) { - shiftcount = 60; Out << 'M'; + Out << format_hex_no_prefix(API.getLoBits(64).getZExtValue(), 16, + /*Upper=*/true); + Out << format_hex_no_prefix(API.getHiBits(64).getZExtValue(), 16, + /*Upper=*/true); } else if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEhalf) { - shiftcount = 12; Out << 'H'; + Out << format_hex_no_prefix(API.getZExtValue(), 4, + /*Upper=*/true); } else llvm_unreachable("Unsupported floating point type"); - // api needed to prevent premature destruction - APInt api = CFP->getValueAPF().bitcastToAPInt(); - const uint64_t* p = api.getRawData(); - uint64_t word = *p; - int width = api.getBitWidth(); - for (int j=0; j>shiftcount) & 15; - if (nibble < 10) - Out << (unsigned char)(nibble + '0'); - else - Out << (unsigned char)(nibble - 10 + 'A'); - if (shiftcount == 0 && j+4 < width) { - word = *(++p); - shiftcount = 64; - if (width-j-4 < 64) - shiftcount = width-j-4; - } - } return; } From 51540fbf420fb65bfdf0173a08ea951e10cf431f Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Fri, 4 Dec 2015 02:15:39 +0000 Subject: [PATCH 053/364] [Orc] Rename JITCompileCallbackManagerBase to JITCompileCallbackManager. This class is turning into a useful interface, rather than an implementation detail, so I'm dropping the 'Base' suffix. No functional change. 
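For clients the change is a pure spelling update; a sketch (TargetT is a
placeholder for whichever target support class the client already passes,
and the 0 is the error-handler address, as elsewhere in this patch):

  // Before:
  std::unique_ptr<orc::JITCompileCallbackManagerBase> CCMgr =
      llvm::make_unique<orc::JITCompileCallbackManager<TargetT>>(0);
  // After:
  std::unique_ptr<orc::JITCompileCallbackManager> CCMgr =
      llvm::make_unique<orc::LocalJITCompileCallbackManager<TargetT>>(0);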
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254693 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../Orc/CompileOnDemandLayer.h | 2 +- .../ExecutionEngine/Orc/IndirectionUtils.h | 20 +++++++++---------- lib/ExecutionEngine/Orc/IndirectionUtils.cpp | 2 +- lib/ExecutionEngine/Orc/OrcCBindingsStack.cpp | 2 +- lib/ExecutionEngine/Orc/OrcCBindingsStack.h | 2 +- tools/lli/OrcLazyJIT.cpp | 2 +- tools/lli/OrcLazyJIT.h | 2 +- .../Orc/CompileOnDemandLayerTest.cpp | 4 ++-- 8 files changed, 18 insertions(+), 18 deletions(-) diff --git a/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h b/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h index 242d2420162f..b7ee9b5937f7 100644 --- a/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h +++ b/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h @@ -38,7 +38,7 @@ namespace orc { /// of the function body from the original module. The extracted body is then /// compiled and executed. template class CompileOnDemandLayer { private: diff --git a/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h b/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h index cabc95543d81..b5b258e7a05c 100644 --- a/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h +++ b/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h @@ -27,8 +27,8 @@ namespace llvm { namespace orc { -/// @brief Target-independent base class JITCompileCallbackManager. -class JITCompileCallbackManagerBase { +/// @brief Target-independent base class for compile callback management. +class JITCompileCallbackManager { public: typedef std::function CompileFtor; @@ -50,13 +50,13 @@ class JITCompileCallbackManagerBase { CompileFtor &Compile; }; - /// @brief Construct a JITCompileCallbackManagerBase. + /// @brief Construct a JITCompileCallbackManager. /// @param ErrorHandlerAddress The address of an error handler in the target /// process to be used if a compile callback fails. - JITCompileCallbackManagerBase(TargetAddress ErrorHandlerAddress) + JITCompileCallbackManager(TargetAddress ErrorHandlerAddress) : ErrorHandlerAddress(ErrorHandlerAddress) {} - virtual ~JITCompileCallbackManagerBase() {} + virtual ~JITCompileCallbackManager() {} /// @brief Execute the callback for the given trampoline id. Called by the JIT /// to compile functions on demand. @@ -116,16 +116,16 @@ class JITCompileCallbackManagerBase { virtual void anchor(); }; -/// @brief Manage compile callbacks. +/// @brief Manage compile callbacks for in-process JITs. template -class JITCompileCallbackManager : public JITCompileCallbackManagerBase { +class LocalJITCompileCallbackManager : public JITCompileCallbackManager { public: - /// @brief Construct a JITCompileCallbackManager. + /// @brief Construct a InProcessJITCompileCallbackManager. /// @param ErrorHandlerAddress The address of an error handler in the target /// process to be used if a compile callback fails. - JITCompileCallbackManager(TargetAddress ErrorHandlerAddress) - : JITCompileCallbackManagerBase(ErrorHandlerAddress) { + LocalJITCompileCallbackManager(TargetAddress ErrorHandlerAddress) + : JITCompileCallbackManager(ErrorHandlerAddress) { /// Set up the resolver block. 
std::error_code EC; diff --git a/lib/ExecutionEngine/Orc/IndirectionUtils.cpp b/lib/ExecutionEngine/Orc/IndirectionUtils.cpp index 30dbe0383771..dd6e3a3b29ae 100644 --- a/lib/ExecutionEngine/Orc/IndirectionUtils.cpp +++ b/lib/ExecutionEngine/Orc/IndirectionUtils.cpp @@ -19,7 +19,7 @@ namespace llvm { namespace orc { -void JITCompileCallbackManagerBase::anchor() {} +void JITCompileCallbackManager::anchor() {} void IndirectStubsManagerBase::anchor() {} Constant* createIRTypedAddress(FunctionType &FT, TargetAddress Addr) { diff --git a/lib/ExecutionEngine/Orc/OrcCBindingsStack.cpp b/lib/ExecutionEngine/Orc/OrcCBindingsStack.cpp index 7326fa7e2f8f..d1af56d84867 100644 --- a/lib/ExecutionEngine/Orc/OrcCBindingsStack.cpp +++ b/lib/ExecutionEngine/Orc/OrcCBindingsStack.cpp @@ -23,7 +23,7 @@ OrcCBindingsStack::createCompileCallbackMgr(Triple T) { default: return nullptr; case Triple::x86_64: { - typedef orc::JITCompileCallbackManager CCMgrT; + typedef orc::LocalJITCompileCallbackManager CCMgrT; return llvm::make_unique(0); } } diff --git a/lib/ExecutionEngine/Orc/OrcCBindingsStack.h b/lib/ExecutionEngine/Orc/OrcCBindingsStack.h index c62210112c35..d2f7fe4ac0ef 100644 --- a/lib/ExecutionEngine/Orc/OrcCBindingsStack.h +++ b/lib/ExecutionEngine/Orc/OrcCBindingsStack.h @@ -29,7 +29,7 @@ DEFINE_SIMPLE_CONVERSION_FUNCTIONS(TargetMachine, LLVMTargetMachineRef) class OrcCBindingsStack { public: - typedef orc::JITCompileCallbackManagerBase CompileCallbackMgr; + typedef orc::JITCompileCallbackManager CompileCallbackMgr; typedef orc::ObjectLinkingLayer<> ObjLayerT; typedef orc::IRCompileLayer CompileLayerT; typedef orc::CompileOnDemandLayer CODLayerT; diff --git a/tools/lli/OrcLazyJIT.cpp b/tools/lli/OrcLazyJIT.cpp index 7d79c48559ef..edac10b86556 100644 --- a/tools/lli/OrcLazyJIT.cpp +++ b/tools/lli/OrcLazyJIT.cpp @@ -52,7 +52,7 @@ OrcLazyJIT::createCompileCallbackMgr(Triple T) { default: return nullptr; case Triple::x86_64: { - typedef orc::JITCompileCallbackManager CCMgrT; + typedef orc::LocalJITCompileCallbackManager CCMgrT; return llvm::make_unique(0); } } diff --git a/tools/lli/OrcLazyJIT.h b/tools/lli/OrcLazyJIT.h index ec86a72efaa0..bb4da33ea9b6 100644 --- a/tools/lli/OrcLazyJIT.h +++ b/tools/lli/OrcLazyJIT.h @@ -29,7 +29,7 @@ namespace llvm { class OrcLazyJIT { public: - typedef orc::JITCompileCallbackManagerBase CompileCallbackMgr; + typedef orc::JITCompileCallbackManager CompileCallbackMgr; typedef orc::ObjectLinkingLayer<> ObjLayerT; typedef orc::IRCompileLayer CompileLayerT; typedef std::function(std::unique_ptr)> diff --git a/unittests/ExecutionEngine/Orc/CompileOnDemandLayerTest.cpp b/unittests/ExecutionEngine/Orc/CompileOnDemandLayerTest.cpp index 49f4cc124f9e..4a30cfc42971 100644 --- a/unittests/ExecutionEngine/Orc/CompileOnDemandLayerTest.cpp +++ b/unittests/ExecutionEngine/Orc/CompileOnDemandLayerTest.cpp @@ -16,10 +16,10 @@ using namespace llvm::orc; namespace { -class DummyCallbackManager : public orc::JITCompileCallbackManagerBase { +class DummyCallbackManager : public orc::JITCompileCallbackManager { public: DummyCallbackManager() - : JITCompileCallbackManagerBase(0), NextStubAddress(0), + : JITCompileCallbackManager(0), NextStubAddress(0), UniversalCompile([]() { return 0; }) { } From 64649a7dc718206641b8b6bb436d4cdd3c836d04 Mon Sep 17 00:00:00 2001 From: Junmo Park Date: Fri, 4 Dec 2015 02:29:25 +0000 Subject: [PATCH 054/364] [BranchFolding] Merge MMOs during tail merge Summary: If we remove the MMOs from Load/Store instructions, they are treated as volatile. 
This makes other optimization passes unhappy. eg. Load/Store Optimization So, it looks better to merge, not remove. Reviewers: gberry, mcrosier Subscribers: gberry, llvm-commits Differential Revision: http://reviews.llvm.org/D14797 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254694 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/BranchFolding.cpp | 42 ++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp index e41926a819c2..3878281a4fe9 100644 --- a/lib/CodeGen/BranchFolding.cpp +++ b/lib/CodeGen/BranchFolding.cpp @@ -744,24 +744,35 @@ bool BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB, return true; } -static bool hasIdenticalMMOs(const MachineInstr *MI1, const MachineInstr *MI2) { +// Add MI1's MMOs to MI2's MMOs while excluding any duplicates. The MI scheduler +// currently doesn't handle multiple MMOs, so duplicates would likely pessimize +// the scheduler. +static void mergeMMOs(MachineInstr *MI1, MachineInstr *MI2) { auto I1 = MI1->memoperands_begin(), E1 = MI1->memoperands_end(); auto I2 = MI2->memoperands_begin(), E2 = MI2->memoperands_end(); - if ((E1 - I1) != (E2 - I2)) - return false; - for (; I1 != E1; ++I1, ++I2) { - if (**I1 != **I2) - return false; + MachineFunction *MF = MI1->getParent()->getParent(); + + // Mostly, MI1's MMO count is 1 or zero. So we don't have to use + // SmallSet. + for (; I1 != E1; ++I1) { + bool IsDupMMO = false; + for (I2 = MI2->memoperands_begin(); I2 != E2; ++I2) { + if (**I1 == **I2) { + IsDupMMO = true; + break; + } + } + if (IsDupMMO == false) { + MI2->addMemOperand(*MF, *I1); + E2 = MI2->memoperands_end(); + } } - return true; } static void -removeMMOsFromMemoryOperations(MachineBasicBlock::iterator MBBIStartPos, - MachineBasicBlock &MBBCommon) { - // Remove MMOs from memory operations in the common block - // when they do not match the ones from the block being tail-merged. - // This ensures later passes conservatively compute dependencies. +mergeMMOsFromMemoryOperations(MachineBasicBlock::iterator MBBIStartPos, + MachineBasicBlock &MBBCommon) { + // Merge MMOs from memory operations in the common block MachineBasicBlock *MBB = MBBIStartPos->getParent(); // Note CommonTailLen does not necessarily matches the size of // the common BB nor all its instructions because of debug @@ -792,8 +803,7 @@ removeMMOsFromMemoryOperations(MachineBasicBlock::iterator MBBIStartPos, assert(MBBICommon->isIdenticalTo(&*MBBI) && "Expected matching MIIs!"); if (MBBICommon->mayLoad() || MBBICommon->mayStore()) - if (!hasIdenticalMMOs(&*MBBI, &*MBBICommon)) - MBBICommon->clearMemRefs(); + mergeMMOs(&*MBBI, &*MBBICommon); ++MBBI; ++MBBICommon; @@ -913,8 +923,8 @@ bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB, continue; DEBUG(dbgs() << "BB#" << SameTails[i].getBlock()->getNumber() << (i == e-1 ? "" : ", ")); - // Remove MMOs from memory operations as needed. - removeMMOsFromMemoryOperations(SameTails[i].getTailStartPos(), *MBB); + // Merge MMOs from memory operations as needed. + mergeMMOsFromMemoryOperations(SameTails[i].getTailStartPos(), *MBB); // Hack the end off BB i, making it jump to BB commonTailIndex instead. ReplaceTailWithBranchTo(SameTails[i].getTailStartPos(), MBB); // BB i is no longer a predecessor of SuccBB; remove it from the worklist. 
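The "treated as volatile" effect described in the summary above comes from
the conservative answers machine passes must give when a mayLoad/mayStore
instruction carries no memory operands at all; roughly (a paraphrase for
illustration, not the exact in-tree query):

  bool isOrderedMemAccess(const MachineInstr &MI) {
    if (!MI.mayLoad() && !MI.mayStore())
      return false;                 // no memory reference at all
    if (MI.memoperands_empty())
      return true;                  // MMOs dropped: assume ordered/volatile
    for (auto I = MI.memoperands_begin(), E = MI.memoperands_end(); I != E; ++I)
      if ((*I)->isVolatile())
        return true;                // an explicitly volatile access
    return false;
  }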
From 6d3f26eb9adb329347b7af61958b32cf30761768 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Fri, 4 Dec 2015 02:32:32 +0000 Subject: [PATCH 055/364] [Orc] Fix Kaleidoscope example for change in r254693. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254695 91177308-0d34-0410-b5e6-96231b3b80d8 --- examples/Kaleidoscope/Orc/fully_lazy/toy.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/Kaleidoscope/Orc/fully_lazy/toy.cpp b/examples/Kaleidoscope/Orc/fully_lazy/toy.cpp index 8ba76e86ee07..78184f5d32cd 100644 --- a/examples/Kaleidoscope/Orc/fully_lazy/toy.cpp +++ b/examples/Kaleidoscope/Orc/fully_lazy/toy.cpp @@ -1308,7 +1308,7 @@ class KaleidoscopeJIT { std::map> FunctionDefs; - JITCompileCallbackManager CompileCallbacks; + LocalJITCompileCallbackManager CompileCallbacks; }; static void HandleDefinition(SessionContext &S, KaleidoscopeJIT &J) { From c6a202e8593da2db6c7330085536436ddc3fa263 Mon Sep 17 00:00:00 2001 From: Rafael Espindola Date: Fri, 4 Dec 2015 02:42:28 +0000 Subject: [PATCH 056/364] Move a call to getGlobalContext out of lib/LTO. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254696 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/LTO/LTOCodeGenerator.h | 4 +--- lib/LTO/LTOCodeGenerator.cpp | 11 ++--------- tools/llvm-lto/llvm-lto.cpp | 2 +- tools/lto/lto.cpp | 5 +++-- 4 files changed, 7 insertions(+), 15 deletions(-) diff --git a/include/llvm/LTO/LTOCodeGenerator.h b/include/llvm/LTO/LTOCodeGenerator.h index c322288a1ae9..876defbdcd3f 100644 --- a/include/llvm/LTO/LTOCodeGenerator.h +++ b/include/llvm/LTO/LTOCodeGenerator.h @@ -62,8 +62,7 @@ namespace llvm { struct LTOCodeGenerator { static const char *getVersionString(); - LTOCodeGenerator(); - LTOCodeGenerator(std::unique_ptr Context); + LTOCodeGenerator(LLVMContext &Context); ~LTOCodeGenerator(); /// Merge given module. Return true on success. @@ -168,7 +167,6 @@ struct LTOCodeGenerator { typedef StringMap StringSet; - std::unique_ptr OwnedContext; LLVMContext &Context; std::unique_ptr MergedModule; std::unique_ptr IRLinker; diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp index 468ec24e3a06..b0dae74c13d4 100644 --- a/lib/LTO/LTOCodeGenerator.cpp +++ b/lib/LTO/LTOCodeGenerator.cpp @@ -64,20 +64,13 @@ const char* LTOCodeGenerator::getVersionString() { #endif } -LTOCodeGenerator::LTOCodeGenerator() - : Context(getGlobalContext()), +LTOCodeGenerator::LTOCodeGenerator(LLVMContext &Context) + : Context(Context), MergedModule(new Module("ld-temp.o", Context)), IRLinker(new Linker(*MergedModule)) { initializeLTOPasses(); } -LTOCodeGenerator::LTOCodeGenerator(std::unique_ptr Context) - : OwnedContext(std::move(Context)), Context(*OwnedContext), - MergedModule(new Module("ld-temp.o", *OwnedContext)), - IRLinker(new Linker(*MergedModule)) { - initializeLTOPasses(); -} - LTOCodeGenerator::~LTOCodeGenerator() {} // Initialize LTO passes. 
Please keep this function in sync with diff --git a/tools/llvm-lto/llvm-lto.cpp b/tools/llvm-lto/llvm-lto.cpp index 86b95577b307..07cd9bb1eaf8 100644 --- a/tools/llvm-lto/llvm-lto.cpp +++ b/tools/llvm-lto/llvm-lto.cpp @@ -257,7 +257,7 @@ int main(int argc, char **argv) { unsigned BaseArg = 0; - LTOCodeGenerator CodeGen; + LTOCodeGenerator CodeGen(getGlobalContext()); if (UseDiagnosticHandler) CodeGen.setDiagnosticHandler(handleDiagnostics, nullptr); diff --git a/tools/lto/lto.cpp b/tools/lto/lto.cpp index ee389da2499e..6058fb1197eb 100644 --- a/tools/lto/lto.cpp +++ b/tools/lto/lto.cpp @@ -95,13 +95,14 @@ static void handleLibLTODiagnostic(lto_codegen_diagnostic_severity_t Severity, // libLTO API semantics, which require that the code generator owns the object // file. struct LibLTOCodeGenerator : LTOCodeGenerator { - LibLTOCodeGenerator() { + LibLTOCodeGenerator() : LTOCodeGenerator(getGlobalContext()) { setDiagnosticHandler(handleLibLTODiagnostic, nullptr); } LibLTOCodeGenerator(std::unique_ptr Context) - : LTOCodeGenerator(std::move(Context)) { + : LTOCodeGenerator(*Context), OwnedContext(std::move(Context)) { setDiagnosticHandler(handleLibLTODiagnostic, nullptr); } std::unique_ptr NativeObjectFile; + std::unique_ptr OwnedContext; }; } From 9ce890e1fe27d88dea0f652d11a09673eab67162 Mon Sep 17 00:00:00 2001 From: Rafael Espindola Date: Fri, 4 Dec 2015 04:15:05 +0000 Subject: [PATCH 057/364] Revert "[BranchFolding] Merge MMOs during tail merge" This reverts commit r254694. It broke bootstrap. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254700 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/BranchFolding.cpp | 42 +++++++++++++---------------------- 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp index 3878281a4fe9..e41926a819c2 100644 --- a/lib/CodeGen/BranchFolding.cpp +++ b/lib/CodeGen/BranchFolding.cpp @@ -744,35 +744,24 @@ bool BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB, return true; } -// Add MI1's MMOs to MI2's MMOs while excluding any duplicates. The MI scheduler -// currently doesn't handle multiple MMOs, so duplicates would likely pessimize -// the scheduler. -static void mergeMMOs(MachineInstr *MI1, MachineInstr *MI2) { +static bool hasIdenticalMMOs(const MachineInstr *MI1, const MachineInstr *MI2) { auto I1 = MI1->memoperands_begin(), E1 = MI1->memoperands_end(); auto I2 = MI2->memoperands_begin(), E2 = MI2->memoperands_end(); - MachineFunction *MF = MI1->getParent()->getParent(); - - // Mostly, MI1's MMO count is 1 or zero. So we don't have to use - // SmallSet. - for (; I1 != E1; ++I1) { - bool IsDupMMO = false; - for (I2 = MI2->memoperands_begin(); I2 != E2; ++I2) { - if (**I1 == **I2) { - IsDupMMO = true; - break; - } - } - if (IsDupMMO == false) { - MI2->addMemOperand(*MF, *I1); - E2 = MI2->memoperands_end(); - } + if ((E1 - I1) != (E2 - I2)) + return false; + for (; I1 != E1; ++I1, ++I2) { + if (**I1 != **I2) + return false; } + return true; } static void -mergeMMOsFromMemoryOperations(MachineBasicBlock::iterator MBBIStartPos, - MachineBasicBlock &MBBCommon) { - // Merge MMOs from memory operations in the common block +removeMMOsFromMemoryOperations(MachineBasicBlock::iterator MBBIStartPos, + MachineBasicBlock &MBBCommon) { + // Remove MMOs from memory operations in the common block + // when they do not match the ones from the block being tail-merged. + // This ensures later passes conservatively compute dependencies. 
MachineBasicBlock *MBB = MBBIStartPos->getParent(); // Note CommonTailLen does not necessarily matches the size of // the common BB nor all its instructions because of debug @@ -803,7 +792,8 @@ mergeMMOsFromMemoryOperations(MachineBasicBlock::iterator MBBIStartPos, assert(MBBICommon->isIdenticalTo(&*MBBI) && "Expected matching MIIs!"); if (MBBICommon->mayLoad() || MBBICommon->mayStore()) - mergeMMOs(&*MBBI, &*MBBICommon); + if (!hasIdenticalMMOs(&*MBBI, &*MBBICommon)) + MBBICommon->clearMemRefs(); ++MBBI; ++MBBICommon; @@ -923,8 +913,8 @@ bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB, continue; DEBUG(dbgs() << "BB#" << SameTails[i].getBlock()->getNumber() << (i == e-1 ? "" : ", ")); - // Merge MMOs from memory operations as needed. - mergeMMOsFromMemoryOperations(SameTails[i].getTailStartPos(), *MBB); + // Remove MMOs from memory operations as needed. + removeMMOsFromMemoryOperations(SameTails[i].getTailStartPos(), *MBB); // Hack the end off BB i, making it jump to BB commonTailIndex instead. ReplaceTailWithBranchTo(SameTails[i].getTailStartPos(), MBB); // BB i is no longer a predecessor of SuccBB; remove it from the worklist. From 008021cd33a7820d7edee10536d7c4693f29129e Mon Sep 17 00:00:00 2001 From: Yury Gribov Date: Fri, 4 Dec 2015 09:19:14 +0000 Subject: [PATCH 058/364] [asan] Fix dynamic allocas unpoisoning on PowerPC64. For PowerPC64 we cannot just pass SP extracted from @llvm.stackrestore to _asan_allocas_unpoison due to specific ABI requirements (http://refspecs.linuxfoundation.org/ELF/ppc64/PPC-elf64abi.html#DYNAM-STACK). This patch adds the value returned by @llvm.get.dynamic.area.offset to extracted from @llvm.stackrestore stack pointer, so dynamic allocas unpoisoning stuff would work correctly on PowerPC64. Patch by Max Ostapenko. Differential Revision: http://reviews.llvm.org/D15108 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254707 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../Instrumentation/AddressSanitizer.cpp | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 8eb82e39b8a6..dea94a514fe8 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -624,9 +624,24 @@ struct FunctionStackPoisoner : public InstVisitor { void unpoisonDynamicAllocasBeforeInst(Instruction *InstBefore, Value *SavedStack) { IRBuilder<> IRB(InstBefore); + Value *DynamicAreaPtr = IRB.CreatePtrToInt(SavedStack, IntptrTy); + // When we insert _asan_allocas_unpoison before @llvm.stackrestore, we + // need to adjust extracted SP to compute the address of the most recent + // alloca. We have a special @llvm.get.dynamic.area.offset intrinsic for + // this purpose. + if (!isa(InstBefore)) { + Function *DynamicAreaOffsetFunc = Intrinsic::getDeclaration( + InstBefore->getModule(), Intrinsic::get_dynamic_area_offset, + {IntptrTy}); + + Value *DynamicAreaOffset = IRB.CreateCall(DynamicAreaOffsetFunc, {}); + + DynamicAreaPtr = IRB.CreateAdd(IRB.CreatePtrToInt(SavedStack, IntptrTy), + DynamicAreaOffset); + } + IRB.CreateCall(AsanAllocasUnpoisonFunc, - {IRB.CreateLoad(DynamicAllocaLayout), - IRB.CreatePtrToInt(SavedStack, IntptrTy)}); + {IRB.CreateLoad(DynamicAllocaLayout), DynamicAreaPtr}); } // Unpoison dynamic allocas redzones. 
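Concretely, for a dynamic alloca whose enclosing scope is unwound with
@llvm.stackrestore, the instrumentation above now emits roughly the
following IR on PowerPC64 (value names are illustrative, and the runtime
entry point is assumed to be __asan_allocas_unpoison):

  %sp = call i8* @llvm.stacksave()
  ...
  %bottom = load i64, i64* %asan.dyn.alloca.layout   ; pass bookkeeping slot
  %off = call i64 @llvm.get.dynamic.area.offset.i64()
  %base = ptrtoint i8* %sp to i64
  %top = add i64 %base, %off                         ; adjust for the PPC64 ABI
  call void @__asan_allocas_unpoison(i64 %bottom, i64 %top)
  call void @llvm.stackrestore(i8* %sp)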
From e471c45c92d58dc28cb3cab00d24b6bba5f1b3fe Mon Sep 17 00:00:00 2001 From: Oliver Stannard Date: Fri, 4 Dec 2015 09:45:18 +0000 Subject: [PATCH 059/364] [AArch64] Clean up statistical profiling test This check has nothing to do with the statistical profiling extension, so shouldn't be in this test. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254709 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../Disassembler/AArch64/armv8.2a-statistical-profiling.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/MC/Disassembler/AArch64/armv8.2a-statistical-profiling.txt b/test/MC/Disassembler/AArch64/armv8.2a-statistical-profiling.txt index e83d750e715e..217424cc46e0 100644 --- a/test/MC/Disassembler/AArch64/armv8.2a-statistical-profiling.txt +++ b/test/MC/Disassembler/AArch64/armv8.2a-statistical-profiling.txt @@ -1,10 +1,6 @@ # RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+spe --disassemble < %s | FileCheck %s # RUN: llvm-mc -triple aarch64-none-linux-gnu --disassemble < %s | FileCheck --check-prefix=NO_SPE %s -[0x1f,0x22,0x03,0xd5] -# CHECK: hint #0x10 -# NO_SPE: hint #0x10 - [0x3f,0x22,0x03,0xd5] # CHECK: psb csync # NO_SPE: hint #0x11 From a18156c3b8d92d5eb1af043947d1b1461c84e40f Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 4 Dec 2015 10:53:15 +0000 Subject: [PATCH 060/364] LEA code size optimization pass (Part 1): Remove redundant address recalculations, by Andrey Turetsky Add new x86 pass which replaces address calculations in load or store instructions with def register of existing LEA (must be in the same basic block), if the LEA calculates address that differs only by a displacement. Works only with -Os or -Oz. Differential Revision: http://reviews.llvm.org/D13294 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254712 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/CMakeLists.txt | 1 + lib/Target/X86/X86.h | 4 + lib/Target/X86/X86OptimizeLEAs.cpp | 324 ++++++++++++++++++++++++++++ lib/Target/X86/X86TargetMachine.cpp | 3 + test/CodeGen/X86/lea-opt.ll | 131 +++++++++++ 5 files changed, 463 insertions(+) create mode 100644 lib/Target/X86/X86OptimizeLEAs.cpp create mode 100644 test/CodeGen/X86/lea-opt.ll diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index db486f9243af..b23f5c353013 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -34,6 +34,7 @@ set(sources X86VZeroUpper.cpp X86FixupLEAs.cpp X86WinEHState.cpp + X86OptimizeLEAs.cpp ) add_llvm_target(X86CodeGen ${sources}) diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index cd914ee7f9c8..6bdb07d1df04 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -58,6 +58,10 @@ FunctionPass *createX86PadShortFunctions(); /// to eliminate execution delays in some Atom processors. FunctionPass *createX86FixupLEAs(); +/// createX86OptimizeLEAs() - Return a pass that removes redundant +/// address recalculations. +FunctionPass *createX86OptimizeLEAs(); + /// createX86CallFrameOptimization - Return a pass that optimizes /// the code-size of x86 call sequences. This is done by replacing /// esp-relative movs with pushes. 
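A sketch of the rewrite the new pass performs (registers and displacements
are illustrative, not taken from the test below): when a load or store in
the same basic block recomputes an address an earlier LEA already formed,
the memory operand is rewritten against the LEA's result, keeping only the
displacement difference:

  # before
  leaq  16(%rdi,%rsi,4), %rax
  movl  20(%rdi,%rsi,4), %ecx
  # after (20 - 16 = 4 becomes the new displacement)
  leaq  16(%rdi,%rsi,4), %rax
  movl  4(%rax), %ecx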
diff --git a/lib/Target/X86/X86OptimizeLEAs.cpp b/lib/Target/X86/X86OptimizeLEAs.cpp new file mode 100644 index 000000000000..9171786707d8 --- /dev/null +++ b/lib/Target/X86/X86OptimizeLEAs.cpp @@ -0,0 +1,324 @@ +//===-- X86OptimizeLEAs.cpp - optimize usage of LEA instructions ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the pass that performs some optimizations with LEA +// instructions in order to improve code size. +// Currently, it does one thing: +// 1) Address calculations in load and store instructions are replaced by +// existing LEA def registers where possible. +// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86-optimize-LEAs" + +STATISTIC(NumSubstLEAs, "Number of LEA instruction substitutions"); + +namespace { +class OptimizeLEAPass : public MachineFunctionPass { +public: + OptimizeLEAPass() : MachineFunctionPass(ID) {} + + const char *getPassName() const override { return "X86 LEA Optimize"; } + + /// \brief Loop over all of the basic blocks, replacing address + /// calculations in load and store instructions, if it's already + /// been calculated by LEA. Also, remove redundant LEAs. + bool runOnMachineFunction(MachineFunction &MF) override; + +private: + /// \brief Returns a distance between two instructions inside one basic block. + /// Negative result means, that instructions occur in reverse order. + int calcInstrDist(const MachineInstr &First, const MachineInstr &Last); + + /// \brief Choose the best \p LEA instruction from the \p List to replace + /// address calculation in \p MI instruction. Return the address displacement + /// and the distance between \p MI and the choosen \p LEA in \p AddrDispShift + /// and \p Dist. + bool chooseBestLEA(const SmallVectorImpl &List, + const MachineInstr &MI, MachineInstr *&LEA, + int64_t &AddrDispShift, int &Dist); + + /// \brief Returns true if two machine operand are identical and they are not + /// physical registers. + bool isIdenticalOp(const MachineOperand &MO1, const MachineOperand &MO2); + + /// \brief Returns true if the instruction is LEA. + bool isLEA(const MachineInstr &MI); + + /// \brief Returns true if two instructions have memory operands that only + /// differ by displacement. The numbers of the first memory operands for both + /// instructions are specified through \p N1 and \p N2. The address + /// displacement is returned through AddrDispShift. + bool isSimilarMemOp(const MachineInstr &MI1, unsigned N1, + const MachineInstr &MI2, unsigned N2, + int64_t &AddrDispShift); + + /// \brief Find all LEA instructions in the basic block. + void findLEAs(const MachineBasicBlock &MBB, + SmallVectorImpl &List); + + /// \brief Removes redundant address calculations. 
+ bool removeRedundantAddrCalc(const SmallVectorImpl &List); + + MachineRegisterInfo *MRI; + const X86InstrInfo *TII; + const X86RegisterInfo *TRI; + + static char ID; +}; +char OptimizeLEAPass::ID = 0; +} + +FunctionPass *llvm::createX86OptimizeLEAs() { return new OptimizeLEAPass(); } + +int OptimizeLEAPass::calcInstrDist(const MachineInstr &First, + const MachineInstr &Last) { + const MachineBasicBlock *MBB = First.getParent(); + + // Both instructions must be in the same basic block. + assert(Last.getParent() == MBB && + "Instructions are in different basic blocks"); + + return std::distance(MBB->begin(), MachineBasicBlock::const_iterator(&Last)) - + std::distance(MBB->begin(), MachineBasicBlock::const_iterator(&First)); +} + +// Find the best LEA instruction in the List to replace address recalculation in +// MI. Such LEA must meet these requirements: +// 1) The address calculated by the LEA differs only by the displacement from +// the address used in MI. +// 2) The register class of the definition of the LEA is compatible with the +// register class of the address base register of MI. +// 3) Displacement of the new memory operand should fit in 1 byte if possible. +// 4) The LEA should be as close to MI as possible, and prior to it if +// possible. +bool OptimizeLEAPass::chooseBestLEA(const SmallVectorImpl &List, + const MachineInstr &MI, MachineInstr *&LEA, + int64_t &AddrDispShift, int &Dist) { + const MachineFunction *MF = MI.getParent()->getParent(); + const MCInstrDesc &Desc = MI.getDesc(); + int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, MI.getOpcode()) + + X86II::getOperandBias(Desc); + + LEA = nullptr; + + // Loop over all LEA instructions. + for (auto DefMI : List) { + int64_t AddrDispShiftTemp = 0; + + // Compare instructions memory operands. + if (!isSimilarMemOp(MI, MemOpNo, *DefMI, 1, AddrDispShiftTemp)) + continue; + + // Make sure address displacement fits 4 bytes. + if (!isInt<32>(AddrDispShiftTemp)) + continue; + + // Check that LEA def register can be used as MI address base. Some + // instructions can use a limited set of registers as address base, for + // example MOV8mr_NOREX. We could constrain the register class of the LEA + // def to suit MI, however since this case is very rare and hard to + // reproduce in a test it's just more reliable to skip the LEA. + if (TII->getRegClass(Desc, MemOpNo + X86::AddrBaseReg, TRI, *MF) != + MRI->getRegClass(DefMI->getOperand(0).getReg())) + continue; + + // Choose the closest LEA instruction from the list, prior to MI if + // possible. Note that we took into account resulting address displacement + // as well. Also note that the list is sorted by the order in which the LEAs + // occur, so the break condition is pretty simple. + int DistTemp = calcInstrDist(*DefMI, MI); + assert(DistTemp != 0 && + "The distance between two different instructions cannot be zero"); + if (DistTemp > 0 || LEA == nullptr) { + // Do not update return LEA, if the current one provides a displacement + // which fits in 1 byte, while the new candidate does not. + if (LEA != nullptr && !isInt<8>(AddrDispShiftTemp) && + isInt<8>(AddrDispShift)) + continue; + + LEA = DefMI; + AddrDispShift = AddrDispShiftTemp; + Dist = DistTemp; + } + + // FIXME: Maybe we should not always stop at the first LEA after MI. 
+ if (DistTemp < 0) + break; + } + + return LEA != nullptr; +} + +bool OptimizeLEAPass::isIdenticalOp(const MachineOperand &MO1, + const MachineOperand &MO2) { + return MO1.isIdenticalTo(MO2) && + (!MO1.isReg() || + !TargetRegisterInfo::isPhysicalRegister(MO1.getReg())); +} + +bool OptimizeLEAPass::isLEA(const MachineInstr &MI) { + unsigned Opcode = MI.getOpcode(); + return Opcode == X86::LEA16r || Opcode == X86::LEA32r || + Opcode == X86::LEA64r || Opcode == X86::LEA64_32r; +} + +// Check if MI1 and MI2 have memory operands which represent addresses that +// differ only by displacement. +bool OptimizeLEAPass::isSimilarMemOp(const MachineInstr &MI1, unsigned N1, + const MachineInstr &MI2, unsigned N2, + int64_t &AddrDispShift) { + // Address base, scale, index and segment operands must be identical. + static const int IdenticalOpNums[] = {X86::AddrBaseReg, X86::AddrScaleAmt, + X86::AddrIndexReg, X86::AddrSegmentReg}; + for (auto &N : IdenticalOpNums) + if (!isIdenticalOp(MI1.getOperand(N1 + N), MI2.getOperand(N2 + N))) + return false; + + // Address displacement operands may differ by a constant. + const MachineOperand *Op1 = &MI1.getOperand(N1 + X86::AddrDisp); + const MachineOperand *Op2 = &MI2.getOperand(N2 + X86::AddrDisp); + if (!isIdenticalOp(*Op1, *Op2)) { + if (Op1->isImm() && Op2->isImm()) + AddrDispShift = Op1->getImm() - Op2->getImm(); + else if (Op1->isGlobal() && Op2->isGlobal() && + Op1->getGlobal() == Op2->getGlobal()) + AddrDispShift = Op1->getOffset() - Op2->getOffset(); + else + return false; + } + + return true; +} + +void OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB, + SmallVectorImpl &List) { + for (auto &MI : MBB) { + if (isLEA(MI)) + List.push_back(const_cast(&MI)); + } +} + +// Try to find load and store instructions which recalculate addresses already +// calculated by some LEA and replace their memory operands with its def +// register. +bool OptimizeLEAPass::removeRedundantAddrCalc( + const SmallVectorImpl &List) { + bool Changed = false; + + assert(List.size() > 0); + MachineBasicBlock *MBB = List[0]->getParent(); + + // Process all instructions in basic block. + for (auto I = MBB->begin(), E = MBB->end(); I != E;) { + MachineInstr &MI = *I++; + unsigned Opcode = MI.getOpcode(); + + // Instruction must be load or store. + if (!MI.mayLoadOrStore()) + continue; + + // Get the number of the first memory operand. + const MCInstrDesc &Desc = MI.getDesc(); + int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, Opcode); + + // If instruction has no memory operand - skip it. + if (MemOpNo < 0) + continue; + + MemOpNo += X86II::getOperandBias(Desc); + + // Get the best LEA instruction to replace address calculation. + MachineInstr *DefMI; + int64_t AddrDispShift; + int Dist; + if (!chooseBestLEA(List, MI, DefMI, AddrDispShift, Dist)) + continue; + + // If LEA occurs before current instruction, we can freely replace + // the instruction. If LEA occurs after, we can lift LEA above the + // instruction and this way to be able to replace it. Since LEA and the + // instruction have similar memory operands (thus, the same def + // instructions for these operands), we can always do that, without + // worries of using registers before their defs. + if (Dist < 0) { + DefMI->removeFromParent(); + MBB->insert(MachineBasicBlock::iterator(&MI), DefMI); + } + + // Since we can possibly extend register lifetime, clear kill flags. 
+ MRI->clearKillFlags(DefMI->getOperand(0).getReg()); + + ++NumSubstLEAs; + DEBUG(dbgs() << "OptimizeLEAs: Candidate to replace: "; MI.dump();); + + // Change instruction operands. + MI.getOperand(MemOpNo + X86::AddrBaseReg) + .ChangeToRegister(DefMI->getOperand(0).getReg(), false); + MI.getOperand(MemOpNo + X86::AddrScaleAmt).ChangeToImmediate(1); + MI.getOperand(MemOpNo + X86::AddrIndexReg) + .ChangeToRegister(X86::NoRegister, false); + MI.getOperand(MemOpNo + X86::AddrDisp).ChangeToImmediate(AddrDispShift); + MI.getOperand(MemOpNo + X86::AddrSegmentReg) + .ChangeToRegister(X86::NoRegister, false); + + DEBUG(dbgs() << "OptimizeLEAs: Replaced by: "; MI.dump();); + + Changed = true; + } + + return Changed; +} + +bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) { + bool Changed = false; + bool OptSize = MF.getFunction()->optForSize(); + bool MinSize = MF.getFunction()->optForMinSize(); + + // Perform this optimization only if we care about code size. + if (!OptSize && !MinSize) + return false; + + MRI = &MF.getRegInfo(); + TII = MF.getSubtarget().getInstrInfo(); + TRI = MF.getSubtarget().getRegisterInfo(); + + // Process all basic blocks. + for (auto &MBB : MF) { + SmallVector LEAs; + + // Find all LEA instructions in basic block. + findLEAs(MBB, LEAs); + + // If current basic block has no LEAs, move on to the next one. + if (LEAs.empty()) + continue; + + // Remove redundant address calculations. + Changed |= removeRedundantAddrCalc(LEAs); + } + + return Changed; +} diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 2e869eb7c3cd..0e7e4c0c84a9 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -254,6 +254,9 @@ bool X86PassConfig::addPreISel() { } void X86PassConfig::addPreRegAlloc() { + if (getOptLevel() != CodeGenOpt::None) + addPass(createX86OptimizeLEAs()); + addPass(createX86CallFrameOptimization()); } diff --git a/test/CodeGen/X86/lea-opt.ll b/test/CodeGen/X86/lea-opt.ll new file mode 100644 index 000000000000..c105b31995b3 --- /dev/null +++ b/test/CodeGen/X86/lea-opt.ll @@ -0,0 +1,131 @@ +; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s + +%struct.anon1 = type { i32, i32, i32 } +%struct.anon2 = type { i32, [32 x i32], i32 } + +@arr1 = external global [65 x %struct.anon1], align 16 +@arr2 = external global [65 x %struct.anon2], align 16 + +define void @test1(i64 %x) nounwind { +entry: + %a = getelementptr inbounds [65 x %struct.anon1], [65 x %struct.anon1]* @arr1, i64 0, i64 %x, i32 0 + %tmp = load i32, i32* %a, align 4 + %b = getelementptr inbounds [65 x %struct.anon1], [65 x %struct.anon1]* @arr1, i64 0, i64 %x, i32 1 + %tmp1 = load i32, i32* %b, align 4 + %sub = sub i32 %tmp, %tmp1 + %c = getelementptr inbounds [65 x %struct.anon1], [65 x %struct.anon1]* @arr1, i64 0, i64 %x, i32 2 + %tmp2 = load i32, i32* %c, align 4 + %add = add nsw i32 %sub, %tmp2 + switch i32 %add, label %sw.epilog [ + i32 1, label %sw.bb.1 + i32 2, label %sw.bb.2 + ] + +sw.bb.1: ; preds = %entry + store i32 111, i32* %b, align 4 + store i32 222, i32* %c, align 4 + br label %sw.epilog + +sw.bb.2: ; preds = %entry + store i32 333, i32* %b, align 4 + store i32 444, i32* %c, align 4 + br label %sw.epilog + +sw.epilog: ; preds = %sw.bb.2, %sw.bb.1, %entry + ret void +; CHECK-LABEL: test1: +; CHECK: leaq (%rdi,%rdi,2), [[REG1:%[a-z]+]] +; CHECK: movl arr1(,[[REG1]],4), {{.*}} +; CHECK: leaq arr1+4(,[[REG1]],4), [[REG2:%[a-z]+]] +; CHECK: subl arr1+4(,[[REG1]],4), {{.*}} +; CHECK: leaq arr1+8(,[[REG1]],4), 
[[REG3:%[a-z]+]] +; CHECK: addl arr1+8(,[[REG1]],4), {{.*}} +; CHECK: movl ${{[1-4]+}}, ([[REG2]]) +; CHECK: movl ${{[1-4]+}}, ([[REG3]]) +; CHECK: movl ${{[1-4]+}}, ([[REG2]]) +; CHECK: movl ${{[1-4]+}}, ([[REG3]]) +} + +define void @test2(i64 %x) nounwind optsize { +entry: + %a = getelementptr inbounds [65 x %struct.anon1], [65 x %struct.anon1]* @arr1, i64 0, i64 %x, i32 0 + %tmp = load i32, i32* %a, align 4 + %b = getelementptr inbounds [65 x %struct.anon1], [65 x %struct.anon1]* @arr1, i64 0, i64 %x, i32 1 + %tmp1 = load i32, i32* %b, align 4 + %sub = sub i32 %tmp, %tmp1 + %c = getelementptr inbounds [65 x %struct.anon1], [65 x %struct.anon1]* @arr1, i64 0, i64 %x, i32 2 + %tmp2 = load i32, i32* %c, align 4 + %add = add nsw i32 %sub, %tmp2 + switch i32 %add, label %sw.epilog [ + i32 1, label %sw.bb.1 + i32 2, label %sw.bb.2 + ] + +sw.bb.1: ; preds = %entry + store i32 111, i32* %b, align 4 + store i32 222, i32* %c, align 4 + br label %sw.epilog + +sw.bb.2: ; preds = %entry + store i32 333, i32* %b, align 4 + store i32 444, i32* %c, align 4 + br label %sw.epilog + +sw.epilog: ; preds = %sw.bb.2, %sw.bb.1, %entry + ret void +; CHECK-LABEL: test2: +; CHECK: leaq (%rdi,%rdi,2), [[REG1:%[a-z]+]] +; CHECK: leaq arr1+4(,[[REG1]],4), [[REG2:%[a-z]+]] +; CHECK: movl -4([[REG2]]), {{.*}} +; CHECK: subl ([[REG2]]), {{.*}} +; CHECK: leaq arr1+8(,[[REG1]],4), [[REG3:%[a-z]+]] +; CHECK: addl ([[REG3]]), {{.*}} +; CHECK: movl ${{[1-4]+}}, ([[REG2]]) +; CHECK: movl ${{[1-4]+}}, ([[REG3]]) +; CHECK: movl ${{[1-4]+}}, ([[REG2]]) +; CHECK: movl ${{[1-4]+}}, ([[REG3]]) +} + +; Check that LEA optimization pass takes into account a resultant address +; displacement when choosing a LEA instruction for replacing a redundant +; address recalculation. + +define void @test3(i64 %x) nounwind optsize { +entry: + %a = getelementptr inbounds [65 x %struct.anon2], [65 x %struct.anon2]* @arr2, i64 0, i64 %x, i32 2 + %tmp = load i32, i32* %a, align 4 + %b = getelementptr inbounds [65 x %struct.anon2], [65 x %struct.anon2]* @arr2, i64 0, i64 %x, i32 0 + %tmp1 = load i32, i32* %b, align 4 + %add = add nsw i32 %tmp, %tmp1 + switch i32 %add, label %sw.epilog [ + i32 1, label %sw.bb.1 + i32 2, label %sw.bb.2 + ] + +sw.bb.1: ; preds = %entry + store i32 111, i32* %a, align 4 + store i32 222, i32* %b, align 4 + br label %sw.epilog + +sw.bb.2: ; preds = %entry + store i32 333, i32* %a, align 4 + store i32 444, i32* %b, align 4 + br label %sw.epilog + +sw.epilog: ; preds = %sw.bb.2, %sw.bb.1, %entry + ret void +; CHECK-LABEL: test3: +; CHECK: imulq {{.*}}, [[REG1:%[a-z]+]] +; CHECK: leaq arr2+132([[REG1]]), [[REG2:%[a-z]+]] +; CHECK: leaq arr2([[REG1]]), [[REG3:%[a-z]+]] + +; REG3's definition is closer to movl than REG2's, but the pass still chooses +; REG2 because it provides the resultant address displacement fitting 1 byte. + +; CHECK: movl ([[REG2]]), {{.*}} +; CHECK: addl ([[REG3]]), {{.*}} +; CHECK: movl ${{[1-4]+}}, ([[REG2]]) +; CHECK: movl ${{[1-4]+}}, ([[REG3]]) +; CHECK: movl ${{[1-4]+}}, ([[REG2]]) +; CHECK: movl ${{[1-4]+}}, ([[REG3]]) +} From c9119982e4c708faaab5f76a0f4bb81f42afeec6 Mon Sep 17 00:00:00 2001 From: Jonas Paulsson Date: Fri, 4 Dec 2015 12:48:51 +0000 Subject: [PATCH 061/364] [SystemZ] Bugfix: Don't add CC twice to new three-address instruction. Since BuildMI() automatically adds the implicit operands for a new instruction, adding the old instructions CC operand resulted in that there were two CC imp-def operands, where only one was marked as dead. 
This caused buildSchedGraph() to miss dependencies on the CC reg. Review by Ulrich Weigand git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254714 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/SystemZ/SystemZInstrInfo.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp index 85aa983da3a6..e6b5fc8e6235 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -676,7 +676,8 @@ SystemZInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, LiveVariables *LV) const { MachineInstr *MI = MBBI; MachineBasicBlock *MBB = MI->getParent(); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); unsigned Opcode = MI->getOpcode(); unsigned NumOps = MI->getNumOperands(); @@ -703,14 +704,19 @@ SystemZInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, } int ThreeOperandOpcode = SystemZ::getThreeOperandOpcode(Opcode); if (ThreeOperandOpcode >= 0) { - MachineInstrBuilder MIB = - BuildMI(*MBB, MBBI, MI->getDebugLoc(), get(ThreeOperandOpcode)) - .addOperand(Dest); + // Create three address instruction without adding the implicit + // operands. Those will instead be copied over from the original + // instruction by the loop below. + MachineInstrBuilder MIB(*MF, + MF->CreateMachineInstr(get(ThreeOperandOpcode), + MI->getDebugLoc(), /*NoImplicit=*/true)); + MIB.addOperand(Dest); // Keep the kill state, but drop the tied flag. MIB.addReg(Src.getReg(), getKillRegState(Src.isKill()), Src.getSubReg()); // Keep the remaining operands as-is. for (unsigned I = 2; I < NumOps; ++I) MIB.addOperand(MI->getOperand(I)); + MBB->insert(MI, MIB); return finishConvertToThreeAddress(MI, MIB, LV); } } From fe8212f64b7b754d41d9d5e06718f8d44e13cf5e Mon Sep 17 00:00:00 2001 From: Colin LeMahieu Date: Fri, 4 Dec 2015 15:48:45 +0000 Subject: [PATCH 062/364] [Hexagon] Using multiply instead of shift on signed number which can be UB git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254719 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp index 5e78762b994a..64050824c10b 100644 --- a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp +++ b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp @@ -554,13 +554,13 @@ struct HexagonOperand : public MCParsedAsmOperand { void adds4_6ImmOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); const MCConstantExpr *CE = dyn_cast(getImm()); - Inst.addOperand(MCOperand::createImm(CE->getValue() << 6)); + Inst.addOperand(MCOperand::createImm(CE->getValue() * 64)); } void adds3_6ImmOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); const MCConstantExpr *CE = dyn_cast(getImm()); - Inst.addOperand(MCOperand::createImm(CE->getValue() << 6)); + Inst.addOperand(MCOperand::createImm(CE->getValue() * 64)); } StringRef getToken() const { From e67b3e872170aa4f2cacf12cc7c6c3aed7373399 Mon Sep 17 00:00:00 2001 From: Tim Northover Date: Fri, 4 Dec 2015 16:10:48 +0000 Subject: [PATCH 063/364] ARM/AArch64: update reference documentation. There's a more comprehensive ACLE and a real v8 ARM ARM now. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254720 91177308-0d34-0410-b5e6-96231b3b80d8 --- docs/CompilerWriterInfo.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/CompilerWriterInfo.rst b/docs/CompilerWriterInfo.rst index 900ba24e230f..6c3ff4b10f1e 100644 --- a/docs/CompilerWriterInfo.rst +++ b/docs/CompilerWriterInfo.rst @@ -22,14 +22,16 @@ ARM * `ABI Addenda and Errata `_ -* `ARM C Language Extensions `_ +* `ARM C Language Extensions `_ AArch64 ------- +* `ARMv8 Architecture Reference Manual `_ + * `ARMv8 Instruction Set Overview `_ -* `ARM C Language Extensions `_ +* `ARM C Language Extensions `_ Itanium (ia64) -------------- From 608d7b4f4a08cf0ac4529ef1260667d82f3926c4 Mon Sep 17 00:00:00 2001 From: Rafael Espindola Date: Fri, 4 Dec 2015 16:14:31 +0000 Subject: [PATCH 064/364] Modernize the C++ APIs for creating LTO modules. This is a continuation of r253367. These functions return is owned by the caller, so they return std::unique_ptr now. The call can fail, so the return is wrapped in ErrorOr. They have a context where to report diagnostics, so they don't need to take a string out parameter. With this there are no call to getGlobalContext in lib/LTO. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254721 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/LTO/LTOModule.h | 42 +++++------ lib/LTO/LTOModule.cpp | 131 ++++++++++++++++------------------- tools/llvm-lto/llvm-lto.cpp | 49 +++++++------ tools/lto/lto.cpp | 70 +++++++++++++++---- 4 files changed, 165 insertions(+), 127 deletions(-) diff --git a/include/llvm/LTO/LTOModule.h b/include/llvm/LTO/LTOModule.h index 83a523613a76..97b5865bd47f 100644 --- a/include/llvm/LTO/LTOModule.h +++ b/include/llvm/LTO/LTOModule.h @@ -91,25 +91,24 @@ struct LTOModule { /// InitializeAllTargetMCs(); /// InitializeAllAsmPrinters(); /// InitializeAllAsmParsers(); - static LTOModule *createFromFile(const char *path, TargetOptions options, - std::string &errMsg); - static LTOModule *createFromOpenFile(int fd, const char *path, size_t size, - TargetOptions options, - std::string &errMsg); - static LTOModule *createFromOpenFileSlice(int fd, const char *path, - size_t map_size, off_t offset, - TargetOptions options, - std::string &errMsg); - static LTOModule *createFromBuffer(const void *mem, size_t length, - TargetOptions options, std::string &errMsg, - StringRef path = ""); - - static LTOModule *createInLocalContext(const void *mem, size_t length, - TargetOptions options, - std::string &errMsg, StringRef path); - static LTOModule *createInContext(const void *mem, size_t length, - TargetOptions options, std::string &errMsg, - StringRef path, LLVMContext *Context); + static ErrorOr> + createFromFile(LLVMContext &Context, const char *path, TargetOptions options); + static ErrorOr> + createFromOpenFile(LLVMContext &Context, int fd, const char *path, + size_t size, TargetOptions options); + static ErrorOr> + createFromOpenFileSlice(LLVMContext &Context, int fd, const char *path, + size_t map_size, off_t offset, TargetOptions options); + static ErrorOr> + createFromBuffer(LLVMContext &Context, const void *mem, size_t length, + TargetOptions options, StringRef path = ""); + + static ErrorOr> + createInLocalContext(const void *mem, size_t length, TargetOptions options, + StringRef path); + static ErrorOr> + createInContext(const void *mem, size_t length, TargetOptions options, + StringRef path, LLVMContext *Context); const Module &getModule() const { return const_cast(this)->getModule(); @@ -207,8 
+206,9 @@ struct LTOModule { bool objcClassNameFromExpression(const Constant *c, std::string &name); /// Create an LTOModule (private version). - static LTOModule *makeLTOModule(MemoryBufferRef Buffer, TargetOptions options, - std::string &errMsg, LLVMContext *Context); + static ErrorOr> + makeLTOModule(MemoryBufferRef Buffer, TargetOptions options, + LLVMContext *Context); }; } #endif diff --git a/lib/LTO/LTOModule.cpp b/lib/LTO/LTOModule.cpp index 42a568b54c7b..a6a3002e457d 100644 --- a/lib/LTO/LTOModule.cpp +++ b/lib/LTO/LTOModule.cpp @@ -100,89 +100,72 @@ std::string LTOModule::getProducerString(MemoryBuffer *Buffer) { return getBitcodeProducerString(*BCOrErr, Context); } -LTOModule *LTOModule::createFromFile(const char *path, TargetOptions options, - std::string &errMsg) { +ErrorOr> +LTOModule::createFromFile(LLVMContext &Context, const char *path, + TargetOptions options) { ErrorOr> BufferOrErr = MemoryBuffer::getFile(path); - if (std::error_code EC = BufferOrErr.getError()) { - errMsg = EC.message(); - return nullptr; - } + if (std::error_code EC = BufferOrErr.getError()) + return EC; std::unique_ptr Buffer = std::move(BufferOrErr.get()); - return makeLTOModule(Buffer->getMemBufferRef(), options, errMsg, - &getGlobalContext()); + return makeLTOModule(Buffer->getMemBufferRef(), options, &Context); } -LTOModule *LTOModule::createFromOpenFile(int fd, const char *path, size_t size, - TargetOptions options, - std::string &errMsg) { - return createFromOpenFileSlice(fd, path, size, 0, options, errMsg); +ErrorOr> +LTOModule::createFromOpenFile(LLVMContext &Context, int fd, const char *path, + size_t size, TargetOptions options) { + return createFromOpenFileSlice(Context, fd, path, size, 0, options); } -LTOModule *LTOModule::createFromOpenFileSlice(int fd, const char *path, - size_t map_size, off_t offset, - TargetOptions options, - std::string &errMsg) { +ErrorOr> +LTOModule::createFromOpenFileSlice(LLVMContext &Context, int fd, + const char *path, size_t map_size, + off_t offset, TargetOptions options) { ErrorOr> BufferOrErr = MemoryBuffer::getOpenFileSlice(fd, path, map_size, offset); - if (std::error_code EC = BufferOrErr.getError()) { - errMsg = EC.message(); - return nullptr; - } + if (std::error_code EC = BufferOrErr.getError()) + return EC; std::unique_ptr Buffer = std::move(BufferOrErr.get()); - return makeLTOModule(Buffer->getMemBufferRef(), options, errMsg, - &getGlobalContext()); + return makeLTOModule(Buffer->getMemBufferRef(), options, &Context); } -LTOModule *LTOModule::createFromBuffer(const void *mem, size_t length, - TargetOptions options, - std::string &errMsg, StringRef path) { - return createInContext(mem, length, options, errMsg, path, - &getGlobalContext()); +ErrorOr> +LTOModule::createFromBuffer(LLVMContext &Context, const void *mem, + size_t length, TargetOptions options, + StringRef path) { + return createInContext(mem, length, options, path, &Context); } -LTOModule *LTOModule::createInLocalContext(const void *mem, size_t length, - TargetOptions options, - std::string &errMsg, - StringRef path) { - return createInContext(mem, length, options, errMsg, path, nullptr); +ErrorOr> +LTOModule::createInLocalContext(const void *mem, size_t length, + TargetOptions options, StringRef path) { + return createInContext(mem, length, options, path, nullptr); } -LTOModule *LTOModule::createInContext(const void *mem, size_t length, - TargetOptions options, - std::string &errMsg, StringRef path, - LLVMContext *Context) { +ErrorOr> +LTOModule::createInContext(const void *mem, size_t 
length, + TargetOptions options, StringRef path, + LLVMContext *Context) { StringRef Data((const char *)mem, length); MemoryBufferRef Buffer(Data, path); - return makeLTOModule(Buffer, options, errMsg, Context); + return makeLTOModule(Buffer, options, Context); } -static std::unique_ptr parseBitcodeFileImpl(MemoryBufferRef Buffer, - LLVMContext &Context, - bool ShouldBeLazy, - std::string &ErrMsg) { +static ErrorOr> +parseBitcodeFileImpl(MemoryBufferRef Buffer, LLVMContext &Context, + bool ShouldBeLazy) { // Find the buffer. ErrorOr MBOrErr = IRObjectFile::findBitcodeInMemBuffer(Buffer); - if (std::error_code EC = MBOrErr.getError()) { - ErrMsg = EC.message(); - return nullptr; - } - - std::function DiagnosticHandler = - [&ErrMsg](const DiagnosticInfo &DI) { - raw_string_ostream Stream(ErrMsg); - DiagnosticPrinterRawOStream DP(Stream); - DI.print(DP); - }; + if (std::error_code EC = MBOrErr.getError()) + return EC; if (!ShouldBeLazy) { // Parse the full file. - ErrorOr> M = - parseBitcodeFile(*MBOrErr, Context, DiagnosticHandler); - if (!M) - return nullptr; + ErrorOr> M = parseBitcodeFile(*MBOrErr, Context); + if (std::error_code EC = M.getError()) + return EC; return std::move(*M); } @@ -190,16 +173,16 @@ static std::unique_ptr parseBitcodeFileImpl(MemoryBufferRef Buffer, std::unique_ptr LightweightBuf = MemoryBuffer::getMemBuffer(*MBOrErr, false); ErrorOr> M = - getLazyBitcodeModule(std::move(LightweightBuf), Context, - DiagnosticHandler, true /*ShouldLazyLoadMetadata*/); - if (!M) - return nullptr; + getLazyBitcodeModule(std::move(LightweightBuf), Context, nullptr, + true /*ShouldLazyLoadMetadata*/); + if (std::error_code EC = M.getError()) + return EC; return std::move(*M); } -LTOModule *LTOModule::makeLTOModule(MemoryBufferRef Buffer, - TargetOptions options, std::string &errMsg, - LLVMContext *Context) { +ErrorOr> +LTOModule::makeLTOModule(MemoryBufferRef Buffer, TargetOptions options, + LLVMContext *Context) { std::unique_ptr OwnedContext; if (!Context) { OwnedContext = llvm::make_unique(); @@ -208,11 +191,12 @@ LTOModule *LTOModule::makeLTOModule(MemoryBufferRef Buffer, // If we own a context, we know this is being used only for symbol // extraction, not linking. Be lazy in that case. 
- std::unique_ptr M = parseBitcodeFileImpl( - Buffer, *Context, - /* ShouldBeLazy */ static_cast(OwnedContext), errMsg); - if (!M) - return nullptr; + ErrorOr> MOrErr = + parseBitcodeFileImpl(Buffer, *Context, + /* ShouldBeLazy */ static_cast(OwnedContext)); + if (std::error_code EC = MOrErr.getError()) + return EC; + std::unique_ptr &M = *MOrErr; std::string TripleStr = M->getTargetTriple(); if (TripleStr.empty()) @@ -220,9 +204,10 @@ LTOModule *LTOModule::makeLTOModule(MemoryBufferRef Buffer, llvm::Triple Triple(TripleStr); // find machine architecture for this module + std::string errMsg; const Target *march = TargetRegistry::lookupTarget(TripleStr, errMsg); if (!march) - return nullptr; + return std::unique_ptr(nullptr); // construct LTOModule, hand over ownership of module and target SubtargetFeatures Features; @@ -246,16 +231,16 @@ LTOModule *LTOModule::makeLTOModule(MemoryBufferRef Buffer, std::unique_ptr IRObj( new object::IRObjectFile(Buffer, std::move(M))); - LTOModule *Ret; + std::unique_ptr Ret; if (OwnedContext) - Ret = new LTOModule(std::move(IRObj), target, std::move(OwnedContext)); + Ret.reset(new LTOModule(std::move(IRObj), target, std::move(OwnedContext))); else - Ret = new LTOModule(std::move(IRObj), target); + Ret.reset(new LTOModule(std::move(IRObj), target)); Ret->parseSymbols(); Ret->parseMetadata(); - return Ret; + return std::move(Ret); } /// Create a MemoryBuffer from a memory range with an optional name. diff --git a/tools/llvm-lto/llvm-lto.cpp b/tools/llvm-lto/llvm-lto.cpp index 07cd9bb1eaf8..4bc692279b9e 100644 --- a/tools/llvm-lto/llvm-lto.cpp +++ b/tools/llvm-lto/llvm-lto.cpp @@ -124,23 +124,27 @@ static void handleDiagnostics(lto_codegen_diagnostic_severity_t Severity, errs() << Msg << "\n"; } +static std::string CurrentActivity; static void diagnosticHandler(const DiagnosticInfo &DI) { raw_ostream &OS = errs(); OS << "llvm-lto: "; switch (DI.getSeverity()) { case DS_Error: - OS << "error: "; + OS << "error"; break; case DS_Warning: - OS << "warning: "; + OS << "warning"; break; case DS_Remark: - OS << "remark: "; + OS << "remark"; break; case DS_Note: - OS << "note: "; + OS << "note"; break; } + if (!CurrentActivity.empty()) + OS << ' ' << CurrentActivity; + OS << ": "; DiagnosticPrinterRawOStream DP(OS); DI.print(DP); @@ -150,6 +154,11 @@ static void diagnosticHandler(const DiagnosticInfo &DI) { exit(1); } +static void diagnosticHandlerWithContenxt(const DiagnosticInfo &DI, + void *Context) { + diagnosticHandler(DI); +} + static void error(const Twine &Msg) { errs() << "llvm-lto: " << Msg << '\n'; exit(1); @@ -172,12 +181,11 @@ getLocalLTOModule(StringRef Path, std::unique_ptr &Buffer, MemoryBuffer::getFile(Path); error(BufferOrErr, "error loading file '" + Path + "'"); Buffer = std::move(BufferOrErr.get()); - std::string Error; - std::unique_ptr Ret(LTOModule::createInLocalContext( - Buffer->getBufferStart(), Buffer->getBufferSize(), Options, Error, Path)); - if (!Error.empty()) - error("error loading file '" + Path + "' " + Error); - return Ret; + CurrentActivity = ("loading file '" + Path + "'").str(); + ErrorOr> Ret = LTOModule::createInLocalContext( + Buffer->getBufferStart(), Buffer->getBufferSize(), Options, Path); + CurrentActivity = ""; + return std::move(*Ret); } /// \brief List symbols in each IR file. 
@@ -207,10 +215,11 @@ static void createCombinedFunctionIndex() { FunctionInfoIndex CombinedIndex; uint64_t NextModuleId = 0; for (auto &Filename : InputFilenames) { + CurrentActivity = "loading file '" + Filename + "'"; ErrorOr> IndexOrErr = llvm::getFunctionIndexForFile(Filename, diagnosticHandler); - error(IndexOrErr, "error loading file '" + Filename + "'"); std::unique_ptr Index = std::move(IndexOrErr.get()); + CurrentActivity = ""; // Skip files without a function summary. if (!Index) continue; @@ -257,7 +266,10 @@ int main(int argc, char **argv) { unsigned BaseArg = 0; - LTOCodeGenerator CodeGen(getGlobalContext()); + LLVMContext Context; + Context.setDiagnosticHandler(diagnosticHandlerWithContenxt, nullptr, true); + + LTOCodeGenerator CodeGen(Context); if (UseDiagnosticHandler) CodeGen.setDiagnosticHandler(handleDiagnostics, nullptr); @@ -274,14 +286,11 @@ int main(int argc, char **argv) { std::vector KeptDSOSyms; for (unsigned i = BaseArg; i < InputFilenames.size(); ++i) { - std::string error; - std::unique_ptr Module( - LTOModule::createFromFile(InputFilenames[i].c_str(), Options, error)); - if (!error.empty()) { - errs() << argv[0] << ": error loading file '" << InputFilenames[i] - << "': " << error << "\n"; - return 1; - } + CurrentActivity = "loading file '" + InputFilenames[i] + "'"; + ErrorOr> ModuleOrErr = + LTOModule::createFromFile(Context, InputFilenames[i].c_str(), Options); + std::unique_ptr &Module = *ModuleOrErr; + CurrentActivity = ""; unsigned NumSyms = Module->getSymbolCount(); for (unsigned I = 0; I < NumSyms; ++I) { diff --git a/tools/lto/lto.cpp b/tools/lto/lto.cpp index 6058fb1197eb..d13de57e830c 100644 --- a/tools/lto/lto.cpp +++ b/tools/lto/lto.cpp @@ -15,6 +15,8 @@ #include "llvm-c/lto.h" #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/CommandFlags.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/LLVMContext.h" #include "llvm/LTO/LTOCodeGenerator.h" #include "llvm/LTO/LTOModule.h" @@ -64,6 +66,24 @@ static bool initialized = false; // Holds the command-line option parsing state of the LTO module. static bool parsedOptions = false; +static LLVMContext *LTOContext = nullptr; + +static void diagnosticHandler(const DiagnosticInfo &DI, void *Context) { + if (DI.getSeverity() != DS_Error) { + DiagnosticPrinterRawOStream DP(errs()); + DI.print(DP); + errs() << '\n'; + return; + } + sLastErrorString = ""; + { + raw_string_ostream Stream(sLastErrorString); + DiagnosticPrinterRawOStream DP(Stream); + DI.print(DP); + } + sLastErrorString += '\n'; +} + // Initialize the configured targets if they have not been initialized. static void lto_initialize() { if (!initialized) { @@ -79,6 +99,9 @@ static void lto_initialize() { InitializeAllAsmParsers(); InitializeAllAsmPrinters(); InitializeAllDisassemblers(); + + LTOContext = &getGlobalContext(); + LTOContext->setDiagnosticHandler(diagnosticHandler, nullptr, true); initialized = true; } } @@ -95,7 +118,7 @@ static void handleLibLTODiagnostic(lto_codegen_diagnostic_severity_t Severity, // libLTO API semantics, which require that the code generator owns the object // file. 
struct LibLTOCodeGenerator : LTOCodeGenerator { - LibLTOCodeGenerator() : LTOCodeGenerator(getGlobalContext()) { + LibLTOCodeGenerator() : LTOCodeGenerator(*LTOContext) { setDiagnosticHandler(handleLibLTODiagnostic, nullptr); } LibLTOCodeGenerator(std::unique_ptr Context) : LTOCodeGenerator(*Context), OwnedContext(std::move(Context)) { @@ -166,14 +189,21 @@ lto_module_is_object_file_in_memory_for_target(const void* mem, lto_module_t lto_module_create(const char* path) { lto_initialize(); llvm::TargetOptions Options = InitTargetOptionsFromCodeGenFlags(); - return wrap(LTOModule::createFromFile(path, Options, sLastErrorString)); + ErrorOr> M = + LTOModule::createFromFile(*LTOContext, path, Options); + if (!M) + return nullptr; + return wrap(M->release()); } lto_module_t lto_module_create_from_fd(int fd, const char *path, size_t size) { lto_initialize(); llvm::TargetOptions Options = InitTargetOptionsFromCodeGenFlags(); - return wrap( - LTOModule::createFromOpenFile(fd, path, size, Options, sLastErrorString)); + ErrorOr> M = + LTOModule::createFromOpenFile(*LTOContext, fd, path, size, Options); + if (!M) + return nullptr; + return wrap(M->release()); } lto_module_t lto_module_create_from_fd_at_offset(int fd, const char *path, @@ -182,14 +212,21 @@ lto_module_t lto_module_create_from_fd_at_offset(int fd, const char *path, off_t offset) { lto_initialize(); llvm::TargetOptions Options = InitTargetOptionsFromCodeGenFlags(); - return wrap(LTOModule::createFromOpenFileSlice(fd, path, map_size, offset, - Options, sLastErrorString)); + ErrorOr> M = LTOModule::createFromOpenFileSlice( + *LTOContext, fd, path, map_size, offset, Options); + if (!M) + return nullptr; + return wrap(M->release()); } lto_module_t lto_module_create_from_memory(const void* mem, size_t length) { lto_initialize(); llvm::TargetOptions Options = InitTargetOptionsFromCodeGenFlags(); - return wrap(LTOModule::createFromBuffer(mem, length, Options, sLastErrorString)); + ErrorOr> M = + LTOModule::createFromBuffer(*LTOContext, mem, length, Options); + if (!M) + return nullptr; + return wrap(M->release()); } lto_module_t lto_module_create_from_memory_with_path(const void* mem, @@ -197,16 +234,22 @@ lto_module_t lto_module_create_from_memory_with_path(const void* mem, const char *path) { lto_initialize(); llvm::TargetOptions Options = InitTargetOptionsFromCodeGenFlags(); - return wrap( - LTOModule::createFromBuffer(mem, length, Options, sLastErrorString, path)); + ErrorOr> M = + LTOModule::createFromBuffer(*LTOContext, mem, length, Options, path); + if (!M) + return nullptr; + return wrap(M->release()); } lto_module_t lto_module_create_in_local_context(const void *mem, size_t length, const char *path) { lto_initialize(); llvm::TargetOptions Options = InitTargetOptionsFromCodeGenFlags(); - return wrap(LTOModule::createInLocalContext(mem, length, Options, - sLastErrorString, path)); + ErrorOr> M = + LTOModule::createInLocalContext(mem, length, Options, path); + if (!M) + return nullptr; + return wrap(M->release()); } lto_module_t lto_module_create_in_codegen_context(const void *mem, @@ -215,8 +258,9 @@ lto_module_t lto_module_create_in_codegen_context(const void *mem, lto_code_gen_t cg) { lto_initialize(); llvm::TargetOptions Options = InitTargetOptionsFromCodeGenFlags(); - return wrap(LTOModule::createInContext(mem, length, Options, sLastErrorString, - path, &unwrap(cg)->getContext())); + ErrorOr> M = LTOModule::createInContext( + mem, length, Options, path, &unwrap(cg)->getContext()); + return wrap(M->release()); } void 
lto_module_dispose(lto_module_t mod) { delete unwrap(mod); } From 42a2b123da9cb79b3ffad472b64acdb981c1e850 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Fri, 4 Dec 2015 16:18:15 +0000 Subject: [PATCH 065/364] [Hexagon] Simplify LowerCONCAT_VECTORS, handle different types better git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254724 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/Hexagon/HexagonISelLowering.cpp | 113 ++++++++++----------- 1 file changed, 55 insertions(+), 58 deletions(-) diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index b59fe6b67044..0a89ef424dd2 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -1520,7 +1520,9 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, Subtarget(ST) { bool IsV4 = !Subtarget.hasV5TOps(); auto &HRI = *Subtarget.getRegisterInfo(); - bool UseHVX = Subtarget.useHVXOps(), UseHVXDbl = Subtarget.useHVXDblOps(); + bool UseHVX = Subtarget.useHVXOps(); + bool UseHVXSgl = Subtarget.useHVXSglOps(); + bool UseHVXDbl = Subtarget.useHVXDblOps(); setPrefLoopAlignment(4); setPrefFunctionAlignment(4); @@ -1808,17 +1810,18 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom); if (UseHVX) { - if(!UseHVXDbl) { - setOperationAction(ISD::CONCAT_VECTORS, MVT::v128i8, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i16, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i64, Custom); - } - else { - setOperationAction(ISD::CONCAT_VECTORS, MVT::v256i8, Custom); + if (UseHVXSgl) { + setOperationAction(ISD::CONCAT_VECTORS, MVT::v128i8, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i16, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i64, Custom); + } else if (UseHVXDbl) { + setOperationAction(ISD::CONCAT_VECTORS, MVT::v256i8, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v128i16, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i64, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i64, Custom); + } else { + llvm_unreachable("Unrecognized HVX mode"); } } // Subtarget-specific operation actions. @@ -2212,8 +2215,7 @@ HexagonTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { unsigned Size = VT.getSizeInBits(); - // A vector larger than 64 bits cannot be represented in Hexagon. - // Expand will split the vector. + // Only handle vectors of 64 bits or shorter. 
if (Size > 64) return SDValue(); @@ -2350,63 +2352,58 @@ HexagonTargetLowering::LowerCONCAT_VECTORS(SDValue Op, bool UseHVX = Subtarget.useHVXOps(); EVT VT = Op.getValueType(); unsigned NElts = Op.getNumOperands(); - SDValue Vec = Op.getOperand(0); - EVT VecVT = Vec.getValueType(); - SDValue Width = DAG.getConstant(VecVT.getSizeInBits(), dl, MVT::i64); - SDValue Shifted = DAG.getNode(ISD::SHL, dl, MVT::i64, Width, - DAG.getConstant(32, dl, MVT::i64)); - SDValue ConstVal = DAG.getConstant(0, dl, MVT::i64); - - ConstantSDNode *W = dyn_cast(Width); - ConstantSDNode *S = dyn_cast(Shifted); - - if ((VecVT.getSimpleVT() == MVT::v2i16) && (NElts == 2) && W && S) { - if ((W->getZExtValue() == 32) && ((S->getZExtValue() >> 32) == 32)) { - // We are trying to concat two v2i16 to a single v4i16. - SDValue Vec0 = Op.getOperand(1); - SDValue Combined = DAG.getNode(HexagonISD::COMBINE, dl, VT, Vec0, Vec); - return DAG.getNode(ISD::BITCAST, dl, VT, Combined); + SDValue Vec0 = Op.getOperand(0); + EVT VecVT = Vec0.getValueType(); + unsigned Width = VecVT.getSizeInBits(); + + if (NElts == 2) { + MVT ST = VecVT.getSimpleVT(); + // We are trying to concat two v2i16 to a single v4i16, or two v4i8 + // into a single v8i8. + if (ST == MVT::v2i16 || ST == MVT::v4i8) + return DAG.getNode(HexagonISD::COMBINE, dl, VT, Op.getOperand(1), Vec0); + + if (UseHVX) { + assert((Width == 64*8 && Subtarget.useHVXSglOps()) || + (Width == 128*8 && Subtarget.useHVXDblOps())); + SDValue Vec1 = Op.getOperand(1); + MVT OpTy = Subtarget.useHVXSglOps() ? MVT::v16i32 : MVT::v32i32; + MVT ReTy = Subtarget.useHVXSglOps() ? MVT::v32i32 : MVT::v64i32; + SDValue B0 = DAG.getNode(ISD::BITCAST, dl, OpTy, Vec0); + SDValue B1 = DAG.getNode(ISD::BITCAST, dl, OpTy, Vec1); + SDValue VC = DAG.getNode(HexagonISD::VCOMBINE, dl, ReTy, B1, B0); + return DAG.getNode(ISD::BITCAST, dl, VT, VC); } } - if ((VecVT.getSimpleVT() == MVT::v4i8) && (NElts == 2) && W && S) { - if ((W->getZExtValue() == 32) && ((S->getZExtValue() >> 32) == 32)) { - // We are trying to concat two v4i8 to a single v8i8. - SDValue Vec0 = Op.getOperand(1); - SDValue Combined = DAG.getNode(HexagonISD::COMBINE, dl, VT, Vec0, Vec); - return DAG.getNode(ISD::BITCAST, dl, VT, Combined); - } - } + if (VT.getSizeInBits() != 32 && VT.getSizeInBits() != 64) + return SDValue(); + + SDValue C0 = DAG.getConstant(0, dl, MVT::i64); + SDValue C32 = DAG.getConstant(32, dl, MVT::i64); + SDValue W = DAG.getConstant(Width, dl, MVT::i64); + // Create the "width" part of the argument to insert_rp/insertp_rp. 
+ SDValue S = DAG.getNode(ISD::SHL, dl, MVT::i64, W, C32); + SDValue V = C0; - if (UseHVX) { - SDValue Vec0 = Op.getOperand(1); - assert((VecVT.getSizeInBits() == 64*8 && Subtarget.useHVXSglOps()) || - (VecVT.getSizeInBits() == 128*8 && Subtarget.useHVXDblOps())); - SDValue Combined = DAG.getNode(HexagonISD::VCOMBINE, dl, VT, Vec0, Vec); - return Combined; - } for (unsigned i = 0, e = NElts; i != e; ++i) { - unsigned OpIdx = NElts - i - 1; - SDValue Operand = Op.getOperand(OpIdx); + unsigned N = NElts-i-1; + SDValue OpN = Op.getOperand(N); - if (VT.getSizeInBits() == 64 && - Operand.getValueType().getSizeInBits() == 32) { + if (VT.getSizeInBits() == 64 && OpN.getValueType().getSizeInBits() == 32) { SDValue C = DAG.getConstant(0, dl, MVT::i32); - Operand = DAG.getNode(HexagonISD::COMBINE, dl, VT, C, Operand); + OpN = DAG.getNode(HexagonISD::COMBINE, dl, VT, C, OpN); } - - SDValue Idx = DAG.getConstant(OpIdx, dl, MVT::i64); - SDValue Offset = DAG.getNode(ISD::MUL, dl, MVT::i64, Idx, Width); - SDValue Combined = DAG.getNode(ISD::OR, dl, MVT::i64, Shifted, Offset); - const SDValue Ops[] = {ConstVal, Operand, Combined}; - + SDValue Idx = DAG.getConstant(N, dl, MVT::i64); + SDValue Offset = DAG.getNode(ISD::MUL, dl, MVT::i64, Idx, W); + SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i64, S, Offset); if (VT.getSizeInBits() == 32) - ConstVal = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i32, Ops); + V = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i32, {V, OpN, Or}); else - ConstVal = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i64, Ops); + V = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i64, {V, OpN, Or}); } - return DAG.getNode(ISD::BITCAST, dl, VT, ConstVal); + return DAG.getNode(ISD::BITCAST, dl, VT, V); } SDValue From 3bf2abe88cac804262355d2070803d6334863c6b Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Fri, 4 Dec 2015 17:09:42 +0000 Subject: [PATCH 066/364] [WebAssembly] Use SelectionDAG::getUNDEF. NFC. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254726 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/WebAssembly/WebAssemblyISelLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 2485df1ab5d2..9f0860550533 100644 --- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -400,7 +400,7 @@ SDValue WebAssemblyTargetLowering::LowerFormalArguments( In.Used ? DAG.getNode(WebAssemblyISD::ARGUMENT, DL, In.VT, DAG.getTargetConstant(InVals.size(), DL, MVT::i32)) - : DAG.getNode(ISD::UNDEF, DL, In.VT)); + : DAG.getUNDEF(In.VT)); // Record the number and types of arguments. MF.getInfo()->addParam(In.VT); From 0771e45d22709610a9ee404063de8de05b9d92a8 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Fri, 4 Dec 2015 17:12:52 +0000 Subject: [PATCH 067/364] [WebAssembly] Check for more unsupported ABI flags. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254727 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../WebAssembly/WebAssemblyISelLowering.cpp | 27 ++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 9f0860550533..c54ffb0654a6 100644 --- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -289,6 +289,18 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI, if (Ins.size() > 1) fail(DL, DAG, "WebAssembly doesn't support more than 1 returned value yet"); + SmallVectorImpl &Outs = CLI.Outs; + for (const ISD::OutputArg &Out : Outs) { + assert(!Out.Flags.isByVal() && "byval is not valid for return values"); + assert(!Out.Flags.isNest() && "nest is not valid for return values"); + if (Out.Flags.isInAlloca()) + fail(DL, DAG, "WebAssembly hasn't implemented inalloca results"); + if (Out.Flags.isInConsecutiveRegs()) + fail(DL, DAG, "WebAssembly hasn't implemented cons regs results"); + if (Out.Flags.isInConsecutiveRegsLast()) + fail(DL, DAG, "WebAssembly hasn't implemented cons regs last results"); + } + bool IsVarArg = CLI.IsVarArg; if (IsVarArg) fail(DL, DAG, "WebAssembly doesn't support varargs yet"); @@ -309,8 +321,21 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI, Ops.append(OutVals.begin(), OutVals.end()); SmallVector Tys; - for (const auto &In : Ins) + for (const auto &In : Ins) { + if (In.Flags.isByVal()) + fail(DL, DAG, "WebAssembly hasn't implemented byval arguments"); + if (In.Flags.isInAlloca()) + fail(DL, DAG, "WebAssembly hasn't implemented inalloca arguments"); + if (In.Flags.isNest()) + fail(DL, DAG, "WebAssembly hasn't implemented nest arguments"); + if (In.Flags.isInConsecutiveRegs()) + fail(DL, DAG, "WebAssembly hasn't implemented cons regs arguments"); + if (In.Flags.isInConsecutiveRegsLast()) + fail(DL, DAG, "WebAssembly hasn't implemented cons regs last arguments"); + // Ignore In.getOrigAlign() because all our arguments are passed in + // registers. Tys.push_back(In.VT); + } Tys.push_back(MVT::Other); SDVTList TyList = DAG.getVTList(Tys); SDValue Res = From 1951dce526cca48df403afbb46de939be15acb73 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Fri, 4 Dec 2015 17:16:07 +0000 Subject: [PATCH 068/364] [WebAssembly] Factor out the list of supported calling conventions. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254728 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../WebAssembly/WebAssemblyISelLowering.cpp | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index c54ffb0654a6..79d7cbbefa2b 100644 --- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -257,6 +257,16 @@ static void fail(SDLoc DL, SelectionDAG &DAG, const char *msg) { DiagnosticInfoUnsupported(DL, *MF.getFunction(), msg, SDValue())); } +// Test whether the given calling convention is supported. +static bool +CallingConvSupported(CallingConv::ID CallConv) { + // We currently support the language-independent target-independent + // conventions. 
+ return CallConv == CallingConv::C || + CallConv == CallingConv::Fast || + CallConv == CallingConv::Cold; +} + SDValue WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { @@ -267,8 +277,7 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI, MachineFunction &MF = DAG.getMachineFunction(); CallingConv::ID CallConv = CLI.CallConv; - if (CallConv != CallingConv::C && CallConv != CallingConv::Fast && - CallConv != CallingConv::Cold) + if (!CallingConvSupported(CallConv)) fail(DL, DAG, "WebAssembly doesn't support language-specific or target-specific " "calling conventions yet"); @@ -367,7 +376,7 @@ SDValue WebAssemblyTargetLowering::LowerReturn( const SmallVectorImpl &OutVals, SDLoc DL, SelectionDAG &DAG) const { assert(Outs.size() <= 1 && "WebAssembly can only return up to one value"); - if (CallConv != CallingConv::C) + if (!CallingConvSupported(CallConv)) fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions"); if (IsVarArg) fail(DL, DAG, "WebAssembly doesn't support varargs yet"); @@ -399,7 +408,7 @@ SDValue WebAssemblyTargetLowering::LowerFormalArguments( SmallVectorImpl &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); - if (CallConv != CallingConv::C) + if (!CallingConvSupported(CallConv)) fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions"); if (IsVarArg) fail(DL, DAG, "WebAssembly doesn't support varargs yet"); From e0b2e5de57f8488a3c01cf55161cb9ec946b5fec Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Fri, 4 Dec 2015 17:18:32 +0000 Subject: [PATCH 069/364] [WebAssembly] clang-format CallingConvSupported. NFC. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254729 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/WebAssembly/WebAssemblyISelLowering.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 79d7cbbefa2b..b651855eea7a 100644 --- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -258,12 +258,10 @@ static void fail(SDLoc DL, SelectionDAG &DAG, const char *msg) { } // Test whether the given calling convention is supported. -static bool -CallingConvSupported(CallingConv::ID CallConv) { +static bool CallingConvSupported(CallingConv::ID CallConv) { // We currently support the language-independent target-independent // conventions. - return CallConv == CallingConv::C || - CallConv == CallingConv::Fast || + return CallConv == CallingConv::C || CallConv == CallingConv::Fast || CallConv == CallingConv::Cold; } From 6326d04e5969570e0c7f883e1fadbcf6a1fc4e1b Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Fri, 4 Dec 2015 17:19:44 +0000 Subject: [PATCH 070/364] [WebAssembly] Give names to the callseq begin and end instructions. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254730 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/WebAssembly/WebAssemblyInstrCall.td | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/lib/Target/WebAssembly/WebAssemblyInstrCall.td index 4028460bd231..018d26cfacda 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrCall.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrCall.td @@ -17,10 +17,10 @@ let Defs = [ARGUMENTS] in { // The call sequence start/end LLVM-isms isn't useful to WebAssembly since it's // a virtual ISA. 
let isCodeGenOnly = 1 in { -def : I<(outs), (ins i64imm:$amt), - [(WebAssemblycallseq_start timm:$amt)]>; -def : I<(outs), (ins i64imm:$amt1, i64imm:$amt2), - [(WebAssemblycallseq_end timm:$amt1, timm:$amt2)]>; +def ADJCALLSTACKDOWN : I<(outs), (ins i64imm:$amt), + [(WebAssemblycallseq_start timm:$amt)]>; +def ADJCALLSTACKUP : I<(outs), (ins i64imm:$amt1, i64imm:$amt2), + [(WebAssemblycallseq_end timm:$amt1, timm:$amt2)]>; } // isCodeGenOnly = 1 multiclass CALL { From 2a3cc32f7191837dc796338f42d9d75d188b4ce6 Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Fri, 4 Dec 2015 17:20:04 +0000 Subject: [PATCH 071/364] [llvm-dwp] Retrieve the DWOID from the CU for the cu_index entry git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254731 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/DebugInfo/DWARF/DWARFFormValue.h | 3 + lib/DebugInfo/DWARF/DWARFFormValue.cpp | 10 ++- test/tools/llvm-dwp/X86/simple.test | 6 +- tools/llvm-dwp/llvm-dwp.cpp | 63 ++++++++++++++++++- 4 files changed, 76 insertions(+), 6 deletions(-) diff --git a/include/llvm/DebugInfo/DWARF/DWARFFormValue.h b/include/llvm/DebugInfo/DWARF/DWARFFormValue.h index 7ddcc0d81d59..3c32a3e5b794 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFFormValue.h +++ b/include/llvm/DebugInfo/DWARF/DWARFFormValue.h @@ -84,6 +84,9 @@ class DWARFFormValue { const DWARFUnit *u) const; static bool skipValue(uint16_t form, DataExtractor debug_info_data, uint32_t *offset_ptr, const DWARFUnit *u); + static bool skipValue(uint16_t form, DataExtractor debug_info_data, + uint32_t *offset_ptr, uint16_t Version, + uint8_t AddrSize); static ArrayRef getFixedFormSizes(uint8_t AddrSize, uint16_t Version); diff --git a/lib/DebugInfo/DWARF/DWARFFormValue.cpp b/lib/DebugInfo/DWARF/DWARFFormValue.cpp index a11b00a926d9..3dc58423df68 100644 --- a/lib/DebugInfo/DWARF/DWARFFormValue.cpp +++ b/lib/DebugInfo/DWARF/DWARFFormValue.cpp @@ -261,6 +261,12 @@ DWARFFormValue::skipValue(DataExtractor debug_info_data, uint32_t* offset_ptr, bool DWARFFormValue::skipValue(uint16_t form, DataExtractor debug_info_data, uint32_t *offset_ptr, const DWARFUnit *cu) { + return skipValue(form, debug_info_data, offset_ptr, cu->getVersion(), + cu->getAddressByteSize()); +} +bool DWARFFormValue::skipValue(uint16_t form, DataExtractor debug_info_data, + uint32_t *offset_ptr, uint16_t Version, + uint8_t AddrSize) { bool indirect = false; do { switch (form) { @@ -295,10 +301,10 @@ DWARFFormValue::skipValue(uint16_t form, DataExtractor debug_info_data, // Compile unit address sized values case DW_FORM_addr: - *offset_ptr += cu->getAddressByteSize(); + *offset_ptr += AddrSize; return true; case DW_FORM_ref_addr: - *offset_ptr += getRefAddrSize(cu->getAddressByteSize(), cu->getVersion()); + *offset_ptr += getRefAddrSize(AddrSize, Version); return true; // 0 byte values - implied from the form. 
diff --git a/test/tools/llvm-dwp/X86/simple.test b/test/tools/llvm-dwp/X86/simple.test index 6ee19697442d..1c7b1040bd31 100644 --- a/test/tools/llvm-dwp/X86/simple.test +++ b/test/tools/llvm-dwp/X86/simple.test @@ -28,6 +28,7 @@ CHECK: .debug_info.dwo contents: CHECK: 0x00000000: Compile Unit: length = 0x00000025 version = 0x0004 abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x00000029) CHECK: DW_TAG_compile_unit CHECK: DW_AT_name {{.*}} "a.cpp" +CHECK: DW_AT_GNU_dwo_id {{.*}} ([[DWOA:.*]]) CHECK: DW_TAG_variable CHECK: DW_AT_name {{.*}} "a" CHECK: DW_TAG_structure_type @@ -35,6 +36,7 @@ CHECK: DW_AT_name {{.*}} "foo" CHECK: 0x00000029: Compile Unit: length = 0x00000031 version = 0x0004 abbr_offset = 0x0031 addr_size = 0x08 (next unit at 0x0000005e) CHECK: DW_AT_name {{.*}} "b.cpp" +CHECK: DW_AT_GNU_dwo_id {{.*}} ([[DWOB:.*]]) CHECK: DW_TAG_structure_type CHECK: DW_AT_name {{.*}} "bar" CHECK: DW_TAG_subprogram @@ -45,8 +47,8 @@ CHECK: .debug_cu_index contents: Ensure only the relevant/contained sections are included in the table: CHECK: Index Signature INFO ABBREV STR_OFFSETS Don't bother checking the Signatures, they aren't correct yet. -CHECK: [0x00000000, 0x00000029) [0x00000000, 0x00000031) [0x00000000, 0x00000010) -CHECK: [0x00000029, 0x0000005e) [0x00000031, 0x00000075) [0x00000010, 0x00000024) +CHECK: 1 [[DWOA]] [0x00000000, 0x00000029) [0x00000000, 0x00000031) [0x00000000, 0x00000010) +CHECK: 2 [[DWOB]] [0x00000029, 0x0000005e) [0x00000031, 0x00000075) [0x00000010, 0x00000024) CHECK: .debug_str.dwo contents: CHECK: "clang version diff --git a/tools/llvm-dwp/llvm-dwp.cpp b/tools/llvm-dwp/llvm-dwp.cpp index e6a90cf8a3cf..b68ba437f830 100644 --- a/tools/llvm-dwp/llvm-dwp.cpp +++ b/tools/llvm-dwp/llvm-dwp.cpp @@ -1,5 +1,6 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringSet.h" +#include "llvm/DebugInfo/DWARF/DWARFFormValue.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" @@ -82,6 +83,52 @@ writeStringsAndOffsets(MCStreamer &Out, StringMap &Strings, return std::error_code(); } +static uint32_t getCUAbbrev(StringRef Abbrev, uint64_t AbbrCode) { + uint64_t CurCode; + uint32_t Offset = 0; + DataExtractor AbbrevData(Abbrev, true, 0); + while ((CurCode = AbbrevData.getULEB128(&Offset)) != AbbrCode) { + // Tag + AbbrevData.getULEB128(&Offset); + // DW_CHILDREN + AbbrevData.getU8(&Offset); + // Attributes + while (AbbrevData.getULEB128(&Offset) | AbbrevData.getULEB128(&Offset)) + ; + } + return Offset; +} + +static uint64_t getCUSignature(StringRef Abbrev, StringRef Info) { + uint32_t Offset = 0; + DataExtractor InfoData(Info, true, 0); + InfoData.getU32(&Offset); // Length + uint16_t Version = InfoData.getU16(&Offset); + InfoData.getU32(&Offset); // Abbrev offset (should be zero) + uint8_t AddrSize = InfoData.getU8(&Offset); + + uint32_t AbbrCode = InfoData.getULEB128(&Offset); + + DataExtractor AbbrevData(Abbrev, true, 0); + uint32_t AbbrevOffset = getCUAbbrev(Abbrev, AbbrCode); + uint64_t Tag = AbbrevData.getULEB128(&AbbrevOffset); + (void)Tag; + // FIXME: Real error handling + assert(Tag == dwarf::DW_TAG_compile_unit); + // DW_CHILDREN + AbbrevData.getU8(&AbbrevOffset); + uint32_t Name; + uint32_t Form; + while ((Name = AbbrevData.getULEB128(&AbbrevOffset)) | + (Form = AbbrevData.getULEB128(&AbbrevOffset)) && + Name != dwarf::DW_AT_GNU_dwo_id) { + DWARFFormValue::skipValue(Form, InfoData, &Offset, Version, AddrSize); + } + // FIXME: Real error handling + assert(Name == dwarf::DW_AT_GNU_dwo_id); + return 
InfoData.getU64(&Offset); +} + static std::error_code write(MCStreamer &Out, ArrayRef<std::string> Inputs) { const auto &MCOFI = *Out.getContext().getObjectFileInfo(); MCSection *const StrSection = MCOFI.getDwarfStrDWOSection(); @@ -104,7 +151,6 @@ static std::error_code write(MCStreamer &Out, ArrayRef<std::string> Inputs) { StringMap<uint32_t> Strings; uint32_t StringOffset = 0; - uint64_t UnitIndex = 0; uint32_t ContributionOffsets[8] = {}; for (const auto &Input : Inputs) { @@ -114,10 +160,11 @@ static std::error_code write(MCStreamer &Out, ArrayRef<std::string> Inputs) { IndexEntries.emplace_back(); UnitIndexEntry &CurEntry = IndexEntries.back(); - CurEntry.Signature = UnitIndex++; StringRef CurStrSection; StringRef CurStrOffsetSection; + StringRef InfoSection; + StringRef AbbrevSection; for (const auto &Section : ErrOrObj->getBinary()->sections()) { StringRef Name; @@ -138,6 +185,14 @@ static std::error_code write(MCStreamer &Out, ArrayRef<std::string> Inputs) { CurEntry.Contributions[Index].Offset = ContributionOffsets[Index]; ContributionOffsets[Index] += (CurEntry.Contributions[Index].Length = Contents.size()); + + if (Kind == DW_SECT_INFO) { + assert(InfoSection.empty()); + InfoSection = Contents; + } else if (Kind == DW_SECT_ABBREV) { + assert(AbbrevSection.empty()); + AbbrevSection = Contents; + } } MCSection *OutSection = SectionPair->second.first; @@ -151,6 +206,10 @@ static std::error_code write(MCStreamer &Out, ArrayRef<std::string> Inputs) { } } + assert(!AbbrevSection.empty()); + assert(!InfoSection.empty()); + CurEntry.Signature = getCUSignature(AbbrevSection, InfoSection); + if (auto Err = writeStringsAndOffsets(Out, Strings, StringOffset, StrSection, StrOffsetSection, CurStrSection, CurStrOffsetSection)) From cd2103de5a0dcbe429010509d91d7648aaca6f59 Mon Sep 17 00:00:00 2001 From: Manman Ren Date: Fri, 4 Dec 2015 17:40:13 +0000 Subject: [PATCH 072/364] [CXX TLS calling convention] Add CXX TLS calling convention. This commit adds a new target-independent calling convention for C++ TLS access functions. It aims to minimize overhead in the caller by preserving as many registers as possible. The target-specific implementation for X86-64 is defined as follows: Arguments are passed as for the default C calling convention The same applies for the return value(s) The callee preserves all GPRs - except RAX and RDI The access function makes C-style TLS function calls in the entry and exit block; C-style TLS functions save a lot more registers than normal calls. The added calling convention ties into the existing implementation of the C-style TLS functions, so we can't simply use existing calling conventions such as preserve_mostcc. rdar://9001553 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254737 91177308-0d34-0410-b5e6-96231b3b80d8 --- docs/BitCodeFormat.rst | 1 + docs/LangRef.rst | 10 +++++ include/llvm/IR/CallingConv.h | 3 ++ lib/AsmParser/LLLexer.cpp | 1 + lib/AsmParser/LLParser.cpp | 2 + lib/AsmParser/LLToken.h | 1 + lib/IR/AsmWriter.cpp | 1 + lib/Target/X86/X86RegisterInfo.cpp | 8 ++++ test/CodeGen/X86/cxx_tlscc64.ll | 71 ++++++++++++++++++++++++++++++ 9 files changed, 98 insertions(+) create mode 100644 test/CodeGen/X86/cxx_tlscc64.ll diff --git a/docs/BitCodeFormat.rst b/docs/BitCodeFormat.rst index 62d66f85d557..d6e3099bdb63 100644 --- a/docs/BitCodeFormat.rst +++ b/docs/BitCodeFormat.rst @@ -756,6 +756,7 @@ function.
The operand fields are: * ``anyregcc``: code 13 * ``preserve_mostcc``: code 14 * ``preserve_allcc``: code 15 + * ``cxx_fast_tlscc``: code 17 * ``x86_stdcallcc``: code 64 * ``x86_fastcallcc``: code 65 * ``arm_apcscc``: code 66 diff --git a/docs/LangRef.rst b/docs/LangRef.rst index cf1ceab1f1c6..82b33557c128 100644 --- a/docs/LangRef.rst +++ b/docs/LangRef.rst @@ -406,6 +406,16 @@ added in the future: This calling convention, like the `PreserveMost` calling convention, will be used by a future version of the ObjectiveC runtime and should be considered experimental at this time. +"``cxx_fast_tlscc``" - The `CXX_FAST_TLS` calling convention for access functions + This calling convention aims to minimize overhead in the caller by + preserving as many registers as possible. This calling convention behaves + identical to the `C` calling convention on how arguments and return values + are passed, but it uses a different set of caller/callee-saved registers. + Given that C-style TLS on Darwin has its own special CSRs, we can't use the + existing `PreserveMost`. + + - On X86-64 the callee preserves all general purpose registers, except for + RDI and RAX. "``cc ``" - Numbered convention Any calling convention may be specified by number, allowing target-specific calling conventions to be used. Target specific diff --git a/include/llvm/IR/CallingConv.h b/include/llvm/IR/CallingConv.h index ac7cc9b74ab9..8204d3e2e812 100644 --- a/include/llvm/IR/CallingConv.h +++ b/include/llvm/IR/CallingConv.h @@ -72,6 +72,9 @@ namespace CallingConv { // Swift - Calling convention for Swift. Swift = 16, + // CXX_FAST_TLS - Calling convention for access functions. + CXX_FAST_TLS = 17, + // Target - This is the start of the target-specific calling conventions, // e.g. fastcall and thiscall on X86. 
FirstTargetCC = 64, diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp index 372c56588864..f95a763e3dae 100644 --- a/lib/AsmParser/LLLexer.cpp +++ b/lib/AsmParser/LLLexer.cpp @@ -591,6 +591,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(ghccc); KEYWORD(hhvmcc); KEYWORD(hhvm_ccc); + KEYWORD(cxx_fast_tlscc); KEYWORD(cc); KEYWORD(c); diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp index 307ed397834c..145b5eaaceca 100644 --- a/lib/AsmParser/LLParser.cpp +++ b/lib/AsmParser/LLParser.cpp @@ -1544,6 +1544,7 @@ bool LLParser::ParseOptionalDLLStorageClass(unsigned &Res) { /// ::= 'ghccc' /// ::= 'hhvmcc' /// ::= 'hhvm_ccc' +/// ::= 'cxx_fast_tlscc' /// ::= 'cc' UINT /// bool LLParser::ParseOptionalCallingConv(unsigned &CC) { @@ -1574,6 +1575,7 @@ bool LLParser::ParseOptionalCallingConv(unsigned &CC) { case lltok::kw_ghccc: CC = CallingConv::GHC; break; case lltok::kw_hhvmcc: CC = CallingConv::HHVM; break; case lltok::kw_hhvm_ccc: CC = CallingConv::HHVM_C; break; + case lltok::kw_cxx_fast_tlscc: CC = CallingConv::CXX_FAST_TLS; break; case lltok::kw_cc: { Lex.Lex(); return ParseUInt32(CC); diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h index 6a9a1de0b850..48abeac95066 100644 --- a/lib/AsmParser/LLToken.h +++ b/lib/AsmParser/LLToken.h @@ -99,6 +99,7 @@ namespace lltok { kw_preserve_mostcc, kw_preserve_allcc, kw_ghccc, kw_hhvmcc, kw_hhvm_ccc, + kw_cxx_fast_tlscc, // Attributes: kw_attributes, diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp index fae1ebee5f2a..f8040a7b5f86 100644 --- a/lib/IR/AsmWriter.cpp +++ b/lib/IR/AsmWriter.cpp @@ -304,6 +304,7 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) { case CallingConv::AnyReg: Out << "anyregcc"; break; case CallingConv::PreserveMost: Out << "preserve_mostcc"; break; case CallingConv::PreserveAll: Out << "preserve_allcc"; break; + case CallingConv::CXX_FAST_TLS: Out << "cxx_fast_tlscc"; break; case CallingConv::GHC: Out << "ghccc"; break; case CallingConv::X86_StdCall: Out << "x86_stdcallcc"; break; case CallingConv::X86_FastCall: Out << "x86_fastcallcc"; break; diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 39de5004143e..888437634789 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -248,6 +248,10 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (HasAVX) return CSR_64_RT_AllRegs_AVX_SaveList; return CSR_64_RT_AllRegs_SaveList; + case CallingConv::CXX_FAST_TLS: + if (Is64Bit) + return CSR_64_TLS_Darwin_SaveList; + break; case CallingConv::Intel_OCL_BI: { if (HasAVX512 && IsWin64) return CSR_Win64_Intel_OCL_BI_AVX512_SaveList; @@ -310,6 +314,10 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, if (HasAVX) return CSR_64_RT_AllRegs_AVX_RegMask; return CSR_64_RT_AllRegs_RegMask; + case CallingConv::CXX_FAST_TLS: + if (Is64Bit) + return CSR_64_TLS_Darwin_RegMask; + break; case CallingConv::Intel_OCL_BI: { if (HasAVX512 && IsWin64) return CSR_Win64_Intel_OCL_BI_AVX512_RegMask; diff --git a/test/CodeGen/X86/cxx_tlscc64.ll b/test/CodeGen/X86/cxx_tlscc64.ll new file mode 100644 index 000000000000..c229521cc9a4 --- /dev/null +++ b/test/CodeGen/X86/cxx_tlscc64.ll @@ -0,0 +1,71 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -enable-shrink-wrap=true | FileCheck --check-prefix=SHRINK %s +%struct.S = type { i8 } + +@sg = internal thread_local global %struct.S zeroinitializer, align 1 +@__dso_handle = 
external global i8 +@__tls_guard = internal thread_local unnamed_addr global i1 false + +declare void @_ZN1SC1Ev(%struct.S*) +declare void @_ZN1SD1Ev(%struct.S*) +declare i32 @_tlv_atexit(void (i8*)*, i8*, i8*) + +; Every GPR should be saved - except rdi, rax, and rsp +; CHECK-LABEL: _ZTW2sg +; CHECK: pushq %r11 +; CHECK: pushq %r10 +; CHECK: pushq %r9 +; CHECK: pushq %r8 +; CHECK: pushq %rsi +; CHECK: pushq %rdx +; CHECK: pushq %rcx +; CHECK: pushq %rbx +; CHECK: callq +; CHECK: jne +; CHECK: callq +; CHECK: tlv_atexit +; CHECK: callq +; CHECK: popq %rbx +; CHECK: popq %rcx +; CHECK: popq %rdx +; CHECK: popq %rsi +; CHECK: popq %r8 +; CHECK: popq %r9 +; CHECK: popq %r10 +; CHECK: popq %r11 +; SHRINK-LABEL: _ZTW2sg +; SHRINK: callq +; SHRINK: jne +; SHRINK: pushq %r11 +; SHRINK: pushq %r10 +; SHRINK: pushq %r9 +; SHRINK: pushq %r8 +; SHRINK: pushq %rsi +; SHRINK: pushq %rdx +; SHRINK: pushq %rcx +; SHRINK: pushq %rbx +; SHRINK: callq +; SHRINK: tlv_atexit +; SHRINK: popq %rbx +; SHRINK: popq %rcx +; SHRINK: popq %rdx +; SHRINK: popq %rsi +; SHRINK: popq %r8 +; SHRINK: popq %r9 +; SHRINK: popq %r10 +; SHRINK: popq %r11 +; SHRINK: LBB{{.*}}: +; SHRINK: callq +define cxx_fast_tlscc nonnull %struct.S* @_ZTW2sg() { + %.b.i = load i1, i1* @__tls_guard, align 1 + br i1 %.b.i, label %__tls_init.exit, label %init.i + +init.i: + store i1 true, i1* @__tls_guard, align 1 + tail call void @_ZN1SC1Ev(%struct.S* nonnull @sg) #2 + %1 = tail call i32 @_tlv_atexit(void (i8*)* nonnull bitcast (void (%struct.S*)* @_ZN1SD1Ev to void (i8*)*), i8* nonnull getelementptr inbounds (%struct.S, %struct.S* @sg, i64 0, i32 0), i8* nonnull @__dso_handle) #2 + br label %__tls_init.exit + +__tls_init.exit: + ret %struct.S* @sg +} From 015255c0401cdd5f699fc2c0edd6f0239253a6f1 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Fri, 4 Dec 2015 17:51:55 +0000 Subject: [PATCH 073/364] fix formatting; NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254739 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86SelectionDAGInfo.cpp | 43 +++++++++++--------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index ce79fcf9ad81..b1a01614b4a1 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -44,13 +44,10 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible( return false; } -SDValue -X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, - SDValue Chain, - SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, - bool isVolatile, - MachinePointerInfo DstPtrInfo) const { +SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset( + SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, bool isVolatile, + MachinePointerInfo DstPtrInfo) const { ConstantSDNode *ConstantSize = dyn_cast(Size); const X86Subtarget &Subtarget = DAG.getMachineFunction().getSubtarget(); @@ -74,10 +71,10 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, // Check to see if there is a specialized entry-point for memory zeroing. ConstantSDNode *V = dyn_cast(Src); - if (const char *bzeroEntry = V && + if (const char *bzeroEntry = V && V->isNullValue() ? 
Subtarget.getBZeroEntry() : nullptr) { - EVT IntPtr = - DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout()); Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; @@ -94,7 +91,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, 0) .setDiscardResult(); - std::pair CallResult = DAG.getTargetLoweringInfo().LowerCallTo(CLI); + std::pair CallResult = TLI.LowerCallTo(CLI); return CallResult.second; } @@ -144,8 +141,8 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, BytesLeft = SizeVal % UBytes; } - Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT), - InFlag); + Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT), + InFlag); InFlag = Chain.getValue(1); } else { AVT = MVT::i8; @@ -172,9 +169,8 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count, DAG.getConstant((AVT == MVT::i64) ? 7 : 3, dl, CVT)); - Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX : - X86::ECX, - Left, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX : X86::ECX, + Left, InFlag); InFlag = Chain.getValue(1); Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag }; @@ -249,17 +245,14 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( unsigned BytesLeft = SizeVal % UBytes; SDValue InFlag; - Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : - X86::ECX, - Count, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX, + Count, InFlag); InFlag = Chain.getValue(1); - Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : - X86::EDI, - Dst, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI, + Dst, InFlag); InFlag = Chain.getValue(1); - Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RSI : - X86::ESI, - Src, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RSI : X86::ESI, + Src, InFlag); InFlag = Chain.getValue(1); SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); From d3e1404b7b4160d3b5b5979b6fc6ee99ac4daf75 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Fri, 4 Dec 2015 17:54:31 +0000 Subject: [PATCH 074/364] don't repeat function names in comments; NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254740 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Target/TargetSelectionDAGInfo.h | 45 +++++++++----------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/include/llvm/Target/TargetSelectionDAGInfo.h b/include/llvm/Target/TargetSelectionDAGInfo.h index 53db5aa84292..a7143ac3fa66 100644 --- a/include/llvm/Target/TargetSelectionDAGInfo.h +++ b/include/llvm/Target/TargetSelectionDAGInfo.h @@ -21,7 +21,7 @@ namespace llvm { //===----------------------------------------------------------------------===// -/// TargetSelectionDAGInfo - Targets can subclass this to parameterize the +/// Targets can subclass this to parameterize the /// SelectionDAG lowering and instruction selection process. 
/// class TargetSelectionDAGInfo { @@ -32,8 +32,8 @@ class TargetSelectionDAGInfo { explicit TargetSelectionDAGInfo() = default; virtual ~TargetSelectionDAGInfo(); - /// EmitTargetCodeForMemcpy - Emit target-specific code that performs a - /// memcpy. This can be used by targets to provide code sequences for cases + /// Emit target-specific code that performs a memcpy. + /// This can be used by targets to provide code sequences for cases /// that don't fit the target's parameters for simple loads/stores and can be /// more efficient than using a library call. This function can return a null /// SDValue if the target declines to use custom code and a different @@ -56,8 +56,8 @@ class TargetSelectionDAGInfo { return SDValue(); } - /// EmitTargetCodeForMemmove - Emit target-specific code that performs a - /// memmove. This can be used by targets to provide code sequences for cases + /// Emit target-specific code that performs a memmove. + /// This can be used by targets to provide code sequences for cases /// that don't fit the target's parameters for simple loads/stores and can be /// more efficient than using a library call. This function can return a null /// SDValue if the target declines to use custom code and a different @@ -72,8 +72,8 @@ class TargetSelectionDAGInfo { return SDValue(); } - /// EmitTargetCodeForMemset - Emit target-specific code that performs a - /// memset. This can be used by targets to provide code sequences for cases + /// Emit target-specific code that performs a memset. + /// This can be used by targets to provide code sequences for cases /// that don't fit the target's parameters for simple stores and can be more /// efficient than using a library call. This function can return a null /// SDValue if the target declines to use custom code and a different @@ -87,11 +87,10 @@ class TargetSelectionDAGInfo { return SDValue(); } - /// EmitTargetCodeForMemcmp - Emit target-specific code that performs a - /// memcmp, in cases where that is faster than a libcall. The first - /// returned SDValue is the result of the memcmp and the second is - /// the chain. Both SDValues can be null if a normal libcall should - /// be used. + /// Emit target-specific code that performs a memcmp, in cases where that is + /// faster than a libcall. The first returned SDValue is the result of the + /// memcmp and the second is the chain. Both SDValues can be null if a normal + /// libcall should be used. virtual std::pair<SDValue, SDValue> EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc dl, SDValue Chain, @@ -101,11 +100,10 @@ class TargetSelectionDAGInfo { return std::make_pair(SDValue(), SDValue()); } - /// EmitTargetCodeForMemchr - Emit target-specific code that performs a - /// memchr, in cases where that is faster than a libcall. The first - /// returned SDValue is the result of the memchr and the second is - /// the chain. Both SDValues can be null if a normal libcall should - /// be used. + /// Emit target-specific code that performs a memchr, in cases where that is + /// faster than a libcall. The first returned SDValue is the result of the + /// memchr and the second is the chain. Both SDValues can be null if a normal + /// libcall should be used.
virtual std::pair<SDValue, SDValue> EmitTargetCodeForMemchr(SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Src, SDValue Char, SDValue Length, @@ -113,8 +111,8 @@ class TargetSelectionDAGInfo { return std::make_pair(SDValue(), SDValue()); } - /// EmitTargetCodeForStrcpy - Emit target-specific code that performs a - /// strcpy or stpcpy, in cases where that is faster than a libcall. + /// Emit target-specific code that performs a strcpy or stpcpy, in cases + /// where that is faster than a libcall. /// The first returned SDValue is the result of the copy (the start /// of the destination string for strcpy, a pointer to the null terminator /// for stpcpy) and the second is the chain. Both SDValues can be null @@ -128,11 +126,10 @@ class TargetSelectionDAGInfo { return std::make_pair(SDValue(), SDValue()); } - /// EmitTargetCodeForStrcmp - Emit target-specific code that performs a - /// strcmp, in cases where that is faster than a libcall. The first - /// returned SDValue is the result of the strcmp and the second is - /// the chain. Both SDValues can be null if a normal libcall should - /// be used. + /// Emit target-specific code that performs a strcmp, in cases where that is + /// faster than a libcall. + /// The first returned SDValue is the result of the strcmp and the second is + /// the chain. Both SDValues can be null if a normal libcall should be used. virtual std::pair<SDValue, SDValue> EmitTargetCodeForStrcmp(SelectionDAG &DAG, SDLoc dl, SDValue Chain, From 90c200464e00b6004e9f91b18a33cfd51202f3d8 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Fri, 4 Dec 2015 18:27:03 +0000 Subject: [PATCH 075/364] [WebAssembly] Add several more calling conventions to the supported list. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254741 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/WebAssembly/WebAssemblyISelLowering.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index b651855eea7a..a7eba5611134 100644 --- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -260,9 +260,14 @@ static void fail(SDLoc DL, SelectionDAG &DAG, const char *msg) { // Test whether the given calling convention is supported. static bool CallingConvSupported(CallingConv::ID CallConv) { // We currently support the language-independent target-independent - // conventions. + // conventions. We don't yet have a way to annotate calls with properties like + // "cold", and we don't have any call-clobbered registers, so these are mostly + // all handled the same. return CallConv == CallingConv::C || CallConv == CallingConv::Fast || - CallConv == CallingConv::Cold; + CallConv == CallingConv::Cold || + CallConv == CallingConv::PreserveMost || + CallConv == CallingConv::PreserveAll || + CallConv == CallingConv::CXX_FAST_TLS; } SDValue From 4d585624ebb50ff6dae91402180356c43aa9ed25 Mon Sep 17 00:00:00 2001 From: Mike Aizatsky Date: Fri, 4 Dec 2015 18:35:37 +0000 Subject: [PATCH 076/364] sancov -not-covered-functions. Summary: The command prints out a list of functions that were not entered. To do this, addresses are first converted to function locations. Set operations are used for function locations.
Differential Revision: http://reviews.llvm.org/D14889 review git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254742 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/tools/sancov/covered_functions.test | 11 +- test/tools/sancov/not_covered_functions.test | 7 + tools/sancov/CMakeLists.txt | 4 + tools/sancov/sancov.cc | 389 ++++++++++++++----- 4 files changed, 317 insertions(+), 94 deletions(-) create mode 100644 test/tools/sancov/not_covered_functions.test diff --git a/test/tools/sancov/covered_functions.test b/test/tools/sancov/covered_functions.test index 02dd30210238..5e0696bf8615 100644 --- a/test/tools/sancov/covered_functions.test +++ b/test/tools/sancov/covered_functions.test @@ -1,14 +1,13 @@ REQUIRES: x86_64-linux -RUN: sancov -obj %p/Inputs/test-linux_x86_64 -covered_functions %p/Inputs/test-linux_x86_64.sancov | FileCheck %s -RUN: sancov -obj %p/Inputs/test-linux_x86_64 -covered_functions %p/Inputs/test-linux_x86_64-1.sancov | FileCheck --check-prefix=MULTIPLE_FILES %s -RUN: sancov -obj %p/Inputs/test-linux_x86_64 -demangle=0 -covered_functions %p/Inputs/test-linux_x86_64.sancov | FileCheck --check-prefix=NO_DEMANGLE %s +RUN: sancov -obj %p/Inputs/test-linux_x86_64 -covered-functions %p/Inputs/test-linux_x86_64.sancov | FileCheck %s +RUN: sancov -obj %p/Inputs/test-linux_x86_64 -covered-functions -strip_path_prefix=Inputs/ %p/Inputs/test-linux_x86_64.sancov | FileCheck --check-prefix=STRIP_PATH %s +RUN: sancov -obj %p/Inputs/test-linux_x86_64 -demangle=0 -covered-functions %p/Inputs/test-linux_x86_64.sancov | FileCheck --check-prefix=NO_DEMANGLE %s CHECK: Inputs{{[/\\]}}test.cpp:12 bar(std::string) CHECK: Inputs{{[/\\]}}test.cpp:14 main -MULTIPLE_FILES: {{^}}foo.cpp:5 foo() -MULTIPLE_FILES: {{^}}test.cpp:12 bar(std::string) -MULTIPLE_FILES: {{^}}test.cpp:14 main +STRIP_PATH: {{^}}test.cpp:12 bar(std::string) +STRIP_PATH: {{^}}test.cpp:14 main NO_DEMANGLE: test.cpp:12 _Z3barSs NO_DEMANGLE: test.cpp:14 main diff --git a/test/tools/sancov/not_covered_functions.test b/test/tools/sancov/not_covered_functions.test new file mode 100644 index 000000000000..b82f9e22d5d8 --- /dev/null +++ b/test/tools/sancov/not_covered_functions.test @@ -0,0 +1,7 @@ +REQUIRES: x86_64-linux +RUN: sancov -obj %p/Inputs/test-linux_x86_64 -not-covered-functions %p/Inputs/test-linux_x86_64.sancov | FileCheck %s +RUN: sancov -obj %p/Inputs/test-linux_x86_64 -not-covered-functions %p/Inputs/test-linux_x86_64-1.sancov | FileCheck --check-prefix=CHECK1 --allow-empty %s + +CHECK: Inputs{{[/\\]}}foo.cpp:5 foo() +CHECK1-NOT: {{.}}* + diff --git a/tools/sancov/CMakeLists.txt b/tools/sancov/CMakeLists.txt index f891f779e8c9..5ce589b37373 100644 --- a/tools/sancov/CMakeLists.txt +++ b/tools/sancov/CMakeLists.txt @@ -1,4 +1,8 @@ set(LLVM_LINK_COMPONENTS + AllTargetsAsmPrinters + AllTargetsDescs + AllTargetsDisassemblers + AllTargetsInfos DebugInfoDWARF DebugInfoPDB Object diff --git a/tools/sancov/sancov.cc b/tools/sancov/sancov.cc index 9b54575028ad..450c21bd1690 100644 --- a/tools/sancov/sancov.cc +++ b/tools/sancov/sancov.cc @@ -12,6 +12,18 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/STLExtras.h" #include "llvm/DebugInfo/Symbolize/Symbolize.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCInstrAnalysis.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/MC/MCRegisterInfo.h" 
+#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Object/Binary.h" +#include "llvm/Object/ObjectFile.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Errc.h" #include "llvm/Support/ErrorOr.h" @@ -22,11 +34,14 @@ #include "llvm/Support/Path.h" #include "llvm/Support/PrettyStackTrace.h" #include "llvm/Support/Signals.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/TargetSelect.h" #include "llvm/Support/ToolOutputFile.h" #include "llvm/Support/raw_ostream.h" #include #include +#include #include using namespace llvm; @@ -35,13 +50,19 @@ namespace { // --------- COMMAND LINE FLAGS --------- -enum ActionType { PrintAction, CoveredFunctionsAction }; +enum ActionType { + PrintAction, + CoveredFunctionsAction, + NotCoveredFunctionsAction +}; cl::opt Action( cl::desc("Action (required)"), cl::Required, cl::values(clEnumValN(PrintAction, "print", "Print coverage addresses"), - clEnumValN(CoveredFunctionsAction, "covered_functions", + clEnumValN(CoveredFunctionsAction, "covered-functions", "Print all covered funcions."), + clEnumValN(NotCoveredFunctionsAction, "not-covered-functions", + "Print all not covered funcions."), clEnumValEnd)); static cl::list ClInputFiles(cl::Positional, cl::OneOrMore, @@ -55,6 +76,10 @@ static cl::opt ClDemangle("demangle", cl::init(true), cl::desc("Print demangled function name.")); +static cl::opt ClStripPathPrefix( + "strip_path_prefix", cl::init(""), + cl::desc("Strip this prefix from file paths in reports.")); + // --------- FORMAT SPECIFICATION --------- struct FileHeader { @@ -68,31 +93,256 @@ static const uint32_t Bitness64 = 0xFFFFFF64; // --------- +static void FailIfError(std::error_code Error) { + if (!Error) + return; + errs() << "Error: " << Error.message() << "(" << Error.value() << ")\n"; + exit(1); +} + template static void FailIfError(const ErrorOr &E) { - if (E) + FailIfError(E.getError()); +} + +static void FailIfNotEmpty(const std::string &E) { + if (E.empty()) return; + errs() << "Error: " << E << "\n"; + exit(1); +} - auto Error = E.getError(); - errs() << "Error: " << Error.message() << "(" << Error.value() << ")\n"; - exit(-2); +template +static void FailIfEmpty(const std::unique_ptr &Ptr, + const std::string &Message) { + if (Ptr.get()) + return; + errs() << "Error: " << Message << "\n"; + exit(1); } template static void readInts(const char *Start, const char *End, - std::vector *V) { + std::set *Ints) { const T *S = reinterpret_cast(Start); const T *E = reinterpret_cast(End); - V->reserve(E - S); - std::copy(S, E, std::back_inserter(*V)); + std::copy(S, E, std::inserter(*Ints, Ints->end())); +} + +struct FileLoc { + bool operator<(const FileLoc &RHS) const { + return std::tie(FileName, Line) < std::tie(RHS.FileName, RHS.Line); + } + + std::string FileName; + uint32_t Line; +}; + +struct FunctionLoc { + bool operator<(const FunctionLoc &RHS) const { + return std::tie(Loc, FunctionName) < std::tie(RHS.Loc, RHS.FunctionName); + } + + FileLoc Loc; + std::string FunctionName; +}; + +std::string stripPathPrefix(std::string Path) { + if (ClStripPathPrefix.empty()) + return Path; + size_t Pos = Path.find(ClStripPathPrefix); + if (Pos == std::string::npos) + return Path; + return Path.substr(Pos + ClStripPathPrefix.size()); +} + +// Compute [FileLoc -> FunctionName] map for given addresses. 
+static std::map +computeFunctionsMap(const std::set &Addrs) { + std::map Fns; + + symbolize::LLVMSymbolizer::Options SymbolizerOptions; + SymbolizerOptions.Demangle = ClDemangle; + SymbolizerOptions.UseSymbolTable = true; + symbolize::LLVMSymbolizer Symbolizer(SymbolizerOptions); + + // Fill in Fns map. + for (auto Addr : Addrs) { + auto InliningInfo = Symbolizer.symbolizeInlinedCode(ClBinaryName, Addr); + FailIfError(InliningInfo); + for (uint32_t i = 0; i < InliningInfo->getNumberOfFrames(); ++i) { + auto FrameInfo = InliningInfo->getFrame(i); + SmallString<256> FileName(FrameInfo.FileName); + sys::path::remove_dots(FileName, /* remove_dot_dot */ true); + FileLoc Loc = {FileName.str(), FrameInfo.Line}; + Fns[Loc] = FrameInfo.FunctionName; + } + } + + return Fns; +} + +// Compute functions for given addresses. It keeps only the first +// occurence of a function within a file. +std::set computeFunctionLocs(const std::set &Addrs) { + std::map Fns = computeFunctionsMap(Addrs); + + std::set result; + std::string LastFileName; + std::set ProcessedFunctions; + + for (const auto &P : Fns) { + std::string FileName = P.first.FileName; + std::string FunctionName = P.second; + + if (LastFileName != FileName) + ProcessedFunctions.clear(); + LastFileName = FileName; + + if (!ProcessedFunctions.insert(FunctionName).second) + continue; + + result.insert(FunctionLoc{P.first, P.second}); + } + + return result; +} + +// Locate __sanitizer_cov function address. +static uint64_t findSanitizerCovFunction(const object::ObjectFile &O) { + for (const object::SymbolRef &Symbol : O.symbols()) { + ErrorOr AddressOrErr = Symbol.getAddress(); + FailIfError(AddressOrErr); + + ErrorOr Name = Symbol.getName(); + FailIfError(Name); + + if (Name.get() == "__sanitizer_cov") { + return AddressOrErr.get(); + } + } + FailIfNotEmpty("__sanitizer_cov not found"); + return 0; // not reachable. +} + +// Locate addresses of all coverage points in a file. Coverage point +// is defined as the 'address of instruction following __sanitizer_cov +// call - 1'. +static void getObjectCoveragePoints(const object::ObjectFile &O, + std::set *Addrs) { + Triple TheTriple("unknown-unknown-unknown"); + TheTriple.setArch(Triple::ArchType(O.getArch())); + auto TripleName = TheTriple.getTriple(); + + std::string Error; + const Target *TheTarget = TargetRegistry::lookupTarget(TripleName, Error); + FailIfNotEmpty(Error); + + std::unique_ptr STI( + TheTarget->createMCSubtargetInfo(TripleName, "", "")); + FailIfEmpty(STI, "no subtarget info for target " + TripleName); + + std::unique_ptr MRI( + TheTarget->createMCRegInfo(TripleName)); + FailIfEmpty(MRI, "no register info for target " + TripleName); + + std::unique_ptr AsmInfo( + TheTarget->createMCAsmInfo(*MRI, TripleName)); + FailIfEmpty(AsmInfo, "no asm info for target " + TripleName); + + std::unique_ptr MOFI(new MCObjectFileInfo); + MCContext Ctx(AsmInfo.get(), MRI.get(), MOFI.get()); + std::unique_ptr DisAsm( + TheTarget->createMCDisassembler(*STI, Ctx)); + FailIfEmpty(DisAsm, "no disassembler info for target " + TripleName); + + std::unique_ptr MII(TheTarget->createMCInstrInfo()); + FailIfEmpty(MII, "no instruction info for target " + TripleName); + + std::unique_ptr MIA( + TheTarget->createMCInstrAnalysis(MII.get())); + FailIfEmpty(MIA, "no instruction analysis info for target " + TripleName); + + uint64_t SanCovAddr = findSanitizerCovFunction(O); + + for (const auto Section : O.sections()) { + if (Section.isVirtual() || !Section.isText()) // llvm-objdump does the same. 
+ continue; + uint64_t SectionAddr = Section.getAddress(); + uint64_t SectSize = Section.getSize(); + if (!SectSize) + continue; + + StringRef SectionName; + FailIfError(Section.getName(SectionName)); + + StringRef BytesStr; + FailIfError(Section.getContents(BytesStr)); + ArrayRef Bytes(reinterpret_cast(BytesStr.data()), + BytesStr.size()); + + for (uint64_t Index = 0, Size = 0; Index < Section.getSize(); + Index += Size) { + MCInst Inst; + if (!DisAsm->getInstruction(Inst, Size, Bytes.slice(Index), + SectionAddr + Index, nulls(), nulls())) { + if (Size == 0) + Size = 1; + continue; + } + uint64_t Target; + if (MIA->isCall(Inst) && + MIA->evaluateBranch(Inst, SectionAddr + Index, Size, Target)) { + if (Target == SanCovAddr) { + // Sanitizer coverage uses the address of the next instruction - 1. + Addrs->insert(Index + SectionAddr + Size - 1); + } + } + } + } } -static std::string CommonPrefix(std::string A, std::string B) { - if (A.size() > B.size()) - return std::string(B.begin(), - std::mismatch(B.begin(), B.end(), A.begin()).first); +static void getArchiveCoveragePoints(const object::Archive &A, + std::set *Addrs) { + for (auto &ErrorOrChild : A.children()) { + FailIfError(ErrorOrChild); + const object::Archive::Child &C = *ErrorOrChild; + ErrorOr> ChildOrErr = C.getAsBinary(); + FailIfError(ChildOrErr); + if (object::ObjectFile *O = + dyn_cast(&*ChildOrErr.get())) + getObjectCoveragePoints(*O, Addrs); + else + FailIfError(object::object_error::invalid_file_type); + } +} + +// Locate addresses of all coverage points in a file. Coverage point +// is defined as the 'address of instruction following __sanitizer_cov +// call - 1'. +std::set getCoveragePoints(std::string FileName) { + std::set Result; + + ErrorOr> BinaryOrErr = + object::createBinary(FileName); + FailIfError(BinaryOrErr); + + object::Binary &Binary = *BinaryOrErr.get().getBinary(); + if (object::Archive *A = dyn_cast(&Binary)) + getArchiveCoveragePoints(*A, &Result); + else if (object::ObjectFile *O = dyn_cast(&Binary)) + getObjectCoveragePoints(*O, &Result); else - return std::string(A.begin(), - std::mismatch(A.begin(), A.end(), B.begin()).first); + FailIfError(object::object_error::invalid_file_type); + + return Result; +} + +static void printFunctionLocs(const std::set &FnLocs, + raw_ostream &OS) { + for (const FunctionLoc &FnLoc : FnLocs) { + OS << stripPathPrefix(FnLoc.Loc.FileName) << ":" << FnLoc.Loc.Line << " " + << FnLoc.FunctionName << "\n"; + } } class CoverageData { @@ -116,7 +366,7 @@ class CoverageData { return make_error_code(errc::illegal_byte_sequence); } - auto Addrs = llvm::make_unique>(); + auto Addrs = llvm::make_unique>(); switch (Header->Bitness) { case Bitness64: @@ -138,15 +388,12 @@ class CoverageData { // Merge multiple coverage data together. static std::unique_ptr merge(const std::vector> &Covs) { - std::set Addrs; + auto Addrs = llvm::make_unique>(); for (const auto &Cov : Covs) - Addrs.insert(Cov->Addrs->begin(), Cov->Addrs->end()); + Addrs->insert(Cov->Addrs->begin(), Cov->Addrs->end()); - auto AddrsVector = llvm::make_unique>( - Addrs.begin(), Addrs.end()); - return std::unique_ptr( - new CoverageData(std::move(AddrsVector))); + return std::unique_ptr(new CoverageData(std::move(Addrs))); } // Read list of files and merges their coverage info. @@ -163,83 +410,39 @@ class CoverageData { } // Print coverage addresses. 
- void printAddrs(raw_ostream &out) { + void printAddrs(raw_ostream &OS) { for (auto Addr : *Addrs) { - out << "0x"; - out.write_hex(Addr); - out << "\n"; + OS << "0x"; + OS.write_hex(Addr); + OS << "\n"; } } // Print list of covered functions. // Line format: : - void printCoveredFunctions(raw_ostream &out) { - if (Addrs->empty()) - return; - symbolize::LLVMSymbolizer::Options SymbolizerOptions; - SymbolizerOptions.Demangle = ClDemangle; - symbolize::LLVMSymbolizer Symbolizer(SymbolizerOptions); - - struct FileLoc { - std::string FileName; - uint32_t Line; - bool operator<(const FileLoc &Rhs) const { - return std::tie(FileName, Line) < std::tie(Rhs.FileName, Rhs.Line); - } - }; - - // FileLoc -> FunctionName - std::map Fns; - - // Fill in Fns map. - for (auto Addr : *Addrs) { - auto InliningInfo = Symbolizer.symbolizeInlinedCode(ClBinaryName, Addr); - FailIfError(InliningInfo); - for (uint32_t i = 0; i < InliningInfo->getNumberOfFrames(); ++i) { - auto FrameInfo = InliningInfo->getFrame(i); - SmallString<256> FileName(FrameInfo.FileName); - sys::path::remove_dots(FileName, /* remove_dot_dot */ true); - FileLoc Loc = { FileName.str(), FrameInfo.Line }; - Fns[Loc] = FrameInfo.FunctionName; - } - } - - // Compute file names common prefix. - std::string FilePrefix = Fns.begin()->first.FileName; - for (const auto &P : Fns) - FilePrefix = CommonPrefix(FilePrefix, P.first.FileName); - - // Print first function occurence in a file. - { - std::string LastFileName; - std::set ProcessedFunctions; - - for (const auto &P : Fns) { - std::string FileName = P.first.FileName; - std::string FunctionName = P.second; - uint32_t Line = P.first.Line; - - if (LastFileName != FileName) - ProcessedFunctions.clear(); - LastFileName = FileName; - - if (!ProcessedFunctions.insert(FunctionName).second) - continue; - - // Don't strip prefix if we only have a single file. - if (FileName.size() > FilePrefix.size()) - FileName = FileName.substr(FilePrefix.size()); + void printCoveredFunctions(raw_ostream &OS) { + printFunctionLocs(computeFunctionLocs(*Addrs), OS); + } - out << FileName << ":" << Line << " " << FunctionName << "\n"; - } - } + // Print list of not covered functions. + // Line format: : + void printNotCoveredFunctions(raw_ostream &OS) { + std::set AllFns = + computeFunctionLocs(getCoveragePoints(ClBinaryName)); + std::set CoveredFns = computeFunctionLocs(*Addrs); + + std::set NotCoveredFns; + std::set_difference(AllFns.begin(), AllFns.end(), CoveredFns.begin(), + CoveredFns.end(), + std::inserter(NotCoveredFns, NotCoveredFns.end())); + printFunctionLocs(NotCoveredFns, OS); } - private: - explicit CoverageData(std::unique_ptr> Addrs) +private: + explicit CoverageData(std::unique_ptr> Addrs) : Addrs(std::move(Addrs)) {} - std::unique_ptr> Addrs; + std::unique_ptr> Addrs; }; } // namespace @@ -249,6 +452,10 @@ int main(int argc, char **argv) { PrettyStackTraceProgram X(argc, argv); llvm_shutdown_obj Y; // Call llvm_shutdown() on exit. 
+ llvm::InitializeAllTargetInfos(); + llvm::InitializeAllTargetMCs(); + llvm::InitializeAllDisassemblers(); + cl::ParseCommandLineOptions(argc, argv, "Sanitizer Coverage Processing Tool"); auto CovData = CoverageData::readAndMerge(ClInputFiles); @@ -263,5 +470,11 @@ int main(int argc, char **argv) { CovData.get()->printCoveredFunctions(outs()); return 0; } + case NotCoveredFunctionsAction: { + CovData.get()->printNotCoveredFunctions(outs()); + return 0; + } } + + llvm_unreachable("unsupported action"); } From 76bae99a6129f228dd70d0f94d2c68e186d84664 Mon Sep 17 00:00:00 2001 From: Mike Aizatsky Date: Fri, 4 Dec 2015 18:50:18 +0000 Subject: [PATCH 077/364] adding MC dependencies in hopes to pacify the hexagon build. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254745 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/sancov/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/sancov/CMakeLists.txt b/tools/sancov/CMakeLists.txt index 5ce589b37373..e92b1fcbb862 100644 --- a/tools/sancov/CMakeLists.txt +++ b/tools/sancov/CMakeLists.txt @@ -5,6 +5,8 @@ set(LLVM_LINK_COMPONENTS AllTargetsInfos DebugInfoDWARF DebugInfoPDB + MC + MCDisassembler Object Support Symbolize From 03d76d50e6b92734a489341ddf52371eb14e660e Mon Sep 17 00:00:00 2001 From: Mike Aizatsky Date: Fri, 4 Dec 2015 19:11:54 +0000 Subject: [PATCH 078/364] fixing Makefile git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254749 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/sancov/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/sancov/Makefile b/tools/sancov/Makefile index 1114fe052d4d..7dba1a7a594a 100644 --- a/tools/sancov/Makefile +++ b/tools/sancov/Makefile @@ -9,7 +9,8 @@ LEVEL := ../.. TOOLNAME := sancov -LINK_COMPONENTS := DebugInfoDWARF DebugInfoPDB Object Support Symbolize +LINK_COMPONENTS := all-targets DebugInfoDWARF DebugInfoPDB MC MCParser \ + MCDisassembler Object Support Symbolize # This tool has no plugins, optimize startup time. TOOL_NO_EXPORTS := 1 From 2b50cdde679409dac8b96b9cf15f8560b34d0741 Mon Sep 17 00:00:00 2001 From: Davide Italiano Date: Fri, 4 Dec 2015 19:27:58 +0000 Subject: [PATCH 079/364] [llvm-readobj/ELF] Simplify Verdef handling. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254751 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/llvm-readobj/ELFDumper.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/tools/llvm-readobj/ELFDumper.cpp b/tools/llvm-readobj/ELFDumper.cpp index ef07a85f885d..02397f382848 100644 --- a/tools/llvm-readobj/ELFDumper.cpp +++ b/tools/llvm-readobj/ELFDumper.cpp @@ -433,17 +433,11 @@ StringRef ELFDumper::getSymbolVersion(StringRef StrTab, if (entry.isVerdef()) { // The first Verdaux entry holds the name. name_offset = entry.getVerdef()->getAux()->vda_name; - } else { - name_offset = entry.getVernaux()->vna_name; - } - - // Set IsDefault - if (entry.isVerdef()) { IsDefault = !(vs->vs_index & ELF::VERSYM_HIDDEN); } else { + name_offset = entry.getVernaux()->vna_name; IsDefault = false; } - if (name_offset >= StrTab.size()) reportError("Invalid string offset"); return StringRef(StrTab.data() + name_offset); From f6712106fa164c50ba9751d21139bd902016feef Mon Sep 17 00:00:00 2001 From: Davide Italiano Date: Fri, 4 Dec 2015 19:29:49 +0000 Subject: [PATCH 080/364] [llvm-readobj] reportError() never returns. Mark with the correct attribute. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254752 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/llvm-readobj/llvm-readobj.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/llvm-readobj/llvm-readobj.cpp b/tools/llvm-readobj/llvm-readobj.cpp index 63cec03438cc..5406afff241e 100644 --- a/tools/llvm-readobj/llvm-readobj.cpp +++ b/tools/llvm-readobj/llvm-readobj.cpp @@ -231,7 +231,7 @@ namespace opts { namespace llvm { -void reportError(Twine Msg) { +LLVM_ATTRIBUTE_NORETURN void reportError(Twine Msg) { outs() << "\nError reading file: " << Msg << ".\n"; outs().flush(); exit(1); From c57a70c32743e51e0823918a0f6fdcd60c14a0ca Mon Sep 17 00:00:00 2001 From: Matthias Braun Date: Fri, 4 Dec 2015 19:54:24 +0000 Subject: [PATCH 081/364] ScheduleDAGInstrs: Move LiveIntervals field to ScheduleDAGMI Now that ScheduleDAGInstrs doesn't need it anymore we can move the field down the class hierarchy to ScheduleDAGMI. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254759 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/CodeGen/MachineScheduler.h | 11 ++++++++--- include/llvm/CodeGen/ScheduleDAGInstrs.h | 8 -------- lib/CodeGen/ScheduleDAGInstrs.cpp | 3 +-- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/include/llvm/CodeGen/MachineScheduler.h b/include/llvm/CodeGen/MachineScheduler.h index 3a510084f65a..358fd5a3732a 100644 --- a/include/llvm/CodeGen/MachineScheduler.h +++ b/include/llvm/CodeGen/MachineScheduler.h @@ -228,6 +228,7 @@ class ScheduleDAGMutation { class ScheduleDAGMI : public ScheduleDAGInstrs { protected: AliasAnalysis *AA; + LiveIntervals *LIS; std::unique_ptr<MachineSchedStrategy> SchedImpl; /// Topo - A topological ordering for SUnits which permits fast IsReachable @@ -255,9 +256,10 @@ class ScheduleDAGMI : public ScheduleDAGInstrs { public: ScheduleDAGMI(MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S, bool RemoveKillFlags) - : ScheduleDAGInstrs(*C->MF, C->MLI, C->LIS, RemoveKillFlags), - AA(C->AA), SchedImpl(std::move(S)), Topo(SUnits, &ExitSU), CurrentTop(), - CurrentBottom(), NextClusterPred(nullptr), NextClusterSucc(nullptr) { + : ScheduleDAGInstrs(*C->MF, C->MLI, RemoveKillFlags), AA(C->AA), + LIS(C->LIS), SchedImpl(std::move(S)), Topo(SUnits, &ExitSU), + CurrentTop(), CurrentBottom(), NextClusterPred(nullptr), + NextClusterSucc(nullptr) { #ifndef NDEBUG NumInstrsScheduled = 0; #endif @@ -266,6 +268,9 @@ class ScheduleDAGMI : public ScheduleDAGInstrs { // Provide a vtable anchor ~ScheduleDAGMI() override; + // Returns LiveIntervals instance for use in DAG mutators and such. + LiveIntervals *getLIS() const { return LIS; } + /// Return true if this DAG supports VReg liveness and RegPressure. virtual bool hasVRegLiveness() const { return false; } diff --git a/include/llvm/CodeGen/ScheduleDAGInstrs.h b/include/llvm/CodeGen/ScheduleDAGInstrs.h index c715e0f79205..c574df094911 100644 --- a/include/llvm/CodeGen/ScheduleDAGInstrs.h +++ b/include/llvm/CodeGen/ScheduleDAGInstrs.h @@ -26,7 +26,6 @@ namespace llvm { class MachineFrameInfo; class MachineLoopInfo; class MachineDominatorTree; - class LiveIntervals; class RegPressureTracker; class PressureDiffs; @@ -92,9 +91,6 @@ namespace llvm { const MachineLoopInfo *MLI; const MachineFrameInfo *MFI; - /// Live Intervals provides reaching defs in preRA scheduling. - LiveIntervals *LIS; - /// TargetSchedModel provides an interface to the machine model.
TargetSchedModel SchedModel; @@ -172,14 +168,10 @@ namespace llvm { public: explicit ScheduleDAGInstrs(MachineFunction &mf, const MachineLoopInfo *mli, - LiveIntervals *LIS = nullptr, bool RemoveKillFlags = false); ~ScheduleDAGInstrs() override {} - /// \brief Expose LiveIntervals for use in DAG mutators and such. - LiveIntervals *getLIS() const { return LIS; } - /// \brief Get the machine model for instruction scheduling. const TargetSchedModel *getSchedModel() const { return &SchedModel; } diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp index 9d588ff24f61..fb82ab7a5555 100644 --- a/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -51,9 +51,8 @@ static cl::opt UseTBAA("use-tbaa-in-sched-mi", cl::Hidden, ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf, const MachineLoopInfo *mli, - LiveIntervals *LIS, bool RemoveKillFlags) - : ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()), LIS(LIS), + : ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()), RemoveKillFlags(RemoveKillFlags), CanHandleTerminators(false), TrackLaneMasks(false), FirstDbgValue(nullptr) { DbgValues.clear(); From 697fe024f6dad16e21c9b8e41bb27463971939c0 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 4 Dec 2015 20:05:04 +0000 Subject: [PATCH 082/364] [LegacyPassManager] Reduce memory usage for AnalysisUsage The LegacyPassManager was storing an instance of AnalysisUsage for each instance of each pass. In practice, most instances of a single pass class share the same dependencies. We can't rely on this because passes can (and some do) have dynamic dependencies based on instance options. We can exploit the likely commonality by uniqueing the usage information after querying the pass, but before storing it into the pass manager. This greatly reduces memory consumption by the AnalysisUsage objects. For a long pass pipeline, I measured a decrease in memory consumption for this storage of about 50%. I have not measured on the default O3 pipeline, but I suspect it will see some benefit as well since many passes are repeated (e.g. InstCombine). Differential Revision: http://reviews.llvm.org/D14677 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254760 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/LegacyPassManagers.h | 36 +++++++++++++++++++++++++++- lib/IR/LegacyPassManager.cpp | 32 ++++++++++++++++++------- 2 files changed, 59 insertions(+), 9 deletions(-) diff --git a/include/llvm/IR/LegacyPassManagers.h b/include/llvm/IR/LegacyPassManagers.h index 3a0385581509..af045585691b 100644 --- a/include/llvm/IR/LegacyPassManagers.h +++ b/include/llvm/IR/LegacyPassManagers.h @@ -16,6 +16,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Pass.h" @@ -250,7 +251,40 @@ class PMTopLevelManager { /// Map from ID to immutable passes. SmallDenseMap ImmutablePassMap; - DenseMap AnUsageMap; + + /// A wrapper around AnalysisUsage for the purpose of uniqueing. The wrapper + /// is used to avoid needing to make AnalysisUsage itself a folding set node. 
+ struct AUFoldingSetNode : public FoldingSetNode { + AnalysisUsage AU; + AUFoldingSetNode(const AnalysisUsage &AU) : AU(AU) {} + void Profile(FoldingSetNodeID &ID) const { + Profile(ID, AU); + } + static void Profile(FoldingSetNodeID &ID, const AnalysisUsage &AU) { + // TODO: We could consider sorting the dependency arrays within the + // AnalysisUsage (since they are conceptually unordered). + ID.AddBoolean(AU.getPreservesAll()); + for (auto &Vec : {AU.getRequiredSet(), AU.getRequiredTransitiveSet(), + AU.getPreservedSet(), AU.getUsedSet()}) { + ID.AddInteger(Vec.size()); + for(AnalysisID AID : Vec) + ID.AddPointer(AID); + } + } + }; + + // Contains all of the unique combinations of AnalysisUsage. This is helpful + // when we have multiple instances of the same pass since they'll usually + // have the same analysis usage and can share storage. + FoldingSet UniqueAnalysisUsages; + + // Allocator used for allocating UAFoldingSetNodes. This handles deletion of + // all allocated nodes in one fell swoop. + BumpPtrAllocator AUFoldingSetNodeAllocator; + + // Maps from a pass to it's associated entry in UniqueAnalysisUsages. Does + // not own the storage associated with either key or value.. + DenseMap AnUsageMap; /// Collection of PassInfo objects found via analysis IDs and in this top /// level manager. This is used to memoize queries to the pass registry. diff --git a/lib/IR/LegacyPassManager.cpp b/lib/IR/LegacyPassManager.cpp index 69f402029c81..08e8906e88db 100644 --- a/lib/IR/LegacyPassManager.cpp +++ b/lib/IR/LegacyPassManager.cpp @@ -569,13 +569,33 @@ void PMTopLevelManager::collectLastUses(SmallVectorImpl &LastUses, AnalysisUsage *PMTopLevelManager::findAnalysisUsage(Pass *P) { AnalysisUsage *AnUsage = nullptr; - DenseMap::iterator DMI = AnUsageMap.find(P); + auto DMI = AnUsageMap.find(P); if (DMI != AnUsageMap.end()) AnUsage = DMI->second; else { - AnUsage = new AnalysisUsage(); - P->getAnalysisUsage(*AnUsage); - AnUsageMap[P] = AnUsage; + // Look up the analysis usage from the pass instance (different instances + // of the same pass can produce different results), but unique the + // resulting object to reduce memory usage. This helps to greatly reduce + // memory usage when we have many instances of only a few pass types + // (e.g. instcombine, simplifycfg, etc...) which tend to share a fixed set + // of dependencies. 
+ AnalysisUsage AU; + P->getAnalysisUsage(AU); + + AUFoldingSetNode* Node = nullptr; + FoldingSetNodeID ID; + AUFoldingSetNode::Profile(ID, AU); + void *IP = nullptr; + if (auto *N = UniqueAnalysisUsages.FindNodeOrInsertPos(ID, IP)) + Node = N; + else { + Node = new (AUFoldingSetNodeAllocator) AUFoldingSetNode(AU); + UniqueAnalysisUsages.InsertNode(Node, IP); + } + assert(Node && "cached analysis usage must be non null"); + + AnUsageMap[P] = &Node->AU; + AnUsage = &Node->AU;; } return AnUsage; } @@ -798,10 +818,6 @@ PMTopLevelManager::~PMTopLevelManager() { for (SmallVectorImpl::iterator I = ImmutablePasses.begin(), E = ImmutablePasses.end(); I != E; ++I) delete *I; - - for (DenseMap::iterator DMI = AnUsageMap.begin(), - DME = AnUsageMap.end(); DMI != DME; ++DMI) - delete DMI->second; } //===----------------------------------------------------------------------===// From fb25c75967e3b30f6687737616613a56c9d442ca Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Fri, 4 Dec 2015 20:34:37 +0000 Subject: [PATCH 083/364] [OperandBundles] Allow operand-specific attributes in operand bundles Currently `OperandBundleUse::operandsHaveAttr` computes its result without being given a specific operand. This is problematic because it forces us to say that, e.g., even non-pointer operands in `"deopt"` operand bundles are `readonly`, which doesn't make sense. This commit changes `operandsHaveAttr` to work in the context of a specific operand, so that we can give the operand attributes that make sense for the operands's `llvm::Type`. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254764 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/InstrTypes.h | 38 ++++++++++++++++++++++++------------ lib/IR/Instructions.cpp | 4 ++-- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/include/llvm/IR/InstrTypes.h b/include/llvm/IR/InstrTypes.h index 81de6999cdb1..58bc7c1ee10a 100644 --- a/include/llvm/IR/InstrTypes.h +++ b/include/llvm/IR/InstrTypes.h @@ -1121,14 +1121,12 @@ struct OperandBundleUse { explicit OperandBundleUse(StringMapEntry *Tag, ArrayRef Inputs) : Inputs(Inputs), Tag(Tag) {} - /// \brief Return true if all the operands in this operand bundle have the - /// attribute A. - /// - /// Currently there is no way to have attributes on operand bundles differ on - /// a per operand granularity. - bool operandsHaveAttr(Attribute::AttrKind A) const { + /// \brief Return true if the operand at index \p Idx in this operand bundle + /// has the attribute A. + bool operandHasAttr(unsigned Idx, Attribute::AttrKind A) const { if (isDeoptOperandBundle()) - return A == Attribute::ReadOnly || A == Attribute::NoCapture; + if (A == Attribute::ReadOnly || A == Attribute::NoCapture) + return Inputs[Idx]->getType()->isPointerTy(); // Conservative answer: no operands have any attributes. return false; @@ -1351,11 +1349,7 @@ template class OperandBundleUser { /// It is an error to call this with an OpIdx that does not correspond to an /// bundle operand. 
OperandBundleUse getOperandBundleForOperand(unsigned OpIdx) const { - for (auto &BOI : bundle_op_infos()) - if (BOI.Begin <= OpIdx && OpIdx < BOI.End) - return operandBundleFromBundleOpInfo(BOI); - - llvm_unreachable("Did not find operand bundle for operand!"); + return operandBundleFromBundleOpInfo(getBundleOpInfoForOperand(OpIdx)); } /// \brief Return true if this operand bundle user has operand bundles that @@ -1382,6 +1376,14 @@ template class OperandBundleUser { return false; } + /// \brief Return true if the bundle operand at index \p OpIdx has the + /// attribute \p A. + bool bundleOperandHasAttr(unsigned OpIdx, Attribute::AttrKind A) const { + auto &BOI = getBundleOpInfoForOperand(OpIdx); + auto OBU = operandBundleFromBundleOpInfo(BOI); + return OBU.operandHasAttr(OpIdx - BOI.Begin, A); + } + protected: /// \brief Is the function attribute S disallowed by some operand bundle on /// this operand bundle user? @@ -1518,6 +1520,18 @@ template class OperandBundleUser { return It; } + /// \brief Return the BundleOpInfo for the operand at index OpIdx. + /// + /// It is an error to call this with an OpIdx that does not correspond to an + /// bundle operand. + const BundleOpInfo &getBundleOpInfoForOperand(unsigned OpIdx) const { + for (auto &BOI : bundle_op_infos()) + if (BOI.Begin <= OpIdx && OpIdx < BOI.End) + return BOI; + + llvm_unreachable("Did not find operand bundle for operand!"); + } + /// \brief Return the total number of values used in \p Bundles. static unsigned CountBundleInputs(ArrayRef Bundles) { unsigned Total = 0; diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp index b8c72dd7e39d..bba0ef2d7d34 100644 --- a/lib/IR/Instructions.cpp +++ b/lib/IR/Instructions.cpp @@ -369,7 +369,7 @@ bool CallInst::dataOperandHasImpliedAttr(unsigned i, assert(hasOperandBundles() && i >= (getBundleOperandsStartIndex() + 1) && "Must be either a call argument or an operand bundle!"); - return getOperandBundleForOperand(i - 1).operandsHaveAttr(A); + return bundleOperandHasAttr(i - 1, A); } /// IsConstantOne - Return true only if val is constant int 1 @@ -646,7 +646,7 @@ bool InvokeInst::dataOperandHasImpliedAttr(unsigned i, assert(hasOperandBundles() && i >= (getBundleOperandsStartIndex() + 1) && "Must be either an invoke argument or an operand bundle!"); - return getOperandBundleForOperand(i - 1).operandsHaveAttr(A); + return bundleOperandHasAttr(i - 1, A); } void InvokeInst::addAttribute(unsigned i, Attribute::AttrKind attr) { From 19d1511e67cb88a242419ad529e01d66739f283d Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Fri, 4 Dec 2015 21:16:42 +0000 Subject: [PATCH 084/364] [llvm-dwp] Include the debug_line.dwo section This probably shouldn't be generated in the .dwo file for CUs, only for TUs, but it's in the sample .dwos (generated by clang) so dwp should reflect that. Arguably the DWP tool could be smart enough to know that the CUs shouldn't need a debug_line.dwo section and skip that even when it's legitimately generated for TUs, but that's a bit more off-book. 
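To make the index layout in the test below easier to read, here is a rough model of what one row of the dumped .debug_cu_index table carries; the struct and field names are invented for illustration and are not llvm-dwp's actual types, they only mirror the Signature plus per-section [begin, end) columns that the CHECK lines verify.

#include <cstdint>

// Hypothetical model of one .debug_cu_index row: a unit signature plus a
// half-open [Begin, End) contribution for each section kind in the package.
struct SectionContribution {
  uint32_t Begin = 0; // offset of this unit's slice within the section
  uint32_t End = 0;   // one past the last byte of the slice
};

struct CuIndexRow {
  uint64_t Signature = 0;         // DWO id of the unit
  SectionContribution Info;       // .debug_info.dwo
  SectionContribution Abbrev;     // .debug_abbrev.dwo
  SectionContribution Line;       // .debug_line.dwo, newly included here
  SectionContribution StrOffsets; // .debug_str_offsets.dwo
};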
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254767 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/tools/llvm-dwp/X86/simple.test | 6 +++--- tools/llvm-dwp/llvm-dwp.cpp | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/test/tools/llvm-dwp/X86/simple.test b/test/tools/llvm-dwp/X86/simple.test index 1c7b1040bd31..3c9795fbb934 100644 --- a/test/tools/llvm-dwp/X86/simple.test +++ b/test/tools/llvm-dwp/X86/simple.test @@ -45,10 +45,10 @@ CHECK: DW_TAG_formal_parameter CHECK: .debug_cu_index contents: Ensure only the relevant/contained sections are included in the table: -CHECK: Index Signature INFO ABBREV STR_OFFSETS +CHECK: Index Signature INFO ABBREV LINE STR_OFFSETS Don't bother checking the Signatures, they aren't correct yet. -CHECK: 1 [[DWOA]] [0x00000000, 0x00000029) [0x00000000, 0x00000031) [0x00000000, 0x00000010) -CHECK: 2 [[DWOB]] [0x00000029, 0x0000005e) [0x00000031, 0x00000075) [0x00000010, 0x00000024) +CHECK: 1 [[DWOA]] [0x00000000, 0x00000029) [0x00000000, 0x00000031) [0x00000000, 0x00000011) [0x00000000, 0x00000010) +CHECK: 2 [[DWOB]] [0x00000029, 0x0000005e) [0x00000031, 0x00000075) [0x00000011, 0x00000022) [0x00000010, 0x00000024) CHECK: .debug_str.dwo contents: CHECK: "clang version diff --git a/tools/llvm-dwp/llvm-dwp.cpp b/tools/llvm-dwp/llvm-dwp.cpp index b68ba437f830..f67ecbf3437f 100644 --- a/tools/llvm-dwp/llvm-dwp.cpp +++ b/tools/llvm-dwp/llvm-dwp.cpp @@ -139,6 +139,7 @@ static std::error_code write(MCStreamer &Out, ArrayRef Inputs) { {"debug_str_offsets.dwo", {StrOffsetSection, DW_SECT_STR_OFFSETS}}, {"debug_str.dwo", {StrSection, static_cast(0)}}, {"debug_loc.dwo", {MCOFI.getDwarfLocDWOSection(), DW_SECT_LOC}}, + {"debug_line.dwo", {MCOFI.getDwarfLineDWOSection(), DW_SECT_LINE}}, {"debug_abbrev.dwo", {MCOFI.getDwarfAbbrevDWOSection(), DW_SECT_ABBREV}}}; struct UnitIndexEntry { From efb247f17cbc37fd4ff105b4316d6528db81564f Mon Sep 17 00:00:00 2001 From: Reid Kleckner Date: Fri, 4 Dec 2015 21:29:53 +0000 Subject: [PATCH 085/364] Fix llvm-readobj build on Windows, match noreturn attribute on reportError in headers git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254769 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/llvm-readobj/llvm-readobj.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/llvm-readobj/llvm-readobj.h b/tools/llvm-readobj/llvm-readobj.h index 58c50f58d750..5a103920c165 100644 --- a/tools/llvm-readobj/llvm-readobj.h +++ b/tools/llvm-readobj/llvm-readobj.h @@ -11,6 +11,7 @@ #define LLVM_TOOLS_LLVM_READOBJ_LLVM_READOBJ_H #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include namespace llvm { @@ -19,7 +20,7 @@ namespace llvm { } // Various helper functions. 
- void reportError(Twine Msg); + LLVM_ATTRIBUTE_NORETURN void reportError(Twine Msg); void error(std::error_code ec); bool relocAddressLess(object::RelocationRef A, object::RelocationRef B); From a2b2c5abb0d5f55744b113e893ede123c1a41d6c Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Fri, 4 Dec 2015 21:30:23 +0000 Subject: [PATCH 086/364] [llvm-dwp] Implement the required on-disk probed hash table git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254770 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/tools/llvm-dwp/X86/simple.test | 4 ++-- tools/llvm-dwp/llvm-dwp.cpp | 21 ++++++++++++++++----- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/test/tools/llvm-dwp/X86/simple.test b/test/tools/llvm-dwp/X86/simple.test index 3c9795fbb934..2ed8e611844a 100644 --- a/test/tools/llvm-dwp/X86/simple.test +++ b/test/tools/llvm-dwp/X86/simple.test @@ -47,8 +47,8 @@ CHECK: .debug_cu_index contents: Ensure only the relevant/contained sections are included in the table: CHECK: Index Signature INFO ABBREV LINE STR_OFFSETS Don't bother checking the Signatures, they aren't correct yet. -CHECK: 1 [[DWOA]] [0x00000000, 0x00000029) [0x00000000, 0x00000031) [0x00000000, 0x00000011) [0x00000000, 0x00000010) -CHECK: 2 [[DWOB]] [0x00000029, 0x0000005e) [0x00000031, 0x00000075) [0x00000011, 0x00000022) [0x00000010, 0x00000024) +CHECK: 3 [[DWOA]] [0x00000000, 0x00000029) [0x00000000, 0x00000031) [0x00000000, 0x00000011) [0x00000000, 0x00000010) +CHECK: 4 [[DWOB]] [0x00000029, 0x0000005e) [0x00000031, 0x00000075) [0x00000011, 0x00000022) [0x00000010, 0x00000024) CHECK: .debug_str.dwo contents: CHECK: "clang version diff --git a/tools/llvm-dwp/llvm-dwp.cpp b/tools/llvm-dwp/llvm-dwp.cpp index f67ecbf3437f..b4aaea3b238d 100644 --- a/tools/llvm-dwp/llvm-dwp.cpp +++ b/tools/llvm-dwp/llvm-dwp.cpp @@ -19,6 +19,7 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Support/TargetSelect.h" #include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h" +#include "llvm/Support/MathExtras.h" #include #include #include @@ -222,20 +223,30 @@ static std::error_code write(MCStreamer &Out, ArrayRef Inputs) { if (C) ++Columns; + std::vector Buckets(NextPowerOf2(3 * IndexEntries.size() / 2)); + uint64_t Mask = Buckets.size() - 1; + for (size_t i = 0; i != IndexEntries.size(); ++i) { + auto S = IndexEntries[i].Signature; + auto H = S & Mask; + while (Buckets[H]) + H += ((S >> 32) & Mask) | 1; + Buckets[H] = i + 1; + } + Out.SwitchSection(MCOFI.getDwarfCUIndexSection()); Out.EmitIntValue(2, 4); // Version Out.EmitIntValue(Columns, 4); // Columns Out.EmitIntValue(IndexEntries.size(), 4); // Num Units // FIXME: This is not the right number of buckets for a real hash. - Out.EmitIntValue(IndexEntries.size(), 4); // Num Buckets + Out.EmitIntValue(Buckets.size(), 4); // Num Buckets // Write the signatures. - for (const auto &E : IndexEntries) - Out.EmitIntValue(E.Signature, 8); + for (const auto &I : Buckets) + Out.EmitIntValue(I ? IndexEntries[I - 1].Signature : 0, 8); // Write the indexes. 
- for (size_t i = 0; i != IndexEntries.size(); ++i) - Out.EmitIntValue(i + 1, 4); + for (const auto &I : Buckets) + Out.EmitIntValue(I, 4); // Write the column headers (which sections will appear in the table) for (size_t i = 0; i != array_lengthof(ContributionOffsets); ++i) From fca82775a2b75d8aaaf8f30226acaa3b3607f356 Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Fri, 4 Dec 2015 21:38:39 +0000 Subject: [PATCH 087/364] [llvm-dwp] Remove some out of date comments git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254772 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/tools/llvm-dwp/X86/simple.test | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/tools/llvm-dwp/X86/simple.test b/test/tools/llvm-dwp/X86/simple.test index 2ed8e611844a..5cc626334680 100644 --- a/test/tools/llvm-dwp/X86/simple.test +++ b/test/tools/llvm-dwp/X86/simple.test @@ -44,9 +44,7 @@ CHECK: DW_AT_name {{.*}} "b" CHECK: DW_TAG_formal_parameter CHECK: .debug_cu_index contents: -Ensure only the relevant/contained sections are included in the table: CHECK: Index Signature INFO ABBREV LINE STR_OFFSETS -Don't bother checking the Signatures, they aren't correct yet. CHECK: 3 [[DWOA]] [0x00000000, 0x00000029) [0x00000000, 0x00000031) [0x00000000, 0x00000011) [0x00000000, 0x00000010) CHECK: 4 [[DWOB]] [0x00000029, 0x0000005e) [0x00000031, 0x00000075) [0x00000011, 0x00000022) [0x00000010, 0x00000024) From b590f81c153265bc3ce6173161642b7fbc5b8a1c Mon Sep 17 00:00:00 2001 From: Chad Rosier Date: Fri, 4 Dec 2015 21:38:44 +0000 Subject: [PATCH 088/364] [AArch64] Expand vector SDIVREM/UDIVREM operations. http://reviews.llvm.org/D15214 Patch by Ana Pazos ! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254773 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AArch64/AArch64ISelLowering.cpp | 4 ++++ test/CodeGen/AArch64/divrem.ll | 22 ++++++++++++++++++++++ 2 files changed, 26 insertions(+) create mode 100644 test/CodeGen/AArch64/divrem.ll diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index f0fb03451b2a..9340e7f0a55c 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -237,6 +237,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SDIVREM, MVT::i32, Expand); setOperationAction(ISD::SDIVREM, MVT::i64, Expand); + for (MVT VT : MVT::vector_valuetypes()) { + setOperationAction(ISD::SDIVREM, VT, Expand); + setOperationAction(ISD::UDIVREM, VT, Expand); + } setOperationAction(ISD::SREM, MVT::i32, Expand); setOperationAction(ISD::SREM, MVT::i64, Expand); setOperationAction(ISD::UDIVREM, MVT::i32, Expand); diff --git a/test/CodeGen/AArch64/divrem.ll b/test/CodeGen/AArch64/divrem.ll new file mode 100644 index 000000000000..9f648eb63eac --- /dev/null +++ b/test/CodeGen/AArch64/divrem.ll @@ -0,0 +1,22 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu < %s -mattr=+neon | FileCheck %s + +; SDIVREM/UDIVREM DAG nodes are generated but expanded when lowering and +; should not generate select error. 
+define <2 x i32> @test_udivrem(<2 x i32> %x, < 2 x i32> %y, < 2 x i32>* %z) { +; CHECK-LABEL: test_udivrem +; CHECK-DAG: udivrem +; CHECK-NOT: LLVM ERROR: Cannot select + %div = udiv <2 x i32> %x, %y + store <2 x i32> %div, <2 x i32>* %z + %1 = urem <2 x i32> %x, %y + ret <2 x i32> %1 +} + +define <4 x i32> @test_sdivrem(<4 x i32> %x, <4 x i32>* %y) { +; CHECK-LABEL: test_sdivrem +; CHECK-DAG: sdivrem + %div = sdiv <4 x i32> %x, < i32 20, i32 20, i32 20, i32 20 > + store <4 x i32> %div, <4 x i32>* %y + %1 = srem <4 x i32> %x, < i32 20, i32 20, i32 20, i32 20 > + ret <4 x i32> %1 +} From 6f41c1352b5e860ce6fd737f1d69a13f810c0311 Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Fri, 4 Dec 2015 21:56:46 +0000 Subject: [PATCH 089/364] [llc/opt] Add an option to run all passes twice Summary: Lately, I have submitted a number of patches to fix bugs that only occurred when using the same pass manager to compile multiple modules (generally these bugs are failure to reset some persistent state). Unfortunately I don't think there is currently a way to test that from the command line. This adds a very simple flag to both llc and opt, under which the tools will simply re-run their respective pass pipelines using the same pass manager on (a clone of the same module). Additionally, we verify that both outputs are bitwise the same. Reviewers: yaron.keren Subscribers: loladiro, yaron.keren, kcc, llvm-commits Differential Revision: http://reviews.llvm.org/D14965 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254774 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/MC/ELF/empty-twice.ll | 6 +++++ test/Other/opt-twice.ll | 14 ++++++++++ tools/llc/llc.cpp | 52 +++++++++++++++++++++++++++++++++++--- tools/opt/opt.cpp | 48 +++++++++++++++++++++++++++++++---- 4 files changed, 111 insertions(+), 9 deletions(-) create mode 100644 test/MC/ELF/empty-twice.ll create mode 100644 test/Other/opt-twice.ll diff --git a/test/MC/ELF/empty-twice.ll b/test/MC/ELF/empty-twice.ll new file mode 100644 index 000000000000..c24bd629c416 --- /dev/null +++ b/test/MC/ELF/empty-twice.ll @@ -0,0 +1,6 @@ +; Check that there is no persistent state in the ELF emitter that crashes us +; when we try to reuse the pass manager +; RUN: llc -compile-twice -filetype=obj %s -o - + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32" +target triple = "i386-pc-linux-gnu" diff --git a/test/Other/opt-twice.ll b/test/Other/opt-twice.ll new file mode 100644 index 000000000000..6bff52e34e35 --- /dev/null +++ b/test/Other/opt-twice.ll @@ -0,0 +1,14 @@ +; The pass here doesn't matter (we use deadargelim), but test +; that the -run-twice options exists, generates output, and +; doesn't crash +; RUN: opt -run-twice -deadargelim -S < %s | FileCheck %s + +; CHECK: define internal void @test +define internal {} @test() { + ret {} undef +} + +define void @caller() { + call {} @test() + ret void +} diff --git a/tools/llc/llc.cpp b/tools/llc/llc.cpp index c51c012391b2..531aba1f64bf 100644 --- a/tools/llc/llc.cpp +++ b/tools/llc/llc.cpp @@ -45,6 +45,7 @@ #include "llvm/Support/ToolOutputFile.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include "llvm/Transforms/Utils/Cloning.h" #include using namespace llvm; @@ -96,6 +97,12 @@ static cl::opt AsmVerbose("asm-verbose", cl::desc("Add comments to directives."), cl::init(true)); +static cl::opt + CompileTwice("compile-twice", cl::Hidden, + cl::desc("Run everything twice, 
re-using the same pass " + "manager and verify the the result is the same."), + cl::init(false)); + static int compileModule(char **, LLVMContext &); static std::unique_ptr @@ -325,10 +332,15 @@ static int compileModule(char **argv, LLVMContext &Context) { { raw_pwrite_stream *OS = &Out->os(); - std::unique_ptr BOS; - if (FileType != TargetMachine::CGFT_AssemblyFile && - !Out->os().supportsSeeking()) { - BOS = make_unique(*OS); + + // Manually do the buffering rather than using buffer_ostream, + // so we can memcmp the contents in CompileTwice mode + SmallVector Buffer; + std::unique_ptr BOS; + if ((FileType != TargetMachine::CGFT_AssemblyFile && + !Out->os().supportsSeeking()) || + CompileTwice) { + BOS = make_unique(Buffer); OS = BOS.get(); } @@ -378,7 +390,39 @@ static int compileModule(char **argv, LLVMContext &Context) { // Before executing passes, print the final values of the LLVM options. cl::PrintOptionValues(); + // If requested, run the pass manager over the same module again, + // to catch any bugs due to persistent state in the passes. Note that + // opt has the same functionality, so it may be worth abstracting this out + // in the future. + SmallVector CompileTwiceBuffer; + if (CompileTwice) { + std::unique_ptr M2(llvm::CloneModule(M.get())); + PM.run(*M2); + CompileTwiceBuffer = Buffer; + Buffer.clear(); + } + PM.run(*M); + + // Compare the two outputs and make sure they're the same + if (CompileTwice) { + if (Buffer.size() != CompileTwiceBuffer.size() || + (memcmp(Buffer.data(), CompileTwiceBuffer.data(), Buffer.size()) != + 0)) { + errs() + << "Running the pass manager twice changed the output.\n" + "Writing the result of the second run to the specified output\n" + "To generate the one-run comparison binary, just run without\n" + "the compile-twice option\n"; + Out->os() << Buffer; + Out->keep(); + return 1; + } + } + + if (BOS) { + Out->os() << Buffer; + } } // Declare success. diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp index 5fe2f034c6e2..c1510a7fb259 100644 --- a/tools/opt/opt.cpp +++ b/tools/opt/opt.cpp @@ -28,6 +28,7 @@ #include "llvm/IR/DebugInfo.h" #include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/LegacyPassNameParser.h" #include "llvm/IR/Module.h" #include "llvm/IR/Verifier.h" @@ -36,7 +37,6 @@ #include "llvm/LinkAllIR.h" #include "llvm/LinkAllPasses.h" #include "llvm/MC/SubtargetFeature.h" -#include "llvm/IR/LegacyPassManager.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Host.h" @@ -51,6 +51,7 @@ #include "llvm/Support/ToolOutputFile.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/IPO/PassManagerBuilder.h" +#include "llvm/Transforms/Utils/Cloning.h" #include #include using namespace llvm; @@ -190,6 +191,11 @@ static cl::opt PreserveAssemblyUseListOrder( cl::desc("Preserve use-list order when writing LLVM assembly."), cl::init(false), cl::Hidden); +static cl::opt + RunTwice("run-twice", + cl::desc("Run all passes twice, re-using the same pass manager."), + cl::init(false), cl::Hidden); + static inline void addPass(legacy::PassManagerBase &PM, Pass *P) { // Add the pass to the pass manager... PM.add(P); @@ -582,14 +588,25 @@ int main(int argc, char **argv) { if (!NoVerify && !VerifyEach) Passes.add(createVerifierPass()); + // In run twice mode, we want to make sure the output is bit-by-bit + // equivalent if we run the pass manager again, so setup two buffers and + // a stream to write to them. 
Note that llc does something similar and it + // may be worth to abstract this out in the future. + SmallVector Buffer; + SmallVector CompileTwiceBuffer; + std::unique_ptr BOS; + raw_ostream *OS = &Out->os(); + if (RunTwice) { + BOS = make_unique(Buffer); + OS = BOS.get(); + } + // Write bitcode or assembly to the output as the last step... if (!NoOutput && !AnalyzeOnly) { if (OutputAssembly) - Passes.add( - createPrintModulePass(Out->os(), "", PreserveAssemblyUseListOrder)); + Passes.add(createPrintModulePass(*OS, "", PreserveAssemblyUseListOrder)); else - Passes.add( - createBitcodeWriterPass(Out->os(), PreserveBitcodeUseListOrder)); + Passes.add(createBitcodeWriterPass(*OS, PreserveBitcodeUseListOrder)); } // Before executing passes, print the final values of the LLVM options. @@ -598,6 +615,27 @@ int main(int argc, char **argv) { // Now that we have all of the passes ready, run them. Passes.run(*M); + // If requested, run all passes again with the same pass manager to catch + // bugs caused by persistent state in the passes + if (RunTwice) { + CompileTwiceBuffer = Buffer; + Buffer.clear(); + std::unique_ptr M2(CloneModule(M.get())); + Passes.run(*M2); + if (Buffer.size() != CompileTwiceBuffer.size() || + (memcmp(Buffer.data(), CompileTwiceBuffer.data(), Buffer.size()) != + 0)) { + errs() << "Running the pass manager twice changed the output.\n" + "Writing the result of the second run to the specified output." + "To generate the one-run comparison binary, just run without\n" + "the compile-twice option\n"; + Out->os() << BOS->str(); + Out->keep(); + return 1; + } + Out->os() << BOS->str(); + } + // Declare success. if (!NoOutput || PrintBreakpoints) Out->keep(); From b6cc95afa7478cecc423bac61fa54aa7fc3f6efe Mon Sep 17 00:00:00 2001 From: Pete Cooper Date: Fri, 4 Dec 2015 21:59:04 +0000 Subject: [PATCH 090/364] Fix incorrect quote. NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254775 91177308-0d34-0410-b5e6-96231b3b80d8 --- docs/GettingStarted.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/GettingStarted.rst b/docs/GettingStarted.rst index 0d3115513453..8fb4daa5e26e 100644 --- a/docs/GettingStarted.rst +++ b/docs/GettingStarted.rst @@ -853,7 +853,7 @@ with the latest Xcode: .. code-block:: console - % cmake -G "Ninja" -DCMAKE_OSX_ARCHITECTURES=“armv7;armv7s;arm64" + % cmake -G "Ninja" -DCMAKE_OSX_ARCHITECTURES="armv7;armv7s;arm64" -DCMAKE_TOOLCHAIN_FILE=/cmake/platforms/iOS.cmake -DCMAKE_BUILD_TYPE=Release -DLLVM_BUILD_RUNTIME=Off -DLLVM_INCLUDE_TESTS=Off -DLLVM_INCLUDE_EXAMPLES=Off -DLLVM_ENABLE_BACKTRACES=Off [options] From cc87069c319aed2f95394a76dccfcd6360c08b80 Mon Sep 17 00:00:00 2001 From: Weiming Zhao Date: Fri, 4 Dec 2015 22:00:47 +0000 Subject: [PATCH 091/364] [SimplifyLibCalls] Optimization for pow(x, n) where n is some constant Summary: In order to avoid calling pow function we generate repeated fmul when n is a positive or negative whole number. For each exponent we pre-compute Addition Chains in order to minimize the no. of fmuls. Refer: http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html We pre-compute addition chains for exponents upto 32 (which results in a max of 7 fmuls). For eg: 4 = 2+2 5 = 2+3 6 = 3+3 and so on Hence, pow(x, 4.0) ==> y = fmul x, x x = fmul y, y ret x For negative exponents, we simply compute the reciprocal of the final result. Note: This transformation is only enabled under fast-math. 
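To make the addition-chain idea above concrete, here is a small standalone sketch (not the routine added by this patch, and the toy table only covers exponents up to 7): each exponent is split into two smaller exponents whose powers have already been computed, so pow(x, n) turns into a short sequence of multiplies.

#include <cassert>
#include <cstdio>

// Illustrative only: expand x^Exp into repeated multiplies using a tiny
// addition-chain table, where Exp == AddChain[Exp][0] + AddChain[Exp][1].
static const unsigned AddChain[8][2] = {
    {0, 0}, {0, 0}, {1, 1}, {1, 2}, {2, 2}, {2, 3}, {3, 3}, {2, 5}};

static double powChain(double X, unsigned Exp, double Memo[8]) {
  assert(Exp >= 1 && Exp <= 7 && "toy table only covers exponents 1..7");
  if (Exp == 1)
    return X;
  if (Memo[Exp] != 0.0) // memoize intermediate powers so each is built once
    return Memo[Exp];
  Memo[Exp] = powChain(X, AddChain[Exp][0], Memo) *
              powChain(X, AddChain[Exp][1], Memo);
  return Memo[Exp];
}

int main() {
  double Memo[8] = {0};
  // x^7 = x^2 * x^5, x^5 = x^2 * x^3, x^3 = x * x^2: four multiplies total.
  std::printf("%f\n", powChain(2.0, 7, Memo)); // prints 128.000000
  return 0;
}

The real transform additionally guards on fast-math, caps the exponent at 32 (at most 7 fmuls), and handles negative exponents by taking the reciprocal of the result, as the diff below shows.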
Patch by Mandeep Singh Grang Reviewers: weimingz, majnemer, escha, davide, scanon, joerg Subscribers: probinson, escha, llvm-commits Differential Revision: http://reviews.llvm.org/D13994 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254776 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Utils/SimplifyLibCalls.cpp | 51 +++++++++ test/Transforms/InstCombine/pow-4.ll | 120 ++++++++++++++++++++++ 2 files changed, 171 insertions(+) create mode 100644 test/Transforms/InstCombine/pow-4.ll diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index 83afb1a65ac0..df75ed96893d 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -1058,6 +1058,31 @@ Value *LibCallSimplifier::optimizeCos(CallInst *CI, IRBuilder<> &B) { return Ret; } +static Value *getPow(Value *InnerChain[33], unsigned Exp, IRBuilder<> &B) { + // Multiplications calculated using Addition Chains. + // Refer: http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html + + assert(Exp != 0 && "Incorrect exponent 0 not handled"); + + if (InnerChain[Exp]) + return InnerChain[Exp]; + + static const unsigned AddChain[33][2] = { + {0, 0}, // Unused. + {0, 0}, // Unused (base case = pow1). + {1, 1}, // Unused (pre-computed). + {1, 2}, {2, 2}, {2, 3}, {3, 3}, {2, 5}, {4, 4}, + {1, 8}, {5, 5}, {1, 10}, {6, 6}, {4, 9}, {7, 7}, + {3, 12}, {8, 8}, {8, 9}, {2, 16}, {1, 18}, {10, 10}, + {6, 15}, {11, 11}, {3, 20}, {12, 12}, {8, 17}, {13, 13}, + {3, 24}, {14, 14}, {4, 25}, {15, 15}, {3, 28}, {16, 16}, + }; + + InnerChain[Exp] = B.CreateFMul(getPow(InnerChain, AddChain[Exp][0], B), + getPow(InnerChain, AddChain[Exp][1], B)); + return InnerChain[Exp]; +} + Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); Value *Ret = nullptr; @@ -1156,6 +1181,32 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) { return B.CreateFMul(Op1, Op1, "pow2"); if (Op2C->isExactlyValue(-1.0)) // pow(x, -1.0) -> 1.0/x return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), Op1, "powrecip"); + + // In -ffast-math, generate repeated fmul instead of generating pow(x, n). + if (unsafeFPMath) { + APFloat V = abs(Op2C->getValueAPF()); + // We limit to a max of 7 fmul(s). Thus max exponent is 32. + // This transformation applies to integer exponents only. + if (V.compare(APFloat(V.getSemantics(), 32.0)) == APFloat::cmpGreaterThan || + !V.isInteger()) + return nullptr; + + // We will memoize intermediate products of the Addition Chain. + Value *InnerChain[33] = {nullptr}; + InnerChain[1] = Op1; + InnerChain[2] = B.CreateFMul(Op1, Op1); + + // We cannot readily convert a non-double type (like float) to a double. + // So we first convert V to something which could be converted to double. + bool ignored; + V.convert(APFloat::IEEEdouble, APFloat::rmTowardZero, &ignored); + Value *FMul = getPow(InnerChain, V.convertToDouble(), B); + // For negative exponents simply compute the reciprocal. + if (Op2C->isNegative()) + FMul = B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), FMul); + return FMul; + } + return nullptr; } diff --git a/test/Transforms/InstCombine/pow-4.ll b/test/Transforms/InstCombine/pow-4.ll new file mode 100644 index 000000000000..76ef4c5de923 --- /dev/null +++ b/test/Transforms/InstCombine/pow-4.ll @@ -0,0 +1,120 @@ +; Test that the pow library call simplifier works correctly. 
+ +; RUN: opt -instcombine -S < %s | FileCheck %s + +; Function Attrs: nounwind readnone +declare double @llvm.pow.f64(double, double) +declare float @llvm.pow.f32(float, float) + +; pow(x, 4.0f) +define float @test_simplify_4f(float %x) #0 { +; CHECK-LABEL: @test_simplify_4f( +; CHECK-NOT: pow +; CHECK-NEXT: %1 = fmul float %x, %x +; CHECK-NEXT: %2 = fmul float %1, %1 +; CHECK-NEXT: ret float %2 + %1 = call float @llvm.pow.f32(float %x, float 4.000000e+00) + ret float %1 +} + +; pow(x, 3.0) +define double @test_simplify_3(double %x) #0 { +; CHECK-LABEL: @test_simplify_3( +; CHECK-NOT: pow +; CHECK-NEXT: %1 = fmul double %x, %x +; CHECK-NEXT: %2 = fmul double %1, %x +; CHECK-NEXT: ret double %2 + %1 = call double @llvm.pow.f64(double %x, double 3.000000e+00) + ret double %1 +} + +; pow(x, 4.0) +define double @test_simplify_4(double %x) #0 { +; CHECK-LABEL: @test_simplify_4( +; CHECK-NOT: pow +; CHECK-NEXT: %1 = fmul double %x, %x +; CHECK-NEXT: %2 = fmul double %1, %1 +; CHECK-NEXT: ret double %2 + %1 = call double @llvm.pow.f64(double %x, double 4.000000e+00) + ret double %1 +} + +; pow(x, 15.0) +define double @test_simplify_15(double %x) #0 { +; CHECK-LABEL: @test_simplify_15( +; CHECK-NOT: pow +; CHECK-NEXT: %1 = fmul double %x, %x +; CHECK-NEXT: %2 = fmul double %1, %x +; CHECK-NEXT: %3 = fmul double %2, %2 +; CHECK-NEXT: %4 = fmul double %3, %3 +; CHECK-NEXT: %5 = fmul double %2, %4 +; CHECK-NEXT: ret double %5 + %1 = call double @llvm.pow.f64(double %x, double 1.500000e+01) + ret double %1 +} + +; pow(x, -7.0) +define double @test_simplify_neg_7(double %x) #0 { +; CHECK-LABEL: @test_simplify_neg_7( +; CHECK-NOT: pow +; CHECK-NEXT: %1 = fmul double %x, %x +; CHECK-NEXT: %2 = fmul double %1, %x +; CHECK-NEXT: %3 = fmul double %1, %2 +; CHECK-NEXT: %4 = fmul double %1, %3 +; CHECK-NEXT: %5 = fdiv double 1.000000e+00, %4 +; CHECK-NEXT: ret double %5 + %1 = call double @llvm.pow.f64(double %x, double -7.000000e+00) + ret double %1 +} + +; pow(x, -19.0) +define double @test_simplify_neg_19(double %x) #0 { +; CHECK-LABEL: @test_simplify_neg_19( +; CHECK-NOT: pow +; CHECK-NEXT: %1 = fmul double %x, %x +; CHECK-NEXT: %2 = fmul double %1, %1 +; CHECK-NEXT: %3 = fmul double %2, %2 +; CHECK-NEXT: %4 = fmul double %3, %3 +; CHECK-NEXT: %5 = fmul double %1, %4 +; CHECK-NEXT: %6 = fmul double %5, %x +; CHECK-NEXT: %7 = fdiv double 1.000000e+00, %6 +; CHECK-NEXT: ret double %7 + %1 = call double @llvm.pow.f64(double %x, double -1.900000e+01) + ret double %1 +} + +; pow(x, 11.23) +define double @test_simplify_11_23(double %x) #0 { +; CHECK-LABEL: @test_simplify_11_23( +; CHECK-NOT: fmul +; CHECK-NEXT: %1 = call double @llvm.pow.f64(double %x, double 1.123000e+01) +; CHECK-NEXT: ret double %1 + %1 = call double @llvm.pow.f64(double %x, double 1.123000e+01) + ret double %1 +} + +; pow(x, 32.0) +define double @test_simplify_32(double %x) #0 { +; CHECK-LABEL: @test_simplify_32( +; CHECK-NOT: pow +; CHECK-NEXT: %1 = fmul double %x, %x +; CHECK-NEXT: %2 = fmul double %1, %1 +; CHECK-NEXT: %3 = fmul double %2, %2 +; CHECK-NEXT: %4 = fmul double %3, %3 +; CHECK-NEXT: %5 = fmul double %4, %4 +; CHECK-NEXT: ret double %5 + %1 = call double @llvm.pow.f64(double %x, double 3.200000e+01) + ret double %1 +} + +; pow(x, 33.0) +define double @test_simplify_33(double %x) #0 { +; CHECK-LABEL: @test_simplify_33( +; CHECK-NOT: fmul +; CHECK-NEXT: %1 = call double @llvm.pow.f64(double %x, double 3.300000e+01) +; CHECK-NEXT: ret double %1 + %1 = call double @llvm.pow.f64(double %x, double 3.300000e+01) + ret double %1 +} 
+ +attributes #0 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="true" "use-soft-float"="false" } From c55f4fb8055591bee9ed577794299c0dd3ff791e Mon Sep 17 00:00:00 2001 From: Rafael Espindola Date: Fri, 4 Dec 2015 22:08:53 +0000 Subject: [PATCH 092/364] Always pass a diagnostic handler to the linker. Before this patch the diagnostic handler was optional. If it was not passed, the one in the LLVMContext was used. That is probably not a pattern we want to follow. If each area has an optional callback, there is a sea of callbacks and it is hard to follow which one is called. Doing this also found cases where the callback is a nice addition, like testing that no errors or warnings are reported. The other option is to always use the diagnostic handler in the LLVMContext. That has a few problems * To implement the C API we would have to set the diag handler and then set it back to the original value. * Code that creates the context might be far away from code that wants the diagnostics. I do have a patch that implements the second option and will send that as an RFC. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254777 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Linker/Linker.h | 6 +++--- lib/LTO/LTOCodeGenerator.cpp | 10 ++++++---- lib/Linker/LinkModules.cpp | 10 ---------- tools/bugpoint/BugDriver.cpp | 9 ++++++++- tools/bugpoint/Miscompilation.cpp | 19 +++++++++++++++---- tools/gold/gold-plugin.cpp | 2 +- unittests/Linker/LinkModulesTest.cpp | 8 +++++--- 7 files changed, 38 insertions(+), 26 deletions(-) diff --git a/include/llvm/Linker/Linker.h b/include/llvm/Linker/Linker.h index f9890935126e..f0c8ad979ab6 100644 --- a/include/llvm/Linker/Linker.h +++ b/include/llvm/Linker/Linker.h @@ -69,7 +69,6 @@ class Linker { }; Linker(Module &M, DiagnosticHandlerFunction DiagnosticHandler); - Linker(Module &M); /// \brief Link \p Src into the composite. The source is destroyed. 
/// @@ -88,8 +87,9 @@ class Linker { DiagnosticHandlerFunction DiagnosticHandler, unsigned Flags = Flags::None); - static bool linkModules(Module &Dest, Module &Src, - unsigned Flags = Flags::None); + DiagnosticHandlerFunction getDiagnosticHandler() const { + return DiagnosticHandler; + } private: Module &Composite; diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp index b0dae74c13d4..25c150b27840 100644 --- a/lib/LTO/LTOCodeGenerator.cpp +++ b/lib/LTO/LTOCodeGenerator.cpp @@ -65,9 +65,10 @@ const char* LTOCodeGenerator::getVersionString() { } LTOCodeGenerator::LTOCodeGenerator(LLVMContext &Context) - : Context(Context), - MergedModule(new Module("ld-temp.o", Context)), - IRLinker(new Linker(*MergedModule)) { + : Context(Context), MergedModule(new Module("ld-temp.o", Context)), + IRLinker(new Linker(*MergedModule, [this](const DiagnosticInfo &DI) { + MergedModule->getContext().diagnose(DI); + })) { initializeLTOPasses(); } @@ -123,7 +124,8 @@ void LTOCodeGenerator::setModule(std::unique_ptr Mod) { AsmUndefinedRefs.clear(); MergedModule = Mod->takeModule(); - IRLinker = make_unique(*MergedModule); + IRLinker = + make_unique(*MergedModule, IRLinker->getDiagnosticHandler()); const std::vector &Undefs = Mod->getAsmUndefinedRefs(); for (int I = 0, E = Undefs.size(); I != E; ++I) diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp index 55ab1824740b..88b8e443c489 100644 --- a/lib/Linker/LinkModules.cpp +++ b/lib/Linker/LinkModules.cpp @@ -2030,11 +2030,6 @@ Linker::Linker(Module &M, DiagnosticHandlerFunction DiagnosticHandler) } } -Linker::Linker(Module &M) - : Linker(M, [this](const DiagnosticInfo &DI) { - Composite.getContext().diagnose(DI); - }) {} - bool Linker::linkInModule(Module &Src, unsigned Flags, const FunctionInfoIndex *Index, DenseSet *FunctionsToImport) { @@ -2061,11 +2056,6 @@ bool Linker::linkModules(Module &Dest, Module &Src, return L.linkInModule(Src, Flags); } -bool Linker::linkModules(Module &Dest, Module &Src, unsigned Flags) { - Linker L(Dest); - return L.linkInModule(Src, Flags); -} - //===----------------------------------------------------------------------===// // C API. //===----------------------------------------------------------------------===// diff --git a/tools/bugpoint/BugDriver.cpp b/tools/bugpoint/BugDriver.cpp index 39887d5d59dc..9edc242d470e 100644 --- a/tools/bugpoint/BugDriver.cpp +++ b/tools/bugpoint/BugDriver.cpp @@ -15,6 +15,7 @@ #include "BugDriver.h" #include "ToolRunner.h" +#include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/Module.h" #include "llvm/IR/Verifier.h" #include "llvm/IRReader/IRReader.h" @@ -112,6 +113,12 @@ std::unique_ptr llvm::parseInputFile(StringRef Filename, return Result; } +static void diagnosticHandler(const DiagnosticInfo &DI) { + DiagnosticPrinterRawOStream DP(errs()); + DI.print(DP); + errs() << '\n'; +} + // This method takes the specified list of LLVM input files, attempts to load // them, either as assembly or bitcode, then link them together. 
It returns // true on failure (if, for example, an input bitcode file could not be @@ -132,7 +139,7 @@ bool BugDriver::addSources(const std::vector &Filenames) { if (!M.get()) return true; outs() << "Linking in input file: '" << Filenames[i] << "'\n"; - if (Linker::linkModules(*Program, *M)) + if (Linker::linkModules(*Program, *M, diagnosticHandler)) return true; } diff --git a/tools/bugpoint/Miscompilation.cpp b/tools/bugpoint/Miscompilation.cpp index e7eae40ec95a..0b61b0969855 100644 --- a/tools/bugpoint/Miscompilation.cpp +++ b/tools/bugpoint/Miscompilation.cpp @@ -18,6 +18,7 @@ #include "llvm/Config/config.h" // for HAVE_LINK_R #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/Verifier.h" @@ -207,6 +208,14 @@ namespace { }; } +static void diagnosticHandler(const DiagnosticInfo &DI) { + DiagnosticPrinterRawOStream DP(errs()); + DI.print(DP); + errs() << '\n'; + if (DI.getSeverity() == DS_Error) + exit(1); +} + /// TestMergedProgram - Given two modules, link them together and run the /// program, checking to see if the program matches the diff. If there is /// an error, return NULL. If not, return the merged module. The Broken argument @@ -222,7 +231,7 @@ static Module *TestMergedProgram(const BugDriver &BD, Module *M1, Module *M2, M1 = CloneModule(M1); M2 = CloneModule(M2); } - if (Linker::linkModules(*M1, *M2)) + if (Linker::linkModules(*M1, *M2, diagnosticHandler)) exit(1); delete M2; // We are done with this module. @@ -390,7 +399,8 @@ static bool ExtractLoops(BugDriver &BD, MisCompFunctions.emplace_back(F->getName(), F->getFunctionType()); } - if (Linker::linkModules(*ToNotOptimize, *ToOptimizeLoopExtracted)) + if (Linker::linkModules(*ToNotOptimize, *ToOptimizeLoopExtracted, + diagnosticHandler)) exit(1); MiscompiledFunctions.clear(); @@ -418,7 +428,8 @@ static bool ExtractLoops(BugDriver &BD, // extraction both didn't break the program, and didn't mask the problem. // Replace the current program with the loop extracted version, and try to // extract another loop. - if (Linker::linkModules(*ToNotOptimize, *ToOptimizeLoopExtracted)) + if (Linker::linkModules(*ToNotOptimize, *ToOptimizeLoopExtracted, + diagnosticHandler)) exit(1); delete ToOptimizeLoopExtracted; @@ -594,7 +605,7 @@ static bool ExtractBlocks(BugDriver &BD, if (!I->isDeclaration()) MisCompFunctions.emplace_back(I->getName(), I->getFunctionType()); - if (Linker::linkModules(*ProgClone, *Extracted)) + if (Linker::linkModules(*ProgClone, *Extracted, diagnosticHandler)) exit(1); // Set the new program and delete the old one. 
diff --git a/tools/gold/gold-plugin.cpp b/tools/gold/gold-plugin.cpp index 1bd2f8afb290..8eacdc3ff235 100644 --- a/tools/gold/gold-plugin.cpp +++ b/tools/gold/gold-plugin.cpp @@ -938,7 +938,7 @@ static ld_plugin_status allSymbolsReadHook(raw_fd_ostream *ApiFile) { } std::unique_ptr Combined(new Module("ld-temp.o", Context)); - Linker L(*Combined); + Linker L(*Combined, diagnosticHandler); std::string DefaultTriple = sys::getDefaultTargetTriple(); diff --git a/unittests/Linker/LinkModulesTest.cpp b/unittests/Linker/LinkModulesTest.cpp index 4eba718e2663..e56a692125ec 100644 --- a/unittests/Linker/LinkModulesTest.cpp +++ b/unittests/Linker/LinkModulesTest.cpp @@ -71,6 +71,8 @@ class LinkModuleTest : public testing::Test { BasicBlock *ExitBB; }; +static void expectNoDiags(const DiagnosticInfo &DI) { EXPECT_TRUE(false); } + TEST_F(LinkModuleTest, BlockAddress) { IRBuilder<> Builder(EntryBB); @@ -93,7 +95,7 @@ TEST_F(LinkModuleTest, BlockAddress) { Builder.CreateRet(ConstantPointerNull::get(Type::getInt8PtrTy(Ctx))); Module *LinkedModule = new Module("MyModuleLinked", Ctx); - Linker::linkModules(*LinkedModule, *M); + Linker::linkModules(*LinkedModule, *M, expectNoDiags); // Delete the original module. M.reset(); @@ -169,13 +171,13 @@ static Module *getInternal(LLVMContext &Ctx) { TEST_F(LinkModuleTest, EmptyModule) { std::unique_ptr InternalM(getInternal(Ctx)); std::unique_ptr EmptyM(new Module("EmptyModule1", Ctx)); - Linker::linkModules(*EmptyM, *InternalM); + Linker::linkModules(*EmptyM, *InternalM, expectNoDiags); } TEST_F(LinkModuleTest, EmptyModule2) { std::unique_ptr InternalM(getInternal(Ctx)); std::unique_ptr EmptyM(new Module("EmptyModule1", Ctx)); - Linker::linkModules(*InternalM, *EmptyM); + Linker::linkModules(*InternalM, *EmptyM, expectNoDiags); } TEST_F(LinkModuleTest, TypeMerge) { From c5aab87e68369e95e4fbe83f6ed2bd38ec9f41d6 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Fri, 4 Dec 2015 22:09:19 +0000 Subject: [PATCH 093/364] [Orc] Move some code up into the JITCompileCallbackManager base class. NFC. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254778 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../ExecutionEngine/Orc/IndirectionUtils.h | 39 ++++++++++--------- .../Orc/CompileOnDemandLayerTest.cpp | 12 +----- 2 files changed, 22 insertions(+), 29 deletions(-) diff --git a/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h b/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h index b5b258e7a05c..3bfff059110c 100644 --- a/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h +++ b/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h @@ -84,7 +84,11 @@ class JITCompileCallbackManager { } /// @brief Reserve a compile callback. - virtual CompileCallbackInfo getCompileCallback() = 0; + CompileCallbackInfo getCompileCallback() { + TargetAddress TrampolineAddr = getAvailableTrampolineAddr(); + auto &Compile = this->ActiveTrampolines[TrampolineAddr]; + return CompileCallbackInfo(TrampolineAddr, Compile); + } /// @brief Get a CompileCallbackInfo for an existing callback. 
CompileCallbackInfo getCompileCallbackInfo(TargetAddress TrampolineAddr) { @@ -113,6 +117,20 @@ class JITCompileCallbackManager { std::vector AvailableTrampolines; private: + + TargetAddress getAvailableTrampolineAddr() { + if (this->AvailableTrampolines.empty()) + grow(); + assert(!this->AvailableTrampolines.empty() && + "Failed to grow available trampolines."); + TargetAddress TrampolineAddr = this->AvailableTrampolines.back(); + this->AvailableTrampolines.pop_back(); + return TrampolineAddr; + } + + // Create new trampolines - to be implemented in subclasses. + virtual void grow() = 0; + virtual void anchor(); }; @@ -145,13 +163,6 @@ class LocalJITCompileCallbackManager : public JITCompileCallbackManager { assert(!EC && "Failed to mprotect resolver block"); } - /// @brief Get/create a compile callback with the given signature. - CompileCallbackInfo getCompileCallback() final { - TargetAddress TrampolineAddr = getAvailableTrampolineAddr(); - auto &Compile = this->ActiveTrampolines[TrampolineAddr]; - return CompileCallbackInfo(TrampolineAddr, Compile); - } - private: static TargetAddress reenter(void *CCMgr, void *TrampolineId) { @@ -162,17 +173,7 @@ class LocalJITCompileCallbackManager : public JITCompileCallbackManager { reinterpret_cast(TrampolineId))); } - TargetAddress getAvailableTrampolineAddr() { - if (this->AvailableTrampolines.empty()) - grow(); - assert(!this->AvailableTrampolines.empty() && - "Failed to grow available trampolines."); - TargetAddress TrampolineAddr = this->AvailableTrampolines.back(); - this->AvailableTrampolines.pop_back(); - return TrampolineAddr; - } - - void grow() { + void grow() override { assert(this->AvailableTrampolines.empty() && "Growing prematurely?"); std::error_code EC; diff --git a/unittests/ExecutionEngine/Orc/CompileOnDemandLayerTest.cpp b/unittests/ExecutionEngine/Orc/CompileOnDemandLayerTest.cpp index 4a30cfc42971..ca508d0a7561 100644 --- a/unittests/ExecutionEngine/Orc/CompileOnDemandLayerTest.cpp +++ b/unittests/ExecutionEngine/Orc/CompileOnDemandLayerTest.cpp @@ -18,17 +18,9 @@ namespace { class DummyCallbackManager : public orc::JITCompileCallbackManager { public: - DummyCallbackManager() - : JITCompileCallbackManager(0), NextStubAddress(0), - UniversalCompile([]() { return 0; }) { - } - - CompileCallbackInfo getCompileCallback() override { - return CompileCallbackInfo(++NextStubAddress, UniversalCompile); - } + DummyCallbackManager() : JITCompileCallbackManager(0) { } public: - TargetAddress NextStubAddress; - CompileFtor UniversalCompile; + void grow() override { llvm_unreachable("not implemented"); } }; class DummyStubsManager : public orc::IndirectStubsManagerBase { From f8e0f06f7c0ce41dc03400ad65dd005531234579 Mon Sep 17 00:00:00 2001 From: Rafael Espindola Date: Fri, 4 Dec 2015 22:26:21 +0000 Subject: [PATCH 094/364] MSVC complains about this being ambiguous. 
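The log does not record the exact diagnostic, but a plausible reconstruction of the ambiguity is sketched below: once an argument drags namespace std into argument-dependent lookup, a standard library that ships std::make_unique offers a second viable candidate next to llvm::make_unique, and the unqualified call no longer resolves. The helper and types here are stand-ins, not code from the tree; qualifying the call as llvm::make_unique, as the diff below does, sidesteps the problem.

#include <functional>
#include <memory>
#include <utility>

namespace llvm {
// Stand-in for llvm::make_unique, the pre-C++14 helper from STLExtras.h.
template <typename T, typename... Args>
std::unique_ptr<T> make_unique(Args &&... Params) {
  return std::unique_ptr<T>(new T(std::forward<Args>(Params)...));
}
struct Widget {
  explicit Widget(std::function<void(int)>) {}
};
} // namespace llvm
using namespace llvm;

int main() {
  std::function<void(int)> Handler = [](int) {};
  // The std::function argument associates namespace std with the call, so a
  // library that provides std::make_unique sees two candidates here:
  //   auto W = make_unique<Widget>(Handler);  // ambiguous on such compilers
  auto W = llvm::make_unique<Widget>(Handler); // explicit qualification is fine
  return W ? 0 : 1;
}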
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254782 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/LTO/LTOCodeGenerator.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp index 25c150b27840..bf3cde59443d 100644 --- a/lib/LTO/LTOCodeGenerator.cpp +++ b/lib/LTO/LTOCodeGenerator.cpp @@ -124,8 +124,8 @@ void LTOCodeGenerator::setModule(std::unique_ptr Mod) { AsmUndefinedRefs.clear(); MergedModule = Mod->takeModule(); - IRLinker = - make_unique(*MergedModule, IRLinker->getDiagnosticHandler()); + IRLinker = llvm::make_unique(*MergedModule, + IRLinker->getDiagnosticHandler()); const std::vector &Undefs = Mod->getAsmUndefinedRefs(); for (int I = 0, E = Undefs.size(); I != E; ++I) From 3f8065b694d883dfc4f8ced0d90d49e76b4f2c0b Mon Sep 17 00:00:00 2001 From: Kostya Serebryany Date: Fri, 4 Dec 2015 22:29:39 +0000 Subject: [PATCH 095/364] [libFuzzer] compute base64 in-process instead of using an external lib. Since libFuzzer should not depend on anything, just re-implement base64 encoder. PR25746 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254784 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Fuzzer/FuzzerIO.cpp | 5 ----- lib/Fuzzer/FuzzerInternal.h | 2 +- lib/Fuzzer/FuzzerLoop.cpp | 6 ++---- lib/Fuzzer/FuzzerUtil.cpp | 29 +++++++++++++++++++++++++++++ lib/Fuzzer/test/FuzzerUnittest.cpp | 13 +++++++++++++ 5 files changed, 45 insertions(+), 10 deletions(-) diff --git a/lib/Fuzzer/FuzzerIO.cpp b/lib/Fuzzer/FuzzerIO.cpp index abc444a3d471..043fad396d51 100644 --- a/lib/Fuzzer/FuzzerIO.cpp +++ b/lib/Fuzzer/FuzzerIO.cpp @@ -91,11 +91,6 @@ std::string DirPlusFile(const std::string &DirPath, return DirPath + "/" + FileName; } -void PrintFileAsBase64(const std::string &Path) { - std::string Cmd = "base64 -w 0 < " + Path + "; echo"; - ExecuteCommand(Cmd); -} - void Printf(const char *Fmt, ...) { va_list ap; va_start(ap, Fmt); diff --git a/lib/Fuzzer/FuzzerInternal.h b/lib/Fuzzer/FuzzerInternal.h index 2c382b2ef314..bc6bec7473d3 100644 --- a/lib/Fuzzer/FuzzerInternal.h +++ b/lib/Fuzzer/FuzzerInternal.h @@ -42,7 +42,7 @@ void Print(const Unit &U, const char *PrintAfter = ""); void PrintASCII(const Unit &U, const char *PrintAfter = ""); std::string Hash(const Unit &U); void SetTimer(int Seconds); -void PrintFileAsBase64(const std::string &Path); +std::string Base64(const Unit &U); int ExecuteCommand(const std::string &Command); // Private copy of SHA1 implementation. 
diff --git a/lib/Fuzzer/FuzzerLoop.cpp b/lib/Fuzzer/FuzzerLoop.cpp index 9c52a4dbe774..ca7f82b55607 100644 --- a/lib/Fuzzer/FuzzerLoop.cpp +++ b/lib/Fuzzer/FuzzerLoop.cpp @@ -302,10 +302,8 @@ void Fuzzer::WriteUnitToFileWithPrefix(const Unit &U, const char *Prefix) { WriteToFile(U, Path); Printf("artifact_prefix='%s'; Test unit written to %s\n", Options.ArtifactPrefix.c_str(), Path.c_str()); - if (U.size() <= kMaxUnitSizeToPrint) { - Printf("Base64: "); - PrintFileAsBase64(Path); - } + if (U.size() <= kMaxUnitSizeToPrint) + Printf("Base64: %s\n", Base64(U).c_str()); } void Fuzzer::SaveCorpus() { diff --git a/lib/Fuzzer/FuzzerUtil.cpp b/lib/Fuzzer/FuzzerUtil.cpp index 20a41e0d4fbb..6c1133fffd37 100644 --- a/lib/Fuzzer/FuzzerUtil.cpp +++ b/lib/Fuzzer/FuzzerUtil.cpp @@ -167,4 +167,33 @@ bool ParseDictionaryFile(const std::string &Text, std::vector *Units) { int GetPid() { return getpid(); } + +std::string Base64(const Unit &U) { + static const char Table[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; + std::string Res; + size_t i; + for (i = 0; i + 2 < U.size(); i += 3) { + uint32_t x = (U[i] << 16) + (U[i + 1] << 8) + U[i + 2]; + Res += Table[(x >> 18) & 63]; + Res += Table[(x >> 12) & 63]; + Res += Table[(x >> 6) & 63]; + Res += Table[x & 63]; + } + if (i + 1 == U.size()) { + uint32_t x = (U[i] << 16); + Res += Table[(x >> 18) & 63]; + Res += Table[(x >> 12) & 63]; + Res += "=="; + } else if (i + 2 == U.size()) { + uint32_t x = (U[i] << 16) + (U[i + 1] << 8); + Res += Table[(x >> 18) & 63]; + Res += Table[(x >> 12) & 63]; + Res += Table[(x >> 6) & 63]; + Res += "="; + } + return Res; +} + } // namespace fuzzer diff --git a/lib/Fuzzer/test/FuzzerUnittest.cpp b/lib/Fuzzer/test/FuzzerUnittest.cpp index 4a96468f8d7a..b92e61877c6c 100644 --- a/lib/Fuzzer/test/FuzzerUnittest.cpp +++ b/lib/Fuzzer/test/FuzzerUnittest.cpp @@ -360,3 +360,16 @@ TEST(FuzzerDictionary, ParseDictionaryFile) { EXPECT_EQ(Units, std::vector({Unit({'a', 'a'}), Unit({'a', 'b', 'c'})})); } + +TEST(FuzzerUtil, Base64) { + EXPECT_EQ("", Base64({})); + EXPECT_EQ("YQ==", Base64({'a'})); + EXPECT_EQ("eA==", Base64({'x'})); + EXPECT_EQ("YWI=", Base64({'a', 'b'})); + EXPECT_EQ("eHk=", Base64({'x', 'y'})); + EXPECT_EQ("YWJj", Base64({'a', 'b', 'c'})); + EXPECT_EQ("eHl6", Base64({'x', 'y', 'z'})); + EXPECT_EQ("YWJjeA==", Base64({'a', 'b', 'c', 'x'})); + EXPECT_EQ("YWJjeHk=", Base64({'a', 'b', 'c', 'x', 'y'})); + EXPECT_EQ("YWJjeHl6", Base64({'a', 'b', 'c', 'x', 'y', 'z'})); +} From a7a95fed0f4e64e5a0c8836db68be8964fdc10c7 Mon Sep 17 00:00:00 2001 From: Derek Schuff Date: Fri, 4 Dec 2015 22:47:58 +0000 Subject: [PATCH 096/364] Add TransformUtils to list of required libraries for llc This dependency was added in r254774 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254786 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/llc/CMakeLists.txt | 1 + tools/llc/LLVMBuild.txt | 2 +- tools/llc/Makefile | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/llc/CMakeLists.txt b/tools/llc/CMakeLists.txt index ff5a89e1da44..136cf4be1c12 100644 --- a/tools/llc/CMakeLists.txt +++ b/tools/llc/CMakeLists.txt @@ -11,6 +11,7 @@ set(LLVM_LINK_COMPONENTS SelectionDAG Support Target + TransformUtils ) # Support plugins. 
diff --git a/tools/llc/LLVMBuild.txt b/tools/llc/LLVMBuild.txt index 38660cf27a46..c1f5cebea859 100644 --- a/tools/llc/LLVMBuild.txt +++ b/tools/llc/LLVMBuild.txt @@ -19,4 +19,4 @@ type = Tool name = llc parent = Tools -required_libraries = AsmParser BitReader IRReader MIRParser all-targets +required_libraries = AsmParser BitReader IRReader MIRParser TransformUtils all-targets diff --git a/tools/llc/Makefile b/tools/llc/Makefile index ae64c9a5b57c..cd34c80d840a 100644 --- a/tools/llc/Makefile +++ b/tools/llc/Makefile @@ -9,7 +9,7 @@ LEVEL := ../.. TOOLNAME := llc -LINK_COMPONENTS := all-targets bitreader asmparser irreader mirparser +LINK_COMPONENTS := all-targets bitreader asmparser irreader mirparser transformutils # Support plugins. NO_DEAD_STRIP := 1 From 35cba4cf6a352382826c0daf86108898807e4910 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Fri, 4 Dec 2015 23:00:33 +0000 Subject: [PATCH 097/364] X86: Don't emit SAHF/LAHF for 64-bit targets unless explicitly supported These instructions are not supported by all CPUs in 64-bit mode. Emitting them causes Chromium to crash on start-up for users with such chips. (GCC puts these instructions behind -msahf on 64-bit for the same reason.) This patch adds FeatureLAHFSAHF, enables it by default for 32-bit targets and modern CPUs, and changes X86InstrInfo::copyPhysReg back to the lowering from before r244503 when the instructions are not available. Differential Revision: http://reviews.llvm.org/D15240 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254793 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86.td | 58 ++++++++++----- lib/Target/X86/X86ISelLowering.cpp | 3 + lib/Target/X86/X86InstrInfo.cpp | 29 ++++++-- lib/Target/X86/X86InstrInfo.td | 7 +- lib/Target/X86/X86Subtarget.cpp | 10 +++ lib/Target/X86/X86Subtarget.h | 4 ++ test/CodeGen/X86/cmpxchg-clobber-flags.ll | 72 +++++++++++++------ .../X86/peephole-na-phys-copy-folding.ll | 2 +- 8 files changed, 136 insertions(+), 49 deletions(-) diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 7d9f396c1e96..dc5ab1bf65d4 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -182,6 +182,8 @@ def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true", "Support PRFCHW instructions">; def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true", "Support RDSEED instruction">; +def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF", "true", + "Support LAHF and SAHF instructions">; def FeatureMPX : SubtargetFeature<"mpx", "HasMPX", "true", "Support MPX instructions">; def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", @@ -273,7 +275,8 @@ def : ProcessorModel<"core2", SandyBridgeModel, [ FeatureSSSE3, FeatureFXSR, FeatureCMPXCHG16B, - FeatureSlowBTMem + FeatureSlowBTMem, + FeatureLAHFSAHF ]>; def : ProcessorModel<"penryn", SandyBridgeModel, [ FeatureSlowUAMem16, @@ -281,7 +284,8 @@ def : ProcessorModel<"penryn", SandyBridgeModel, [ FeatureSSE41, FeatureFXSR, FeatureCMPXCHG16B, - FeatureSlowBTMem + FeatureSlowBTMem, + FeatureLAHFSAHF ]>; // Atom CPUs. @@ -299,7 +303,8 @@ class BonnellProc : ProcessorModel; def : BonnellProc<"bonnell">; def : BonnellProc<"atom">; // Pin the generic name to the baseline. @@ -319,7 +324,8 @@ class SilvermontProc : ProcessorModel; def : SilvermontProc<"silvermont">; def : SilvermontProc<"slm">; // Legacy alias. 
@@ -331,7 +337,8 @@ class NehalemProc : ProcessorModel; def : NehalemProc<"nehalem">; def : NehalemProc<"corei7">; @@ -346,7 +353,8 @@ class WestmereProc : ProcessorModel; def : WestmereProc<"westmere">; @@ -363,7 +371,8 @@ class SandyBridgeProc : ProcessorModel; def : SandyBridgeProc<"sandybridge">; def : SandyBridgeProc<"corei7-avx">; // Legacy alias. @@ -382,7 +391,8 @@ class IvyBridgeProc : ProcessorModel; def : IvyBridgeProc<"ivybridge">; def : IvyBridgeProc<"core-avx-i">; // Legacy alias. @@ -408,7 +418,8 @@ class HaswellProc : ProcessorModel; def : HaswellProc<"haswell">; def : HaswellProc<"core-avx2">; // Legacy alias. @@ -436,7 +447,8 @@ class BroadwellProc : ProcessorModel; def : BroadwellProc<"broadwell">; @@ -465,7 +477,8 @@ class KnightsLandingProc : ProcessorModel; def : KnightsLandingProc<"knl">; @@ -500,7 +513,8 @@ class SkylakeProc : ProcessorModel; def : SkylakeProc<"skylake">; def : SkylakeProc<"skx">; // Legacy alias. @@ -547,7 +561,7 @@ def : Proc<"amdfam10", [FeatureSSE4A, Feature3DNowA, FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>; def : Proc<"barcelona", [FeatureSSE4A, Feature3DNowA, FeatureFXSR, FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT, - FeatureSlowBTMem, FeatureSlowSHLD]>; + FeatureSlowBTMem, FeatureSlowSHLD, FeatureLAHFSAHF]>; // Bobcat def : Proc<"btver1", [ @@ -560,7 +574,8 @@ def : Proc<"btver1", [ FeatureLZCNT, FeaturePOPCNT, FeatureXSAVE, - FeatureSlowSHLD + FeatureSlowSHLD, + FeatureLAHFSAHF ]>; // Jaguar @@ -580,7 +595,8 @@ def : ProcessorModel<"btver2", BtVer2Model, [ FeaturePOPCNT, FeatureXSAVE, FeatureXSAVEOPT, - FeatureSlowSHLD + FeatureSlowSHLD, + FeatureLAHFSAHF ]>; // Bulldozer @@ -598,7 +614,8 @@ def : Proc<"bdver1", [ FeatureLZCNT, FeaturePOPCNT, FeatureXSAVE, - FeatureSlowSHLD + FeatureSlowSHLD, + FeatureLAHFSAHF ]>; // Piledriver def : Proc<"bdver2", [ @@ -619,7 +636,8 @@ def : Proc<"bdver2", [ FeatureBMI, FeatureTBM, FeatureFMA, - FeatureSlowSHLD + FeatureSlowSHLD, + FeatureLAHFSAHF ]>; // Steamroller @@ -643,7 +661,8 @@ def : Proc<"bdver3", [ FeatureFMA, FeatureXSAVEOPT, FeatureSlowSHLD, - FeatureFSGSBase + FeatureFSGSBase, + FeatureLAHFSAHF ]>; // Excavator @@ -666,7 +685,8 @@ def : Proc<"bdver4", [ FeatureTBM, FeatureFMA, FeatureXSAVEOPT, - FeatureFSGSBase + FeatureFSGSBase, + FeatureLAHFSAHF ]>; def : Proc<"geode", [FeatureSlowUAMem16, Feature3DNowA]>; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 2cf1d4ba30ee..c07bca8fe52a 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -13930,6 +13930,9 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, DAG.getConstant(8, dl, MVT::i8)); SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); + + // Some 64-bit targets lack SAHF support, but they do support FCOMI. + assert(Subtarget->hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?"); return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); } diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index e9d36f8ce2f1..ebe329064c50 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -4385,7 +4385,32 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, int Reg = FromEFLAGS ? DestReg : SrcReg; bool is32 = X86::GR32RegClass.contains(Reg); bool is64 = X86::GR64RegClass.contains(Reg); + if ((FromEFLAGS || ToEFLAGS) && (is32 || is64)) { + int Mov = is64 ? X86::MOV64rr : X86::MOV32rr; + int Push = is64 ? 
X86::PUSH64r : X86::PUSH32r; + int PushF = is64 ? X86::PUSHF64 : X86::PUSHF32; + int Pop = is64 ? X86::POP64r : X86::POP32r; + int PopF = is64 ? X86::POPF64 : X86::POPF32; + int AX = is64 ? X86::RAX : X86::EAX; + + if (!Subtarget.hasLAHFSAHF()) { + assert(is64 && "Not having LAHF/SAHF only happens on 64-bit."); + // Moving EFLAGS to / from another register requires a push and a pop. + // Notice that we have to adjust the stack if we don't want to clobber the + // first frame index. See X86FrameLowering.cpp - clobbersTheStack. + if (FromEFLAGS) { + BuildMI(MBB, MI, DL, get(PushF)); + BuildMI(MBB, MI, DL, get(Pop), DestReg); + } + if (ToEFLAGS) { + BuildMI(MBB, MI, DL, get(Push)) + .addReg(SrcReg, getKillRegState(KillSrc)); + BuildMI(MBB, MI, DL, get(PopF)); + } + return; + } + // The flags need to be saved, but saving EFLAGS with PUSHF/POPF is // inefficient. Instead: // - Save the overflow flag OF into AL using SETO, and restore it using a @@ -4407,10 +4432,6 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Notice that we have to adjust the stack if we don't want to clobber the // first frame index. See X86FrameLowering.cpp - clobbersTheStack. - int Mov = is64 ? X86::MOV64rr : X86::MOV32rr; - int Push = is64 ? X86::PUSH64r : X86::PUSH32r; - int Pop = is64 ? X86::POP64r : X86::POP32r; - int AX = is64 ? X86::RAX : X86::EAX; bool AXDead = (Reg == AX); // FIXME: The above could figure out that AX is dead in more cases with: diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 1e66739026e2..1c21a098bc6c 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -799,6 +799,7 @@ def HasSHA : Predicate<"Subtarget->hasSHA()">; def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">; def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">; def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">; +def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">; def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">; def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">; def HasMPX : Predicate<"Subtarget->hasMPX()">; @@ -1502,10 +1503,12 @@ def MOV8rm_NOREX : I<0x8A, MRMSrcMem, let SchedRW = [WriteALU] in { let Defs = [EFLAGS], Uses = [AH] in def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf", - [(set EFLAGS, (X86sahf AH))], IIC_AHF>; + [(set EFLAGS, (X86sahf AH))], IIC_AHF>, + Requires<[HasLAHFSAHF]>; let Defs = [AH], Uses = [EFLAGS], hasSideEffects = 0 in def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", [], - IIC_AHF>; // AH = flags + IIC_AHF>, // AH = flags + Requires<[HasLAHFSAHF]>; } // SchedRW //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 44a46b7e07a2..f90a0b0d04f1 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -189,6 +189,15 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { FullFS = "+64bit,+sse2"; } + // LAHF/SAHF are always supported in non-64-bit mode. + if (!In64BitMode) { + if (!FullFS.empty()) + FullFS = "+sahf," + FullFS; + else + FullFS = "+sahf"; + } + + // Parse features string and set the CPU. 
ParseSubtargetFeatures(CPUName, FullFS); @@ -264,6 +273,7 @@ void X86Subtarget::initializeEnvironment() { HasSHA = false; HasPRFCHW = false; HasRDSEED = false; + HasLAHFSAHF = false; HasMPX = false; IsBTMemSlow = false; IsSHLDSlow = false; diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 353b4f7f5ebd..83bc640976ac 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -152,6 +152,9 @@ class X86Subtarget final : public X86GenSubtargetInfo { /// Processor has RDSEED instructions. bool HasRDSEED; + /// Processor has LAHF/SAHF instructions. + bool HasLAHFSAHF; + /// True if BT (bit test) of memory instructions are slow. bool IsBTMemSlow; @@ -374,6 +377,7 @@ class X86Subtarget final : public X86GenSubtargetInfo { bool hasSHA() const { return HasSHA; } bool hasPRFCHW() const { return HasPRFCHW; } bool hasRDSEED() const { return HasRDSEED; } + bool hasLAHFSAHF() const { return HasLAHFSAHF; } bool isBTMemSlow() const { return IsBTMemSlow; } bool isSHLDSlow() const { return IsSHLDSlow; } bool isUnalignedMem16Slow() const { return IsUAMem16Slow; } diff --git a/test/CodeGen/X86/cmpxchg-clobber-flags.ll b/test/CodeGen/X86/cmpxchg-clobber-flags.ll index 791edba89c44..c294dee40135 100644 --- a/test/CodeGen/X86/cmpxchg-clobber-flags.ll +++ b/test/CodeGen/X86/cmpxchg-clobber-flags.ll @@ -1,7 +1,11 @@ ; RUN: llc -mtriple=i386-linux-gnu %s -o - | FileCheck %s -check-prefix=i386 ; RUN: llc -mtriple=i386-linux-gnu -pre-RA-sched=fast %s -o - | FileCheck %s -check-prefix=i386f + ; RUN: llc -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s -check-prefix=x8664 ; RUN: llc -mtriple=x86_64-linux-gnu -pre-RA-sched=fast %s -o - | FileCheck %s -check-prefix=x8664 +; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sahf %s -o - | FileCheck %s -check-prefix=x8664-sahf +; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sahf -pre-RA-sched=fast %s -o - | FileCheck %s -check-prefix=x8664-sahf +; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=corei7 %s -o - | FileCheck %s -check-prefix=x8664-sahf ; FIXME: X86InstrInfo::copyPhysReg had code which figured out whether AX was ; live or not to avoid save / restore when it's not needed. See FIXME in @@ -56,21 +60,31 @@ define i64 @test_intervening_call(i64* %foo, i64 %bar, i64 %baz) { ; x8664-LABEL: test_intervening_call: ; x8664: cmpxchgq -; x8664: pushq %rax -; x8664-NEXT: seto %al -; x8664-NEXT: lahf -; x8664-NEXT: movq %rax, [[FLAGS:%.*]] -; x8664-NEXT: popq %rax +; x8664: pushfq +; x8664-NEXT: popq [[FLAGS:%.*]] ; x8664-NEXT: movq %rax, %rdi ; x8664-NEXT: callq bar +; x8664-NEXT: pushq [[FLAGS]] +; x8664-NEXT: popfq +; x8664-NEXT: jne + +; x8664-sahf-LABEL: test_intervening_call: +; x8664-sahf: cmpxchgq +; x8664-sahf: pushq %rax +; x8664-sahf-NEXT: seto %al +; x8664-sahf-NEXT: lahf +; x8664-sahf-NEXT: movq %rax, [[FLAGS:%.*]] +; x8664-sahf-NEXT: popq %rax +; x8664-sahf-NEXT: movq %rax, %rdi +; x8664-sahf-NEXT: callq bar ; ** FIXME Next line isn't actually necessary. ** -; x8664-NEXT: pushq %rax -; x8664-NEXT: movq [[FLAGS]], %rax -; x8664-NEXT: addb $127, %al -; x8664-NEXT: sahf +; x8664-sahf-NEXT: pushq %rax +; x8664-sahf-NEXT: movq [[FLAGS]], %rax +; x8664-sahf-NEXT: addb $127, %al +; x8664-sahf-NEXT: sahf ; ** FIXME Next line isn't actually necessary. 
** -; x8664-NEXT: popq %rax -; x8664-NEXT: jne +; x8664-sahf-NEXT: popq %rax +; x8664-sahf-NEXT: jne %cx = cmpxchg i64* %foo, i64 %bar, i64 %baz seq_cst seq_cst %v = extractvalue { i64, i1 } %cx, 0 @@ -99,6 +113,10 @@ define i32 @test_control_flow(i32* %p, i32 %i, i32 %j) { ; x8664: cmpxchg ; x8664-NEXT: jne +; x8664-sahf-LABEL: test_control_flow: +; x8664-sahf: cmpxchg +; x8664-sahf-NEXT: jne + entry: %cmp = icmp sgt i32 %i, %j br i1 %cmp, label %loop_start, label %cond.end @@ -165,20 +183,28 @@ define i32 @test_feed_cmov(i32* %addr, i32 %desired, i32 %new) { ; i386f-NEXT: popl %eax ; x8664-LABEL: test_feed_cmov: -; x8664: cmpxchgl +; x8664: cmpxchg +; x8664: pushfq +; x8664-NEXT: popq [[FLAGS:%.*]] +; x8664-NEXT: callq foo +; x8664-NEXT: pushq [[FLAGS]] +; x8664-NEXT: popfq + +; x8664-sahf-LABEL: test_feed_cmov: +; x8664-sahf: cmpxchgl ; ** FIXME Next line isn't actually necessary. ** -; x8664: pushq %rax -; x8664: seto %al -; x8664-NEXT: lahf -; x8664-NEXT: movq %rax, [[FLAGS:%.*]] +; x8664-sahf: pushq %rax +; x8664-sahf: seto %al +; x8664-sahf-NEXT: lahf +; x8664-sahf-NEXT: movq %rax, [[FLAGS:%.*]] ; ** FIXME Next line isn't actually necessary. ** -; x8664-NEXT: popq %rax -; x8664-NEXT: callq foo -; x8664-NEXT: pushq %rax -; x8664-NEXT: movq [[FLAGS]], %rax -; x8664-NEXT: addb $127, %al -; x8664-NEXT: sahf -; x8664-NEXT: popq %rax +; x8664-sahf-NEXT: popq %rax +; x8664-sahf-NEXT: callq foo +; x8664-sahf-NEXT: pushq %rax +; x8664-sahf-NEXT: movq [[FLAGS]], %rax +; x8664-sahf-NEXT: addb $127, %al +; x8664-sahf-NEXT: sahf +; x8664-sahf-NEXT: popq %rax %res = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst %success = extractvalue { i32, i1 } %res, 1 diff --git a/test/CodeGen/X86/peephole-na-phys-copy-folding.ll b/test/CodeGen/X86/peephole-na-phys-copy-folding.ll index 891a925611cf..a8df33454e92 100644 --- a/test/CodeGen/X86/peephole-na-phys-copy-folding.ll +++ b/test/CodeGen/X86/peephole-na-phys-copy-folding.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=i386-linux-gnu %s -o - | FileCheck %s -; RUN: llc -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s +; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sahf %s -o - | FileCheck %s ; FIXME Add -verify-machineinstrs back when PR24535 is fixed. From a026cdc11a64e7c87b44d09a53c72fdf9e2163ea Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Fri, 4 Dec 2015 23:00:54 +0000 Subject: [PATCH 098/364] CodeGen: Move the SlotIndexes BumpPtrAllocator before the list it allocates When a `SlotIndexes` is destroyed, `ileAllocator` will currently be destructed before `IndexList`, but all of `IndexList`'s storage has been allocated by `ileAllocator`. This means we'll call destructors on garbage data, which is very bad. This can be avoided by putting the BumpPtrAllocator earlier in the class than anything it allocates. Unfortunately, I don't know how to test this. It depends very much on memory layout, and the only evidence I have that this is actually happening in practice are backtraces that might be explained by this. By inspection though, the code is obviously dangerous/wrong, and this is the right thing to do. I'll follow up later with a patch that calls clearAndLeakNodesUnsafely on the list, since there isn't much point in destructing them when they're allocated in a BPA anyway, but I figured it makes sense to commit the correctness fix separately from that optimization. 
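For illustration, here is a minimal standalone C++ sketch of the rule this fix relies on (the type names are invented for the example, not the LLVM classes): non-static data members are constructed in declaration order and destroyed in the reverse order, so declaring the allocator before the container it backs guarantees the allocator outlives the container's destructor.

#include <cstdio>

struct Allocator {
  ~Allocator() { std::printf("Allocator destroyed\n"); }
};

struct List {
  ~List() { std::printf("List destroyed\n"); }
};

// Members are destroyed in reverse declaration order: the allocator is
// declared first, so it is destroyed last and its memory stays valid while
// the list's destructor runs.
struct Safe {
  Allocator allocator; // declared first  -> destroyed last
  List list;           // declared second -> destroyed first
};

int main() {
  Safe s; // prints "List destroyed" then "Allocator destroyed"
  return 0;
}

With the two members declared the other way around, the list would be torn down after its backing memory had already been reclaimed, which is exactly the garbage-destructor scenario described above.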
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254794 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/CodeGen/SlotIndexes.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/llvm/CodeGen/SlotIndexes.h b/include/llvm/CodeGen/SlotIndexes.h index bd3a9062fb90..5dc42e20debe 100644 --- a/include/llvm/CodeGen/SlotIndexes.h +++ b/include/llvm/CodeGen/SlotIndexes.h @@ -333,6 +333,8 @@ namespace llvm { /// This pass assigns indexes to each instruction. class SlotIndexes : public MachineFunctionPass { private: + // IndexListEntry allocator. + BumpPtrAllocator ileAllocator; typedef ilist IndexList; IndexList indexList; @@ -353,9 +355,6 @@ namespace llvm { /// and MBB id. SmallVector idx2MBBMap; - // IndexListEntry allocator. - BumpPtrAllocator ileAllocator; - IndexListEntry* createEntry(MachineInstr *mi, unsigned index) { IndexListEntry *entry = static_cast( From 9ac3ec43b3eb2c18007542c1fe43d93519606c32 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 4 Dec 2015 23:06:33 +0000 Subject: [PATCH 099/364] Address a memory leak in 254760 The issue appears to have been that the copy constructor of the SmallVector was being invoked and this was somehow leading to leaked memory. This patch avoids the symptom, but likely doesn't address the underlying problem. I'm still investigating the root cause, but wanted to avoid the memory leak in the mean time. Even with the underlying fix, avoiding the redundant allocation is worthwhile. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254795 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/LegacyPassManagers.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/include/llvm/IR/LegacyPassManagers.h b/include/llvm/IR/LegacyPassManagers.h index af045585691b..418702c0b781 100644 --- a/include/llvm/IR/LegacyPassManagers.h +++ b/include/llvm/IR/LegacyPassManagers.h @@ -264,12 +264,15 @@ class PMTopLevelManager { // TODO: We could consider sorting the dependency arrays within the // AnalysisUsage (since they are conceptually unordered). ID.AddBoolean(AU.getPreservesAll()); - for (auto &Vec : {AU.getRequiredSet(), AU.getRequiredTransitiveSet(), - AU.getPreservedSet(), AU.getUsedSet()}) { + auto ProfileVec = [&](const SmallVectorImpl& Vec) { ID.AddInteger(Vec.size()); for(AnalysisID AID : Vec) ID.AddPointer(AID); - } + }; + ProfileVec(AU.getRequiredSet()); + ProfileVec(AU.getRequiredTransitiveSet()); + ProfileVec(AU.getPreservedSet()); + ProfileVec(AU.getUsedSet()); } }; From 16d4cc83c30485f28bb37715930c4302749d23b1 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Fri, 4 Dec 2015 23:22:35 +0000 Subject: [PATCH 100/364] [WebAssembly] Initial varargs support. Full varargs support will depend on prologue/epilogue support, but this patch gets us started with most of the basic infrastructure. 
Differential Revision: http://reviews.llvm.org/D15231 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254799 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../WebAssembly/WebAssemblyISelLowering.cpp | 100 +++++++++++--- .../WebAssembly/WebAssemblyISelLowering.h | 1 + .../WebAssembly/WebAssemblyInstrCall.td | 4 +- .../WebAssembly/WebAssemblyInstrInfo.cpp | 4 +- .../WebAssembly/WebAssemblyRegStackify.cpp | 4 + .../WebAssembly/WebAssemblyRegisterInfo.cpp | 9 ++ .../WebAssembly/WebAssemblyRegisterInfo.h | 4 + test/CodeGen/WebAssembly/varargs.ll | 122 ++++++++++++++++++ 8 files changed, 230 insertions(+), 18 deletions(-) create mode 100644 test/CodeGen/WebAssembly/varargs.ll diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index a7eba5611134..65d2b1967b13 100644 --- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -118,6 +118,13 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( setOperationAction(ISD::ExternalSymbol, MVTPtr, Custom); setOperationAction(ISD::JumpTable, MVTPtr, Custom); + // Take the default expansion for va_arg, va_copy, and va_end. There is no + // default action for va_start, so we do that custom. + setOperationAction(ISD::VASTART, MVT::Other, Custom); + setOperationAction(ISD::VAARG, MVT::Other, Expand); + setOperationAction(ISD::VACOPY, MVT::Other, Expand); + setOperationAction(ISD::VAEND, MVT::Other, Expand); + for (auto T : {MVT::f32, MVT::f64}) { // Don't expand the floating-point types to constant pools. setOperationAction(ISD::ConstantFP, T, Legal); @@ -314,23 +321,67 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI, } bool IsVarArg = CLI.IsVarArg; - if (IsVarArg) - fail(DL, DAG, "WebAssembly doesn't support varargs yet"); + unsigned NumFixedArgs = CLI.NumFixedArgs; + auto PtrVT = getPointerTy(MF.getDataLayout()); // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); - unsigned NumBytes = CCInfo.getNextStackOffset(); - auto PtrVT = getPointerTy(MF.getDataLayout()); - auto Zero = DAG.getConstant(0, DL, PtrVT, true); + if (IsVarArg) { + // Outgoing non-fixed arguments are placed at the top of the stack. First + // compute their offsets and the total amount of argument stack space + // needed. + for (SDValue Arg : + make_range(OutVals.begin() + NumFixedArgs, OutVals.end())) { + EVT VT = Arg.getValueType(); + assert(VT != MVT::iPTR && "Legalized args should be concrete"); + Type *Ty = VT.getTypeForEVT(*DAG.getContext()); + unsigned Offset = + CCInfo.AllocateStack(MF.getDataLayout().getTypeAllocSize(Ty), + MF.getDataLayout().getABITypeAlignment(Ty)); + CCInfo.addLoc(CCValAssign::getMem(ArgLocs.size(), VT.getSimpleVT(), + Offset, VT.getSimpleVT(), + CCValAssign::Full)); + } + } + + unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); + auto NB = DAG.getConstant(NumBytes, DL, PtrVT, true); Chain = DAG.getCALLSEQ_START(Chain, NB, DL); + if (IsVarArg) { + // For non-fixed arguments, next emit stores to store the argument values + // to the stack at the offsets computed above. 
+ SDValue SP = DAG.getCopyFromReg( + Chain, DL, getStackPointerRegisterToSaveRestore(), PtrVT); + unsigned ValNo = 0; + SmallVector Chains; + for (SDValue Arg : + make_range(OutVals.begin() + NumFixedArgs, OutVals.end())) { + assert(ArgLocs[ValNo].getValNo() == ValNo && + "ArgLocs should remain in order and only hold varargs args"); + unsigned Offset = ArgLocs[ValNo++].getLocMemOffset(); + SDValue Add = DAG.getNode(ISD::ADD, DL, PtrVT, SP, + DAG.getConstant(Offset, DL, PtrVT)); + Chains.push_back(DAG.getStore(Chain, DL, Arg, Add, + MachinePointerInfo::getStack(MF, Offset), + false, false, 0)); + } + if (!Chains.empty()) + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + } + + // Compute the operands for the CALLn node. SmallVector Ops; Ops.push_back(Chain); Ops.push_back(Callee); - Ops.append(OutVals.begin(), OutVals.end()); + + // Add all fixed arguments. Note that for non-varargs calls, NumFixedArgs + // isn't reliable. + Ops.append(OutVals.begin(), + IsVarArg ? OutVals.begin() + NumFixedArgs : OutVals.end()); SmallVector Tys; for (const auto &In : Ins) { @@ -360,7 +411,8 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI, Chain = Res.getValue(1); } - Chain = DAG.getCALLSEQ_END(Chain, NB, Zero, SDValue(), DL); + SDValue Unused = DAG.getUNDEF(PtrVT); + Chain = DAG.getCALLSEQ_END(Chain, NB, Unused, SDValue(), DL); return Chain; } @@ -374,15 +426,13 @@ bool WebAssemblyTargetLowering::CanLowerReturn( } SDValue WebAssemblyTargetLowering::LowerReturn( - SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, + SDValue Chain, CallingConv::ID CallConv, bool /*IsVarArg*/, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, SDLoc DL, SelectionDAG &DAG) const { assert(Outs.size() <= 1 && "WebAssembly can only return up to one value"); if (!CallingConvSupported(CallConv)) fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions"); - if (IsVarArg) - fail(DL, DAG, "WebAssembly doesn't support varargs yet"); SmallVector RetOps(1, Chain); RetOps.append(OutVals.begin(), OutVals.end()); @@ -392,29 +442,26 @@ SDValue WebAssemblyTargetLowering::LowerReturn( for (const ISD::OutputArg &Out : Outs) { assert(!Out.Flags.isByVal() && "byval is not valid for return values"); assert(!Out.Flags.isNest() && "nest is not valid for return values"); + assert(Out.IsFixed && "non-fixed return value is not valid"); if (Out.Flags.isInAlloca()) fail(DL, DAG, "WebAssembly hasn't implemented inalloca results"); if (Out.Flags.isInConsecutiveRegs()) fail(DL, DAG, "WebAssembly hasn't implemented cons regs results"); if (Out.Flags.isInConsecutiveRegsLast()) fail(DL, DAG, "WebAssembly hasn't implemented cons regs last results"); - if (!Out.IsFixed) - fail(DL, DAG, "WebAssembly doesn't support non-fixed results yet"); } return Chain; } SDValue WebAssemblyTargetLowering::LowerFormalArguments( - SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, + SDValue Chain, CallingConv::ID CallConv, bool /*IsVarArg*/, const SmallVectorImpl &Ins, SDLoc DL, SelectionDAG &DAG, SmallVectorImpl &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); if (!CallingConvSupported(CallConv)) fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions"); - if (IsVarArg) - fail(DL, DAG, "WebAssembly doesn't support varargs yet"); // Set up the incoming ARGUMENTS value, which serves to represent the liveness // of the incoming values before they're represented by virtual registers. 
@@ -443,6 +490,9 @@ SDValue WebAssemblyTargetLowering::LowerFormalArguments( MF.getInfo()->addParam(In.VT); } + // Incoming varargs arguments are on the stack and will be accessed through + // va_arg, so we don't need to do anything for them here. + return Chain; } @@ -464,6 +514,8 @@ SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op, return LowerJumpTable(Op, DAG); case ISD::BR_JT: return LowerBR_JT(Op, DAG); + case ISD::VASTART: + return LowerVASTART(Op, DAG); } } @@ -529,6 +581,24 @@ SDValue WebAssemblyTargetLowering::LowerBR_JT(SDValue Op, return DAG.getNode(WebAssemblyISD::TABLESWITCH, DL, MVT::Other, Ops); } +SDValue WebAssemblyTargetLowering::LowerVASTART(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT PtrVT = getPointerTy(DAG.getMachineFunction().getDataLayout()); + + // The incoming non-fixed arguments are placed on the top of the stack, with + // natural alignment, at the point of the call, so the base pointer is just + // the current frame pointer. + DAG.getMachineFunction().getFrameInfo()->setFrameAddressIsTaken(true); + unsigned FP = + static_cast(Subtarget->getRegisterInfo()) + ->getFrameRegister(DAG.getMachineFunction()); + SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, FP, PtrVT); + const Value *SV = cast(Op.getOperand(2))->getValue(); + return DAG.getStore(Op.getOperand(0), DL, FrameAddr, Op.getOperand(1), + MachinePointerInfo(SV), false, false, 0); +} + //===----------------------------------------------------------------------===// // WebAssembly Optimization Hooks //===----------------------------------------------------------------------===// diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/lib/Target/WebAssembly/WebAssemblyISelLowering.h index af5eab671f27..b6b54bb13ea6 100644 --- a/lib/Target/WebAssembly/WebAssemblyISelLowering.h +++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.h @@ -77,6 +77,7 @@ class WebAssemblyTargetLowering final : public TargetLowering { SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; }; namespace WebAssembly { diff --git a/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/lib/Target/WebAssembly/WebAssemblyInstrCall.td index 018d26cfacda..6b7d03da4897 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrCall.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrCall.td @@ -19,8 +19,8 @@ let Defs = [ARGUMENTS] in { let isCodeGenOnly = 1 in { def ADJCALLSTACKDOWN : I<(outs), (ins i64imm:$amt), [(WebAssemblycallseq_start timm:$amt)]>; -def ADJCALLSTACKUP : I<(outs), (ins i64imm:$amt1, i64imm:$amt2), - [(WebAssemblycallseq_end timm:$amt1, timm:$amt2)]>; +def ADJCALLSTACKUP : I<(outs), (ins i64imm:$amt), + [(WebAssemblycallseq_end timm:$amt, undef)]>; } // isCodeGenOnly = 1 multiclass CALL { diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp index 82296b3cdace..bd06bc396dcd 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp +++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp @@ -28,7 +28,9 @@ using namespace llvm; #include "WebAssemblyGenInstrInfo.inc" WebAssemblyInstrInfo::WebAssemblyInstrInfo(const WebAssemblySubtarget &STI) - : RI(STI.getTargetTriple()) {} + : WebAssemblyGenInstrInfo(WebAssembly::ADJCALLSTACKDOWN, + WebAssembly::ADJCALLSTACKUP), + RI(STI.getTargetTriple()) {} void 
WebAssemblyInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, diff --git a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp index ecbbc5c72243..7abc20a8387e 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -204,6 +204,10 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { continue; unsigned VReg = MO.getReg(); + // Don't stackify physregs like SP or FP. + if (!TargetRegisterInfo::isVirtualRegister(VReg)) + continue; + if (MFI.isVRegStackified(VReg)) { if (MO.isDef()) Stack.push_back(VReg); diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp index 6c74098aff10..f87b547e3f57 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp @@ -67,3 +67,12 @@ WebAssemblyRegisterInfo::getFrameRegister(const MachineFunction &MF) const { const WebAssemblyFrameLowering *TFI = getFrameLowering(MF); return Regs[TFI->hasFP(MF)][TT.isArch64Bit()]; } + +const TargetRegisterClass * +WebAssemblyRegisterInfo::getPointerRegClass(const MachineFunction &MF, + unsigned Kind) const { + assert(Kind == 0 && "Only one kind of pointer on WebAssembly"); + if (MF.getSubtarget().hasAddr64()) + return &WebAssembly::I64RegClass; + return &WebAssembly::I32RegClass; +} diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h index bd1485317160..ad1d71eebf22 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h +++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h @@ -41,6 +41,10 @@ class WebAssemblyRegisterInfo final : public WebAssemblyGenRegisterInfo { // Debug information queries. unsigned getFrameRegister(const MachineFunction &MF) const override; + + const TargetRegisterClass * + getPointerRegClass(const MachineFunction &MF, + unsigned Kind = 0) const override; }; } // end namespace llvm diff --git a/test/CodeGen/WebAssembly/varargs.ll b/test/CodeGen/WebAssembly/varargs.ll new file mode 100644 index 000000000000..ccc7c1f9ce43 --- /dev/null +++ b/test/CodeGen/WebAssembly/varargs.ll @@ -0,0 +1,122 @@ +; RUN: llc < %s -asm-verbose=false | FileCheck %s + +; Test varargs constructs. + +target datalayout = "e-p:32:32-i64:64-n32:64-S128" +target triple = "wasm32-unknown-unknown" + +; Test va_start. + +; TODO: Test va_start. + +;define void @start(i8** %ap, ...) { +;entry: +; %0 = bitcast i8** %ap to i8* +; call void @llvm.va_start(i8* %0) +; ret void +;} + +; Test va_end. + +; CHECK-LABEL: end: +; CHECK-NEXT: .param i32{{$}} +; CHECK-NEXT: return{{$}} +define void @end(i8** %ap) { +entry: + %0 = bitcast i8** %ap to i8* + call void @llvm.va_end(i8* %0) + ret void +} + +; Test va_copy. + +; CHECK-LABEL: copy: +; CHECK-NEXT: .param i32, i32{{$}} +; CHECK-NEXT: i32.load $push0=, $1{{$}} +; CHECK-NEXT: i32.store $discard=, $0, $pop0{{$}} +; CHECK-NEXT: return{{$}} +define void @copy(i8** %ap, i8** %bp) { +entry: + %0 = bitcast i8** %ap to i8* + %1 = bitcast i8** %bp to i8* + call void @llvm.va_copy(i8* %0, i8* %1) + ret void +} + +; Test va_arg with an i8 argument. 
+ +; CHECK-LABEL: arg_i8: +; CHECK-NEXT: .param i32{{$}} +; CHECK-NEXT: .result i32{{$}} +; CHECK-NEXT: .local i32{{$}} +; CHECK-NEXT: i32.load $1=, $0{{$}} +; CHECK-NEXT: i32.const $push0=, 4{{$}} +; CHECK-NEXT: i32.add $push1=, $1, $pop0{{$}} +; CHECK-NEXT: i32.store $discard=, $0, $pop1{{$}} +; CHECK-NEXT: i32.load $push2=, $1{{$}} +; CHECK-NEXT: return $pop2{{$}} +define i8 @arg_i8(i8** %ap) { +entry: + %t = va_arg i8** %ap, i8 + ret i8 %t +} + +; Test va_arg with an i32 argument. + +; CHECK-LABEL: arg_i32: +; CHECK-NEXT: .param i32{{$}} +; CHECK-NEXT: .result i32{{$}} +; CHECK-NEXT: .local i32{{$}} +; CHECK-NEXT: i32.load $push0=, $0{{$}} +; CHECK-NEXT: i32.const $push1=, 3{{$}} +; CHECK-NEXT: i32.add $push2=, $pop0, $pop1{{$}} +; CHECK-NEXT: i32.const $push3=, -4{{$}} +; CHECK-NEXT: i32.and $1=, $pop2, $pop3{{$}} +; CHECK-NEXT: i32.const $push4=, 4{{$}} +; CHECK-NEXT: i32.add $push5=, $1, $pop4{{$}} +; CHECK-NEXT: i32.store $discard=, $0, $pop5{{$}} +; CHECK-NEXT: i32.load $push6=, $1{{$}} +; CHECK-NEXT: return $pop6{{$}} +define i32 @arg_i32(i8** %ap) { +entry: + %t = va_arg i8** %ap, i32 + ret i32 %t +} + +; Test va_arg with an i128 argument. + +; CHECK-LABEL: arg_i128: +; CHECK-NEXT: .param i32, i32{{$}} +; CHECK-NEXT: .local +; CHECK: i32.and +; CHECK: i64.load +; CHECK: i64.load +; CHECK: return{{$}} +define i128 @arg_i128(i8** %ap) { +entry: + %t = va_arg i8** %ap, i128 + ret i128 %t +} + +; Test a varargs call with no actual arguments. + +declare void @callee(...) + +; CHECK-LABEL: caller_none: +; CHECK-NEXT: call callee{{$}} +; CHECK-NEXT: return{{$}} +define void @caller_none() { + call void (...) @callee() + ret void +} + +; TODO: Test a varargs call with actual arguments. + +;define void @caller_some() { +; call void (...) @callee(i32 0, double 2.0) +; ret void +;} + +declare void @llvm.va_start(i8*) +declare void @llvm.va_end(i8*) +declare void @llvm.va_copy(i8*, i8*) From 2174f151dc0d0a5f615786ba30a27dc65f42aa85 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Fri, 4 Dec 2015 23:32:19 +0000 Subject: [PATCH 101/364] Add FeatureLAHFSAHF to amdfam10 as well. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254801 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index dc5ab1bf65d4..7fc4def15fe5 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -558,7 +558,7 @@ def : Proc<"athlon64-sse3", [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA, FeatureSlowSHLD]>; def : Proc<"amdfam10", [FeatureSSE4A, Feature3DNowA, FeatureFXSR, FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT, - FeatureSlowBTMem, FeatureSlowSHLD]>; + FeatureSlowBTMem, FeatureSlowSHLD, FeatureLAHFSAHF]>; def : Proc<"barcelona", [FeatureSSE4A, Feature3DNowA, FeatureFXSR, FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT, FeatureSlowBTMem, FeatureSlowSHLD, FeatureLAHFSAHF]>; From 5143703795b66599df93a9f35462e522f925c0f8 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Fri, 4 Dec 2015 23:40:22 +0000 Subject: [PATCH 102/364] [ThinLTO] Helper for performing renaming/promotion on a module Creates a module and performs necessary renaming/promotion of locals that may be exported to another module. Split out of D15024. 
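As a usage sketch only (the parsing boilerplate and the promoteForThinLTO wrapper below are illustrative assumptions, not part of this patch; the helper itself is the renameModuleForThinLTO declared in the diff that follows), a ThinLTO driver might call the new entry point roughly like this:

#include "llvm/ADT/StringRef.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Linker/Linker.h"
#include "llvm/Support/SourceMgr.h"

using namespace llvm;

// Diagnostic callback; a real tool would print or collect the message.
static void handleDiagnostics(const DiagnosticInfo &DI) {}

// Parse an IR file and hand it to the helper. Returns the renamed/promoted
// copy, or nullptr if parsing or the internal linkInModule step failed.
static std::unique_ptr<Module>
promoteForThinLTO(StringRef Path, const FunctionInfoIndex *Index,
                  LLVMContext &Ctx) {
  SMDiagnostic Err;
  std::unique_ptr<Module> M = parseIRFile(Path, Err, Ctx);
  if (!M)
    return nullptr;
  return renameModuleForThinLTO(M, Index, handleDiagnostics);
}

Errors are reported through the same DiagnosticHandlerFunction that the Linker itself uses, as the implementation in LinkModules.cpp below shows.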
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254802 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Linker/Linker.h | 7 +++++++ lib/Linker/LinkModules.cpp | 12 ++++++++++++ 2 files changed, 19 insertions(+) diff --git a/include/llvm/Linker/Linker.h b/include/llvm/Linker/Linker.h index f0c8ad979ab6..aa4300942947 100644 --- a/include/llvm/Linker/Linker.h +++ b/include/llvm/Linker/Linker.h @@ -99,6 +99,13 @@ class Linker { DiagnosticHandlerFunction DiagnosticHandler; }; +/// Create a new module with exported local functions renamed and promoted +/// for ThinLTO. +std::unique_ptr +renameModuleForThinLTO(std::unique_ptr &M, + const FunctionInfoIndex *Index, + DiagnosticHandlerFunction DiagnosticHandler); + } // End llvm namespace #endif diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp index 88b8e443c489..627137ba3abd 100644 --- a/lib/Linker/LinkModules.cpp +++ b/lib/Linker/LinkModules.cpp @@ -2056,6 +2056,18 @@ bool Linker::linkModules(Module &Dest, Module &Src, return L.linkInModule(Src, Flags); } +std::unique_ptr +llvm::renameModuleForThinLTO(std::unique_ptr &M, + const FunctionInfoIndex *Index, + DiagnosticHandlerFunction DiagnosticHandler) { + std::unique_ptr RenamedModule( + new llvm::Module(M->getModuleIdentifier(), M->getContext())); + Linker L(*RenamedModule.get(), DiagnosticHandler); + if (L.linkInModule(*M.get(), llvm::Linker::Flags::None, Index)) + return nullptr; + return RenamedModule; +} + //===----------------------------------------------------------------------===// // C API. //===----------------------------------------------------------------------===// From f79b7835d821236a302090fa9eb05a3a1cc47c31 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 4 Dec 2015 23:48:19 +0000 Subject: [PATCH 103/364] [PassManager] Ensure destructors of cached AnalysisUsage objects are run In 254760, I introduced the usage of a BumpPtrAllocator for the AnalysisUsage instances held by the PassManger. This turns out to have been incorrect since a BumpPtrAllocator does not run the destructors of objects when deallocating memory. Since a few of our SmallVector's had grown beyond their small size, we end up with some leaked memory. We need to use a SpecificBumpPtrAllocator instead. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254803 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/LegacyPassManagers.h | 2 +- lib/IR/LegacyPassManager.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/llvm/IR/LegacyPassManagers.h b/include/llvm/IR/LegacyPassManagers.h index 418702c0b781..b8e33478d6a9 100644 --- a/include/llvm/IR/LegacyPassManagers.h +++ b/include/llvm/IR/LegacyPassManagers.h @@ -283,7 +283,7 @@ class PMTopLevelManager { // Allocator used for allocating UAFoldingSetNodes. This handles deletion of // all allocated nodes in one fell swoop. - BumpPtrAllocator AUFoldingSetNodeAllocator; + SpecificBumpPtrAllocator AUFoldingSetNodeAllocator; // Maps from a pass to it's associated entry in UniqueAnalysisUsages. Does // not own the storage associated with either key or value.. 
diff --git a/lib/IR/LegacyPassManager.cpp b/lib/IR/LegacyPassManager.cpp index 08e8906e88db..f2e0c7d32c02 100644 --- a/lib/IR/LegacyPassManager.cpp +++ b/lib/IR/LegacyPassManager.cpp @@ -589,7 +589,7 @@ AnalysisUsage *PMTopLevelManager::findAnalysisUsage(Pass *P) { if (auto *N = UniqueAnalysisUsages.FindNodeOrInsertPos(ID, IP)) Node = N; else { - Node = new (AUFoldingSetNodeAllocator) AUFoldingSetNode(AU); + Node = new (AUFoldingSetNodeAllocator.Allocate()) AUFoldingSetNode(AU); UniqueAnalysisUsages.InsertNode(Node, IP); } assert(Node && "cached analysis usage must be non null"); From 357108cbea3d64a0f31b931121d4a381ee6c1a3b Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Sat, 5 Dec 2015 00:06:37 +0000 Subject: [PATCH 104/364] [opt] Fix sanitizer complaints about r254774 `Out` can be null if no output is requested, so move any access to it inside the conditional. Thanks to Justin Bogner for finding this. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254804 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/opt/opt.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp index c1510a7fb259..fc31beb48154 100644 --- a/tools/opt/opt.cpp +++ b/tools/opt/opt.cpp @@ -595,14 +595,16 @@ int main(int argc, char **argv) { SmallVector Buffer; SmallVector CompileTwiceBuffer; std::unique_ptr BOS; - raw_ostream *OS = &Out->os(); - if (RunTwice) { - BOS = make_unique(Buffer); - OS = BOS.get(); - } + raw_ostream *OS = nullptr; // Write bitcode or assembly to the output as the last step... if (!NoOutput && !AnalyzeOnly) { + assert(Out); + OS = &Out->os(); + if (RunTwice) { + BOS = make_unique(Buffer); + OS = BOS.get(); + } if (OutputAssembly) Passes.add(createPrintModulePass(*OS, "", PreserveAssemblyUseListOrder)); else @@ -618,6 +620,7 @@ int main(int argc, char **argv) { // If requested, run all passes again with the same pass manager to catch // bugs caused by persistent state in the passes if (RunTwice) { + assert(Out); CompileTwiceBuffer = Buffer; Buffer.clear(); std::unique_ptr M2(CloneModule(M.get())); From 3817e67f7f0807234396b92d1a8884798da2f6f8 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Sat, 5 Dec 2015 00:18:33 +0000 Subject: [PATCH 105/364] [EarlyCSE] IsSimple vs IsVolatile naming clarification (NFC) When the notion of target specific memory intrinsics was introduced to EarlyCSE, the commit confused the notions of volatile and simple memory access. Since I'm about to start working on this area, cleanup the naming so that patches aren't horribly confusing. Note that the actual implementation was always bailing if the load or store wasn't simple. Reminder: - "volatile" - C++ volatile, can't remove any memory operations, but in principal unordered - "ordered" - imposes ordering constraints on other nearby memory operations - "atomic" - can't be split or sheared. In LLVM terms, all "ordered" operations are also atomic so the predicate "isAtomic" is often used. - "simple" - a load which is none of the above. These are normal loads and what most of the optimizer works with. 
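To make the four buckets concrete, here is a small sketch (not part of the patch) written against the existing LoadInst predicates; StoreInst exposes the same queries:

#include "llvm/IR/Instructions.h"

using namespace llvm;

// Map a load onto the terminology above using the existing IR predicates.
static const char *classifyLoad(const LoadInst &LI) {
  if (LI.isSimple())
    return "simple";           // neither volatile nor atomic
  if (LI.isVolatile())
    return "volatile";         // may additionally be atomic; checked first here
  if (LI.isUnordered())
    return "unordered atomic"; // atomic, but imposes no ordering on neighbors
  return "ordered atomic";     // monotonic or stronger ordering
}

EarlyCSE only performs CSE/DSE on the first bucket, which is what the renamed IsSimple flag now states directly.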
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254805 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/TargetTransformInfo.h | 6 +++-- .../AArch64/AArch64TargetTransformInfo.cpp | 4 ++-- lib/Transforms/Scalar/EarlyCSE.cpp | 22 +++++++++---------- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h index 98458f1c3f3b..35c756b362d6 100644 --- a/include/llvm/Analysis/TargetTransformInfo.h +++ b/include/llvm/Analysis/TargetTransformInfo.h @@ -42,11 +42,13 @@ class Value; /// \brief Information about a load/store intrinsic defined by the target. struct MemIntrinsicInfo { MemIntrinsicInfo() - : ReadMem(false), WriteMem(false), Vol(false), MatchingId(0), + : ReadMem(false), WriteMem(false), IsSimple(false), MatchingId(0), NumMemRefs(0), PtrVal(nullptr) {} bool ReadMem; bool WriteMem; - bool Vol; + /// True only if this memory operation is non-volatile, non-atomic, and + /// unordered. (See LoadInst/StoreInst for details on each) + bool IsSimple; // Same Id is set by the target for corresponding load/store intrinsics. unsigned short MatchingId; int NumMemRefs; diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 81402a854f6a..e803ef949b9d 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -538,7 +538,7 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, case Intrinsic::aarch64_neon_ld4: Info.ReadMem = true; Info.WriteMem = false; - Info.Vol = false; + Info.IsSimple = true; Info.NumMemRefs = 1; Info.PtrVal = Inst->getArgOperand(0); break; @@ -547,7 +547,7 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, case Intrinsic::aarch64_neon_st4: Info.ReadMem = false; Info.WriteMem = true; - Info.Vol = false; + Info.IsSimple = true; Info.NumMemRefs = 1; Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1); break; diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index de539d53a4f5..b055044ba6d0 100644 --- a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -388,8 +388,8 @@ class EarlyCSE { class ParseMemoryInst { public: ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI) - : Load(false), Store(false), Vol(false), MayReadFromMemory(false), - MayWriteToMemory(false), MatchingId(-1), Ptr(nullptr) { + : Load(false), Store(false), IsSimple(true), MayReadFromMemory(false), + MayWriteToMemory(false), MatchingId(-1), Ptr(nullptr) { MayReadFromMemory = Inst->mayReadFromMemory(); MayWriteToMemory = Inst->mayWriteToMemory(); if (IntrinsicInst *II = dyn_cast(Inst)) { @@ -402,22 +402,22 @@ class EarlyCSE { MatchingId = Info.MatchingId; MayReadFromMemory = Info.ReadMem; MayWriteToMemory = Info.WriteMem; - Vol = Info.Vol; + IsSimple = Info.IsSimple; Ptr = Info.PtrVal; } } else if (LoadInst *LI = dyn_cast(Inst)) { Load = true; - Vol = !LI->isSimple(); + IsSimple = LI->isSimple(); Ptr = LI->getPointerOperand(); } else if (StoreInst *SI = dyn_cast(Inst)) { Store = true; - Vol = !SI->isSimple(); + IsSimple = SI->isSimple(); Ptr = SI->getPointerOperand(); } } bool isLoad() const { return Load; } bool isStore() const { return Store; } - bool isVolatile() const { return Vol; } + bool isSimple() const { return IsSimple; } bool isMatchingMemLoc(const ParseMemoryInst &Inst) const { return Ptr == Inst.Ptr && MatchingId == Inst.MatchingId; } @@ -430,7 +430,7 
@@ class EarlyCSE { private: bool Load; bool Store; - bool Vol; + bool IsSimple; bool MayReadFromMemory; bool MayWriteToMemory; // For regular (non-intrinsic) loads/stores, this is set to -1. For @@ -554,8 +554,8 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { ParseMemoryInst MemInst(Inst, TTI); // If this is a non-volatile load, process it. if (MemInst.isValid() && MemInst.isLoad()) { - // Ignore volatile loads. - if (MemInst.isVolatile()) { + // Ignore volatile or ordered loads. + if (!MemInst.isSimple()) { LastStore = nullptr; // Don't CSE across synchronization boundaries. if (Inst->mayWriteToMemory()) @@ -662,8 +662,8 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { MemInst.getPtr(), LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId())); - // Remember that this was the last store we saw for DSE. - if (!MemInst.isVolatile()) + // Remember that this was the last normal store we saw for DSE. + if (MemInst.isSimple()) LastStore = Inst; } } From a9143d4647d9eca6d9ed32e76a56368ed4935969 Mon Sep 17 00:00:00 2001 From: Derek Schuff Date: Sat, 5 Dec 2015 00:26:39 +0000 Subject: [PATCH 106/364] [WebAssembly] Support constant offsets on loads and stores This is just prototype for load/store for i32 types. I'll add them to the rest of the types if we like this direction. Differential Revision: http://reviews.llvm.org/D15197 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254807 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../WebAssembly/WebAssemblyInstrMemory.td | 159 ++++++++++-------- .../WebAssembly/WebAssemblyStoreResults.cpp | 2 +- test/CodeGen/WebAssembly/cfg-stackify.ll | 2 +- test/CodeGen/WebAssembly/global.ll | 2 +- test/CodeGen/WebAssembly/load-ext.ll | 20 +-- test/CodeGen/WebAssembly/load-store-i1.ll | 12 +- test/CodeGen/WebAssembly/load.ll | 8 +- test/CodeGen/WebAssembly/store-results.ll | 6 +- test/CodeGen/WebAssembly/store-trunc.ll | 10 +- test/CodeGen/WebAssembly/store.ll | 8 +- test/CodeGen/WebAssembly/varargs.ll | 16 +- 11 files changed, 128 insertions(+), 117 deletions(-) diff --git a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td index 700a196fa29c..fbb3df2f7b82 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td @@ -25,59 +25,63 @@ let Defs = [ARGUMENTS] in { // Basic load. -def LOAD_I32 : I<(outs I32:$dst), (ins I32:$addr), - [(set I32:$dst, (load I32:$addr))], - "i32.load\t$dst, $addr">; -def LOAD_I64 : I<(outs I64:$dst), (ins I32:$addr), - [(set I64:$dst, (load I32:$addr))], - "i64.load\t$dst, $addr">; -def LOAD_F32 : I<(outs F32:$dst), (ins I32:$addr), - [(set F32:$dst, (load I32:$addr))], - "f32.load\t$dst, $addr">; -def LOAD_F64 : I<(outs F64:$dst), (ins I32:$addr), - [(set F64:$dst, (load I32:$addr))], - "f64.load\t$dst, $addr">; +def LOAD_I32 : I<(outs I32:$dst), (ins I32:$off, I32:$addr), [], + "i32.load\t$dst, $off($addr)">; +def LOAD_I64 : I<(outs I64:$dst), (ins I32:$off, I32:$addr), [], + "i64.load\t$dst, $off($addr)">; +def LOAD_F32 : I<(outs F32:$dst), (ins I32:$off, I32:$addr), [], + "f32.load\t$dst, $off($addr)">; +def LOAD_F64 : I<(outs F64:$dst), (ins I32:$off, I32:$addr), [], + "f64.load\t$dst, $off($addr)">; // Extending load. 
-def LOAD8_S_I32 : I<(outs I32:$dst), (ins I32:$addr), - [(set I32:$dst, (sextloadi8 I32:$addr))], - "i32.load8_s\t$dst, $addr">; -def LOAD8_U_I32 : I<(outs I32:$dst), (ins I32:$addr), - [(set I32:$dst, (zextloadi8 I32:$addr))], - "i32.load8_u\t$dst, $addr">; -def LOAD16_S_I32 : I<(outs I32:$dst), (ins I32:$addr), - [(set I32:$dst, (sextloadi16 I32:$addr))], - "i32.load16_s\t$dst, $addr">; -def LOAD16_U_I32 : I<(outs I32:$dst), (ins I32:$addr), - [(set I32:$dst, (zextloadi16 I32:$addr))], - "i32.load16_u\t$dst, $addr">; -def LOAD8_S_I64 : I<(outs I64:$dst), (ins I32:$addr), - [(set I64:$dst, (sextloadi8 I32:$addr))], - "i64.load8_s\t$dst, $addr">; -def LOAD8_U_I64 : I<(outs I64:$dst), (ins I32:$addr), - [(set I64:$dst, (zextloadi8 I32:$addr))], - "i64.load8_u\t$dst, $addr">; -def LOAD16_S_I64 : I<(outs I64:$dst), (ins I32:$addr), - [(set I64:$dst, (sextloadi16 I32:$addr))], - "i64.load16_s\t$dst, $addr">; -def LOAD16_U_I64 : I<(outs I64:$dst), (ins I32:$addr), - [(set I64:$dst, (zextloadi16 I32:$addr))], - "i64.load16_u\t$dst, $addr">; -def LOAD32_S_I64 : I<(outs I64:$dst), (ins I32:$addr), - [(set I64:$dst, (sextloadi32 I32:$addr))], - "i64.load32_s\t$dst, $addr">; -def LOAD32_U_I64 : I<(outs I64:$dst), (ins I32:$addr), - [(set I64:$dst, (zextloadi32 I32:$addr))], - "i64.load32_u\t$dst, $addr">; +def LOAD8_S_I32 : I<(outs I32:$dst), (ins I32:$off, I32:$addr), [], + "i32.load8_s\t$dst, $off($addr)">; +def LOAD8_U_I32 : I<(outs I32:$dst), (ins I32:$off, I32:$addr), [], + "i32.load8_u\t$dst, $off($addr)">; +def LOAD16_S_I32 : I<(outs I32:$dst), (ins I32:$off, I32:$addr), [], + "i32.load16_s\t$dst, $off($addr)">; +def LOAD16_U_I32 : I<(outs I32:$dst), (ins I32:$off, I32:$addr), [], + "i32.load16_u\t$dst, $off($addr)">; +def LOAD8_S_I64 : I<(outs I64:$dst), (ins I32:$off, I32:$addr), [], + "i64.load8_s\t$dst, $off($addr)">; +def LOAD8_U_I64 : I<(outs I64:$dst), (ins I32:$off, I32:$addr), [], + "i64.load8_u\t$dst, $off($addr)">; +def LOAD16_S_I64 : I<(outs I64:$dst), (ins I32:$off, I32:$addr), [], + "i64.load16_s\t$dst, $off($addr)">; +def LOAD16_U_I64 : I<(outs I64:$dst), (ins I32:$off, I32:$addr), [], + "i64.load16_u\t$dst, $off($addr)">; +def LOAD32_S_I64 : I<(outs I64:$dst), (ins I32:$off, I32:$addr), [], + "i64.load32_s\t$dst, $off($addr)">; +def LOAD32_U_I64 : I<(outs I64:$dst), (ins I32:$off, I32:$addr), [], + "i64.load32_u\t$dst, $off($addr)">; } // Defs = [ARGUMENTS] +// Select loads with no constant offset. +def : Pat<(i32 (load I32:$addr)), (LOAD_I32 0, $addr)>; +def : Pat<(i64 (load I32:$addr)), (LOAD_I64 0, $addr)>; +def : Pat<(f32 (load I32:$addr)), (LOAD_F32 0, $addr)>; +def : Pat<(f64 (load I32:$addr)), (LOAD_F64 0, $addr)>; + +// Select extending loads with no constant offset. +def : Pat<(i32 (sextloadi8 I32:$addr)), (LOAD8_S_I32 0, $addr)>; +def : Pat<(i32 (zextloadi8 I32:$addr)), (LOAD8_U_I32 0, $addr)>; +def : Pat<(i32 (sextloadi16 I32:$addr)), (LOAD16_S_I32 0, $addr)>; +def : Pat<(i32 (zextloadi16 I32:$addr)), (LOAD16_U_I32 0, $addr)>; +def : Pat<(i64 (sextloadi8 I32:$addr)), (LOAD8_S_I64 0, $addr)>; +def : Pat<(i64 (zextloadi8 I32:$addr)), (LOAD8_U_I64 0, $addr)>; +def : Pat<(i64 (sextloadi16 I32:$addr)), (LOAD16_S_I64 0, $addr)>; +def : Pat<(i64 (zextloadi16 I32:$addr)), (LOAD16_U_I64 0, $addr)>; +def : Pat<(i64 (sextloadi32 I32:$addr)), (LOAD32_S_I64 0, $addr)>; +def : Pat<(i64 (zextloadi32 I32:$addr)), (LOAD32_U_I64 0, $addr)>; + // "Don't care" extending load become zero-extending load. 
-def : Pat<(i32 (extloadi8 I32:$addr)), (LOAD8_U_I32 $addr)>; -def : Pat<(i32 (extloadi16 I32:$addr)), (LOAD16_U_I32 $addr)>; -def : Pat<(i64 (extloadi8 I32:$addr)), (LOAD8_U_I64 $addr)>; -def : Pat<(i64 (extloadi16 I32:$addr)), (LOAD16_U_I64 $addr)>; -def : Pat<(i64 (extloadi32 I32:$addr)), (LOAD32_U_I64 $addr)>; +def : Pat<(i32 (extloadi8 I32:$addr)), (LOAD8_U_I32 0, $addr)>; +def : Pat<(i32 (extloadi16 I32:$addr)), (LOAD16_U_I32 0, $addr)>; +def : Pat<(i64 (extloadi8 I32:$addr)), (LOAD8_U_I64 0, $addr)>; +def : Pat<(i64 (extloadi16 I32:$addr)), (LOAD16_U_I64 0, $addr)>; +def : Pat<(i64 (extloadi32 I32:$addr)), (LOAD32_U_I64 0, $addr)>; let Defs = [ARGUMENTS] in { @@ -87,48 +91,55 @@ let Defs = [ARGUMENTS] in { // instruction definition patterns that don't reference all of the output // operands. // Note: WebAssembly inverts SelectionDAG's usual operand order. -def STORE_I32 : I<(outs I32:$dst), (ins I32:$addr, I32:$val), [], - "i32.store\t$dst, $addr, $val">; -def STORE_I64 : I<(outs I64:$dst), (ins I32:$addr, I64:$val), [], - "i64.store\t$dst, $addr, $val">; -def STORE_F32 : I<(outs F32:$dst), (ins I32:$addr, F32:$val), [], - "f32.store\t$dst, $addr, $val">; -def STORE_F64 : I<(outs F64:$dst), (ins I32:$addr, F64:$val), [], - "f64.store\t$dst, $addr, $val">; +def STORE_I32 : I<(outs I32:$dst), (ins I32:$off, I32:$addr, I32:$val), [], + "i32.store\t$dst, $off($addr), $val">; +def STORE_I64 : I<(outs I64:$dst), (ins I32:$off, I32:$addr, I64:$val), [], + "i64.store\t$dst, $off($addr), $val">; +def STORE_F32 : I<(outs F32:$dst), (ins I32:$off, I32:$addr, F32:$val), [], + "f32.store\t$dst, $off($addr), $val">; +def STORE_F64 : I<(outs F64:$dst), (ins I32:$off, I32:$addr, F64:$val), [], + "f64.store\t$dst, $off($addr), $val">; } // Defs = [ARGUMENTS] -def : Pat<(store I32:$val, I32:$addr), (STORE_I32 I32:$addr, I32:$val)>; -def : Pat<(store I64:$val, I32:$addr), (STORE_I64 I32:$addr, I64:$val)>; -def : Pat<(store F32:$val, I32:$addr), (STORE_F32 I32:$addr, F32:$val)>; -def : Pat<(store F64:$val, I32:$addr), (STORE_F64 I32:$addr, F64:$val)>; +def : Pat<(store I32:$val, I32:$addr), (STORE_I32 0, I32:$addr, I32:$val)>; +def : Pat<(store I64:$val, I32:$addr), (STORE_I64 0, I32:$addr, I64:$val)>; +def : Pat<(store F32:$val, I32:$addr), (STORE_F32 0, I32:$addr, F32:$val)>; +def : Pat<(store F64:$val, I32:$addr), (STORE_F64 0, I32:$addr, F64:$val)>; + +// FIXME: This pattern matches an immediate to actually use the offset field +// in the store instruction; however only unsigned offsets are supported in +// wasm, so we need to constrain the immediate we match. This may require +// custom code rather than a simple pattern. +// def : Pat<(store I32:$val, (add I32:$addr, (i32 imm:$off))), +// (STORE_I32 imm:$off, I32:$addr, I32:$val)>; let Defs = [ARGUMENTS] in { // Truncating store. 
-def STORE8_I32 : I<(outs I32:$dst), (ins I32:$addr, I32:$val), [], - "i32.store8\t$dst, $addr, $val">; -def STORE16_I32 : I<(outs I32:$dst), (ins I32:$addr, I32:$val), [], - "i32.store16\t$dst, $addr, $val">; -def STORE8_I64 : I<(outs I64:$dst), (ins I32:$addr, I64:$val), [], - "i64.store8\t$dst, $addr, $val">; -def STORE16_I64 : I<(outs I64:$dst), (ins I32:$addr, I64:$val), [], - "i64.store16\t$dst, $addr, $val">; -def STORE32_I64 : I<(outs I64:$dst), (ins I32:$addr, I64:$val), [], - "i64.store32\t$dst, $addr, $val">; +def STORE8_I32 : I<(outs I32:$dst), (ins I32:$off, I32:$addr, I32:$val), [], + "i32.store8\t$dst, $off($addr), $val">; +def STORE16_I32 : I<(outs I32:$dst), (ins I32:$off, I32:$addr, I32:$val), [], + "i32.store16\t$dst, $off($addr), $val">; +def STORE8_I64 : I<(outs I64:$dst), (ins I32:$off, I32:$addr, I64:$val), [], + "i64.store8\t$dst, $off($addr), $val">; +def STORE16_I64 : I<(outs I64:$dst), (ins I32:$off, I32:$addr, I64:$val), [], + "i64.store16\t$dst, $off($addr), $val">; +def STORE32_I64 : I<(outs I64:$dst), (ins I32:$off, I32:$addr, I64:$val), [], + "i64.store32\t$dst, $off($addr), $val">; } // Defs = [ARGUMENTS] def : Pat<(truncstorei8 I32:$val, I32:$addr), - (STORE8_I32 I32:$addr, I32:$val)>; + (STORE8_I32 0, I32:$addr, I32:$val)>; def : Pat<(truncstorei16 I32:$val, I32:$addr), - (STORE16_I32 I32:$addr, I32:$val)>; + (STORE16_I32 0, I32:$addr, I32:$val)>; def : Pat<(truncstorei8 I64:$val, I32:$addr), - (STORE8_I64 I32:$addr, I64:$val)>; + (STORE8_I64 0, I32:$addr, I64:$val)>; def : Pat<(truncstorei16 I64:$val, I32:$addr), - (STORE16_I64 I32:$addr, I64:$val)>; + (STORE16_I64 0, I32:$addr, I64:$val)>; def : Pat<(truncstorei32 I64:$val, I32:$addr), - (STORE32_I64 I32:$addr, I64:$val)>; + (STORE32_I64 0, I32:$addr, I64:$val)>; let Defs = [ARGUMENTS] in { diff --git a/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp b/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp index 4a8fc09878c4..b67453bee708 100644 --- a/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp +++ b/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp @@ -88,7 +88,7 @@ bool WebAssemblyStoreResults::runOnMachineFunction(MachineFunction &MF) { case WebAssembly::STORE_I32: case WebAssembly::STORE_I64: unsigned ToReg = MI.getOperand(0).getReg(); - unsigned FromReg = MI.getOperand(2).getReg(); + unsigned FromReg = MI.getOperand(3).getReg(); for (auto I = MRI.use_begin(FromReg), E = MRI.use_end(); I != E;) { MachineOperand &O = *I++; MachineInstr *Where = O.getParent(); diff --git a/test/CodeGen/WebAssembly/cfg-stackify.ll b/test/CodeGen/WebAssembly/cfg-stackify.ll index 4a53f6e4bb93..b8ac48bf49dc 100644 --- a/test/CodeGen/WebAssembly/cfg-stackify.ll +++ b/test/CodeGen/WebAssembly/cfg-stackify.ll @@ -184,7 +184,7 @@ entry: ; CHECK-LABEL: minimal_loop: ; CHECK-NOT: br ; CHECK: BB7_1: -; CHECK: i32.store $discard=, $0, $pop{{[0-9]+}}{{$}} +; CHECK: i32.store $discard=, 0($0), $pop{{[0-9]+}}{{$}} ; CHECK: br BB7_1{{$}} ; CHECK: BB7_2: define i32 @minimal_loop(i32* %p) { diff --git a/test/CodeGen/WebAssembly/global.ll b/test/CodeGen/WebAssembly/global.ll index 818c454a4914..ffc73e3c1e35 100644 --- a/test/CodeGen/WebAssembly/global.ll +++ b/test/CodeGen/WebAssembly/global.ll @@ -11,7 +11,7 @@ target triple = "wasm32-unknown-unknown" ; CHECK: foo: ; CHECK: i32.const $push0=, answer{{$}} -; CHECK-NEXT: i32.load $push1=, $pop0{{$}} +; CHECK-NEXT: i32.load $push1=, 0($pop0){{$}} ; CHECK-NEXT: return $pop1{{$}} define i32 @foo() { %a = load i32, i32* @answer diff --git a/test/CodeGen/WebAssembly/load-ext.ll 
b/test/CodeGen/WebAssembly/load-ext.ll index bdccfff1d161..0ffcd38a8666 100644 --- a/test/CodeGen/WebAssembly/load-ext.ll +++ b/test/CodeGen/WebAssembly/load-ext.ll @@ -6,7 +6,7 @@ target datalayout = "e-p:32:32-i64:64-n32:64-S128" target triple = "wasm32-unknown-unknown" ; CHECK-LABEL: sext_i8_i32: -; CHECK: i32.load8_s $push0=, $0{{$}} +; CHECK: i32.load8_s $push0=, 0($0){{$}} ; CHECK-NEXT: return $pop0{{$}} define i32 @sext_i8_i32(i8 *%p) { %v = load i8, i8* %p @@ -15,7 +15,7 @@ define i32 @sext_i8_i32(i8 *%p) { } ; CHECK-LABEL: zext_i8_i32: -; CHECK: i32.load8_u $push0=, $0{{$}} +; CHECK: i32.load8_u $push0=, 0($0){{$}} ; CHECK-NEXT: return $pop0{{$}} define i32 @zext_i8_i32(i8 *%p) { %v = load i8, i8* %p @@ -24,7 +24,7 @@ define i32 @zext_i8_i32(i8 *%p) { } ; CHECK-LABEL: sext_i16_i32: -; CHECK: i32.load16_s $push0=, $0{{$}} +; CHECK: i32.load16_s $push0=, 0($0){{$}} ; CHECK-NEXT: return $pop0{{$}} define i32 @sext_i16_i32(i16 *%p) { %v = load i16, i16* %p @@ -33,7 +33,7 @@ define i32 @sext_i16_i32(i16 *%p) { } ; CHECK-LABEL: zext_i16_i32: -; CHECK: i32.load16_u $push0=, $0{{$}} +; CHECK: i32.load16_u $push0=, 0($0){{$}} ; CHECK-NEXT: return $pop0{{$}} define i32 @zext_i16_i32(i16 *%p) { %v = load i16, i16* %p @@ -42,7 +42,7 @@ define i32 @zext_i16_i32(i16 *%p) { } ; CHECK-LABEL: sext_i8_i64: -; CHECK: i64.load8_s $push0=, $0{{$}} +; CHECK: i64.load8_s $push0=, 0($0){{$}} ; CHECK-NEXT: return $pop0{{$}} define i64 @sext_i8_i64(i8 *%p) { %v = load i8, i8* %p @@ -51,7 +51,7 @@ define i64 @sext_i8_i64(i8 *%p) { } ; CHECK-LABEL: zext_i8_i64: -; CHECK: i64.load8_u $push0=, $0{{$}} +; CHECK: i64.load8_u $push0=, 0($0){{$}} ; CHECK-NEXT: return $pop0{{$}} define i64 @zext_i8_i64(i8 *%p) { %v = load i8, i8* %p @@ -60,7 +60,7 @@ define i64 @zext_i8_i64(i8 *%p) { } ; CHECK-LABEL: sext_i16_i64: -; CHECK: i64.load16_s $push0=, $0{{$}} +; CHECK: i64.load16_s $push0=, 0($0){{$}} ; CHECK-NEXT: return $pop0{{$}} define i64 @sext_i16_i64(i16 *%p) { %v = load i16, i16* %p @@ -69,7 +69,7 @@ define i64 @sext_i16_i64(i16 *%p) { } ; CHECK-LABEL: zext_i16_i64: -; CHECK: i64.load16_u $push0=, $0{{$}} +; CHECK: i64.load16_u $push0=, 0($0){{$}} ; CHECK-NEXT: return $pop0{{$}} define i64 @zext_i16_i64(i16 *%p) { %v = load i16, i16* %p @@ -78,7 +78,7 @@ define i64 @zext_i16_i64(i16 *%p) { } ; CHECK-LABEL: sext_i32_i64: -; CHECK: i64.load32_s $push0=, $0{{$}} +; CHECK: i64.load32_s $push0=, 0($0){{$}} ; CHECK-NEXT: return $pop0{{$}} define i64 @sext_i32_i64(i32 *%p) { %v = load i32, i32* %p @@ -87,7 +87,7 @@ define i64 @sext_i32_i64(i32 *%p) { } ; CHECK-LABEL: zext_i32_i64: -; CHECK: i64.load32_u $push0=, $0{{$}} +; CHECK: i64.load32_u $push0=, 0($0){{$}} ; CHECK: return $pop0{{$}} define i64 @zext_i32_i64(i32 *%p) { %v = load i32, i32* %p diff --git a/test/CodeGen/WebAssembly/load-store-i1.ll b/test/CodeGen/WebAssembly/load-store-i1.ll index 33d3aeecc582..1acdfc0dbdeb 100644 --- a/test/CodeGen/WebAssembly/load-store-i1.ll +++ b/test/CodeGen/WebAssembly/load-store-i1.ll @@ -6,7 +6,7 @@ target datalayout = "e-p:32:32-i64:64-n32:64-S128" target triple = "wasm32-unknown-unknown" ; CHECK-LABEL: load_u_i1_i32: -; CHECK: i32.load8_u $push[[NUM0:[0-9]+]]=, $0{{$}} +; CHECK: i32.load8_u $push[[NUM0:[0-9]+]]=, 0($0){{$}} ; CHECK-NEXT: return $pop[[NUM0]]{{$}} define i32 @load_u_i1_i32(i1* %p) { %v = load i1, i1* %p @@ -15,7 +15,7 @@ define i32 @load_u_i1_i32(i1* %p) { } ; CHECK-LABEL: load_s_i1_i32: -; CHECK: i32.load8_u $push[[NUM0:[0-9]+]]=, $0{{$}} +; CHECK: i32.load8_u $push[[NUM0:[0-9]+]]=, 0($0){{$}} ; CHECK-NEXT: 
i32.const $[[NUM1:[0-9]+]]=, 31{{$}} ; CHECK-NEXT: shl $push[[NUM2:[0-9]+]]=, $pop[[NUM0]], $[[NUM1]]{{$}} ; CHECK-NEXT: shr_s $push[[NUM3:[0-9]+]]=, $pop[[NUM2]], $[[NUM1]]{{$}} @@ -27,7 +27,7 @@ define i32 @load_s_i1_i32(i1* %p) { } ; CHECK-LABEL: load_u_i1_i64: -; CHECK: i64.load8_u $push[[NUM0:[0-9]+]]=, $0{{$}} +; CHECK: i64.load8_u $push[[NUM0:[0-9]+]]=, 0($0){{$}} ; CHECK-NEXT: return $pop[[NUM0]]{{$}} define i64 @load_u_i1_i64(i1* %p) { %v = load i1, i1* %p @@ -36,7 +36,7 @@ define i64 @load_u_i1_i64(i1* %p) { } ; CHECK-LABEL: load_s_i1_i64: -; CHECK: i64.load8_u $push[[NUM0:[0-9]+]]=, $0{{$}} +; CHECK: i64.load8_u $push[[NUM0:[0-9]+]]=, 0($0){{$}} ; CHECK-NEXT: i64.const $[[NUM1:[0-9]+]]=, 63{{$}} ; CHECK-NEXT: shl $push[[NUM2:[0-9]+]]=, $pop[[NUM0]], $[[NUM1]]{{$}} ; CHECK-NEXT: shr_s $push[[NUM3:[0-9]+]]=, $pop[[NUM2]], $[[NUM1]]{{$}} @@ -50,7 +50,7 @@ define i64 @load_s_i1_i64(i1* %p) { ; CHECK-LABEL: store_i32_i1: ; CHECK: i32.const $push[[NUM0:[0-9]+]]=, 1{{$}} ; CHECK-NEXT: i32.and $push[[NUM1:[0-9]+]]=, $1, $pop[[NUM0]]{{$}} -; CHECK-NEXT: i32.store8 $discard=, $0, $pop[[NUM1]]{{$}} +; CHECK-NEXT: i32.store8 $discard=, 0($0), $pop[[NUM1]]{{$}} define void @store_i32_i1(i1* %p, i32 %v) { %t = trunc i32 %v to i1 store i1 %t, i1* %p @@ -60,7 +60,7 @@ define void @store_i32_i1(i1* %p, i32 %v) { ; CHECK-LABEL: store_i64_i1: ; CHECK: i64.const $push[[NUM0:[0-9]+]]=, 1{{$}} ; CHECK-NEXT: i64.and $push[[NUM1:[0-9]+]]=, $1, $pop[[NUM0]]{{$}} -; CHECK-NEXT: i64.store8 $discard=, $0, $pop[[NUM1]]{{$}} +; CHECK-NEXT: i64.store8 $discard=, 0($0), $pop[[NUM1]]{{$}} define void @store_i64_i1(i1* %p, i64 %v) { %t = trunc i64 %v to i1 store i1 %t, i1* %p diff --git a/test/CodeGen/WebAssembly/load.ll b/test/CodeGen/WebAssembly/load.ll index 1017167d5227..aa8ae689e0d1 100644 --- a/test/CodeGen/WebAssembly/load.ll +++ b/test/CodeGen/WebAssembly/load.ll @@ -8,7 +8,7 @@ target triple = "wasm32-unknown-unknown" ; CHECK-LABEL: ldi32: ; CHECK-NEXT: .param i32{{$}} ; CHECK-NEXT: .result i32{{$}} -; CHECK-NEXT: i32.load $push[[NUM:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: i32.load $push[[NUM:[0-9]+]]=, 0($0){{$}} ; CHECK-NEXT: return $pop[[NUM]]{{$}} define i32 @ldi32(i32 *%p) { %v = load i32, i32* %p @@ -18,7 +18,7 @@ define i32 @ldi32(i32 *%p) { ; CHECK-LABEL: ldi64: ; CHECK-NEXT: .param i32{{$}} ; CHECK-NEXT: .result i64{{$}} -; CHECK-NEXT: i64.load $push[[NUM:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: i64.load $push[[NUM:[0-9]+]]=, 0($0){{$}} ; CHECK-NEXT: return $pop[[NUM]]{{$}} define i64 @ldi64(i64 *%p) { %v = load i64, i64* %p @@ -28,7 +28,7 @@ define i64 @ldi64(i64 *%p) { ; CHECK-LABEL: ldf32: ; CHECK-NEXT: .param i32{{$}} ; CHECK-NEXT: .result f32{{$}} -; CHECK-NEXT: f32.load $push[[NUM:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: f32.load $push[[NUM:[0-9]+]]=, 0($0){{$}} ; CHECK-NEXT: return $pop[[NUM]]{{$}} define float @ldf32(float *%p) { %v = load float, float* %p @@ -38,7 +38,7 @@ define float @ldf32(float *%p) { ; CHECK-LABEL: ldf64: ; CHECK-NEXT: .param i32{{$}} ; CHECK-NEXT: .result f64{{$}} -; CHECK-NEXT: f64.load $push[[NUM:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: f64.load $push[[NUM:[0-9]+]]=, 0($0){{$}} ; CHECK-NEXT: return $pop[[NUM]]{{$}} define double @ldf64(double *%p) { %v = load double, double* %p diff --git a/test/CodeGen/WebAssembly/store-results.ll b/test/CodeGen/WebAssembly/store-results.ll index c05ed3a04be3..5bb7eafa4b1a 100644 --- a/test/CodeGen/WebAssembly/store-results.ll +++ b/test/CodeGen/WebAssembly/store-results.ll @@ -9,7 +9,7 @@ target triple = "wasm32-unknown-unknown" ; CHECK-LABEL: 
single_block: ; CHECK-NOT: .local ; CHECK: i32.const $push{{[0-9]+}}=, 0 -; CHECK: i32.store $push[[STORE:[0-9]+]]=, $0, $pop{{[0-9]+}} +; CHECK: i32.store $push[[STORE:[0-9]+]]=, 0($0), $pop{{[0-9]+}} ; CHECK: return $pop[[STORE]]{{$}} define i32 @single_block(i32* %p) { entry: @@ -26,7 +26,7 @@ entry: @pos = global %class.Vec3 zeroinitializer, align 4 ; CHECK-LABEL: foo: -; CHECK: i32.store $discard=, $pop0, $0 +; CHECK: i32.store $discard=, 0($pop0), $0 define void @foo() { for.body.i: br label %for.body5.i @@ -44,7 +44,7 @@ for.cond.cleanup4.i: } ; CHECK-LABEL: bar: -; CHECK: i32.store $discard=, $0, $pop0 +; CHECK: i32.store $discard=, 0($0), $pop0 define void @bar() { for.body.i: br label %for.body5.i diff --git a/test/CodeGen/WebAssembly/store-trunc.ll b/test/CodeGen/WebAssembly/store-trunc.ll index e3587a5ff170..c12b716dfd59 100644 --- a/test/CodeGen/WebAssembly/store-trunc.ll +++ b/test/CodeGen/WebAssembly/store-trunc.ll @@ -6,7 +6,7 @@ target datalayout = "e-p:32:32-i64:64-n32:64-S128" target triple = "wasm32-unknown-unknown" ; CHECK-LABEL: trunc_i8_i32: -; CHECK: i32.store8 $discard=, $0, $1{{$}} +; CHECK: i32.store8 $discard=, 0($0), $1{{$}} define void @trunc_i8_i32(i8 *%p, i32 %v) { %t = trunc i32 %v to i8 store i8 %t, i8* %p @@ -14,7 +14,7 @@ define void @trunc_i8_i32(i8 *%p, i32 %v) { } ; CHECK-LABEL: trunc_i16_i32: -; CHECK: i32.store16 $discard=, $0, $1{{$}} +; CHECK: i32.store16 $discard=, 0($0), $1{{$}} define void @trunc_i16_i32(i16 *%p, i32 %v) { %t = trunc i32 %v to i16 store i16 %t, i16* %p @@ -22,7 +22,7 @@ define void @trunc_i16_i32(i16 *%p, i32 %v) { } ; CHECK-LABEL: trunc_i8_i64: -; CHECK: i64.store8 $discard=, $0, $1{{$}} +; CHECK: i64.store8 $discard=, 0($0), $1{{$}} define void @trunc_i8_i64(i8 *%p, i64 %v) { %t = trunc i64 %v to i8 store i8 %t, i8* %p @@ -30,7 +30,7 @@ define void @trunc_i8_i64(i8 *%p, i64 %v) { } ; CHECK-LABEL: trunc_i16_i64: -; CHECK: i64.store16 $discard=, $0, $1{{$}} +; CHECK: i64.store16 $discard=, 0($0), $1{{$}} define void @trunc_i16_i64(i16 *%p, i64 %v) { %t = trunc i64 %v to i16 store i16 %t, i16* %p @@ -38,7 +38,7 @@ define void @trunc_i16_i64(i16 *%p, i64 %v) { } ; CHECK-LABEL: trunc_i32_i64: -; CHECK: i64.store32 $discard=, $0, $1{{$}} +; CHECK: i64.store32 $discard=, 0($0), $1{{$}} define void @trunc_i32_i64(i32 *%p, i64 %v) { %t = trunc i64 %v to i32 store i32 %t, i32* %p diff --git a/test/CodeGen/WebAssembly/store.ll b/test/CodeGen/WebAssembly/store.ll index a2164d4ae6a9..442caedef3a7 100644 --- a/test/CodeGen/WebAssembly/store.ll +++ b/test/CodeGen/WebAssembly/store.ll @@ -7,7 +7,7 @@ target triple = "wasm32-unknown-unknown" ; CHECK-LABEL: sti32: ; CHECK-NEXT: .param i32, i32{{$}} -; CHECK-NEXT: i32.store $discard=, $0, $1{{$}} +; CHECK-NEXT: i32.store $discard=, 0($0), $1{{$}} ; CHECK-NEXT: return{{$}} define void @sti32(i32 *%p, i32 %v) { store i32 %v, i32* %p @@ -16,7 +16,7 @@ define void @sti32(i32 *%p, i32 %v) { ; CHECK-LABEL: sti64: ; CHECK-NEXT: .param i32, i64{{$}} -; CHECK-NEXT: i64.store $discard=, $0, $1{{$}} +; CHECK-NEXT: i64.store $discard=, 0($0), $1{{$}} ; CHECK-NEXT: return{{$}} define void @sti64(i64 *%p, i64 %v) { store i64 %v, i64* %p @@ -25,7 +25,7 @@ define void @sti64(i64 *%p, i64 %v) { ; CHECK-LABEL: stf32: ; CHECK-NEXT: .param i32, f32{{$}} -; CHECK-NEXT: f32.store $discard=, $0, $1{{$}} +; CHECK-NEXT: f32.store $discard=, 0($0), $1{{$}} ; CHECK-NEXT: return{{$}} define void @stf32(float *%p, float %v) { store float %v, float* %p @@ -34,7 +34,7 @@ define void @stf32(float *%p, float %v) { ; 
CHECK-LABEL: stf64: ; CHECK-NEXT: .param i32, f64{{$}} -; CHECK-NEXT: f64.store $discard=, $0, $1{{$}} +; CHECK-NEXT: f64.store $discard=, 0($0), $1{{$}} ; CHECK-NEXT: return{{$}} define void @stf64(double *%p, double %v) { store double %v, double* %p diff --git a/test/CodeGen/WebAssembly/varargs.ll b/test/CodeGen/WebAssembly/varargs.ll index ccc7c1f9ce43..bda0dd779e65 100644 --- a/test/CodeGen/WebAssembly/varargs.ll +++ b/test/CodeGen/WebAssembly/varargs.ll @@ -32,8 +32,8 @@ entry: ; CHECK-LABEL: copy: ; CHECK-NEXT: .param i32, i32{{$}} -; CHECK-NEXT: i32.load $push0=, $1{{$}} -; CHECK-NEXT: i32.store $discard=, $0, $pop0{{$}} +; CHECK-NEXT: i32.load $push0=, 0($1){{$}} +; CHECK-NEXT: i32.store $discard=, 0($0), $pop0{{$}} ; CHECK-NEXT: return{{$}} define void @copy(i8** %ap, i8** %bp) { entry: @@ -49,11 +49,11 @@ entry: ; CHECK-NEXT: .param i32{{$}} ; CHECK-NEXT: .result i32{{$}} ; CHECK-NEXT: .local i32{{$}} -; CHECK-NEXT: i32.load $1=, $0{{$}} +; CHECK-NEXT: i32.load $1=, 0($0){{$}} ; CHECK-NEXT: i32.const $push0=, 4{{$}} ; CHECK-NEXT: i32.add $push1=, $1, $pop0{{$}} -; CHECK-NEXT: i32.store $discard=, $0, $pop1{{$}} -; CHECK-NEXT: i32.load $push2=, $1{{$}} +; CHECK-NEXT: i32.store $discard=, 0($0), $pop1{{$}} +; CHECK-NEXT: i32.load $push2=, 0($1){{$}} ; CHECK-NEXT: return $pop2{{$}} define i8 @arg_i8(i8** %ap) { entry: @@ -67,15 +67,15 @@ entry: ; CHECK-NEXT: .param i32{{$}} ; CHECK-NEXT: .result i32{{$}} ; CHECK-NEXT: .local i32{{$}} -; CHECK-NEXT: i32.load $push0=, $0{{$}} +; CHECK-NEXT: i32.load $push0=, 0($0){{$}} ; CHECK-NEXT: i32.const $push1=, 3{{$}} ; CHECK-NEXT: i32.add $push2=, $pop0, $pop1{{$}} ; CHECK-NEXT: i32.const $push3=, -4{{$}} ; CHECK-NEXT: i32.and $1=, $pop2, $pop3{{$}} ; CHECK-NEXT: i32.const $push4=, 4{{$}} ; CHECK-NEXT: i32.add $push5=, $1, $pop4{{$}} -; CHECK-NEXT: i32.store $discard=, $0, $pop5{{$}} -; CHECK-NEXT: i32.load $push6=, $1{{$}} +; CHECK-NEXT: i32.store $discard=, 0($0), $pop5{{$}} +; CHECK-NEXT: i32.load $push6=, 0($1){{$}} ; CHECK-NEXT: return $pop6{{$}} define i32 @arg_i32(i8** %ap) { entry: From 83eafdfffcd7fc709e4738213c3355d9b2d933b9 Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Sat, 5 Dec 2015 00:39:14 +0000 Subject: [PATCH 107/364] CodeGen: Let the BumpPtrAllocator free the elements of indexList The indexList's nodes are all allocated on a BumpPtrAllocator, so it's more efficient to let them be freed when it goes away, rather than deleting them directly. This is a follow up to r254794. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254808 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/CodeGen/SlotIndexes.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/llvm/CodeGen/SlotIndexes.h b/include/llvm/CodeGen/SlotIndexes.h index 5dc42e20debe..7b621bee259f 100644 --- a/include/llvm/CodeGen/SlotIndexes.h +++ b/include/llvm/CodeGen/SlotIndexes.h @@ -376,6 +376,11 @@ namespace llvm { initializeSlotIndexesPass(*PassRegistry::getPassRegistry()); } + ~SlotIndexes() { + // The indexList's nodes are all allocated in the BumpPtrAllocator. 
+ indexList.clearAndLeakNodesUnsafely(); + } + void getAnalysisUsage(AnalysisUsage &au) const override; void releaseMemory() override; From ca526959c1fc8d8172b780d916cdeb3f03a33836 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Sat, 5 Dec 2015 00:51:40 +0000 Subject: [PATCH 108/364] [WebAssembly] Fix scheduling dependencies in register-stackified code Add physical register defs to instructions used from stackified instructions to prevent them from being scheduled into the middle of a stack sequence. This is a conservative measure which may be loosened in the future. Differential Revision: http://reviews.llvm.org/D15252 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254811 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../WebAssembly/WebAssemblyRegStackify.cpp | 51 +++++++++++++++---- test/CodeGen/WebAssembly/load-store-i1.ll | 8 +-- test/CodeGen/WebAssembly/reg-stackify.ll | 28 ++++++++++ 3 files changed, 74 insertions(+), 13 deletions(-) diff --git a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp index 7abc20a8387e..ac016a7b9b0a 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -61,15 +61,41 @@ FunctionPass *llvm::createWebAssemblyRegStackify() { } // Decorate the given instruction with implicit operands that enforce the -// expression stack ordering constraints. -static void ImposeStackOrdering(MachineInstr *MI) { - // Read and write the opaque EXPR_STACK register. - MI->addOperand(MachineOperand::CreateReg(WebAssembly::EXPR_STACK, - /*isDef=*/true, - /*isImp=*/true)); +// expression stack ordering constraints needed for an instruction which is +// consumed by an instruction using the expression stack. +static void ImposeStackInputOrdering(MachineInstr *MI) { + // Write the opaque EXPR_STACK register. + if (!MI->definesRegister(WebAssembly::EXPR_STACK)) + MI->addOperand(MachineOperand::CreateReg(WebAssembly::EXPR_STACK, + /*isDef=*/true, + /*isImp=*/true)); +} + +// Decorate the given instruction with implicit operands that enforce the +// expression stack ordering constraints for an instruction which is on +// the expression stack. +static void ImposeStackOrdering(MachineInstr *MI, MachineRegisterInfo &MRI) { + ImposeStackInputOrdering(MI); + + // Also read the opaque EXPR_STACK register. MI->addOperand(MachineOperand::CreateReg(WebAssembly::EXPR_STACK, /*isDef=*/false, /*isImp=*/true)); + + // Also, mark any inputs to this instruction as being consumed by an + // instruction on the expression stack. + // TODO: Find a lighter way to describe the appropriate constraints. + for (MachineOperand &MO : MI->uses()) { + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + MachineInstr *Def = MRI.getVRegDef(Reg); + if (Def->getOpcode() == TargetOpcode::PHI) + continue; + ImposeStackInputOrdering(Def); + } } // Test whether it's safe to move Def to just before Insert. Note that this @@ -126,8 +152,15 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { continue; unsigned Reg = Op.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + // An instruction with a physical register. Conservatively mark it as + // an expression stack input so that it isn't reordered with anything + // in an expression stack which might use it (physical registers + // aren't in SSA form so it's not trivial to determine this). 
+ // TODO: Be less conservative. + ImposeStackInputOrdering(Insert); continue; + } // Only consider registers with a single definition. // TODO: Eventually we may relax this, to stackify phi transfers. @@ -178,11 +211,11 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { MBB.insert(MachineBasicBlock::instr_iterator(Insert), Def->removeFromParent()); MFI.stackifyVReg(Reg); - ImposeStackOrdering(Def); + ImposeStackOrdering(Def, MRI); Insert = Def; } if (AnyStackified) - ImposeStackOrdering(&MI); + ImposeStackOrdering(&MI, MRI); } } diff --git a/test/CodeGen/WebAssembly/load-store-i1.ll b/test/CodeGen/WebAssembly/load-store-i1.ll index 1acdfc0dbdeb..37b514729479 100644 --- a/test/CodeGen/WebAssembly/load-store-i1.ll +++ b/test/CodeGen/WebAssembly/load-store-i1.ll @@ -15,8 +15,8 @@ define i32 @load_u_i1_i32(i1* %p) { } ; CHECK-LABEL: load_s_i1_i32: -; CHECK: i32.load8_u $push[[NUM0:[0-9]+]]=, 0($0){{$}} -; CHECK-NEXT: i32.const $[[NUM1:[0-9]+]]=, 31{{$}} +; CHECK: i32.const $[[NUM1:[0-9]+]]=, 31{{$}} +; CHECK-NEXT: i32.load8_u $push[[NUM0:[0-9]+]]=, 0($0){{$}} ; CHECK-NEXT: shl $push[[NUM2:[0-9]+]]=, $pop[[NUM0]], $[[NUM1]]{{$}} ; CHECK-NEXT: shr_s $push[[NUM3:[0-9]+]]=, $pop[[NUM2]], $[[NUM1]]{{$}} ; CHECK-NEXT: return $pop[[NUM3]]{{$}} @@ -36,8 +36,8 @@ define i64 @load_u_i1_i64(i1* %p) { } ; CHECK-LABEL: load_s_i1_i64: -; CHECK: i64.load8_u $push[[NUM0:[0-9]+]]=, 0($0){{$}} -; CHECK-NEXT: i64.const $[[NUM1:[0-9]+]]=, 63{{$}} +; CHECK: i64.const $[[NUM1:[0-9]+]]=, 63{{$}} +; CHECK-NEXT: i64.load8_u $push[[NUM0:[0-9]+]]=, 0($0){{$}} ; CHECK-NEXT: shl $push[[NUM2:[0-9]+]]=, $pop[[NUM0]], $[[NUM1]]{{$}} ; CHECK-NEXT: shr_s $push[[NUM3:[0-9]+]]=, $pop[[NUM2]], $[[NUM1]]{{$}} ; CHECK-NEXT: return $pop[[NUM3]]{{$}} diff --git a/test/CodeGen/WebAssembly/reg-stackify.ll b/test/CodeGen/WebAssembly/reg-stackify.ll index f3000aab70ad..180d70e2e4a7 100644 --- a/test/CodeGen/WebAssembly/reg-stackify.ll +++ b/test/CodeGen/WebAssembly/reg-stackify.ll @@ -44,4 +44,32 @@ define i32 @yes1(i32* %q) { ret i32 %t } +; Don't schedule stack uses into the stack. To reduce register pressure, the +; scheduler might be tempted to move the definition of $2 down. However, this +; would risk getting incorrect liveness if the instructions are later +; rearranged to make the stack contiguous. + +; CHECK-LABEL: stack_uses: +; CHECK-NEXT: .param i32{{$}} +; CHECK-NEXT: .result i32{{$}} +; CHECK-NEXT: local i32, i32{{$}} +; CHECK-NEXT: i32.const $1=, 1{{$}} +; CHECK-NEXT: i32.const $2=, 0{{$}} +; CHECK-NEXT: i32.and $push0=, $0, $1{{$}} +; CHECK-NEXT: i32.eq $push1=, $pop0, $2{{$}} +; CHECK-NEXT: block BB4_2{{$}} +; CHECK-NEXT: br_if $pop1, BB4_2{{$}} +; CHECK-NEXT: return $2{{$}} +; CHECK-NEXT: BB4_2:{{$}} +; CHECK-NEXT: return $1{{$}} +define i32 @stack_uses(i32 %x) { +entry: + %c = trunc i32 %x to i1 + br i1 %c, label %true, label %false +true: + ret i32 0 +false: + ret i32 1 +} + !0 = !{} From c5cf58b8a73357b0c4309f1e8dd10ba0f72accd2 Mon Sep 17 00:00:00 2001 From: Cong Hou Date: Sat, 5 Dec 2015 01:00:22 +0000 Subject: [PATCH 109/364] Fix a typo in LoopVectorize.cpp. NFC. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254813 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index c5b8b5b073d6..917f2d55f6cb 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5188,7 +5188,7 @@ LoopVectorizationCostModel::calculateRegisterUsage( continue; } - // Count the number of live intevals. + // Count the number of live intervals. unsigned RegUsage = 0; for (auto Inst : OpenIntervals) RegUsage += GetRegUsage(Inst->getType(), VFs[j]); From 517455ae01552736121c387b27362e2f0244de7e Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Sat, 5 Dec 2015 01:02:53 +0000 Subject: [PATCH 110/364] [MC] Add a test for state reset in MCMachOStreamer This was fixed in r254751, but untestable until r254774, which added the necessary command line flag to llc. Add a test now to make sure this doesn't regress again. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254814 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/MC/MachO/empty-twice.ll | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 test/MC/MachO/empty-twice.ll diff --git a/test/MC/MachO/empty-twice.ll b/test/MC/MachO/empty-twice.ll new file mode 100644 index 000000000000..6914c73a58d1 --- /dev/null +++ b/test/MC/MachO/empty-twice.ll @@ -0,0 +1,12 @@ +; Check that there is no persistent state in the MachO emitter that crashes +; us when reusing the pass manager. +; RUN: llc -mtriple=x86_64-apple-darwin -compile-twice -filetype=obj %s -o - + +; Force the creation of a DWARF section +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "LLVM", isOptimized: true) +!1 = !DIFile(filename: "", directory: "/") +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} From 3687382a3485127a701ef31dabed4a32cf231276 Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Sat, 5 Dec 2015 01:38:12 +0000 Subject: [PATCH 111/364] [opt] Fix run-twice option for non-idempotent passes Cloning the module was supposed to guard against the possibility that the passes may be non-idempotent. However, for some reason I decided to put that AFTER the passes had already run on the module, defeating the point entirely. Fix that by moving up the CloneModule as is done in llc. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254819 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/opt/opt.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp index fc31beb48154..fe1605aa8436 100644 --- a/tools/opt/opt.cpp +++ b/tools/opt/opt.cpp @@ -614,22 +614,26 @@ int main(int argc, char **argv) { // Before executing passes, print the final values of the LLVM options. cl::PrintOptionValues(); + // If requested, run all passes again with the same pass manager to catch + // bugs caused by persistent state in the passes + if (RunTwice) { + std::unique_ptr M2(CloneModule(M.get())); + Passes.run(*M2); + CompileTwiceBuffer = Buffer; + Buffer.clear(); + } + // Now that we have all of the passes ready, run them. 
Passes.run(*M); - // If requested, run all passes again with the same pass manager to catch - // bugs caused by persistent state in the passes + // Compare the two outputs and make sure they're the same if (RunTwice) { assert(Out); - CompileTwiceBuffer = Buffer; - Buffer.clear(); - std::unique_ptr M2(CloneModule(M.get())); - Passes.run(*M2); if (Buffer.size() != CompileTwiceBuffer.size() || (memcmp(Buffer.data(), CompileTwiceBuffer.data(), Buffer.size()) != 0)) { errs() << "Running the pass manager twice changed the output.\n" - "Writing the result of the second run to the specified output." + "Writing the result of the second run to the specified output.\n" "To generate the one-run comparison binary, just run without\n" "the compile-twice option\n"; Out->os() << BOS->str(); From 830355b8521ba547c5bbb7e5225f362af62db0e8 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Sat, 5 Dec 2015 01:44:20 +0000 Subject: [PATCH 112/364] Whitespace. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254821 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/ExecutionEngine/Orc/IndirectionUtils.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h b/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h index 3bfff059110c..aa75b3f46b4a 100644 --- a/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h +++ b/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h @@ -202,7 +202,7 @@ class LocalJITCompileCallbackManager : public JITCompileCallbackManager { sys::Memory::MF_READ | sys::Memory::MF_EXEC); assert(!EC && "Failed to mprotect trampoline block"); - + TrampolineBlocks.push_back(std::move(TrampolineBlock)); } @@ -404,7 +404,7 @@ void moveGlobalVariableInitializer(GlobalVariable &OrigGV, ValueMaterializer *Materializer = nullptr, GlobalVariable *NewGV = nullptr); -/// @brief Clone +/// @brief Clone GlobalAlias* cloneGlobalAliasDecl(Module &Dst, const GlobalAlias &OrigA, ValueToValueMapTy &VMap); From d33fc70ecf45feddde1f7675730ff170c017d650 Mon Sep 17 00:00:00 2001 From: Kostya Serebryany Date: Sat, 5 Dec 2015 02:09:22 +0000 Subject: [PATCH 113/364] [libFuzzer] don't reload the corpus more than once every second git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254824 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Fuzzer/FuzzerLoop.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/lib/Fuzzer/FuzzerLoop.cpp b/lib/Fuzzer/FuzzerLoop.cpp index ca7f82b55607..889c30c87489 100644 --- a/lib/Fuzzer/FuzzerLoop.cpp +++ b/lib/Fuzzer/FuzzerLoop.cpp @@ -468,10 +468,15 @@ void Fuzzer::Drill() { } void Fuzzer::Loop() { + system_clock::time_point LastCorpusReload = system_clock::now(); while (true) { size_t J1 = ChooseUnitIdxToMutate();; SyncCorpus(); - RereadOutputCorpus(); + auto Now = system_clock::now(); + if (duration_cast(Now - LastCorpusReload).count()) { + RereadOutputCorpus(); + LastCorpusReload = Now; + } if (TotalNumberOfRuns >= Options.MaxNumberOfRuns) break; if (Options.MaxTotalTimeSec > 0 && From 35be75843e0a6e2b7f6431b7d960379380aa345d Mon Sep 17 00:00:00 2001 From: Kostya Serebryany Date: Sat, 5 Dec 2015 02:23:49 +0000 Subject: [PATCH 114/364] [libFuzzer] one more trophie git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254825 91177308-0d34-0410-b5e6-96231b3b80d8 --- docs/LibFuzzer.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/LibFuzzer.rst b/docs/LibFuzzer.rst index eb79b2e490c4..e14ea67d652c 100644 --- a/docs/LibFuzzer.rst +++ b/docs/LibFuzzer.rst @@ -483,6 +483,8 @@ Trophies * 
`Python `_ +* OpenSSL/BoringSSL: `[1] `_ + * `Libxml2 `_ From 8893466777ecbe91ff69206072bc95a3e38e1a0d Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Sat, 5 Dec 2015 03:03:35 +0000 Subject: [PATCH 115/364] [WebAssembly] Implement ReverseBranchCondition, and re-enable MachineBlockPlacement This patch introduces a codegen-only instruction currently named br_unless, which makes it convenient to implement ReverseBranchCondition and re-enable the MachineBlockPlacement pass. Then in a late pass, it lowers br_unless back into br_if. Differential Revision: http://reviews.llvm.org/D14995 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254826 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/WebAssembly/CMakeLists.txt | 1 + lib/Target/WebAssembly/WebAssembly.h | 1 + .../WebAssembly/WebAssemblyInstrControl.td | 19 ++- .../WebAssembly/WebAssemblyInstrInfo.cpp | 32 +++-- .../WebAssembly/WebAssemblyLowerBrUnless.cpp | 133 ++++++++++++++++++ .../WebAssembly/WebAssemblyPeephole.cpp | 2 +- .../WebAssembly/WebAssemblyTargetMachine.cpp | 7 +- test/CodeGen/WebAssembly/cfg-stackify.ll | 125 ++++++++++++++-- test/CodeGen/WebAssembly/reg-stackify.ll | 41 ++++-- test/CodeGen/WebAssembly/switch.ll | 5 +- 10 files changed, 321 insertions(+), 45 deletions(-) create mode 100644 lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp diff --git a/lib/Target/WebAssembly/CMakeLists.txt b/lib/Target/WebAssembly/CMakeLists.txt index 6a5894958e32..5d1a27a6f093 100644 --- a/lib/Target/WebAssembly/CMakeLists.txt +++ b/lib/Target/WebAssembly/CMakeLists.txt @@ -18,6 +18,7 @@ add_llvm_target(WebAssemblyCodeGen WebAssemblyISelDAGToDAG.cpp WebAssemblyISelLowering.cpp WebAssemblyInstrInfo.cpp + WebAssemblyLowerBrUnless.cpp WebAssemblyMachineFunctionInfo.cpp WebAssemblyMCInstLower.cpp WebAssemblyOptimizeReturned.cpp diff --git a/lib/Target/WebAssembly/WebAssembly.h b/lib/Target/WebAssembly/WebAssembly.h index 001f9f9d4a72..6705b22a376f 100644 --- a/lib/Target/WebAssembly/WebAssembly.h +++ b/lib/Target/WebAssembly/WebAssembly.h @@ -32,6 +32,7 @@ FunctionPass *createWebAssemblyStoreResults(); FunctionPass *createWebAssemblyRegStackify(); FunctionPass *createWebAssemblyRegColoring(); FunctionPass *createWebAssemblyCFGStackify(); +FunctionPass *createWebAssemblyLowerBrUnless(); FunctionPass *createWebAssemblyRegNumbering(); FunctionPass *createWebAssemblyPeephole(); diff --git a/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/lib/Target/WebAssembly/WebAssemblyInstrControl.td index 840f7d669314..708d902e99e1 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrControl.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrControl.td @@ -15,9 +15,13 @@ let Defs = [ARGUMENTS] in { let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in { -def BR_IF : I<(outs), (ins I32:$a, bb_op:$dst), - [(brcond I32:$a, bb:$dst)], - "br_if \t$a, $dst">; +// The condition operand is a boolean value which WebAssembly represents as i32. 
+def BR_IF : I<(outs), (ins I32:$cond, bb_op:$dst), + [(brcond I32:$cond, bb:$dst)], + "br_if \t$cond, $dst">; +let isCodeGenOnly = 1 in +def BR_UNLESS : I<(outs), (ins I32:$cond, bb_op:$dst), [], + "br_unless\t$cond, $dst">; let isBarrier = 1 in { def BR : I<(outs), (ins bb_op:$dst), [(br bb:$dst)], @@ -25,6 +29,15 @@ def BR : I<(outs), (ins bb_op:$dst), } // isBarrier = 1 } // isBranch = 1, isTerminator = 1, hasCtrlDep = 1 +} // Defs = [ARGUMENTS] + +def : Pat<(brcond (i32 (setne I32:$cond, 0)), bb:$dst), + (BR_IF I32:$cond, bb_op:$dst)>; +def : Pat<(brcond (i32 (seteq I32:$cond, 0)), bb:$dst), + (BR_UNLESS I32:$cond, bb_op:$dst)>; + +let Defs = [ARGUMENTS] in { + // TODO: SelectionDAG's lowering insists on using a pointer as the index for // jump tables, so in practice we don't ever use TABLESWITCH_I64 in wasm32 mode // currently. diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp index bd06bc396dcd..3b219f4a901a 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp +++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp @@ -71,6 +71,15 @@ bool WebAssemblyInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, case WebAssembly::BR_IF: if (HaveCond) return true; + Cond.push_back(MachineOperand::CreateImm(true)); + Cond.push_back(MI.getOperand(0)); + TBB = MI.getOperand(1).getMBB(); + HaveCond = true; + break; + case WebAssembly::BR_UNLESS: + if (HaveCond) + return true; + Cond.push_back(MachineOperand::CreateImm(false)); Cond.push_back(MI.getOperand(0)); TBB = MI.getOperand(1).getMBB(); HaveCond = true; @@ -113,8 +122,6 @@ unsigned WebAssemblyInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *FBB, ArrayRef Cond, DebugLoc DL) const { - assert(Cond.size() <= 1); - if (Cond.empty()) { if (!TBB) return 0; @@ -123,7 +130,17 @@ unsigned WebAssemblyInstrInfo::InsertBranch(MachineBasicBlock &MBB, return 1; } - BuildMI(&MBB, DL, get(WebAssembly::BR_IF)).addOperand(Cond[0]).addMBB(TBB); + assert(Cond.size() == 2 && "Expected a flag and a successor block"); + + if (Cond[0].getImm()) { + BuildMI(&MBB, DL, get(WebAssembly::BR_IF)) + .addOperand(Cond[1]) + .addMBB(TBB); + } else { + BuildMI(&MBB, DL, get(WebAssembly::BR_UNLESS)) + .addOperand(Cond[1]) + .addMBB(TBB); + } if (!FBB) return 1; @@ -133,10 +150,7 @@ unsigned WebAssemblyInstrInfo::InsertBranch(MachineBasicBlock &MBB, bool WebAssemblyInstrInfo::ReverseBranchCondition( SmallVectorImpl &Cond) const { - assert(Cond.size() == 1); - - // TODO: Add branch reversal here... And re-enable MachineBlockPlacementID - // when we do. - - return true; + assert(Cond.size() == 2 && "Expected a flag and a successor block"); + Cond.front() = MachineOperand::CreateImm(!Cond.front().getImm()); + return false; } diff --git a/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp b/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp new file mode 100644 index 000000000000..846f6eb1e5cf --- /dev/null +++ b/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp @@ -0,0 +1,133 @@ +//===-- WebAssemblyLowerBrUnless.cpp - Lower br_unless --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file lowers br_unless into br_if with an inverted condition. +/// +/// br_unless is not currently in the spec, but it's very convenient for LLVM +/// to use. 
This pass allows LLVM to use it, for now. +/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "WebAssemblyMachineFunctionInfo.h" +#include "WebAssemblySubtarget.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-lower-br_unless" + +namespace { +class WebAssemblyLowerBrUnless final : public MachineFunctionPass { + const char *getPassName() const override { + return "WebAssembly Lower br_unless"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + +public: + static char ID; // Pass identification, replacement for typeid + WebAssemblyLowerBrUnless() : MachineFunctionPass(ID) {} +}; +} // end anonymous namespace + +char WebAssemblyLowerBrUnless::ID = 0; +FunctionPass *llvm::createWebAssemblyLowerBrUnless() { + return new WebAssemblyLowerBrUnless(); +} + +bool WebAssemblyLowerBrUnless::runOnMachineFunction(MachineFunction &MF) { + DEBUG(dbgs() << "********** Lowering br_unless **********\n" + "********** Function: " + << MF.getName() << '\n'); + + auto &MFI = *MF.getInfo(); + const auto &TII = *MF.getSubtarget().getInstrInfo(); + auto &MRI = MF.getRegInfo(); + + for (auto &MBB : MF) { + for (auto MII = MBB.begin(); MII != MBB.end(); ) { + MachineInstr *MI = &*MII++; + if (MI->getOpcode() != WebAssembly::BR_UNLESS) + continue; + + unsigned Cond = MI->getOperand(0).getReg(); + bool Inverted = false; + + // Attempt to invert the condition in place. 
+ if (MFI.isVRegStackified(Cond)) { + assert(MRI.hasOneDef(Cond)); + MachineInstr *Def = MRI.getVRegDef(Cond); + switch (Def->getOpcode()) { + using namespace WebAssembly; + case EQ_I32: Def->setDesc(TII.get(NE_I32)); Inverted = true; break; + case NE_I32: Def->setDesc(TII.get(EQ_I32)); Inverted = true; break; + case GT_S_I32: Def->setDesc(TII.get(LE_S_I32)); Inverted = true; break; + case GE_S_I32: Def->setDesc(TII.get(LT_S_I32)); Inverted = true; break; + case LT_S_I32: Def->setDesc(TII.get(GE_S_I32)); Inverted = true; break; + case LE_S_I32: Def->setDesc(TII.get(GT_S_I32)); Inverted = true; break; + case GT_U_I32: Def->setDesc(TII.get(LE_U_I32)); Inverted = true; break; + case GE_U_I32: Def->setDesc(TII.get(LT_U_I32)); Inverted = true; break; + case LT_U_I32: Def->setDesc(TII.get(GE_U_I32)); Inverted = true; break; + case LE_U_I32: Def->setDesc(TII.get(GT_U_I32)); Inverted = true; break; + case EQ_I64: Def->setDesc(TII.get(NE_I64)); Inverted = true; break; + case NE_I64: Def->setDesc(TII.get(EQ_I64)); Inverted = true; break; + case GT_S_I64: Def->setDesc(TII.get(LE_S_I64)); Inverted = true; break; + case GE_S_I64: Def->setDesc(TII.get(LT_S_I64)); Inverted = true; break; + case LT_S_I64: Def->setDesc(TII.get(GE_S_I64)); Inverted = true; break; + case LE_S_I64: Def->setDesc(TII.get(GT_S_I64)); Inverted = true; break; + case GT_U_I64: Def->setDesc(TII.get(LE_U_I64)); Inverted = true; break; + case GE_U_I64: Def->setDesc(TII.get(LT_U_I64)); Inverted = true; break; + case LT_U_I64: Def->setDesc(TII.get(GE_U_I64)); Inverted = true; break; + case LE_U_I64: Def->setDesc(TII.get(GT_U_I64)); Inverted = true; break; + case EQ_F32: Def->setDesc(TII.get(NE_F32)); Inverted = true; break; + case NE_F32: Def->setDesc(TII.get(EQ_F32)); Inverted = true; break; + case EQ_F64: Def->setDesc(TII.get(NE_F64)); Inverted = true; break; + case NE_F64: Def->setDesc(TII.get(EQ_F64)); Inverted = true; break; + default: break; + } + } + + // If we weren't able to invert the condition in place. Insert an + // expression to invert it. + if (!Inverted) { + unsigned ZeroReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass); + MFI.stackifyVReg(ZeroReg); + BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::CONST_I32), ZeroReg) + .addImm(0); + unsigned Tmp = MRI.createVirtualRegister(&WebAssembly::I32RegClass); + MFI.stackifyVReg(Tmp); + BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::EQ_I32), Tmp) + .addReg(Cond) + .addReg(ZeroReg); + Cond = Tmp; + Inverted = true; + } + + // The br_unless condition has now been inverted. Insert a br_if and + // delete the br_unless. + assert(Inverted); + BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::BR_IF)) + .addReg(Cond) + .addMBB(MI->getOperand(1).getMBB()); + MBB.erase(MI); + } + } + + return true; +} diff --git a/lib/Target/WebAssembly/WebAssemblyPeephole.cpp b/lib/Target/WebAssembly/WebAssemblyPeephole.cpp index e149d9cce719..11f44775b5ea 100644 --- a/lib/Target/WebAssembly/WebAssemblyPeephole.cpp +++ b/lib/Target/WebAssembly/WebAssemblyPeephole.cpp @@ -64,7 +64,7 @@ bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) { // can use $discard instead. 
MachineOperand &MO = MI.getOperand(0); unsigned OldReg = MO.getReg(); - if (OldReg == MI.getOperand(2).getReg()) { + if (OldReg == MI.getOperand(3).getReg()) { unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg)); MO.setReg(NewReg); MO.setIsDead(); diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index 29fb89c6e18f..b54699243bd4 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -171,10 +171,6 @@ void WebAssemblyPassConfig::addPostRegAlloc() { // Fails with: should be run after register allocation. disablePass(&MachineCopyPropagationID); - // TODO: Until we get ReverseBranchCondition support, MachineBlockPlacement - // can create ugly-looking control flow. - disablePass(&MachineBlockPlacementID); - // Run the register coloring pass to reduce the total number of registers. addPass(createWebAssemblyRegColoring()); } @@ -183,6 +179,9 @@ void WebAssemblyPassConfig::addPreEmitPass() { // Put the CFG in structured form; insert BLOCK and LOOP markers. addPass(createWebAssemblyCFGStackify()); + // Lower br_unless into br_if. + addPass(createWebAssemblyLowerBrUnless()); + // Create a mapping from LLVM CodeGen virtual registers to wasm registers. addPass(createWebAssemblyRegNumbering()); diff --git a/test/CodeGen/WebAssembly/cfg-stackify.ll b/test/CodeGen/WebAssembly/cfg-stackify.ll index b8ac48bf49dc..c615ebb0db9d 100644 --- a/test/CodeGen/WebAssembly/cfg-stackify.ll +++ b/test/CodeGen/WebAssembly/cfg-stackify.ll @@ -1,4 +1,5 @@ -; RUN: llc < %s -asm-verbose=false | FileCheck %s +; RUN: llc < %s -asm-verbose=false -disable-block-placement | FileCheck %s +; RUN: llc < %s -asm-verbose=false | FileCheck -check-prefix=OPT %s ; Test the CFG stackifier pass. 
@@ -12,10 +13,21 @@ declare void @something() ; CHECK-LABEL: test0: ; CHECK: loop ; CHECK: i32.add +; CHECK-NOT: br ; CHECK: br_if +; CHECK-NOT: br ; CHECK: call ; CHECK: br BB0_1{{$}} ; CHECK: return{{$}} +; OPT-LABEL: test0: +; OPT: loop +; OPT: i32.add +; OPT-NOT: br +; OPT: br_if +; OPT-NOT: br +; OPT: call +; OPT: br BB0_1{{$}} +; OPT: return{{$}} define void @test0(i32 %n) { entry: br label %header @@ -40,10 +52,21 @@ back: ; CHECK-LABEL: test1: ; CHECK: loop ; CHECK: i32.add +; CHECK-NOT: br ; CHECK: br_if +; CHECK-NOT: br ; CHECK: call ; CHECK: br BB1_1{{$}} ; CHECK: return{{$}} +; OPT-LABEL: test1: +; OPT: loop +; OPT: i32.add +; OPT-NOT: br +; OPT: br_if +; OPT-NOT: br +; OPT: call +; OPT: br BB1_1{{$}} +; OPT: return{{$}} define void @test1(i32 %n) { entry: br label %header @@ -69,9 +92,16 @@ back: ; CHECK: block BB2_2{{$}} ; CHECK: br_if {{.*}}, BB2_2{{$}} ; CHECK: BB2_1: -; CHECK: br_if $pop{{[0-9]+}}, BB2_1{{$}} +; CHECK: br_if ${{[0-9]+}}, BB2_1{{$}} ; CHECK: BB2_2: ; CHECK: return{{$}} +; OPT-LABEL: test2: +; OPT: block BB2_2{{$}} +; OPT: br_if {{.*}}, BB2_2{{$}} +; OPT: BB2_1: +; OPT: br_if ${{[0-9]+}}, BB2_1{{$}} +; OPT: BB2_2: +; OPT: return{{$}} define void @test2(double* nocapture %p, i32 %n) { entry: %cmp.4 = icmp sgt i32 %n, 0 @@ -100,13 +130,23 @@ for.end: ; CHECK-LABEL: doublediamond: ; CHECK: block BB3_5{{$}} ; CHECK: block BB3_2{{$}} -; CHECK: br_if $pop{{[0-9]+}}, BB3_2{{$}} +; CHECK: br_if $0, BB3_2{{$}} ; CHECK: block BB3_4{{$}} -; CHECK: br_if $pop{{[0-9]+}}, BB3_4{{$}} +; CHECK: br_if $1, BB3_4{{$}} ; CHECK: br BB3_5{{$}} ; CHECK: BB3_4: ; CHECK: BB3_5: ; CHECK: return ${{[0-9]+}}{{$}} +; OPT-LABEL: doublediamond: +; OPT: block BB3_5{{$}} +; OPT: block BB3_4{{$}} +; OPT: br_if {{.*}}, BB3_4{{$}} +; OPT: block BB3_3{{$}} +; OPT: br_if {{.*}}, BB3_3{{$}} +; OPT: br BB3_5{{$}} +; OPT: BB3_4: +; OPT: BB3_5: +; OPT: return ${{[0-9]+}}{{$}} define i32 @doublediamond(i32 %a, i32 %b, i32* %p) { entry: %c = icmp eq i32 %a, 0 @@ -132,9 +172,14 @@ exit: ; CHECK-LABEL: triangle: ; CHECK: block BB4_2{{$}} -; CHECK: br_if $pop{{[0-9]+}}, BB4_2{{$}} +; CHECK: br_if $1, BB4_2{{$}} ; CHECK: BB4_2: ; CHECK: return ${{[0-9]+}}{{$}} +; OPT-LABEL: triangle: +; OPT: block BB4_2{{$}} +; OPT: br_if $1, BB4_2{{$}} +; OPT: BB4_2: +; OPT: return ${{[0-9]+}}{{$}} define i32 @triangle(i32* %p, i32 %a) { entry: %c = icmp eq i32 %a, 0 @@ -151,11 +196,19 @@ exit: ; CHECK-LABEL: diamond: ; CHECK: block BB5_3{{$}} ; CHECK: block BB5_2{{$}} -; CHECK: br_if $pop{{[0-9]+}}, BB5_2{{$}} +; CHECK: br_if $1, BB5_2{{$}} ; CHECK: br BB5_3{{$}} ; CHECK: BB5_2: ; CHECK: BB5_3: ; CHECK: return ${{[0-9]+}}{{$}} +; OPT-LABEL: diamond: +; OPT: block BB5_3{{$}} +; OPT: block BB5_2{{$}} +; OPT: br_if {{.*}}, BB5_2{{$}} +; OPT: br BB5_3{{$}} +; OPT: BB5_2: +; OPT: BB5_3: +; OPT: return ${{[0-9]+}}{{$}} define i32 @diamond(i32* %p, i32 %a) { entry: %c = icmp eq i32 %a, 0 @@ -175,6 +228,9 @@ exit: ; CHECK-LABEL: single_block: ; CHECK-NOT: br ; CHECK: return $pop{{[0-9]+}}{{$}} +; OPT-LABEL: single_block: +; OPT-NOT: br +; OPT: return $pop{{[0-9]+}}{{$}} define i32 @single_block(i32* %p) { entry: store volatile i32 0, i32* %p @@ -187,6 +243,12 @@ entry: ; CHECK: i32.store $discard=, 0($0), $pop{{[0-9]+}}{{$}} ; CHECK: br BB7_1{{$}} ; CHECK: BB7_2: +; OPT-LABEL: minimal_loop: +; OPT-NOT: br +; OPT: BB7_1: +; OPT: i32.store $discard=, 0($0), $pop{{[0-9]+}}{{$}} +; OPT: br BB7_1{{$}} +; OPT: BB7_2: define i32 @minimal_loop(i32* %p) { entry: store volatile i32 0, i32* %p @@ -203,6 +265,13 @@ loop: ; CHECK: br_if 
$pop{{[0-9]+}}, BB8_1{{$}} ; CHECK: BB8_2: ; CHECK: return ${{[0-9]+}}{{$}} +; OPT-LABEL: simple_loop: +; OPT-NOT: br +; OPT: BB8_1: +; OPT: loop BB8_2{{$}} +; OPT: br_if {{.*}}, BB8_1{{$}} +; OPT: BB8_2: +; OPT: return ${{[0-9]+}}{{$}} define i32 @simple_loop(i32* %p, i32 %a) { entry: %c = icmp eq i32 %a, 0 @@ -218,12 +287,20 @@ exit: ; CHECK-LABEL: doubletriangle: ; CHECK: block BB9_4{{$}} -; CHECK: br_if $pop{{[0-9]+}}, BB9_4{{$}} +; CHECK: br_if $0, BB9_4{{$}} ; CHECK: block BB9_3{{$}} -; CHECK: br_if $pop{{[0-9]+}}, BB9_3{{$}} +; CHECK: br_if $1, BB9_3{{$}} ; CHECK: BB9_3: ; CHECK: BB9_4: ; CHECK: return ${{[0-9]+}}{{$}} +; OPT-LABEL: doubletriangle: +; OPT: block BB9_4{{$}} +; OPT: br_if $0, BB9_4{{$}} +; OPT: block BB9_3{{$}} +; OPT: br_if $1, BB9_3{{$}} +; OPT: BB9_3: +; OPT: BB9_4: +; OPT: return ${{[0-9]+}}{{$}} define i32 @doubletriangle(i32 %a, i32 %b, i32* %p) { entry: %c = icmp eq i32 %a, 0 @@ -247,12 +324,21 @@ exit: ; CHECK-LABEL: ifelse_earlyexits: ; CHECK: block BB10_4{{$}} ; CHECK: block BB10_2{{$}} -; CHECK: br_if $pop{{[0-9]+}}, BB10_2{{$}} +; CHECK: br_if $0, BB10_2{{$}} ; CHECK: br BB10_4{{$}} ; CHECK: BB10_2: -; CHECK: br_if $pop{{[0-9]+}}, BB10_4{{$}} +; CHECK: br_if $1, BB10_4{{$}} ; CHECK: BB10_4: ; CHECK: return ${{[0-9]+}}{{$}} +; OPT-LABEL: ifelse_earlyexits: +; OPT: block BB10_4{{$}} +; OPT: block BB10_3{{$}} +; OPT: br_if {{.*}}, BB10_3{{$}} +; OPT: br_if $1, BB10_4{{$}} +; OPT: br BB10_4{{$}} +; OPT: BB10_3: +; OPT: BB10_4: +; OPT: return ${{[0-9]+}}{{$}} define i32 @ifelse_earlyexits(i32 %a, i32 %b, i32* %p) { entry: %c = icmp eq i32 %a, 0 @@ -278,16 +364,31 @@ exit: ; CHECK: loop BB11_7{{$}} ; CHECK: block BB11_6{{$}} ; CHECK: block BB11_3{{$}} -; CHECK: br_if $pop{{.*}}, BB11_3{{$}} +; CHECK: br_if $0, BB11_3{{$}} ; CHECK: br BB11_6{{$}} ; CHECK: BB11_3: ; CHECK: block BB11_5{{$}} -; CHECK: br_if $pop{{.*}}, BB11_5{{$}} +; CHECK: br_if $1, BB11_5{{$}} ; CHECK: br BB11_6{{$}} ; CHECK: BB11_5: ; CHECK: BB11_6: ; CHECK: br BB11_1{{$}} ; CHECK: BB11_7: +; OPT-LABEL: doublediamond_in_a_loop: +; OPT: BB11_1: +; OPT: loop BB11_7{{$}} +; OPT: block BB11_6{{$}} +; OPT: block BB11_5{{$}} +; OPT: br_if {{.*}}, BB11_5{{$}} +; OPT: block BB11_4{{$}} +; OPT: br_if {{.*}}, BB11_4{{$}} +; OPT: br BB11_6{{$}} +; OPT: BB11_4: +; OPT: br BB11_6{{$}} +; OPT: BB11_5: +; OPT: BB11_6: +; OPT: br BB11_1{{$}} +; OPT: BB11_7: define i32 @doublediamond_in_a_loop(i32 %a, i32 %b, i32* %p) { entry: br label %header diff --git a/test/CodeGen/WebAssembly/reg-stackify.ll b/test/CodeGen/WebAssembly/reg-stackify.ll index 180d70e2e4a7..af4a3501531b 100644 --- a/test/CodeGen/WebAssembly/reg-stackify.ll +++ b/test/CodeGen/WebAssembly/reg-stackify.ll @@ -50,22 +50,35 @@ define i32 @yes1(i32* %q) { ; rearranged to make the stack contiguous. 
; CHECK-LABEL: stack_uses: -; CHECK-NEXT: .param i32{{$}} +; CHECK-NEXT: .param i32, i32, i32, i32{{$}} ; CHECK-NEXT: .result i32{{$}} -; CHECK-NEXT: local i32, i32{{$}} -; CHECK-NEXT: i32.const $1=, 1{{$}} -; CHECK-NEXT: i32.const $2=, 0{{$}} -; CHECK-NEXT: i32.and $push0=, $0, $1{{$}} -; CHECK-NEXT: i32.eq $push1=, $pop0, $2{{$}} -; CHECK-NEXT: block BB4_2{{$}} -; CHECK-NEXT: br_if $pop1, BB4_2{{$}} -; CHECK-NEXT: return $2{{$}} -; CHECK-NEXT: BB4_2:{{$}} -; CHECK-NEXT: return $1{{$}} -define i32 @stack_uses(i32 %x) { +; CHECK-NEXT: .local i32, i32{{$}} +; CHECK-NEXT: i32.const $4=, 1{{$}} +; CHECK-NEXT: i32.const $5=, 2{{$}} +; CHECK-NEXT: i32.lt_s $push0=, $0, $4{{$}} +; CHECK-NEXT: i32.lt_s $push1=, $1, $5{{$}} +; CHECK-NEXT: i32.xor $push4=, $pop0, $pop1{{$}} +; CHECK-NEXT: i32.lt_s $push2=, $2, $4{{$}} +; CHECK-NEXT: i32.lt_s $push3=, $3, $5{{$}} +; CHECK-NEXT: i32.xor $push5=, $pop2, $pop3{{$}} +; CHECK-NEXT: i32.xor $push6=, $pop4, $pop5{{$}} +; CHECK-NEXT: i32.ne $push7=, $pop6, $4{{$}} +; CHECK-NEXT: block BB4_2{{$}} +; CHECK-NEXT: br_if $pop7, BB4_2{{$}} +; CHECK-NEXT: i32.const $push8=, 0{{$}} +; CHECK-NEXT: return $pop8{{$}} +; CHECK-NEXT: BB4_2: +; CHECK-NEXT: return $4{{$}} +define i32 @stack_uses(i32 %x, i32 %y, i32 %z, i32 %w) { entry: - %c = trunc i32 %x to i1 - br i1 %c, label %true, label %false + %c = icmp sle i32 %x, 0 + %d = icmp sle i32 %y, 1 + %e = icmp sle i32 %z, 0 + %f = icmp sle i32 %w, 1 + %g = xor i1 %c, %d + %h = xor i1 %e, %f + %i = xor i1 %g, %h + br i1 %i, label %true, label %false true: ret i32 0 false: diff --git a/test/CodeGen/WebAssembly/switch.ll b/test/CodeGen/WebAssembly/switch.ll index 41c5b357d068..c62333c336fa 100644 --- a/test/CodeGen/WebAssembly/switch.ll +++ b/test/CodeGen/WebAssembly/switch.ll @@ -1,6 +1,7 @@ -; RUN: llc < %s -asm-verbose=false | FileCheck %s +; RUN: llc < %s -asm-verbose=false -disable-block-placement | FileCheck %s -; Test switch instructions. +; Test switch instructions. Block placement is disabled because it reorders +; the blocks in a way that isn't interesting here. target datalayout = "e-p:32:32-i64:64-n32:64-S128" target triple = "wasm32-unknown-unknown" From 543e02b4381244c28463696f159170004c787628 Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Sat, 5 Dec 2015 03:05:45 +0000 Subject: [PATCH 116/364] [llvm-dwp] Support debug_tu_index git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254827 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/MC/MCObjectFileInfo.h | 2 + lib/MC/MCObjectFileInfo.cpp | 10 ++ test/tools/llvm-dwp/Inputs/simple/a.dwo | Bin 1193 -> 1369 bytes test/tools/llvm-dwp/Inputs/simple/b.dwo | Bin 1241 -> 1409 bytes test/tools/llvm-dwp/X86/simple.test | 51 ++++++-- tools/llvm-dwp/llvm-dwp.cpp | 163 ++++++++++++++++-------- 6 files changed, 160 insertions(+), 66 deletions(-) diff --git a/include/llvm/MC/MCObjectFileInfo.h b/include/llvm/MC/MCObjectFileInfo.h index 388a208fb4a0..cf2c3f12bb6b 100644 --- a/include/llvm/MC/MCObjectFileInfo.h +++ b/include/llvm/MC/MCObjectFileInfo.h @@ -118,6 +118,7 @@ class MCObjectFileInfo { // These are for Fission DWP files. MCSection *DwarfCUIndexSection; + MCSection *DwarfTUIndexSection; /// Section for newer gnu pubnames. 
MCSection *DwarfGnuPubNamesSection; @@ -266,6 +267,7 @@ class MCObjectFileInfo { MCSection *getDwarfStrOffDWOSection() const { return DwarfStrOffDWOSection; } MCSection *getDwarfAddrSection() const { return DwarfAddrSection; } MCSection *getDwarfCUIndexSection() const { return DwarfCUIndexSection; } + MCSection *getDwarfTUIndexSection() const { return DwarfTUIndexSection; } MCSection *getCOFFDebugSymbolsSection() const { return COFFDebugSymbolsSection; diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp index 41e28698b1cc..dbedd73a4325 100644 --- a/lib/MC/MCObjectFileInfo.cpp +++ b/lib/MC/MCObjectFileInfo.cpp @@ -262,6 +262,9 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(Triple T) { DwarfCUIndexSection = Ctx->getMachOSection("__DWARF", "__debug_cu_index", MachO::S_ATTR_DEBUG, SectionKind::getMetadata()); + DwarfTUIndexSection = + Ctx->getMachOSection("__DWARF", "__debug_tu_index", MachO::S_ATTR_DEBUG, + SectionKind::getMetadata()); StackMapSection = Ctx->getMachOSection("__LLVM_STACKMAPS", "__llvm_stackmaps", 0, SectionKind::getMetadata()); @@ -537,6 +540,8 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(Triple T) { // DWP Sections DwarfCUIndexSection = Ctx->getELFSection(".debug_cu_index", ELF::SHT_PROGBITS, 0); + DwarfTUIndexSection = + Ctx->getELFSection(".debug_tu_index", ELF::SHT_PROGBITS, 0); StackMapSection = Ctx->getELFSection(".llvm_stackmaps", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); @@ -725,6 +730,11 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(Triple T) { COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); + DwarfTUIndexSection = Ctx->getCOFFSection( + ".debug_tu_index", + COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ, + SectionKind::getMetadata()); DwarfAccelNamesSection = Ctx->getCOFFSection( ".apple_names", COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | diff --git a/test/tools/llvm-dwp/Inputs/simple/a.dwo b/test/tools/llvm-dwp/Inputs/simple/a.dwo index 7bdb2a7b9f826e0dfcbafb94b64ef9285b42f4c6..1fc71ca8d17560f4ce433262fedc9bcb4fc14a2f 100644 GIT binary patch delta 328 zcmZ388{}|hxlj$xh!BI4n`n@kt5*BqcHZv%uGzmKv6~} zrr!+Af0A#>JUUQn!JvRp3=@-s39&G+PTa0#W+RyiRK?=RC?GD(%D})X%)s1a$IWfU zZOsi-DF(z$3<5xsk?}VlBSWHIazO!*H}Ovve#j`#SUCA0qd4aih!BG_SgaW=rVjO_)MQ0wb52JnZ}Q|sW^>UQ5HB(u lfa>D{(j1f5GHbI&03FEy)Gs{w1M^M^Ae)(i9jXY40suf}IPw4h delta 204 zcmcb~wUTp!hIj)b0~|PjSq=0@DFV`5}vWMC9zRBf^oW@T=&+vvp3z`!KTU?Z8SF!7_h z0jndUfVePFHJdO4P@0?DirbnSs8tMznHU6sBqQT*K1PNlz2t%dAaAk;VT#m;uP-=C &TypeIndexEntries, + uint32_t OutTypesOffset, StringRef Types, + const UnitIndexEntry &CUEntry) { + uint32_t Offset = 0; + DataExtractor Data(Types, true, 0); + while (Data.isValidOffset(Offset)) { + TypeIndexEntries.push_back(CUEntry); + auto &Entry = TypeIndexEntries.back(); + // Zero out the debug_info contribution + Entry.Contributions[0] = {}; + auto &C = Entry.Contributions[DW_SECT_TYPES - DW_SECT_INFO]; + C.Offset = OutTypesOffset + Offset; + auto PrevOffset = Offset; + // Length of the unit, including the 4 byte length field. 
+ C.Length = Data.getU32(&Offset) + 4; + + Data.getU16(&Offset); // Version + Data.getU32(&Offset); // Abbrev offset + Data.getU8(&Offset); // Address size + Entry.Signature = Data.getU64(&Offset); + Offset = PrevOffset + C.Length; + } +} + +static void +writeIndexTable(MCStreamer &Out, ArrayRef ContributionOffsets, + ArrayRef IndexEntries, + uint32_t DWARFUnitIndex::Entry::SectionContribution::*Field) { + for (const auto &E : IndexEntries) + for (size_t i = 0; i != array_lengthof(E.Contributions); ++i) + if (ContributionOffsets[i]) + Out.EmitIntValue(E.Contributions[i].*Field, 4); +} + +static void writeIndex(MCStreamer &Out, MCSection *Section, + ArrayRef ContributionOffsets, + ArrayRef IndexEntries) { + unsigned Columns = 0; + for (auto &C : ContributionOffsets) + if (C) + ++Columns; + + std::vector Buckets(NextPowerOf2(3 * IndexEntries.size() / 2)); + uint64_t Mask = Buckets.size() - 1; + for (size_t i = 0; i != IndexEntries.size(); ++i) { + auto S = IndexEntries[i].Signature; + auto H = S & Mask; + while (Buckets[H]) + H += ((S >> 32) & Mask) | 1; + Buckets[H] = i + 1; + } + + Out.SwitchSection(Section); + Out.EmitIntValue(2, 4); // Version + Out.EmitIntValue(Columns, 4); // Columns + Out.EmitIntValue(IndexEntries.size(), 4); // Num Units + Out.EmitIntValue(Buckets.size(), 4); // Num Buckets + + // Write the signatures. + for (const auto &I : Buckets) + Out.EmitIntValue(I ? IndexEntries[I - 1].Signature : 0, 8); + + // Write the indexes. + for (const auto &I : Buckets) + Out.EmitIntValue(I, 4); + + // Write the column headers (which sections will appear in the table) + for (size_t i = 0; i != ContributionOffsets.size(); ++i) + if (ContributionOffsets[i]) + Out.EmitIntValue(i + DW_SECT_INFO, 4); + + // Write the offsets. + writeIndexTable(Out, ContributionOffsets, IndexEntries, + &DWARFUnitIndex::Entry::SectionContribution::Offset); + + // Write the lengths. 
+ writeIndexTable(Out, ContributionOffsets, IndexEntries, + &DWARFUnitIndex::Entry::SectionContribution::Length); +} static std::error_code write(MCStreamer &Out, ArrayRef Inputs) { const auto &MCOFI = *Out.getContext().getObjectFileInfo(); MCSection *const StrSection = MCOFI.getDwarfStrDWOSection(); @@ -143,12 +227,8 @@ static std::error_code write(MCStreamer &Out, ArrayRef Inputs) { {"debug_line.dwo", {MCOFI.getDwarfLineDWOSection(), DW_SECT_LINE}}, {"debug_abbrev.dwo", {MCOFI.getDwarfAbbrevDWOSection(), DW_SECT_ABBREV}}}; - struct UnitIndexEntry { - uint64_t Signature; - DWARFUnitIndex::Entry::SectionContribution Contributions[8]; - }; - std::vector IndexEntries; + std::vector TypeIndexEntries; StringMap Strings; uint32_t StringOffset = 0; @@ -167,6 +247,9 @@ static std::error_code write(MCStreamer &Out, ArrayRef Inputs) { StringRef CurStrOffsetSection; StringRef InfoSection; StringRef AbbrevSection; + StringRef TypesSection; + + auto TypesOffset = ContributionOffsets[DW_SECT_TYPES - DW_SECT_INFO]; for (const auto &Section : ErrOrObj->getBinary()->sections()) { StringRef Name; @@ -188,12 +271,18 @@ static std::error_code write(MCStreamer &Out, ArrayRef Inputs) { ContributionOffsets[Index] += (CurEntry.Contributions[Index].Length = Contents.size()); - if (Kind == DW_SECT_INFO) { - assert(InfoSection.empty()); + switch (Kind) { + case DW_SECT_INFO: InfoSection = Contents; - } else if (Kind == DW_SECT_ABBREV) { - assert(AbbrevSection.empty()); + break; + case DW_SECT_ABBREV: AbbrevSection = Contents; + break; + case DW_SECT_TYPES: + TypesSection = Contents; + break; + default: + break; } } @@ -211,6 +300,7 @@ static std::error_code write(MCStreamer &Out, ArrayRef Inputs) { assert(!AbbrevSection.empty()); assert(!InfoSection.empty()); CurEntry.Signature = getCUSignature(AbbrevSection, InfoSection); + addAllTypes(TypeIndexEntries, TypesOffset, TypesSection, CurEntry); if (auto Err = writeStringsAndOffsets(Out, Strings, StringOffset, StrSection, StrOffsetSection, @@ -218,52 +308,19 @@ static std::error_code write(MCStreamer &Out, ArrayRef Inputs) { return Err; } - unsigned Columns = 0; - for (auto &C : ContributionOffsets) - if (C) - ++Columns; - - std::vector Buckets(NextPowerOf2(3 * IndexEntries.size() / 2)); - uint64_t Mask = Buckets.size() - 1; - for (size_t i = 0; i != IndexEntries.size(); ++i) { - auto S = IndexEntries[i].Signature; - auto H = S & Mask; - while (Buckets[H]) - H += ((S >> 32) & Mask) | 1; - Buckets[H] = i + 1; - } + // Lie about there being no info contributions so the TU index only includes + // the type unit contribution + ContributionOffsets[0] = 0; + writeIndex(Out, MCOFI.getDwarfTUIndexSection(), ContributionOffsets, + TypeIndexEntries); - Out.SwitchSection(MCOFI.getDwarfCUIndexSection()); - Out.EmitIntValue(2, 4); // Version - Out.EmitIntValue(Columns, 4); // Columns - Out.EmitIntValue(IndexEntries.size(), 4); // Num Units - // FIXME: This is not the right number of buckets for a real hash. - Out.EmitIntValue(Buckets.size(), 4); // Num Buckets + // Lie about the type contribution + ContributionOffsets[DW_SECT_TYPES - DW_SECT_INFO] = 0; + // Unlie about the info contribution + ContributionOffsets[0] = 1; - // Write the signatures. - for (const auto &I : Buckets) - Out.EmitIntValue(I ? IndexEntries[I - 1].Signature : 0, 8); - - // Write the indexes. 
- for (const auto &I : Buckets) - Out.EmitIntValue(I, 4); - - // Write the column headers (which sections will appear in the table) - for (size_t i = 0; i != array_lengthof(ContributionOffsets); ++i) - if (ContributionOffsets[i]) - Out.EmitIntValue(i + DW_SECT_INFO, 4); - - // Write the offsets. - for (const auto &E : IndexEntries) - for (size_t i = 0; i != array_lengthof(E.Contributions); ++i) - if (ContributionOffsets[i]) - Out.EmitIntValue(E.Contributions[i].Offset, 4); - - // Write the lengths. - for (const auto &E : IndexEntries) - for (size_t i = 0; i != array_lengthof(E.Contributions); ++i) - if (ContributionOffsets[i]) - Out.EmitIntValue(E.Contributions[i].Length, 4); + writeIndex(Out, MCOFI.getDwarfCUIndexSection(), ContributionOffsets, + IndexEntries); return std::error_code(); } From 32ad075fd5d405f2273dd6609164cb95af5c3b4b Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Sat, 5 Dec 2015 03:06:30 +0000 Subject: [PATCH 117/364] [llvm-dwp] clang-format this to catch anything I've missed along the way git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254828 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/llvm-dwp/llvm-dwp.cpp | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/tools/llvm-dwp/llvm-dwp.cpp b/tools/llvm-dwp/llvm-dwp.cpp index 608eca152d94..2583e2e20818 100644 --- a/tools/llvm-dwp/llvm-dwp.cpp +++ b/tools/llvm-dwp/llvm-dwp.cpp @@ -1,7 +1,8 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringSet.h" -#include "llvm/DebugInfo/DWARF/DWARFFormValue.h" #include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/DebugInfo/DWARF/DWARFFormValue.h" +#include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInstrInfo.h" @@ -11,17 +12,16 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Support/DataExtractor.h" -#include "llvm/Support/Options.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Options.h" #include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/TargetSelect.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Support/TargetSelect.h" -#include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h" -#include "llvm/Support/MathExtras.h" -#include #include +#include #include using namespace llvm; @@ -32,8 +32,10 @@ OptionCategory DwpCategory("Specific Options"); static list InputFiles(Positional, OneOrMore, desc(""), cat(DwpCategory)); -static opt OutputFilename(Required, "o", desc("Specify the output file."), - value_desc("filename"), cat(DwpCategory)); +static opt OutputFilename(Required, "o", + desc("Specify the output file."), + value_desc("filename"), + cat(DwpCategory)); static int error(const Twine &Error, const Twine &Context) { errs() << Twine("while processing ") + Context + ":\n"; @@ -191,7 +193,7 @@ static void writeIndex(MCStreamer &Out, MCSection *Section, Out.EmitIntValue(2, 4); // Version Out.EmitIntValue(Columns, 4); // Columns Out.EmitIntValue(IndexEntries.size(), 4); // Num Units - Out.EmitIntValue(Buckets.size(), 4); // Num Buckets + Out.EmitIntValue(Buckets.size(), 4); // Num Buckets // Write the signatures. 
for (const auto &I : Buckets) @@ -325,7 +327,7 @@ static std::error_code write(MCStreamer &Out, ArrayRef Inputs) { return std::error_code(); } -int main(int argc, char** argv) { +int main(int argc, char **argv) { ParseCommandLineOptions(argc, argv, "merge split dwarf (.dwo) files"); @@ -357,8 +359,7 @@ int main(int argc, char** argv) { MCObjectFileInfo MOFI; MCContext MC(MAI.get(), MRI.get(), &MOFI); - MOFI.InitMCObjectFileInfo(TheTriple, Reloc::Default, CodeModel::Default, - MC); + MOFI.InitMCObjectFileInfo(TheTriple, Reloc::Default, CodeModel::Default, MC); auto MAB = TheTarget->createMCAsmBackend(*MRI, TripleName, ""); if (!MAB) From eaf992ce08c4f3d08215149750f5342a628656ad Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Sat, 5 Dec 2015 03:10:05 +0000 Subject: [PATCH 118/364] [llvm-dwp] Rename the sufficiently-modified test to reflect it's non-simplicity git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254829 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../llvm-dwp/Inputs/{simple => type_units}/a.cpp | 0 .../llvm-dwp/Inputs/{simple => type_units}/a.dwo | Bin .../llvm-dwp/Inputs/{simple => type_units}/b.cpp | 0 .../llvm-dwp/Inputs/{simple => type_units}/b.dwo | Bin .../llvm-dwp/X86/{simple.test => type_units.test} | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename test/tools/llvm-dwp/Inputs/{simple => type_units}/a.cpp (100%) rename test/tools/llvm-dwp/Inputs/{simple => type_units}/a.dwo (100%) rename test/tools/llvm-dwp/Inputs/{simple => type_units}/b.cpp (100%) rename test/tools/llvm-dwp/Inputs/{simple => type_units}/b.dwo (100%) rename test/tools/llvm-dwp/X86/{simple.test => type_units.test} (100%) diff --git a/test/tools/llvm-dwp/Inputs/simple/a.cpp b/test/tools/llvm-dwp/Inputs/type_units/a.cpp similarity index 100% rename from test/tools/llvm-dwp/Inputs/simple/a.cpp rename to test/tools/llvm-dwp/Inputs/type_units/a.cpp diff --git a/test/tools/llvm-dwp/Inputs/simple/a.dwo b/test/tools/llvm-dwp/Inputs/type_units/a.dwo similarity index 100% rename from test/tools/llvm-dwp/Inputs/simple/a.dwo rename to test/tools/llvm-dwp/Inputs/type_units/a.dwo diff --git a/test/tools/llvm-dwp/Inputs/simple/b.cpp b/test/tools/llvm-dwp/Inputs/type_units/b.cpp similarity index 100% rename from test/tools/llvm-dwp/Inputs/simple/b.cpp rename to test/tools/llvm-dwp/Inputs/type_units/b.cpp diff --git a/test/tools/llvm-dwp/Inputs/simple/b.dwo b/test/tools/llvm-dwp/Inputs/type_units/b.dwo similarity index 100% rename from test/tools/llvm-dwp/Inputs/simple/b.dwo rename to test/tools/llvm-dwp/Inputs/type_units/b.dwo diff --git a/test/tools/llvm-dwp/X86/simple.test b/test/tools/llvm-dwp/X86/type_units.test similarity index 100% rename from test/tools/llvm-dwp/X86/simple.test rename to test/tools/llvm-dwp/X86/type_units.test From 28683ac9d56e053056b176b8ddcbe104e6e5b434 Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Sat, 5 Dec 2015 03:11:17 +0000 Subject: [PATCH 119/364] [llvm-dwp] Fix the type_units.test since I renamed its inputs as well git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254830 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/tools/llvm-dwp/X86/type_units.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/tools/llvm-dwp/X86/type_units.test b/test/tools/llvm-dwp/X86/type_units.test index 5502ef23d29f..c49d60d47bb3 100644 --- a/test/tools/llvm-dwp/X86/type_units.test +++ b/test/tools/llvm-dwp/X86/type_units.test @@ -1,4 +1,4 @@ -RUN: llvm-dwp %p/../Inputs/simple/a.dwo %p/../Inputs/simple/b.dwo -o %t +RUN: llvm-dwp %p/../Inputs/type_units/a.dwo 
%p/../Inputs/type_units/b.dwo -o %t RUN: llvm-dwarfdump %t | FileCheck %s FIXME: For some reason, piping straight from llvm-dwp to llvm-dwarfdump doesn't behave well - looks like dwarfdump is reading/closes before dwp has finished. From 2b762697564ca1e12e0e974e93ceeb4c3420505c Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Sat, 5 Dec 2015 03:41:53 +0000 Subject: [PATCH 120/364] [llvm-dwp] Add coverage for both the presence and absence of type units, and fix/remove the emission of a broken tu_index when no type units are present git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254833 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/tools/llvm-dwp/Inputs/simple/a.dwo | Bin 0 -> 1193 bytes test/tools/llvm-dwp/Inputs/simple/b.dwo | Bin 0 -> 1241 bytes .../X86/{type_units.test => simple.test} | 58 +++++++++++------- tools/llvm-dwp/llvm-dwp.cpp | 12 ++-- 4 files changed, 42 insertions(+), 28 deletions(-) create mode 100644 test/tools/llvm-dwp/Inputs/simple/a.dwo create mode 100644 test/tools/llvm-dwp/Inputs/simple/b.dwo rename test/tools/llvm-dwp/X86/{type_units.test => simple.test} (54%) diff --git a/test/tools/llvm-dwp/Inputs/simple/a.dwo b/test/tools/llvm-dwp/Inputs/simple/a.dwo new file mode 100644 index 0000000000000000000000000000000000000000..7bdb2a7b9f826e0dfcbafb94b64ef9285b42f4c6 GIT binary patch literal 1193 zcmbtTT}s115T4ytTP=#EFN!Y#5n3N&wTf1;EsBC5;tgtAQ!S=RNqaHDd+~HrgfYZvW8JtxbWTT1RyNqS7Wk=hfB>29j%Xx`; ztjk<&eH3I?6(f32BQ!qp%mZ2`ghABrI>U;t@#a?^&{jhjM7`I?imvhIPce}cC_(S# zbta1nev7=PXgt;DIj34rK*K)(`l2?8$Uoe;;P`O|#`UQC`(i%oJ~8?yuwUDTBP@Dj W+#_lezMoh|yOF`TrpWUre(OK6DqH*j literal 0 HcmV?d00001 diff --git a/test/tools/llvm-dwp/Inputs/simple/b.dwo b/test/tools/llvm-dwp/Inputs/simple/b.dwo new file mode 100644 index 0000000000000000000000000000000000000000..f41243dc722b011d346dccede7fd91268525d578 GIT binary patch literal 1241 zcmbtU%`O8`6h8M(>#rfHT2yRiBf(;{8ll7(f`nLDk=RtHrZmQkOsiOl+FI}gEG>Bo zPayFgmNw4my)(wmBo=OR?m6FgzVkC}r`PM-8=Nr+4+9f^P{;zj#7L`4TQC5#95{n^(LJ-yMq zI&D>L^sPYn<1jjO?DM0F>(`9K3m*&tF{!|cU#~ZvRx?gFyt*I9Rof1ncARk=p3}v+ z!@tiMc8GnIQ9mn+eg~>{3e&_De#^j;cw6Da`p}m#AI5R%odhFsGC{kOz){H)U3g}N zi*9w+0cKQ=#P4+BSrab0Rc(M7l_T+}B(qVa2vjZQHAO>kN(wtlPOT@*6%E0HcuC+{ z=K9gokGD@^@nQfFpEUnN)+f!AE&g;`;wMA%k~StjX%ph*GRi~^(h4J>_#04C-$$xq AOaK4? literal 0 HcmV?d00001 diff --git a/test/tools/llvm-dwp/X86/type_units.test b/test/tools/llvm-dwp/X86/simple.test similarity index 54% rename from test/tools/llvm-dwp/X86/type_units.test rename to test/tools/llvm-dwp/X86/simple.test index c49d60d47bb3..962e270a594e 100644 --- a/test/tools/llvm-dwp/X86/type_units.test +++ b/test/tools/llvm-dwp/X86/simple.test @@ -1,5 +1,8 @@ +RUN: llvm-dwp %p/../Inputs/simple/a.dwo %p/../Inputs/simple/b.dwo -o %t +RUN: llvm-dwarfdump %t | FileCheck --check-prefix=CHECK --check-prefix=NOTYP %s +RUN: llvm-objdump -h %t | FileCheck --check-prefix=NOTYPOBJ %s RUN: llvm-dwp %p/../Inputs/type_units/a.dwo %p/../Inputs/type_units/b.dwo -o %t -RUN: llvm-dwarfdump %t | FileCheck %s +RUN: llvm-dwarfdump %t | FileCheck --check-prefix=CHECK --check-prefix=TYPES %s FIXME: For some reason, piping straight from llvm-dwp to llvm-dwarfdump doesn't behave well - looks like dwarfdump is reading/closes before dwp has finished. 
@@ -28,7 +31,7 @@ CHECK: DW_TAG_formal_parameter CHECK: .debug_info.dwo contents: CHECK: [[AOFF:0x[0-9a-f]*]]: -CHECK-LABEL: Compile Unit: length = 0x00000029 version = 0x0004 abbr_offset = +CHECK-LABEL: Compile Unit: length = {{.*}} version = 0x0004 abbr_offset = CHECK: 0x[[AAOFF]] addr_size = 0x08 (next unit at [[BOFF:.*]]) CHECK: DW_TAG_compile_unit CHECK: DW_AT_name {{.*}} "a.cpp" @@ -36,42 +39,51 @@ CHECK: DW_AT_GNU_dwo_id {{.*}} ([[DWOA:.*]]) CHECK: DW_TAG_variable CHECK: DW_AT_name {{.*}} "a" CHECK: DW_TAG_structure_type -CHECK: DW_AT_signature {{.*}} ([[FOOSIG:.*]]) +NOTYP: DW_AT_name {{.*}} "foo" +TYPES: DW_AT_signature {{.*}} ([[FOOSIG:.*]]) CHECK: [[BOFF]]: -CHECK-LABEL: Compile Unit: length = 0x00000035 version = 0x0004 abbr_offset = +CHECK-LABEL: Compile Unit: length = {{.*}} version = 0x0004 abbr_offset = CHECK: 0x[[BAOFF]] addr_size = 0x08 (next unit at [[XOFF:.*]]) CHECK: DW_AT_name {{.*}} "b.cpp" CHECK: DW_AT_GNU_dwo_id {{.*}} ([[DWOB:.*]]) CHECK: DW_TAG_structure_type -CHECK: DW_AT_signature {{.*}} ([[BARSIG:.*]]) +NOTYP: DW_AT_name {{.*}} "bar" +TYPES: DW_AT_signature {{.*}} ([[BARSIG:.*]]) CHECK: DW_TAG_subprogram CHECK: DW_AT_name {{.*}} "b" CHECK: DW_TAG_formal_parameter -CHECK-LABEL: .debug_types.dwo contents: -CHECK: [[FOOUOFF:0x[0-9a-f]*]]: -CHECK-LABEL: Type Unit: length = 0x00000020 version = 0x0004 abbr_offset = -CHECK: 0x[[AAOFF]] addr_size = 0x08 type_signature = [[FOOSIG]] type_offset = 0x[[FOOOFF:.*]] (next unit at [[BARUOFF:.*]]) -CHECK: DW_TAG_type_unit -CHECK: [[FOOOFF]]: DW_TAG_structure_type -CHECK: DW_AT_name {{.*}} "foo" -CHECK: [[BARUOFF]]: -CHECK-LABEL: Type Unit: length = 0x00000020 version = 0x0004 abbr_offset = -CHECK: 0x[[BAOFF]] addr_size = 0x08 type_signature = [[BARSIG]] type_offset = 0x001e (next unit at [[XUOFF:.*]]) -CHECK: DW_TAG_type_unit -CHECK: 0x00000042: DW_TAG_structure_type -CHECK: DW_AT_name {{.*}} "bar" +NOTYP-NOT: .debug_types.dwo contents: +TYPES-LABEL: .debug_types.dwo contents: +TYPES: [[FOOUOFF:0x[0-9a-f]*]]: +TYPES-LABEL: Type Unit: length = 0x00000020 version = 0x0004 abbr_offset = +TYPES: 0x[[AAOFF]] addr_size = 0x08 type_signature = [[FOOSIG]] type_offset = 0x[[FOOOFF:.*]] (next unit at [[BARUOFF:.*]]) +TYPES: DW_TAG_type_unit +TYPES: [[FOOOFF]]: DW_TAG_structure_type +TYPES: DW_AT_name {{.*}} "foo" +TYPES: [[BARUOFF]]: +TYPES-LABEL: Type Unit: length = 0x00000020 version = 0x0004 abbr_offset = +TYPES: 0x[[BAOFF]] addr_size = 0x08 type_signature = [[BARSIG]] type_offset = 0x001e (next unit at [[XUOFF:.*]]) +TYPES: DW_TAG_type_unit +TYPES: 0x00000042: DW_TAG_structure_type +TYPES: DW_AT_name {{.*}} "bar" CHECK-LABEL: .debug_cu_index contents: CHECK: Index Signature INFO ABBREV LINE STR_OFFSETS -CHECK: 1 [[DWOA]] {{\[}}[[AOFF]], [[BOFF]]) [0x0000[[AAOFF]], 0x0000[[BAOFF]]) [0x00000000, 0x0000001a) [0x00000000, 0x00000010) -CHECK: 3 [[DWOB]] {{\[}}[[BOFF]], [[XOFF]]) [0x0000[[BAOFF]], 0x00000099) [0x0000001a, 0x00000034) [0x00000010, 0x00000024) +TYPES: 1 [[DWOA]] {{\[}}[[AOFF]], [[BOFF]]) [0x0000[[AAOFF]], 0x0000[[BAOFF]]) [0x00000000, 0x0000001a) [0x00000000, 0x00000010) +TYPES: 3 [[DWOB]] {{\[}}[[BOFF]], [[XOFF]]) [0x0000[[BAOFF]], 0x00000099) [0x0000001a, 0x00000034) [0x00000010, 0x00000024) +NOTYP: 3 [[DWOA]] {{\[}}[[AOFF]], [[BOFF]]) [0x0000[[AAOFF]], 0x0000[[BAOFF]]) [0x00000000, 0x00000011) [0x00000000, 0x00000010) +NOTYP: 4 [[DWOB]] {{\[}}[[BOFF]], [[XOFF]]) [0x0000[[BAOFF]], 0x00000075) [0x00000011, 0x00000022) [0x00000010, 0x00000024) CHECK-LABEL: .debug_tu_index contents: -CHECK: Index Signature TYPES ABBREV 
LINE STR_OFFSETS -CHECK: 1 [[FOOSIG]] {{\[}}[[FOOUOFF]], [[BARUOFF]]) [0x0000[[AAOFF]], 0x0000[[BAOFF]]) [0x00000000, 0x0000001a) [0x00000000, 0x00000010) -CHECK: 4 [[BARSIG]] {{\[}}[[BARUOFF]], [[XUOFF]]) [0x0000[[BAOFF]], 0x00000099) [0x0000001a, 0x00000034) [0x00000010, 0x00000024) +NOTYP-NOT: Index +TYPES: Index Signature TYPES ABBREV LINE STR_OFFSETS +TYPES: 1 [[FOOSIG]] {{\[}}[[FOOUOFF]], [[BARUOFF]]) [0x0000[[AAOFF]], 0x0000[[BAOFF]]) [0x00000000, 0x0000001a) [0x00000000, 0x00000010) +TYPES: 4 [[BARSIG]] {{\[}}[[BARUOFF]], [[XUOFF]]) [0x0000[[BAOFF]], 0x00000099) [0x0000001a, 0x00000034) [0x00000010, 0x00000024) + +Ensure we do not create a debug_tu_index, even an empty or malformed one. +NOTYPOBJ-NOT: .debug_tu_index CHECK-LABEL: .debug_str.dwo contents: CHECK: "clang version diff --git a/tools/llvm-dwp/llvm-dwp.cpp b/tools/llvm-dwp/llvm-dwp.cpp index 2583e2e20818..9a9440574e85 100644 --- a/tools/llvm-dwp/llvm-dwp.cpp +++ b/tools/llvm-dwp/llvm-dwp.cpp @@ -310,11 +310,13 @@ static std::error_code write(MCStreamer &Out, ArrayRef Inputs) { return Err; } - // Lie about there being no info contributions so the TU index only includes - // the type unit contribution - ContributionOffsets[0] = 0; - writeIndex(Out, MCOFI.getDwarfTUIndexSection(), ContributionOffsets, - TypeIndexEntries); + if (!TypeIndexEntries.empty()) { + // Lie about there being no info contributions so the TU index only includes + // the type unit contribution + ContributionOffsets[0] = 0; + writeIndex(Out, MCOFI.getDwarfTUIndexSection(), ContributionOffsets, + TypeIndexEntries); + } // Lie about the type contribution ContributionOffsets[DW_SECT_TYPES - DW_SECT_INFO] = 0; From c7d0ead45b3f81bdca4b3abea9675fcf790a871c Mon Sep 17 00:00:00 2001 From: Cong Hou Date: Sat, 5 Dec 2015 05:00:55 +0000 Subject: [PATCH 121/364] Normalize successors' probabilities when building MBBs for jump table. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254837 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 2 ++ test/CodeGen/X86/switch-edge-weight.ll | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 85e7e3c1bc8c..867b9562019b 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8259,12 +8259,14 @@ void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond, JumpProb += DefaultProb / 2; FallthroughProb -= DefaultProb / 2; JumpMBB->setSuccProbability(SI, DefaultProb / 2); + JumpMBB->normalizeSuccProbs(); break; } } addSuccessorWithProb(CurMBB, Fallthrough, FallthroughProb); addSuccessorWithProb(CurMBB, JumpMBB, JumpProb); + CurMBB->normalizeSuccProbs(); // The jump table header will be inserted in our current block, do the // range check, and fall through to our fallthrough block. 
diff --git a/test/CodeGen/X86/switch-edge-weight.ll b/test/CodeGen/X86/switch-edge-weight.ll index 6f594868c7ad..b8cb7b1280ad 100644 --- a/test/CodeGen/X86/switch-edge-weight.ll +++ b/test/CodeGen/X86/switch-edge-weight.ll @@ -111,7 +111,7 @@ sw.epilog: ; BB#8 to BB#3: {11} (10) ; BB#8 to BB#4: {12} (10) ; BB#8 to BB#5: {13, 14} (20) -; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}14.29%) BB#6({{[0-9a-fx/= ]+}}7.14%) BB#2({{[0-9a-fx/= ]+}}14.29%) BB#3({{[0-9a-fx/= ]+}}14.29%) BB#4({{[0-9a-fx/= ]+}}14.29%) BB#5({{[0-9a-fx/= ]+}}28.57%) +; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}15.38%) BB#6({{[0-9a-fx/= ]+}}7.69%) BB#2({{[0-9a-fx/= ]+}}15.38%) BB#3({{[0-9a-fx/= ]+}}15.38%) BB#4({{[0-9a-fx/= ]+}}15.38%) BB#5({{[0-9a-fx/= ]+}}30.77%) } ; CHECK-LABEL: test3 From a8bc4db3b2a16181b6373980abd142b7c1cb6e91 Mon Sep 17 00:00:00 2001 From: Xinliang David Li Date: Sat, 5 Dec 2015 05:16:36 +0000 Subject: [PATCH 122/364] [PGO] Add version to getPGOFuncName method Different version of indexed format may use different name uniquing schemes for static functions. Pass the version info to the name interface so that different schmes can be picked (for profile lookup). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254838 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/ProfileData/InstrProf.h | 11 +++++++---- include/llvm/ProfileData/InstrProfReader.h | 3 +++ lib/ProfileData/InstrProf.cpp | 8 +++++--- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/include/llvm/ProfileData/InstrProf.h b/include/llvm/ProfileData/InstrProf.h index 956485119102..3e711bb60cf0 100644 --- a/include/llvm/ProfileData/InstrProf.h +++ b/include/llvm/ProfileData/InstrProf.h @@ -16,9 +16,9 @@ #ifndef LLVM_PROFILEDATA_INSTRPROF_H_ #define LLVM_PROFILEDATA_INSTRPROF_H_ +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSet.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/IR/GlobalValue.h" #include "llvm/ProfileData/InstrProfData.inc" #include "llvm/Support/Endian.h" @@ -30,6 +30,7 @@ #include #include +#define INSTR_PROF_INDEX_VERSION 3 namespace llvm { class Function; @@ -132,7 +133,8 @@ inline StringRef getInstrProfFileOverriderFuncName() { /// Return the modified name for function \c F suitable to be /// used the key for profile lookup. -std::string getPGOFuncName(const Function &F); +std::string getPGOFuncName(const Function &F, + uint64_t Version = INSTR_PROF_INDEX_VERSION); /// Return the modified name for a function suitable to be /// used the key for profile lookup. The function's original @@ -140,7 +142,8 @@ std::string getPGOFuncName(const Function &F); /// The function is defined in module \c FileName. std::string getPGOFuncName(StringRef RawFuncName, GlobalValue::LinkageTypes Linkage, - StringRef FileName); + StringRef FileName, + uint64_t Version = INSTR_PROF_INDEX_VERSION); /// Create and return the global variable for function name used in PGO /// instrumentation. 
\c FuncName is the name of the function returned @@ -504,7 +507,7 @@ static inline uint64_t ComputeHash(HashT Type, StringRef K) { } const uint64_t Magic = 0x8169666f72706cff; // "\xfflprofi\x81" -const uint64_t Version = 3; +const uint64_t Version = INSTR_PROF_INDEX_VERSION; const HashT HashType = HashT::MD5; // This structure defines the file header of the LLVM profile diff --git a/include/llvm/ProfileData/InstrProfReader.h b/include/llvm/ProfileData/InstrProfReader.h index 318981f75e18..2837e421ba87 100644 --- a/include/llvm/ProfileData/InstrProfReader.h +++ b/include/llvm/ProfileData/InstrProfReader.h @@ -279,6 +279,7 @@ struct InstrProfReaderIndexBase { virtual bool atEnd() const = 0; virtual void setValueProfDataEndianness(support::endianness Endianness) = 0; virtual ~InstrProfReaderIndexBase() {} + virtual uint64_t getVersion() const = 0; }; typedef OnDiskIterableChainedHashTable @@ -312,6 +313,7 @@ class InstrProfReaderIndex : public InstrProfReaderIndexBase { HashTable->getInfoObj().setValueProfDataEndianness(Endianness); } ~InstrProfReaderIndex() override {} + uint64_t getVersion() const override { return FormatVersion; } }; /// Reader for the indexed binary instrprof format. @@ -328,6 +330,7 @@ class IndexedInstrProfReader : public InstrProfReader { IndexedInstrProfReader &operator=(const IndexedInstrProfReader &) = delete; public: + uint64_t getVersion() const { return Index->getVersion(); } IndexedInstrProfReader(std::unique_ptr DataBuffer) : DataBuffer(std::move(DataBuffer)), Index(nullptr) {} diff --git a/lib/ProfileData/InstrProf.cpp b/lib/ProfileData/InstrProf.cpp index 530be8ac044a..a965a1208b51 100644 --- a/lib/ProfileData/InstrProf.cpp +++ b/lib/ProfileData/InstrProf.cpp @@ -74,7 +74,8 @@ namespace llvm { std::string getPGOFuncName(StringRef RawFuncName, GlobalValue::LinkageTypes Linkage, - StringRef FileName) { + StringRef FileName, + uint64_t Version LLVM_ATTRIBUTE_UNUSED) { // Function names may be prefixed with a binary '1' to indicate // that the backend should not modify the symbols due to any platform @@ -96,8 +97,9 @@ std::string getPGOFuncName(StringRef RawFuncName, return FuncName; } -std::string getPGOFuncName(const Function &F) { - return getPGOFuncName(F.getName(), F.getLinkage(), F.getParent()->getName()); +std::string getPGOFuncName(const Function &F, uint64_t Version) { + return getPGOFuncName(F.getName(), F.getLinkage(), F.getParent()->getName(), + Version); } GlobalVariable *createPGOFuncNameVar(Module &M, From fbbab8b9598762b23d1cc870d4a7a1cba4158792 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 5 Dec 2015 07:07:42 +0000 Subject: [PATCH 123/364] [X86][FMA4] Explicitly set the domain of FMA4 float/double scalar instructions Both were defaulting to the float domain - now matches the packed instructions. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254841 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86InstrFMA.td | 61 +++++++++++++++++--------------- test/CodeGen/X86/fma_patterns.ll | 2 +- 2 files changed, 33 insertions(+), 30 deletions(-) diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td index 0467a64d7e51..b11ff6e253fa 100644 --- a/lib/Target/X86/X86InstrFMA.td +++ b/lib/Target/X86/X86InstrFMA.td @@ -374,36 +374,23 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { } // isCodeGenOnly = 1 } -defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>, - fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32, - int_x86_fma_vfmadd_ss>; -defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>, - fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64, - int_x86_fma_vfmadd_sd>; -defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>, - fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32, - int_x86_fma_vfmsub_ss>; -defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>, - fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64, - int_x86_fma_vfmsub_sd>; -defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32, - X86Fnmadd, loadf32>, - fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32, - int_x86_fma_vfnmadd_ss>; -defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64, - X86Fnmadd, loadf64>, - fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64, - int_x86_fma_vfnmadd_sd>; -defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32, - X86Fnmsub, loadf32>, - fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32, - int_x86_fma_vfnmsub_ss>; -defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64, - X86Fnmsub, loadf64>, - fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64, - int_x86_fma_vfnmsub_sd>; - let ExeDomain = SSEPackedSingle in { + // Scalar Instructions + defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>, + fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32, + int_x86_fma_vfmadd_ss>; + defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>, + fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32, + int_x86_fma_vfmsub_ss>; + defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32, + X86Fnmadd, loadf32>, + fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32, + int_x86_fma_vfnmadd_ss>; + defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32, + X86Fnmsub, loadf32>, + fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32, + int_x86_fma_vfnmsub_ss>; + // Packed Instructions defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32, loadv4f32, loadv8f32>; defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32, @@ -419,6 +406,22 @@ let ExeDomain = SSEPackedSingle in { } let ExeDomain = SSEPackedDouble in { + // Scalar Instructions + defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>, + fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64, + int_x86_fma_vfmadd_sd>; + defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>, + fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64, + int_x86_fma_vfmsub_sd>; + defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64, + X86Fnmadd, loadf64>, + fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64, + int_x86_fma_vfnmadd_sd>; + defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64, + X86Fnmsub, loadf64>, + fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64, + int_x86_fma_vfnmsub_sd>; + // Packed Instructions defm 
VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64, loadv2f64, loadv4f64>; defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64, diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll index 0f0dd20da040..c6a4954e51a8 100644 --- a/test/CodeGen/X86/fma_patterns.ll +++ b/test/CodeGen/X86/fma_patterns.ll @@ -1120,7 +1120,7 @@ define double @test_f64_fneg_fmul(double %x, double %y) #0 { ; ; FMA4-LABEL: test_f64_fneg_fmul: ; FMA4: # BB#0: -; FMA4-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; FMA4-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; FMA4-NEXT: vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0 ; FMA4-NEXT: retq ; From 83f50fab5317ea62c8298a82c1de5054d5795ac9 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 5 Dec 2015 07:13:25 +0000 Subject: [PATCH 124/364] Use std::fill instead of memset to initialize an array to avoid hardcoded count and a multiply. The outputed code is identical. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254842 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Target/TargetLowering.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index e247abcb2f75..e99c9f758f8e 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -1293,7 +1293,7 @@ class TargetLoweringBase { /// Remove all register classes. void clearRegisterClasses() { - memset(RegClassForVT, 0,MVT::LAST_VALUETYPE * sizeof(TargetRegisterClass*)); + std::fill(std::begin(RegClassForVT), std::end(RegClassForVT), nullptr); AvailableRegClasses.clear(); } From 79402ee6f96ff3be95b445286d91d0d87b5a3cc9 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 5 Dec 2015 07:13:35 +0000 Subject: [PATCH 125/364] Replace uint16_t with the MCPhysReg typedef in many places. A lot of physical register arrays already use this typedef. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254843 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/CodeGen/CallingConvLower.h | 2 +- include/llvm/MC/MCInstrDesc.h | 28 ++++++++-------- lib/CodeGen/MIRParser/MIParser.cpp | 4 +-- lib/CodeGen/MachineInstr.cpp | 6 ++-- lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp | 4 +-- .../SelectionDAG/ScheduleDAGRRList.cpp | 12 +++---- lib/MC/MCInstrDesc.cpp | 2 +- lib/MC/MCParser/AsmParser.cpp | 4 +-- lib/Target/AArch64/AArch64CallingConvention.h | 32 +++++++++---------- lib/Target/ARM/ARMCallingConv.h | 18 +++++------ lib/Target/ARM/ARMFastISel.cpp | 2 +- lib/Target/ARM/Thumb2SizeReduction.cpp | 2 +- .../Disassembler/HexagonDisassembler.cpp | 32 +++++++++---------- lib/Target/Hexagon/HexagonGenMux.cpp | 4 +-- lib/Target/Hexagon/HexagonISelLowering.cpp | 24 +++++++------- .../Hexagon/MCTargetDesc/HexagonMCChecker.cpp | 2 +- .../MCTargetDesc/HexagonMCCodeEmitter.cpp | 2 +- lib/Target/PowerPC/PPCFrameLowering.cpp | 2 +- lib/Target/PowerPC/PPCInstrInfo.cpp | 4 +-- lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 6 ++-- utils/TableGen/InstrInfoEmitter.cpp | 2 +- 21 files changed, 98 insertions(+), 96 deletions(-) diff --git a/include/llvm/CodeGen/CallingConvLower.h b/include/llvm/CodeGen/CallingConvLower.h index 9df41dd0257c..415abb90da57 100644 --- a/include/llvm/CodeGen/CallingConvLower.h +++ b/include/llvm/CodeGen/CallingConvLower.h @@ -369,7 +369,7 @@ class CCState { /// AllocateRegBlock - Attempt to allocate a block of RegsRequired consecutive /// registers. If this is not possible, return zero. 
Otherwise, return the first /// register of the block that were allocated, marking the entire block as allocated. - unsigned AllocateRegBlock(ArrayRef Regs, unsigned RegsRequired) { + unsigned AllocateRegBlock(ArrayRef Regs, unsigned RegsRequired) { if (RegsRequired > Regs.size()) return 0; diff --git a/include/llvm/MC/MCInstrDesc.h b/include/llvm/MC/MCInstrDesc.h index 1baf82ee5c45..88aab73d4058 100644 --- a/include/llvm/MC/MCInstrDesc.h +++ b/include/llvm/MC/MCInstrDesc.h @@ -15,12 +15,12 @@ #ifndef LLVM_MC_MCINSTRDESC_H #define LLVM_MC_MCINSTRDESC_H +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/DataTypes.h" #include namespace llvm { class MCInst; - class MCRegisterInfo; class MCSubtargetInfo; class FeatureBitset; @@ -137,16 +137,16 @@ enum Flag { /// directly to describe itself. class MCInstrDesc { public: - unsigned short Opcode; // The opcode number - unsigned short NumOperands; // Num of args (may be more if variable_ops) - unsigned char NumDefs; // Num of args that are definitions - unsigned char Size; // Number of bytes in encoding. - unsigned short SchedClass; // enum identifying instr sched class - uint64_t Flags; // Flags identifying machine instr class - uint64_t TSFlags; // Target Specific Flag values - const uint16_t *ImplicitUses; // Registers implicitly read by this instr - const uint16_t *ImplicitDefs; // Registers implicitly defined by this instr - const MCOperandInfo *OpInfo; // 'NumOperands' entries about operands + unsigned short Opcode; // The opcode number + unsigned short NumOperands; // Num of args (may be more if variable_ops) + unsigned char NumDefs; // Num of args that are definitions + unsigned char Size; // Number of bytes in encoding. + unsigned short SchedClass; // enum identifying instr sched class + uint64_t Flags; // Flags identifying machine instr class + uint64_t TSFlags; // Target Specific Flag values + const MCPhysReg *ImplicitUses; // Registers implicitly read by this instr + const MCPhysReg *ImplicitDefs; // Registers implicitly defined by this instr + const MCOperandInfo *OpInfo; // 'NumOperands' entries about operands // Subtarget feature that this is deprecated on, if any // -1 implies this is not deprecated by any single feature. It may still be // deprecated due to a "complex" reason, below. @@ -472,7 +472,7 @@ class MCInstrDesc { /// marked as implicitly reading the 'CL' register, which it always does. /// /// This method returns null if the instruction has no implicit uses. - const uint16_t *getImplicitUses() const { return ImplicitUses; } + const MCPhysReg *getImplicitUses() const { return ImplicitUses; } /// \brief Return the number of implicit uses this instruction has. unsigned getNumImplicitUses() const { @@ -494,7 +494,7 @@ class MCInstrDesc { /// EAX/EDX/EFLAGS registers. /// /// This method returns null if the instruction has no implicit defs. - const uint16_t *getImplicitDefs() const { return ImplicitDefs; } + const MCPhysReg *getImplicitDefs() const { return ImplicitDefs; } /// \brief Return the number of implicit defs this instruct has. unsigned getNumImplicitDefs() const { @@ -509,7 +509,7 @@ class MCInstrDesc { /// \brief Return true if this instruction implicitly /// uses the specified physical register. 
bool hasImplicitUseOfPhysReg(unsigned Reg) const { - if (const uint16_t *ImpUses = ImplicitUses) + if (const MCPhysReg *ImpUses = ImplicitUses) for (; *ImpUses; ++ImpUses) if (*ImpUses == Reg) return true; diff --git a/lib/CodeGen/MIRParser/MIParser.cpp b/lib/CodeGen/MIRParser/MIParser.cpp index c9c2d62cec30..f2f6584fb6c8 100644 --- a/lib/CodeGen/MIRParser/MIParser.cpp +++ b/lib/CodeGen/MIRParser/MIParser.cpp @@ -745,11 +745,11 @@ bool MIParser::verifyImplicitOperands(ArrayRef Operands, // Gather all the expected implicit operands. SmallVector ImplicitOperands; if (MCID.ImplicitDefs) - for (const uint16_t *ImpDefs = MCID.getImplicitDefs(); *ImpDefs; ++ImpDefs) + for (const MCPhysReg *ImpDefs = MCID.getImplicitDefs(); *ImpDefs; ++ImpDefs) ImplicitOperands.push_back( MachineOperand::CreateReg(*ImpDefs, true, true)); if (MCID.ImplicitUses) - for (const uint16_t *ImpUses = MCID.getImplicitUses(); *ImpUses; ++ImpUses) + for (const MCPhysReg *ImpUses = MCID.getImplicitUses(); *ImpUses; ++ImpUses) ImplicitOperands.push_back( MachineOperand::CreateReg(*ImpUses, false, true)); diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp index e202810bf6e5..1eb2edcd7cec 100644 --- a/lib/CodeGen/MachineInstr.cpp +++ b/lib/CodeGen/MachineInstr.cpp @@ -631,10 +631,12 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST) const { void MachineInstr::addImplicitDefUseOperands(MachineFunction &MF) { if (MCID->ImplicitDefs) - for (const uint16_t *ImpDefs = MCID->getImplicitDefs(); *ImpDefs; ++ImpDefs) + for (const MCPhysReg *ImpDefs = MCID->getImplicitDefs(); *ImpDefs; + ++ImpDefs) addOperand(MF, MachineOperand::CreateReg(*ImpDefs, true, true)); if (MCID->ImplicitUses) - for (const uint16_t *ImpUses = MCID->getImplicitUses(); *ImpUses; ++ImpUses) + for (const MCPhysReg *ImpUses = MCID->getImplicitUses(); *ImpUses; + ++ImpUses) addOperand(MF, MachineOperand::CreateReg(*ImpUses, false, true)); } diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp index 34e1a7001082..62e7733ecd2b 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp @@ -440,7 +440,7 @@ static MVT getPhysicalRegisterVT(SDNode *N, unsigned Reg, const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); assert(MCID.ImplicitDefs && "Physical reg def must be in implicit def list!"); NumRes = MCID.getNumDefs(); - for (const uint16_t *ImpDef = MCID.getImplicitDefs(); *ImpDef; ++ImpDef) { + for (const MCPhysReg *ImpDef = MCID.getImplicitDefs(); *ImpDef; ++ImpDef) { if (Reg == *ImpDef) break; ++NumRes; @@ -519,7 +519,7 @@ bool ScheduleDAGFast::DelayForLiveRegsBottomUp(SUnit *SU, const MCInstrDesc &MCID = TII->get(Node->getMachineOpcode()); if (!MCID.ImplicitDefs) continue; - for (const uint16_t *Reg = MCID.getImplicitDefs(); *Reg; ++Reg) { + for (const MCPhysReg *Reg = MCID.getImplicitDefs(); *Reg; ++Reg) { CheckForLiveRegDef(SU, *Reg, LiveRegDefs, RegAdded, LRegs, TRI); } } diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index 78985e01ef9a..91024e672f9c 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -1206,7 +1206,7 @@ static MVT getPhysicalRegisterVT(SDNode *N, unsigned Reg, const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); assert(MCID.ImplicitDefs && "Physical reg def must be in implicit def list!"); NumRes = MCID.getNumDefs(); - for (const uint16_t *ImpDef = 
MCID.getImplicitDefs(); *ImpDef; ++ImpDef) { + for (const MCPhysReg *ImpDef = MCID.getImplicitDefs(); *ImpDef; ++ImpDef) { if (Reg == *ImpDef) break; ++NumRes; @@ -1335,7 +1335,7 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl &LRegs) { const MCInstrDesc &MCID = TII->get(Node->getMachineOpcode()); if (!MCID.ImplicitDefs) continue; - for (const uint16_t *Reg = MCID.getImplicitDefs(); *Reg; ++Reg) + for (const MCPhysReg *Reg = MCID.getImplicitDefs(); *Reg; ++Reg) CheckForLiveRegDef(SU, *Reg, LiveRegDefs.get(), RegAdded, LRegs, TRI); } @@ -2720,7 +2720,7 @@ static bool canClobberReachingPhysRegUse(const SUnit *DepSU, const SUnit *SU, ScheduleDAGRRList *scheduleDAG, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) { - const uint16_t *ImpDefs + const MCPhysReg *ImpDefs = TII->get(SU->getNode()->getMachineOpcode()).getImplicitDefs(); const uint32_t *RegMask = getNodeRegMask(SU->getNode()); if(!ImpDefs && !RegMask) @@ -2739,7 +2739,7 @@ static bool canClobberReachingPhysRegUse(const SUnit *DepSU, const SUnit *SU, return true; if (ImpDefs) - for (const uint16_t *ImpDef = ImpDefs; *ImpDef; ++ImpDef) + for (const MCPhysReg *ImpDef = ImpDefs; *ImpDef; ++ImpDef) // Return true if SU clobbers this physical register use and the // definition of the register reaches from DepSU. IsReachable queries // a topological forward sort of the DAG (following the successors). @@ -2758,13 +2758,13 @@ static bool canClobberPhysRegDefs(const SUnit *SuccSU, const SUnit *SU, const TargetRegisterInfo *TRI) { SDNode *N = SuccSU->getNode(); unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs(); - const uint16_t *ImpDefs = TII->get(N->getMachineOpcode()).getImplicitDefs(); + const MCPhysReg *ImpDefs = TII->get(N->getMachineOpcode()).getImplicitDefs(); assert(ImpDefs && "Caller should check hasPhysRegDefs"); for (const SDNode *SUNode = SU->getNode(); SUNode; SUNode = SUNode->getGluedNode()) { if (!SUNode->isMachineOpcode()) continue; - const uint16_t *SUImpDefs = + const MCPhysReg *SUImpDefs = TII->get(SUNode->getMachineOpcode()).getImplicitDefs(); const uint32_t *SURegMask = getNodeRegMask(SUNode); if (!SUImpDefs && !SURegMask) diff --git a/lib/MC/MCInstrDesc.cpp b/lib/MC/MCInstrDesc.cpp index 5be2fa1b30b6..ee55f3eff3ac 100644 --- a/lib/MC/MCInstrDesc.cpp +++ b/lib/MC/MCInstrDesc.cpp @@ -53,7 +53,7 @@ bool MCInstrDesc::mayAffectControlFlow(const MCInst &MI, bool MCInstrDesc::hasImplicitDefOfPhysReg(unsigned Reg, const MCRegisterInfo *MRI) const { - if (const uint16_t *ImpDefs = ImplicitDefs) + if (const MCPhysReg *ImpDefs = ImplicitDefs) for (; *ImpDefs; ++ImpDefs) if (*ImpDefs == Reg || (MRI && MRI->isSubRegister(Reg, *ImpDefs))) return true; diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp index 8e8be8e52f63..61f7d749b968 100644 --- a/lib/MC/MCParser/AsmParser.cpp +++ b/lib/MC/MCParser/AsmParser.cpp @@ -4753,8 +4753,8 @@ bool AsmParser::parseMSInlineAsm( } // Consider implicit defs to be clobbers. Think of cpuid and push. 
- ArrayRef ImpDefs(Desc.getImplicitDefs(), - Desc.getNumImplicitDefs()); + ArrayRef ImpDefs(Desc.getImplicitDefs(), + Desc.getNumImplicitDefs()); ClobberRegs.insert(ClobberRegs.end(), ImpDefs.begin(), ImpDefs.end()); } diff --git a/lib/Target/AArch64/AArch64CallingConvention.h b/lib/Target/AArch64/AArch64CallingConvention.h index 68f9dcf3fc2a..bc44bc5f2461 100644 --- a/lib/Target/AArch64/AArch64CallingConvention.h +++ b/lib/Target/AArch64/AArch64CallingConvention.h @@ -25,21 +25,21 @@ namespace { using namespace llvm; -static const uint16_t XRegList[] = {AArch64::X0, AArch64::X1, AArch64::X2, - AArch64::X3, AArch64::X4, AArch64::X5, - AArch64::X6, AArch64::X7}; -static const uint16_t HRegList[] = {AArch64::H0, AArch64::H1, AArch64::H2, - AArch64::H3, AArch64::H4, AArch64::H5, - AArch64::H6, AArch64::H7}; -static const uint16_t SRegList[] = {AArch64::S0, AArch64::S1, AArch64::S2, - AArch64::S3, AArch64::S4, AArch64::S5, - AArch64::S6, AArch64::S7}; -static const uint16_t DRegList[] = {AArch64::D0, AArch64::D1, AArch64::D2, - AArch64::D3, AArch64::D4, AArch64::D5, - AArch64::D6, AArch64::D7}; -static const uint16_t QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2, - AArch64::Q3, AArch64::Q4, AArch64::Q5, - AArch64::Q6, AArch64::Q7}; +static const MCPhysReg XRegList[] = {AArch64::X0, AArch64::X1, AArch64::X2, + AArch64::X3, AArch64::X4, AArch64::X5, + AArch64::X6, AArch64::X7}; +static const MCPhysReg HRegList[] = {AArch64::H0, AArch64::H1, AArch64::H2, + AArch64::H3, AArch64::H4, AArch64::H5, + AArch64::H6, AArch64::H7}; +static const MCPhysReg SRegList[] = {AArch64::S0, AArch64::S1, AArch64::S2, + AArch64::S3, AArch64::S4, AArch64::S5, + AArch64::S6, AArch64::S7}; +static const MCPhysReg DRegList[] = {AArch64::D0, AArch64::D1, AArch64::D2, + AArch64::D3, AArch64::D4, AArch64::D5, + AArch64::D6, AArch64::D7}; +static const MCPhysReg QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2, + AArch64::Q3, AArch64::Q4, AArch64::Q5, + AArch64::Q6, AArch64::Q7}; static bool finishStackBlock(SmallVectorImpl &PendingMembers, MVT LocVT, ISD::ArgFlagsTy &ArgFlags, @@ -86,7 +86,7 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT, ISD::ArgFlagsTy &ArgFlags, CCState &State) { // Try to allocate a contiguous block of registers, each of the correct // size to hold one member. 
- ArrayRef RegList; + ArrayRef RegList; if (LocVT.SimpleTy == MVT::i64) RegList = XRegList; else if (LocVT.SimpleTy == MVT::f16) diff --git a/lib/Target/ARM/ARMCallingConv.h b/lib/Target/ARM/ARMCallingConv.h index 3d216c0ed04a..a731d00883a1 100644 --- a/lib/Target/ARM/ARMCallingConv.h +++ b/lib/Target/ARM/ARMCallingConv.h @@ -160,15 +160,15 @@ static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, State); } -static const uint16_t RRegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; +static const MCPhysReg RRegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; -static const uint16_t SRegList[] = { ARM::S0, ARM::S1, ARM::S2, ARM::S3, - ARM::S4, ARM::S5, ARM::S6, ARM::S7, - ARM::S8, ARM::S9, ARM::S10, ARM::S11, - ARM::S12, ARM::S13, ARM::S14, ARM::S15 }; -static const uint16_t DRegList[] = { ARM::D0, ARM::D1, ARM::D2, ARM::D3, - ARM::D4, ARM::D5, ARM::D6, ARM::D7 }; -static const uint16_t QRegList[] = { ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3 }; +static const MCPhysReg SRegList[] = { ARM::S0, ARM::S1, ARM::S2, ARM::S3, + ARM::S4, ARM::S5, ARM::S6, ARM::S7, + ARM::S8, ARM::S9, ARM::S10, ARM::S11, + ARM::S12, ARM::S13, ARM::S14, ARM::S15 }; +static const MCPhysReg DRegList[] = { ARM::D0, ARM::D1, ARM::D2, ARM::D3, + ARM::D4, ARM::D5, ARM::D6, ARM::D7 }; +static const MCPhysReg QRegList[] = { ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3 }; // Allocate part of an AAPCS HFA or HVA. We assume that each member of the HA @@ -203,7 +203,7 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT, unsigned StackAlign = DL.getStackAlignment(); unsigned Align = std::min(PendingMembers[0].getExtraInfo(), StackAlign); - ArrayRef RegList; + ArrayRef RegList; switch (LocVT.SimpleTy) { case MVT::i32: { RegList = RRegList; diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 175107450fc0..9bdf823c85bd 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -3036,7 +3036,7 @@ bool ARMFastISel::fastLowerArguments() { } - static const uint16_t GPRArgRegs[] = { + static const MCPhysReg GPRArgRegs[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp index f38fe1904055..bcd0e5751258 100644 --- a/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -218,7 +218,7 @@ Thumb2SizeReduce::Thumb2SizeReduce(std::function Ftor) } static bool HasImplicitCPSRDef(const MCInstrDesc &MCID) { - for (const uint16_t *Regs = MCID.getImplicitDefs(); *Regs; ++Regs) + for (const MCPhysReg *Regs = MCID.getImplicitDefs(); *Regs; ++Regs) if (*Regs == ARM::CPSR) return true; return false; diff --git a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp index 1db59e1dd99d..4a9c3413cb29 100644 --- a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp +++ b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp @@ -471,7 +471,7 @@ extern const MCInstrDesc HexagonInsts[]; } static DecodeStatus DecodeRegisterClass(MCInst &Inst, unsigned RegNo, - ArrayRef Table) { + ArrayRef Table) { if (RegNo < Table.size()) { Inst.addOperand(MCOperand::createReg(Table[RegNo])); return MCDisassembler::Success; @@ -489,7 +489,7 @@ static DecodeStatus DecodeIntRegsLow8RegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { - static const uint16_t IntRegDecoderTable[] = { + static const MCPhysReg 
IntRegDecoderTable[] = { Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4, Hexagon::R5, Hexagon::R6, Hexagon::R7, Hexagon::R8, Hexagon::R9, Hexagon::R10, Hexagon::R11, Hexagon::R12, Hexagon::R13, Hexagon::R14, @@ -498,13 +498,13 @@ static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo, Hexagon::R25, Hexagon::R26, Hexagon::R27, Hexagon::R28, Hexagon::R29, Hexagon::R30, Hexagon::R31}; - return (DecodeRegisterClass(Inst, RegNo, IntRegDecoderTable)); + return DecodeRegisterClass(Inst, RegNo, IntRegDecoderTable); } static DecodeStatus DecodeVectorRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, const void *Decoder) { - static const uint16_t VecRegDecoderTable[] = { + static const MCPhysReg VecRegDecoderTable[] = { Hexagon::V0, Hexagon::V1, Hexagon::V2, Hexagon::V3, Hexagon::V4, Hexagon::V5, Hexagon::V6, Hexagon::V7, Hexagon::V8, Hexagon::V9, Hexagon::V10, Hexagon::V11, Hexagon::V12, Hexagon::V13, Hexagon::V14, @@ -513,25 +513,25 @@ static DecodeStatus DecodeVectorRegsRegisterClass(MCInst &Inst, unsigned RegNo, Hexagon::V25, Hexagon::V26, Hexagon::V27, Hexagon::V28, Hexagon::V29, Hexagon::V30, Hexagon::V31}; - return (DecodeRegisterClass(Inst, RegNo, VecRegDecoderTable)); + return DecodeRegisterClass(Inst, RegNo, VecRegDecoderTable); } static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, const void *Decoder) { - static const uint16_t DoubleRegDecoderTable[] = { + static const MCPhysReg DoubleRegDecoderTable[] = { Hexagon::D0, Hexagon::D1, Hexagon::D2, Hexagon::D3, Hexagon::D4, Hexagon::D5, Hexagon::D6, Hexagon::D7, Hexagon::D8, Hexagon::D9, Hexagon::D10, Hexagon::D11, Hexagon::D12, Hexagon::D13, Hexagon::D14, Hexagon::D15}; - return (DecodeRegisterClass(Inst, RegNo >> 1, DoubleRegDecoderTable)); + return DecodeRegisterClass(Inst, RegNo >> 1, DoubleRegDecoderTable); } static DecodeStatus DecodeVecDblRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, const void *Decoder) { - static const uint16_t VecDblRegDecoderTable[] = { + static const MCPhysReg VecDblRegDecoderTable[] = { Hexagon::W0, Hexagon::W1, Hexagon::W2, Hexagon::W3, Hexagon::W4, Hexagon::W5, Hexagon::W6, Hexagon::W7, Hexagon::W8, Hexagon::W9, Hexagon::W10, Hexagon::W11, @@ -543,25 +543,25 @@ static DecodeStatus DecodeVecDblRegsRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, const void *Decoder) { - static const uint16_t PredRegDecoderTable[] = {Hexagon::P0, Hexagon::P1, - Hexagon::P2, Hexagon::P3}; + static const MCPhysReg PredRegDecoderTable[] = {Hexagon::P0, Hexagon::P1, + Hexagon::P2, Hexagon::P3}; - return (DecodeRegisterClass(Inst, RegNo, PredRegDecoderTable)); + return DecodeRegisterClass(Inst, RegNo, PredRegDecoderTable); } static DecodeStatus DecodeVecPredRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, const void *Decoder) { - static const uint16_t VecPredRegDecoderTable[] = {Hexagon::Q0, Hexagon::Q1, - Hexagon::Q2, Hexagon::Q3}; + static const MCPhysReg VecPredRegDecoderTable[] = {Hexagon::Q0, Hexagon::Q1, + Hexagon::Q2, Hexagon::Q3}; - return (DecodeRegisterClass(Inst, RegNo, VecPredRegDecoderTable)); + return DecodeRegisterClass(Inst, RegNo, VecPredRegDecoderTable); } static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, const void *Decoder) { - static const uint16_t CtrlRegDecoderTable[] = { + static const MCPhysReg CtrlRegDecoderTable[] = { 
Hexagon::SA0, Hexagon::LC0, Hexagon::SA1, Hexagon::LC1, Hexagon::P3_0, Hexagon::C5, Hexagon::C6, Hexagon::C7, Hexagon::USR, Hexagon::PC, Hexagon::UGP, Hexagon::GP, @@ -582,7 +582,7 @@ static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, const void *Decoder) { - static const uint16_t CtrlReg64DecoderTable[] = { + static const MCPhysReg CtrlReg64DecoderTable[] = { Hexagon::C1_0, Hexagon::NoRegister, Hexagon::C3_2, Hexagon::NoRegister, Hexagon::C7_6, Hexagon::NoRegister, diff --git a/lib/Target/Hexagon/HexagonGenMux.cpp b/lib/Target/Hexagon/HexagonGenMux.cpp index b4ebd9140e75..c059d566709e 100644 --- a/lib/Target/Hexagon/HexagonGenMux.cpp +++ b/lib/Target/Hexagon/HexagonGenMux.cpp @@ -120,10 +120,10 @@ void HexagonGenMux::getDefsUses(const MachineInstr *MI, BitVector &Defs, // First, get the implicit defs and uses for this instruction. unsigned Opc = MI->getOpcode(); const MCInstrDesc &D = HII->get(Opc); - if (const uint16_t *R = D.ImplicitDefs) + if (const MCPhysReg *R = D.ImplicitDefs) while (*R) expandReg(*R++, Defs); - if (const uint16_t *R = D.ImplicitUses) + if (const MCPhysReg *R = D.ImplicitUses) while (*R) expandReg(*R++, Uses); diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index 0a89ef424dd2..f82fe7699e84 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -316,18 +316,18 @@ static bool CC_HexagonVector(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { - static const uint16_t VecLstS[] = { Hexagon::V0, Hexagon::V1, - Hexagon::V2, Hexagon::V3, - Hexagon::V4, Hexagon::V5, - Hexagon::V6, Hexagon::V7, - Hexagon::V8, Hexagon::V9, - Hexagon::V10, Hexagon::V11, - Hexagon::V12, Hexagon::V13, - Hexagon::V14, Hexagon::V15}; - static const uint16_t VecLstD[] = { Hexagon::W0, Hexagon::W1, - Hexagon::W2, Hexagon::W3, - Hexagon::W4, Hexagon::W5, - Hexagon::W6, Hexagon::W7}; + static const MCPhysReg VecLstS[] = { Hexagon::V0, Hexagon::V1, + Hexagon::V2, Hexagon::V3, + Hexagon::V4, Hexagon::V5, + Hexagon::V6, Hexagon::V7, + Hexagon::V8, Hexagon::V9, + Hexagon::V10, Hexagon::V11, + Hexagon::V12, Hexagon::V13, + Hexagon::V14, Hexagon::V15}; + static const MCPhysReg VecLstD[] = { Hexagon::W0, Hexagon::W1, + Hexagon::W2, Hexagon::W3, + Hexagon::W4, Hexagon::W5, + Hexagon::W6, Hexagon::W7}; auto &MF = State.getMachineFunction(); auto &HST = MF.getSubtarget(); bool UseHVX = HST.useHVXOps(); diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp index c11abc1f42f6..fefe7543f397 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp @@ -85,7 +85,7 @@ void HexagonMCChecker::init(MCInst const& MCI) { } // Get implicit register definitions. 
- const uint16_t* ImpDefs = MCID.getImplicitDefs(); + const MCPhysReg *ImpDefs = MCID.getImplicitDefs(); for (unsigned i = 0; i < MCID.getNumImplicitDefs(); ++i) { unsigned R = ImpDefs[i]; diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp index 35e490afe41f..c2c6275e7e8d 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp @@ -334,7 +334,7 @@ static Hexagon::Fixups getFixupNoBits(MCInstrInfo const &MCII, const MCInst &MI, // The only relocs left should be GP relative: default: if (MCID.mayStore() || MCID.mayLoad()) { - for (const uint16_t *ImpUses = MCID.getImplicitUses(); *ImpUses; + for (const MCPhysReg *ImpUses = MCID.getImplicitUses(); *ImpUses; ++ImpUses) { if (*ImpUses == Hexagon::GP) { switch (HexagonMCInstrInfo::getAccessSize(MCII, MI)) { diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index 174deb88bc5c..beab844c6025 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -30,7 +30,7 @@ using namespace llvm; /// VRRegNo - Map from a numbered VR register to its enum value. /// -static const uint16_t VRRegNo[] = { +static const MCPhysReg VRRegNo[] = { PPC::V0 , PPC::V1 , PPC::V2 , PPC::V3 , PPC::V4 , PPC::V5 , PPC::V6 , PPC::V7 , PPC::V8 , PPC::V9 , PPC::V10, PPC::V11, PPC::V12, PPC::V13, PPC::V14, PPC::V15, PPC::V16, PPC::V17, PPC::V18, PPC::V19, PPC::V20, PPC::V21, PPC::V22, PPC::V23, diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 51c85f6ea673..c17603a7718a 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1748,13 +1748,13 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr, MI->setDesc(NewDesc); if (NewDesc.ImplicitDefs) - for (const uint16_t *ImpDefs = NewDesc.getImplicitDefs(); + for (const MCPhysReg *ImpDefs = NewDesc.getImplicitDefs(); *ImpDefs; ++ImpDefs) if (!MI->definesRegister(*ImpDefs)) MI->addOperand(*MI->getParent()->getParent(), MachineOperand::CreateReg(*ImpDefs, true, true)); if (NewDesc.ImplicitUses) - for (const uint16_t *ImpUses = NewDesc.getImplicitUses(); + for (const MCPhysReg *ImpUses = NewDesc.getImplicitUses(); *ImpUses; ++ImpUses) if (!MI->readsRegister(*ImpUses)) MI->addOperand(*MI->getParent()->getParent(), diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 2eba084fc147..914ccdfb49a1 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -639,13 +639,13 @@ class DarwinX86AsmBackend : public X86AsmBackend { /// \brief Get the compact unwind number for a given register. The number /// corresponds to the enum lists in compact_unwind_encoding.h. int getCompactUnwindRegNum(unsigned Reg) const { - static const uint16_t CU32BitRegs[7] = { + static const MCPhysReg CU32BitRegs[7] = { X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 }; - static const uint16_t CU64BitRegs[] = { + static const MCPhysReg CU64BitRegs[] = { X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 }; - const uint16_t *CURegs = Is64Bit ? CU64BitRegs : CU32BitRegs; + const MCPhysReg *CURegs = Is64Bit ? 
CU64BitRegs : CU32BitRegs; for (int Idx = 1; *CURegs; ++CURegs, ++Idx) if (*CURegs == Reg) return Idx; diff --git a/utils/TableGen/InstrInfoEmitter.cpp b/utils/TableGen/InstrInfoEmitter.cpp index a4302d09078b..a6583399fa20 100644 --- a/utils/TableGen/InstrInfoEmitter.cpp +++ b/utils/TableGen/InstrInfoEmitter.cpp @@ -74,7 +74,7 @@ class InstrInfoEmitter { static void PrintDefList(const std::vector &Uses, unsigned Num, raw_ostream &OS) { - OS << "static const uint16_t ImplicitList" << Num << "[] = { "; + OS << "static const MCPhysReg ImplicitList" << Num << "[] = { "; for (unsigned i = 0, e = Uses.size(); i != e; ++i) OS << getQualifiedName(Uses[i]) << ", "; OS << "0 };\n"; From 092921b3d817e8c63e565a633067e2927aa5c85a Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 5 Dec 2015 07:27:50 +0000 Subject: [PATCH 126/364] [X86][ADX] Added memory folding patterns and stack folding tests git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254844 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86InstrInfo.cpp | 6 +++ test/CodeGen/X86/stack-folding-adx-x86_64.ll | 45 ++++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 test/CodeGen/X86/stack-folding-adx-x86_64.ll diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index ebe329064c50..34d4e90b3101 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -1650,6 +1650,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PEXT32rr, X86::PEXT32rm, 0 }, { X86::PEXT64rr, X86::PEXT64rm, 0 }, + // ADX foldable instructions + { X86::ADCX32rr, X86::ADCX32rm, 0 }, + { X86::ADCX64rr, X86::ADCX64rm, 0 }, + { X86::ADOX32rr, X86::ADOX32rm, 0 }, + { X86::ADOX64rr, X86::ADOX64rm, 0 }, + // AVX-512 foldable instructions { X86::VADDPSZrr, X86::VADDPSZrm, 0 }, { X86::VADDPDZrr, X86::VADDPDZrm, 0 }, diff --git a/test/CodeGen/X86/stack-folding-adx-x86_64.ll b/test/CodeGen/X86/stack-folding-adx-x86_64.ll new file mode 100644 index 000000000000..5f109f09aa19 --- /dev/null +++ b/test/CodeGen/X86/stack-folding-adx-x86_64.ll @@ -0,0 +1,45 @@ +; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+adx < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-unknown" + +; Stack reload folding tests. +; +; By including a nop call with sideeffects we can force a partial register spill of the +; relevant registers and check that the reload is correctly folded into the instruction. 
+ +define i8 @stack_fold_addcarry_u32(i8 %a0, i32 %a1, i32 %a2, i8* %a3) { + ;CHECK-LABEL: stack_fold_addcarry_u32 + ;CHECK: adcxl {{-?[0-9]*}}(%rsp), %ecx {{.*#+}} 4-byte Folded Reload + %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() + %2 = tail call i8 @llvm.x86.addcarry.u32(i8 %a0, i32 %a1, i32 %a2, i8* %a3) + ret i8 %2; +} +declare i8 @llvm.x86.addcarry.u32(i8, i32, i32, i8*) + +define i8 @stack_fold_addcarry_u64(i8 %a0, i64 %a1, i64 %a2, i8* %a3) { + ;CHECK-LABEL: stack_fold_addcarry_u64 + ;CHECK: adcxq {{-?[0-9]*}}(%rsp), %rcx {{.*#+}} 8-byte Folded Reload + %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() + %2 = tail call i8 @llvm.x86.addcarry.u64(i8 %a0, i64 %a1, i64 %a2, i8* %a3) + ret i8 %2; +} +declare i8 @llvm.x86.addcarry.u64(i8, i64, i64, i8*) + +define i8 @stack_fold_addcarryx_u32(i8 %a0, i32 %a1, i32 %a2, i8* %a3) { + ;CHECK-LABEL: stack_fold_addcarryx_u32 + ;CHECK: adcxl {{-?[0-9]*}}(%rsp), %ecx {{.*#+}} 4-byte Folded Reload + %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() + %2 = tail call i8 @llvm.x86.addcarryx.u32(i8 %a0, i32 %a1, i32 %a2, i8* %a3) + ret i8 %2; +} +declare i8 @llvm.x86.addcarryx.u32(i8, i32, i32, i8*) + +define i8 @stack_fold_addcarryx_u64(i8 %a0, i64 %a1, i64 %a2, i8* %a3) { + ;CHECK-LABEL: stack_fold_addcarryx_u64 + ;CHECK: adcxq {{-?[0-9]*}}(%rsp), %rcx {{.*#+}} 8-byte Folded Reload + %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() + %2 = tail call i8 @llvm.x86.addcarryx.u64(i8 %a0, i64 %a1, i64 %a2, i8* %a3) + ret i8 %2; +} +declare i8 @llvm.x86.addcarryx.u64(i8, i64, i64, i8*) From b2387dc751b88b853072fd679fb85ae7dc92e003 Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Sat, 5 Dec 2015 14:42:34 +0000 Subject: [PATCH 127/364] [ASAN] Add doFinalization to reset state Summary: If the same pass manager is used for multiple modules ASAN complains about GlobalsMD being initialized twice. Fix this by resetting GlobalsMD in a new doFinalization method to allow this use case. 
Reviewers: kcc Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D14962 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254851 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Instrumentation/AddressSanitizer.cpp | 11 +++++++++++ test/Instrumentation/AddressSanitizer/twice.ll | 8 ++++++++ 2 files changed, 19 insertions(+) create mode 100644 test/Instrumentation/AddressSanitizer/twice.ll diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index dea94a514fe8..a9df5e5898ae 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -280,6 +280,11 @@ class GlobalsMetadata { GlobalsMetadata() : inited_(false) {} + void reset() { + inited_ = false; + Entries.clear(); + } + void init(Module &M) { assert(!inited_); inited_ = true; @@ -450,6 +455,7 @@ struct AddressSanitizer : public FunctionPass { bool maybeInsertAsanInitAtFunctionEntry(Function &F); void markEscapedLocalAllocas(Function &F); bool doInitialization(Module &M) override; + bool doFinalization(Module &M) override; static char ID; // Pass identification, replacement for typeid DominatorTree &getDominatorTree() const { return *DT; } @@ -1521,6 +1527,11 @@ bool AddressSanitizer::doInitialization(Module &M) { return true; } +bool AddressSanitizer::doFinalization(Module &M) { + GlobalsMD.reset(); + return false; +} + bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) { // For each NSObject descendant having a +load method, this method is invoked // by the ObjC runtime before any of the static constructors is called. diff --git a/test/Instrumentation/AddressSanitizer/twice.ll b/test/Instrumentation/AddressSanitizer/twice.ll new file mode 100644 index 000000000000..9f7826f73952 --- /dev/null +++ b/test/Instrumentation/AddressSanitizer/twice.ll @@ -0,0 +1,8 @@ +; Check that the address sanitizer pass can be reused +; RUN: opt < %s -S -run-twice -asan + +define void @foo(i64* %b) nounwind uwtable sanitize_address { + entry: + store i64 0, i64* %b, align 1 + ret void +} From 5fa397629b43cf1428395698efdf913042a04ab7 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 5 Dec 2015 17:34:07 +0000 Subject: [PATCH 128/364] [Hexagon] Don't call getNumImplicitDefs and then iterate over the count. getNumImplicitDefs contains a loop so its better to just loop over the null terminated implicit def list. NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254852 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../Hexagon/MCTargetDesc/HexagonMCChecker.cpp | 49 ++++++++++--------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp index fefe7543f397..46b7b41fec3b 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp @@ -85,32 +85,33 @@ void HexagonMCChecker::init(MCInst const& MCI) { } // Get implicit register definitions. - const MCPhysReg *ImpDefs = MCID.getImplicitDefs(); - for (unsigned i = 0; i < MCID.getNumImplicitDefs(); ++i) { - unsigned R = ImpDefs[i]; + if (const MCPhysReg *ImpDef = MCID.getImplicitDefs()) + for (; *ImpDef; ++ImpDef) { + unsigned R = *ImpDef; - if (Hexagon::R31 != R && MCID.isCall()) - // Any register other than the LR and the PC are actually volatile ones - // as defined by the ABI, not modified implicitly by the call insn. 
- continue; - if (Hexagon::PC == R) - // Branches are the only insns that can change the PC, - // otherwise a read-only register. - continue; + if (Hexagon::R31 != R && MCID.isCall()) + // Any register other than the LR and the PC are actually volatile ones + // as defined by the ABI, not modified implicitly by the call insn. + continue; + if (Hexagon::PC == R) + // Branches are the only insns that can change the PC, + // otherwise a read-only register. + continue; - if (Hexagon::USR_OVF == R) - // Many insns change the USR implicitly, but only one or another flag. - // The instruction table models the USR.OVF flag, which can be implicitly - // modified more than once, but cannot be modified in the same packet - // with an instruction that modifies is explicitly. Deal with such situ- - // ations individually. - SoftDefs.insert(R); - else if (isPredicateRegister(R) && HexagonMCInstrInfo::isPredicateLate(MCII, MCI)) - // Include implicit late predicates. - LatePreds.insert(R); - else - Defs[R].insert(PredSense(PredReg, isTrue)); - } + if (Hexagon::USR_OVF == R) + // Many insns change the USR implicitly, but only one or another flag. + // The instruction table models the USR.OVF flag, which can be implicitly + // modified more than once, but cannot be modified in the same packet + // with an instruction that modifies is explicitly. Deal with such situ- + // ations individually. + SoftDefs.insert(R); + else if (isPredicateRegister(R) && + HexagonMCInstrInfo::isPredicateLate(MCII, MCI)) + // Include implicit late predicates. + LatePreds.insert(R); + else + Defs[R].insert(PredSense(PredReg, isTrue)); + } // Figure out explicit register definitions. for (unsigned i = 0; i < MCID.getNumDefs(); ++i) { From e05c0dfd574a41963e48413bb0524d2cc23645e5 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Sat, 5 Dec 2015 19:15:57 +0000 Subject: [PATCH 129/364] [WebAssembly] Expand frem as a floating point library function. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254854 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/WebAssembly/WebAssemblyISelLowering.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 65d2b1967b13..baefd8d0758d 100644 --- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -133,7 +133,8 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( ISD::SETULT, ISD::SETULE, ISD::SETUGT, ISD::SETUGE}) setCondCodeAction(CC, T, Expand); // Expand floating-point library function operators. - for (auto Op : {ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOWI, ISD::FPOW}) + for (auto Op : {ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOWI, ISD::FPOW, + ISD::FREM}) setOperationAction(Op, T, Expand); // Note supported floating-point library function operators that otherwise // default to expand. From e7174bd9a604bce9023a25abddb802fd851d9f3a Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Sat, 5 Dec 2015 19:24:17 +0000 Subject: [PATCH 130/364] [WebAssembly] Call TargetPassConfig base class functions in overriding functions. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254855 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index b54699243bd4..917dfacfe9d5 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -146,14 +146,20 @@ void WebAssemblyPassConfig::addIRPasses() { } bool WebAssemblyPassConfig::addInstSelector() { + (void)TargetPassConfig::addInstSelector(); addPass( createWebAssemblyISelDag(getWebAssemblyTargetMachine(), getOptLevel())); return false; } -bool WebAssemblyPassConfig::addILPOpts() { return true; } +bool WebAssemblyPassConfig::addILPOpts() { + (void)TargetPassConfig::addILPOpts(); + return true; +} void WebAssemblyPassConfig::addPreRegAlloc() { + TargetPassConfig::addPreRegAlloc(); + // Prepare store instructions for register stackifying. addPass(createWebAssemblyStoreResults()); @@ -173,9 +179,13 @@ void WebAssemblyPassConfig::addPostRegAlloc() { // Run the register coloring pass to reduce the total number of registers. addPass(createWebAssemblyRegColoring()); + + TargetPassConfig::addPostRegAlloc(); } void WebAssemblyPassConfig::addPreEmitPass() { + TargetPassConfig::addPreEmitPass(); + // Put the CFG in structured form; insert BLOCK and LOOP markers. addPass(createWebAssemblyCFGStackify()); From ecc456747ed00af4960e5f76cb0a75c56e67f776 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Sat, 5 Dec 2015 19:27:18 +0000 Subject: [PATCH 131/364] [WebAssembly] Move useAA() out of line to make it more convenient to experiment with. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254856 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/WebAssembly/WebAssemblySubtarget.cpp | 1 + lib/Target/WebAssembly/WebAssemblySubtarget.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/Target/WebAssembly/WebAssemblySubtarget.cpp b/lib/Target/WebAssembly/WebAssemblySubtarget.cpp index 3d9e7aacbfbf..cb2d5a63a19f 100644 --- a/lib/Target/WebAssembly/WebAssemblySubtarget.cpp +++ b/lib/Target/WebAssembly/WebAssemblySubtarget.cpp @@ -46,3 +46,4 @@ WebAssemblySubtarget::WebAssemblySubtarget(const Triple &TT, TLInfo(TM, *this) {} bool WebAssemblySubtarget::enableMachineScheduler() const { return true; } +bool WebAssemblySubtarget::useAA() const { return true; } diff --git a/lib/Target/WebAssembly/WebAssemblySubtarget.h b/lib/Target/WebAssembly/WebAssemblySubtarget.h index 9b17300e497d..f530a290fa0e 100644 --- a/lib/Target/WebAssembly/WebAssemblySubtarget.h +++ b/lib/Target/WebAssembly/WebAssemblySubtarget.h @@ -69,7 +69,7 @@ class WebAssemblySubtarget final : public WebAssemblyGenSubtargetInfo { } const Triple &getTargetTriple() const { return TargetTriple; } bool enableMachineScheduler() const override; - bool useAA() const override { return true; } + bool useAA() const override; // Predicates used by WebAssemblyInstrInfo.td. bool hasAddr64() const { return TargetTriple.isArch64Bit(); } From 1f5f023fe6acfb5d3bf41cc19045e3e187707bf0 Mon Sep 17 00:00:00 2001 From: JF Bastien Date: Sat, 5 Dec 2015 19:36:33 +0000 Subject: [PATCH 132/364] WebAssembly: improve readme, add placeholder for tests. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254857 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/WebAssembly/README.txt | 9 +++++++++ lib/Target/WebAssembly/known_gcc_test_failures.txt | 2 ++ 2 files changed, 11 insertions(+) create mode 100644 lib/Target/WebAssembly/known_gcc_test_failures.txt diff --git a/lib/Target/WebAssembly/README.txt b/lib/Target/WebAssembly/README.txt index bfb124d504eb..78b3123cde85 100644 --- a/lib/Target/WebAssembly/README.txt +++ b/lib/Target/WebAssembly/README.txt @@ -12,6 +12,15 @@ binary encoding of WebAssembly itself: * https://github.com/WebAssembly/design/blob/master/AstSemantics.md * https://github.com/WebAssembly/design/blob/master/BinaryEncoding.md +The backend is built, tested and archived on the following waterfall: + https://build.chromium.org/p/client.wasm.llvm/console + +The backend's bringup is done using the GCC torture test suite first since it +doesn't require C library support. Current known failures are in +known_gcc_test_failures.txt, all other tests should pass. The waterfall will +turn red if not. Once most of these pass, further testing will use LLVM's own +test suite. + Interesting work that remains to be done: * Write a pass to restructurize irreducible control flow. This needs to be done before register allocation to be efficient, because it may duplicate basic diff --git a/lib/Target/WebAssembly/known_gcc_test_failures.txt b/lib/Target/WebAssembly/known_gcc_test_failures.txt new file mode 100644 index 000000000000..6038b198abea --- /dev/null +++ b/lib/Target/WebAssembly/known_gcc_test_failures.txt @@ -0,0 +1,2 @@ +# Tests which are known to fail from the GCC torture test suite. +# FIXME: placeholder. The script which runs the tests needs a file here! From 64d85bf50cf9481a3979a1c7fdf0ecb58077a1f1 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Sat, 5 Dec 2015 19:43:19 +0000 Subject: [PATCH 133/364] [WebAssembly] Update a stale comment. NFC. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254859 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/WebAssembly/WebAssemblyInstrCall.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/lib/Target/WebAssembly/WebAssemblyInstrCall.td index 6b7d03da4897..84f70400f8a3 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrCall.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrCall.td @@ -14,8 +14,8 @@ let Defs = [ARGUMENTS] in { -// The call sequence start/end LLVM-isms isn't useful to WebAssembly since it's -// a virtual ISA. +// Call sequence markers. These have an immediate which represents the amount of +// stack space to allocate or free, which is used for varargs lowering. 
let isCodeGenOnly = 1 in { def ADJCALLSTACKDOWN : I<(outs), (ins i64imm:$amt), [(WebAssemblycallseq_start timm:$amt)]>; From 26a006a71e4fc1431a1a3782979ac3482fdab736 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Sat, 5 Dec 2015 19:54:59 +0000 Subject: [PATCH 134/364] fix typo; NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254860 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/sse-minmax.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/CodeGen/X86/sse-minmax.ll b/test/CodeGen/X86/sse-minmax.ll index e4d0373299fb..f0341277851d 100644 --- a/test/CodeGen/X86/sse-minmax.ll +++ b/test/CodeGen/X86/sse-minmax.ll @@ -3,7 +3,7 @@ ; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=nehalem -asm-verbose=false -enable-no-nans-fp-math | FileCheck -check-prefix=FINITE %s ; Some of these patterns can be matched as SSE min or max. Some of -; then can be matched provided that the operands are swapped. +; them can be matched provided that the operands are swapped. ; Some of them can't be matched at all and require a comparison ; and a conditional branch. From b975ecb43f74888e84d14d92ef95c9a86684549f Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Sat, 5 Dec 2015 20:03:44 +0000 Subject: [PATCH 135/364] [WebAssembly] Support inline asm constraints of type i16 and similar. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254861 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../WebAssembly/WebAssemblyISelLowering.cpp | 11 +++++++---- test/CodeGen/WebAssembly/inline-asm.ll | 16 ++++++++++++++++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index baefd8d0758d..6e1283b4d334 100644 --- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -228,10 +228,13 @@ WebAssemblyTargetLowering::getRegForInlineAsmConstraint( if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': - if (VT == MVT::i32) - return std::make_pair(0U, &WebAssembly::I32RegClass); - if (VT == MVT::i64) - return std::make_pair(0U, &WebAssembly::I64RegClass); + assert(VT != MVT::iPTR && "Pointer MVT not expected here"); + if (VT.isInteger() && !VT.isVector()) { + if (VT.getSizeInBits() <= 32) + return std::make_pair(0U, &WebAssembly::I32RegClass); + if (VT.getSizeInBits() <= 64) + return std::make_pair(0U, &WebAssembly::I64RegClass); + } break; default: break; diff --git a/test/CodeGen/WebAssembly/inline-asm.ll b/test/CodeGen/WebAssembly/inline-asm.ll index e9d2ebf51f63..646ea779dc8f 100644 --- a/test/CodeGen/WebAssembly/inline-asm.ll +++ b/test/CodeGen/WebAssembly/inline-asm.ll @@ -56,6 +56,22 @@ entry: ret i64 %0 } +; CHECK-LABEL: X_i16: +; CHECK: foo $1{{$}} +; CHECK: i32.store16 $discard=, 0($0), $1{{$}} +define void @X_i16(i16 * %t) { + call void asm sideeffect "foo $0", "=*X,~{dirflag},~{fpsr},~{flags},~{memory}"(i16* %t) + ret void +} + +; CHECK-LABEL: X_ptr: +; CHECK: foo $1{{$}} +; CHECK: i32.store $discard=, 0($0), $1 +define void @X_ptr(i16 ** %t) { + call void asm sideeffect "foo $0", "=*X,~{dirflag},~{fpsr},~{flags},~{memory}"(i16** %t) + ret void +} + attributes #0 = { nounwind } !0 = !{i32 47} From 27951cf8f8fdefbfd6b51e6dbc956f29de553d64 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Sat, 5 Dec 2015 20:27:10 +0000 Subject: [PATCH 136/364] Add vector fmaxnum tests that correspond to the existing fminnum tests Note: missing 256-bit tests for min and max should also be added. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254862 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/fmaxnum.ll | 47 +++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/test/CodeGen/X86/fmaxnum.ll b/test/CodeGen/X86/fmaxnum.ll index 23678c46dba0..17bd3b9b45b4 100644 --- a/test/CodeGen/X86/fmaxnum.ll +++ b/test/CodeGen/X86/fmaxnum.ll @@ -7,6 +7,12 @@ declare float @llvm.maxnum.f32(float, float) declare double @llvm.maxnum.f64(double, double) declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) +declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>) +declare <8 x double> @llvm.maxnum.v8f64(<8 x double>, <8 x double>) + + ; CHECK-LABEL: @test_fmaxf ; CHECK: calll fmaxf define float @test_fmaxf(float %x, float %y) { @@ -48,3 +54,44 @@ define x86_fp80 @test_intrinsic_fmaxl(x86_fp80 %x, x86_fp80 %y) { %z = call x86_fp80 @llvm.maxnum.f80(x86_fp80 %x, x86_fp80 %y) readnone ret x86_fp80 %z } + +; CHECK-LABEL: @test_intrinsic_fmax_v2f32 +; CHECK: calll fmaxf +; CHECK: calll fmaxf +define <2 x float> @test_intrinsic_fmax_v2f32(<2 x float> %x, <2 x float> %y) { + %z = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %x, <2 x float> %y) readnone + ret <2 x float> %z +} + +; CHECK-LABEL: @test_intrinsic_fmax_v4f32 +; CHECK: calll fmaxf +; CHECK: calll fmaxf +; CHECK: calll fmaxf +; CHECK: calll fmaxf +define <4 x float> @test_intrinsic_fmax_v4f32(<4 x float> %x, <4 x float> %y) { + %z = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float> %y) readnone + ret <4 x float> %z +} + +; CHECK-LABEL: @test_intrinsic_fmax_v2f64 +; CHECK: calll fmax +; CHECK: calll fmax +define <2 x double> @test_intrinsic_fmax_v2f64(<2 x double> %x, <2 x double> %y) { + %z = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %x, <2 x double> %y) readnone + ret <2 x double> %z +} + +; CHECK-LABEL: @test_intrinsic_fmax_v8f64 +; CHECK: calll fmax +; CHECK: calll fmax +; CHECK: calll fmax +; CHECK: calll fmax +; CHECK: calll fmax +; CHECK: calll fmax +; CHECK: calll fmax +; CHECK: calll fmax +define <8 x double> @test_intrinsic_fmax_v8f64(<8 x double> %x, <8 x double> %y) { + %z = call <8 x double> @llvm.maxnum.v8f64(<8 x double> %x, <8 x double> %y) readnone + ret <8 x double> %z +} + From 76e67ade5a49d8ba1285a249117d8242cf3476e3 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Sat, 5 Dec 2015 20:41:36 +0000 Subject: [PATCH 137/364] [WebAssembly] Implement direct calls to external symbols. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254863 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../WebAssembly/WebAssemblyInstrCall.td | 38 ++++++++++++++++--- .../WebAssembly/WebAssemblyInstrInfo.td | 1 - test/CodeGen/WebAssembly/frem.ll | 26 +++++++++++++ test/CodeGen/WebAssembly/global.ll | 3 +- 4 files changed, 59 insertions(+), 9 deletions(-) create mode 100644 test/CodeGen/WebAssembly/frem.ll diff --git a/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/lib/Target/WebAssembly/WebAssemblyInstrCall.td index 84f70400f8a3..0587c0b6613e 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrCall.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrCall.td @@ -12,20 +12,22 @@ /// //===----------------------------------------------------------------------===// +// TODO: addr64: These currently assume the callee address is 32-bit. + let Defs = [ARGUMENTS] in { // Call sequence markers. 
These have an immediate which represents the amount of // stack space to allocate or free, which is used for varargs lowering. let isCodeGenOnly = 1 in { -def ADJCALLSTACKDOWN : I<(outs), (ins i64imm:$amt), +def ADJCALLSTACKDOWN : I<(outs), (ins i32imm:$amt), [(WebAssemblycallseq_start timm:$amt)]>; -def ADJCALLSTACKUP : I<(outs), (ins i64imm:$amt), +def ADJCALLSTACKUP : I<(outs), (ins i32imm:$amt), [(WebAssemblycallseq_end timm:$amt, undef)]>; } // isCodeGenOnly = 1 multiclass CALL { - def CALL_#vt : I<(outs vt:$dst), (ins global:$callee, variable_ops), - [(set vt:$dst, (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee)))], + def CALL_#vt : I<(outs vt:$dst), (ins i32imm:$callee, variable_ops), + [(set vt:$dst, (WebAssemblycall1 (i32 imm:$callee)))], "call \t$dst, $callee">; def CALL_INDIRECT_#vt : I<(outs vt:$dst), (ins I32:$callee, variable_ops), [(set vt:$dst, (WebAssemblycall1 I32:$callee))], @@ -37,8 +39,8 @@ let Uses = [SP32, SP64], isCall = 1 in { defm : CALL; defm : CALL; - def CALL_VOID : I<(outs), (ins global:$callee, variable_ops), - [(WebAssemblycall0 (WebAssemblywrapper tglobaladdr:$callee))], + def CALL_VOID : I<(outs), (ins i32imm:$callee, variable_ops), + [(WebAssemblycall0 (i32 imm:$callee))], "call \t$callee">; def CALL_INDIRECT_VOID : I<(outs), (ins I32:$callee, variable_ops), [(WebAssemblycall0 I32:$callee)], @@ -46,3 +48,27 @@ let Uses = [SP32, SP64], isCall = 1 in { } // Uses = [SP32,SP64], isCall = 1 } // Defs = [ARGUMENTS] + +// Patterns for matching a direct call to a global address. +def : Pat<(i32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), + (CALL_I32 tglobaladdr:$callee)>; +def : Pat<(i64 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), + (CALL_I64 tglobaladdr:$callee)>; +def : Pat<(f32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), + (CALL_F32 tglobaladdr:$callee)>; +def : Pat<(f64 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), + (CALL_F64 tglobaladdr:$callee)>; +def : Pat<(WebAssemblycall0 (WebAssemblywrapper tglobaladdr:$callee)), + (CALL_VOID tglobaladdr:$callee)>; + +// Patterns for matching a direct call to an external symbol. +def : Pat<(i32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))), + (CALL_I32 texternalsym:$callee)>; +def : Pat<(i64 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))), + (CALL_I64 texternalsym:$callee)>; +def : Pat<(f32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))), + (CALL_F32 texternalsym:$callee)>; +def : Pat<(f64 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))), + (CALL_F64 texternalsym:$callee)>; +def : Pat<(WebAssemblycall0 (WebAssemblywrapper texternalsym:$callee)), + (CALL_VOID texternalsym:$callee)>; diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index caffac1bc52b..c36a45fe91d7 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -68,7 +68,6 @@ def WebAssemblywrapper : SDNode<"WebAssemblyISD::Wrapper", def bb_op : Operand; def tjumptable_op : Operand; -def global : Operand; //===----------------------------------------------------------------------===// // WebAssembly Instruction Format Definitions. 
diff --git a/test/CodeGen/WebAssembly/frem.ll b/test/CodeGen/WebAssembly/frem.ll new file mode 100644 index 000000000000..43552a9bdf3e --- /dev/null +++ b/test/CodeGen/WebAssembly/frem.ll @@ -0,0 +1,26 @@ +; RUN: llc < %s -asm-verbose=false | FileCheck %s + +; Test that the frem instruction works. + +target datalayout = "e-p:32:32-i64:64-n32:64-S128" +target triple = "wasm32-unknown-unknown" + +; CHECK-LABEL: frem32: +; CHECK-NEXT: .param f32, f32{{$}} +; CHECK-NEXT: .result f32{{$}} +; CHECK-NEXT: call $push0=, fmodf, $0, $1{{$}} +; CHECK-NEXT: return $pop0{{$}} +define float @frem32(float %x, float %y) { + %a = frem float %x, %y + ret float %a +} + +; CHECK-LABEL: frem64: +; CHECK-NEXT: .param f64, f64{{$}} +; CHECK-NEXT: .result f64{{$}} +; CHECK-NEXT: call $push0=, fmod, $0, $1{{$}} +; CHECK-NEXT: return $pop0{{$}} +define double @frem64(double %x, double %y) { + %a = frem double %x, %y + ret double %a +} diff --git a/test/CodeGen/WebAssembly/global.ll b/test/CodeGen/WebAssembly/global.ll index ffc73e3c1e35..e00d32b972ec 100644 --- a/test/CodeGen/WebAssembly/global.ll +++ b/test/CodeGen/WebAssembly/global.ll @@ -21,8 +21,7 @@ define i32 @foo() { ; CHECK-LABEL: call_memcpy: ; CHECK-NEXT: .param i32, i32, i32{{$}} ; CHECK-NEXT: .result i32{{$}} -; CHECK-NEXT: i32.const $push0=, memcpy{{$}} -; CHECK-NEXT: call_indirect $pop0, $0, $1, $2{{$}} +; CHECK-NEXT: call memcpy, $0, $1, $2{{$}} ; CHECK-NEXT: return $0{{$}} declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) define i8* @call_memcpy(i8* %p, i8* nocapture readonly %q, i32 %n) { From 39f84fda2f1b44f926a313e138721d4b14d00da7 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Sat, 5 Dec 2015 20:46:53 +0000 Subject: [PATCH 138/364] [WebAssembly] Replace the fake JUMP_TABLE instruction with a def : Pat. NFC. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254864 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/WebAssembly/WebAssemblyInstrInfo.td | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index c36a45fe91d7..dafe6c1ed64b 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -67,7 +67,6 @@ def WebAssemblywrapper : SDNode<"WebAssemblyISD::Wrapper", //===----------------------------------------------------------------------===// def bb_op : Operand; -def tjumptable_op : Operand; //===----------------------------------------------------------------------===// // WebAssembly Instruction Format Definitions. @@ -135,13 +134,11 @@ def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$dst)), (CONST_I32 tglobaladdr:$dst)>; def : Pat<(i32 (WebAssemblywrapper texternalsym:$dst)), (CONST_I32 texternalsym:$dst)>; +def : Pat<(i32 (WebAssemblywrapper tjumptable:$dst)), + (CONST_I32 tjumptable:$dst)>; let Defs = [ARGUMENTS] in { -def JUMP_TABLE : I<(outs I32:$dst), (ins tjumptable_op:$addr), - [(set I32:$dst, (WebAssemblywrapper tjumptable:$addr))], - "jump_table\t$dst, $addr">; - // Function signature and local variable declaration "instructions". def PARAM : I<(outs), (ins variable_ops), [], ".param \t">; def RESULT : I<(outs), (ins variable_ops), [], ".result \t">; From 9eb92586b28e7d951f0ac320045885a320f2a0ab Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Sat, 5 Dec 2015 22:12:39 +0000 Subject: [PATCH 139/364] [WebAssembly] Don't perform the returned-argument optimization on constants. 
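The new check skips arguments such as global addresses: there is little to gain from rewriting other uses of a Constant through the call result (a Constant's uses are not confined to the current function), so the pass now leaves such arguments alone. Condensed from the WebAssemblyOptimizeReturned.cpp hunk below:

  Value *Arg = CS.getArgOperand(i);
  if (isa<Constant>(Arg)) // constants, globals, undef, ...
    continue;             // leave their uses untouched
  // otherwise, replace dominated uses of Arg with the call's result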
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254866 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../WebAssembly/WebAssemblyOptimizeReturned.cpp | 3 +++ test/CodeGen/WebAssembly/returned.ll | 14 ++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp b/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp index dea419c5975c..4dc401a2c7cc 100644 --- a/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp +++ b/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp @@ -57,6 +57,9 @@ void OptimizeReturned::visitCallSite(CallSite CS) { if (CS.paramHasAttr(1 + i, Attribute::Returned)) { Instruction *Inst = CS.getInstruction(); Value *Arg = CS.getArgOperand(i); + // Ignore constants, globals, undef, etc. + if (isa(Arg)) + continue; // Like replaceDominatedUsesWith but using Instruction/Use dominance. for (auto UI = Arg->use_begin(), UE = Arg->use_end(); UI != UE;) { Use &U = *UI++; diff --git a/test/CodeGen/WebAssembly/returned.ll b/test/CodeGen/WebAssembly/returned.ll index d65e2a8bc3e5..9cfdc711a8a3 100644 --- a/test/CodeGen/WebAssembly/returned.ll +++ b/test/CodeGen/WebAssembly/returned.ll @@ -33,3 +33,17 @@ entry: %call = tail call i8* @memcpy(i8* %p, i8* %s, i32 %n) ret i8* %p } + +; Test that the optimization isn't performed on constant arguments. + +; CHECK-LABEL: test_constant_arg: +; CHECK-NEXT: i32.const $push0=, global{{$}} +; CHECK-NEXT: call $discard=, returns_arg, $pop0{{$}} +; CHECK-NEXT: return{{$}} +@global = external global i32 +@addr = global i32* @global +define void @test_constant_arg() { + %call = call i32* @returns_arg(i32* @global) + ret void +} +declare i32* @returns_arg(i32* returned) From 3aa1034794ef6a4c9b9196c00debbd318b36fdd3 Mon Sep 17 00:00:00 2001 From: Davide Italiano Date: Sat, 5 Dec 2015 23:36:52 +0000 Subject: [PATCH 140/364] [llvm-readobj] report_error() does not return, so we can simplify. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254868 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/llvm-readobj/llvm-readobj.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tools/llvm-readobj/llvm-readobj.cpp b/tools/llvm-readobj/llvm-readobj.cpp index 5406afff241e..2a75ababb2e8 100644 --- a/tools/llvm-readobj/llvm-readobj.cpp +++ b/tools/llvm-readobj/llvm-readobj.cpp @@ -296,10 +296,8 @@ static std::error_code createDumper(const ObjectFile *Obj, StreamWriter &Writer, static void dumpObject(const ObjectFile *Obj) { StreamWriter Writer(outs()); std::unique_ptr Dumper; - if (std::error_code EC = createDumper(Obj, Writer, Dumper)) { + if (std::error_code EC = createDumper(Obj, Writer, Dumper)) reportError(Obj->getFileName(), EC); - return; - } outs() << '\n'; outs() << "File: " << Obj->getFileName() << "\n"; @@ -414,10 +412,8 @@ static void dumpInput(StringRef File) { // Attempt to open the binary. ErrorOr> BinaryOrErr = createBinary(File); - if (std::error_code EC = BinaryOrErr.getError()) { + if (std::error_code EC = BinaryOrErr.getError()) reportError(File, EC); - return; - } Binary &Binary = *BinaryOrErr.get().getBinary(); if (Archive *Arc = dyn_cast(&Binary)) From a8231e7f59c2f43d11c387030e5e0c29670bd5a9 Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Sat, 5 Dec 2015 23:44:22 +0000 Subject: [PATCH 141/364] [InstCombine] Call getCmpPredicateForMinMax only with a valid SPF Summary: There are `SelectPatternFlavor`s that don't represent min or max idioms, and we should not be passing those to `getCmpPredicateForMinMax`. Fixes PR25745. 
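The flavors in question are SPF_ABS, SPF_NABS and SPF_UNKNOWN; only the genuine min/max flavors should be handed to getCmpPredicateForMinMax. A condensed sketch of the shape of the fix, with names as in the hunks below:

  SelectPatternResult SPR = matchSelectPattern(&SI, LHS, RHS, &CastOp);
  auto SPF = SPR.Flavor;
  if (SelectPatternResult::isMinOrMax(SPF)) {
    // cast-hoisting canonicalization; this is the path that ends up calling
    // getCmpPredicateForMinMax(), so ABS/NABS/UNKNOWN no longer reach it
  }
  if (SPF) {
    // the MAX(MAX(a, b), a), ABS(ABS(a)), NABS(NABS(a)) folds still run for
    // every recognized flavor, min/max or not
  }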
Reviewers: majnemer Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D15249 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254869 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/ValueTracking.h | 5 +++++ .../InstCombine/InstCombineSelect.cpp | 6 +++++- test/Transforms/InstCombine/pr25745.ll | 20 +++++++++++++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) create mode 100644 test/Transforms/InstCombine/pr25745.ll diff --git a/include/llvm/Analysis/ValueTracking.h b/include/llvm/Analysis/ValueTracking.h index b34d6bac1f34..eb2c000e07cd 100644 --- a/include/llvm/Analysis/ValueTracking.h +++ b/include/llvm/Analysis/ValueTracking.h @@ -412,6 +412,11 @@ namespace llvm { bool Ordered; /// When implementing this min/max pattern as /// fcmp; select, does the fcmp have to be /// ordered? + + /// \brief Return true if \p SPF is a min or a max pattern. + static bool isMinOrMax(SelectPatternFlavor SPF) { + return !(SPF == SPF_UNKNOWN || SPF == SPF_ABS || SPF == SPF_NABS); + } }; /// Pattern match integer [SU]MIN, [SU]MAX and ABS idioms, returning the kind /// and providing the out parameter results if we successfully match. diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp index 2baa131bc99c..776704d1efa9 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1070,7 +1070,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { SelectPatternResult SPR = matchSelectPattern(&SI, LHS, RHS, &CastOp); auto SPF = SPR.Flavor; - if (SPF) { + if (SelectPatternResult::isMinOrMax(SPF)) { // Canonicalize so that type casts are outside select patterns. if (LHS->getType()->getPrimitiveSizeInBits() != SI.getType()->getPrimitiveSizeInBits()) { @@ -1091,11 +1091,15 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { SI.getType()); return ReplaceInstUsesWith(SI, NewSI); } + } + if (SPF) { // MAX(MAX(a, b), a) -> MAX(a, b) // MIN(MIN(a, b), a) -> MIN(a, b) // MAX(MIN(a, b), a) -> a // MIN(MAX(a, b), a) -> a + // ABS(ABS(a)) -> ABS(a) + // NABS(NABS(a)) -> NABS(a) if (SelectPatternFlavor SPF2 = matchSelectPattern(LHS, LHS2, RHS2).Flavor) if (Instruction *R = FoldSPFofSPF(cast(LHS),SPF2,LHS2,RHS2, SI, SPF, RHS)) diff --git a/test/Transforms/InstCombine/pr25745.ll b/test/Transforms/InstCombine/pr25745.ll new file mode 100644 index 000000000000..3bf9efc92b90 --- /dev/null +++ b/test/Transforms/InstCombine/pr25745.ll @@ -0,0 +1,20 @@ +; RUN: opt -S -instcombine < %s | FileCheck %s + +; Checking for a crash + +declare void @use.i1(i1 %val) +declare void @use.i64(i64 %val) + +define i64 @f(i32 %x) { +; CHECK-LABEL: @f( + entry: + %x.wide = sext i32 %x to i64 + %minus.x = sub i32 0, %x + %minus.x.wide = sext i32 %minus.x to i64 + %c = icmp slt i32 %x, 0 + %val = select i1 %c, i64 %x.wide, i64 %minus.x.wide + call void @use.i1(i1 %c) + call void @use.i64(i64 %x.wide) + ret i64 %val +; CHECK: ret i64 %val +} From eb32659b9c004c7ad9270932a3cc7a40286b77d6 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sun, 6 Dec 2015 05:07:58 +0000 Subject: [PATCH 142/364] Minor formatting fix. 
NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254871 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/CodeGen/SelectionDAGNodes.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h index 66ed5703fe11..e8f63b2d928e 100644 --- a/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/include/llvm/CodeGen/SelectionDAGNodes.h @@ -664,7 +664,7 @@ class SDNode : public FoldingSetNode, public ilist_node { /// to which the glue operand points. Otherwise return NULL. SDNode *getGluedNode() const { if (getNumOperands() != 0 && - getOperand(getNumOperands()-1).getValueType() == MVT::Glue) + getOperand(getNumOperands()-1).getValueType() == MVT::Glue) return getOperand(getNumOperands()-1).getNode(); return nullptr; } From e6bc7d1f0d765d1e67a1c5d6c7a4f36677810c8e Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sun, 6 Dec 2015 05:08:07 +0000 Subject: [PATCH 143/364] Use make_range to reduce mentions of iterator type. NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254872 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/LazyCallGraph.h | 5 +-- include/llvm/Analysis/LoopInfo.h | 2 +- include/llvm/CodeGen/MachineBasicBlock.h | 12 +++--- include/llvm/CodeGen/MachineInstr.h | 38 ++++++++--------- include/llvm/CodeGen/MachineRegisterInfo.h | 42 +++++++------------ include/llvm/CodeGen/SelectionDAG.h | 5 +-- include/llvm/CodeGen/SelectionDAGNodes.h | 8 ++-- include/llvm/IR/CallSite.h | 4 +- include/llvm/IR/DebugInfo.h | 10 ++--- include/llvm/IR/Function.h | 4 +- include/llvm/IR/InstrTypes.h | 6 +-- include/llvm/IR/Instructions.h | 28 ++++++------- include/llvm/IR/Metadata.h | 4 +- include/llvm/IR/Module.h | 18 ++++---- include/llvm/IR/Statepoint.h | 9 ++-- include/llvm/IR/User.h | 2 +- include/llvm/IR/Value.h | 8 ++-- include/llvm/Object/Archive.h | 5 +-- include/llvm/Object/ObjectFile.h | 3 +- include/llvm/Support/Registry.h | 2 +- lib/ExecutionEngine/MCJIT/MCJIT.h | 2 +- lib/Object/MachOObjectFile.cpp | 12 ++---- .../MCTargetDesc/HexagonMCInstrInfo.cpp | 3 +- utils/TableGen/CodeGenSchedule.h | 10 ++--- utils/TableGen/CodeGenTarget.h | 2 +- 25 files changed, 105 insertions(+), 139 deletions(-) diff --git a/include/llvm/Analysis/LazyCallGraph.h b/include/llvm/Analysis/LazyCallGraph.h index 7cbc40f768eb..270a32621be7 100644 --- a/include/llvm/Analysis/LazyCallGraph.h +++ b/include/llvm/Analysis/LazyCallGraph.h @@ -235,7 +235,7 @@ class LazyCallGraph { parent_iterator parent_end() const { return ParentSCCs.end(); } iterator_range parents() const { - return iterator_range(parent_begin(), parent_end()); + return make_range(parent_begin(), parent_end()); } /// \brief Test if this SCC is a parent of \a C. 
@@ -410,8 +410,7 @@ class LazyCallGraph { } iterator_range postorder_sccs() { - return iterator_range(postorder_scc_begin(), - postorder_scc_end()); + return make_range(postorder_scc_begin(), postorder_scc_end()); } /// \brief Lookup a function in the graph which has already been scanned and diff --git a/include/llvm/Analysis/LoopInfo.h b/include/llvm/Analysis/LoopInfo.h index ac0a4b02f445..9196250233cd 100644 --- a/include/llvm/Analysis/LoopInfo.h +++ b/include/llvm/Analysis/LoopInfo.h @@ -141,7 +141,7 @@ class LoopBase { block_iterator block_begin() const { return Blocks.begin(); } block_iterator block_end() const { return Blocks.end(); } inline iterator_range blocks() const { - return iterator_range(block_begin(), block_end()); + return make_range(block_begin(), block_end()); } /// getNumBlocks - Get the number of blocks in this loop in constant time. diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h index ac87f4f901f5..57bd24ddddfe 100644 --- a/include/llvm/CodeGen/MachineBasicBlock.h +++ b/include/llvm/CodeGen/MachineBasicBlock.h @@ -272,10 +272,10 @@ class MachineBasicBlock } inline iterator_range terminators() { - return iterator_range(getFirstTerminator(), end()); + return make_range(getFirstTerminator(), end()); } inline iterator_range terminators() const { - return iterator_range(getFirstTerminator(), end()); + return make_range(getFirstTerminator(), end()); } // Machine-CFG iterators @@ -325,16 +325,16 @@ class MachineBasicBlock bool succ_empty() const { return Successors.empty(); } inline iterator_range predecessors() { - return iterator_range(pred_begin(), pred_end()); + return make_range(pred_begin(), pred_end()); } inline iterator_range predecessors() const { - return iterator_range(pred_begin(), pred_end()); + return make_range(pred_begin(), pred_end()); } inline iterator_range successors() { - return iterator_range(succ_begin(), succ_end()); + return make_range(succ_begin(), succ_end()); } inline iterator_range successors() const { - return iterator_range(succ_begin(), succ_end()); + return make_range(succ_begin(), succ_end()); } // LiveIn management methods. 
diff --git a/include/llvm/CodeGen/MachineInstr.h b/include/llvm/CodeGen/MachineInstr.h index 607e2781960f..07b1133b2a4a 100644 --- a/include/llvm/CodeGen/MachineInstr.h +++ b/include/llvm/CodeGen/MachineInstr.h @@ -296,48 +296,46 @@ class MachineInstr const_mop_iterator operands_end() const { return Operands + NumOperands; } iterator_range operands() { - return iterator_range(operands_begin(), operands_end()); + return make_range(operands_begin(), operands_end()); } iterator_range operands() const { - return iterator_range(operands_begin(), operands_end()); + return make_range(operands_begin(), operands_end()); } iterator_range explicit_operands() { - return iterator_range( - operands_begin(), operands_begin() + getNumExplicitOperands()); + return make_range(operands_begin(), + operands_begin() + getNumExplicitOperands()); } iterator_range explicit_operands() const { - return iterator_range( - operands_begin(), operands_begin() + getNumExplicitOperands()); + return make_range(operands_begin(), + operands_begin() + getNumExplicitOperands()); } iterator_range implicit_operands() { - return iterator_range(explicit_operands().end(), - operands_end()); + return make_range(explicit_operands().end(), operands_end()); } iterator_range implicit_operands() const { - return iterator_range(explicit_operands().end(), - operands_end()); + return make_range(explicit_operands().end(), operands_end()); } /// Returns a range over all explicit operands that are register definitions. /// Implicit definition are not included! iterator_range defs() { - return iterator_range( - operands_begin(), operands_begin() + getDesc().getNumDefs()); + return make_range(operands_begin(), + operands_begin() + getDesc().getNumDefs()); } /// \copydoc defs() iterator_range defs() const { - return iterator_range( - operands_begin(), operands_begin() + getDesc().getNumDefs()); + return make_range(operands_begin(), + operands_begin() + getDesc().getNumDefs()); } /// Returns a range that includes all operands that are register uses. /// This may include unrelated operands which are not register uses. iterator_range uses() { - return iterator_range( - operands_begin() + getDesc().getNumDefs(), operands_end()); + return make_range(operands_begin() + getDesc().getNumDefs(), + operands_end()); } /// \copydoc uses() iterator_range uses() const { - return iterator_range( - operands_begin() + getDesc().getNumDefs(), operands_end()); + return make_range(operands_begin() + getDesc().getNumDefs(), + operands_end()); } /// Returns the number of the operand iterator \p I points to. @@ -351,10 +349,10 @@ class MachineInstr bool memoperands_empty() const { return NumMemRefs == 0; } iterator_range memoperands() { - return iterator_range(memoperands_begin(), memoperands_end()); + return make_range(memoperands_begin(), memoperands_end()); } iterator_range memoperands() const { - return iterator_range(memoperands_begin(), memoperands_end()); + return make_range(memoperands_begin(), memoperands_end()); } /// Return true if this instruction has exactly one MachineMemOperand. 
diff --git a/include/llvm/CodeGen/MachineRegisterInfo.h b/include/llvm/CodeGen/MachineRegisterInfo.h index 0a1f62006327..04191bc1b74f 100644 --- a/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/include/llvm/CodeGen/MachineRegisterInfo.h @@ -234,7 +234,7 @@ class MachineRegisterInfo { static reg_iterator reg_end() { return reg_iterator(nullptr); } inline iterator_range reg_operands(unsigned Reg) const { - return iterator_range(reg_begin(Reg), reg_end()); + return make_range(reg_begin(Reg), reg_end()); } /// reg_instr_iterator/reg_instr_begin/reg_instr_end - Walk all defs and uses @@ -250,8 +250,7 @@ class MachineRegisterInfo { inline iterator_range reg_instructions(unsigned Reg) const { - return iterator_range(reg_instr_begin(Reg), - reg_instr_end()); + return make_range(reg_instr_begin(Reg), reg_instr_end()); } /// reg_bundle_iterator/reg_bundle_begin/reg_bundle_end - Walk all defs and uses @@ -266,8 +265,7 @@ class MachineRegisterInfo { } inline iterator_range reg_bundles(unsigned Reg) const { - return iterator_range(reg_bundle_begin(Reg), - reg_bundle_end()); + return make_range(reg_bundle_begin(Reg), reg_bundle_end()); } /// reg_empty - Return true if there are no instructions using or defining the @@ -287,8 +285,7 @@ class MachineRegisterInfo { inline iterator_range reg_nodbg_operands(unsigned Reg) const { - return iterator_range(reg_nodbg_begin(Reg), - reg_nodbg_end()); + return make_range(reg_nodbg_begin(Reg), reg_nodbg_end()); } /// reg_instr_nodbg_iterator/reg_instr_nodbg_begin/reg_instr_nodbg_end - Walk @@ -305,8 +302,7 @@ class MachineRegisterInfo { inline iterator_range reg_nodbg_instructions(unsigned Reg) const { - return iterator_range(reg_instr_nodbg_begin(Reg), - reg_instr_nodbg_end()); + return make_range(reg_instr_nodbg_begin(Reg), reg_instr_nodbg_end()); } /// reg_bundle_nodbg_iterator/reg_bundle_nodbg_begin/reg_bundle_nodbg_end - Walk @@ -323,8 +319,7 @@ class MachineRegisterInfo { inline iterator_range reg_nodbg_bundles(unsigned Reg) const { - return iterator_range(reg_bundle_nodbg_begin(Reg), - reg_bundle_nodbg_end()); + return make_range(reg_bundle_nodbg_begin(Reg), reg_bundle_nodbg_end()); } /// reg_nodbg_empty - Return true if the only instructions using or defining @@ -342,7 +337,7 @@ class MachineRegisterInfo { static def_iterator def_end() { return def_iterator(nullptr); } inline iterator_range def_operands(unsigned Reg) const { - return iterator_range(def_begin(Reg), def_end()); + return make_range(def_begin(Reg), def_end()); } /// def_instr_iterator/def_instr_begin/def_instr_end - Walk all defs of the @@ -358,8 +353,7 @@ class MachineRegisterInfo { inline iterator_range def_instructions(unsigned Reg) const { - return iterator_range(def_instr_begin(Reg), - def_instr_end()); + return make_range(def_instr_begin(Reg), def_instr_end()); } /// def_bundle_iterator/def_bundle_begin/def_bundle_end - Walk all defs of the @@ -374,8 +368,7 @@ class MachineRegisterInfo { } inline iterator_range def_bundles(unsigned Reg) const { - return iterator_range(def_bundle_begin(Reg), - def_bundle_end()); + return make_range(def_bundle_begin(Reg), def_bundle_end()); } /// def_empty - Return true if there are no instructions defining the @@ -400,7 +393,7 @@ class MachineRegisterInfo { static use_iterator use_end() { return use_iterator(nullptr); } inline iterator_range use_operands(unsigned Reg) const { - return iterator_range(use_begin(Reg), use_end()); + return make_range(use_begin(Reg), use_end()); } /// use_instr_iterator/use_instr_begin/use_instr_end - Walk all uses of the @@ 
-416,8 +409,7 @@ class MachineRegisterInfo { inline iterator_range use_instructions(unsigned Reg) const { - return iterator_range(use_instr_begin(Reg), - use_instr_end()); + return make_range(use_instr_begin(Reg), use_instr_end()); } /// use_bundle_iterator/use_bundle_begin/use_bundle_end - Walk all uses of the @@ -432,8 +424,7 @@ class MachineRegisterInfo { } inline iterator_range use_bundles(unsigned Reg) const { - return iterator_range(use_bundle_begin(Reg), - use_bundle_end()); + return make_range(use_bundle_begin(Reg), use_bundle_end()); } /// use_empty - Return true if there are no instructions using the specified @@ -462,8 +453,7 @@ class MachineRegisterInfo { inline iterator_range use_nodbg_operands(unsigned Reg) const { - return iterator_range(use_nodbg_begin(Reg), - use_nodbg_end()); + return make_range(use_nodbg_begin(Reg), use_nodbg_end()); } /// use_instr_nodbg_iterator/use_instr_nodbg_begin/use_instr_nodbg_end - Walk @@ -480,8 +470,7 @@ class MachineRegisterInfo { inline iterator_range use_nodbg_instructions(unsigned Reg) const { - return iterator_range(use_instr_nodbg_begin(Reg), - use_instr_nodbg_end()); + return make_range(use_instr_nodbg_begin(Reg), use_instr_nodbg_end()); } /// use_bundle_nodbg_iterator/use_bundle_nodbg_begin/use_bundle_nodbg_end - Walk @@ -498,8 +487,7 @@ class MachineRegisterInfo { inline iterator_range use_nodbg_bundles(unsigned Reg) const { - return iterator_range(use_bundle_nodbg_begin(Reg), - use_bundle_nodbg_end()); + return make_range(use_bundle_nodbg_begin(Reg), use_bundle_nodbg_end()); } /// use_nodbg_empty - Return true if there are no non-Debug instructions diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h index 83464a6c9785..a21e9ae881a7 100644 --- a/include/llvm/CodeGen/SelectionDAG.h +++ b/include/llvm/CodeGen/SelectionDAG.h @@ -326,11 +326,10 @@ class SelectionDAG { } iterator_range allnodes() { - return iterator_range(allnodes_begin(), allnodes_end()); + return make_range(allnodes_begin(), allnodes_end()); } iterator_range allnodes() const { - return iterator_range(allnodes_begin(), - allnodes_end()); + return make_range(allnodes_begin(), allnodes_end()); } /// Return the root tag of the SelectionDAG. diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h index e8f63b2d928e..548549ab1353 100644 --- a/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/include/llvm/CodeGen/SelectionDAGNodes.h @@ -576,10 +576,10 @@ class SDNode : public FoldingSetNode, public ilist_node { static use_iterator use_end() { return use_iterator(nullptr); } inline iterator_range uses() { - return iterator_range(use_begin(), use_end()); + return make_range(use_begin(), use_end()); } inline iterator_range uses() const { - return iterator_range(use_begin(), use_end()); + return make_range(use_begin(), use_end()); } /// Return true if there are exactly NUSES uses of the indicated value. 
@@ -651,8 +651,8 @@ class SDNode : public FoldingSetNode, public ilist_node { }; iterator_range op_values() const { - return iterator_range(value_op_iterator(op_begin()), - value_op_iterator(op_end())); + return make_range(value_op_iterator(op_begin()), + value_op_iterator(op_end())); } SDVTList getVTList() const { diff --git a/include/llvm/IR/CallSite.h b/include/llvm/IR/CallSite.h index c87f1293330b..8556dda163b8 100644 --- a/include/llvm/IR/CallSite.h +++ b/include/llvm/IR/CallSite.h @@ -158,7 +158,7 @@ class CallSiteBase { IterTy arg_end() const { return (*this)->op_end() - getArgumentEndOffset(); } iterator_range args() const { - return iterator_range(arg_begin(), arg_end()); + return make_range(arg_begin(), arg_end()); } bool arg_empty() const { return arg_end() == arg_begin(); } unsigned arg_size() const { return unsigned(arg_end() - arg_begin()); } @@ -182,7 +182,7 @@ class CallSiteBase { return (*this)->op_end() - (isCall() ? 1 : 3); } iterator_range data_ops() const { - return iterator_range(data_operands_begin(), data_operands_end()); + return make_range(data_operands_begin(), data_operands_end()); } bool data_operands_empty() const { return data_operands_end() == data_operands_begin(); diff --git a/include/llvm/IR/DebugInfo.h b/include/llvm/IR/DebugInfo.h index 59cabd326d28..4caceacbb58e 100644 --- a/include/llvm/IR/DebugInfo.h +++ b/include/llvm/IR/DebugInfo.h @@ -105,23 +105,23 @@ class DebugInfoFinder { typedef SmallVectorImpl::const_iterator scope_iterator; iterator_range compile_units() const { - return iterator_range(CUs.begin(), CUs.end()); + return make_range(CUs.begin(), CUs.end()); } iterator_range subprograms() const { - return iterator_range(SPs.begin(), SPs.end()); + return make_range(SPs.begin(), SPs.end()); } iterator_range global_variables() const { - return iterator_range(GVs.begin(), GVs.end()); + return make_range(GVs.begin(), GVs.end()); } iterator_range types() const { - return iterator_range(TYs.begin(), TYs.end()); + return make_range(TYs.begin(), TYs.end()); } iterator_range scopes() const { - return iterator_range(Scopes.begin(), Scopes.end()); + return make_range(Scopes.begin(), Scopes.end()); } unsigned compile_unit_count() const { return CUs.size(); } diff --git a/include/llvm/IR/Function.h b/include/llvm/IR/Function.h index 71822a462daa..a55ff5cb6e2b 100644 --- a/include/llvm/IR/Function.h +++ b/include/llvm/IR/Function.h @@ -493,11 +493,11 @@ class Function : public GlobalObject, public ilist_node { } iterator_range args() { - return iterator_range(arg_begin(), arg_end()); + return make_range(arg_begin(), arg_end()); } iterator_range args() const { - return iterator_range(arg_begin(), arg_end()); + return make_range(arg_begin(), arg_end()); } /// @} diff --git a/include/llvm/IR/InstrTypes.h b/include/llvm/IR/InstrTypes.h index 58bc7c1ee10a..157cb27cefbb 100644 --- a/include/llvm/IR/InstrTypes.h +++ b/include/llvm/IR/InstrTypes.h @@ -1479,14 +1479,12 @@ template class OperandBundleUser { /// \brief Return the range [\p bundle_op_info_begin, \p bundle_op_info_end). iterator_range bundle_op_infos() { - return iterator_range(bundle_op_info_begin(), - bundle_op_info_end()); + return make_range(bundle_op_info_begin(), bundle_op_info_end()); } /// \brief Return the range [\p bundle_op_info_begin, \p bundle_op_info_end). 
iterator_range bundle_op_infos() const { - return iterator_range(bundle_op_info_begin(), - bundle_op_info_end()); + return make_range(bundle_op_info_begin(), bundle_op_info_end()); } /// \brief Populate the BundleOpInfo instances and the Use& vector from \p diff --git a/include/llvm/IR/Instructions.h b/include/llvm/IR/Instructions.h index 5119749ba73c..ae06a5f641a1 100644 --- a/include/llvm/IR/Instructions.h +++ b/include/llvm/IR/Instructions.h @@ -1547,14 +1547,12 @@ class CallInst : public Instruction, iterator_range arg_operands() { // The last operand in the op list is the callee - it's not one of the args // so we don't want to iterate over it. - return iterator_range( - op_begin(), op_end() - getNumTotalBundleOperands() - 1); + return make_range(op_begin(), op_end() - getNumTotalBundleOperands() - 1); } /// arg_operands - iteration adapter for range-for loops. iterator_range arg_operands() const { - return iterator_range( - op_begin(), op_end() - getNumTotalBundleOperands() - 1); + return make_range(op_begin(), op_end() - getNumTotalBundleOperands() - 1); } /// \brief Wrappers for getting the \c Use of a call argument. @@ -2213,7 +2211,7 @@ class ExtractValueInst : public UnaryInstruction { inline idx_iterator idx_begin() const { return Indices.begin(); } inline idx_iterator idx_end() const { return Indices.end(); } inline iterator_range indices() const { - return iterator_range(idx_begin(), idx_end()); + return make_range(idx_begin(), idx_end()); } Value *getAggregateOperand() { @@ -2330,7 +2328,7 @@ class InsertValueInst : public Instruction { inline idx_iterator idx_begin() const { return Indices.begin(); } inline idx_iterator idx_end() const { return Indices.end(); } inline iterator_range indices() const { - return iterator_range(idx_begin(), idx_end()); + return make_range(idx_begin(), idx_end()); } Value *getAggregateOperand() { @@ -3105,12 +3103,12 @@ class SwitchInst : public TerminatorInst { /// cases - iteration adapter for range-for loops. iterator_range cases() { - return iterator_range(case_begin(), case_end()); + return make_range(case_begin(), case_end()); } /// cases - iteration adapter for range-for loops. iterator_range cases() const { - return iterator_range(case_begin(), case_end()); + return make_range(case_begin(), case_end()); } /// Returns an iterator that points to the default case. @@ -3454,14 +3452,12 @@ class InvokeInst : public TerminatorInst, /// arg_operands - iteration adapter for range-for loops. iterator_range arg_operands() { - return iterator_range( - op_begin(), op_end() - getNumTotalBundleOperands() - 3); + return make_range(op_begin(), op_end() - getNumTotalBundleOperands() - 3); } /// arg_operands - iteration adapter for range-for loops. iterator_range arg_operands() const { - return iterator_range( - op_begin(), op_end() - getNumTotalBundleOperands() - 3); + return make_range(op_begin(), op_end() - getNumTotalBundleOperands() - 3); } /// \brief Wrappers for getting the \c Use of a invoke argument. @@ -3915,12 +3911,12 @@ class CatchPadInst : public TerminatorInst { /// arg_operands - iteration adapter for range-for loops. iterator_range arg_operands() { - return iterator_range(op_begin(), op_end() - 2); + return make_range(op_begin(), op_end() - 2); } /// arg_operands - iteration adapter for range-for loops. iterator_range arg_operands() const { - return iterator_range(op_begin(), op_end() - 2); + return make_range(op_begin(), op_end() - 2); } /// \brief Wrappers for getting the \c Use of a catchpad argument. 
@@ -4039,12 +4035,12 @@ class TerminatePadInst : public TerminatorInst { /// arg_operands - iteration adapter for range-for loops. iterator_range arg_operands() { - return iterator_range(op_begin(), arg_end()); + return make_range(op_begin(), arg_end()); } /// arg_operands - iteration adapter for range-for loops. iterator_range arg_operands() const { - return iterator_range(op_begin(), arg_end()); + return make_range(op_begin(), arg_end()); } /// \brief Wrappers for getting the \c Use of a terminatepad argument. diff --git a/include/llvm/IR/Metadata.h b/include/llvm/IR/Metadata.h index 8805cec1471b..276fa7d11885 100644 --- a/include/llvm/IR/Metadata.h +++ b/include/llvm/IR/Metadata.h @@ -1210,10 +1210,10 @@ class NamedMDNode : public ilist_node { const_op_iterator op_end() const { return const_op_iterator(this, getNumOperands()); } inline iterator_range operands() { - return iterator_range(op_begin(), op_end()); + return make_range(op_begin(), op_end()); } inline iterator_range operands() const { - return iterator_range(op_begin(), op_end()); + return make_range(op_begin(), op_end()); } }; diff --git a/include/llvm/IR/Module.h b/include/llvm/IR/Module.h index 6cf75e747e06..2378b6d83d87 100644 --- a/include/llvm/IR/Module.h +++ b/include/llvm/IR/Module.h @@ -515,10 +515,10 @@ class Module { bool global_empty() const { return GlobalList.empty(); } iterator_range globals() { - return iterator_range(global_begin(), global_end()); + return make_range(global_begin(), global_end()); } iterator_range globals() const { - return iterator_range(global_begin(), global_end()); + return make_range(global_begin(), global_end()); } /// @} @@ -537,10 +537,10 @@ class Module { bool empty() const { return FunctionList.empty(); } iterator_range functions() { - return iterator_range(begin(), end()); + return make_range(begin(), end()); } iterator_range functions() const { - return iterator_range(begin(), end()); + return make_range(begin(), end()); } /// @} @@ -555,10 +555,10 @@ class Module { bool alias_empty() const { return AliasList.empty(); } iterator_range aliases() { - return iterator_range(alias_begin(), alias_end()); + return make_range(alias_begin(), alias_end()); } iterator_range aliases() const { - return iterator_range(alias_begin(), alias_end()); + return make_range(alias_begin(), alias_end()); } /// @} @@ -579,12 +579,10 @@ class Module { bool named_metadata_empty() const { return NamedMDList.empty(); } iterator_range named_metadata() { - return iterator_range(named_metadata_begin(), - named_metadata_end()); + return make_range(named_metadata_begin(), named_metadata_end()); } iterator_range named_metadata() const { - return iterator_range(named_metadata_begin(), - named_metadata_end()); + return make_range(named_metadata_begin(), named_metadata_end()); } /// Destroy ConstantArrays in LLVMContext if they are not used. diff --git a/include/llvm/IR/Statepoint.h b/include/llvm/IR/Statepoint.h index efe58e3e5a15..21b98a97a83c 100644 --- a/include/llvm/IR/Statepoint.h +++ b/include/llvm/IR/Statepoint.h @@ -173,7 +173,7 @@ class StatepointBase { /// range adapter for call arguments iterator_range call_args() const { - return iterator_range(arg_begin(), arg_end()); + return make_range(arg_begin(), arg_end()); } /// \brief Return true if the call or the callee has the given attribute. 
@@ -201,8 +201,7 @@ class StatepointBase { /// range adapter for GC transition arguments iterator_range gc_transition_args() const { - return iterator_range(gc_transition_args_begin(), - gc_transition_args_end()); + return make_range(gc_transition_args_begin(), gc_transition_args_end()); } /// Number of additional arguments excluding those intended @@ -225,7 +224,7 @@ class StatepointBase { /// range adapter for vm state arguments iterator_range vm_state_args() const { - return iterator_range(vm_state_begin(), vm_state_end()); + return make_range(vm_state_begin(), vm_state_end()); } typename CallSiteTy::arg_iterator gc_args_begin() const { @@ -241,7 +240,7 @@ class StatepointBase { /// range adapter for gc arguments iterator_range gc_args() const { - return iterator_range(gc_args_begin(), gc_args_end()); + return make_range(gc_args_begin(), gc_args_end()); } /// Get list of all gc reloactes linked to this statepoint diff --git a/include/llvm/IR/User.h b/include/llvm/IR/User.h index 78a3b43c86d2..639dc5c01c8c 100644 --- a/include/llvm/IR/User.h +++ b/include/llvm/IR/User.h @@ -233,7 +233,7 @@ class User : public Value { return value_op_iterator(op_end()); } iterator_range operand_values() { - return iterator_range(value_op_begin(), value_op_end()); + return make_range(value_op_begin(), value_op_end()); } /// \brief Drop all references to operands. diff --git a/include/llvm/IR/Value.h b/include/llvm/IR/Value.h index 9a87a7178866..7f11ba3d1f6d 100644 --- a/include/llvm/IR/Value.h +++ b/include/llvm/IR/Value.h @@ -283,10 +283,10 @@ class Value { use_iterator use_end() { return use_iterator(); } const_use_iterator use_end() const { return const_use_iterator(); } iterator_range uses() { - return iterator_range(use_begin(), use_end()); + return make_range(use_begin(), use_end()); } iterator_range uses() const { - return iterator_range(use_begin(), use_end()); + return make_range(use_begin(), use_end()); } bool user_empty() const { return UseList == nullptr; } @@ -300,10 +300,10 @@ class Value { User *user_back() { return *user_begin(); } const User *user_back() const { return *user_begin(); } iterator_range users() { - return iterator_range(user_begin(), user_end()); + return make_range(user_begin(), user_end()); } iterator_range users() const { - return iterator_range(user_begin(), user_end()); + return make_range(user_begin(), user_end()); } /// \brief Return true if there is exactly one user of this value. diff --git a/include/llvm/Object/Archive.h b/include/llvm/Object/Archive.h index a68f200ce3fb..8dd042a2533f 100644 --- a/include/llvm/Object/Archive.h +++ b/include/llvm/Object/Archive.h @@ -191,14 +191,13 @@ class Archive : public Binary { child_iterator child_begin(bool SkipInternal = true) const; child_iterator child_end() const; iterator_range children(bool SkipInternal = true) const { - return iterator_range(child_begin(SkipInternal), - child_end()); + return make_range(child_begin(SkipInternal), child_end()); } symbol_iterator symbol_begin() const; symbol_iterator symbol_end() const; iterator_range symbols() const { - return iterator_range(symbol_begin(), symbol_end()); + return make_range(symbol_begin(), symbol_end()); } // Cast methods. 
diff --git a/include/llvm/Object/ObjectFile.h b/include/llvm/Object/ObjectFile.h index 08131908e067..ce0c891ee0c2 100644 --- a/include/llvm/Object/ObjectFile.h +++ b/include/llvm/Object/ObjectFile.h @@ -100,8 +100,7 @@ class SectionRef { relocation_iterator relocation_begin() const; relocation_iterator relocation_end() const; iterator_range relocations() const { - return iterator_range(relocation_begin(), - relocation_end()); + return make_range(relocation_begin(), relocation_end()); } section_iterator getRelocatedSection() const; diff --git a/include/llvm/Support/Registry.h b/include/llvm/Support/Registry.h index 2cb8f3c719fa..bbea97b289a6 100644 --- a/include/llvm/Support/Registry.h +++ b/include/llvm/Support/Registry.h @@ -119,7 +119,7 @@ namespace llvm { static iterator end() { return iterator(nullptr); } static iterator_range entries() { - return iterator_range(begin(), end()); + return make_range(begin(), end()); } /// Abstract base class for registry listeners, which are informed when new diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.h b/lib/ExecutionEngine/MCJIT/MCJIT.h index f27aa39f2d5d..3c9d2fd50336 100644 --- a/lib/ExecutionEngine/MCJIT/MCJIT.h +++ b/lib/ExecutionEngine/MCJIT/MCJIT.h @@ -86,7 +86,7 @@ class MCJIT : public ExecutionEngine { ModulePtrSet::iterator begin_added() { return AddedModules.begin(); } ModulePtrSet::iterator end_added() { return AddedModules.end(); } iterator_range added() { - return iterator_range(begin_added(), end_added()); + return make_range(begin_added(), end_added()); } ModulePtrSet::iterator begin_loaded() { return LoadedModules.begin(); } diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp index e34c86542ab3..d1f79b225ee4 100644 --- a/lib/Object/MachOObjectFile.cpp +++ b/lib/Object/MachOObjectFile.cpp @@ -1403,8 +1403,7 @@ MachOObjectFile::exports(ArrayRef Trie) { ExportEntry Finish(Trie); Finish.moveToEnd(); - return iterator_range(export_iterator(Start), - export_iterator(Finish)); + return make_range(export_iterator(Start), export_iterator(Finish)); } iterator_range MachOObjectFile::exports() const { @@ -1574,8 +1573,7 @@ MachOObjectFile::rebaseTable(ArrayRef Opcodes, bool is64) { MachORebaseEntry Finish(Opcodes, is64); Finish.moveToEnd(); - return iterator_range(rebase_iterator(Start), - rebase_iterator(Finish)); + return make_range(rebase_iterator(Start), rebase_iterator(Finish)); } iterator_range MachOObjectFile::rebaseTable() const { @@ -1826,8 +1824,7 @@ MachOObjectFile::bindTable(ArrayRef Opcodes, bool is64, MachOBindEntry Finish(Opcodes, is64, BKind); Finish.moveToEnd(); - return iterator_range(bind_iterator(Start), - bind_iterator(Finish)); + return make_range(bind_iterator(Start), bind_iterator(Finish)); } iterator_range MachOObjectFile::bindTable() const { @@ -1857,8 +1854,7 @@ MachOObjectFile::end_load_commands() const { iterator_range MachOObjectFile::load_commands() const { - return iterator_range(begin_load_commands(), - end_load_commands()); + return make_range(begin_load_commands(), end_load_commands()); } StringRef diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp index 7adc0ba51da3..e6842076db2a 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp @@ -45,8 +45,7 @@ void HexagonMCInstrInfo::addConstExtender(MCContext &Context, iterator_range HexagonMCInstrInfo::bundleInstructions(MCInst const &MCI) { assert(isBundle(MCI)); - return iterator_range( - 
MCI.begin() + bundleInstructionsOffset, MCI.end()); + return make_range(MCI.begin() + bundleInstructionsOffset, MCI.end()); } size_t HexagonMCInstrInfo::bundleSize(MCInst const &MCI) { diff --git a/utils/TableGen/CodeGenSchedule.h b/utils/TableGen/CodeGenSchedule.h index 6c34f51900cb..f5c50c992a92 100644 --- a/utils/TableGen/CodeGenSchedule.h +++ b/utils/TableGen/CodeGenSchedule.h @@ -257,18 +257,16 @@ class CodeGenSchedModels { class_iterator classes_end() { return SchedClasses.end(); } const_class_iterator classes_end() const { return SchedClasses.end(); } iterator_range classes() { - return iterator_range(classes_begin(), classes_end()); + return make_range(classes_begin(), classes_end()); } iterator_range classes() const { - return iterator_range(classes_begin(), classes_end()); + return make_range(classes_begin(), classes_end()); } iterator_range explicit_classes() { - return iterator_range( - classes_begin(), classes_begin() + NumInstrSchedClasses); + return make_range(classes_begin(), classes_begin() + NumInstrSchedClasses); } iterator_range explicit_classes() const { - return iterator_range( - classes_begin(), classes_begin() + NumInstrSchedClasses); + return make_range(classes_begin(), classes_begin() + NumInstrSchedClasses); } Record *getModelOrItinDef(Record *ProcDef) const { diff --git a/utils/TableGen/CodeGenTarget.h b/utils/TableGen/CodeGenTarget.h index 24b38514260c..cf4a0bbe5bd9 100644 --- a/utils/TableGen/CodeGenTarget.h +++ b/utils/TableGen/CodeGenTarget.h @@ -173,7 +173,7 @@ class CodeGenTarget { inst_iterator inst_begin() const{return getInstructionsByEnumValue().begin();} inst_iterator inst_end() const { return getInstructionsByEnumValue().end(); } iterator_range instructions() const { - return iterator_range(inst_begin(), inst_end()); + return make_range(inst_begin(), inst_end()); } From e407ee0520400e5fa7316aed15caf788990de2af Mon Sep 17 00:00:00 2001 From: Igor Breger Date: Sun, 6 Dec 2015 11:35:18 +0000 Subject: [PATCH 144/364] AVX512: support AVX512BW Intrinsic in 32bit mode. Differential Revision: http://reviews.llvm.org/D15076 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254873 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 128 +-- test/CodeGen/X86/avx512bw-intrinsics.ll | 1064 ++++++++++++++++++++++- 2 files changed, 1139 insertions(+), 53 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index c07bca8fe52a..364a8c260ba1 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1710,8 +1710,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); - if (!Subtarget->is64Bit()) + if (!Subtarget->is64Bit()) { setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); + } // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't // handle type legalization for these operations here. 
@@ -15983,58 +15985,83 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); } +/// \brief Return Mask with the necessary casting or extending +/// for \p Mask according to \p MaskVT when lowering masking intrinsics +static SDValue getMaskNode(SDValue Mask, MVT MaskVT, + const X86Subtarget *Subtarget, + SelectionDAG &DAG, SDLoc dl) { + + if (MaskVT.bitsGT(Mask.getSimpleValueType())) { + // Mask should be extended + Mask = DAG.getNode(ISD::ANY_EXTEND, dl, + MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask); + } + + if (Mask.getSimpleValueType() == MVT::i64 && Subtarget->is32Bit()) { + assert(MaskVT == MVT::v64i1 && "Unexpected mask VT!"); + assert(Subtarget->hasBWI() && "Expected AVX512BW target!"); + // In case 32bit mode, bitcast i64 is illegal, extend/split it. + SDValue Lo, Hi; + Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, + DAG.getConstant(0, dl, MVT::i32)); + Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, + DAG.getConstant(1, dl, MVT::i32)); + + Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v32i1, Lo); + Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v32i1, Hi); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Hi, Lo); + + } else { + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); + // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements + // are extracted by EXTRACT_SUBVECTOR. + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getBitcast(BitcastVT, Mask), + DAG.getIntPtrConstant(0, dl)); + } +} + /// \brief Return (and \p Op, \p Mask) for compare instructions or /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the /// necessary casting or extending for \p Mask when lowering masking intrinsics static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, - SDValue PreservedSrc, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - MVT VT = Op.getSimpleValueType(); - MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); - SDValue VMask; - unsigned OpcodeSelect = ISD::VSELECT; - SDLoc dl(Op); + SDValue PreservedSrc, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + unsigned OpcodeSelect = ISD::VSELECT; + SDLoc dl(Op); - if (isAllOnesConstant(Mask)) - return Op; + if (isAllOnesConstant(Mask)) + return Op; - if (MaskVT.bitsGT(Mask.getSimpleValueType())) { - MVT newMaskVT = MVT::getIntegerVT(MaskVT.getSizeInBits()); - VMask = DAG.getBitcast(MaskVT, - DAG.getNode(ISD::ANY_EXTEND, dl, newMaskVT, Mask)); - } else { - MVT BitcastVT = MVT::getVectorVT(MVT::i1, - Mask.getSimpleValueType().getSizeInBits()); - // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements - // are extracted by EXTRACT_SUBVECTOR. 
- VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getBitcast(BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); - } + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); - switch (Op.getOpcode()) { - default: break; - case X86ISD::PCMPEQM: - case X86ISD::PCMPGTM: - case X86ISD::CMPM: - case X86ISD::CMPMU: - return DAG.getNode(ISD::AND, dl, VT, Op, VMask); - case X86ISD::VFPCLASS: + switch (Op.getOpcode()) { + default: break; + case X86ISD::PCMPEQM: + case X86ISD::PCMPGTM: + case X86ISD::CMPM: + case X86ISD::CMPMU: + return DAG.getNode(ISD::AND, dl, VT, Op, VMask); + case X86ISD::VFPCLASS: case X86ISD::VFPCLASSS: - return DAG.getNode(ISD::OR, dl, VT, Op, VMask); - case X86ISD::VTRUNC: - case X86ISD::VTRUNCS: - case X86ISD::VTRUNCUS: - // We can't use ISD::VSELECT here because it is not always "Legal" - // for the destination type. For example vpmovqb require only AVX512 - // and vselect that can operate on byte element type require BWI - OpcodeSelect = X86ISD::SELECT; - break; - } - if (PreservedSrc.getOpcode() == ISD::UNDEF) - PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); - return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc); + return DAG.getNode(ISD::OR, dl, VT, Op, VMask); + case X86ISD::VTRUNC: + case X86ISD::VTRUNCS: + case X86ISD::VTRUNCUS: + // We can't use ISD::VSELECT here because it is not always "Legal" + // for the destination type. For example vpmovqb require only AVX512 + // and vselect that can operate on byte element type require BWI + OpcodeSelect = X86ISD::SELECT; + break; + } + if (PreservedSrc.getOpcode() == ISD::UNDEF) + PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc); } /// \brief Creates an SDNode for a predicated scalar operation. 
@@ -16569,12 +16596,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue Mask = Op.getOperand(3); MVT VT = Op.getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); - MVT BitcastVT = MVT::getVectorVT(MVT::i1, - Mask.getSimpleValueType().getSizeInBits()); - SDLoc dl(Op); - SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getBitcast(BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1), Op.getOperand(2)); } @@ -19978,6 +20000,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results); } } + case ISD::INTRINSIC_WO_CHAIN: { + Results.push_back(LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG)); + return; + } case ISD::READCYCLECOUNTER: { return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, Results); diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll index 6b032e0e6d78..0eba131a67c4 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512BW - +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32 define i64 @test_pcmpeq_b(<64 x i8> %a, <64 x i8> %b) { ; AVX512BW-LABEL: test_pcmpeq_b: @@ -8,6 +8,18 @@ define i64 @test_pcmpeq_b(<64 x i8> %a, <64 x i8> %b) { ; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: kmovq %k0, %rax ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_pcmpeq_b: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: subl $12, %esp +; AVX512F-32-NEXT: .Ltmp0: +; AVX512F-32-NEXT: .cfi_def_cfa_offset 16 +; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovq %k0, (%esp) +; AVX512F-32-NEXT: movl (%esp), %eax +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: addl $12, %esp +; AVX512F-32-NEXT: retl %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1) ret i64 %res } @@ -19,6 +31,21 @@ define i64 @test_mask_pcmpeq_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) { ; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} ; AVX512BW-NEXT: kmovq %k0, %rax ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_pcmpeq_b: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: subl $12, %esp +; AVX512F-32-NEXT: .Ltmp1: +; AVX512F-32-NEXT: .cfi_def_cfa_offset 16 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 +; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovq %k0, (%esp) +; AVX512F-32-NEXT: movl (%esp), %eax +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: addl $12, %esp +; AVX512F-32-NEXT: retl %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask) ret i64 %res } @@ -31,6 +58,12 @@ define i32 @test_pcmpeq_w(<32 x i16> %a, <32 x i16> %b) { ; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_pcmpeq_w: +; 
AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: retl %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1) ret i32 %res } @@ -42,6 +75,13 @@ define i32 @test_mask_pcmpeq_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) { ; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_pcmpeq_w: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: retl %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask) ret i32 %res } @@ -54,6 +94,18 @@ define i64 @test_pcmpgt_b(<64 x i8> %a, <64 x i8> %b) { ; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: kmovq %k0, %rax ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_pcmpgt_b: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: subl $12, %esp +; AVX512F-32-NEXT: .Ltmp2: +; AVX512F-32-NEXT: .cfi_def_cfa_offset 16 +; AVX512F-32-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovq %k0, (%esp) +; AVX512F-32-NEXT: movl (%esp), %eax +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: addl $12, %esp +; AVX512F-32-NEXT: retl %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1) ret i64 %res } @@ -65,6 +117,21 @@ define i64 @test_mask_pcmpgt_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) { ; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1} ; AVX512BW-NEXT: kmovq %k0, %rax ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_pcmpgt_b: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: subl $12, %esp +; AVX512F-32-NEXT: .Ltmp3: +; AVX512F-32-NEXT: .cfi_def_cfa_offset 16 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 +; AVX512F-32-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovq %k0, (%esp) +; AVX512F-32-NEXT: movl (%esp), %eax +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: addl $12, %esp +; AVX512F-32-NEXT: retl %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask) ret i64 %res } @@ -77,6 +144,12 @@ define i32 @test_pcmpgt_w(<32 x i16> %a, <32 x i16> %b) { ; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_pcmpgt_w: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: retl %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1) ret i32 %res } @@ -88,6 +161,13 @@ define i32 @test_mask_pcmpgt_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) { ; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_pcmpgt_w: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: retl %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask) ret i32 %res } @@ -121,6 +201,46 @@ define i64 @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) { ; AVX512BW-NEXT: kmovq %k0, %rax ; AVX512BW-NEXT: addq %rdx, %rax ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_cmp_b_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: subl $68, %esp +; 
AVX512F-32-NEXT: .Ltmp4: +; AVX512F-32-NEXT: .cfi_def_cfa_offset 72 +; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: vpcmpltb %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: vpcmpleb %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: vpcmpunordb %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: vpcmpnleb %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: vpcmpordb %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovq %k0, (%esp) +; AVX512F-32-NEXT: addl (%esp), %eax +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: addl $68, %esp +; AVX512F-32-NEXT: retl %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1) %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1) %ret1 = add i64 %res0, %res1 @@ -167,6 +287,49 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512BW-NEXT: kmovq %k0, %rax ; AVX512BW-NEXT: addq %rdx, %rax ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_cmp_b_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: subl $68, %esp +; AVX512F-32-NEXT: .Ltmp5: +; AVX512F-32-NEXT: .cfi_def_cfa_offset 72 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 +; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovq %k0, (%esp) +; AVX512F-32-NEXT: movl (%esp), %eax +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: vpcmpltb %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: vpcmpleb %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: vpcmpunordb %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: vpcmpnleb 
%zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: vpcmpordb %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: addl $68, %esp +; AVX512F-32-NEXT: retl %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask) %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask) %ret1 = add i64 %res0, %res1 @@ -214,6 +377,46 @@ define i64 @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) { ; AVX512BW-NEXT: kmovq %k0, %rax ; AVX512BW-NEXT: addq %rdx, %rax ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_ucmp_b_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: subl $68, %esp +; AVX512F-32-NEXT: .Ltmp6: +; AVX512F-32-NEXT: .cfi_def_cfa_offset 72 +; AVX512F-32-NEXT: vpcmpequb %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: vpcmpltub %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: vpcmpleub %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: vpcmpunordub %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: vpcmpnequb %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: vpcmpordub %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovq %k0, (%esp) +; AVX512F-32-NEXT: addl (%esp), %eax +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: addl $68, %esp +; AVX512F-32-NEXT: retl %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1) %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1) %ret1 = add i64 %res0, %res1 @@ -260,6 +463,49 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512BW-NEXT: kmovq %k0, %rax ; AVX512BW-NEXT: addq %rdx, %rax ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_x86_avx512_ucmp_b_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: subl $68, %esp +; AVX512F-32-NEXT: .Ltmp7: +; AVX512F-32-NEXT: .cfi_def_cfa_offset 72 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 +; AVX512F-32-NEXT: vpcmpequb %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovq %k0, (%esp) +; AVX512F-32-NEXT: movl (%esp), %eax +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: vpcmpltub %zmm1, %zmm0, %k0 {%k1} +; 
AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: vpcmpleub %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: vpcmpunordub %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: vpcmpnequb %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: vpcmpordub %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: addl $68, %esp +; AVX512F-32-NEXT: retl %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask) %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask) %ret1 = add i64 %res0, %res1 @@ -307,6 +553,33 @@ define i32 @test_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1) { ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: addl %edx, %eax ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_cmp_w_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: vpcmpltw %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovd %k0, %ecx +; AVX512F-32-NEXT: addl %eax, %ecx +; AVX512F-32-NEXT: vpcmplew %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: addl %ecx, %eax +; AVX512F-32-NEXT: vpcmpunordw %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovd %k0, %ecx +; AVX512F-32-NEXT: addl %eax, %ecx +; AVX512F-32-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: addl %ecx, %eax +; AVX512F-32-NEXT: vpcmpnltw %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovd %k0, %ecx +; AVX512F-32-NEXT: addl %eax, %ecx +; AVX512F-32-NEXT: vpcmpnlew %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovd %k0, %edx +; AVX512F-32-NEXT: addl %ecx, %edx +; AVX512F-32-NEXT: vpcmpordw %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: addl %edx, %eax +; AVX512F-32-NEXT: retl %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1) %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1) %ret1 = add i32 %res0, %res1 @@ -353,6 +626,34 @@ define i32 @test_mask_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: addl %edx, %eax ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_cmp_w_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: vpcmpltw %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovd %k0, %ecx +; AVX512F-32-NEXT: addl %eax, %ecx +; 
AVX512F-32-NEXT: vpcmplew %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: addl %ecx, %eax +; AVX512F-32-NEXT: vpcmpunordw %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovd %k0, %ecx +; AVX512F-32-NEXT: addl %eax, %ecx +; AVX512F-32-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: addl %ecx, %eax +; AVX512F-32-NEXT: vpcmpnltw %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovd %k0, %ecx +; AVX512F-32-NEXT: addl %eax, %ecx +; AVX512F-32-NEXT: vpcmpnlew %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovd %k0, %edx +; AVX512F-32-NEXT: addl %ecx, %edx +; AVX512F-32-NEXT: vpcmpordw %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: addl %edx, %eax +; AVX512F-32-NEXT: retl %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask) %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask) %ret1 = add i32 %res0, %res1 @@ -400,6 +701,33 @@ define i32 @test_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1) { ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: addl %edx, %eax ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_ucmp_w_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vpcmpequw %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovd %k0, %ecx +; AVX512F-32-NEXT: addl %eax, %ecx +; AVX512F-32-NEXT: vpcmpleuw %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: addl %ecx, %eax +; AVX512F-32-NEXT: vpcmpunorduw %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovd %k0, %ecx +; AVX512F-32-NEXT: addl %eax, %ecx +; AVX512F-32-NEXT: vpcmpnequw %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: addl %ecx, %eax +; AVX512F-32-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovd %k0, %ecx +; AVX512F-32-NEXT: addl %eax, %ecx +; AVX512F-32-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovd %k0, %edx +; AVX512F-32-NEXT: addl %ecx, %edx +; AVX512F-32-NEXT: vpcmporduw %zmm1, %zmm0, %k0 +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: addl %edx, %eax +; AVX512F-32-NEXT: retl %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1) %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1) %ret1 = add i32 %res0, %res1 @@ -446,6 +774,34 @@ define i32 @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: addl %edx, %eax ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_ucmp_w_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpcmpequw %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovd %k0, %ecx +; AVX512F-32-NEXT: addl %eax, %ecx +; AVX512F-32-NEXT: vpcmpleuw %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: addl %ecx, %eax +; AVX512F-32-NEXT: vpcmpunorduw %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovd %k0, %ecx +; AVX512F-32-NEXT: addl %eax, %ecx +; AVX512F-32-NEXT: vpcmpnequw %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: addl %ecx, %eax +; AVX512F-32-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovd %k0, %ecx +; AVX512F-32-NEXT: addl %eax, %ecx +; AVX512F-32-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovd %k0, %edx +; 
AVX512F-32-NEXT: addl %ecx, %edx +; AVX512F-32-NEXT: vpcmporduw %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: addl %edx, %eax +; AVX512F-32-NEXT: retl %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask) %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask) %ret1 = add i32 %res0, %res1 @@ -474,6 +830,12 @@ define <32 x i16> @test_x86_mask_blend_w_512(i32 %mask, <32 x i16> %a1, <32 x i1 ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vpblendmw %zmm1, %zmm0, %zmm0 {%k1} ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_x86_mask_blend_w_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpblendmw %zmm1, %zmm0, %zmm0 {%k1} +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.blend.w.512(<32 x i16> %a1, <32 x i16> %a2, i32 %mask) ; <<32 x i16>> [#uses=1] ret <32 x i16> %res } @@ -485,6 +847,14 @@ define <64 x i8> @test_x86_mask_blend_b_512(i64 %a0, <64 x i8> %a1, <64 x i8> %a ; AVX512BW-NEXT: kmovq %rdi, %k1 ; AVX512BW-NEXT: vpblendmb %zmm1, %zmm0, %zmm0 {%k1} ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_x86_mask_blend_b_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 +; AVX512F-32-NEXT: vpblendmb %zmm1, %zmm0, %zmm0 {%k1} +; AVX512F-32-NEXT: retl %res = call <64 x i8> @llvm.x86.avx512.mask.blend.b.512(<64 x i8> %a1, <64 x i8> %a2, i64 %a0) ; <<64 x i8>> [#uses=1] ret <64 x i8> %res } @@ -494,6 +864,11 @@ define <32 x i16> @test_mask_packs_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) { ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_packs_epi32_rr_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %res } @@ -505,6 +880,13 @@ define <32 x i16> @test_mask_packs_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, < ; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm2 {%k1} ; AVX512BW-NEXT: vmovaps %zmm2, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_packs_epi32_rrk_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpackssdw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vmovaps %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res } @@ -515,6 +897,12 @@ define <32 x i16> @test_mask_packs_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_packs_epi32_rrkz_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask) ret <32 x i16> %res } @@ -524,6 +912,12 @@ define <32 x i16> @test_mask_packs_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_ ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: vpackssdw (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_packs_epi32_rm_512: +; AVX512F-32: # 
BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpackssdw (%eax), %zmm0, %zmm0 +; AVX512F-32-NEXT: retl %b = load <16 x i32>, <16 x i32>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %res @@ -536,6 +930,14 @@ define <32 x i16> @test_mask_packs_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr ; AVX512BW-NEXT: vpackssdw (%rdi), %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_packs_epi32_rmk_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpackssdw (%eax), %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %b = load <16 x i32>, <16 x i32>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res @@ -547,6 +949,13 @@ define <32 x i16> @test_mask_packs_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %pt ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vpackssdw (%rdi), %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_packs_epi32_rmkz_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpackssdw (%eax), %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl %b = load <16 x i32>, <16 x i32>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask) ret <32 x i16> %res @@ -557,6 +966,12 @@ define <32 x i16> @test_mask_packs_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) { ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: vpackssdw (%rdi){1to16}, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_packs_epi32_rmb_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpackssdw (%eax){1to16}, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl %q = load i32, i32* %ptr_b %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer @@ -571,6 +986,14 @@ define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <3 ; AVX512BW-NEXT: vpackssdw (%rdi){1to16}, %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_packs_epi32_rmbk_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpackssdw (%eax){1to16}, %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %q = load i32, i32* %ptr_b %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer @@ -584,6 +1007,13 @@ define <32 x i16> @test_mask_packs_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vpackssdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_packs_epi32_rmbkz_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpackssdw (%eax){1to16}, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl %q = load i32, i32* %ptr_b %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 %b = shufflevector 
<16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer @@ -598,6 +1028,11 @@ define <64 x i8> @test_mask_packs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: vpacksswb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_packs_epi16_rr_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vpacksswb %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 -1) ret <64 x i8> %res } @@ -609,6 +1044,15 @@ define <64 x i8> @test_mask_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <6 ; AVX512BW-NEXT: vpacksswb %zmm1, %zmm0, %zmm2 {%k1} ; AVX512BW-NEXT: vmovaps %zmm2, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_packs_epi16_rrk_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 +; AVX512F-32-NEXT: vpacksswb %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vmovaps %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) ret <64 x i8> %res } @@ -619,6 +1063,14 @@ define <64 x i8> @test_mask_packs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i ; AVX512BW-NEXT: kmovq %rdi, %k1 ; AVX512BW-NEXT: vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_packs_epi16_rrkz_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 +; AVX512F-32-NEXT: vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 %mask) ret <64 x i8> %res } @@ -628,6 +1080,12 @@ define <64 x i8> @test_mask_packs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: vpacksswb (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_packs_epi16_rm_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpacksswb (%eax), %zmm0, %zmm0 +; AVX512F-32-NEXT: retl %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 -1) ret <64 x i8> %res @@ -640,6 +1098,16 @@ define <64 x i8> @test_mask_packs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_ ; AVX512BW-NEXT: vpacksswb (%rdi), %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_packs_epi16_rmk_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 +; AVX512F-32-NEXT: vpacksswb (%eax), %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) ret <64 x i8> %res @@ -651,6 +1119,15 @@ define <64 x i8> @test_mask_packs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr ; AVX512BW-NEXT: kmovq %rsi, %k1 ; AVX512BW-NEXT: vpacksswb (%rdi), %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_packs_epi16_rmkz_512: +; 
AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 +; AVX512F-32-NEXT: vpacksswb (%eax), %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 %mask) ret <64 x i8> %res @@ -664,6 +1141,11 @@ define <32 x i16> @test_mask_packus_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) { ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_packus_epi32_rr_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vpackusdw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %res } @@ -675,6 +1157,13 @@ define <32 x i16> @test_mask_packus_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, ; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm2 {%k1} ; AVX512BW-NEXT: vmovaps %zmm2, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_packus_epi32_rrk_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpackusdw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vmovaps %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res } @@ -685,6 +1174,12 @@ define <32 x i16> @test_mask_packus_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_packus_epi32_rrkz_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpackusdw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask) ret <32 x i16> %res } @@ -694,6 +1189,12 @@ define <32 x i16> @test_mask_packus_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: vpackusdw (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_packus_epi32_rm_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpackusdw (%eax), %zmm0, %zmm0 +; AVX512F-32-NEXT: retl %b = load <16 x i32>, <16 x i32>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %res @@ -706,6 +1207,14 @@ define <32 x i16> @test_mask_packus_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %pt ; AVX512BW-NEXT: vpackusdw (%rdi), %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_packus_epi32_rmk_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpackusdw (%eax), %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %b = load <16 x i32>, <16 x i32>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res @@ -717,6 +1226,13 @@ define <32 x i16> @test_mask_packus_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %p ; AVX512BW-NEXT: 
kmovd %esi, %k1 ; AVX512BW-NEXT: vpackusdw (%rdi), %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_packus_epi32_rmkz_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpackusdw (%eax), %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl %b = load <16 x i32>, <16 x i32>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask) ret <32 x i16> %res @@ -727,6 +1243,12 @@ define <32 x i16> @test_mask_packus_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) { ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: vpackusdw (%rdi){1to16}, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_packus_epi32_rmb_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpackusdw (%eax){1to16}, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl %q = load i32, i32* %ptr_b %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer @@ -741,6 +1263,14 @@ define <32 x i16> @test_mask_packus_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, < ; AVX512BW-NEXT: vpackusdw (%rdi){1to16}, %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_packus_epi32_rmbk_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpackusdw (%eax){1to16}, %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %q = load i32, i32* %ptr_b %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer @@ -754,6 +1284,13 @@ define <32 x i16> @test_mask_packus_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vpackusdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_packus_epi32_rmbkz_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpackusdw (%eax){1to16}, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl %q = load i32, i32* %ptr_b %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer @@ -768,6 +1305,11 @@ define <64 x i8> @test_mask_packus_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_packus_epi16_rr_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 -1) ret <64 x i8> %res } @@ -779,6 +1321,15 @@ define <64 x i8> @test_mask_packus_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, < ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm2 {%k1} ; AVX512BW-NEXT: vmovaps %zmm2, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_packus_epi16_rrk_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 +; AVX512F-32-NEXT: vpackuswb %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vmovaps %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %res = call 
<64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) ret <64 x i8> %res } @@ -789,6 +1340,14 @@ define <64 x i8> @test_mask_packus_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, ; AVX512BW-NEXT: kmovq %rdi, %k1 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_packus_epi16_rrkz_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 +; AVX512F-32-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 %mask) ret <64 x i8> %res } @@ -798,6 +1357,12 @@ define <64 x i8> @test_mask_packus_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_ ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: vpackuswb (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_packus_epi16_rm_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpackuswb (%eax), %zmm0, %zmm0 +; AVX512F-32-NEXT: retl %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 -1) ret <64 x i8> %res @@ -810,6 +1375,16 @@ define <64 x i8> @test_mask_packus_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr ; AVX512BW-NEXT: vpackuswb (%rdi), %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_packus_epi16_rmk_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 +; AVX512F-32-NEXT: vpackuswb (%eax), %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) ret <64 x i8> %res @@ -821,6 +1396,15 @@ define <64 x i8> @test_mask_packus_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %pt ; AVX512BW-NEXT: kmovq %rsi, %k1 ; AVX512BW-NEXT: vpackuswb (%rdi), %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_packus_epi16_rmkz_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 +; AVX512F-32-NEXT: vpackuswb (%eax), %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 %mask) ret <64 x i8> %res @@ -833,6 +1417,11 @@ define <32 x i16> @test_mask_adds_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rr_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %res } @@ -844,6 +1433,13 @@ define <32 x i16> @test_mask_adds_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <3 ; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm2 {%k1} ; 
AVX512BW-NEXT: vmovaps %zmm2, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rrk_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vmovaps %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res } @@ -854,6 +1450,12 @@ define <32 x i16> @test_mask_adds_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rrkz_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) ret <32 x i16> %res } @@ -863,6 +1465,12 @@ define <32 x i16> @test_mask_adds_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rm_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm0 +; AVX512F-32-NEXT: retl %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %res @@ -875,6 +1483,14 @@ define <32 x i16> @test_mask_adds_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_ ; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rmk_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res @@ -886,6 +1502,13 @@ define <32 x i16> @test_mask_adds_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rmkz_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) ret <32 x i16> %res @@ -898,6 +1521,11 @@ define <32 x i16> @test_mask_subs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rr_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %res } @@ -909,6 +1537,13 @@ define <32 x i16> @test_mask_subs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <3 ; AVX512BW-NEXT: vpsubsw 
%zmm1, %zmm0, %zmm2 {%k1} ; AVX512BW-NEXT: vmovaps %zmm2, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rrk_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vmovaps %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res } @@ -919,6 +1554,12 @@ define <32 x i16> @test_mask_subs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rrkz_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) ret <32 x i16> %res } @@ -928,6 +1569,12 @@ define <32 x i16> @test_mask_subs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rm_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm0 +; AVX512F-32-NEXT: retl %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %res @@ -940,6 +1587,14 @@ define <32 x i16> @test_mask_subs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_ ; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rmk_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res @@ -951,6 +1606,13 @@ define <32 x i16> @test_mask_subs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rmkz_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) ret <32 x i16> %res @@ -963,6 +1625,11 @@ define <32 x i16> @test_mask_adds_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) { ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rr_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %res } @@ -974,6 +1641,13 @@ define <32 x i16> @test_mask_adds_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <3 
; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm2 {%k1} ; AVX512BW-NEXT: vmovaps %zmm2, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rrk_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vmovaps %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res } @@ -984,6 +1658,12 @@ define <32 x i16> @test_mask_adds_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rrkz_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) ret <32 x i16> %res } @@ -993,6 +1673,12 @@ define <32 x i16> @test_mask_adds_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rm_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm0 +; AVX512F-32-NEXT: retl %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %res @@ -1005,6 +1691,14 @@ define <32 x i16> @test_mask_adds_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_ ; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rmk_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res @@ -1016,6 +1710,13 @@ define <32 x i16> @test_mask_adds_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rmkz_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) ret <32 x i16> %res @@ -1028,6 +1729,11 @@ define <32 x i16> @test_mask_subs_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) { ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rr_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %res } @@ -1039,6 +1745,13 @@ define <32 x i16> 
@test_mask_subs_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <3 ; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm2 {%k1} ; AVX512BW-NEXT: vmovaps %zmm2, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rrk_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vmovaps %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res } @@ -1049,6 +1762,12 @@ define <32 x i16> @test_mask_subs_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rrkz_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) ret <32 x i16> %res } @@ -1058,6 +1777,12 @@ define <32 x i16> @test_mask_subs_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rm_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm0 +; AVX512F-32-NEXT: retl %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %res @@ -1070,6 +1795,14 @@ define <32 x i16> @test_mask_subs_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_ ; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rmk_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res @@ -1081,6 +1814,13 @@ define <32 x i16> @test_mask_subs_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rmkz_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) ret <32 x i16> %res @@ -1098,6 +1838,16 @@ define <64 x i8>@test_int_x86_avx512_mask_pmaxs_b_512(<64 x i8> %x0, <64 x i8> % ; AVX512BW-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmaxs_b_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 +; AVX512F-32-NEXT: vpmaxsb %zmm1, %zmm0, 
%zmm2 {%k1} +; AVX512F-32-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %res = call <64 x i8> @llvm.x86.avx512.mask.pmaxs.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) %res1 = call <64 x i8> @llvm.x86.avx512.mask.pmaxs.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) %res2 = add <64 x i8> %res, %res1 @@ -1114,6 +1864,14 @@ define <32 x i16>@test_int_x86_avx512_mask_pmaxs_w_512(<32 x i16> %x0, <32 x i16 ; AVX512BW-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmaxs_w_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpmaxsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.pmaxs.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmaxs.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) %res2 = add <32 x i16> %res, %res1 @@ -1130,6 +1888,16 @@ define <64 x i8>@test_int_x86_avx512_mask_pmaxu_b_512(<64 x i8> %x0, <64 x i8> % ; AVX512BW-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmaxu_b_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 +; AVX512F-32-NEXT: vpmaxub %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %res = call <64 x i8> @llvm.x86.avx512.mask.pmaxu.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) %res1 = call <64 x i8> @llvm.x86.avx512.mask.pmaxu.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) %res2 = add <64 x i8> %res, %res1 @@ -1146,6 +1914,14 @@ define <32 x i16>@test_int_x86_avx512_mask_pmaxu_w_512(<32 x i16> %x0, <32 x i16 ; AVX512BW-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmaxu_w_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpmaxuw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.pmaxu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmaxu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) %res2 = add <32 x i16> %res, %res1 @@ -1162,6 +1938,16 @@ define <64 x i8>@test_int_x86_avx512_mask_pmins_b_512(<64 x i8> %x0, <64 x i8> % ; AVX512BW-NEXT: vpminsb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmins_b_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 +; AVX512F-32-NEXT: vpminsb %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %res = call <64 x i8> @llvm.x86.avx512.mask.pmins.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) %res1 = 
call <64 x i8> @llvm.x86.avx512.mask.pmins.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) %res2 = add <64 x i8> %res, %res1 @@ -1178,6 +1964,14 @@ define <32 x i16>@test_int_x86_avx512_mask_pmins_w_512(<32 x i16> %x0, <32 x i16 ; AVX512BW-NEXT: vpminsw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmins_w_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpminsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.pmins.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmins.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) %res2 = add <32 x i16> %res, %res1 @@ -1194,6 +1988,16 @@ define <64 x i8>@test_int_x86_avx512_mask_pminu_b_512(<64 x i8> %x0, <64 x i8> % ; AVX512BW-NEXT: vpminub %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_pminu_b_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 +; AVX512F-32-NEXT: vpminub %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %res = call <64 x i8> @llvm.x86.avx512.mask.pminu.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) %res1 = call <64 x i8> @llvm.x86.avx512.mask.pminu.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) %res2 = add <64 x i8> %res, %res1 @@ -1210,6 +2014,14 @@ define <32 x i16>@test_int_x86_avx512_mask_pminu_w_512(<32 x i16> %x0, <32 x i16 ; AVX512BW-NEXT: vpminuw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_pminu_w_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpminuw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.pminu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.pminu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) %res2 = add <32 x i16> %res, %res1 @@ -1227,6 +2039,15 @@ define <32 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm3, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vmovaps %zmm1, %zmm3 +; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 {%k1} +; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 +; AVX512F-32-NEXT: vpaddw %zmm1, %zmm3, %zmm0 +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) %res2 = add <32 x i16> %res, %res1 @@ -1244,6 +2065,15 @@ define <32 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_512(<32 x i16> %x0, <3 ; AVX512BW-NEXT: vpermt2w 
%zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm3, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vmovaps %zmm1, %zmm3 +; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 {%k1} {z} +; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 +; AVX512F-32-NEXT: vpaddw %zmm1, %zmm3, %zmm0 +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) %res2 = add <32 x i16> %res, %res1 @@ -1261,6 +2091,15 @@ define <32 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <32 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm3, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vmovaps %zmm1, %zmm3 +; AVX512F-32-NEXT: vpermi2w %zmm2, %zmm0, %zmm3 {%k1} +; AVX512F-32-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 +; AVX512F-32-NEXT: vpaddw %zmm1, %zmm3, %zmm0 +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) %res2 = add <32 x i16> %res, %res1 @@ -1277,6 +2116,16 @@ define <64 x i8>@test_int_x86_avx512_mask_pavg_b_512(<64 x i8> %x0, <64 x i8> %x ; AVX512BW-NEXT: vpavgb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_pavg_b_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 +; AVX512F-32-NEXT: vpavgb %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vpavgb %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %res = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) %res1 = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) %res2 = add <64 x i8> %res, %res1 @@ -1293,6 +2142,14 @@ define <32 x i16>@test_int_x86_avx512_mask_pavg_w_512(<32 x i16> %x0, <32 x i16> ; AVX512BW-NEXT: vpavgw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_pavg_w_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpavgw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vpavgw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) %res2 = add <32 x i16> %res, %res1 @@ -1309,6 +2166,16 @@ define <64 x i8>@test_int_x86_avx512_mask_pshuf_b_512(<64 x i8> %x0, <64 x i8> % ; AVX512BW-NEXT: vpshufb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_pshuf_b_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), 
%k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 +; AVX512F-32-NEXT: vpshufb %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vpshufb %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %res = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) %res1 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) %res2 = add <64 x i8> %res, %res1 @@ -1325,6 +2192,14 @@ define <32 x i16>@test_int_x86_avx512_mask_pabs_w_512(<32 x i16> %x0, <32 x i16> ; AVX512BW-NEXT: vpabsw %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_pabs_w_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpabsw %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vpabsw %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) %res1 = call <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 -1) %res2 = add <32 x i16> %res, %res1 @@ -1341,6 +2216,16 @@ define <64 x i8>@test_int_x86_avx512_mask_pabs_b_512(<64 x i8> %x0, <64 x i8> %x ; AVX512BW-NEXT: vpabsb %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_pabs_b_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 +; AVX512F-32-NEXT: vpabsb %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vpabsb %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddb %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %res = call <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) %res1 = call <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 -1) %res2 = add <64 x i8> %res, %res1 @@ -1357,6 +2242,14 @@ define <32 x i16>@test_int_x86_avx512_mask_pmulhu_w_512(<32 x i16> %x0, <32 x i1 ; AVX512BW-NEXT: vpmulhuw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmulhu_w_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpmulhuw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vpmulhuw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) %res2 = add <32 x i16> %res, %res1 @@ -1373,6 +2266,14 @@ define <32 x i16>@test_int_x86_avx512_mask_pmulh_w_512(<32 x i16> %x0, <32 x i16 ; AVX512BW-NEXT: vpmulhw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmulh_w_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpmulhw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vpmulhw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16> 
%x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) %res2 = add <32 x i16> %res, %res1 @@ -1389,6 +2290,14 @@ define <32 x i16>@test_int_x86_avx512_mask_pmulhr_sw_512(<32 x i16> %x0, <32 x i ; AVX512BW-NEXT: vpmulhrsw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmulhr_sw_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpmulhrsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vpmulhrsw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) %res2 = add <32 x i16> %res, %res1 @@ -1407,6 +2316,16 @@ define <32 x i8>@test_int_x86_avx512_mask_pmov_wb_512(<32 x i16> %x0, <32 x i8> ; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmov_wb_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpmovwb %zmm0, %ymm1 {%k1} +; AVX512F-32-NEXT: vpmovwb %zmm0, %ymm2 {%k1} {z} +; AVX512F-32-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512F-32-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512F-32-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512F-32-NEXT: retl %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1) %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2) @@ -1424,6 +2343,14 @@ define void @test_int_x86_avx512_mask_pmov_wb_mem_512(i8* %ptr, <32 x i16> %x1, ; AVX512BW-NEXT: vpmovwb %zmm0, (%rdi) ; AVX512BW-NEXT: vpmovwb %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpmovwb %zmm0, (%eax) +; AVX512F-32-NEXT: vpmovwb %zmm0, (%eax) {%k1} +; AVX512F-32-NEXT: retl call void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 -1) call void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 %x2) ret void @@ -1441,6 +2368,16 @@ define <32 x i8>@test_int_x86_avx512_mask_pmovs_wb_512(<32 x i16> %x0, <32 x i8> ; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmovs_wb_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpmovswb %zmm0, %ymm1 {%k1} +; AVX512F-32-NEXT: vpmovswb %zmm0, %ymm2 {%k1} {z} +; AVX512F-32-NEXT: vpmovswb %zmm0, %ymm0 +; AVX512F-32-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512F-32-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512F-32-NEXT: retl %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1) %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2) @@ -1458,6 +2395,15 @@ define void @test_int_x86_avx512_mask_pmovs_wb_mem_512(i8* %ptr, <32 x i16> %x1, ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vpmovswb %zmm0, 
(%rdi) {%k1} ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512F-32-NEXT: vpmovswb %zmm0, (%ecx) +; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: vpmovswb %zmm0, (%ecx) {%k1} +; AVX512F-32-NEXT: retl call void @llvm.x86.avx512.mask.pmovs.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 -1) call void @llvm.x86.avx512.mask.pmovs.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 %x2) ret void @@ -1475,6 +2421,16 @@ define <32 x i8>@test_int_x86_avx512_mask_pmovus_wb_512(<32 x i16> %x0, <32 x i8 ; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmovus_wb_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpmovuswb %zmm0, %ymm1 {%k1} +; AVX512F-32-NEXT: vpmovuswb %zmm0, %ymm2 {%k1} {z} +; AVX512F-32-NEXT: vpmovuswb %zmm0, %ymm0 +; AVX512F-32-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512F-32-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512F-32-NEXT: retl %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1) %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2) @@ -1492,6 +2448,15 @@ define void @test_int_x86_avx512_mask_pmovus_wb_mem_512(i8* %ptr, <32 x i16> %x1 ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vpmovuswb %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512F-32-NEXT: vpmovuswb %zmm0, (%ecx) +; AVX512F-32-NEXT: kmovd %eax, %k1 +; AVX512F-32-NEXT: vpmovuswb %zmm0, (%ecx) {%k1} +; AVX512F-32-NEXT: retl call void @llvm.x86.avx512.mask.pmovus.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 -1) call void @llvm.x86.avx512.mask.pmovus.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 %x2) ret void @@ -1507,6 +2472,14 @@ define <32 x i16>@test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x i ; AVX512BW-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmaddubs_w_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 -1) %res2 = add <32 x i16> %res, %res1 @@ -1523,6 +2496,14 @@ define <16 x i32>@test_int_x86_avx512_mask_pmaddw_d_512(<32 x i16> %x0, <32 x i1 ; AVX512BW-NEXT: vpmaddwd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmaddw_d_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpmaddwd %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vpmaddwd %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddd %zmm0, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %res = call <16 x i32> 
@llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 %x3) %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 -1) %res2 = add <16 x i32> %res, %res1 @@ -1539,6 +2520,16 @@ define <64 x i8>@test_int_x86_avx512_mask_punpckhb_w_512(<64 x i8> %x0, <64 x i8 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] ; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpckhb_w_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 +; AVX512F-32-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm2[8],k1[8],zmm2[9],k1[9],zmm2[10],k1[10],zmm2[11],k1[11],zmm2[12],k1[12],zmm2[13],k1[13],zmm2[14],k1[14],zmm2[15],k1[15],zmm2[24],k1[24],zmm2[25],k1[25],zmm2[26],k1[26],zmm2[27],k1[27],zmm2[28],k1[28],zmm2[29],k1[29],zmm2[30],k1[30],zmm2[31],k1[31],zmm2[40],k1[40],zmm2[41],k1[41],zmm2[42],k1[42],zmm2[43],k1[43],zmm2[44],k1[44],zmm2[45],k1[45],zmm2[46],k1[46],zmm2[47],k1[47],zmm2[56],k1[56],zmm2[57],k1[57],zmm2[58],k1[58],zmm2[59],k1[59],zmm2[60],k1[60],zmm2[61],k1[61],zmm2[62],k1[62],zmm2[63],k1[63] +; AVX512F-32-NEXT: vpunpckhbw {{.*#+}} zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] +; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %res = call <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) %res1 = call <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) %res2 = add <64 x i8> %res, %res1 @@ -1555,6 +2546,16 @@ define <64 x i8>@test_int_x86_avx512_mask_punpcklb_w_512(<64 x i8> %x0, <64 x i8 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] ; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq +; +; 
AVX512F-32-LABEL: test_int_x86_avx512_mask_punpcklb_w_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 +; AVX512F-32-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[1],k1[1],zmm2[2],k1[2],zmm2[3],k1[3],zmm2[4],k1[4],zmm2[5],k1[5],zmm2[6],k1[6],zmm2[7],k1[7],zmm2[16],k1[16],zmm2[17],k1[17],zmm2[18],k1[18],zmm2[19],k1[19],zmm2[20],k1[20],zmm2[21],k1[21],zmm2[22],k1[22],zmm2[23],k1[23],zmm2[32],k1[32],zmm2[33],k1[33],zmm2[34],k1[34],zmm2[35],k1[35],zmm2[36],k1[36],zmm2[37],k1[37],zmm2[38],k1[38],zmm2[39],k1[39],zmm2[48],k1[48],zmm2[49],k1[49],zmm2[50],k1[50],zmm2[51],k1[51],zmm2[52],k1[52],zmm2[53],k1[53],zmm2[54],k1[54],zmm2[55],k1[55] +; AVX512F-32-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] +; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %res = call <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) %res1 = call <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) %res2 = add <64 x i8> %res, %res1 @@ -1571,6 +2572,14 @@ define <32 x i16>@test_int_x86_avx512_mask_punpckhw_d_512(<32 x i16> %x0, <32 x ; AVX512BW-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31] ; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpckhw_d_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpunpckhwd {{.*#+}} zmm2 = zmm2[4],k1[4],zmm2[5],k1[5],zmm2[6],k1[6],zmm2[7],k1[7],zmm2[12],k1[12],zmm2[13],k1[13],zmm2[14],k1[14],zmm2[15],k1[15],zmm2[20],k1[20],zmm2[21],k1[21],zmm2[22],k1[22],zmm2[23],k1[23],zmm2[28],k1[28],zmm2[29],k1[29],zmm2[30],k1[30],zmm2[31],k1[31] +; AVX512F-32-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31] +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) %res2 = add <32 x i16> %res, %res1 @@ -1587,6 +2596,14 @@ define <32 x i16>@test_int_x86_avx512_mask_punpcklw_d_512(<32 x i16> %x0, <32 x ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} zmm0 = 
zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27] ; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpcklw_d_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpunpcklwd {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[1],k1[1],zmm2[2],k1[2],zmm2[3],k1[3],zmm2[8],k1[8],zmm2[9],k1[9],zmm2[10],k1[10],zmm2[11],k1[11],zmm2[16],k1[16],zmm2[17],k1[17],zmm2[18],k1[18],zmm2[19],k1[19],zmm2[24],k1[24],zmm2[25],k1[25],zmm2[26],k1[26],zmm2[27],k1[27] +; AVX512F-32-NEXT: vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27] +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) %res2 = add <32 x i16> %res, %res1 @@ -1605,6 +2622,18 @@ define <64 x i8>@test_int_x86_avx512_mask_palignr_512(<64 x i8> %x0, <64 x i8> % ; AVX512BW-NEXT: vpaddb %zmm3, %zmm2, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_palignr_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm3 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 +; AVX512F-32-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0 +; AVX512F-32-NEXT: vpaddb %zmm3, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl %res = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> %x3, i64 %x4) %res1 = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> zeroinitializer, i64 %x4) %res2 = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> %x3, i64 -1) @@ -1625,6 +2654,16 @@ define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8> ; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1 ; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_dbpsadbw_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z} +; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> %x3, i32 %x4) %res1 = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> zeroinitializer, i32 %x4) %res2 = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> %x3, i32 -1) @@ 
-1642,6 +2681,13 @@ define <8 x i64>@test_int_x86_avx512_mask_psll_dq_512(<8 x i64> %x0) { ; AVX512BW-NEXT: vpslldq $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_psll_dq_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vpslldq $8, %zmm0, %zmm1 +; AVX512F-32-NEXT: vpslldq $4, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %res = call <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64> %x0, i32 8) %res1 = call <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64> %x0, i32 4) %res2 = add <8 x i64> %res, %res1 @@ -1657,6 +2703,13 @@ define <8 x i64>@test_int_x86_avx512_mask_psrl_dq_512(<8 x i64> %x0) { ; AVX512BW-NEXT: vpsrldq $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrl_dq_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vpsrldq $8, %zmm0, %zmm1 +; AVX512F-32-NEXT: vpsrldq $4, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %res = call <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64> %x0, i32 8) %res1 = call <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64> %x0, i32 4) %res2 = add <8 x i64> %res, %res1 @@ -1671,6 +2724,13 @@ define <8 x i64>@test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8> ; AVX512BW-NEXT: vpsadbw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_psadb_w_512: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vpsadbw %zmm1, %zmm0, %zmm1 +; AVX512F-32-NEXT: vpsadbw %zmm2, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %res = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x1) %res1 = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x2) %res2 = add <8 x i64> %res, %res1 From f7fc15ed79ddcb8cb26f00b61d32b757c62a24b3 Mon Sep 17 00:00:00 2001 From: Michael Kuperstein Date: Sun, 6 Dec 2015 13:06:20 +0000 Subject: [PATCH 145/364] [X86] Always generate precise CFA adjustments. This removes the code path that generate "synchronous" (only correct at call site) CFA. We will probably want to re-introduce it once we are capable of emitting different .eh_frame and .debug_frame sections. Differential Revision: http://reviews.llvm.org/D14948 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254874 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/CodeGen/MachineModuleInfo.h | 11 +++--- lib/Target/X86/X86CallFrameOptimization.cpp | 3 +- lib/Target/X86/X86FrameLowering.cpp | 21 +++++++----- lib/Target/X86/X86MCInstLower.cpp | 6 ++-- test/CodeGen/X86/push-cfi.ll | 37 ++++++++++++++++----- test/CodeGen/X86/tls-pie.ll | 8 +++++ test/CodeGen/X86/win32-pic-jumptable.ll | 4 +++ 7 files changed, 65 insertions(+), 25 deletions(-) diff --git a/include/llvm/CodeGen/MachineModuleInfo.h b/include/llvm/CodeGen/MachineModuleInfo.h index 43b9f5203c50..acd6440eb358 100644 --- a/include/llvm/CodeGen/MachineModuleInfo.h +++ b/include/llvm/CodeGen/MachineModuleInfo.h @@ -161,6 +161,12 @@ class MachineModuleInfo : public ImmutablePass { bool CallsUnwindInit; bool HasEHFunclets; + // TODO: Ideally, what we'd like is to have a switch that allows emitting + // synchronous (precise at call-sites only) CFA into .eh_frame. However, + // even under this switch, we'd like .debug_frame to be precise when using. + // -g. 
At this moment, there's no way to specify that some CFI directives + // go into .eh_frame only, while others go into .debug_frame only. + /// DbgInfoAvailable - True if debugging information is available /// in this module. bool DbgInfoAvailable; @@ -235,11 +241,6 @@ class MachineModuleInfo : public ImmutablePass { bool hasDebugInfo() const { return DbgInfoAvailable; } void setDebugInfoAvailability(bool avail) { DbgInfoAvailable = avail; } - // Returns true if we need to generate precise CFI. Currently - // this is equivalent to hasDebugInfo(), but if we ever implement - // async EH, it will require precise CFI as well. - bool usePreciseUnwindInfo() const { return hasDebugInfo(); } - bool callsEHReturn() const { return CallsEHReturn; } void setCallsEHReturn(bool b) { CallsEHReturn = b; } diff --git a/lib/Target/X86/X86CallFrameOptimization.cpp b/lib/Target/X86/X86CallFrameOptimization.cpp index 23990b01ba18..fc6ee1752f1f 100644 --- a/lib/Target/X86/X86CallFrameOptimization.cpp +++ b/lib/Target/X86/X86CallFrameOptimization.cpp @@ -500,7 +500,8 @@ bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, // For debugging, when using SP-based CFA, we need to adjust the CFA // offset after each push. - if (!TFL->hasFP(MF) && MF.getMMI().usePreciseUnwindInfo()) + // TODO: This is needed only if we require precise CFA. + if (!TFL->hasFP(MF)) TFL->BuildCFI(MBB, std::next(Push), DL, MCCFIInstruction::createAdjustCfaOffset(nullptr, 4)); diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 682f75c7f51c..2e7ed58e340a 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -2524,10 +2524,10 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // (Pushes of argument for frame setup, callee pops for frame destroy) Amount -= InternalAmt; - // If this is a callee-pop calling convention, and we're emitting precise - // SP-based CFI, emit a CFA adjust for the amount the callee popped. - if (isDestroy && InternalAmt && DwarfCFI && !hasFP(MF) && - MMI.usePreciseUnwindInfo()) + // TODO: This is needed only if we require precise CFA. + // If this is a callee-pop calling convention, emit a CFA adjust for + // the amount the callee popped. + if (isDestroy && InternalAmt && DwarfCFI && !hasFP(MF)) BuildCFI(MBB, I, DL, MCCFIInstruction::createAdjustCfaOffset(nullptr, -InternalAmt)); @@ -2548,11 +2548,14 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // offset to be correct at each call site, while for debugging we want // it to be more precise. int CFAOffset = Amount; - if (!MMI.usePreciseUnwindInfo()) - CFAOffset += InternalAmt; - CFAOffset = isDestroy ? -CFAOffset : CFAOffset; - BuildCFI(MBB, I, DL, - MCCFIInstruction::createAdjustCfaOffset(nullptr, CFAOffset)); + // TODO: When not using precise CFA, we also need to adjust for the + // InternalAmt here. + + if (CFAOffset) { + CFAOffset = isDestroy ? 
-CFAOffset : CFAOffset; + BuildCFI(MBB, I, DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, CFAOffset)); + } } return; diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index 8878c9f169b5..af386807cd70 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -1143,8 +1143,10 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { const X86FrameLowering* FrameLowering = MF->getSubtarget().getFrameLowering(); bool hasFP = FrameLowering->hasFP(*MF); - - bool NeedsDwarfCFI = MMI->usePreciseUnwindInfo(); + + // TODO: This is needed only if we require precise CFA. + bool NeedsDwarfCFI = + (MMI->hasDebugInfo() || MF->getFunction()->needsUnwindTableEntry()); int stackGrowth = -RI->getSlotSize(); if (NeedsDwarfCFI && !hasFP) { diff --git a/test/CodeGen/X86/push-cfi.ll b/test/CodeGen/X86/push-cfi.ll index 4d07a1d8181b..6389708f42cc 100644 --- a/test/CodeGen/X86/push-cfi.ll +++ b/test/CodeGen/X86/push-cfi.ll @@ -6,17 +6,24 @@ declare void @good(i32 %a, i32 %b, i32 %c, i32 %d) declare void @large(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f) declare void @empty() -; When we use an invoke, and have FP, we expect a .cfi_escape GNU_ARGS_SIZE -; with size 16 before the invocation. Without FP, we expect.cfi_adjust_cfa_offset -; before and after. -; Darwin should not generate pushes in neither circumstance. +; When we use an invoke, we expect a .cfi_escape GNU_ARGS_SIZE +; with size 16 before the invocation. Without FP, we also expect +; .cfi_adjust_cfa_offset after each push. +; Darwin should not generate pushes in either circumstance. ; CHECK-LABEL: test1_nofp: ; LINUX: .cfi_escape 0x2e, 0x10 -; LINUX: .cfi_adjust_cfa_offset 16 ; LINUX-NEXT: pushl $4 +; LINUX-NEXT: Ltmp{{[0-9]+}}: +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 ; LINUX-NEXT: pushl $3 +; LINUX-NEXT: Ltmp{{[0-9]+}}: +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 ; LINUX-NEXT: pushl $2 +; LINUX-NEXT: Ltmp{{[0-9]+}}: +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 ; LINUX-NEXT: pushl $1 +; LINUX-NEXT: Ltmp{{[0-9]+}}: +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 ; LINUX-NEXT: call ; LINUX-NEXT: addl $16, %esp ; LINUX: .cfi_adjust_cfa_offset -16 @@ -62,11 +69,18 @@ cleanup: ; so darwin should not generate pushes. 
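; A minimal illustrative sketch of the pattern the CHECK lines below verify
; (hypothetical callee "foo" taking two stack arguments; not taken from the
; patch): with SP-based CFA, every argument push is now followed by an exact
; .cfi_adjust_cfa_offset, and the post-call stack cleanup is matched by a
; negative adjustment, e.g.
;   pushl $2
;   .cfi_adjust_cfa_offset 4
;   pushl $1
;   .cfi_adjust_cfa_offset 4
;   calll foo
;   addl $8, %esp
;   .cfi_adjust_cfa_offset -8
; so the unwind info stays precise between the pushes and the call, not only
; at the call site itself.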
; CHECK-LABEL: test2_nofp: ; LINUX-NOT: .cfi_escape -; LINUX: .cfi_adjust_cfa_offset 16 -; LINUX-NEXT: pushl $4 +; LINUX: pushl $4 +; LINUX-NEXT: Ltmp{{[0-9]+}}: +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 ; LINUX-NEXT: pushl $3 +; LINUX-NEXT: Ltmp{{[0-9]+}}: +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 ; LINUX-NEXT: pushl $2 +; LINUX-NEXT: Ltmp{{[0-9]+}}: +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 ; LINUX-NEXT: pushl $1 +; LINUX-NEXT: Ltmp{{[0-9]+}}: +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 ; LINUX-NEXT: call ; LINUX-NEXT: addl $16, %esp ; LINUX: .cfi_adjust_cfa_offset -16 @@ -170,11 +184,18 @@ cleanup: ; without parameters, but don't need to adjust the cfa offset ; CHECK-LABEL: test5_nofp: ; LINUX: .cfi_escape 0x2e, 0x10 -; LINUX: .cfi_adjust_cfa_offset 16 ; LINUX-NEXT: pushl $4 +; LINUX-NEXT: Ltmp{{[0-9]+}}: +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 ; LINUX-NEXT: pushl $3 +; LINUX-NEXT: Ltmp{{[0-9]+}}: +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 ; LINUX-NEXT: pushl $2 +; LINUX-NEXT: Ltmp{{[0-9]+}}: +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 ; LINUX-NEXT: pushl $1 +; LINUX-NEXT: Ltmp{{[0-9]+}}: +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 ; LINUX-NEXT: call ; LINUX-NEXT: addl $16, %esp ; LINUX: .cfi_adjust_cfa_offset -16 diff --git a/test/CodeGen/X86/tls-pie.ll b/test/CodeGen/X86/tls-pie.ll index 10fe1e94bbdc..235230e3c6a8 100644 --- a/test/CodeGen/X86/tls-pie.ll +++ b/test/CodeGen/X86/tls-pie.ll @@ -36,9 +36,13 @@ entry: define i32 @f3() { ; X32-LABEL: f3: ; X32: calll .L{{[0-9]+}}$pb +; X32-NEXT: .Ltmp{{[0-9]+}}: +; X32-NEXT: .cfi_adjust_cfa_offset 4 ; X32-NEXT: .L{{[0-9]+}}$pb: ; X32-NEXT: popl %eax ; X32-NEXT: .Ltmp{{[0-9]+}}: +; X32-NEXT: .cfi_adjust_cfa_offset -4 +; X32-NEXT: .Ltmp{{[0-9]+}}: ; X32-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp{{[0-9]+}}-.L{{[0-9]+}}$pb), %eax ; X32-NEXT: movl i2@GOTNTPOFF(%eax), %eax ; X32-NEXT: movl %gs:(%eax), %eax @@ -56,9 +60,13 @@ entry: define i32* @f4() { ; X32-LABEL: f4: ; X32: calll .L{{[0-9]+}}$pb +; X32-NEXT: .Ltmp{{[0-9]+}}: +; X32-NEXT: .cfi_adjust_cfa_offset 4 ; X32-NEXT: .L{{[0-9]+}}$pb: ; X32-NEXT: popl %ecx ; X32-NEXT: .Ltmp{{[0-9]+}}: +; X32-NEXT: .cfi_adjust_cfa_offset -4 +; X32-NEXT: .Ltmp{{[0-9]+}}: ; X32-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp{{[0-9]+}}-.L{{[0-9]+}}$pb), %ecx ; X32-NEXT: movl %gs:0, %eax ; X32-NEXT: addl i2@GOTNTPOFF(%ecx), %eax diff --git a/test/CodeGen/X86/win32-pic-jumptable.ll b/test/CodeGen/X86/win32-pic-jumptable.ll index 1a90b6238f26..3a8ef2d0b916 100644 --- a/test/CodeGen/X86/win32-pic-jumptable.ll +++ b/test/CodeGen/X86/win32-pic-jumptable.ll @@ -1,8 +1,12 @@ ; RUN: llc < %s -relocation-model=pic | FileCheck %s ; CHECK: calll L0$pb +; CHECK-NEXT: Ltmp{{[0-9]+}}: +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 ; CHECK-NEXT: L0$pb: ; CHECK-NEXT: popl %eax +; CHECK-NEXT: Ltmp{{[0-9]+}}: +; CHECK-NEXT: .cfi_adjust_cfa_offset -4 ; CHECK-NEXT: addl LJTI0_0(,%ecx,4), %eax ; CHECK-NEXT: jmpl *%eax From 023610af4f6a40d16801fac7fc0effdd329108cd Mon Sep 17 00:00:00 2001 From: Asaf Badouh Date: Sun, 6 Dec 2015 13:26:56 +0000 Subject: [PATCH 146/364] [X86][AVX512] add vmovss/sd missing encoding Differential Revision: http://reviews.llvm.org/D14701 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254875 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/IntrinsicsX86.td | 7 +++ lib/Target/X86/X86InstrAVX512.td | 83 +++++++++++++------------ lib/Target/X86/X86IntrinsicsInfo.h | 6 +- test/CodeGen/X86/avx512-intrinsics.ll | 45 ++++++++++++++ test/MC/X86/avx512-encodings.s | 55 +++++++++++++++++ test/MC/X86/intel-syntax-avx512.s | 88 
+++++++++++++++++++++++++++ 6 files changed, 245 insertions(+), 39 deletions(-) diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index 57ad278a68bd..1c028dea601f 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -1840,6 +1840,13 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx512_mask_load_pd_512 : GCCBuiltin<"__builtin_ia32_loadapd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_ptr_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrReadArgMem]>; + + def int_x86_avx512_mask_move_ss : GCCBuiltin<"__builtin_ia32_movss_mask">, + Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_move_sd : GCCBuiltin<"__builtin_ia32_movsd_mask">, + Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty], + [IntrNoMem]>; } // Conditional store ops diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index d15d0dc96e6f..452e9f05f84a 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -2973,53 +2973,60 @@ def VMOVQI2PQIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), // AVX-512 MOVSS, MOVSD //===----------------------------------------------------------------------===// -multiclass avx512_move_scalar { - let hasSideEffects = 0 in { - def rr : SI<0x10, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, RC:$src2), - !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, (vt (OpNode VR128X:$src1, - (scalar_to_vector RC:$src2))))], - IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG; - let Constraints = "$src1 = $dst" in - def rrk : SI<0x10, MRMSrcReg, (outs VR128X:$dst), - (ins VR128X:$src1, VK1WM:$mask, RC:$src2, RC:$src3), - !strconcat(asm, - "\t{$src3, $src2, $dst {${mask}}|$dst {${mask}}, $src2, $src3}"), - [], IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG, EVEX_K; - def rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - !strconcat(asm, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (mem_pat addr:$src))], IIC_SSE_MOV_S_RM>, - EVEX, VEX_LIG; +multiclass avx512_move_scalar { + defm rr_Int : AVX512_maskable_scalar<0x10, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), + asm, "$src2, $src1","$src1, $src2", + (_.VT (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2))), + IIC_SSE_MOV_S_RR>, EVEX_4V; + let Constraints = "$src1 = $dst" , mayLoad = 1 in + defm rm_Int : AVX512_maskable_3src_scalar<0x10, MRMSrcMem, _, + (outs _.RC:$dst), + (ins _.ScalarMemOp:$src), + asm,"$src","$src", + (_.VT (OpNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector + (_.ScalarLdFrag addr:$src)))))>, EVEX; + let isCodeGenOnly = 1 in { + def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, _.FRC:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, + (scalar_to_vector _.FRC:$src2))))], + _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V; + let mayLoad = 1 in + def rm : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))], + _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX; + } let mayStore = 1 in { - def mr: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), - !strconcat(asm, "\t{$src, $dst|$dst, $src}"), - [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>, - EVEX, VEX_LIG; - def mrk: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, VK1WM:$mask, RC:$src), - !strconcat(asm, 
"\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), - [], IIC_SSE_MOV_S_MR>, - EVEX, VEX_LIG, EVEX_K; + def mr: AVX512PI<0x11, MRMDestMem, (outs), (ins _.ScalarMemOp:$dst, _.FRC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(store _.FRC:$src, addr:$dst)], _.ExeDomain, IIC_SSE_MOV_S_MR>, + EVEX; + def mrk: AVX512PI<0x11, MRMDestMem, (outs), + (ins _.ScalarMemOp:$dst, VK1WM:$mask, _.FRC:$src), + !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), + [], _.ExeDomain, IIC_SSE_MOV_S_MR>, EVEX, EVEX_K; } // mayStore - } //hasSideEffects = 0 } -let ExeDomain = SSEPackedSingle in -defm VMOVSSZ : avx512_move_scalar<"movss", FR32X, X86Movss, v4f32, f32mem, - loadf32>, XS, EVEX_CD8<32, CD8VT1>; +defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, f32x_info>, + VEX_LIG, XS, EVEX_CD8<32, CD8VT1>; -let ExeDomain = SSEPackedDouble in -defm VMOVSDZ : avx512_move_scalar<"movsd", FR64X, X86Movsd, v2f64, f64mem, - loadf64>, XD, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, f64x_info>, + VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>; def : Pat<(f32 (X86select VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))), - (COPY_TO_REGCLASS (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X), - VK1WM:$mask, (f32 (IMPLICIT_DEF)), FR32X:$src1), FR32X)>; + (COPY_TO_REGCLASS (VMOVSSZrr_Intk (COPY_TO_REGCLASS FR32X:$src2, VR128X), + VK1WM:$mask, (v4f32 (IMPLICIT_DEF)),(COPY_TO_REGCLASS FR32X:$src1, VR128X)), FR32X)>; def : Pat<(f64 (X86select VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))), - (COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X), - VK1WM:$mask, (f64 (IMPLICIT_DEF)), FR64X:$src1), FR64X)>; + (COPY_TO_REGCLASS (VMOVSDZrr_Intk (COPY_TO_REGCLASS FR64X:$src2, VR128X), + VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR64X:$src1, VR128X)), FR64X)>; def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask), (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS GR8:$mask, VK1WM)), diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index bb2f7248b0e9..cc53d5f3ce5a 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -143,7 +143,7 @@ static const IntrinsicData IntrinsicsWithChain[] = { EXPAND_FROM_MEM, X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_mask_expand_load_q_512, EXPAND_FROM_MEM, X86ISD::EXPAND, 0), - X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8, + X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8, X86ISD::VTRUNC, 0), X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_256, TRUNCATE_TO_MEM_VI8, X86ISD::VTRUNC, 0), @@ -807,6 +807,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::MOVDDUP, 0), X86_INTRINSIC_DATA(avx512_mask_movddup_512, INTR_TYPE_1OP_MASK, X86ISD::MOVDDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_move_sd, INTR_TYPE_SCALAR_MASK, + X86ISD::MOVSD, 0), + X86_INTRINSIC_DATA(avx512_mask_move_ss, INTR_TYPE_SCALAR_MASK, + X86ISD::MOVSS, 0), X86_INTRINSIC_DATA(avx512_mask_movshdup_128, INTR_TYPE_1OP_MASK, X86ISD::MOVSHDUP, 0), X86_INTRINSIC_DATA(avx512_mask_movshdup_256, INTR_TYPE_1OP_MASK, diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll index c47027eed2b2..c01f1adce360 100644 --- a/test/CodeGen/X86/avx512-intrinsics.ll +++ b/test/CodeGen/X86/avx512-intrinsics.ll @@ -6234,3 +6234,48 @@ define i32 @test_x86_avx512_ucomi_ss_lt(<4 x float> %a0, <4 x float> %a1) { } declare i32 @llvm.x86.avx512.vcomi.ss(<4 x float>, <4 x float>, i32, i32) +declare <4 x 
float> @llvm.x86.avx512.mask.move.ss(<4 x float>, <4 x float>, <4 x float>, i8) + +define <4 x float>@test_int_x86_avx512_mask_move_ss_rrk(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_move_ss_rrk: +; CHECK: vmovss %xmm1, %xmm0, %xmm2 {%k1} + %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) + ret <4 x float> %res +} + +define <4 x float>@test_int_x86_avx512_mask_move_ss_rrkz(<4 x float> %x0, <4 x float> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_move_ss_rrkz: +; CHECK: vmovss %xmm1, %xmm0, %xmm0 {%k1} {z} + %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> zeroinitializer, i8 %x2) + ret <4 x float> %res +} + +define <4 x float>@test_int_x86_avx512_mask_move_ss_rr(<4 x float> %x0, <4 x float> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_move_ss_rr: +; CHECK: vmovss %xmm1, %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> zeroinitializer, i8 -1) + ret <4 x float> %res +} + +declare <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double>, <2 x double>, <2 x double>, i8) +define <2 x double>@test_int_x86_avx512_mask_move_sd_rr(<2 x double> %x0, <2 x double> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_move_sd_rr: +; CHECK: vmovsd %xmm1, %xmm0, %xmm0 + %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> zeroinitializer, i8 -1) + ret <2 x double> %res +} + +define <2 x double>@test_int_x86_avx512_mask_move_sd_rrkz(<2 x double> %x0, <2 x double> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_move_sd_rrkz: +; CHECK: vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z} + %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> zeroinitializer, i8 %x2) + ret <2 x double> %res +} + +define <2 x double>@test_int_x86_avx512_mask_move_sd_rrk(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_move_sd_rrk: +; CHECK: vmovsd %xmm1, %xmm0, %xmm2 {%k1} + %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) + ret <2 x double> %res +} + diff --git a/test/MC/X86/avx512-encodings.s b/test/MC/X86/avx512-encodings.s index 2043100bf3e6..d8806effb0e3 100644 --- a/test/MC/X86/avx512-encodings.s +++ b/test/MC/X86/avx512-encodings.s @@ -19220,3 +19220,58 @@ vpermilpd $0x23, 0x400(%rbx), %zmm2 // CHECK: vucomiss -516(%rdx), %xmm22 // CHECK: encoding: [0x62,0xe1,0x7c,0x08,0x2e,0xb2,0xfc,0xfd,0xff,0xff] vucomiss -516(%rdx), %xmm22 +// CHECK: vmovsd (%rcx), %xmm25 {%k3} +// CHECK: encoding: [0x62,0x61,0xff,0x0b,0x10,0x09] + vmovsd (%rcx), %xmm25 {%k3} + +// CHECK: vmovsd (%rcx), %xmm25 {%k3} {z} +// CHECK: encoding: [0x62,0x61,0xff,0x8b,0x10,0x09] + vmovsd (%rcx), %xmm25 {%k3} {z} + +// CHECK: vmovsd %xmm19, %xmm3, %xmm27 {%k3} {z} +// CHECK: encoding: [0x62,0x21,0xe7,0x8b,0x10,0xdb] + vmovsd %xmm19, %xmm3, %xmm27 {%k3} {z} + +// CHECK: vmovss (%rcx), %xmm2 {%k4} +// CHECK: encoding: [0x62,0xf1,0x7e,0x0c,0x10,0x11] + vmovss (%rcx), %xmm2 {%k4} + +// CHECK: vmovss (%rcx), %xmm2 {%k4} {z} +// CHECK: encoding: [0x62,0xf1,0x7e,0x8c,0x10,0x11] + vmovss (%rcx), %xmm2 {%k4} {z} + +// CHECK: vmovss %xmm26, %xmm9, %xmm28 {%k4} {z} +// CHECK: encoding: [0x62,0x01,0x36,0x8c,0x10,0xe2] + vmovss %xmm26, %xmm9, %xmm28 {%k4} {z} + +// CHECK: vmovsd %xmm15, %xmm22, 
%xmm21 {%k7} {z} +// CHECK: encoding: [0x62,0xc1,0xcf,0x87,0x10,0xef] + vmovsd %xmm15, %xmm22, %xmm21 {%k7} {z} + +// CHECK: vmovsd %xmm8, %xmm13, %xmm3 {%k5} {z} +// CHECK: encoding: [0x62,0xd1,0x97,0x8d,0x10,0xd8] + vmovsd %xmm8, %xmm13, %xmm3 {%k5} {z} + +// CHECK: vmovss %xmm2, %xmm27, %xmm17 {%k2} {z} +// CHECK: encoding: [0x62,0xe1,0x26,0x82,0x10,0xca] + vmovss %xmm2, %xmm27, %xmm17 {%k2} {z} + +// CHECK: vmovss %xmm23, %xmm19, %xmm10 {%k3} {z} +// CHECK: encoding: [0x62,0x31,0x66,0x83,0x10,0xd7] + vmovss %xmm23, %xmm19, %xmm10 {%k3} {z} + +// CHECK: vmovsd %xmm4, %xmm15, %xmm4 {%k6} {z} +// CHECK: encoding: [0x62,0xf1,0x87,0x8e,0x10,0xe4] + vmovsd %xmm4, %xmm15, %xmm4 {%k6} {z} + +// CHECK: vmovsd %xmm14, %xmm2, %xmm20 {%k7} {z} +// CHECK: encoding: [0x62,0xc1,0xef,0x8f,0x10,0xe6] + vmovsd %xmm14, %xmm2, %xmm20 {%k7} {z} + +// CHECK: vmovss %xmm19, %xmm11, %xmm21 {%k3} {z} +// CHECK: encoding: [0x62,0xa1,0x26,0x8b,0x10,0xeb] + vmovss %xmm19, %xmm11, %xmm21 {%k3} {z} + +// CHECK: vmovss %xmm24, %xmm27, %xmm15 {%k2} {z} +// CHECK: encoding: [0x62,0x11,0x26,0x82,0x10,0xf8] + vmovss %xmm24, %xmm27, %xmm15 {%k2} {z} diff --git a/test/MC/X86/intel-syntax-avx512.s b/test/MC/X86/intel-syntax-avx512.s index 6340f853b553..c5ab7dde1106 100644 --- a/test/MC/X86/intel-syntax-avx512.s +++ b/test/MC/X86/intel-syntax-avx512.s @@ -264,3 +264,91 @@ vaddpd zmm1,zmm1,zmm2,{rz-sae} // CHECK: vcomiss xmm16, dword ptr [rcx] // CHECK: encoding: [0x62,0xe1,0x7c,0x08,0x2f,0x01] vcomiss xmm16, DWORD PTR [rcx] + +// CHECK: vmovss dword ptr [rcx] {k2}, xmm13 +// CHECK: encoding: [0x62,0x71,0x7e,0x0a,0x11,0x29] + vmovss dword ptr [rcx]{k2},xmm13 + +// CHECK: vmovss dword ptr [rax + 8*r14 + 4660], xmm13 +// CHECK: encoding: [0xc4,0x21,0x7a,0x11,0xac,0xf0,0x34,0x12,0x00,0x00] + vmovss dword ptr [rax+r14*8+0x1234],xmm13 + +// CHECK: vmovss dword ptr [rdx + 508], xmm13 +// CHECK: encoding: [0xc5,0x7a,0x11,0xaa,0xfc,0x01,0x00,0x00] + vmovss dword ptr [rdx+0x1fc],xmm13 + +// CHECK: vmovss dword ptr [rdx + 512], xmm13 +// CHECK: encoding: [0xc5,0x7a,0x11,0xaa,0x00,0x02,0x00,0x00] + vmovss dword ptr [rdx+0x200],xmm13 + +// CHECK: vmovss dword ptr [rdx - 512], xmm13 +// CHECK: encoding: [0xc5,0x7a,0x11,0xaa,0x00,0xfe,0xff,0xff] + vmovss dword ptr [rdx-0x200],xmm13 + +// CHECK: vmovss dword ptr [rdx - 516], xmm13 +// CHECK: encoding: [0xc5,0x7a,0x11,0xaa,0xfc,0xfd,0xff,0xff] + vmovss dword ptr [rdx-0x204],xmm13 + +// CHECK: vmovss dword ptr [rdx + 508], xmm5 +// CHECK: encoding: [0xc5,0xfa,0x11,0xaa,0xfc,0x01,0x00,0x00] + vmovss dword ptr [rdx+0x1fc],xmm5 + +// CHECK: vmovss dword ptr [rdx + 512], xmm5 +// CHECK: encoding: [0xc5,0xfa,0x11,0xaa,0x00,0x02,0x00,0x00] + vmovss dword ptr [rdx+0x200],xmm5 + +// CHECK: vmovss dword ptr [rdx - 512], xmm5 +// CHECK: encoding: [0xc5,0xfa,0x11,0xaa,0x00,0xfe,0xff,0xff] + vmovss dword ptr [rdx-0x200], xmm5 + +// CHECK: vmovss dword ptr [rdx - 516], xmm5 +// CHECK: encoding: [0xc5,0xfa,0x11,0xaa,0xfc,0xfd,0xff,0xff] + vmovss dword ptr [rdx-0x204],xmm5 + +// CHECK: vmovss dword ptr [rcx], xmm13 +// CHECK: encoding: [0xc5,0x7a,0x11,0x29] + vmovss dword ptr [rcx],xmm13 + +// CHECK: vmovss xmm2, dword ptr [rcx] +// CHECK: encoding: [0xc5,0xfa,0x10,0x11] + vmovss xmm2, dword ptr [rcx] + +// CHECK: vmovss xmm2 {k4}, dword ptr [rcx] +// CHECK: encoding: [0x62,0xf1,0x7e,0x0c,0x10,0x11] + vmovss xmm2{k4}, dword ptr [rcx] + +// CHECK: vmovss xmm2 {k4} {z}, dword ptr [rcx] +// CHECK: encoding: [0x62,0xf1,0x7e,0x8c,0x10,0x11] + vmovss xmm2{k4} {z}, dword ptr [rcx] + +// CHECK: vmovsd xmm25 , 
qword ptr [rcx] +// CHECK: encoding: [0x62,0x61,0xff,0x08,0x10,0x09] + vmovsd xmm25, qword ptr [rcx] + +// CHECK: vmovsd xmm25 {k3}, qword ptr [rcx] +// CHECK: encoding: [0x62,0x61,0xff,0x0b,0x10,0x09] + vmovsd xmm25{k3}, qword ptr [rcx] + +// CHECK: vmovsd xmm25 {k3} {z}, qword ptr [rcx] +// CHECK: encoding: [0x62,0x61,0xff,0x8b,0x10,0x09] + vmovsd xmm25{k3} {z}, qword ptr [rcx] + +// CHECK: vmovsd xmm25 , qword ptr [rax + 8*r14 + 291] +// CHECK: encoding: [0x62,0x21,0xff,0x08,0x10,0x8c,0xf0,0x23,0x01,0x00,0x00] + vmovsd xmm25, qword ptr [rax+r14*8+0x123] + +// CHECK: vmovsd xmm25 , qword ptr [rdx + 1016] +// CHECK: encoding: [0x62,0x61,0xff,0x08,0x10,0x4a,0x7f] + vmovsd xmm25, qword ptr [rdx+0x3f8] + +// CHECK: vmovsd xmm25 , qword ptr [rdx + 1024] +// CHECK: encoding: [0x62,0x61,0xff,0x08,0x10,0x8a,0x00,0x04,0x00,0x00] + vmovsd xmm25, qword ptr [rdx+0x400] + +// CHECK: vmovsd xmm25 , qword ptr [rdx - 1024] +// CHECK: encoding: [0x62,0x61,0xff,0x08,0x10,0x4a,0x80] + vmovsd xmm25, qword ptr [rdx-0x400] + +// CHECK: vmovsd xmm25 , qword ptr [rdx - 1032] +// CHECK: encoding: [0x62,0x61,0xff,0x08,0x10,0x8a,0xf8,0xfb,0xff,0xff] + vmovsd xmm25, qword ptr [rdx-0x408] From 1c73aa0b63c49b8fbbc9cd0cf35ebcc027174969 Mon Sep 17 00:00:00 2001 From: Marina Yatsina Date: Sun, 6 Dec 2015 15:31:47 +0000 Subject: [PATCH 147/364] [X86] Add support for loopz, loopnz for Intel syntax According to x86 spec, loopz and loopnz should be supported for Intel syntax, where loopz is equivalent to loope and loopnz is equivalent to loopne. Differential Revision: http://reviews.llvm.org/D15148 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254877 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86InstrInfo.td | 4 ++-- test/MC/X86/intel-syntax.s | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 1c21a098bc6c..4a4ceaca88f4 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -2572,8 +2572,8 @@ def : MnemonicAlias<"lret", "lretl", "att">, Requires<[Not16BitMode]>; def : MnemonicAlias<"leavel", "leave", "att">, Requires<[Not64BitMode]>; def : MnemonicAlias<"leaveq", "leave", "att">, Requires<[In64BitMode]>; -def : MnemonicAlias<"loopz", "loope", "att">; -def : MnemonicAlias<"loopnz", "loopne", "att">; +def : MnemonicAlias<"loopz", "loope">; +def : MnemonicAlias<"loopnz", "loopne">; def : MnemonicAlias<"pop", "popw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"pop", "popl", "att">, Requires<[In32BitMode]>; diff --git a/test/MC/X86/intel-syntax.s b/test/MC/X86/intel-syntax.s index 71bf6cc0ffdb..214d827168b8 100644 --- a/test/MC/X86/intel-syntax.s +++ b/test/MC/X86/intel-syntax.s @@ -741,3 +741,8 @@ fcomip st, st(2) fucomip st, st(2) // CHECK: fcompi %st(2) // CHECK: fucompi %st(2) + +loopz _foo +loopnz _foo +// CHECK: loope _foo +// CHECK: loopne _foo From e89ea49c12448ba0f7909013048ca1ae7e8dd98d Mon Sep 17 00:00:00 2001 From: Rafael Espindola Date: Sun, 6 Dec 2015 16:18:25 +0000 Subject: [PATCH 148/364] Create llvm.global_ctors in the new format. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254878 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Utils/ModuleUtils.cpp | 4 ++-- test/Instrumentation/SanitizerCoverage/coverage.ll | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/Transforms/Utils/ModuleUtils.cpp b/lib/Transforms/Utils/ModuleUtils.cpp index c5abf76d93fa..9ec28a3f3d47 100644 --- a/lib/Transforms/Utils/ModuleUtils.cpp +++ b/lib/Transforms/Utils/ModuleUtils.cpp @@ -43,9 +43,9 @@ static void appendToGlobalArray(const char *Array, } GVCtor->eraseFromParent(); } else { - // Use a simple two-field struct if there isn't one already. + // Use the new three-field struct if there isn't one already. EltTy = StructType::get(IRB.getInt32Ty(), PointerType::getUnqual(FnTy), - nullptr); + IRB.getInt8PtrTy(), nullptr); } // Build a 2 or 3 field global_ctor entry. We don't take a comdat key. diff --git a/test/Instrumentation/SanitizerCoverage/coverage.ll b/test/Instrumentation/SanitizerCoverage/coverage.ll index 659c03040f2f..71fdbbb5ada7 100644 --- a/test/Instrumentation/SanitizerCoverage/coverage.ll +++ b/test/Instrumentation/SanitizerCoverage/coverage.ll @@ -29,8 +29,8 @@ entry: } ; CHECK0-NOT: @llvm.global_ctors = {{.*}}{ i32 2, void ()* @sancov.module_ctor } -; CHECK1: @llvm.global_ctors = {{.*}}{ i32 2, void ()* @sancov.module_ctor } -; CHECK2: @llvm.global_ctors = {{.*}}{ i32 2, void ()* @sancov.module_ctor } +; CHECK1: @llvm.global_ctors = {{.*}}{ i32 2, void ()* @sancov.module_ctor, i8* null } +; CHECK2: @llvm.global_ctors = {{.*}}{ i32 2, void ()* @sancov.module_ctor, i8* null } ; CHECK0-NOT: call void @__sanitizer_cov( ; CHECK0-NOT: call void @__sanitizer_cov_module_init( From 180d5cb8e1b2fb4d9fdba85669c972de4cf6b734 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Sun, 6 Dec 2015 18:05:12 +0000 Subject: [PATCH 149/364] [x86] add missing maxnum/minnum tests for 256-bit vectors Also, switch to x86-64 because once we can lower these to something more reasonable, there will be less noise in the checks. And add AVX runs because those will be different than SSE. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254879 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/fmaxnum.ll | 58 ++++++++++++++++++++++--------------- test/CodeGen/X86/fminnum.ll | 50 ++++++++++++++++++++------------ 2 files changed, 66 insertions(+), 42 deletions(-) diff --git a/test/CodeGen/X86/fmaxnum.ll b/test/CodeGen/X86/fmaxnum.ll index 17bd3b9b45b4..7aa087f92bdc 100644 --- a/test/CodeGen/X86/fmaxnum.ll +++ b/test/CodeGen/X86/fmaxnum.ll @@ -1,4 +1,5 @@ -; RUN: llc -march=x86 -mtriple=i386-linux-gnu < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=sse2 < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=avx < %s | FileCheck %s declare float @fmaxf(float, float) declare double @fmax(double, double) @@ -10,86 +11,97 @@ declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>) +declare <4 x double> @llvm.maxnum.v4f64(<4 x double>, <4 x double>) declare <8 x double> @llvm.maxnum.v8f64(<8 x double>, <8 x double>) ; CHECK-LABEL: @test_fmaxf -; CHECK: calll fmaxf +; CHECK: jmp fmaxf define float @test_fmaxf(float %x, float %y) { %z = call float @fmaxf(float %x, float %y) readnone ret float %z } ; CHECK-LABEL: @test_fmax -; CHECK: calll fmax +; CHECK: jmp fmax define double @test_fmax(double %x, double %y) { %z = call double @fmax(double %x, double %y) readnone ret double %z } ; CHECK-LABEL: @test_fmaxl -; CHECK: calll fmaxl +; CHECK: callq fmaxl define x86_fp80 @test_fmaxl(x86_fp80 %x, x86_fp80 %y) { %z = call x86_fp80 @fmaxl(x86_fp80 %x, x86_fp80 %y) readnone ret x86_fp80 %z } ; CHECK-LABEL: @test_intrinsic_fmaxf -; CHECK: calll fmaxf +; CHECK: jmp fmaxf define float @test_intrinsic_fmaxf(float %x, float %y) { %z = call float @llvm.maxnum.f32(float %x, float %y) readnone ret float %z } ; CHECK-LABEL: @test_intrinsic_fmax -; CHECK: calll fmax +; CHECK: jmp fmax define double @test_intrinsic_fmax(double %x, double %y) { %z = call double @llvm.maxnum.f64(double %x, double %y) readnone ret double %z } ; CHECK-LABEL: @test_intrinsic_fmaxl -; CHECK: calll fmaxl +; CHECK: callq fmaxl define x86_fp80 @test_intrinsic_fmaxl(x86_fp80 %x, x86_fp80 %y) { %z = call x86_fp80 @llvm.maxnum.f80(x86_fp80 %x, x86_fp80 %y) readnone ret x86_fp80 %z } ; CHECK-LABEL: @test_intrinsic_fmax_v2f32 -; CHECK: calll fmaxf -; CHECK: calll fmaxf +; CHECK: callq fmaxf +; CHECK: callq fmaxf define <2 x float> @test_intrinsic_fmax_v2f32(<2 x float> %x, <2 x float> %y) { %z = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %x, <2 x float> %y) readnone ret <2 x float> %z } ; CHECK-LABEL: @test_intrinsic_fmax_v4f32 -; CHECK: calll fmaxf -; CHECK: calll fmaxf -; CHECK: calll fmaxf -; CHECK: calll fmaxf +; CHECK: callq fmaxf +; CHECK: callq fmaxf +; CHECK: callq fmaxf +; CHECK: callq fmaxf define <4 x float> @test_intrinsic_fmax_v4f32(<4 x float> %x, <4 x float> %y) { %z = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float> %y) readnone ret <4 x float> %z } ; CHECK-LABEL: @test_intrinsic_fmax_v2f64 -; CHECK: calll fmax -; CHECK: calll fmax +; CHECK: callq fmax +; CHECK: callq fmax define <2 x double> @test_intrinsic_fmax_v2f64(<2 x double> %x, <2 x double> %y) { %z = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %x, <2 x double> %y) readnone ret <2 x double> %z } +; CHECK-LABEL: @test_intrinsic_fmax_v4f64 +; CHECK: callq fmax +; CHECK: callq fmax +; CHECK: 
callq fmax +; CHECK: callq fmax +define <4 x double> @test_intrinsic_fmax_v4f64(<4 x double> %x, <4 x double> %y) { + %z = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %x, <4 x double> %y) readnone + ret <4 x double> %z +} + ; CHECK-LABEL: @test_intrinsic_fmax_v8f64 -; CHECK: calll fmax -; CHECK: calll fmax -; CHECK: calll fmax -; CHECK: calll fmax -; CHECK: calll fmax -; CHECK: calll fmax -; CHECK: calll fmax -; CHECK: calll fmax +; CHECK: callq fmax +; CHECK: callq fmax +; CHECK: callq fmax +; CHECK: callq fmax +; CHECK: callq fmax +; CHECK: callq fmax +; CHECK: callq fmax +; CHECK: callq fmax define <8 x double> @test_intrinsic_fmax_v8f64(<8 x double> %x, <8 x double> %y) { %z = call <8 x double> @llvm.maxnum.v8f64(<8 x double> %x, <8 x double> %y) readnone ret <8 x double> %z diff --git a/test/CodeGen/X86/fminnum.ll b/test/CodeGen/X86/fminnum.ll index 1e33cf4696af..e89ed32ad618 100644 --- a/test/CodeGen/X86/fminnum.ll +++ b/test/CodeGen/X86/fminnum.ll @@ -1,4 +1,5 @@ -; RUN: llc -march=x86 -mtriple=i386-linux-gnu -mattr=+sse,+sse2 < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=sse2 < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=avx < %s | FileCheck %s declare float @fminf(float, float) declare double @fmin(double, double) @@ -10,6 +11,7 @@ declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>) +declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>) declare <8 x double> @llvm.minnum.v8f64(<8 x double>, <8 x double>) ; CHECK-LABEL: @test_fminf @@ -27,7 +29,7 @@ define double @test_fmin(double %x, double %y) { } ; CHECK-LABEL: @test_fminl -; CHECK: calll fminl +; CHECK: callq fminl define x86_fp80 @test_fminl(x86_fp80 %x, x86_fp80 %y) { %z = call x86_fp80 @fminl(x86_fp80 %x, x86_fp80 %y) readnone ret x86_fp80 %z @@ -48,47 +50,57 @@ define double @test_intrinsic_fmin(double %x, double %y) { } ; CHECK-LABEL: @test_intrinsic_fminl -; CHECK: calll fminl +; CHECK: callq fminl define x86_fp80 @test_intrinsic_fminl(x86_fp80 %x, x86_fp80 %y) { %z = call x86_fp80 @llvm.minnum.f80(x86_fp80 %x, x86_fp80 %y) readnone ret x86_fp80 %z } ; CHECK-LABEL: @test_intrinsic_fmin_v2f32 -; CHECK: calll fminf -; CHECK: calll fminf +; CHECK: callq fminf +; CHECK: callq fminf define <2 x float> @test_intrinsic_fmin_v2f32(<2 x float> %x, <2 x float> %y) { %z = call <2 x float> @llvm.minnum.v2f32(<2 x float> %x, <2 x float> %y) readnone ret <2 x float> %z } ; CHECK-LABEL: @test_intrinsic_fmin_v4f32 -; CHECK: calll fminf -; CHECK: calll fminf -; CHECK: calll fminf -; CHECK: calll fminf +; CHECK: callq fminf +; CHECK: callq fminf +; CHECK: callq fminf +; CHECK: callq fminf define <4 x float> @test_intrinsic_fmin_v4f32(<4 x float> %x, <4 x float> %y) { %z = call <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float> %y) readnone ret <4 x float> %z } ; CHECK-LABEL: @test_intrinsic_fmin_v2f64 -; CHECK: calll fmin -; CHECK: calll fmin +; CHECK: callq fmin +; CHECK: callq fmin define <2 x double> @test_intrinsic_fmin_v2f64(<2 x double> %x, <2 x double> %y) { %z = call <2 x double> @llvm.minnum.v2f64(<2 x double> %x, <2 x double> %y) readnone ret <2 x double> %z } +; CHECK-LABEL: @test_intrinsic_fmin_v4f64 +; CHECK: callq fmin +; CHECK: callq fmin +; CHECK: callq fmin +; CHECK: callq fmin +define <4 x double> @test_intrinsic_fmin_v4f64(<4 x double> %x, <4 x 
double> %y) { + %z = call <4 x double> @llvm.minnum.v4f64(<4 x double> %x, <4 x double> %y) readnone + ret <4 x double> %z +} + ; CHECK-LABEL: @test_intrinsic_fmin_v8f64 -; CHECK: calll fmin -; CHECK: calll fmin -; CHECK: calll fmin -; CHECK: calll fmin -; CHECK: calll fmin -; CHECK: calll fmin -; CHECK: calll fmin -; CHECK: calll fmin +; CHECK: callq fmin +; CHECK: callq fmin +; CHECK: callq fmin +; CHECK: callq fmin +; CHECK: callq fmin +; CHECK: callq fmin +; CHECK: callq fmin +; CHECK: callq fmin define <8 x double> @test_intrinsic_fmin_v8f64(<8 x double> %x, <8 x double> %y) { %z = call <8 x double> @llvm.minnum.v8f64(<8 x double> %x, <8 x double> %y) readnone ret <8 x double> %z From 51b079cd28aacf110bf1b53c89bca4a2b23c4d22 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Sun, 6 Dec 2015 19:29:54 +0000 Subject: [PATCH 150/364] [WebAssembly] Add some more ideas to README.txt. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254880 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/WebAssembly/README.txt | 43 +++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/lib/Target/WebAssembly/README.txt b/lib/Target/WebAssembly/README.txt index 78b3123cde85..97072ab1cc15 100644 --- a/lib/Target/WebAssembly/README.txt +++ b/lib/Target/WebAssembly/README.txt @@ -47,3 +47,46 @@ expression stack across the jump (sometimes). We should (a) model this, and (b) extend the stackifier to utilize it. //===---------------------------------------------------------------------===// + +The min/max operators aren't exactly a Date: Sun, 6 Dec 2015 19:31:44 +0000 Subject: [PATCH 151/364] [WebAssembly] Tighten up some testcase regular expressions. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254881 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/WebAssembly/cfg-stackify.ll | 18 +++++++++--------- test/CodeGen/WebAssembly/cpus.ll | 4 ++-- test/CodeGen/WebAssembly/switch.ll | 4 ++-- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/test/CodeGen/WebAssembly/cfg-stackify.ll b/test/CodeGen/WebAssembly/cfg-stackify.ll index c615ebb0db9d..4c86c55a8f85 100644 --- a/test/CodeGen/WebAssembly/cfg-stackify.ll +++ b/test/CodeGen/WebAssembly/cfg-stackify.ll @@ -90,14 +90,14 @@ back: ; CHECK-LABEL: test2: ; CHECK: block BB2_2{{$}} -; CHECK: br_if {{.*}}, BB2_2{{$}} +; CHECK: br_if {{[^,]*}}, BB2_2{{$}} ; CHECK: BB2_1: ; CHECK: br_if ${{[0-9]+}}, BB2_1{{$}} ; CHECK: BB2_2: ; CHECK: return{{$}} ; OPT-LABEL: test2: ; OPT: block BB2_2{{$}} -; OPT: br_if {{.*}}, BB2_2{{$}} +; OPT: br_if {{[^,]*}}, BB2_2{{$}} ; OPT: BB2_1: ; OPT: br_if ${{[0-9]+}}, BB2_1{{$}} ; OPT: BB2_2: @@ -140,9 +140,9 @@ for.end: ; OPT-LABEL: doublediamond: ; OPT: block BB3_5{{$}} ; OPT: block BB3_4{{$}} -; OPT: br_if {{.*}}, BB3_4{{$}} +; OPT: br_if {{[^,]*}}, BB3_4{{$}} ; OPT: block BB3_3{{$}} -; OPT: br_if {{.*}}, BB3_3{{$}} +; OPT: br_if {{[^,]*}}, BB3_3{{$}} ; OPT: br BB3_5{{$}} ; OPT: BB3_4: ; OPT: BB3_5: @@ -204,7 +204,7 @@ exit: ; OPT-LABEL: diamond: ; OPT: block BB5_3{{$}} ; OPT: block BB5_2{{$}} -; OPT: br_if {{.*}}, BB5_2{{$}} +; OPT: br_if {{[^,]*}}, BB5_2{{$}} ; OPT: br BB5_3{{$}} ; OPT: BB5_2: ; OPT: BB5_3: @@ -269,7 +269,7 @@ loop: ; OPT-NOT: br ; OPT: BB8_1: ; OPT: loop BB8_2{{$}} -; OPT: br_if {{.*}}, BB8_1{{$}} +; OPT: br_if {{[^,]*}}, BB8_1{{$}} ; OPT: BB8_2: ; OPT: return ${{[0-9]+}}{{$}} define i32 @simple_loop(i32* %p, i32 %a) { @@ -333,7 +333,7 @@ exit: ; OPT-LABEL: ifelse_earlyexits: ; OPT: block BB10_4{{$}} ; OPT: block BB10_3{{$}} -; OPT: br_if {{.*}}, BB10_3{{$}} +; OPT: 
br_if {{[^,]*}}, BB10_3{{$}} ; OPT: br_if $1, BB10_4{{$}} ; OPT: br BB10_4{{$}} ; OPT: BB10_3: @@ -379,9 +379,9 @@ exit: ; OPT: loop BB11_7{{$}} ; OPT: block BB11_6{{$}} ; OPT: block BB11_5{{$}} -; OPT: br_if {{.*}}, BB11_5{{$}} +; OPT: br_if {{[^,]*}}, BB11_5{{$}} ; OPT: block BB11_4{{$}} -; OPT: br_if {{.*}}, BB11_4{{$}} +; OPT: br_if {{[^,]*}}, BB11_4{{$}} ; OPT: br BB11_6{{$}} ; OPT: BB11_4: ; OPT: br BB11_6{{$}} diff --git a/test/CodeGen/WebAssembly/cpus.ll b/test/CodeGen/WebAssembly/cpus.ll index bbc9c8fe4f31..2b77c5f475c8 100644 --- a/test/CodeGen/WebAssembly/cpus.ll +++ b/test/CodeGen/WebAssembly/cpus.ll @@ -9,8 +9,8 @@ ; RUN: llc < %s -mtriple=wasm32-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID ; RUN: llc < %s -mtriple=wasm64-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID -; CHECK-NOT: {{.*}} is not a recognized processor for this target -; INVALID: {{.*}} is not a recognized processor for this target +; CHECK-NOT: {{.*}} is not a recognized processor for this target +; INVALID: {{.*}} is not a recognized processor for this target define i32 @f(i32 %i_like_the_web) { ret i32 %i_like_the_web diff --git a/test/CodeGen/WebAssembly/switch.ll b/test/CodeGen/WebAssembly/switch.ll index c62333c336fa..b146a239b419 100644 --- a/test/CodeGen/WebAssembly/switch.ll +++ b/test/CodeGen/WebAssembly/switch.ll @@ -21,7 +21,7 @@ declare void @foo5() ; CHECK: block BB0_4{{$}} ; CHECK: block BB0_3{{$}} ; CHECK: block BB0_2{{$}} -; CHECK: tableswitch {{.*}}, BB0_2, BB0_2, BB0_2, BB0_2, BB0_2, BB0_2, BB0_2, BB0_2, BB0_3, BB0_3, BB0_3, BB0_3, BB0_3, BB0_3, BB0_3, BB0_3, BB0_4, BB0_4, BB0_4, BB0_4, BB0_4, BB0_4, BB0_5, BB0_6, BB0_7{{$}} +; CHECK: tableswitch {{[^,]*}}, BB0_2, BB0_2, BB0_2, BB0_2, BB0_2, BB0_2, BB0_2, BB0_2, BB0_3, BB0_3, BB0_3, BB0_3, BB0_3, BB0_3, BB0_3, BB0_3, BB0_4, BB0_4, BB0_4, BB0_4, BB0_4, BB0_4, BB0_5, BB0_6, BB0_7{{$}} ; CHECK: BB0_2: ; CHECK: call foo0 ; CHECK: BB0_3: @@ -101,7 +101,7 @@ sw.epilog: ; preds = %entry, %sw.bb.5, %s ; CHECK: block BB1_4{{$}} ; CHECK: block BB1_3{{$}} ; CHECK: block BB1_2{{$}} -; CHECK: tableswitch {{.*}}, BB1_2, BB1_2, BB1_2, BB1_2, BB1_2, BB1_2, BB1_2, BB1_2, BB1_3, BB1_3, BB1_3, BB1_3, BB1_3, BB1_3, BB1_3, BB1_3, BB1_4, BB1_4, BB1_4, BB1_4, BB1_4, BB1_4, BB1_5, BB1_6, BB1_7{{$}} +; CHECK: tableswitch {{[^,]*}}, BB1_2, BB1_2, BB1_2, BB1_2, BB1_2, BB1_2, BB1_2, BB1_2, BB1_3, BB1_3, BB1_3, BB1_3, BB1_3, BB1_3, BB1_3, BB1_3, BB1_4, BB1_4, BB1_4, BB1_4, BB1_4, BB1_4, BB1_5, BB1_6, BB1_7{{$}} ; CHECK: BB1_2: ; CHECK: call foo0 ; CHECK: BB1_3: From 4693393907a37825c48332f7dad147cb2764441a Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Sun, 6 Dec 2015 19:33:32 +0000 Subject: [PATCH 152/364] [WebAssembly] Enable folding of offsets into global variable addresses. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254882 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../WebAssembly/WebAssemblyISelLowering.cpp | 10 ++--- .../WebAssembly/WebAssemblyMCInstLower.cpp | 9 +++- test/CodeGen/WebAssembly/offset-folding.ll | 45 +++++++++++++++++++ 3 files changed, 56 insertions(+), 8 deletions(-) create mode 100644 test/CodeGen/WebAssembly/offset-folding.ll diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 6e1283b4d334..85fb753ed0e0 100644 --- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -196,9 +196,8 @@ FastISel *WebAssemblyTargetLowering::createFastISel( bool WebAssemblyTargetLowering::isOffsetFoldingLegal( const GlobalAddressSDNode * /*GA*/) const { - // The WebAssembly target doesn't support folding offsets into global - // addresses. - return false; + // All offsets can be folded. + return true; } MVT WebAssemblyTargetLowering::getScalarShiftAmountTy(const DataLayout & /*DL*/, @@ -528,13 +527,12 @@ SDValue WebAssemblyTargetLowering::LowerGlobalAddress(SDValue Op, SDLoc DL(Op); const auto *GA = cast(Op); EVT VT = Op.getValueType(); - assert(GA->getOffset() == 0 && - "offsets on global addresses are forbidden by isOffsetFoldingLegal"); assert(GA->getTargetFlags() == 0 && "WebAssembly doesn't set target flags"); if (GA->getAddressSpace() != 0) fail(DL, DAG, "WebAssembly only expects the 0 address space"); return DAG.getNode(WebAssemblyISD::Wrapper, DL, VT, - DAG.getTargetGlobalAddress(GA->getGlobal(), DL, VT)); + DAG.getTargetGlobalAddress(GA->getGlobal(), DL, VT, + GA->getOffset())); } SDValue diff --git a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp index dd9358035a88..2d2adeb93d2d 100644 --- a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp +++ b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp @@ -39,11 +39,16 @@ MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol( MCOperand WebAssemblyMCInstLower::LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const { + assert(MO.getTargetFlags() == 0 && "WebAssembly does not use target flags"); const MCExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); - if (!MO.isJTI() && MO.getOffset()) - llvm_unreachable("unknown symbol op"); + int64_t Offset = MO.getOffset(); + if (Offset != 0) { + assert(!MO.isJTI() && "Unexpected offset with jump table index"); + Expr = + MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(Offset, Ctx), Ctx); + } return MCOperand::createExpr(Expr); } diff --git a/test/CodeGen/WebAssembly/offset-folding.ll b/test/CodeGen/WebAssembly/offset-folding.ll new file mode 100644 index 000000000000..19b110fcfa8a --- /dev/null +++ b/test/CodeGen/WebAssembly/offset-folding.ll @@ -0,0 +1,45 @@ +; RUN: llc < %s -asm-verbose=false | FileCheck %s + +; Test that constant offsets can be folded into global addresses. + +target datalayout = "e-p:32:32-i64:64-n32:64-S128" +target triple = "wasm32-unknown-unknown" + +@x = external global [0 x i32] +@y = global [50 x i32] zeroinitializer + +; Test basic constant offsets of both defined and external symbols. 
+ +; CHECK-LABEL: test0: +; CHECK-NEXT: .result i32{{$}} +; CHECK-NEXT: i32.const $push0=, x+188{{$}} +; CHECK=NEXT: return $pop0{{$}} +define i32* @test0() { + ret i32* getelementptr ([0 x i32], [0 x i32]* @x, i32 0, i32 47) +} + +; CHECK-LABEL: test1: +; CHECK-NEXT: .result i32{{$}} +; CHECK-NEXT: i32.const $push0=, y+188{{$}} +; CHECK=NEXT: return $pop0{{$}} +define i32* @test1() { + ret i32* getelementptr ([50 x i32], [50 x i32]* @y, i32 0, i32 47) +} + +; Test zero offsets. + +; CHECK-LABEL: test2: +; CHECK-NEXT: .result i32{{$}} +; CHECK-NEXT: i32.const $push0=, x{{$}} +; CHECK=NEXT: return $pop0{{$}} +define i32* @test2() { + ret i32* getelementptr ([0 x i32], [0 x i32]* @x, i32 0, i32 0) +} + +; CHECK-LABEL: test3: +; CHECK-NEXT: .result i32{{$}} +; CHECK-NEXT: i32.const $push0=, y{{$}} +; CHECK=NEXT: return $pop0{{$}} +define i32* @test3() { + ret i32* getelementptr ([50 x i32], [50 x i32]* @y, i32 0, i32 0) +} From 577f887f754894edf6d8093ddca72ff69f0659ec Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Sun, 6 Dec 2015 19:34:57 +0000 Subject: [PATCH 153/364] [WebAssembly] Make tableswitch's 'default' operand explicit. NFC. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254883 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/WebAssembly/WebAssemblyInstrControl.td | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/lib/Target/WebAssembly/WebAssemblyInstrControl.td index 708d902e99e1..9a9468bb3909 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrControl.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrControl.td @@ -42,12 +42,12 @@ let Defs = [ARGUMENTS] in { // jump tables, so in practice we don't ever use TABLESWITCH_I64 in wasm32 mode // currently. let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in { -def TABLESWITCH_I32 : I<(outs), (ins I32:$index, variable_ops), - [(WebAssemblytableswitch I32:$index)], - "tableswitch\t$index">; -def TABLESWITCH_I64 : I<(outs), (ins I64:$index, variable_ops), - [(WebAssemblytableswitch I64:$index)], - "tableswitch\t$index">; +def TABLESWITCH_I32 : I<(outs), (ins I32:$index, bb_op:$default, variable_ops), + [(WebAssemblytableswitch I32:$index, bb:$default)], + "tableswitch\t$index, $default">; +def TABLESWITCH_I64 : I<(outs), (ins I64:$index, bb_op:$default, variable_ops), + [(WebAssemblytableswitch I64:$index, bb:$default)], + "tableswitch\t$index, $default">; } // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 // Placemarkers to indicate the start of a block or loop scope. From 001f3417071d4d6b08cc0dcd1dc03f5f90fe7623 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Sun, 6 Dec 2015 19:42:29 +0000 Subject: [PATCH 154/364] [WebAssembly] Factor out a TypeToString function, since we need it in multiple places. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254884 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../InstPrinter/WebAssemblyInstPrinter.cpp | 32 +++++++++---------- .../InstPrinter/WebAssemblyInstPrinter.h | 7 ++++ .../WebAssembly/WebAssemblyAsmPrinter.cpp | 16 +--------- 3 files changed, 24 insertions(+), 31 deletions(-) diff --git a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp index 9b94806c9533..3a151dec16f3 100644 --- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp +++ b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp @@ -98,22 +98,7 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, case WebAssembly::PARAM: case WebAssembly::RESULT: case WebAssembly::LOCAL: - switch (Op.getImm()) { - case MVT::i32: - O << "i32"; - break; - case MVT::i64: - O << "i64"; - break; - case MVT::f32: - O << "f32"; - break; - case MVT::f64: - O << "f64"; - break; - default: - llvm_unreachable("unexpected type"); - } + O << WebAssembly::TypeToString(MVT::SimpleValueType(Op.getImm())); break; default: O << Op.getImm(); @@ -126,3 +111,18 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, Op.getExpr()->print(O, &MAI); } } + +const char *llvm::WebAssembly::TypeToString(MVT Ty) { + switch (Ty.SimpleTy) { + case MVT::i32: + return "i32"; + case MVT::i64: + return "i64"; + case MVT::f32: + return "f32"; + case MVT::f64: + return "f64"; + default: + llvm_unreachable("unsupported type"); + } +} diff --git a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h index 319c8ee1d5d9..20569da0b110 100644 --- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h +++ b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h @@ -17,6 +17,7 @@ #include "llvm/MC/MCInstPrinter.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/CodeGen/MachineValueType.h" namespace llvm { @@ -40,6 +41,12 @@ class WebAssemblyInstPrinter final : public MCInstPrinter { static const char *getRegisterName(unsigned RegNo); }; +namespace WebAssembly { + +const char *TypeToString(MVT Ty); + +} // end namespace WebAssembly + } // end namespace llvm #endif diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp index cfabc21ea4d9..1b175a7f8d5b 100644 --- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp +++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -115,21 +115,7 @@ std::string WebAssemblyAsmPrinter::regToString(const MachineOperand &MO) { } const char *WebAssemblyAsmPrinter::toString(MVT VT) const { - switch (VT.SimpleTy) { - default: - break; - case MVT::f32: - return "f32"; - case MVT::f64: - return "f64"; - case MVT::i32: - return "i32"; - case MVT::i64: - return "i64"; - } - DEBUG(dbgs() << "Invalid type " << EVT(VT).getEVTString() << '\n'); - llvm_unreachable("invalid type"); - return ""; + return WebAssembly::TypeToString(VT); } //===----------------------------------------------------------------------===// From 1ecc6c0df2f39cc3c79ebc4f23f77be7e009f9df Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Sun, 6 Dec 2015 19:44:45 +0000 Subject: [PATCH 155/364] [Orc] Rename IndirectStubsManagerBase to IndirectStubsManager. No functional change. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254885 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h | 2 +- include/llvm/ExecutionEngine/Orc/IndirectionUtils.h | 6 +++--- lib/ExecutionEngine/Orc/IndirectionUtils.cpp | 2 +- lib/ExecutionEngine/Orc/OrcCBindingsStack.cpp | 3 ++- lib/ExecutionEngine/Orc/OrcCBindingsStack.h | 2 +- tools/lli/OrcLazyJIT.cpp | 3 ++- unittests/ExecutionEngine/Orc/CompileOnDemandLayerTest.cpp | 2 +- 7 files changed, 11 insertions(+), 9 deletions(-) diff --git a/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h b/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h index b7ee9b5937f7..7dab5d1bc67f 100644 --- a/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h +++ b/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h @@ -39,7 +39,7 @@ namespace orc { /// compiled and executed. template + typename IndirectStubsMgrT = IndirectStubsManager> class CompileOnDemandLayer { private: diff --git a/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h b/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h index aa75b3f46b4a..e490d894390b 100644 --- a/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h +++ b/include/llvm/ExecutionEngine/Orc/IndirectionUtils.h @@ -211,13 +211,13 @@ class LocalJITCompileCallbackManager : public JITCompileCallbackManager { }; /// @brief Base class for managing collections of named indirect stubs. -class IndirectStubsManagerBase { +class IndirectStubsManager { public: /// @brief Map type for initializing the manager. See init. typedef StringMap> StubInitsMap; - virtual ~IndirectStubsManagerBase() {} + virtual ~IndirectStubsManager() {} /// @brief Create a single stub with the given name, target address and flags. virtual std::error_code createStub(StringRef StubName, TargetAddress StubAddr, @@ -244,7 +244,7 @@ class IndirectStubsManagerBase { /// @brief IndirectStubsManager implementation for a concrete target, e.g. /// OrcX86_64. (See OrcTargetSupport.h). 
template -class IndirectStubsManager : public IndirectStubsManagerBase { +class LocalIndirectStubsManager : public IndirectStubsManager { public: std::error_code createStub(StringRef StubName, TargetAddress StubAddr, diff --git a/lib/ExecutionEngine/Orc/IndirectionUtils.cpp b/lib/ExecutionEngine/Orc/IndirectionUtils.cpp index dd6e3a3b29ae..34564e42b10f 100644 --- a/lib/ExecutionEngine/Orc/IndirectionUtils.cpp +++ b/lib/ExecutionEngine/Orc/IndirectionUtils.cpp @@ -20,7 +20,7 @@ namespace llvm { namespace orc { void JITCompileCallbackManager::anchor() {} -void IndirectStubsManagerBase::anchor() {} +void IndirectStubsManager::anchor() {} Constant* createIRTypedAddress(FunctionType &FT, TargetAddress Addr) { Constant *AddrIntVal = diff --git a/lib/ExecutionEngine/Orc/OrcCBindingsStack.cpp b/lib/ExecutionEngine/Orc/OrcCBindingsStack.cpp index d1af56d84867..e519c7f30920 100644 --- a/lib/ExecutionEngine/Orc/OrcCBindingsStack.cpp +++ b/lib/ExecutionEngine/Orc/OrcCBindingsStack.cpp @@ -36,7 +36,8 @@ OrcCBindingsStack::createIndirectStubsMgrBuilder(Triple T) { case Triple::x86_64: return [](){ - return llvm::make_unique>(); + return llvm::make_unique< + orc::LocalIndirectStubsManager>(); }; } } diff --git a/lib/ExecutionEngine/Orc/OrcCBindingsStack.h b/lib/ExecutionEngine/Orc/OrcCBindingsStack.h index d2f7fe4ac0ef..2e17624ff474 100644 --- a/lib/ExecutionEngine/Orc/OrcCBindingsStack.h +++ b/lib/ExecutionEngine/Orc/OrcCBindingsStack.h @@ -268,7 +268,7 @@ class OrcCBindingsStack { CompileLayerT CompileLayer; CODLayerT CODLayer; - std::unique_ptr IndirectStubsMgr; + std::unique_ptr IndirectStubsMgr; std::vector> GenericHandles; std::vector FreeHandleIndexes; diff --git a/tools/lli/OrcLazyJIT.cpp b/tools/lli/OrcLazyJIT.cpp index edac10b86556..4235145ee7a5 100644 --- a/tools/lli/OrcLazyJIT.cpp +++ b/tools/lli/OrcLazyJIT.cpp @@ -65,7 +65,8 @@ OrcLazyJIT::createIndirectStubsMgrBuilder(Triple T) { case Triple::x86_64: return [](){ - return llvm::make_unique>(); + return llvm::make_unique< + orc::LocalIndirectStubsManager>(); }; } } diff --git a/unittests/ExecutionEngine/Orc/CompileOnDemandLayerTest.cpp b/unittests/ExecutionEngine/Orc/CompileOnDemandLayerTest.cpp index ca508d0a7561..a27e649b616f 100644 --- a/unittests/ExecutionEngine/Orc/CompileOnDemandLayerTest.cpp +++ b/unittests/ExecutionEngine/Orc/CompileOnDemandLayerTest.cpp @@ -23,7 +23,7 @@ class DummyCallbackManager : public orc::JITCompileCallbackManager { void grow() override { llvm_unreachable("not implemented"); } }; -class DummyStubsManager : public orc::IndirectStubsManagerBase { +class DummyStubsManager : public orc::IndirectStubsManager { public: std::error_code createStub(StringRef StubName, TargetAddress InitAddr, JITSymbolFlags Flags) override { From 3b8cbdadafcf6879c1e6544ee8dd8182e6cd4133 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 6 Dec 2015 20:12:19 +0000 Subject: [PATCH 156/364] [X86][AVX] Tidied up BROADCASTPD/BROADCASTPS tests Regenerate tests using update_llc_test_checks.py git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254886 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/avx-vbroadcast.ll | 117 ++++++++++++++++++----------- 1 file changed, 72 insertions(+), 45 deletions(-) diff --git a/test/CodeGen/X86/avx-vbroadcast.ll b/test/CodeGen/X86/avx-vbroadcast.ll index 8b8c11b85875..bfc9149b107d 100644 --- a/test/CodeGen/X86/avx-vbroadcast.ll +++ b/test/CodeGen/X86/avx-vbroadcast.ll @@ -1,7 +1,11 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s 
-mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s -; CHECK: vbroadcastsd (% define <4 x i64> @A(i64* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: A: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0 +; CHECK-NEXT: retq entry: %q = load i64, i64* %ptr, align 8 %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0 @@ -11,8 +15,11 @@ entry: ret <4 x i64> %vecinit6.i } -; CHECK: vbroadcastss (% define <8 x i32> @B(i32* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: B: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vbroadcastss (%rdi), %ymm0 +; CHECK-NEXT: retq entry: %q = load i32, i32* %ptr, align 4 %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0 @@ -22,8 +29,11 @@ entry: ret <8 x i32> %vecinit6.i } -; CHECK: vbroadcastsd (% define <4 x double> @C(double* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: C: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0 +; CHECK-NEXT: retq entry: %q = load double, double* %ptr, align 8 %vecinit.i = insertelement <4 x double> undef, double %q, i32 0 @@ -33,8 +43,11 @@ entry: ret <4 x double> %vecinit6.i } -; CHECK: vbroadcastss (% define <8 x float> @D(float* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: D: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vbroadcastss (%rdi), %ymm0 +; CHECK-NEXT: retq entry: %q = load float, float* %ptr, align 4 %vecinit.i = insertelement <8 x float> undef, float %q, i32 0 @@ -46,8 +59,11 @@ entry: ;;;; 128-bit versions -; CHECK: vbroadcastss (% define <4 x float> @e(float* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: e: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vbroadcastss (%rdi), %xmm0 +; CHECK-NEXT: retq entry: %q = load float, float* %ptr, align 4 %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 @@ -57,12 +73,14 @@ entry: ret <4 x float> %vecinit6.i } - -; CHECK: _e2 -; CHECK-NOT: vbroadcastss -; CHECK: ret +; Don't broadcast constants on pre-AVX2 hardware. 
define <4 x float> @_e2(float* %ptr) nounwind uwtable readnone ssp { - %vecinit.i = insertelement <4 x float> undef, float 0xbf80000000000000, i32 0 +; CHECK-LABEL: _e2: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [-7.812500e-03,-7.812500e-03,-7.812500e-03,-7.812500e-03] +; CHECK-NEXT: retq +entry: + %vecinit.i = insertelement <4 x float> undef, float 0xbf80000000000000, i32 0 %vecinit2.i = insertelement <4 x float> %vecinit.i, float 0xbf80000000000000, i32 1 %vecinit4.i = insertelement <4 x float> %vecinit2.i, float 0xbf80000000000000, i32 2 %vecinit6.i = insertelement <4 x float> %vecinit4.i, float 0xbf80000000000000, i32 3 @@ -70,8 +88,11 @@ define <4 x float> @_e2(float* %ptr) nounwind uwtable readnone ssp { } -; CHECK: vbroadcastss (% define <4 x i32> @F(i32* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: F: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vbroadcastss (%rdi), %xmm0 +; CHECK-NEXT: retq entry: %q = load i32, i32* %ptr, align 4 %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0 @@ -83,10 +104,12 @@ entry: ; Unsupported vbroadcasts -; CHECK: _G -; CHECK-NOT: broadcast (% -; CHECK: ret define <2 x i64> @G(i64* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: G: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; CHECK-NEXT: retq entry: %q = load i64, i64* %ptr, align 8 %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0 @@ -94,18 +117,21 @@ entry: ret <2 x i64> %vecinit2.i } -; CHECK: _H -; CHECK-NOT: broadcast -; CHECK: ret define <4 x i32> @H(<4 x i32> %a) { +; CHECK-LABEL: H: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; CHECK-NEXT: retq +entry: %x = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> ret <4 x i32> %x } -; CHECK: _I -; CHECK-NOT: broadcast (% -; CHECK: ret define <2 x double> @I(double* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: I: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; CHECK-NEXT: retq entry: %q = load double, double* %ptr, align 4 %vecinit.i = insertelement <2 x double> undef, double %q, i32 0 @@ -113,10 +139,13 @@ entry: ret <2 x double> %vecinit2.i } -; CHECK: _RR -; CHECK: vbroadcastss (% -; CHECK: ret define <4 x float> @_RR(float* %ptr, i32* %k) nounwind uwtable readnone ssp { +; CHECK-LABEL: _RR: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vbroadcastss (%rdi), %xmm0 +; CHECK-NEXT: movl (%rsi), %eax +; CHECK-NEXT: movl %eax, (%rax) +; CHECK-NEXT: retq entry: %q = load float, float* %ptr, align 4 %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 @@ -129,11 +158,11 @@ entry: ret <4 x float> %vecinit6.i } - -; CHECK: _RR2 -; CHECK: vbroadcastss (% -; CHECK: ret define <4 x float> @_RR2(float* %ptr, i32* %k) nounwind uwtable readnone ssp { +; CHECK-LABEL: _RR2: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vbroadcastss (%rdi), %xmm0 +; CHECK-NEXT: retq entry: %q = load float, float* %ptr, align 4 %v = insertelement <4 x float> undef, float %q, i32 0 @@ -141,16 +170,15 @@ entry: ret <4 x float> %t } - ; These tests check that a vbroadcast instruction is used when we have a splat ; formed from a concat_vectors (via the shufflevector) of two BUILD_VECTORs ; (via the insertelements). 
-; CHECK-LABEL: splat_concat1 -; CHECK-NOT: vinsertf128 -; CHECK: vbroadcastss (% -; CHECK-NEXT: ret define <8 x float> @splat_concat1(float* %p) { +; CHECK-LABEL: splat_concat1: +; CHECK: ## BB#0: +; CHECK-NEXT: vbroadcastss (%rdi), %ymm0 +; CHECK-NEXT: retq %1 = load float, float* %p, align 4 %2 = insertelement <4 x float> undef, float %1, i32 0 %3 = insertelement <4 x float> %2, float %1, i32 1 @@ -160,11 +188,11 @@ define <8 x float> @splat_concat1(float* %p) { ret <8 x float> %6 } -; CHECK-LABEL: splat_concat2 -; CHECK-NOT: vinsertf128 -; CHECK: vbroadcastss (% -; CHECK-NEXT: ret define <8 x float> @splat_concat2(float* %p) { +; CHECK-LABEL: splat_concat2: +; CHECK: ## BB#0: +; CHECK-NEXT: vbroadcastss (%rdi), %ymm0 +; CHECK-NEXT: retq %1 = load float, float* %p, align 4 %2 = insertelement <4 x float> undef, float %1, i32 0 %3 = insertelement <4 x float> %2, float %1, i32 1 @@ -178,11 +206,11 @@ define <8 x float> @splat_concat2(float* %p) { ret <8 x float> %10 } -; CHECK-LABEL: splat_concat3 -; CHECK-NOT: vinsertf128 -; CHECK: vbroadcastsd (% -; CHECK-NEXT: ret define <4 x double> @splat_concat3(double* %p) { +; CHECK-LABEL: splat_concat3: +; CHECK: ## BB#0: +; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0 +; CHECK-NEXT: retq %1 = load double, double* %p, align 8 %2 = insertelement <2 x double> undef, double %1, i32 0 %3 = insertelement <2 x double> %2, double %1, i32 1 @@ -190,11 +218,11 @@ define <4 x double> @splat_concat3(double* %p) { ret <4 x double> %4 } -; CHECK-LABEL: splat_concat4 -; CHECK-NOT: vinsertf128 -; CHECK: vbroadcastsd (% -; CHECK-NEXT: ret define <4 x double> @splat_concat4(double* %p) { +; CHECK-LABEL: splat_concat4: +; CHECK: ## BB#0: +; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0 +; CHECK-NEXT: retq %1 = load double, double* %p, align 8 %2 = insertelement <2 x double> undef, double %1, i32 0 %3 = insertelement <2 x double> %2, double %1, i32 1 @@ -203,4 +231,3 @@ define <4 x double> @splat_concat4(double* %p) { %6 = shufflevector <2 x double> %3, <2 x double> %5, <4 x i32> ret <4 x double> %6 } - From 41e546b231af7f317b84d2b342125cea73a1bb46 Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Sun, 6 Dec 2015 23:05:38 +0000 Subject: [PATCH 157/364] [Verifier] Fix !dbg validation if Scope is the Subprogram Summary: We are inserting both Scope and SP into the Seen map and check whether it was already there in which case we skip the validation (the idea being that we already checked this Subprogram before). However, if (Scope == SP) as MDNodes, then inserting the Scope, will trigger the Seen check causing us to incorrectly not validate this !dbg attachment. Fix this by not performing the SP Seen check if Scope == SP Reviewers: pcc, dexonsmith, dblaikie Subscribers: dblaikie, llvm-commits Differential Revision: http://reviews.llvm.org/D14697 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254887 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/IR/Verifier.cpp | 5 ++++- test/CodeGen/X86/2010-05-28-Crash.ll | 2 +- test/Verifier/func-dbg.ll | 25 +++++++++++++++++++++++++ 3 files changed, 30 insertions(+), 2 deletions(-) create mode 100644 test/Verifier/func-dbg.ll diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp index 5cbb597ca269..96b8a779577d 100644 --- a/lib/IR/Verifier.cpp +++ b/lib/IR/Verifier.cpp @@ -1814,7 +1814,10 @@ void Verifier::visitFunction(const Function &F) { continue; DISubprogram *SP = Scope ? 
Scope->getSubprogram() : nullptr; - if (SP && !Seen.insert(SP).second) + + // Scope and SP could be the same MDNode and we don't want to skip + // validation in that case + if (SP && ((Scope != SP) && !Seen.insert(SP).second)) continue; // FIXME: Once N is canonical, check "SP == &N". diff --git a/test/CodeGen/X86/2010-05-28-Crash.ll b/test/CodeGen/X86/2010-05-28-Crash.ll index 678f1befad1d..7967d45c2ee8 100644 --- a/test/CodeGen/X86/2010-05-28-Crash.ll +++ b/test/CodeGen/X86/2010-05-28-Crash.ll @@ -16,7 +16,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon define i32 @bar(i32 %x) nounwind optsize ssp !dbg !8 { entry: tail call void @llvm.dbg.value(metadata i32 %x, i64 0, metadata !7, metadata !DIExpression()), !dbg !DILocation(scope: !8) - tail call void @llvm.dbg.value(metadata i32 1, i64 0, metadata !0, metadata !DIExpression()) nounwind, !dbg !DILocation(scope: !1) + tail call void @llvm.dbg.value(metadata i32 1, i64 0, metadata !0, metadata !DIExpression()) nounwind, !dbg !DILocation(scope: !1, inlinedAt: !DILocation(scope: !8)) %0 = tail call i32 (...) @zoo(i32 1) nounwind, !dbg !12 ; [#uses=1] %1 = add nsw i32 %0, %x, !dbg !13 ; [#uses=1] ret i32 %1, !dbg !13 diff --git a/test/Verifier/func-dbg.ll b/test/Verifier/func-dbg.ll new file mode 100644 index 000000000000..e56de94d18c9 --- /dev/null +++ b/test/Verifier/func-dbg.ll @@ -0,0 +1,25 @@ +; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s + +define i32 @foo() !dbg !4 { +entry: + ret i32 0, !dbg !6 +} + +define i32 @bar() !dbg !5 { +entry: +; CHECK: !dbg attachment points at wrong subprogram for function + ret i32 0, !dbg !6 +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!7, !8} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: false, runtimeVersion: 0, emissionKind: 0, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2) +!1 = !DIFile(filename: "dwarf-test.c", directory: "test") +!2 = !{} +!3 = !{!4, !5} +!4 = distinct !DISubprogram(name: "foo", scope: !0, isDefinition: true) +!5 = distinct !DISubprogram(name: "bar", scope: !0, isDefinition: true) +!6 = !DILocation(line: 7, scope: !4) +!7 = !{i32 2, !"Dwarf Version", i32 3} +!8 = !{i32 1, !"Debug Info Version", i32 3} From 0511c3bfdd84c473ea36ae52feacd6f40d1422d0 Mon Sep 17 00:00:00 2001 From: Davide Italiano Date: Mon, 7 Dec 2015 00:03:28 +0000 Subject: [PATCH 158/364] [llvm-objdump/MachoDump] Make code much more concise. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254888 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/llvm-objdump/MachODump.cpp | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/tools/llvm-objdump/MachODump.cpp b/tools/llvm-objdump/MachODump.cpp index 3fea0b2a4eee..b270057151e1 100644 --- a/tools/llvm-objdump/MachODump.cpp +++ b/tools/llvm-objdump/MachODump.cpp @@ -7608,26 +7608,11 @@ static void PrintUuidLoadCommand(MachO::uuid_command uuid) { else outs() << "\n"; outs() << " uuid "; - outs() << format("%02" PRIX32, uuid.uuid[0]); - outs() << format("%02" PRIX32, uuid.uuid[1]); - outs() << format("%02" PRIX32, uuid.uuid[2]); - outs() << format("%02" PRIX32, uuid.uuid[3]); - outs() << "-"; - outs() << format("%02" PRIX32, uuid.uuid[4]); - outs() << format("%02" PRIX32, uuid.uuid[5]); - outs() << "-"; - outs() << format("%02" PRIX32, uuid.uuid[6]); - outs() << format("%02" PRIX32, uuid.uuid[7]); - outs() << "-"; - outs() << format("%02" PRIX32, uuid.uuid[8]); - outs() << format("%02" PRIX32, uuid.uuid[9]); - outs() << "-"; - outs() << format("%02" PRIX32, uuid.uuid[10]); - outs() << format("%02" PRIX32, uuid.uuid[11]); - outs() << format("%02" PRIX32, uuid.uuid[12]); - outs() << format("%02" PRIX32, uuid.uuid[13]); - outs() << format("%02" PRIX32, uuid.uuid[14]); - outs() << format("%02" PRIX32, uuid.uuid[15]); + for (int i = 0; i < 16; ++i) { + outs() << format("%02" PRIX32, uuid.uuid[i]); + if (i == 3 || i == 5 || i == 7 || i == 9) + outs() << "-"; + } outs() << "\n"; } From 260f72a26498347915e351d16b642a679abf64a9 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 7 Dec 2015 06:01:51 +0000 Subject: [PATCH 159/364] Add uint8_t size to LegalizeAction enum so we can use the enum type directly and remove some casts. NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254893 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Target/TargetLowering.h | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index e99c9f758f8e..7e981776290d 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -83,7 +83,7 @@ class TargetLoweringBase { public: /// This enum indicates whether operations are valid for a target, and if not, /// what action should be used to make them valid. - enum LegalizeAction { + enum LegalizeAction : uint8_t { Legal, // The target natively supports this operation. Promote, // This operation should be executed in a larger type. Expand, // Try to expand this to other ops, otherwise use a libcall. @@ -550,8 +550,7 @@ class TargetLoweringBase { // If a target-specific SDNode requires legalization, require the target // to provide custom legalization for it. if (Op > array_lengthof(OpActions[0])) return Custom; - unsigned I = (unsigned) VT.getSimpleVT().SimpleTy; - return (LegalizeAction)OpActions[I][Op]; + return OpActions[(unsigned)VT.getSimpleVT().SimpleTy][Op]; } /// Return true if the specified operation is legal on this target or can be @@ -595,7 +594,7 @@ class TargetLoweringBase { unsigned MemI = (unsigned) MemVT.getSimpleVT().SimpleTy; assert(ExtType < ISD::LAST_LOADEXT_TYPE && ValI < MVT::LAST_VALUETYPE && MemI < MVT::LAST_VALUETYPE && "Table isn't big enough!"); - return (LegalizeAction)LoadExtActions[ValI][MemI][ExtType]; + return LoadExtActions[ValI][MemI][ExtType]; } /// Return true if the specified load with extension is legal on this target. 
@@ -621,7 +620,7 @@ class TargetLoweringBase { unsigned MemI = (unsigned) MemVT.getSimpleVT().SimpleTy; assert(ValI < MVT::LAST_VALUETYPE && MemI < MVT::LAST_VALUETYPE && "Table isn't big enough!"); - return (LegalizeAction)TruncStoreActions[ValI][MemI]; + return TruncStoreActions[ValI][MemI]; } /// Return true if the specified store with truncation is legal on this @@ -1316,7 +1315,7 @@ class TargetLoweringBase { void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action) { assert(Op < array_lengthof(OpActions[0]) && "Table isn't big enough!"); - OpActions[(unsigned)VT.SimpleTy][Op] = (uint8_t)Action; + OpActions[(unsigned)VT.SimpleTy][Op] = Action; } /// Indicate that the specified load with extension does not work with the @@ -1325,7 +1324,7 @@ class TargetLoweringBase { LegalizeAction Action) { assert(ExtType < ISD::LAST_LOADEXT_TYPE && ValVT.isValid() && MemVT.isValid() && "Table isn't big enough!"); - LoadExtActions[ValVT.SimpleTy][MemVT.SimpleTy][ExtType] = (uint8_t)Action; + LoadExtActions[(unsigned)ValVT.SimpleTy][MemVT.SimpleTy][ExtType] = Action; } /// Indicate that the specified truncating store does not work with the @@ -1333,7 +1332,7 @@ class TargetLoweringBase { void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action) { assert(ValVT.isValid() && MemVT.isValid() && "Table isn't big enough!"); - TruncStoreActions[ValVT.SimpleTy][MemVT.SimpleTy] = (uint8_t)Action; + TruncStoreActions[(unsigned)ValVT.SimpleTy][MemVT.SimpleTy] = Action; } /// Indicate that the specified indexed load does or does not work with the @@ -1886,17 +1885,17 @@ class TargetLoweringBase { /// operations are Legal (aka, supported natively by the target), but /// operations that are not should be described. Note that operations on /// non-legal value types are not described here. - uint8_t OpActions[MVT::LAST_VALUETYPE][ISD::BUILTIN_OP_END]; + LegalizeAction OpActions[MVT::LAST_VALUETYPE][ISD::BUILTIN_OP_END]; /// For each load extension type and each value type, keep a LegalizeAction /// that indicates how instruction selection should deal with a load of a /// specific value type and extension type. - uint8_t LoadExtActions[MVT::LAST_VALUETYPE][MVT::LAST_VALUETYPE] - [ISD::LAST_LOADEXT_TYPE]; + LegalizeAction LoadExtActions[MVT::LAST_VALUETYPE][MVT::LAST_VALUETYPE] + [ISD::LAST_LOADEXT_TYPE]; /// For each value type pair keep a LegalizeAction that indicates whether a /// truncating store of a specific value type and truncating type is legal. - uint8_t TruncStoreActions[MVT::LAST_VALUETYPE][MVT::LAST_VALUETYPE]; + LegalizeAction TruncStoreActions[MVT::LAST_VALUETYPE][MVT::LAST_VALUETYPE]; /// For each indexed mode and each value type, keep a pair of LegalizeAction /// that indicates how instruction selection should deal with the load / From 1632a3b15247d3710382f81dc1ba6b4debf7c531 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 7 Dec 2015 06:31:41 +0000 Subject: [PATCH 160/364] Add uint8_t size to LegalizeTypeAction enum and use the enum type directly to remove some typecasts. 
NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254895 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Target/TargetLowering.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index 7e981776290d..f7152565f914 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -93,7 +93,7 @@ class TargetLoweringBase { /// This enum indicates whether a types are legal for a target, and if not, /// what action should be used to make them valid. - enum LegalizeTypeAction { + enum LegalizeTypeAction : uint8_t { TypeLegal, // The target natively supports this type. TypePromoteInteger, // Replace this integer with a larger one. TypeExpandInteger, // Split this integer into two of half the size. @@ -413,20 +413,20 @@ class TargetLoweringBase { class ValueTypeActionImpl { /// ValueTypeActions - For each value type, keep a LegalizeTypeAction enum /// that indicates how instruction selection should deal with the type. - uint8_t ValueTypeActions[MVT::LAST_VALUETYPE]; + LegalizeTypeAction ValueTypeActions[MVT::LAST_VALUETYPE]; public: ValueTypeActionImpl() { - std::fill(std::begin(ValueTypeActions), std::end(ValueTypeActions), 0); + std::fill(std::begin(ValueTypeActions), std::end(ValueTypeActions), + TypeLegal); } LegalizeTypeAction getTypeAction(MVT VT) const { - return (LegalizeTypeAction)ValueTypeActions[VT.SimpleTy]; + return ValueTypeActions[VT.SimpleTy]; } void setTypeAction(MVT VT, LegalizeTypeAction Action) { - unsigned I = VT.SimpleTy; - ValueTypeActions[I] = Action; + ValueTypeActions[VT.SimpleTy] = Action; } }; From 01654c4941dcf2447b293500ef34ca9c299ee8f0 Mon Sep 17 00:00:00 2001 From: Zlatko Buljan Date: Mon, 7 Dec 2015 08:29:31 +0000 Subject: [PATCH 161/364] [mips][microMIPS] Implement LH, LHE, LHU and LHUE instructions Differential Revision: http://reviews.llvm.org/D9824 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254897 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/Mips/MicroMipsInstrInfo.td | 12 ++++--- lib/Target/Mips/MipsInstrInfo.td | 35 +++++++++++++++---- .../Disassembler/Mips/micromips32r6/valid.txt | 4 +++ .../Disassembler/Mips/micromips64r6/valid.txt | 4 +++ test/MC/Mips/micromips-invalid.s | 12 +++++++ test/MC/Mips/micromips32r6/invalid.s | 12 +++++++ test/MC/Mips/micromips32r6/valid.s | 4 +++ test/MC/Mips/micromips64r6/invalid.s | 12 +++++++ test/MC/Mips/micromips64r6/valid.s | 4 +++ test/MC/Mips/mips32r6/invalid.s | 12 +++++++ test/MC/Mips/mips64r6/invalid.s | 12 +++++++ 11 files changed, 113 insertions(+), 10 deletions(-) diff --git a/lib/Target/Mips/MicroMipsInstrInfo.td b/lib/Target/Mips/MicroMipsInstrInfo.td index 175a9559e004..5745601b32e4 100644 --- a/lib/Target/Mips/MicroMipsInstrInfo.td +++ b/lib/Target/Mips/MicroMipsInstrInfo.td @@ -738,8 +738,10 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in { let DecoderMethod = "DecodeMemMMImm16" in { def LB_MM : Load<"lb", GPR32Opnd>, MMRel, LW_FM_MM<0x7>; def LBu_MM : Load<"lbu", GPR32Opnd>, MMRel, LW_FM_MM<0x5>; - def LH_MM : Load<"lh", GPR32Opnd>, MMRel, LW_FM_MM<0xf>; - def LHu_MM : Load<"lhu", GPR32Opnd>, MMRel, LW_FM_MM<0xd>; + def LH_MM : LoadMemory<"lh", GPR32Opnd, mem_simm16gpr>, MMRel, + LW_FM_MM<0xf>; + def LHu_MM : LoadMemory<"lhu", GPR32Opnd, mem_simm16gpr>, MMRel, + LW_FM_MM<0xd>; def LW_MM : Load<"lw", GPR32Opnd>, MMRel, LW_FM_MM<0x3f>; def SB_MM : Store<"sb", GPR32Opnd>, MMRel, LW_FM_MM<0x6>; def SH_MM : Store<"sh", 
GPR32Opnd>, MMRel, LW_FM_MM<0xe>; @@ -749,8 +751,10 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in { let DecoderMethod = "DecodeMemMMImm9" in { def LBE_MM : Load<"lbe", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0x6, 0x4>; def LBuE_MM : Load<"lbue", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0x6, 0x0>; - def LHE_MM : Load<"lhe", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0x6, 0x5>; - def LHuE_MM : Load<"lhue", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0x6, 0x1>; + def LHE_MM : LoadMemory<"lhe", GPR32Opnd, mem_simm9gpr>, + POOL32C_LHUE_FM_MM<0x18, 0x6, 0x5>; + def LHuE_MM : LoadMemory<"lhue", GPR32Opnd, mem_simm9gpr>, + POOL32C_LHUE_FM_MM<0x18, 0x6, 0x1>; def LWE_MM : Load<"lwe", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0x6, 0x7>; def SBE_MM : Store<"sbe", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0xa, 0x4>; def SHE_MM : Store<"she", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0xa, 0x5>; diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index e0b317e9bf2d..f3be7fc46187 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -598,6 +598,14 @@ def MipsMemSimm16AsmOperand : AsmOperandClass { let PredicateMethod = "isMemWithSimmOffset<16>"; } +def MipsMemSimm16GPRAsmOperand : AsmOperandClass { + let Name = "MemOffsetSimm16GPR"; + let SuperClasses = [MipsMemAsmOperand]; + let RenderMethod = "addMemOperands"; + let ParserMethod = "parseMemOperand"; + let PredicateMethod = "isMemWithSimmOffsetGPR<16>"; +} + def MipsInvertedImmoperand : AsmOperandClass { let Name = "InvNum"; let RenderMethod = "addImmOperands"; @@ -653,6 +661,12 @@ def mem_simm16 : mem_generic { let ParserMatchClass = MipsMemSimm16AsmOperand; } +def mem_simm16gpr : mem_generic { + let MIOperandInfo = (ops ptr_rc, simm16); + let EncoderMethod = "getMemEncoding"; + let ParserMatchClass = MipsMemSimm16GPRAsmOperand; +} + def mem_ea : Operand { let PrintMethod = "printMemOperandEA"; let MIOperandInfo = (ops ptr_rc, simm16); @@ -820,15 +834,21 @@ class LoadUpper: } // Memory Load/Store -class Load : - InstSE<(outs RO:$rt), (ins mem:$addr), !strconcat(opstr, "\t$rt, $addr"), +class LoadMemory : + InstSE<(outs RO:$rt), (ins MO:$addr), !strconcat(opstr, "\t$rt, $addr"), [(set RO:$rt, (OpNode Addr:$addr))], Itin, FrmI, opstr> { let DecoderMethod = "DecodeMem"; let canFoldAsLoad = 1; let mayLoad = 1; } +class Load : + LoadMemory; + class StoreMemory : @@ -1374,9 +1394,12 @@ def ROTRV : MMRel, shift_rotate_reg<"rotrv", GPR32Opnd, II_ROTRV, rotr>, def LB : Load<"lb", GPR32Opnd, sextloadi8, II_LB>, MMRel, LW_FM<0x20>; def LBu : Load<"lbu", GPR32Opnd, zextloadi8, II_LBU, addrDefault>, MMRel, LW_FM<0x24>; -def LH : Load<"lh", GPR32Opnd, sextloadi16, II_LH, addrDefault>, MMRel, - LW_FM<0x21>; -def LHu : Load<"lhu", GPR32Opnd, zextloadi16, II_LHU>, MMRel, LW_FM<0x25>; +let AdditionalPredicates = [NotInMicroMips] in { + def LH : LoadMemory<"lh", GPR32Opnd, mem_simm16gpr, sextloadi16, II_LH, + addrDefault>, MMRel, LW_FM<0x21>; + def LHu : LoadMemory<"lhu", GPR32Opnd, mem_simm16gpr, zextloadi16, II_LHU>, + MMRel, LW_FM<0x25>; +} let AdditionalPredicates = [NotInMicroMips] in { def LW : StdMMR6Rel, Load<"lw", GPR32Opnd, load, II_LW, addrDefault>, MMRel, LW_FM<0x23>; diff --git a/test/MC/Disassembler/Mips/micromips32r6/valid.txt b/test/MC/Disassembler/Mips/micromips32r6/valid.txt index 5fa2138262a4..d96a243eef8c 100644 --- a/test/MC/Disassembler/Mips/micromips32r6/valid.txt +++ b/test/MC/Disassembler/Mips/micromips32r6/valid.txt @@ -253,3 +253,7 @@ 0x55 0x04 0x12 0x78 # CHECK: selnez.d $f2, $f4, $f8 0x54 
0x62 0x00 0x60 # CHECK: class.s $f2, $f3 0x54 0x82 0x02 0x60 # CHECK: class.d $f2, $f4 +0x3c 0x44 0x00 0x08 # CHECK: lh $2, 8($4) +0x60 0x82 0x6a 0x08 # CHECK: lhe $4, 8($2) +0x34 0x82 0x00 0x08 # CHECK: lhu $4, 8($2) +0x60 0x82 0x62 0x08 # CHECK: lhue $4, 8($2) diff --git a/test/MC/Disassembler/Mips/micromips64r6/valid.txt b/test/MC/Disassembler/Mips/micromips64r6/valid.txt index 10a9687384ea..fadd61c9ecd7 100644 --- a/test/MC/Disassembler/Mips/micromips64r6/valid.txt +++ b/test/MC/Disassembler/Mips/micromips64r6/valid.txt @@ -166,3 +166,7 @@ 0x55 0x04 0x12 0x78 # CHECK: selnez.d $f2, $f4, $f8 0x54 0x62 0x00 0x60 # CHECK: class.s $f2, $f3 0x54 0x82 0x02 0x60 # CHECK: class.d $f2, $f4 +0x3c 0x44 0x00 0x08 # CHECK: lh $2, 8($4) +0x60 0x82 0x6a 0x08 # CHECK: lhe $4, 8($2) +0x34 0x82 0x00 0x08 # CHECK: lhu $4, 8($2) +0x60 0x82 0x62 0x08 # CHECK: lhue $4, 8($2) diff --git a/test/MC/Mips/micromips-invalid.s b/test/MC/Mips/micromips-invalid.s index ed0ab1bdc233..63a1c914301a 100644 --- a/test/MC/Mips/micromips-invalid.s +++ b/test/MC/Mips/micromips-invalid.s @@ -91,3 +91,15 @@ jraddiusp 33 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: expected both 7-bit unsigned immediate and multiple of 4 jraddiusp 125 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: expected both 7-bit unsigned immediate and multiple of 4 jraddiusp 132 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: expected both 7-bit unsigned immediate and multiple of 4 + lh $33, 8($4) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhe $34, 8($2) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhu $35, 8($2) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhue $36, 8($2) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lh $2, 8($34) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhe $4, 8($33) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhu $4, 8($35) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhue $4, 8($37) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lh $2, 65536($4) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhe $4, 512($2) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhu $4, 65536($2) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhue $4, 512($2) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/test/MC/Mips/micromips32r6/invalid.s b/test/MC/Mips/micromips32r6/invalid.s index 14259eadaeac..8f1e64acf777 100644 --- a/test/MC/Mips/micromips32r6/invalid.s +++ b/test/MC/Mips/micromips32r6/invalid.s @@ -109,3 +109,15 @@ swm16 $16-$20, 8($sp) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction swm16 $16, $17, $ra, 8($fp) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction swm16 $16, $17, $ra, 64($sp) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lh $33, 8($4) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhe $34, 8($2) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhu $35, 8($2) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhue $36, 8($2) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lh $2, 8($34) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhe $4, 8($33) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhu $4, 8($35) # 
CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhue $4, 8($37) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lh $2, 65536($4) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhe $4, 512($2) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhu $4, 65536($2) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhue $4, 512($2) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/test/MC/Mips/micromips32r6/valid.s b/test/MC/Mips/micromips32r6/valid.s index 194b15e1a4f6..85958bc49179 100644 --- a/test/MC/Mips/micromips32r6/valid.s +++ b/test/MC/Mips/micromips32r6/valid.s @@ -246,3 +246,7 @@ selnez.d $f2, $f4, $f8 # CHECK: selnez.d $f2, $f4, $f8 # encoding: [0x55,0x04,0x12,0x78] class.s $f2, $f3 # CHECK: class.s $f2, $f3 # encoding: [0x54,0x62,0x00,0x60] class.d $f2, $f4 # CHECK: class.d $f2, $f4 # encoding: [0x54,0x82,0x02,0x60] + lh $2, 8($4) # CHECK: lh $2, 8($4) # encoding: [0x3c,0x44,0x00,0x08] + lhe $4, 8($2) # CHECK: lhe $4, 8($2) # encoding: [0x60,0x82,0x6a,0x08] + lhu $4, 8($2) # CHECK: lhu $4, 8($2) # encoding: [0x34,0x82,0x00,0x08] + lhue $4, 8($2) # CHECK: lhue $4, 8($2) # encoding: [0x60,0x82,0x62,0x08] diff --git a/test/MC/Mips/micromips64r6/invalid.s b/test/MC/Mips/micromips64r6/invalid.s index 51b9f7530485..27b5146af3eb 100644 --- a/test/MC/Mips/micromips64r6/invalid.s +++ b/test/MC/Mips/micromips64r6/invalid.s @@ -118,3 +118,15 @@ swm16 $16-$20, 8($sp) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction swm16 $16, $17, $ra, 8($fp) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction swm16 $16, $17, $ra, 64($sp) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lh $33, 8($4) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhe $34, 8($2) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhu $35, 8($2) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhue $36, 8($2) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lh $2, 8($34) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhe $4, 8($33) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhu $4, 8($35) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhue $4, 8($37) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lh $2, 65536($4) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhe $4, 512($2) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhu $4, 65536($2) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhue $4, 512($2) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/test/MC/Mips/micromips64r6/valid.s b/test/MC/Mips/micromips64r6/valid.s index 1c8781b6e966..edee56adeda4 100644 --- a/test/MC/Mips/micromips64r6/valid.s +++ b/test/MC/Mips/micromips64r6/valid.s @@ -146,5 +146,9 @@ a: selnez.d $f2, $f4, $f8 # CHECK: selnez.d $f2, $f4, $f8 # encoding: [0x55,0x04,0x12,0x78] class.s $f2, $f3 # CHECK: class.s $f2, $f3 # encoding: [0x54,0x62,0x00,0x60] class.d $f2, $f4 # CHECK: class.d $f2, $f4 # encoding: [0x54,0x82,0x02,0x60] + lh $2, 8($4) # CHECK: lh $2, 8($4) # encoding: [0x3c,0x44,0x00,0x08] + lhe $4, 8($2) # CHECK: lhe $4, 8($2) # encoding: [0x60,0x82,0x6a,0x08] + lhu $4, 8($2) # CHECK: lhu $4, 8($2) # encoding: 
[0x34,0x82,0x00,0x08] + lhue $4, 8($2) # CHECK: lhue $4, 8($2) # encoding: [0x60,0x82,0x62,0x08] 1: diff --git a/test/MC/Mips/mips32r6/invalid.s b/test/MC/Mips/mips32r6/invalid.s index 452cd3a5ee62..56edcb372a4a 100644 --- a/test/MC/Mips/mips32r6/invalid.s +++ b/test/MC/Mips/mips32r6/invalid.s @@ -19,6 +19,18 @@ local_label: break 1024, 5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction break 7, 1024 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction break 1024, 1024 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lh $33, 8($4) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhe $34, 8($2) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhu $35, 8($2) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhue $36, 8($2) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lh $2, 8($34) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhe $4, 8($33) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhu $4, 8($35) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhue $4, 8($37) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lh $2, 65536($4) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhe $4, 512($2) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhu $4, 65536($2) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhue $4, 512($2) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction // FIXME: Following tests are temporarely disabled, until "PredicateControl not in hierarchy" problem is resolved bltl $7, $8, local_label # -CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled bltul $7, $8, local_label # -CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled diff --git a/test/MC/Mips/mips64r6/invalid.s b/test/MC/Mips/mips64r6/invalid.s index 8d68b51c5111..c615b06e4bfd 100644 --- a/test/MC/Mips/mips64r6/invalid.s +++ b/test/MC/Mips/mips64r6/invalid.s @@ -17,6 +17,18 @@ local_label: break 1024, 5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction break 7, 1024 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction break 1024, 1024 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lh $33, 8($4) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhe $34, 8($2) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhu $35, 8($2) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhue $36, 8($2) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lh $2, 8($34) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhe $4, 8($33) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhu $4, 8($35) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhue $4, 8($37) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lh $2, 65536($4) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhe $4, 512($2) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhu $4, 65536($2) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction + lhue $4, 512($2) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for 
instruction // FIXME: Following tests are temporarely disabled, until "PredicateControl not in hierarchy" problem is resolved bltl $7, $8, local_label # -CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled bltul $7, $8, local_label # -CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled From 0239b7553bd7dc0071475e9bf63effa983690666 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 7 Dec 2015 09:09:54 +0000 Subject: [PATCH 162/364] [X86][AVX] Added tests to load+broadcast non-zero'th vector elements Baseline for an upcoming patch for PR23022 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254898 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/avx-vbroadcast.ll | 153 +++++++++++++++++++ test/CodeGen/X86/avx2-vbroadcast.ll | 225 +++++++++++++++++++++++++++- 2 files changed, 375 insertions(+), 3 deletions(-) diff --git a/test/CodeGen/X86/avx-vbroadcast.ll b/test/CodeGen/X86/avx-vbroadcast.ll index bfc9149b107d..5c0f43da876d 100644 --- a/test/CodeGen/X86/avx-vbroadcast.ll +++ b/test/CodeGen/X86/avx-vbroadcast.ll @@ -102,6 +102,159 @@ entry: ret <4 x i32> %vecinit6.i } +; FIXME: Pointer adjusted broadcasts + +define <4 x i32> @load_splat_4i32_4i32_1111(<4 x i32>* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: load_splat_4i32_4i32_1111: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,1,1] +; CHECK-NEXT: retq +entry: + %ld = load <4 x i32>, <4 x i32>* %ptr + %ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <4 x i32> + ret <4 x i32> %ret +} + +define <8 x i32> @load_splat_8i32_4i32_33333333(<4 x i32>* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: load_splat_8i32_4i32_33333333: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,3,3,3] +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; CHECK-NEXT: retq +entry: + %ld = load <4 x i32>, <4 x i32>* %ptr + %ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <8 x i32> + ret <8 x i32> %ret +} + +define <8 x i32> @load_splat_8i32_8i32_55555555(<8 x i32>* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: load_splat_8i32_8i32_55555555: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vmovaps (%rdi), %ymm0 +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; CHECK-NEXT: retq +entry: + %ld = load <8 x i32>, <8 x i32>* %ptr + %ret = shufflevector <8 x i32> %ld, <8 x i32> undef, <8 x i32> + ret <8 x i32> %ret +} + +define <4 x float> @load_splat_4f32_4f32_1111(<4 x float>* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: load_splat_4f32_4f32_1111: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,1,1,1] +; CHECK-NEXT: retq +entry: + %ld = load <4 x float>, <4 x float>* %ptr + %ret = shufflevector <4 x float> %ld, <4 x float> undef, <4 x i32> + ret <4 x float> %ret +} + +define <8 x float> @load_splat_8f32_4f32_33333333(<4 x float>* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: load_splat_8f32_4f32_33333333: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,3,3,3] +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; CHECK-NEXT: retq +entry: + %ld = load <4 x float>, <4 x float>* %ptr + %ret = shufflevector <4 x float> %ld, <4 x float> undef, <8 x i32> + ret <8 x float> %ret +} + +define <8 x float> @load_splat_8f32_8f32_55555555(<8 x float>* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: 
load_splat_8f32_8f32_55555555: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vmovaps (%rdi), %ymm0 +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; CHECK-NEXT: retq +entry: + %ld = load <8 x float>, <8 x float>* %ptr + %ret = shufflevector <8 x float> %ld, <8 x float> undef, <8 x i32> + ret <8 x float> %ret +} + +define <2 x i64> @load_splat_2i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: load_splat_2i64_2i64_1111: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,3,2,3] +; CHECK-NEXT: retq +entry: + %ld = load <2 x i64>, <2 x i64>* %ptr + %ret = shufflevector <2 x i64> %ld, <2 x i64> undef, <2 x i32> + ret <2 x i64> %ret +} + +define <4 x i64> @load_splat_4i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: load_splat_4i64_2i64_1111: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vmovaps (%rdi), %xmm0 +; CHECK-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; CHECK-NEXT: retq +entry: + %ld = load <2 x i64>, <2 x i64>* %ptr + %ret = shufflevector <2 x i64> %ld, <2 x i64> undef, <4 x i32> + ret <4 x i64> %ret +} + +define <4 x i64> @load_splat_4i64_4i64_2222(<4 x i64>* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: load_splat_4i64_4i64_2222: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vmovapd (%rdi), %ymm0 +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; CHECK-NEXT: retq +entry: + %ld = load <4 x i64>, <4 x i64>* %ptr + %ret = shufflevector <4 x i64> %ld, <4 x i64> undef, <4 x i32> + ret <4 x i64> %ret +} + +define <2 x double> @load_splat_2f64_2f64_1111(<2 x double>* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: load_splat_2f64_2f64_1111: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vmovaps (%rdi), %xmm0 +; CHECK-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: retq +entry: + %ld = load <2 x double>, <2 x double>* %ptr + %ret = shufflevector <2 x double> %ld, <2 x double> undef, <2 x i32> + ret <2 x double> %ret +} + +define <4 x double> @load_splat_4f64_2f64_1111(<2 x double>* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: load_splat_4f64_2f64_1111: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vmovaps (%rdi), %xmm0 +; CHECK-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; CHECK-NEXT: retq +entry: + %ld = load <2 x double>, <2 x double>* %ptr + %ret = shufflevector <2 x double> %ld, <2 x double> undef, <4 x i32> + ret <4 x double> %ret +} + +define <4 x double> @load_splat_4f64_4f64_2222(<4 x double>* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: load_splat_4f64_4f64_2222: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vmovapd (%rdi), %ymm0 +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; CHECK-NEXT: retq +entry: + %ld = load <4 x double>, <4 x double>* %ptr + %ret = shufflevector <4 x double> %ld, <4 x double> undef, <4 x i32> + ret <4 x double> %ret +} + ; Unsupported vbroadcasts define <2 x i64> @G(i64* %ptr) nounwind uwtable readnone ssp { diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll index 418707cdc237..186f50873650 100644 --- a/test/CodeGen/X86/avx2-vbroadcast.ll +++ b/test/CodeGen/X86/avx2-vbroadcast.ll @@ -172,6 +172,225 @@ entry: ret <4 x i64> 
%q3 } +; FIXME: Pointer adjusted broadcasts + +define <16 x i8> @load_splat_16i8_16i8_1111111111111111(<16 x i8>* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: load_splat_16i8_16i8_1111111111111111: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vmovdqa (%rdi), %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; CHECK-NEXT: retq +entry: + %ld = load <16 x i8>, <16 x i8>* %ptr + %ret = shufflevector <16 x i8> %ld, <16 x i8> undef, <16 x i32> + ret <16 x i8> %ret +} + +define <32 x i8> @load_splat_32i8_16i8_11111111111111111111111111111111(<16 x i8>* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: load_splat_32i8_16i8_11111111111111111111111111111111: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vmovdqa (%rdi), %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; CHECK-NEXT: retq +entry: + %ld = load <16 x i8>, <16 x i8>* %ptr + %ret = shufflevector <16 x i8> %ld, <16 x i8> undef, <32 x i32> + ret <32 x i8> %ret +} + +define <32 x i8> @load_splat_32i8_32i8_11111111111111111111111111111111(<32 x i8>* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: load_splat_32i8_32i8_11111111111111111111111111111111: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; CHECK-NEXT: retq +entry: + %ld = load <32 x i8>, <32 x i8>* %ptr + %ret = shufflevector <32 x i8> %ld, <32 x i8> undef, <32 x i32> + ret <32 x i8> %ret +} + +define <8 x i16> @load_splat_8i16_8i16_11111111(<8 x i16>* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: load_splat_8i16_8i16_11111111: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vmovdqa (%rdi), %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] +; CHECK-NEXT: retq +entry: + %ld = load <8 x i16>, <8 x i16>* %ptr + %ret = shufflevector <8 x i16> %ld, <8 x i16> undef, <8 x i32> + ret <8 x i16> %ret +} + +define <16 x i16> @load_splat_16i16_8i16_1111111111111111(<8 x i16>* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: load_splat_16i16_8i16_1111111111111111: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vmovdqa (%rdi), %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; CHECK-NEXT: retq +entry: + %ld = load <8 x i16>, <8 x i16>* %ptr + %ret = shufflevector <8 x i16> %ld, <8 x i16> undef, <16 x i32> + ret <16 x i16> %ret +} + +define <16 x i16> @load_splat_16i16_16i16_1111111111111111(<16 x i16>* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: load_splat_16i16_16i16_1111111111111111: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; CHECK-NEXT: retq +entry: + %ld = load <16 x i16>, <16 x i16>* %ptr + %ret = shufflevector <16 x i16> %ld, <16 x i16> undef, <16 x i32> + ret <16 x i16> %ret +} + +define <4 x i32> @load_splat_4i32_4i32_1111(<4 x i32>* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: load_splat_4i32_4i32_1111: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,1,1] +; CHECK-NEXT: retq +entry: + %ld = load <4 x i32>, <4 x i32>* %ptr + %ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <4 x i32> + ret <4 x i32> %ret +} + +define <8 x i32> @load_splat_8i32_4i32_33333333(<4 x i32>* %ptr) nounwind 
uwtable readnone ssp { +; CHECK-LABEL: load_splat_8i32_4i32_33333333: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vmovdqa (%rdi), %xmm0 +; CHECK-NEXT: vpbroadcastd LCPI15_0(%rip), %ymm1 +; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq +entry: + %ld = load <4 x i32>, <4 x i32>* %ptr + %ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <8 x i32> + ret <8 x i32> %ret +} + +define <8 x i32> @load_splat_8i32_8i32_55555555(<8 x i32>* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: load_splat_8i32_8i32_55555555: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpbroadcastd LCPI16_0(%rip), %ymm0 +; CHECK-NEXT: vpermd (%rdi), %ymm0, %ymm0 +; CHECK-NEXT: retq +entry: + %ld = load <8 x i32>, <8 x i32>* %ptr + %ret = shufflevector <8 x i32> %ld, <8 x i32> undef, <8 x i32> + ret <8 x i32> %ret +} + +define <4 x float> @load_splat_4f32_4f32_1111(<4 x float>* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: load_splat_4f32_4f32_1111: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,1,1,1] +; CHECK-NEXT: retq +entry: + %ld = load <4 x float>, <4 x float>* %ptr + %ret = shufflevector <4 x float> %ld, <4 x float> undef, <4 x i32> + ret <4 x float> %ret +} + +define <8 x float> @load_splat_8f32_4f32_33333333(<4 x float>* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: load_splat_8f32_4f32_33333333: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vmovaps (%rdi), %xmm0 +; CHECK-NEXT: vbroadcastss LCPI18_0(%rip), %ymm1 +; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq +entry: + %ld = load <4 x float>, <4 x float>* %ptr + %ret = shufflevector <4 x float> %ld, <4 x float> undef, <8 x i32> + ret <8 x float> %ret +} + +define <8 x float> @load_splat_8f32_8f32_55555555(<8 x float>* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: load_splat_8f32_8f32_55555555: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vbroadcastss LCPI19_0(%rip), %ymm0 +; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0 +; CHECK-NEXT: retq +entry: + %ld = load <8 x float>, <8 x float>* %ptr + %ret = shufflevector <8 x float> %ld, <8 x float> undef, <8 x i32> + ret <8 x float> %ret +} + +define <2 x i64> @load_splat_2i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: load_splat_2i64_2i64_1111: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,3,2,3] +; CHECK-NEXT: retq +entry: + %ld = load <2 x i64>, <2 x i64>* %ptr + %ret = shufflevector <2 x i64> %ld, <2 x i64> undef, <2 x i32> + ret <2 x i64> %ret +} + +define <4 x i64> @load_splat_4i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: load_splat_4i64_2i64_1111: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vmovdqa (%rdi), %xmm0 +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] +; CHECK-NEXT: retq +entry: + %ld = load <2 x i64>, <2 x i64>* %ptr + %ret = shufflevector <2 x i64> %ld, <2 x i64> undef, <4 x i32> + ret <4 x i64> %ret +} + +define <4 x i64> @load_splat_4i64_4i64_2222(<4 x i64>* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: load_splat_4i64_4i64_2222: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = mem[2,2,2,2] +; CHECK-NEXT: retq +entry: + %ld = load <4 x i64>, <4 x i64>* %ptr + %ret = shufflevector <4 x i64> %ld, <4 x i64> undef, <4 x i32> + ret <4 x i64> %ret +} + +define <2 x double> @load_splat_2f64_2f64_1111(<2 x double>* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: load_splat_2f64_2f64_1111: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vmovaps (%rdi), %xmm0 +; CHECK-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1] +; 
CHECK-NEXT: retq +entry: + %ld = load <2 x double>, <2 x double>* %ptr + %ret = shufflevector <2 x double> %ld, <2 x double> undef, <2 x i32> + ret <2 x double> %ret +} + +define <4 x double> @load_splat_4f64_2f64_1111(<2 x double>* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: load_splat_4f64_2f64_1111: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vmovapd (%rdi), %xmm0 +; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1] +; CHECK-NEXT: retq +entry: + %ld = load <2 x double>, <2 x double>* %ptr + %ret = shufflevector <2 x double> %ld, <2 x double> undef, <4 x i32> + ret <4 x double> %ret +} + +define <4 x double> @load_splat_4f64_4f64_2222(<4 x double>* %ptr) nounwind uwtable readnone ssp { +; CHECK-LABEL: load_splat_4f64_4f64_2222: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,2,2,2] +; CHECK-NEXT: retq +entry: + %ld = load <4 x double>, <4 x double>* %ptr + %ret = shufflevector <4 x double> %ld, <4 x double> undef, <4 x i32> + ret <4 x double> %ret +} + ; make sure that we still don't support broadcast double into 128-bit vector ; this used to crash define <2 x double> @I(double* %ptr) nounwind uwtable readnone ssp { @@ -242,13 +461,13 @@ define void @crash() nounwind alwaysinline { ; CHECK: ## BB#0: ## %WGLoopsEntry ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: je LBB13_1 +; CHECK-NEXT: je LBB31_1 ; CHECK-NEXT: ## BB#2: ## %ret ; CHECK-NEXT: retq ; CHECK-NEXT: .align 4, 0x90 -; CHECK-NEXT: LBB13_1: ## %footer349VF +; CHECK-NEXT: LBB31_1: ## %footer349VF ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: jmp LBB13_1 +; CHECK-NEXT: jmp LBB31_1 WGLoopsEntry: br i1 undef, label %ret, label %footer329VF From 8205637a28004f0cb652634f13a154006e187c12 Mon Sep 17 00:00:00 2001 From: Bradley Smith Date: Mon, 7 Dec 2015 10:54:36 +0000 Subject: [PATCH 163/364] [ARM] Flag vcvt{t,b} with an f16 type specifier as part of the FP16 extension Additionally correct the Cortex-R7 definition to allow the FP16 feature. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254900 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/ARM.td | 1 + lib/Target/ARM/ARMInstrVFP.td | 12 ++++++++---- test/CodeGen/ARM/build-attributes.ll | 2 +- test/MC/ARM/neon-vcvt-fp16.s | 18 ++++++++++++++++++ 4 files changed, 28 insertions(+), 5 deletions(-) create mode 100644 test/MC/ARM/neon-vcvt-fp16.s diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index dd33c3614b1a..a44dc830a673 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -585,6 +585,7 @@ def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7, FeatureVFP3, FeatureVFPOnlySP, FeatureD16, + FeatureFP16, FeatureMP, FeatureSlowFPBrcc, FeatureHWDivARM, diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td index 2aea73a6336e..050cd1a445ad 100644 --- a/lib/Target/ARM/ARMInstrVFP.td +++ b/lib/Target/ARM/ARMInstrVFP.td @@ -540,19 +540,23 @@ def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm, // FIXME: Verify encoding after integrated assembler is working. 
def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm", - [/* For disassembly only; pattern left blank */]>; + [/* For disassembly only; pattern left blank */]>, + Requires<[HasFP16]>; def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm", - [/* For disassembly only; pattern left blank */]>; + [/* For disassembly only; pattern left blank */]>, + Requires<[HasFP16]>; def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$Sd, $Sm", - [/* For disassembly only; pattern left blank */]>; + [/* For disassembly only; pattern left blank */]>, + Requires<[HasFP16]>; def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$Sd, $Sm", - [/* For disassembly only; pattern left blank */]>; + [/* For disassembly only; pattern left blank */]>, + Requires<[HasFP16]>; def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs DPR:$Dd), (ins SPR:$Sm), diff --git a/test/CodeGen/ARM/build-attributes.ll b/test/CodeGen/ARM/build-attributes.ll index b80191d76012..bf502b3ae077 100644 --- a/test/CodeGen/ARM/build-attributes.ll +++ b/test/CodeGen/ARM/build-attributes.ll @@ -1105,7 +1105,7 @@ ; CORTEX-R7: .eabi_attribute 25, 1 ; CORTEX-R7: .eabi_attribute 27, 1 ; CORTEX-R7-NOT: .eabi_attribute 28 -; CORTEX-R7-NOT: .eabi_attribute 36 +; CORTEX-R7: .eabi_attribute 36, 1 ; CORTEX-R7: .eabi_attribute 38, 1 ; CORTEX-R7: .eabi_attribute 42, 1 ; CORTEX-R7: .eabi_attribute 44, 2 diff --git a/test/MC/ARM/neon-vcvt-fp16.s b/test/MC/ARM/neon-vcvt-fp16.s new file mode 100644 index 000000000000..a23be061c0fa --- /dev/null +++ b/test/MC/ARM/neon-vcvt-fp16.s @@ -0,0 +1,18 @@ +@ RUN: llvm-mc -mcpu=cortex-r7 -triple arm -show-encoding < %s 2>&1| \ +@ RUN: FileCheck %s --check-prefix=CHECK-FP16 +@ RUN: not llvm-mc -mcpu=cortex-r5 -triple arm -show-encoding < %s 2>&1 | \ +@ RUN: FileCheck %s --check-prefix=CHECK-NOFP16 + +@ CHECK-FP16: vcvtt.f32.f16 s7, s1 @ encoding: [0xe0,0x3a,0xf2,0xee] +@ CHECK-NOFP16: instruction requires: half-float conversions + vcvtt.f32.f16 s7, s1 +@ CHECK-FP16: vcvtt.f16.f32 s1, s7 @ encoding: [0xe3,0x0a,0xf3,0xee] +@ CHECK-NOFP16: instruction requires: half-float conversions + vcvtt.f16.f32 s1, s7 + +@ CHECK-FP16: vcvtb.f32.f16 s7, s1 @ encoding: [0x60,0x3a,0xf2,0xee] +@ CHECK-NOFP16: instruction requires: half-float conversions + vcvtb.f32.f16 s7, s1 +@ CHECK-FP16: vcvtb.f16.f32 s1, s7 @ encoding: [0x63,0x0a,0xf3,0xee] +@ CHECK-NOFP16: instruction requires: half-float conversions + vcvtb.f16.f32 s1, s7 From 5c239f408dd6152787aac57f38ab16e5e1d0c94f Mon Sep 17 00:00:00 2001 From: Marina Yatsina Date: Mon, 7 Dec 2015 13:09:20 +0000 Subject: [PATCH 164/364] [X86] Adding support for FWORD type for MS inline asm Adding support for FWORD type for MS inline asm. 
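(An illustrative, hypothetical use of the new operand size, not taken from this patch: in MS-style inline assembly the FWORD qualifier names a 48-bit memory operand, which is what sidt writes on a 32-bit x86 target. The sketch assumes a compiler that accepts MS inline-asm blocks, e.g. clang with -fms-extensions -fasm-blocks, or MSVC.)

    #include <stdint.h>

    #pragma pack(push, 1)
    typedef struct {
      uint16_t Limit;   // bytes 0-1
      uint32_t Base;    // bytes 2-5 -> 48 bits total, i.e. one FWORD
    } DescriptorTableReg;
    #pragma pack(pop)

    uint32_t ReadIDTBase(void) {
      DescriptorTableReg IDTR;
      __asm {
        sidt fword ptr [IDTR]   // store the IDT register into a 48-bit memory operand
      }
      return IDTR.Base;
    }
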
Differential Revision: http://reviews.llvm.org/D15268 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254904 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/AsmParser/X86AsmParser.cpp | 1 + test/MC/X86/intel-syntax.s | 3 +++ 2 files changed, 4 insertions(+) diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index d53ab71f3d5a..f2efefd35c52 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -1049,6 +1049,7 @@ static unsigned getIntelMemOperandSize(StringRef OpStr) { .Cases("BYTE", "byte", 8) .Cases("WORD", "word", 16) .Cases("DWORD", "dword", 32) + .Cases("FWORD", "fword", 48) .Cases("QWORD", "qword", 64) .Cases("MMWORD","mmword", 64) .Cases("XWORD", "xword", 80) diff --git a/test/MC/X86/intel-syntax.s b/test/MC/X86/intel-syntax.s index 214d827168b8..001a26d07019 100644 --- a/test/MC/X86/intel-syntax.s +++ b/test/MC/X86/intel-syntax.s @@ -746,3 +746,6 @@ loopz _foo loopnz _foo // CHECK: loope _foo // CHECK: loopne _foo + +sidt fword ptr [eax] +// CHECK: sidtq (%eax) From fbb911506eb0aeca8a269d0fa80ac279d932b44f Mon Sep 17 00:00:00 2001 From: Asaf Badouh Date: Mon, 7 Dec 2015 13:14:14 +0000 Subject: [PATCH 165/364] [avx512] rename gcc intrinsics to be align with gcc format rename the gcc intrinsics suffix : _mask ->_round Differential Revision: http://reviews.llvm.org/D15285 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254905 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/IntrinsicsX86.td | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index 1c028dea601f..29377d136797 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -5205,40 +5205,40 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_add_ss_round : GCCBuiltin<"__builtin_ia32_addss_mask">, + def int_x86_avx512_mask_add_ss_round : GCCBuiltin<"__builtin_ia32_addss_round">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_div_ss_round : GCCBuiltin<"__builtin_ia32_divss_mask">, + def int_x86_avx512_mask_div_ss_round : GCCBuiltin<"__builtin_ia32_divss_round">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_mul_ss_round : GCCBuiltin<"__builtin_ia32_mulss_mask">, + def int_x86_avx512_mask_mul_ss_round : GCCBuiltin<"__builtin_ia32_mulss_round">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_sub_ss_round : GCCBuiltin<"__builtin_ia32_subss_mask">, + def int_x86_avx512_mask_sub_ss_round : GCCBuiltin<"__builtin_ia32_subss_round">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_max_ss_round : GCCBuiltin<"__builtin_ia32_maxss_mask">, + def int_x86_avx512_mask_max_ss_round : GCCBuiltin<"__builtin_ia32_maxss_round">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_min_ss_round : GCCBuiltin<"__builtin_ia32_minss_mask">, + def int_x86_avx512_mask_min_ss_round : GCCBuiltin<"__builtin_ia32_minss_round">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_add_sd_round : GCCBuiltin<"__builtin_ia32_addsd_mask">, + def int_x86_avx512_mask_add_sd_round : GCCBuiltin<"__builtin_ia32_addsd_round">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_div_sd_round : GCCBuiltin<"__builtin_ia32_divsd_mask">, + def int_x86_avx512_mask_div_sd_round : GCCBuiltin<"__builtin_ia32_divsd_round">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_mul_sd_round : GCCBuiltin<"__builtin_ia32_mulsd_mask">, + def int_x86_avx512_mask_mul_sd_round : GCCBuiltin<"__builtin_ia32_mulsd_round">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_sub_sd_round : GCCBuiltin<"__builtin_ia32_subsd_mask">, + def int_x86_avx512_mask_sub_sd_round : GCCBuiltin<"__builtin_ia32_subsd_round">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_max_sd_round : GCCBuiltin<"__builtin_ia32_maxsd_mask">, + def int_x86_avx512_mask_max_sd_round : GCCBuiltin<"__builtin_ia32_maxsd_round">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_min_sd_round : GCCBuiltin<"__builtin_ia32_minsd_mask">, + def int_x86_avx512_mask_min_sd_round : GCCBuiltin<"__builtin_ia32_minsd_round">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; From 35ecd26675997eeb0954dfb0edffb4ce85cf20f1 Mon Sep 17 00:00:00 2001 From: Rafael Espindola Date: Mon, 
7 Dec 2015 13:24:23 +0000 Subject: [PATCH 166/364] Change how the linker handles the old llvm.global_ctors. Now instead of changing it to the new format and then linking, it just handles the old format while copying it over. The main differences are: * There is no rauw in the source module. * An old format input is always upgraded. The first item helps with having a sane API that passes in a GV list to the linker. The second one is a small step in deprecating the old format. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254907 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Linker/LinkModules.cpp | 126 ++++++++++--------------------------- test/Linker/ctors5.ll | 8 +++ 2 files changed, 42 insertions(+), 92 deletions(-) create mode 100644 test/Linker/ctors5.ll diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp index 627137ba3abd..a1c3162bf796 100644 --- a/lib/Linker/LinkModules.cpp +++ b/lib/Linker/LinkModules.cpp @@ -509,9 +509,6 @@ class ModuleLinker { void computeTypeMapping(); - void upgradeMismatchedGlobalArray(StringRef Name); - void upgradeMismatchedGlobals(); - bool linkIfNeeded(GlobalValue &GV); Constant *linkAppendingVarProto(GlobalVariable *DstGV, const GlobalVariable *SrcGV); @@ -1190,83 +1187,6 @@ void ModuleLinker::computeTypeMapping() { TypeMap.linkDefinedTypeBodies(); } -static void upgradeGlobalArray(GlobalVariable *GV) { - ArrayType *ATy = cast(GV->getType()->getElementType()); - StructType *OldTy = cast(ATy->getElementType()); - assert(OldTy->getNumElements() == 2 && "Expected to upgrade from 2 elements"); - - // Get the upgraded 3 element type. - PointerType *VoidPtrTy = Type::getInt8Ty(GV->getContext())->getPointerTo(); - Type *Tys[3] = {OldTy->getElementType(0), OldTy->getElementType(1), - VoidPtrTy}; - StructType *NewTy = StructType::get(GV->getContext(), Tys, false); - - // Build new constants with a null third field filled in. - Constant *OldInitC = GV->getInitializer(); - ConstantArray *OldInit = dyn_cast(OldInitC); - if (!OldInit && !isa(OldInitC)) - // Invalid initializer; give up. - return; - std::vector Initializers; - if (OldInit && OldInit->getNumOperands()) { - Value *Null = Constant::getNullValue(VoidPtrTy); - for (Use &U : OldInit->operands()) { - ConstantStruct *Init = cast(U.get()); - Initializers.push_back(ConstantStruct::get( - NewTy, Init->getOperand(0), Init->getOperand(1), Null, nullptr)); - } - } - assert(Initializers.size() == ATy->getNumElements() && - "Failed to copy all array elements"); - - // Replace the old GV with a new one. - ATy = ArrayType::get(NewTy, Initializers.size()); - Constant *NewInit = ConstantArray::get(ATy, Initializers); - GlobalVariable *NewGV = new GlobalVariable( - *GV->getParent(), ATy, GV->isConstant(), GV->getLinkage(), NewInit, "", - GV, GV->getThreadLocalMode(), GV->getType()->getAddressSpace(), - GV->isExternallyInitialized()); - NewGV->copyAttributesFrom(GV); - NewGV->takeName(GV); - assert(GV->use_empty() && "program cannot use initializer list"); - GV->eraseFromParent(); -} - -void ModuleLinker::upgradeMismatchedGlobalArray(StringRef Name) { - // Look for the global arrays. - auto *DstGV = dyn_cast_or_null(DstM.getNamedValue(Name)); - if (!DstGV) - return; - auto *SrcGV = dyn_cast_or_null(SrcM.getNamedValue(Name)); - if (!SrcGV) - return; - - // Check if the types already match. - auto *DstTy = cast(DstGV->getType()->getElementType()); - auto *SrcTy = - cast(TypeMap.get(SrcGV->getType()->getElementType())); - if (DstTy == SrcTy) - return; - - // Grab the element types. 
We can only upgrade an array of a two-field - // struct. Only bother if the other one has three-fields. - auto *DstEltTy = cast(DstTy->getElementType()); - auto *SrcEltTy = cast(SrcTy->getElementType()); - if (DstEltTy->getNumElements() == 2 && SrcEltTy->getNumElements() == 3) { - upgradeGlobalArray(DstGV); - return; - } - if (DstEltTy->getNumElements() == 3 && SrcEltTy->getNumElements() == 2) - upgradeGlobalArray(SrcGV); - - // We can't upgrade any other differences. -} - -void ModuleLinker::upgradeMismatchedGlobals() { - upgradeMismatchedGlobalArray("llvm.global_ctors"); - upgradeMismatchedGlobalArray("llvm.global_dtors"); -} - static void getArrayElements(const Constant *C, SmallVectorImpl &Dest) { unsigned NumElements = cast(C->getType())->getNumElements(); @@ -1279,9 +1199,25 @@ static void getArrayElements(const Constant *C, /// Return true on error. Constant *ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV, const GlobalVariable *SrcGV) { - ArrayType *SrcTy = - cast(TypeMap.get(SrcGV->getType()->getElementType())); - Type *EltTy = SrcTy->getElementType(); + Type *EltTy = cast(TypeMap.get(SrcGV->getType()->getElementType())) + ->getElementType(); + + StringRef Name = SrcGV->getName(); + bool IsNewStructor = false; + bool IsOldStructor = false; + if (Name == "llvm.global_ctors" || Name == "llvm.global_dtors") { + if (cast(EltTy)->getNumElements() == 3) + IsNewStructor = true; + else + IsOldStructor = true; + } + + PointerType *VoidPtrTy = Type::getInt8Ty(SrcGV->getContext())->getPointerTo(); + if (IsOldStructor) { + auto &ST = *cast(EltTy); + Type *Tys[3] = {ST.getElementType(0), ST.getElementType(1), VoidPtrTy}; + EltTy = StructType::get(SrcGV->getContext(), Tys, false); + } if (DstGV) { ArrayType *DstTy = cast(DstGV->getType()->getElementType()); @@ -1335,10 +1271,6 @@ Constant *ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV, SmallVector SrcElements; getArrayElements(SrcGV->getInitializer(), SrcElements); - StringRef Name = SrcGV->getName(); - bool IsNewStructor = - (Name == "llvm.global_ctors" || Name == "llvm.global_dtors") && - cast(EltTy)->getNumElements() == 3; if (IsNewStructor) SrcElements.erase( std::remove_if(SrcElements.begin(), SrcElements.end(), @@ -1367,8 +1299,21 @@ Constant *ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV, ValueMap[SrcGV] = Ret; for (auto *V : SrcElements) { - DstElements.push_back( - MapValue(V, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer)); + Constant *NewV; + if (IsOldStructor) { + auto *S = cast(V); + auto *E1 = MapValue(S->getOperand(0), ValueMap, RF_MoveDistinctMDs, + &TypeMap, &ValMaterializer); + auto *E2 = MapValue(S->getOperand(1), ValueMap, RF_MoveDistinctMDs, + &TypeMap, &ValMaterializer); + Value *Null = Constant::getNullValue(VoidPtrTy); + NewV = + ConstantStruct::get(cast(EltTy), E1, E2, Null, nullptr); + } else { + NewV = + MapValue(V, ValueMap, RF_MoveDistinctMDs, &TypeMap, &ValMaterializer); + } + DstElements.push_back(NewV); } NG->setInitializer(ConstantArray::get(NewType, DstElements)); @@ -1877,9 +1822,6 @@ bool ModuleLinker::run() { ComdatsChosen[&C] = std::make_pair(SK, LinkFromSrc); } - // Upgrade mismatched global arrays. 
- upgradeMismatchedGlobals(); - for (GlobalVariable &GV : SrcM.globals()) if (const Comdat *SC = GV.getComdat()) ComdatMembers[SC].push_back(&GV); diff --git a/test/Linker/ctors5.ll b/test/Linker/ctors5.ll new file mode 100644 index 000000000000..99124061bb32 --- /dev/null +++ b/test/Linker/ctors5.ll @@ -0,0 +1,8 @@ +; RUN: llvm-link -S %s | FileCheck %s + +@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @f }] +; CHECK: @llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @f, i8* null }] + +define void @f() { + ret void +} From eea645e49f194c23d90a465967cee1561cc997ea Mon Sep 17 00:00:00 2001 From: Igor Breger Date: Mon, 7 Dec 2015 13:25:18 +0000 Subject: [PATCH 167/364] AVX-512: implement kunpck intrinsics. Differential Revision: http://reviews.llvm.org/D14821 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254908 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/IntrinsicsX86.td | 6 + lib/Target/X86/X86ISelLowering.cpp | 50 ++++++--- lib/Target/X86/X86InstrAVX512.td | 13 +-- lib/Target/X86/X86IntrinsicsInfo.h | 8 +- test/CodeGen/X86/avx512-intrinsics.ll | 141 ++++++++++++++++-------- test/CodeGen/X86/avx512bw-intrinsics.ll | 50 +++++++++ 6 files changed, 194 insertions(+), 74 deletions(-) diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index 29377d136797..c287a3a1928e 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -4026,6 +4026,12 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx512_kunpck_bw : GCCBuiltin<"__builtin_ia32_kunpckhi">, Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty], [IntrNoMem]>; + def int_x86_avx512_kunpck_wd : GCCBuiltin<"__builtin_ia32_kunpcksi">, + Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; + def int_x86_avx512_kunpck_dq : GCCBuiltin<"__builtin_ia32_kunpckdi">, + Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], + [IntrNoMem]>; def int_x86_avx512_kortestz_w : GCCBuiltin<"__builtin_ia32_kortestzhi">, Intrinsic<[llvm_i32_ty], [llvm_i16_ty, llvm_i16_ty], [IntrNoMem]>; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 364a8c260ba1..f38ca2956ff3 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -15998,19 +15998,26 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT, } if (Mask.getSimpleValueType() == MVT::i64 && Subtarget->is32Bit()) { - assert(MaskVT == MVT::v64i1 && "Unexpected mask VT!"); - assert(Subtarget->hasBWI() && "Expected AVX512BW target!"); - // In case 32bit mode, bitcast i64 is illegal, extend/split it. - SDValue Lo, Hi; - Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, - DAG.getConstant(0, dl, MVT::i32)); - Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, - DAG.getConstant(1, dl, MVT::i32)); - - Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v32i1, Lo); - Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v32i1, Hi); - - return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Hi, Lo); + if (MaskVT == MVT::v64i1) { + assert(Subtarget->hasBWI() && "Expected AVX512BW target!"); + // In case 32bit mode, bitcast i64 is illegal, extend/split it. 
+ SDValue Lo, Hi; + Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, + DAG.getConstant(0, dl, MVT::i32)); + Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, + DAG.getConstant(1, dl, MVT::i32)); + + Lo = DAG.getBitcast(MVT::v32i1, Lo); + Hi = DAG.getBitcast(MVT::v32i1, Hi); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi); + } else { + // MaskVT require < 64bit. Truncate mask (should succeed in any case), + // and bitcast. + MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits()); + return DAG.getBitcast(MaskVT, + DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask)); + } } else { MVT BitcastVT = MVT::getVectorVT(MVT::i1, @@ -16600,6 +16607,18 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1), Op.getOperand(2)); } + case KUNPCK: { + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2); + + SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl); + SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl); + // Arguments should be swapped. + SDValue Res = DAG.getNode(IntrData->Opc0, dl, + MVT::getVectorVT(MVT::i1, VT.getSizeInBits()), + Src2, Src1); + return DAG.getBitcast(VT, Res); + } default: break; } @@ -20001,8 +20020,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } } case ISD::INTRINSIC_WO_CHAIN: { - Results.push_back(LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG)); - return; + if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG)) + Results.push_back(V); + return; } case ISD::READCYCLECOUNTER: { return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 452e9f05f84a..60238f6ab23d 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -2396,16 +2396,6 @@ defm KUNPCKBW : avx512_mask_unpck<"bw", VK16, v16i1, VK8, HasAVX512>, PD; defm KUNPCKWD : avx512_mask_unpck<"wd", VK32, v32i1, VK16, HasBWI>, PS; defm KUNPCKDQ : avx512_mask_unpck<"dq", VK64, v64i1, VK32, HasBWI>, PS, VEX_W; -multiclass avx512_mask_unpck_int { - let Predicates = [HasAVX512] in - def : Pat<(!cast("int_x86_avx512_"##IntName##"_bw") - (i16 GR16:$src1), (i16 GR16:$src2)), - (COPY_TO_REGCLASS (!cast(InstName##"BWrr") - (v16i1 (COPY_TO_REGCLASS GR16:$src1, VK16)), - (v16i1 (COPY_TO_REGCLASS GR16:$src2, VK16))), GR16)>; -} -defm : avx512_mask_unpck_int<"kunpck", "KUNPCK">; - // Mask bit testing multiclass avx512_mask_testop opc, string OpcodeStr, RegisterClass KRC, SDNode OpNode, Predicate prd> { @@ -2496,6 +2486,9 @@ def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))), def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))), (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>; +def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 0))), + (v16i1 (COPY_TO_REGCLASS VK32:$src, VK16))>; + def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 0))), (v32i1 (COPY_TO_REGCLASS VK64:$src, VK32))>; diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index cc53d5f3ce5a..8f8a100cea04 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -30,7 +30,7 @@ enum IntrinsicType { COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32, EXPAND_FROM_MEM, BLEND, INSERT_SUBVEC, - TERLOG_OP_MASK, 
TERLOG_OP_MASKZ, BROADCASTM + TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK }; struct IntrinsicData { @@ -341,7 +341,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), - + X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0), + X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0), + X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_128, FMA_OP_MASK3, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_256, FMA_OP_MASK3, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_512, FMA_OP_MASK3, X86ISD::FMADD, @@ -1827,7 +1829,7 @@ static void verifyIntrinsicTables() { "Intrinsic data tables should have unique entries"); } -// X86 specific compare constants. +// X86 specific compare constants. // They must be kept in synch with avxintrin.h #define _X86_CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */ #define _X86_CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */ diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll index c01f1adce360..4a4032570e7c 100644 --- a/test/CodeGen/X86/avx512-intrinsics.ll +++ b/test/CodeGen/X86/avx512-intrinsics.ll @@ -65,9 +65,9 @@ declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone define i16 @unpckbw_test(i16 %a0, i16 %a1) { ; CHECK-LABEL: unpckbw_test: ; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %esi, %k0 -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: kunpckbw %k0, %k1, %k0 +; CHECK-NEXT: kmovw %edi, %k0 +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: kunpckbw %k1, %k0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: retq %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1) @@ -6160,76 +6160,103 @@ define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x } define i32 @test_x86_avx512_comi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) { -; CHECK-LABEL: test_x86_avx512_comi_sd_eq_sae -; CHECK: vcomisd {sae}, %xmm1, %xmm0 -; CHECK-NEXT: sete %al - %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 8) +; CHECK-LABEL: test_x86_avx512_comi_sd_eq_sae: +; CHECK: ## BB#0: +; CHECK-NEXT: vcomisd {sae}, %xmm1, %xmm0 +; CHECK-NEXT: sete %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: retq + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 8) ret i32 %res } define i32 @test_x86_avx512_ucomi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) { -; CHECK-LABEL: test_x86_avx512_ucomi_sd_eq_sae -; CHECK: vucomisd {sae}, %xmm1, %xmm0 -; CHECK-NEXT: sete %al - %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 8) +; CHECK-LABEL: test_x86_avx512_ucomi_sd_eq_sae: +; CHECK: ## BB#0: +; CHECK-NEXT: vucomisd {sae}, %xmm1, %xmm0 +; CHECK-NEXT: sete %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: retq + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 8) ret i32 %res } define i32 @test_x86_avx512_comi_sd_eq(<2 x double> %a0, <2 x double> %a1) { -; CHECK-LABEL: test_x86_avx512_comi_sd_eq -; CHECK: vcomisd %xmm1, %xmm0 -; CHECK-NEXT: sete %al - %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 4) +; CHECK-LABEL: test_x86_avx512_comi_sd_eq: +; CHECK: ## BB#0: +; 
CHECK-NEXT: vcomisd %xmm1, %xmm0 +; CHECK-NEXT: sete %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: retq + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 4) ret i32 %res } define i32 @test_x86_avx512_ucomi_sd_eq(<2 x double> %a0, <2 x double> %a1) { -; CHECK-LABEL: test_x86_avx512_ucomi_sd_eq -; CHECK: vucomisd %xmm1, %xmm0 -; CHECK-NEXT: sete %al - %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 4) +; CHECK-LABEL: test_x86_avx512_ucomi_sd_eq: +; CHECK: ## BB#0: +; CHECK-NEXT: vucomisd %xmm1, %xmm0 +; CHECK-NEXT: sete %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: retq + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 4) ret i32 %res } define i32 @test_x86_avx512_comi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) { -; CHECK-LABEL: test_x86_avx512_comi_sd_lt_sae -; CHECK: vcomisd {sae}, %xmm1, %xmm0 -; CHECK-NEXT: sbbl %eax, %eax - %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 8) +; CHECK-LABEL: test_x86_avx512_comi_sd_lt_sae: +; CHECK: ## BB#0: +; CHECK-NEXT: vcomisd {sae}, %xmm1, %xmm0 +; CHECK-NEXT: sbbl %eax, %eax +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: retq + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 8) ret i32 %res } define i32 @test_x86_avx512_ucomi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) { -; CHECK-LABEL: test_x86_avx512_ucomi_sd_lt_sae -; CHECK: vucomisd {sae}, %xmm1, %xmm0 -; CHECK-NEXT: sbbl %eax, %eax - %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 8) +; CHECK-LABEL: test_x86_avx512_ucomi_sd_lt_sae: +; CHECK: ## BB#0: +; CHECK-NEXT: vucomisd {sae}, %xmm1, %xmm0 +; CHECK-NEXT: sbbl %eax, %eax +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: retq + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 8) ret i32 %res } define i32 @test_x86_avx512_comi_sd_lt(<2 x double> %a0, <2 x double> %a1) { -; CHECK-LABEL: test_x86_avx512_comi_sd_lt -; CHECK: vcomisd %xmm1, %xmm0 -; CHECK-NEXT: sbbl %eax, %eax - %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 4) +; CHECK-LABEL: test_x86_avx512_comi_sd_lt: +; CHECK: ## BB#0: +; CHECK-NEXT: vcomisd %xmm1, %xmm0 +; CHECK-NEXT: sbbl %eax, %eax +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: retq + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 4) ret i32 %res } define i32 @test_x86_avx512_ucomi_sd_lt(<2 x double> %a0, <2 x double> %a1) { -; CHECK-LABEL: test_x86_avx512_ucomi_sd_lt -; CHECK: vucomisd %xmm1, %xmm0 -; CHECK-NEXT: sbbl %eax, %eax - %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 4) +; CHECK-LABEL: test_x86_avx512_ucomi_sd_lt: +; CHECK: ## BB#0: +; CHECK-NEXT: vucomisd %xmm1, %xmm0 +; CHECK-NEXT: sbbl %eax, %eax +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: retq + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 4) ret i32 %res } -declare i32 @llvm.x86.avx512.vcomi.sd(<2 x double>, <2 x double>, i32, i32) +declare i32 @llvm.x86.avx512.vcomi.sd(<2 x double>, <2 x double>, i32, i32) define i32 @test_x86_avx512_ucomi_ss_lt(<4 x float> %a0, <4 x float> %a1) { -; CHECK-LABEL: test_x86_avx512_ucomi_ss_lt -; CHECK: vucomiss %xmm1, %xmm0 -; CHECK-NEXT: sbbl %eax, %eax - %res = call i32 @llvm.x86.avx512.vcomi.ss(<4 x float> %a0, <4 x float> %a1, i32 9, i32 4) +; CHECK-LABEL: 
test_x86_avx512_ucomi_ss_lt: +; CHECK: ## BB#0: +; CHECK-NEXT: vucomiss %xmm1, %xmm0 +; CHECK-NEXT: sbbl %eax, %eax +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: retq + %res = call i32 @llvm.x86.avx512.vcomi.ss(<4 x float> %a0, <4 x float> %a1, i32 9, i32 4) ret i32 %res } @@ -6238,21 +6265,32 @@ declare <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float>, <4 x float>, <4 x define <4 x float>@test_int_x86_avx512_mask_move_ss_rrk(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_move_ss_rrk: -; CHECK: vmovss %xmm1, %xmm0, %xmm2 {%k1} +; CHECK: ## BB#0: +; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) ret <4 x float> %res } define <4 x float>@test_int_x86_avx512_mask_move_ss_rrkz(<4 x float> %x0, <4 x float> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_move_ss_rrkz: -; CHECK: vmovss %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK: ## BB#0: +; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> zeroinitializer, i8 %x2) ret <4 x float> %res } define <4 x float>@test_int_x86_avx512_mask_move_ss_rr(<4 x float> %x0, <4 x float> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_move_ss_rr: -; CHECK: vmovss %xmm1, %xmm0, %xmm0 +; CHECK: ## BB#0: +; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> zeroinitializer, i8 -1) ret <4 x float> %res } @@ -6260,21 +6298,32 @@ define <4 x float>@test_int_x86_avx512_mask_move_ss_rr(<4 x float> %x0, <4 x flo declare <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double>, <2 x double>, <2 x double>, i8) define <2 x double>@test_int_x86_avx512_mask_move_sd_rr(<2 x double> %x0, <2 x double> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_move_sd_rr: -; CHECK: vmovsd %xmm1, %xmm0, %xmm0 +; CHECK: ## BB#0: +; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> zeroinitializer, i8 -1) ret <2 x double> %res } define <2 x double>@test_int_x86_avx512_mask_move_sd_rrkz(<2 x double> %x0, <2 x double> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_move_sd_rrkz: -; CHECK: vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK: ## BB#0: +; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> zeroinitializer, i8 %x2) ret <2 x double> %res } define <2 x double>@test_int_x86_avx512_mask_move_sd_rrk(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_move_sd_rrk: -; CHECK: vmovsd %xmm1, %xmm0, %xmm2 {%k1} +; CHECK: ## BB#0: +; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) ret <2 x double> %res } diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll 
b/test/CodeGen/X86/avx512bw-intrinsics.ll index 0eba131a67c4..c6ba0dd6eb42 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -2736,3 +2736,53 @@ define <8 x i64>@test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8> %res2 = add <8 x i64> %res, %res1 ret <8 x i64> %res2 } + +declare i32 @llvm.x86.avx512.kunpck.wd(i32, i32) + +define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) { +; AVX512BW-LABEL: test_int_x86_avx512_kunpck_wd: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovd %edi, %k0 +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: kunpckwd %k1, %k0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_wd: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckwd %k1, %k0, %k0 +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: retl + %res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1) + ret i32 %res +} + +declare i64 @llvm.x86.avx512.kunpck.dq(i64, i64) + +define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) { +; AVX512BW-LABEL: test_int_x86_avx512_kunpck_qd: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: kmovq %rdi, %k0 +; AVX512BW-NEXT: kmovq %rsi, %k1 +; AVX512BW-NEXT: kunpckdq %k1, %k0, %k0 +; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_qd: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: subl $12, %esp +; AVX512F-32-NEXT: .Ltmp8: +; AVX512F-32-NEXT: .cfi_def_cfa_offset 16 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k0 +; AVX512F-32-NEXT: kmovq %k0, (%esp) +; AVX512F-32-NEXT: movl (%esp), %eax +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: addl $12, %esp +; AVX512F-32-NEXT: retl + %res = call i64 @llvm.x86.avx512.kunpck.dq(i64 %x0, i64 %x1) + ret i64 %res +} From b06ff9b1e1a0e4e9b3a135efd2803424db3a2abf Mon Sep 17 00:00:00 2001 From: Elena Demikhovsky Date: Mon, 7 Dec 2015 13:39:24 +0000 Subject: [PATCH 168/364] AVX-512: Fixed masked load / store instruction selection for KNL. Patterns were missing for KNL target for <8 x i32>, <8 x float> masked load/store. This intrinsic comes with all legal types: <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 align, <8 x i1> %mask, <8 x float> %passThru), but still requires lowering, because VMASKMOVPS, VMASKMOVDQU32 work with 512-bit vectors only. All data operands should be widened to 512-bit vector. The mask operand should be widened to v16i1 with zeroes. 
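As a minimal sketch of the case being fixed (the function and value names below are illustrative only, not taken from this patch), a target with AVX-512F but no VLX must widen a load like:

    define <8 x float> @load8f32(<8 x float>* %addr, <8 x i1> %mask) {
      ; Without VLX the <8 x float> data and the <8 x i1> mask are widened to
      ; <16 x float> and <16 x i1> (the mask padded with zeroes), a 512-bit
      ; masked load is emitted, and the low <8 x float> is extracted back out.
      %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 4,
                                                      <8 x i1> %mask,
                                                      <8 x float> zeroinitializer)
      ret <8 x float> %res
    }
    declare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
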
Differential Revision: http://reviews.llvm.org/D15265 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254909 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../SelectionDAG/LegalizeVectorOps.cpp | 5 +- lib/Target/X86/X86ISelLowering.cpp | 105 ++++++++++++++++++ lib/Target/X86/X86InstrAVX512.td | 27 ----- test/CodeGen/X86/masked_memop.ll | 46 +++++++- 4 files changed, 151 insertions(+), 32 deletions(-) diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 1fb7b160a671..8295b2a19dd2 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -244,7 +244,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { Changed = true; return LegalizeOp(ExpandStore(Op)); } - } else if (Op.getOpcode() == ISD::MSCATTER) + } else if (Op.getOpcode() == ISD::MSCATTER || Op.getOpcode() == ISD::MSTORE) HasVectorValue = true; for (SDNode::value_iterator J = Node->value_begin(), E = Node->value_end(); @@ -344,6 +344,9 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::MSCATTER: QueryType = cast(Node)->getValue().getValueType(); break; + case ISD::MSTORE: + QueryType = cast(Node)->getValue().getValueType(); + break; } switch (TLI.getOperationAction(Node->getOpcode(), QueryType)) { diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index f38ca2956ff3..fa6f5c8be88c 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1384,6 +1384,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); + } else { + setOperationAction(ISD::MLOAD, MVT::v8i32, Custom); + setOperationAction(ISD::MLOAD, MVT::v8f32, Custom); + setOperationAction(ISD::MSTORE, MVT::v8i32, Custom); + setOperationAction(ISD::MSTORE, MVT::v8f32, Custom); } setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); @@ -1459,6 +1464,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom); @@ -19685,6 +19691,47 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); } +/// Widen a vector input to a vector of NVT. The +/// input vector must have the same element type as NVT. +static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, + bool FillWithZeroes = false) { + // Check if InOp already has the right width. 
+ MVT InVT = InOp.getSimpleValueType(); + if (InVT == NVT) + return InOp; + + if (InOp.isUndef()) + return DAG.getUNDEF(NVT); + + assert(InVT.getVectorElementType() == NVT.getVectorElementType() && + "input and widen element type must match"); + + unsigned InNumElts = InVT.getVectorNumElements(); + unsigned WidenNumElts = NVT.getVectorNumElements(); + assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 && + "Unexpected request for vector widening"); + + EVT EltVT = NVT.getVectorElementType(); + + SDLoc dl(InOp); + if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) || + ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) { + SmallVector Ops; + for (unsigned i = 0; i < InNumElts; ++i) + Ops.push_back(InOp.getOperand(i)); + + SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : + DAG.getUNDEF(EltVT); + for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i) + Ops.push_back(FillVal); + return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops); + } + SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) : + DAG.getUNDEF(NVT); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, + InOp, DAG.getIntPtrConstant(0, dl)); +} + static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(Subtarget->hasAVX512() && @@ -19714,6 +19761,62 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget, return Op; } +static SDValue LowerMLOAD(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + + MaskedLoadSDNode *N = cast(Op.getNode()); + MVT VT = Op.getSimpleValueType(); + SDValue Mask = N->getMask(); + SDLoc dl(Op); + + if (Subtarget->hasAVX512() && !Subtarget->hasVLX() && + !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) { + // This operation is legal for targets with VLX, but without + // VLX the vector should be widened to 512 bit + unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); + MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec); + MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); + SDValue Src0 = N->getSrc0(); + Src0 = ExtendToType(Src0, WideDataVT, DAG); + Mask = ExtendToType(Mask, WideMaskVT, DAG, true); + SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(), + N->getBasePtr(), Mask, Src0, + N->getMemoryVT(), N->getMemOperand(), + N->getExtensionType()); + + SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + NewLoad.getValue(0), + DAG.getIntPtrConstant(0, dl)); + SDValue RetOps[] = {Exract, NewLoad.getValue(1)}; + return DAG.getMergeValues(RetOps, dl); + } + return Op; +} + +static SDValue LowerMSTORE(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MaskedStoreSDNode *N = cast(Op.getNode()); + SDValue DataToStore = N->getValue(); + MVT VT = DataToStore.getSimpleValueType(); + SDValue Mask = N->getMask(); + SDLoc dl(Op); + + if (Subtarget->hasAVX512() && !Subtarget->hasVLX() && + !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) { + // This operation is legal for targets with VLX, but without + // VLX the vector should be widened to 512 bit + unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); + MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec); + MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); + DataToStore = ExtendToType(DataToStore, WideDataVT, DAG); + Mask = ExtendToType(Mask, WideMaskVT, DAG, true); + return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(), + Mask, N->getMemoryVT(), N->getMemOperand(), + 
N->isTruncatingStore()); + } + return Op; +} + static SDValue LowerMGATHER(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(Subtarget->hasAVX512() && @@ -19873,6 +19976,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::UMAX: case ISD::UMIN: return LowerMINMAX(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); + case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); + case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG); case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG); case ISD::GC_TRANSITION_START: diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 60238f6ab23d..58206c6acaa6 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -2766,22 +2766,6 @@ def: Pat<(int_x86_avx512_mask_store_pd_512 addr:$ptr, (v8f64 VR512:$src), (VMOVAPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src)>; -let Predicates = [HasAVX512, NoVLX] in { -def: Pat<(X86mstore addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src)), - (VMOVUPSZmrk addr:$ptr, - (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), - (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>; - -def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, undef)), - (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmkz - (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; - -def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src0))), - (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmk - (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src0, sub_ymm), - (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; -} - defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info, HasAVX512>, avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info, @@ -2843,17 +2827,6 @@ def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV), (v16i32 VR512:$src))), (VMOVDQU32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>; } -// NoVLX patterns -let Predicates = [HasAVX512, NoVLX] in { -def: Pat<(X86mstore addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)), - (VMOVDQU32Zmrk addr:$ptr, - (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), - (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>; - -def: Pat<(v8i32 (masked_load addr:$ptr, VK8WM:$mask, undef)), - (v8i32 (EXTRACT_SUBREG (v16i32 (VMOVDQU32Zrmkz - (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; -} // Move Int Doubleword to Packed Double Int // diff --git a/test/CodeGen/X86/masked_memop.ll b/test/CodeGen/X86/masked_memop.ll index a720054c167c..1a9cf008e869 100644 --- a/test/CodeGen/X86/masked_memop.ll +++ b/test/CodeGen/X86/masked_memop.ll @@ -139,18 +139,55 @@ define <4 x double> @test10(<4 x i32> %trigger, <4 x double>* %addr, <4 x double ret <4 x double> %res } -; AVX2-LABEL: test11 +; AVX2-LABEL: test11a ; AVX2: vmaskmovps ; AVX2: vblendvps -; SKX-LABEL: test11 -; SKX: vmovaps {{.*}}{%k1} -define <8 x float> @test11(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) { +; SKX-LABEL: test11a +; SKX: vmovaps (%rdi), %ymm1 {%k1} +; AVX512-LABEL: test11a +; AVX512: kshiftlw $8 +; AVX512: kshiftrw $8 +; AVX512: vmovups (%rdi), %zmm1 {%k1} +define <8 x float> @test11a(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) { %mask = icmp eq <8 x i32> %trigger, zeroinitializer %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 32, <8 x i1>%mask, <8 x float>%dst) ret <8 x 
float> %res } +; SKX-LABEL: test11b +; SKX: vmovdqu32 (%rdi), %ymm1 {%k1} +; AVX512-LABEL: test11b +; AVX512: kshiftlw $8 +; AVX512: kshiftrw $8 +; AVX512: vmovdqu32 (%rdi), %zmm1 {%k1} +define <8 x i32> @test11b(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %dst) { + %res = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %addr, i32 4, <8 x i1>%mask, <8 x i32>%dst) + ret <8 x i32> %res +} + +; SKX-LABEL: test11c +; SKX: vmovaps (%rdi), %ymm0 {%k1} {z} +; AVX512-LABEL: test11c +; AVX512: kshiftlw $8 +; AVX512: kshiftrw $8 +; AVX512: vmovups (%rdi), %zmm0 {%k1} {z} +define <8 x float> @test11c(<8 x i1> %mask, <8 x float>* %addr) { + %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 32, <8 x i1> %mask, <8 x float> zeroinitializer) + ret <8 x float> %res +} + +; SKX-LABEL: test11d +; SKX: vmovdqu32 (%rdi), %ymm0 {%k1} {z} +; AVX512-LABEL: test11d +; AVX512: kshiftlw $8 +; AVX512: kshiftrw $8 +; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z} +define <8 x i32> @test11d(<8 x i1> %mask, <8 x i32>* %addr) { + %res = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %addr, i32 4, <8 x i1> %mask, <8 x i32> zeroinitializer) + ret <8 x i32> %res +} + ; AVX2-LABEL: test12 ; AVX2: vpmaskmovd %ymm @@ -291,6 +328,7 @@ declare void @llvm.masked.store.v16f32(<16 x float>, <16 x float>*, i32, <16 x i declare void @llvm.masked.store.v16f32p(<16 x float>*, <16 x float>**, i32, <16 x i1>) declare <16 x float> @llvm.masked.load.v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>) declare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>) +declare <8 x i32> @llvm.masked.load.v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>) declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>) declare <2 x float> @llvm.masked.load.v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>) declare <8 x double> @llvm.masked.load.v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>) From 0284291aa161a26a07c1da135ad34596672dd79a Mon Sep 17 00:00:00 2001 From: Daniel Sanders Date: Mon, 7 Dec 2015 14:12:44 +0000 Subject: [PATCH 169/364] [mips][ias] Removed DSP/DSPr2 instructions from base architecture valid-xfail.s's. Summary: valid-xfail.s is for instructions that should be valid in the given ISA but incorrectly fail. DSP/DSPr2 instructions are correct to fail since DSP/DSPr2 is not enabled. 
Reviewers: vkalintiris Subscribers: dsanders, llvm-commits Differential Revision: http://reviews.llvm.org/D15072 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254911 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/MC/Mips/mips32r2/invalid-dsp.s | 97 ++++++++++++++++++ test/MC/Mips/mips32r2/invalid-dspr2.s | 134 ++++++++++++++++++++++++ test/MC/Mips/mips32r2/valid-xfail.s | 123 ---------------------- test/MC/Mips/mips32r3/valid-xfail.s | 123 ---------------------- test/MC/Mips/mips32r5/valid-xfail.s | 123 ---------------------- test/MC/Mips/mips64r2/valid-xfail.s | 142 ++------------------------ test/MC/Mips/mips64r3/valid-xfail.s | 139 ++----------------------- test/MC/Mips/mips64r5/valid-xfail.s | 139 ++----------------------- 8 files changed, 255 insertions(+), 765 deletions(-) create mode 100644 test/MC/Mips/mips32r2/invalid-dsp.s create mode 100644 test/MC/Mips/mips32r2/invalid-dspr2.s diff --git a/test/MC/Mips/mips32r2/invalid-dsp.s b/test/MC/Mips/mips32r2/invalid-dsp.s new file mode 100644 index 000000000000..66e5f63129ac --- /dev/null +++ b/test/MC/Mips/mips32r2/invalid-dsp.s @@ -0,0 +1,97 @@ +# Instructions that are invalid +# +# RUN: not llvm-mc %s -triple=mips-unknown-linux -show-encoding \ +# RUN: -mcpu=mips32r2 2>%t1 +# RUN: FileCheck %s < %t1 + + .set noat + absq_s.ph $8,$a0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + absq_s.w $s3,$ra # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + addq.ph $s1,$15,$at # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + addq_s.ph $s3,$s6,$s2 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + addq_s.w $a2,$8,$at # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + addsc $s8,$15,$12 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + addu.qb $s6,$v1,$v1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + addu_s.qb $s4,$s8,$s1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + addwc $k0,$s6,$s7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + bitrev $14,$at # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + cmp.eq.ph $s7,$14 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + cmp.le.ph $8,$14 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + cmp.lt.ph $k0,$sp # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + cmpgu.eq.qb $14,$s6,$s8 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + cmpgu.le.qb $9,$a3,$s4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + cmpgu.lt.qb $sp,$at,$8 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + cmpu.eq.qb $v0,$24 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + cmpu.le.qb $s1,$a1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + cmpu.lt.qb $at,$a3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + dpaq_sa.l.w 
$ac0,$a2,$14 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + dpau.h.qbl $ac1,$10,$24 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + dpau.h.qbr $ac1,$s7,$s6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + dpsq_s.w.ph $ac0,$gp,$k0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + dpsq_sa.l.w $ac0,$a3,$15 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + dpsu.h.qbl $ac2,$14,$10 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + dpsu.h.qbr $ac2,$a1,$s6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + extpdpv $s6,$ac0,$s8 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + extpv $13,$ac0,$14 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + extrv.w $8,$ac3,$at # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + extrv_r.w $8,$ac1,$s6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + extrv_rs.w $gp,$ac1,$s6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + extrv_s.h $s2,$ac1,$14 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + insv $s2,$at # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + lbux $9,$14($v0) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + lhx $sp,$k0($15) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + lwx $12,$12($s4) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + madd $ac2,$sp,$14 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + maddu $ac2,$a1,$24 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + maq_s.w.phl $ac2,$25,$11 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + maq_s.w.phr $ac0,$10,$25 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + maq_sa.w.phl $ac3,$a1,$v1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + maq_sa.w.phr $ac1,$at,$10 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + mfhi $9,$ac2 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + mflo $9,$ac2 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + modsub $a3,$12,$a3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + mthi $v0,$ac1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + mthlip $a3,$ac0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + mtlo $v0,$ac1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + muleq_s.w.phl $11,$s4,$s4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + 
muleq_s.w.phr $s6,$a0,$s8 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + muleu_s.ph.qbl $a2,$14,$8 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + muleu_s.ph.qbr $a1,$ra,$9 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + mulq_rs.ph $s2,$14,$15 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + mulsaq_s.w.ph $ac0,$ra,$s2 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + mult $ac1, $2, $3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + multu $ac1, $2, $3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + packrl.ph $ra,$24,$14 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + pick.ph $ra,$a2,$gp # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + pick.qb $11,$a0,$gp # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + precequ.ph.qbl $s7,$ra # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + precequ.ph.qbla $a0,$9 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + precequ.ph.qbr $ra,$s3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + precequ.ph.qbra $24,$8 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + preceu.ph.qbl $sp,$8 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + preceu.ph.qbla $s6,$11 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + preceu.ph.qbr $gp,$s1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + preceu.ph.qbra $k1,$s0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + precrq.ph.w $14,$s8,$24 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + precrq.qb.ph $a2,$12,$12 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + precrq_rs.ph.w $a1,$k0,$a3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + precrqu_s.qb.ph $zero,$gp,$s5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + raddu.w.qb $25,$s3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + repl.ph $at,-307 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + replv.ph $v1,$s7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + replv.qb $25,$12 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + shilo $ac1,26 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + shilov $ac2,$10 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + shllv.ph $10,$s0,$s0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + shllv.qb $gp,$v1,$zero # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction 
requires a CPU feature not currently enabled + shllv_s.ph $k1,$at,$13 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + shllv_s.w $s1,$ra,$k0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + shrav.ph $25,$s2,$s1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + shrav_r.ph $s3,$11,$25 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + shrav_r.w $s7,$s4,$s6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + shrlv.qb $a2,$s2,$11 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + subq.ph $ra,$9,$s8 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + subq_s.ph $13,$s8,$s5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + subq_s.w $k1,$a2,$a3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + subu.qb $s6,$a2,$s6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + subu_s.qb $s1,$at,$ra # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled diff --git a/test/MC/Mips/mips32r2/invalid-dspr2.s b/test/MC/Mips/mips32r2/invalid-dspr2.s new file mode 100644 index 000000000000..5c31b465ca1f --- /dev/null +++ b/test/MC/Mips/mips32r2/invalid-dspr2.s @@ -0,0 +1,134 @@ +# Instructions that are invalid +# +# RUN: not llvm-mc %s -triple=mips-unknown-linux -show-encoding \ +# RUN: -mcpu=mips32r2 2>%t1 +# RUN: FileCheck %s < %t1 + + .set noat + absq_s.ph $8,$a0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + absq_s.qb $15,$s1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + absq_s.w $s3,$ra # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + addq.ph $s1,$15,$at # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + addq_s.ph $s3,$s6,$s2 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + addq_s.w $a2,$8,$at # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + addqh.ph $s4,$14,$s1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + addqh_r.ph $sp,$25,$s8 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + addsc $s8,$15,$12 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + addu.ph $a2,$14,$s3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + addu.qb $s6,$v1,$v1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + addu_s.ph $a3,$s3,$gp # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + addu_s.qb $s4,$s8,$s1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + adduh.qb $a1,$a1,$at # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + adduh_r.qb $a0,$9,$12 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + addwc $k0,$s6,$s7 # CHECK: 
:[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + bitrev $14,$at # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + cmp.eq.ph $s7,$14 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + cmp.le.ph $8,$14 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + cmp.lt.ph $k0,$sp # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + cmpgdu.eq.qb $s3,$zero,$k0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + cmpgdu.le.qb $v1,$15,$s2 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + cmpgdu.lt.qb $s0,$gp,$sp # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + cmpgu.eq.qb $14,$s6,$s8 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + cmpgu.le.qb $9,$a3,$s4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + cmpgu.lt.qb $sp,$at,$8 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + cmpu.eq.qb $v0,$24 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + cmpu.le.qb $s1,$a1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + cmpu.lt.qb $at,$a3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + dpa.w.ph $ac1,$s7,$k0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + dpaq_sa.l.w $ac0,$a2,$14 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + dpaqx_s.w.ph $ac3,$a0,$24 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + dpaqx_sa.w.ph $ac1,$zero,$s5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + dpau.h.qbl $ac1,$10,$24 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + dpau.h.qbr $ac1,$s7,$s6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + dpax.w.ph $ac3,$a0,$k0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + dps.w.ph $ac1,$a3,$a1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + dpsq_s.w.ph $ac0,$gp,$k0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + dpsq_sa.l.w $ac0,$a3,$15 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + dpsqx_s.w.ph $ac3,$13,$a3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + dpsqx_sa.w.ph $ac3,$sp,$s2 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + dpsu.h.qbl $ac2,$14,$10 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + dpsu.h.qbr $ac2,$a1,$s6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + dpsx.w.ph $ac0,$s7,$gp # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + extpdpv $s6,$ac0,$s8 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires 
a CPU feature not currently enabled + extpv $13,$ac0,$14 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + extrv.w $8,$ac3,$at # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + extrv_r.w $8,$ac1,$s6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + extrv_rs.w $gp,$ac1,$s6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + extrv_s.h $s2,$ac1,$14 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + insv $s2,$at # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + lbux $9,$14($v0) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + lhx $sp,$k0($15) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + lwx $12,$12($s4) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + madd $ac2,$sp,$14 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + maddu $ac2,$a1,$24 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + maq_s.w.phl $ac2,$25,$11 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + maq_s.w.phr $ac0,$10,$25 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + maq_sa.w.phl $ac3,$a1,$v1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + maq_sa.w.phr $ac1,$at,$10 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + mfhi $9,$ac2 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + mflo $9,$ac2 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + modsub $a3,$12,$a3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + mthi $v0,$ac1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + mthlip $a3,$ac0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + mtlo $v0,$ac1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + mul.ph $10,$14,$15 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + mul.ph $s4,$24,$s0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + mul_s.ph $10,$14,$15 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + muleq_s.w.phl $11,$s4,$s4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + muleq_s.w.phr $s6,$a0,$s8 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + muleu_s.ph.qbl $a2,$14,$8 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + muleu_s.ph.qbr $a1,$ra,$9 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + mulq_rs.ph $s2,$14,$15 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + mulq_rs.w $at,$s4,$25 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU 
feature not currently enabled + mulq_s.ph $s0,$k1,$15 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + mulq_s.w $9,$a3,$s0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + mulsa.w.ph $ac1,$s4,$s6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + mulsaq_s.w.ph $ac0,$ra,$s2 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + mult $ac1, $2, $3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + multu $ac1, $2, $3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + packrl.ph $ra,$24,$14 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + pick.ph $ra,$a2,$gp # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + pick.qb $11,$a0,$gp # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + precequ.ph.qbl $s7,$ra # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + precequ.ph.qbla $a0,$9 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + precequ.ph.qbr $ra,$s3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + precequ.ph.qbra $24,$8 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + preceu.ph.qbl $sp,$8 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + preceu.ph.qbla $s6,$11 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + preceu.ph.qbr $gp,$s1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + preceu.ph.qbra $k1,$s0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + precr.qb.ph $v0,$12,$s8 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + precrq.ph.w $14,$s8,$24 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + precrq.qb.ph $a2,$12,$12 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + precrq_rs.ph.w $a1,$k0,$a3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + precrqu_s.qb.ph $zero,$gp,$s5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + raddu.w.qb $25,$s3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + repl.ph $at,-307 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + replv.ph $v1,$s7 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + replv.qb $25,$12 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + shilo $ac1,26 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + shilov $ac2,$10 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + shllv.ph $10,$s0,$s0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + shllv.qb $gp,$v1,$zero # CHECK: :[[@LINE]]:{{[0-9]+}}: 
error: instruction requires a CPU feature not currently enabled + shllv_s.ph $k1,$at,$13 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + shllv_s.w $s1,$ra,$k0 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + shrav.ph $25,$s2,$s1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + shrav.qb $zero,$24,$11 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + shrav_r.ph $s3,$11,$25 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + shrav_r.qb $a0,$sp,$s5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + shrav_r.w $s7,$s4,$s6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + shrlv.ph $14,$10,$9 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + shrlv.qb $a2,$s2,$11 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + subq.ph $ra,$9,$s8 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + subq_s.ph $13,$s8,$s5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + subq_s.w $k1,$a2,$a3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + subqh.ph $10,$at,$9 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + subqh.w $v0,$a2,$zero # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + subqh_r.ph $a0,$12,$s6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + subqh_r.w $10,$a2,$gp # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + subu.ph $9,$s6,$s4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + subu.qb $s6,$a2,$s6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + subu_s.ph $v1,$a1,$s3 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + subu_s.qb $s1,$at,$ra # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + subuh.qb $zero,$gp,$gp # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled + subuh_r.qb $s4,$s8,$s6 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled diff --git a/test/MC/Mips/mips32r2/valid-xfail.s b/test/MC/Mips/mips32r2/valid-xfail.s index 658f172aec3d..5a61eb6cbfb7 100644 --- a/test/MC/Mips/mips32r2/valid-xfail.s +++ b/test/MC/Mips/mips32r2/valid-xfail.s @@ -8,27 +8,10 @@ .set noat abs.ps $f22,$f8 - absq_s.ph $8,$a0 - absq_s.qb $15,$s1 - absq_s.w $s3,$ra add.ps $f25,$f27,$f13 - addq.ph $s1,$15,$at - addq_s.ph $s3,$s6,$s2 - addq_s.w $a2,$8,$at - addqh.ph $s4,$14,$s1 addqh.w $s7,$s7,$k1 - addqh_r.ph $sp,$25,$s8 addqh_r.w $8,$v1,$zero - addsc $s8,$15,$12 - addu.ph $a2,$14,$s3 - addu.qb $s6,$v1,$v1 - addu_s.ph $a3,$s3,$gp - addu_s.qb $s4,$s8,$s1 - adduh.qb $a1,$a1,$at - adduh_r.qb $a0,$9,$12 - addwc $k0,$s6,$s7 alnv.ps $f12,$f18,$f30,$12 - bitrev $14,$at c.eq.d $fcc1,$f15,$f15 c.eq.ps $fcc5,$f0,$f9 c.eq.s $fcc5,$f24,$f17 @@ -76,18 +59,6 @@ ceil.l.d $f1,$f3 ceil.l.s $f18,$f13 cfcmsa $s6,$19 - 
cmp.eq.ph $s7,$14 - cmp.le.ph $8,$14 - cmp.lt.ph $k0,$sp - cmpgdu.eq.qb $s3,$zero,$k0 - cmpgdu.le.qb $v1,$15,$s2 - cmpgdu.lt.qb $s0,$gp,$sp - cmpgu.eq.qb $14,$s6,$s8 - cmpgu.le.qb $9,$a3,$s4 - cmpgu.lt.qb $sp,$at,$8 - cmpu.eq.qb $v0,$24 - cmpu.le.qb $s1,$a1 - cmpu.lt.qb $at,$a3 ctcmsa $31,$s7 cvt.d.l $f4,$f16 cvt.ps.s $f3,$f18,$f19 @@ -95,112 +66,44 @@ cvt.s.pl $f30,$f1 cvt.s.pu $f14,$f25 dmt $k0 - dpa.w.ph $ac1,$s7,$k0 - dpaq_s.w.ph $ac2,$a0,$13 - dpaq_sa.l.w $ac0,$a2,$14 - dpaqx_s.w.ph $ac3,$a0,$24 - dpaqx_sa.w.ph $ac1,$zero,$s5 - dpau.h.qbl $ac1,$10,$24 - dpau.h.qbr $ac1,$s7,$s6 - dpax.w.ph $ac3,$a0,$k0 - dps.w.ph $ac1,$a3,$a1 - dpsq_s.w.ph $ac0,$gp,$k0 - dpsq_sa.l.w $ac0,$a3,$15 - dpsqx_s.w.ph $ac3,$13,$a3 - dpsqx_sa.w.ph $ac3,$sp,$s2 - dpsu.h.qbl $ac2,$14,$10 - dpsu.h.qbr $ac2,$a1,$s6 - dpsx.w.ph $ac0,$s7,$gp dvpe $s6 emt $8 evpe $v0 - extpdpv $s6,$ac0,$s8 - extpv $13,$ac0,$14 - extrv.w $8,$ac3,$at - extrv_r.w $8,$ac1,$s6 - extrv_rs.w $gp,$ac1,$s6 - extrv_s.h $s2,$ac1,$14 floor.l.d $f26,$f7 floor.l.s $f12,$f5 fork $s2,$8,$a0 - insv $s2,$at iret lbe $14,122($9) lbue $11,-108($10) - lbux $9,$14($v0) lhe $s6,219($v1) lhue $gp,118($11) - lhx $sp,$k0($15) lle $gp,-237($ra) lwe $ra,-145($14) lwle $11,-42($11) lwre $sp,-152($24) - lwx $12,$12($s4) madd.ps $f22,$f3,$f14,$f3 - maq_s.w.phl $ac2,$25,$11 - maq_s.w.phr $ac0,$10,$25 - maq_sa.w.phl $ac3,$a1,$v1 - maq_sa.w.phr $ac1,$at,$10 mfgc0 $s6,c0_datahi1 - mflo $9,$ac2 - modsub $a3,$12,$a3 mov.ps $f22,$f17 movf.ps $f10,$f28,$fcc6 movn.ps $f31,$f31,$s3 movt.ps $f20,$f25,$fcc2 movz.ps $f18,$f17,$ra - msub $ac2,$sp,$14 msub.ps $f12,$f14,$f29,$f17 - msubu $ac2,$a1,$24 mtc0 $9,c0_datahi1 mtgc0 $s4,$21,7 - mthi $v0,$ac1 - mthlip $a3,$ac0 - mul.ph $s4,$24,$s0 mul.ps $f14,$f0,$f16 - mul_s.ph $10,$14,$15 - muleq_s.w.phl $11,$s4,$s4 - muleq_s.w.phr $s6,$a0,$s8 - muleu_s.ph.qbl $a2,$14,$8 - muleu_s.ph.qbr $a1,$ra,$9 - mulq_rs.ph $s2,$14,$15 - mulq_rs.w $at,$s4,$25 - mulq_s.ph $s0,$k1,$15 - mulq_s.w $9,$a3,$s0 - mulsa.w.ph $ac1,$s4,$s6 - mulsaq_s.w.ph $ac0,$ra,$s2 neg.ps $f19,$f13 nmadd.ps $f27,$f4,$f9,$f25 nmsub.ps $f6,$f12,$f14,$f17 - packrl.ph $ra,$24,$14 - pick.ph $ra,$a2,$gp - pick.qb $11,$a0,$gp pll.ps $f25,$f9,$f30 plu.ps $f1,$f26,$f29 preceq.w.phl $s8,$gp preceq.w.phr $s5,$15 - precequ.ph.qbl $s7,$ra - precequ.ph.qbla $a0,$9 - precequ.ph.qbr $ra,$s3 - precequ.ph.qbra $24,$8 - preceu.ph.qbl $sp,$8 - preceu.ph.qbla $s6,$11 - preceu.ph.qbr $gp,$s1 - preceu.ph.qbra $k1,$s0 - precr.qb.ph $v0,$12,$s8 - precrq.ph.w $14,$s8,$24 - precrq.qb.ph $a2,$12,$12 - precrq_rs.ph.w $a1,$k0,$a3 - precrqu_s.qb.ph $zero,$gp,$s5 pul.ps $f9,$f30,$f26 puu.ps $f24,$f9,$f2 - raddu.w.qb $25,$s3 rdpgpr $s3,$9 recip.d $f19,$f6 recip.s $f3,$f30 - repl.ph $at,-307 - replv.ph $v1,$s7 - replv.qb $25,$12 rorv $13,$a3,$s5 round.l.d $f12,$f1 round.l.s $f25,$f5 @@ -209,33 +112,7 @@ sbe $s7,33($s1) sce $sp,189($10) she $24,105($v0) - shilo $ac1,26 - shilov $ac2,$10 - shllv.ph $10,$s0,$s0 - shllv.qb $gp,$v1,$zero - shllv_s.ph $k1,$at,$13 - shllv_s.w $s1,$ra,$k0 - shrav.ph $25,$s2,$s1 - shrav.qb $zero,$24,$11 - shrav_r.ph $s3,$11,$25 - shrav_r.qb $a0,$sp,$s5 - shrav_r.w $s7,$s4,$s6 - shrlv.ph $14,$10,$9 - shrlv.qb $a2,$s2,$11 sub.ps $f5,$f14,$f26 - subq.ph $ra,$9,$s8 - subq_s.ph $13,$s8,$s5 - subq_s.w $k1,$a2,$a3 - subqh.ph $10,$at,$9 - subqh.w $v0,$a2,$zero - subqh_r.ph $a0,$12,$s6 - subqh_r.w $10,$a2,$gp - subu.ph $9,$s6,$s4 - subu.qb $s6,$a2,$s6 - subu_s.ph $v1,$a1,$s3 - subu_s.qb $s1,$at,$ra - subuh.qb $zero,$gp,$gp - subuh_r.qb $s4,$s8,$s6 swe $24,94($k0) swle $v1,-209($gp) swre 
$k0,-202($s2) diff --git a/test/MC/Mips/mips32r3/valid-xfail.s b/test/MC/Mips/mips32r3/valid-xfail.s index 09e19e8bb3b6..defa388c4df2 100644 --- a/test/MC/Mips/mips32r3/valid-xfail.s +++ b/test/MC/Mips/mips32r3/valid-xfail.s @@ -8,27 +8,10 @@ .set noat abs.ps $f22,$f8 - absq_s.ph $8,$a0 - absq_s.qb $15,$s1 - absq_s.w $s3,$ra add.ps $f25,$f27,$f13 - addq.ph $s1,$15,$at - addq_s.ph $s3,$s6,$s2 - addq_s.w $a2,$8,$at - addqh.ph $s4,$14,$s1 addqh.w $s7,$s7,$k1 - addqh_r.ph $sp,$25,$s8 addqh_r.w $8,$v1,$zero - addsc $s8,$15,$12 - addu.ph $a2,$14,$s3 - addu.qb $s6,$v1,$v1 - addu_s.ph $a3,$s3,$gp - addu_s.qb $s4,$s8,$s1 - adduh.qb $a1,$a1,$at - adduh_r.qb $a0,$9,$12 - addwc $k0,$s6,$s7 alnv.ps $f12,$f18,$f30,$12 - bitrev $14,$at c.eq.d $fcc1,$f15,$f15 c.eq.ps $fcc5,$f0,$f9 c.eq.s $fcc5,$f24,$f17 @@ -76,18 +59,6 @@ ceil.l.d $f1,$f3 ceil.l.s $f18,$f13 cfcmsa $s6,$19 - cmp.eq.ph $s7,$14 - cmp.le.ph $8,$14 - cmp.lt.ph $k0,$sp - cmpgdu.eq.qb $s3,$zero,$k0 - cmpgdu.le.qb $v1,$15,$s2 - cmpgdu.lt.qb $s0,$gp,$sp - cmpgu.eq.qb $14,$s6,$s8 - cmpgu.le.qb $9,$a3,$s4 - cmpgu.lt.qb $sp,$at,$8 - cmpu.eq.qb $v0,$24 - cmpu.le.qb $s1,$a1 - cmpu.lt.qb $at,$a3 ctcmsa $31,$s7 cvt.d.l $f4,$f16 cvt.ps.s $f3,$f18,$f19 @@ -95,112 +66,44 @@ cvt.s.pl $f30,$f1 cvt.s.pu $f14,$f25 dmt $k0 - dpa.w.ph $ac1,$s7,$k0 - dpaq_s.w.ph $ac2,$a0,$13 - dpaq_sa.l.w $ac0,$a2,$14 - dpaqx_s.w.ph $ac3,$a0,$24 - dpaqx_sa.w.ph $ac1,$zero,$s5 - dpau.h.qbl $ac1,$10,$24 - dpau.h.qbr $ac1,$s7,$s6 - dpax.w.ph $ac3,$a0,$k0 - dps.w.ph $ac1,$a3,$a1 - dpsq_s.w.ph $ac0,$gp,$k0 - dpsq_sa.l.w $ac0,$a3,$15 - dpsqx_s.w.ph $ac3,$13,$a3 - dpsqx_sa.w.ph $ac3,$sp,$s2 - dpsu.h.qbl $ac2,$14,$10 - dpsu.h.qbr $ac2,$a1,$s6 - dpsx.w.ph $ac0,$s7,$gp dvpe $s6 emt $8 evpe $v0 - extpdpv $s6,$ac0,$s8 - extpv $13,$ac0,$14 - extrv.w $8,$ac3,$at - extrv_r.w $8,$ac1,$s6 - extrv_rs.w $gp,$ac1,$s6 - extrv_s.h $s2,$ac1,$14 floor.l.d $f26,$f7 floor.l.s $f12,$f5 fork $s2,$8,$a0 - insv $s2,$at iret lbe $14,122($9) lbue $11,-108($10) - lbux $9,$14($v0) lhe $s6,219($v1) lhue $gp,118($11) - lhx $sp,$k0($15) lle $gp,-237($ra) lwe $ra,-145($14) lwle $11,-42($11) lwre $sp,-152($24) - lwx $12,$12($s4) madd.ps $f22,$f3,$f14,$f3 - maq_s.w.phl $ac2,$25,$11 - maq_s.w.phr $ac0,$10,$25 - maq_sa.w.phl $ac3,$a1,$v1 - maq_sa.w.phr $ac1,$at,$10 mfgc0 $s6,c0_datahi1 - mflo $9,$ac2 - modsub $a3,$12,$a3 mov.ps $f22,$f17 movf.ps $f10,$f28,$fcc6 movn.ps $f31,$f31,$s3 movt.ps $f20,$f25,$fcc2 movz.ps $f18,$f17,$ra - msub $ac2,$sp,$14 msub.ps $f12,$f14,$f29,$f17 - msubu $ac2,$a1,$24 mtc0 $9,c0_datahi1 mtgc0 $s4,$21,7 - mthi $v0,$ac1 - mthlip $a3,$ac0 - mul.ph $s4,$24,$s0 mul.ps $f14,$f0,$f16 - mul_s.ph $10,$14,$15 - muleq_s.w.phl $11,$s4,$s4 - muleq_s.w.phr $s6,$a0,$s8 - muleu_s.ph.qbl $a2,$14,$8 - muleu_s.ph.qbr $a1,$ra,$9 - mulq_rs.ph $s2,$14,$15 - mulq_rs.w $at,$s4,$25 - mulq_s.ph $s0,$k1,$15 - mulq_s.w $9,$a3,$s0 - mulsa.w.ph $ac1,$s4,$s6 - mulsaq_s.w.ph $ac0,$ra,$s2 neg.ps $f19,$f13 nmadd.ps $f27,$f4,$f9,$f25 nmsub.ps $f6,$f12,$f14,$f17 - packrl.ph $ra,$24,$14 - pick.ph $ra,$a2,$gp - pick.qb $11,$a0,$gp pll.ps $f25,$f9,$f30 plu.ps $f1,$f26,$f29 preceq.w.phl $s8,$gp preceq.w.phr $s5,$15 - precequ.ph.qbl $s7,$ra - precequ.ph.qbla $a0,$9 - precequ.ph.qbr $ra,$s3 - precequ.ph.qbra $24,$8 - preceu.ph.qbl $sp,$8 - preceu.ph.qbla $s6,$11 - preceu.ph.qbr $gp,$s1 - preceu.ph.qbra $k1,$s0 - precr.qb.ph $v0,$12,$s8 - precrq.ph.w $14,$s8,$24 - precrq.qb.ph $a2,$12,$12 - precrq_rs.ph.w $a1,$k0,$a3 - precrqu_s.qb.ph $zero,$gp,$s5 pul.ps $f9,$f30,$f26 puu.ps $f24,$f9,$f2 - raddu.w.qb $25,$s3 rdpgpr $s3,$9 recip.d 
$f19,$f6 recip.s $f3,$f30 - repl.ph $at,-307 - replv.ph $v1,$s7 - replv.qb $25,$12 rorv $13,$a3,$s5 round.l.d $f12,$f1 round.l.s $f25,$f5 @@ -209,33 +112,7 @@ sbe $s7,33($s1) sce $sp,189($10) she $24,105($v0) - shilo $ac1,26 - shilov $ac2,$10 - shllv.ph $10,$s0,$s0 - shllv.qb $gp,$v1,$zero - shllv_s.ph $k1,$at,$13 - shllv_s.w $s1,$ra,$k0 - shrav.ph $25,$s2,$s1 - shrav.qb $zero,$24,$11 - shrav_r.ph $s3,$11,$25 - shrav_r.qb $a0,$sp,$s5 - shrav_r.w $s7,$s4,$s6 - shrlv.ph $14,$10,$9 - shrlv.qb $a2,$s2,$11 sub.ps $f5,$f14,$f26 - subq.ph $ra,$9,$s8 - subq_s.ph $13,$s8,$s5 - subq_s.w $k1,$a2,$a3 - subqh.ph $10,$at,$9 - subqh.w $v0,$a2,$zero - subqh_r.ph $a0,$12,$s6 - subqh_r.w $10,$a2,$gp - subu.ph $9,$s6,$s4 - subu.qb $s6,$a2,$s6 - subu_s.ph $v1,$a1,$s3 - subu_s.qb $s1,$at,$ra - subuh.qb $zero,$gp,$gp - subuh_r.qb $s4,$s8,$s6 swe $24,94($k0) swle $v1,-209($gp) swre $k0,-202($s2) diff --git a/test/MC/Mips/mips32r5/valid-xfail.s b/test/MC/Mips/mips32r5/valid-xfail.s index 30fc4b98e056..c1bf7a4b3a29 100644 --- a/test/MC/Mips/mips32r5/valid-xfail.s +++ b/test/MC/Mips/mips32r5/valid-xfail.s @@ -8,27 +8,10 @@ .set noat abs.ps $f22,$f8 - absq_s.ph $8,$a0 - absq_s.qb $15,$s1 - absq_s.w $s3,$ra add.ps $f25,$f27,$f13 - addq.ph $s1,$15,$at - addq_s.ph $s3,$s6,$s2 - addq_s.w $a2,$8,$at - addqh.ph $s4,$14,$s1 addqh.w $s7,$s7,$k1 - addqh_r.ph $sp,$25,$s8 addqh_r.w $8,$v1,$zero - addsc $s8,$15,$12 - addu.ph $a2,$14,$s3 - addu.qb $s6,$v1,$v1 - addu_s.ph $a3,$s3,$gp - addu_s.qb $s4,$s8,$s1 - adduh.qb $a1,$a1,$at - adduh_r.qb $a0,$9,$12 - addwc $k0,$s6,$s7 alnv.ps $f12,$f18,$f30,$12 - bitrev $14,$at c.eq.d $fcc1,$f15,$f15 c.eq.ps $fcc5,$f0,$f9 c.eq.s $fcc5,$f24,$f17 @@ -76,18 +59,6 @@ ceil.l.d $f1,$f3 ceil.l.s $f18,$f13 cfcmsa $s6,$19 - cmp.eq.ph $s7,$14 - cmp.le.ph $8,$14 - cmp.lt.ph $k0,$sp - cmpgdu.eq.qb $s3,$zero,$k0 - cmpgdu.le.qb $v1,$15,$s2 - cmpgdu.lt.qb $s0,$gp,$sp - cmpgu.eq.qb $14,$s6,$s8 - cmpgu.le.qb $9,$a3,$s4 - cmpgu.lt.qb $sp,$at,$8 - cmpu.eq.qb $v0,$24 - cmpu.le.qb $s1,$a1 - cmpu.lt.qb $at,$a3 ctcmsa $31,$s7 cvt.d.l $f4,$f16 cvt.ps.s $f3,$f18,$f19 @@ -95,112 +66,44 @@ cvt.s.pl $f30,$f1 cvt.s.pu $f14,$f25 dmt $k0 - dpa.w.ph $ac1,$s7,$k0 - dpaq_s.w.ph $ac2,$a0,$13 - dpaq_sa.l.w $ac0,$a2,$14 - dpaqx_s.w.ph $ac3,$a0,$24 - dpaqx_sa.w.ph $ac1,$zero,$s5 - dpau.h.qbl $ac1,$10,$24 - dpau.h.qbr $ac1,$s7,$s6 - dpax.w.ph $ac3,$a0,$k0 - dps.w.ph $ac1,$a3,$a1 - dpsq_s.w.ph $ac0,$gp,$k0 - dpsq_sa.l.w $ac0,$a3,$15 - dpsqx_s.w.ph $ac3,$13,$a3 - dpsqx_sa.w.ph $ac3,$sp,$s2 - dpsu.h.qbl $ac2,$14,$10 - dpsu.h.qbr $ac2,$a1,$s6 - dpsx.w.ph $ac0,$s7,$gp dvpe $s6 emt $8 evpe $v0 - extpdpv $s6,$ac0,$s8 - extpv $13,$ac0,$14 - extrv.w $8,$ac3,$at - extrv_r.w $8,$ac1,$s6 - extrv_rs.w $gp,$ac1,$s6 - extrv_s.h $s2,$ac1,$14 floor.l.d $f26,$f7 floor.l.s $f12,$f5 fork $s2,$8,$a0 - insv $s2,$at iret lbe $14,122($9) lbue $11,-108($10) - lbux $9,$14($v0) lhe $s6,219($v1) lhue $gp,118($11) - lhx $sp,$k0($15) lle $gp,-237($ra) lwe $ra,-145($14) lwle $11,-42($11) lwre $sp,-152($24) - lwx $12,$12($s4) madd.ps $f22,$f3,$f14,$f3 - maq_s.w.phl $ac2,$25,$11 - maq_s.w.phr $ac0,$10,$25 - maq_sa.w.phl $ac3,$a1,$v1 - maq_sa.w.phr $ac1,$at,$10 mfgc0 $s6,c0_datahi1 - mflo $9,$ac2 - modsub $a3,$12,$a3 mov.ps $f22,$f17 movf.ps $f10,$f28,$fcc6 movn.ps $f31,$f31,$s3 movt.ps $f20,$f25,$fcc2 movz.ps $f18,$f17,$ra - msub $ac2,$sp,$14 msub.ps $f12,$f14,$f29,$f17 - msubu $ac2,$a1,$24 mtc0 $9,c0_datahi1 mtgc0 $s4,$21,7 - mthi $v0,$ac1 - mthlip $a3,$ac0 - mul.ph $s4,$24,$s0 mul.ps $f14,$f0,$f16 - mul_s.ph $10,$14,$15 - muleq_s.w.phl $11,$s4,$s4 - 
muleq_s.w.phr $s6,$a0,$s8 - muleu_s.ph.qbl $a2,$14,$8 - muleu_s.ph.qbr $a1,$ra,$9 - mulq_rs.ph $s2,$14,$15 - mulq_rs.w $at,$s4,$25 - mulq_s.ph $s0,$k1,$15 - mulq_s.w $9,$a3,$s0 - mulsa.w.ph $ac1,$s4,$s6 - mulsaq_s.w.ph $ac0,$ra,$s2 neg.ps $f19,$f13 nmadd.ps $f27,$f4,$f9,$f25 nmsub.ps $f6,$f12,$f14,$f17 - packrl.ph $ra,$24,$14 - pick.ph $ra,$a2,$gp - pick.qb $11,$a0,$gp pll.ps $f25,$f9,$f30 plu.ps $f1,$f26,$f29 preceq.w.phl $s8,$gp preceq.w.phr $s5,$15 - precequ.ph.qbl $s7,$ra - precequ.ph.qbla $a0,$9 - precequ.ph.qbr $ra,$s3 - precequ.ph.qbra $24,$8 - preceu.ph.qbl $sp,$8 - preceu.ph.qbla $s6,$11 - preceu.ph.qbr $gp,$s1 - preceu.ph.qbra $k1,$s0 - precr.qb.ph $v0,$12,$s8 - precrq.ph.w $14,$s8,$24 - precrq.qb.ph $a2,$12,$12 - precrq_rs.ph.w $a1,$k0,$a3 - precrqu_s.qb.ph $zero,$gp,$s5 pul.ps $f9,$f30,$f26 puu.ps $f24,$f9,$f2 - raddu.w.qb $25,$s3 rdpgpr $s3,$9 recip.d $f19,$f6 recip.s $f3,$f30 - repl.ph $at,-307 - replv.ph $v1,$s7 - replv.qb $25,$12 rorv $13,$a3,$s5 round.l.d $f12,$f1 round.l.s $f25,$f5 @@ -209,33 +112,7 @@ sbe $s7,33($s1) sce $sp,189($10) she $24,105($v0) - shilo $ac1,26 - shilov $ac2,$10 - shllv.ph $10,$s0,$s0 - shllv.qb $gp,$v1,$zero - shllv_s.ph $k1,$at,$13 - shllv_s.w $s1,$ra,$k0 - shrav.ph $25,$s2,$s1 - shrav.qb $zero,$24,$11 - shrav_r.ph $s3,$11,$25 - shrav_r.qb $a0,$sp,$s5 - shrav_r.w $s7,$s4,$s6 - shrlv.ph $14,$10,$9 - shrlv.qb $a2,$s2,$11 sub.ps $f5,$f14,$f26 - subq.ph $ra,$9,$s8 - subq_s.ph $13,$s8,$s5 - subq_s.w $k1,$a2,$a3 - subqh.ph $10,$at,$9 - subqh.w $v0,$a2,$zero - subqh_r.ph $a0,$12,$s6 - subqh_r.w $10,$a2,$gp - subu.ph $9,$s6,$s4 - subu.qb $s6,$a2,$s6 - subu_s.ph $v1,$a1,$s3 - subu_s.qb $s1,$at,$ra - subuh.qb $zero,$gp,$gp - subuh_r.qb $s4,$s8,$s6 swe $24,94($k0) swle $v1,-209($gp) swre $k0,-202($s2) diff --git a/test/MC/Mips/mips64r2/valid-xfail.s b/test/MC/Mips/mips64r2/valid-xfail.s index 5faa29d6468e..bf17b35c446c 100644 --- a/test/MC/Mips/mips64r2/valid-xfail.s +++ b/test/MC/Mips/mips64r2/valid-xfail.s @@ -8,30 +8,10 @@ .set noat abs.ps $f22,$f8 - absq_s.ph $8,$a0 - absq_s.qb $15,$s1 - absq_s.w $s3,$ra add.ps $f25,$f27,$f13 - addq.ph $s1,$15,$at - addq_s.ph $s3,$s6,$s2 - addq_s.w $a2,$8,$at - addqh.ph $s4,$14,$s1 addqh.w $s7,$s7,$k1 - addqh_r.ph $sp,$25,$s8 addqh_r.w $8,$v1,$zero - addsc $s8,$15,$12 - addu.ph $a2,$14,$s3 - addu.qb $s6,$v1,$v1 - addu_s.ph $a3,$s3,$gp - addu_s.qb $s4,$s8,$s1 - adduh.qb $a1,$a1,$at - adduh_r.qb $a0,$9,$12 - addwc $k0,$s6,$s7 - alnv.ob $v22,$v19,$v30,$v1 - alnv.ob $v31,$v23,$v30,$at - alnv.ob $v8,$v17,$v30,$a1 alnv.ps $f12,$f18,$f30,$12 - bitrev $14,$at c.eq.d $fcc1,$f15,$f15 c.eq.ps $fcc5,$f0,$f9 c.eq.s $fcc5,$f24,$f17 @@ -77,18 +57,6 @@ c.un.ps $fcc4,$f2,$f26 c.un.s $fcc1,$f30,$f4 cvt.ps.s $f3,$f18,$f19 - cmp.eq.ph $s7,$14 - cmp.le.ph $8,$14 - cmp.lt.ph $k0,$sp - cmpgdu.eq.qb $s3,$zero,$k0 - cmpgdu.le.qb $v1,$15,$s2 - cmpgdu.lt.qb $s0,$gp,$sp - cmpgu.eq.qb $14,$s6,$s8 - cmpgu.le.qb $9,$a3,$s4 - cmpgu.lt.qb $sp,$at,$8 - cmpu.eq.qb $v0,$24 - cmpu.le.qb $s1,$a1 - cmpu.lt.qb $at,$a3 cvt.s.pl $f30,$f1 cvt.s.pu $f14,$f25 dmfc0 $10,c0_watchhi,2 @@ -96,54 +64,22 @@ dmt $k0 dmtc0 $15,c0_datalo dmtgc0 $a2,c0_watchlo,2 - dpa.w.ph $ac1,$s7,$k0 - dpaq_s.w.ph $ac2,$a0,$13 - dpaq_sa.l.w $ac0,$a2,$14 - dpaqx_s.w.ph $ac3,$a0,$24 - dpaqx_sa.w.ph $ac1,$zero,$s5 - dpau.h.qbl $ac1,$10,$24 - dpau.h.qbr $ac1,$s7,$s6 - dpax.w.ph $ac3,$a0,$k0 - dps.w.ph $ac1,$a3,$a1 - dpsq_s.w.ph $ac0,$gp,$k0 - dpsq_sa.l.w $ac0,$a3,$15 - dpsqx_s.w.ph $ac3,$13,$a3 - dpsqx_sa.w.ph $ac3,$sp,$s2 - dpsu.h.qbl $ac2,$14,$10 - dpsu.h.qbr $ac2,$a1,$s6 - dpsx.w.ph 
$ac0,$s7,$gp drorv $at,$a1,$s7 dvpe $s6 emt $8 evpe $v0 - extpdpv $s6,$ac0,$s8 - extpv $13,$ac0,$14 - extrv.w $8,$ac3,$at - extrv_r.w $8,$ac1,$s6 - extrv_rs.w $gp,$ac1,$s6 - extrv_s.h $s2,$ac1,$14 fork $s2,$8,$a0 - insv $s2,$at iret - lbe $14,122($9) + lbe $14,122($9) lbue $11,-108($10) - lbux $9,$14($v0) - lhe $s6,219($v1) + lhe $s6,219($v1) lhue $gp,118($11) - lhx $sp,$k0($15) - lle $gp,-237($ra) - lwe $ra,-145($14) + lle $gp,-237($ra) + lwe $ra,-145($14) lwle $11,-42($11) lwre $sp,-152($24) - lwx $12,$12($s4) madd.ps $f22,$f3,$f14,$f3 - maq_s.w.phl $ac2,$25,$11 - maq_s.w.phr $ac0,$10,$25 - maq_sa.w.phl $ac3,$a1,$v1 - maq_sa.w.phr $ac1,$at,$10 mfgc0 $s6,c0_datahi1 - mflo $9,$ac2 - modsub $a3,$12,$a3 mov.ps $f22,$f17 movf.ps $f10,$f28,$fcc6 movn.ps $f31,$f31,$s3 @@ -151,92 +87,30 @@ movz.ps $f18,$f17,$ra msgn.qh $v0,$v24,$v20 msgn.qh $v12,$v21,$v0[1] - msub $ac2,$sp,$14 msub.ps $f12,$f14,$f29,$f17 - msubu $ac2,$a1,$24 mtc0 $9,c0_datahi1 mtgc0 $s4,$21,7 - mthi $v0,$ac1 - mthlip $a3,$ac0 - mul.ph $s4,$24,$s0 mul.ps $f14,$f0,$f16 - mul_s.ph $10,$14,$15 - muleq_s.w.phl $11,$s4,$s4 - muleq_s.w.phr $s6,$a0,$s8 - muleu_s.ph.qbl $a2,$14,$8 - muleu_s.ph.qbr $a1,$ra,$9 - mulq_rs.ph $s2,$14,$15 - mulq_rs.w $at,$s4,$25 - mulq_s.ph $s0,$k1,$15 - mulq_s.w $9,$a3,$s0 - mulsa.w.ph $ac1,$s4,$s6 - mulsaq_s.w.ph $ac0,$ra,$s2 neg.ps $f19,$f13 nmadd.ps $f27,$f4,$f9,$f25 nmsub.ps $f6,$f12,$f14,$f17 - packrl.ph $ra,$24,$14 - pick.ph $ra,$a2,$gp - pick.qb $11,$a0,$gp pll.ps $f25,$f9,$f30 plu.ps $f1,$f26,$f29 preceq.w.phl $s8,$gp preceq.w.phr $s5,$15 - precequ.ph.qbl $s7,$ra - precequ.ph.qbla $a0,$9 - precequ.ph.qbr $ra,$s3 - precequ.ph.qbra $24,$8 - preceu.ph.qbl $sp,$8 - preceu.ph.qbla $s6,$11 - preceu.ph.qbr $gp,$s1 - preceu.ph.qbra $k1,$s0 - precr.qb.ph $v0,$12,$s8 - precrq.ph.w $14,$s8,$24 - precrq.qb.ph $a2,$12,$12 - precrq_rs.ph.w $a1,$k0,$a3 - precrqu_s.qb.ph $zero,$gp,$s5 pul.ps $f9,$f30,$f26 puu.ps $f24,$f9,$f2 - raddu.w.qb $25,$s3 rdpgpr $s3,$9 recip.d $f19,$f6 recip.s $f3,$f30 - repl.ph $at,-307 - replv.ph $v1,$s7 - replv.qb $25,$12 rorv $13,$a3,$s5 rsqrt.d $f3,$f28 rsqrt.s $f4,$f8 - sbe $s7,33($s1) - sce $sp,189($10) - she $24,105($v0) - shilo $ac1,26 - shilov $ac2,$10 - shllv.ph $10,$s0,$s0 - shllv.qb $gp,$v1,$zero - shllv_s.ph $k1,$at,$13 - shllv_s.w $s1,$ra,$k0 - shrav.ph $25,$s2,$s1 - shrav.qb $zero,$24,$11 - shrav_r.ph $s3,$11,$25 - shrav_r.qb $a0,$sp,$s5 - shrav_r.w $s7,$s4,$s6 - shrlv.ph $14,$10,$9 - shrlv.qb $a2,$s2,$11 + sbe $s7,33($s1) + sce $sp,189($10) + she $24,105($v0) sub.ps $f5,$f14,$f26 - subq.ph $ra,$9,$s8 - subq_s.ph $13,$s8,$s5 - subq_s.w $k1,$a2,$a3 - subqh.ph $10,$at,$9 - subqh.w $v0,$a2,$zero - subqh_r.ph $a0,$12,$s6 - subqh_r.w $10,$a2,$gp - subu.ph $9,$s6,$s4 - subu.qb $s6,$a2,$s6 - subu_s.ph $v1,$a1,$s3 - subu_s.qb $s1,$at,$ra - subuh.qb $zero,$gp,$gp - subuh_r.qb $s4,$s8,$s6 - swe $24,94($k0) + swe $24,94($k0) swle $v1,-209($gp) swre $k0,-202($s2) tlbginv diff --git a/test/MC/Mips/mips64r3/valid-xfail.s b/test/MC/Mips/mips64r3/valid-xfail.s index dcf66bf97d68..7e94200dfd62 100644 --- a/test/MC/Mips/mips64r3/valid-xfail.s +++ b/test/MC/Mips/mips64r3/valid-xfail.s @@ -8,30 +8,13 @@ .set noat abs.ps $f22,$f8 - absq_s.ph $8,$a0 - absq_s.qb $15,$s1 - absq_s.w $s3,$ra add.ps $f25,$f27,$f13 - addq.ph $s1,$15,$at - addq_s.ph $s3,$s6,$s2 - addq_s.w $a2,$8,$at - addqh.ph $s4,$14,$s1 addqh.w $s7,$s7,$k1 - addqh_r.ph $sp,$25,$s8 addqh_r.w $8,$v1,$zero - addsc $s8,$15,$12 - addu.ph $a2,$14,$s3 - addu.qb $s6,$v1,$v1 - addu_s.ph $a3,$s3,$gp - addu_s.qb $s4,$s8,$s1 - adduh.qb $a1,$a1,$at - 
adduh_r.qb $a0,$9,$12 - addwc $k0,$s6,$s7 alnv.ob $v22,$v19,$v30,$v1 alnv.ob $v31,$v23,$v30,$at alnv.ob $v8,$v17,$v30,$a1 alnv.ps $f12,$f18,$f30,$12 - bitrev $14,$at c.eq.d $fcc1,$f15,$f15 c.eq.ps $fcc5,$f0,$f9 c.eq.s $fcc5,$f24,$f17 @@ -77,18 +60,6 @@ c.un.ps $fcc4,$f2,$f26 c.un.s $fcc1,$f30,$f4 cvt.ps.s $f3,$f18,$f19 - cmp.eq.ph $s7,$14 - cmp.le.ph $8,$14 - cmp.lt.ph $k0,$sp - cmpgdu.eq.qb $s3,$zero,$k0 - cmpgdu.le.qb $v1,$15,$s2 - cmpgdu.lt.qb $s0,$gp,$sp - cmpgu.eq.qb $14,$s6,$s8 - cmpgu.le.qb $9,$a3,$s4 - cmpgu.lt.qb $sp,$at,$8 - cmpu.eq.qb $v0,$24 - cmpu.le.qb $s1,$a1 - cmpu.lt.qb $at,$a3 cvt.s.pl $f30,$f1 cvt.s.pu $f14,$f25 dmfc0 $10,c0_watchhi,2 @@ -96,54 +67,22 @@ dmt $k0 dmtc0 $15,c0_datalo dmtgc0 $a2,c0_watchlo,2 - dpa.w.ph $ac1,$s7,$k0 - dpaq_s.w.ph $ac2,$a0,$13 - dpaq_sa.l.w $ac0,$a2,$14 - dpaqx_s.w.ph $ac3,$a0,$24 - dpaqx_sa.w.ph $ac1,$zero,$s5 - dpau.h.qbl $ac1,$10,$24 - dpau.h.qbr $ac1,$s7,$s6 - dpax.w.ph $ac3,$a0,$k0 - dps.w.ph $ac1,$a3,$a1 - dpsq_s.w.ph $ac0,$gp,$k0 - dpsq_sa.l.w $ac0,$a3,$15 - dpsqx_s.w.ph $ac3,$13,$a3 - dpsqx_sa.w.ph $ac3,$sp,$s2 - dpsu.h.qbl $ac2,$14,$10 - dpsu.h.qbr $ac2,$a1,$s6 - dpsx.w.ph $ac0,$s7,$gp drorv $at,$a1,$s7 dvpe $s6 emt $8 evpe $v0 - extpdpv $s6,$ac0,$s8 - extpv $13,$ac0,$14 - extrv.w $8,$ac3,$at - extrv_r.w $8,$ac1,$s6 - extrv_rs.w $gp,$ac1,$s6 - extrv_s.h $s2,$ac1,$14 fork $s2,$8,$a0 - insv $s2,$at iret - lbe $14,122($9) + lbe $14,122($9) lbue $11,-108($10) - lbux $9,$14($v0) - lhe $s6,219($v1) + lhe $s6,219($v1) lhue $gp,118($11) - lhx $sp,$k0($15) - lle $gp,-237($ra) - lwe $ra,-145($14) + lle $gp,-237($ra) + lwe $ra,-145($14) lwle $11,-42($11) lwre $sp,-152($24) - lwx $12,$12($s4) madd.ps $f22,$f3,$f14,$f3 - maq_s.w.phl $ac2,$25,$11 - maq_s.w.phr $ac0,$10,$25 - maq_sa.w.phl $ac3,$a1,$v1 - maq_sa.w.phr $ac1,$at,$10 mfgc0 $s6,c0_datahi1 - mflo $9,$ac2 - modsub $a3,$12,$a3 mov.ps $f22,$f17 movf.ps $f10,$f28,$fcc6 movn.ps $f31,$f31,$s3 @@ -151,92 +90,30 @@ movz.ps $f18,$f17,$ra msgn.qh $v0,$v24,$v20 msgn.qh $v12,$v21,$v0[1] - msub $ac2,$sp,$14 msub.ps $f12,$f14,$f29,$f17 - msubu $ac2,$a1,$24 mtc0 $9,c0_datahi1 mtgc0 $s4,$21,7 - mthi $v0,$ac1 - mthlip $a3,$ac0 - mul.ph $s4,$24,$s0 mul.ps $f14,$f0,$f16 - mul_s.ph $10,$14,$15 - muleq_s.w.phl $11,$s4,$s4 - muleq_s.w.phr $s6,$a0,$s8 - muleu_s.ph.qbl $a2,$14,$8 - muleu_s.ph.qbr $a1,$ra,$9 - mulq_rs.ph $s2,$14,$15 - mulq_rs.w $at,$s4,$25 - mulq_s.ph $s0,$k1,$15 - mulq_s.w $9,$a3,$s0 - mulsa.w.ph $ac1,$s4,$s6 - mulsaq_s.w.ph $ac0,$ra,$s2 neg.ps $f19,$f13 nmadd.ps $f27,$f4,$f9,$f25 nmsub.ps $f6,$f12,$f14,$f17 - packrl.ph $ra,$24,$14 - pick.ph $ra,$a2,$gp - pick.qb $11,$a0,$gp pll.ps $f25,$f9,$f30 plu.ps $f1,$f26,$f29 preceq.w.phl $s8,$gp preceq.w.phr $s5,$15 - precequ.ph.qbl $s7,$ra - precequ.ph.qbla $a0,$9 - precequ.ph.qbr $ra,$s3 - precequ.ph.qbra $24,$8 - preceu.ph.qbl $sp,$8 - preceu.ph.qbla $s6,$11 - preceu.ph.qbr $gp,$s1 - preceu.ph.qbra $k1,$s0 - precr.qb.ph $v0,$12,$s8 - precrq.ph.w $14,$s8,$24 - precrq.qb.ph $a2,$12,$12 - precrq_rs.ph.w $a1,$k0,$a3 - precrqu_s.qb.ph $zero,$gp,$s5 pul.ps $f9,$f30,$f26 puu.ps $f24,$f9,$f2 - raddu.w.qb $25,$s3 rdpgpr $s3,$9 recip.d $f19,$f6 recip.s $f3,$f30 - repl.ph $at,-307 - replv.ph $v1,$s7 - replv.qb $25,$12 rorv $13,$a3,$s5 rsqrt.d $f3,$f28 rsqrt.s $f4,$f8 - sbe $s7,33($s1) - sce $sp,189($10) - she $24,105($v0) - shilo $ac1,26 - shilov $ac2,$10 - shllv.ph $10,$s0,$s0 - shllv.qb $gp,$v1,$zero - shllv_s.ph $k1,$at,$13 - shllv_s.w $s1,$ra,$k0 - shrav.ph $25,$s2,$s1 - shrav.qb $zero,$24,$11 - shrav_r.ph $s3,$11,$25 - shrav_r.qb $a0,$sp,$s5 - shrav_r.w 
$s7,$s4,$s6 - shrlv.ph $14,$10,$9 - shrlv.qb $a2,$s2,$11 + sbe $s7,33($s1) + sce $sp,189($10) + she $24,105($v0) sub.ps $f5,$f14,$f26 - subq.ph $ra,$9,$s8 - subq_s.ph $13,$s8,$s5 - subq_s.w $k1,$a2,$a3 - subqh.ph $10,$at,$9 - subqh.w $v0,$a2,$zero - subqh_r.ph $a0,$12,$s6 - subqh_r.w $10,$a2,$gp - subu.ph $9,$s6,$s4 - subu.qb $s6,$a2,$s6 - subu_s.ph $v1,$a1,$s3 - subu_s.qb $s1,$at,$ra - subuh.qb $zero,$gp,$gp - subuh_r.qb $s4,$s8,$s6 - swe $24,94($k0) + swe $24,94($k0) swle $v1,-209($gp) swre $k0,-202($s2) tlbginv diff --git a/test/MC/Mips/mips64r5/valid-xfail.s b/test/MC/Mips/mips64r5/valid-xfail.s index 0f7788359cf2..b5ecdcbfb726 100644 --- a/test/MC/Mips/mips64r5/valid-xfail.s +++ b/test/MC/Mips/mips64r5/valid-xfail.s @@ -8,30 +8,13 @@ .set noat abs.ps $f22,$f8 - absq_s.ph $8,$a0 - absq_s.qb $15,$s1 - absq_s.w $s3,$ra add.ps $f25,$f27,$f13 - addq.ph $s1,$15,$at - addq_s.ph $s3,$s6,$s2 - addq_s.w $a2,$8,$at - addqh.ph $s4,$14,$s1 addqh.w $s7,$s7,$k1 - addqh_r.ph $sp,$25,$s8 addqh_r.w $8,$v1,$zero - addsc $s8,$15,$12 - addu.ph $a2,$14,$s3 - addu.qb $s6,$v1,$v1 - addu_s.ph $a3,$s3,$gp - addu_s.qb $s4,$s8,$s1 - adduh.qb $a1,$a1,$at - adduh_r.qb $a0,$9,$12 - addwc $k0,$s6,$s7 alnv.ob $v22,$v19,$v30,$v1 alnv.ob $v31,$v23,$v30,$at alnv.ob $v8,$v17,$v30,$a1 alnv.ps $f12,$f18,$f30,$12 - bitrev $14,$at c.eq.d $fcc1,$f15,$f15 c.eq.ps $fcc5,$f0,$f9 c.eq.s $fcc5,$f24,$f17 @@ -77,18 +60,6 @@ c.un.ps $fcc4,$f2,$f26 c.un.s $fcc1,$f30,$f4 cvt.ps.s $f3,$f18,$f19 - cmp.eq.ph $s7,$14 - cmp.le.ph $8,$14 - cmp.lt.ph $k0,$sp - cmpgdu.eq.qb $s3,$zero,$k0 - cmpgdu.le.qb $v1,$15,$s2 - cmpgdu.lt.qb $s0,$gp,$sp - cmpgu.eq.qb $14,$s6,$s8 - cmpgu.le.qb $9,$a3,$s4 - cmpgu.lt.qb $sp,$at,$8 - cmpu.eq.qb $v0,$24 - cmpu.le.qb $s1,$a1 - cmpu.lt.qb $at,$a3 cvt.s.pl $f30,$f1 cvt.s.pu $f14,$f25 dmfc0 $10,c0_watchhi,2 @@ -96,54 +67,22 @@ dmt $k0 dmtc0 $15,c0_datalo dmtgc0 $a2,c0_watchlo,2 - dpa.w.ph $ac1,$s7,$k0 - dpaq_s.w.ph $ac2,$a0,$13 - dpaq_sa.l.w $ac0,$a2,$14 - dpaqx_s.w.ph $ac3,$a0,$24 - dpaqx_sa.w.ph $ac1,$zero,$s5 - dpau.h.qbl $ac1,$10,$24 - dpau.h.qbr $ac1,$s7,$s6 - dpax.w.ph $ac3,$a0,$k0 - dps.w.ph $ac1,$a3,$a1 - dpsq_s.w.ph $ac0,$gp,$k0 - dpsq_sa.l.w $ac0,$a3,$15 - dpsqx_s.w.ph $ac3,$13,$a3 - dpsqx_sa.w.ph $ac3,$sp,$s2 - dpsu.h.qbl $ac2,$14,$10 - dpsu.h.qbr $ac2,$a1,$s6 - dpsx.w.ph $ac0,$s7,$gp drorv $at,$a1,$s7 dvpe $s6 emt $8 evpe $v0 - extpdpv $s6,$ac0,$s8 - extpv $13,$ac0,$14 - extrv.w $8,$ac3,$at - extrv_r.w $8,$ac1,$s6 - extrv_rs.w $gp,$ac1,$s6 - extrv_s.h $s2,$ac1,$14 fork $s2,$8,$a0 - insv $s2,$at iret - lbe $14,122($9) + lbe $14,122($9) lbue $11,-108($10) - lbux $9,$14($v0) - lhe $s6,219($v1) + lhe $s6,219($v1) lhue $gp,118($11) - lhx $sp,$k0($15) - lle $gp,-237($ra) - lwe $ra,-145($14) + lle $gp,-237($ra) + lwe $ra,-145($14) lwle $11,-42($11) lwre $sp,-152($24) - lwx $12,$12($s4) madd.ps $f22,$f3,$f14,$f3 - maq_s.w.phl $ac2,$25,$11 - maq_s.w.phr $ac0,$10,$25 - maq_sa.w.phl $ac3,$a1,$v1 - maq_sa.w.phr $ac1,$at,$10 mfgc0 $s6,c0_datahi1 - mflo $9,$ac2 - modsub $a3,$12,$a3 mov.ps $f22,$f17 movf.ps $f10,$f28,$fcc6 movn.ps $f31,$f31,$s3 @@ -151,92 +90,30 @@ movz.ps $f18,$f17,$ra msgn.qh $v0,$v24,$v20 msgn.qh $v12,$v21,$v0[1] - msub $ac2,$sp,$14 msub.ps $f12,$f14,$f29,$f17 - msubu $ac2,$a1,$24 mtc0 $9,c0_datahi1 mtgc0 $s4,$21,7 - mthi $v0,$ac1 - mthlip $a3,$ac0 - mul.ph $s4,$24,$s0 mul.ps $f14,$f0,$f16 - mul_s.ph $10,$14,$15 - muleq_s.w.phl $11,$s4,$s4 - muleq_s.w.phr $s6,$a0,$s8 - muleu_s.ph.qbl $a2,$14,$8 - muleu_s.ph.qbr $a1,$ra,$9 - mulq_rs.ph $s2,$14,$15 - mulq_rs.w $at,$s4,$25 - mulq_s.ph $s0,$k1,$15 - 
mulq_s.w $9,$a3,$s0 - mulsa.w.ph $ac1,$s4,$s6 - mulsaq_s.w.ph $ac0,$ra,$s2 neg.ps $f19,$f13 nmadd.ps $f27,$f4,$f9,$f25 nmsub.ps $f6,$f12,$f14,$f17 - packrl.ph $ra,$24,$14 - pick.ph $ra,$a2,$gp - pick.qb $11,$a0,$gp pll.ps $f25,$f9,$f30 plu.ps $f1,$f26,$f29 preceq.w.phl $s8,$gp preceq.w.phr $s5,$15 - precequ.ph.qbl $s7,$ra - precequ.ph.qbla $a0,$9 - precequ.ph.qbr $ra,$s3 - precequ.ph.qbra $24,$8 - preceu.ph.qbl $sp,$8 - preceu.ph.qbla $s6,$11 - preceu.ph.qbr $gp,$s1 - preceu.ph.qbra $k1,$s0 - precr.qb.ph $v0,$12,$s8 - precrq.ph.w $14,$s8,$24 - precrq.qb.ph $a2,$12,$12 - precrq_rs.ph.w $a1,$k0,$a3 - precrqu_s.qb.ph $zero,$gp,$s5 pul.ps $f9,$f30,$f26 puu.ps $f24,$f9,$f2 - raddu.w.qb $25,$s3 rdpgpr $s3,$9 recip.d $f19,$f6 recip.s $f3,$f30 - repl.ph $at,-307 - replv.ph $v1,$s7 - replv.qb $25,$12 rorv $13,$a3,$s5 rsqrt.d $f3,$f28 rsqrt.s $f4,$f8 - sbe $s7,33($s1) - sce $sp,189($10) - she $24,105($v0) - shilo $ac1,26 - shilov $ac2,$10 - shllv.ph $10,$s0,$s0 - shllv.qb $gp,$v1,$zero - shllv_s.ph $k1,$at,$13 - shllv_s.w $s1,$ra,$k0 - shrav.ph $25,$s2,$s1 - shrav.qb $zero,$24,$11 - shrav_r.ph $s3,$11,$25 - shrav_r.qb $a0,$sp,$s5 - shrav_r.w $s7,$s4,$s6 - shrlv.ph $14,$10,$9 - shrlv.qb $a2,$s2,$11 + sbe $s7,33($s1) + sce $sp,189($10) + she $24,105($v0) sub.ps $f5,$f14,$f26 - subq.ph $ra,$9,$s8 - subq_s.ph $13,$s8,$s5 - subq_s.w $k1,$a2,$a3 - subqh.ph $10,$at,$9 - subqh.w $v0,$a2,$zero - subqh_r.ph $a0,$12,$s6 - subqh_r.w $10,$a2,$gp - subu.ph $9,$s6,$s4 - subu.qb $s6,$a2,$s6 - subu_s.ph $v1,$a1,$s3 - subu_s.qb $s1,$at,$ra - subuh.qb $zero,$gp,$gp - subuh_r.qb $s4,$s8,$s6 - swe $24,94($k0) + swe $24,94($k0) swle $v1,-209($gp) swre $k0,-202($s2) tlbginv From fad998fc360e9d7140354e14ef71de8426446ead Mon Sep 17 00:00:00 2001 From: Artyom Skrobov Date: Mon, 7 Dec 2015 14:22:39 +0000 Subject: [PATCH 170/364] [ARM] Generate ABI_optimization_goals build attribute, as described in the ARM ARM. Summary: This reverts r254234, and adds a simple fix for the annoying case of use-after-free. 
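As a rough illustration (a sketch distilled from the tests added below, not additional committed code), a module built for arm-none-none-eabi whose functions all share one optimization goal now gets a single Tag_ABI_optimization_goals value:

    ; All functions carry minsize+optsize, so the module-wide goal is
    ; "Aggressive Size" and the asm printer emits:
    ;   .eabi_attribute 30, 4   @ Tag_ABI_optimization_goals
    ; (optnone maps to 6, plain optsize to 3, -O1/-O2 to 1, -O3 to 2, -O0 to 5;
    ;  a module mixing different goals collapses to 0 and no attribute is emitted.)
    define i32 @f(i64 %z) minsize optsize {
      ret i32 0
    }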
Reviewers: rengolin Subscribers: aemerson, llvm-commits, rengolin Differential Revision: http://reviews.llvm.org/D15236 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254912 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/ARMAsmPrinter.cpp | 45 +++++++++++++++++-- lib/Target/ARM/ARMAsmPrinter.h | 5 +++ .../ARM/MCTargetDesc/ARMELFStreamer.cpp | 6 +-- .../build-attributes-optimization-minsize.ll | 18 ++++++++ .../build-attributes-optimization-mixed.ll | 23 ++++++++++ .../build-attributes-optimization-optnone.ll | 18 ++++++++ .../build-attributes-optimization-optsize.ll | 18 ++++++++ .../ARM/build-attributes-optimization.ll | 23 ++++++++++ test/MC/ARM/data-in-code.ll | 16 +++---- 9 files changed, 157 insertions(+), 15 deletions(-) create mode 100644 test/CodeGen/ARM/build-attributes-optimization-minsize.ll create mode 100644 test/CodeGen/ARM/build-attributes-optimization-mixed.ll create mode 100644 test/CodeGen/ARM/build-attributes-optimization-optnone.ll create mode 100644 test/CodeGen/ARM/build-attributes-optimization-optsize.ll create mode 100644 test/CodeGen/ARM/build-attributes-optimization.ll diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp index 67ebfa2b581d..61141c0031df 100644 --- a/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/ARMAsmPrinter.cpp @@ -60,7 +60,7 @@ using namespace llvm; ARMAsmPrinter::ARMAsmPrinter(TargetMachine &TM, std::unique_ptr Streamer) : AsmPrinter(TM, std::move(Streamer)), AFI(nullptr), MCP(nullptr), - InConstantPool(false) {} + InConstantPool(false), OptimizationGoals(-1) {} void ARMAsmPrinter::EmitFunctionBodyEnd() { // Make sure to terminate any constant pools that were at the end @@ -106,9 +106,38 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) { Subtarget = &MF.getSubtarget(); SetupMachineFunction(MF); + const Function* F = MF.getFunction(); + const TargetMachine& TM = MF.getTarget(); + + // Calculate this function's optimization goal. + unsigned OptimizationGoal; + if (F->hasFnAttribute(Attribute::OptimizeNone)) + // For best debugging illusion, speed and small size sacrificed + OptimizationGoal = 6; + else if (F->optForMinSize()) + // Aggressively for small size, speed and debug illusion sacrificed + OptimizationGoal = 4; + else if (F->optForSize()) + // For small size, but speed and debugging illusion preserved + OptimizationGoal = 3; + else if (TM.getOptLevel() == CodeGenOpt::Aggressive) + // Aggressively for speed, small size and debug illusion sacrificed + OptimizationGoal = 2; + else if (TM.getOptLevel() > CodeGenOpt::None) + // For speed, but small size and good debug illusion preserved + OptimizationGoal = 1; + else // TM.getOptLevel() == CodeGenOpt::None + // For good debugging, but speed and small size preserved + OptimizationGoal = 5; + + // Combine a new optimization goal with existing ones. + if (OptimizationGoals == -1) // uninitialized goals + OptimizationGoals = OptimizationGoal; + else if (OptimizationGoals != (int)OptimizationGoal) // conflicting goals + OptimizationGoals = 0; if (Subtarget->isTargetCOFF()) { - bool Internal = MF.getFunction()->hasInternalLinkage(); + bool Internal = F->hasInternalLinkage(); COFF::SymbolStorageClass Scl = Internal ? COFF::IMAGE_SYM_CLASS_STATIC : COFF::IMAGE_SYM_CLASS_EXTERNAL; int Type = COFF::IMAGE_SYM_DTYPE_FUNCTION << COFF::SCT_COMPLEX_TYPE_SHIFT; @@ -506,6 +535,16 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { // generates code that does this, it is always safe to set. 
OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); } + + // The last attribute to be emitted is ABI_optimization_goals + MCTargetStreamer &TS = *OutStreamer->getTargetStreamer(); + ARMTargetStreamer &ATS = static_cast(TS); + + if (OptimizationGoals > 0) + ATS.emitAttribute(ARMBuildAttrs::ABI_optimization_goals, OptimizationGoals); + OptimizationGoals = -1; + + ATS.finishAttributeSection(); } //===----------------------------------------------------------------------===// @@ -798,8 +837,6 @@ void ARMAsmPrinter::emitAttributes() { else if (STI.hasVirtualization()) ATS.emitAttribute(ARMBuildAttrs::Virtualization_use, ARMBuildAttrs::AllowVirtualization); - - ATS.finishAttributeSection(); } //===----------------------------------------------------------------------===// diff --git a/lib/Target/ARM/ARMAsmPrinter.h b/lib/Target/ARM/ARMAsmPrinter.h index fb925f162f70..ed7be2de51ca 100644 --- a/lib/Target/ARM/ARMAsmPrinter.h +++ b/lib/Target/ARM/ARMAsmPrinter.h @@ -51,6 +51,11 @@ class LLVM_LIBRARY_VISIBILITY ARMAsmPrinter : public AsmPrinter { /// labels used for ARMv4t thumb code to make register indirect calls. SmallVector, 4> ThumbIndirectPads; + /// OptimizationGoals - Maintain a combined optimization goal for all + /// functions in a module: one of Tag_ABI_optimization_goals values, + /// -1 if uninitialized, 0 if conflicting goals + int OptimizationGoals; + public: explicit ARMAsmPrinter(TargetMachine &TM, std::unique_ptr Streamer); diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index f316ad17576a..6084f22c8470 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -79,7 +79,7 @@ class ARMTargetAsmStreamer : public ARMTargetStreamer { void emitAttribute(unsigned Attribute, unsigned Value) override; void emitTextAttribute(unsigned Attribute, StringRef String) override; void emitIntTextAttribute(unsigned Attribute, unsigned IntValue, - StringRef StrinValue) override; + StringRef StringValue) override; void emitArch(unsigned Arch) override; void emitArchExtension(unsigned ArchExt) override; void emitObjectArch(unsigned Arch) override; @@ -243,7 +243,7 @@ void ARMTargetAsmStreamer::emitUnwindRaw(int64_t Offset, class ARMTargetELFStreamer : public ARMTargetStreamer { private: // This structure holds all attributes, accounting for - // their string/numeric value, so we can later emmit them + // their string/numeric value, so we can later emit them // in declaration order, keeping all in the same vector struct AttributeItem { enum { @@ -254,7 +254,7 @@ class ARMTargetELFStreamer : public ARMTargetStreamer { } Type; unsigned Tag; unsigned IntValue; - StringRef StringValue; + std::string StringValue; static bool LessTag(const AttributeItem &LHS, const AttributeItem &RHS) { // The conformance tag must be emitted first when serialised diff --git a/test/CodeGen/ARM/build-attributes-optimization-minsize.ll b/test/CodeGen/ARM/build-attributes-optimization-minsize.ll new file mode 100644 index 000000000000..4cfb6012f439 --- /dev/null +++ b/test/CodeGen/ARM/build-attributes-optimization-minsize.ll @@ -0,0 +1,18 @@ +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O0 | FileCheck %s +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O1 | FileCheck %s +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O3 | FileCheck %s + +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O0 -filetype obj -o - | llvm-readobj -arm-attributes - | FileCheck 
%s --check-prefix=CHECK-OBJ +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O1 -filetype obj -o - | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O3 -filetype obj -o - | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ + +; CHECK: .eabi_attribute 30, 4 @ Tag_ABI_optimization_goals +; CHECK-OBJ: TagName: ABI_optimization_goals +; CHECK-OBJ-NEXT: Description: Aggressive Size + +define i32 @f(i64 %z) #0 { + ret i32 0 +} + +attributes #0 = { minsize optsize } + diff --git a/test/CodeGen/ARM/build-attributes-optimization-mixed.ll b/test/CodeGen/ARM/build-attributes-optimization-mixed.ll new file mode 100644 index 000000000000..8009fc6e28f8 --- /dev/null +++ b/test/CodeGen/ARM/build-attributes-optimization-mixed.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O0 | FileCheck %s +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O1 | FileCheck %s +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O3 | FileCheck %s + +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O0 -filetype obj -o - | llvm-readobj -arm-attributes - | FileCheck %s +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O1 -filetype obj -o - | llvm-readobj -arm-attributes - | FileCheck %s +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O3 -filetype obj -o - | llvm-readobj -arm-attributes - | FileCheck %s + +; CHECK-NOT: .eabi_attribute 30 +; CHECK-NOT: Tag_ABI_optimization_goals + +define i32 @f(i64 %z) #0 { + ret i32 0 +} + +define i32 @g(i64 %z) #1 { + ret i32 1 +} + +attributes #0 = { noinline optnone } + +attributes #1 = { minsize optsize } + diff --git a/test/CodeGen/ARM/build-attributes-optimization-optnone.ll b/test/CodeGen/ARM/build-attributes-optimization-optnone.ll new file mode 100644 index 000000000000..cbdb915045c6 --- /dev/null +++ b/test/CodeGen/ARM/build-attributes-optimization-optnone.ll @@ -0,0 +1,18 @@ +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O0 | FileCheck %s +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O1 | FileCheck %s +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O3 | FileCheck %s + +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O0 -filetype obj -o - | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O1 -filetype obj -o - | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O3 -filetype obj -o - | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ + +; CHECK: .eabi_attribute 30, 6 @ Tag_ABI_optimization_goals +; CHECK-OBJ: TagName: ABI_optimization_goals +; CHECK-OBJ-NEXT: Description: Best Debugging + +define i32 @f(i64 %z) #0 { + ret i32 0 +} + +attributes #0 = { noinline optnone } + diff --git a/test/CodeGen/ARM/build-attributes-optimization-optsize.ll b/test/CodeGen/ARM/build-attributes-optimization-optsize.ll new file mode 100644 index 000000000000..bab210aa8d01 --- /dev/null +++ b/test/CodeGen/ARM/build-attributes-optimization-optsize.ll @@ -0,0 +1,18 @@ +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O0 | FileCheck %s +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O1 | FileCheck %s +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O3 | FileCheck %s + +; RUN: llc < %s -mtriple=arm-none-none-eabi 
-mcpu=cortex-a7 -O0 -filetype obj -o - | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O1 -filetype obj -o - | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O3 -filetype obj -o - | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=CHECK-OBJ + +; CHECK: .eabi_attribute 30, 3 @ Tag_ABI_optimization_goals +; CHECK-OBJ: TagName: ABI_optimization_goals +; CHECK-OBJ-NEXT: Description: Size + +define i32 @f(i64 %z) #0 { + ret i32 0 +} + +attributes #0 = { optsize } + diff --git a/test/CodeGen/ARM/build-attributes-optimization.ll b/test/CodeGen/ARM/build-attributes-optimization.ll new file mode 100644 index 000000000000..21b7b3c3ab0c --- /dev/null +++ b/test/CodeGen/ARM/build-attributes-optimization.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O0 | FileCheck %s --check-prefix=NONE +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O1 | FileCheck %s --check-prefix=SPEED +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O3 | FileCheck %s --check-prefix=MAXSPEED + +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O0 -filetype obj -o - | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=NONE-OBJ +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O1 -filetype obj -o - | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=SPEED-OBJ +; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-a7 -O3 -filetype obj -o - | llvm-readobj -arm-attributes - | FileCheck %s --check-prefix=MAXSPEED-OBJ + +; NONE: .eabi_attribute 30, 5 @ Tag_ABI_optimization_goals +; SPEED: .eabi_attribute 30, 1 @ Tag_ABI_optimization_goals +; MAXSPEED: .eabi_attribute 30, 2 @ Tag_ABI_optimization_goals + +; NONE-OBJ: TagName: ABI_optimization_goals +; NONE-OBJ-NEXT: Description: Debugging +; SPEED-OBJ: TagName: ABI_optimization_goals +; SPEED-OBJ-NEXT: Description: Speed +; MAXSPEED-OBJ: TagName: ABI_optimization_goals +; MAXSPEED-OBJ-NEXT: Description: Aggressive Speed + +define i32 @f(i64 %z) { + ret i32 0 +} + diff --git a/test/MC/ARM/data-in-code.ll b/test/MC/ARM/data-in-code.ll index c4910ff20e61..10657a3fed39 100644 --- a/test/MC/ARM/data-in-code.ll +++ b/test/MC/ARM/data-in-code.ll @@ -51,13 +51,6 @@ exit: ;; ARM-NEXT: Other: ;; ARM-NEXT: Section: [[MIXED_SECT]] -;; ARM: Symbol { -;; ARM: Name: $d -;; ARM-NEXT: Value: 0 -;; ARM-NEXT: Size: 0 -;; ARM-NEXT: Binding: Local -;; ARM-NEXT: Type: None - ;; ARM: Symbol { ;; ARM: Name: $d ;; ARM-NEXT: Value: 0x{{[0-9A-F]+}} @@ -77,10 +70,17 @@ exit: ;; ARM-NEXT: Section: .ARM.exidx ;; ARM-NEXT: } +;; ARM: Symbol { +;; ARM: Name: $d +;; ARM-NEXT: Value: 0 +;; ARM-NEXT: Size: 0 +;; ARM-NEXT: Binding: Local +;; ARM-NEXT: Type: None + ;; ARM-NOT: ${{[atd]}} ;; TMB: Symbol { -;; TMB: Name: $d.2 +;; TMB: Name: $d.1 ;; TMB-NEXT: Value: 0x{{[0-9A-F]+}} ;; TMB-NEXT: Size: 0 ;; TMB-NEXT: Binding: Local From 3b45f263c3669c65a76f7033ee24093a9870cfcf Mon Sep 17 00:00:00 2001 From: Elena Demikhovsky Date: Mon, 7 Dec 2015 14:33:34 +0000 Subject: [PATCH 171/364] VX-512: Fixed a bug in FP logic operation lowering FP logic instructions are supported in DQ extension on AVX-512 target. I use integer operations instead. Added tests. I also enabled FABS in this patch in order to check ANDPS. The operations are FOR, FXOR, FAND, FANDN. 
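As a rough sketch of the motivating case (taken from the fabs test added in this patch, not new code), a 512-bit fabs needs a packed AND with the sign mask, and without DQ that AND is now emitted in the integer domain (vpandd/vpandq) rather than as the unavailable 512-bit vandps/vandpd:

    declare <16 x float> @llvm.fabs.v16f32(<16 x float>)

    define <16 x float> @fabs_v16f32(<16 x float> %p) {
      ; AVX512F (no DQ): vpandd {{.*}}(%rip), %zmm0, %zmm0
      ; AVX512DQ / SKX:  vandps {{.*}}(%rip), %zmm0, %zmm0
      %t = call <16 x float> @llvm.fabs.v16f32(<16 x float> %p)
      ret <16 x float> %t
    }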
The instructions, that supported for 512-bit vector under DQ are: VORPS/PD, VXORPS/PD, VANDPS/PD, FANDNPS/PD. Differential Revision: http://reviews.llvm.org/D15110 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254913 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 55 +++++++++++++++-------- lib/Target/X86/X86InstrInfo.td | 1 + lib/Target/X86/X86InstrSSE.td | 2 +- test/CodeGen/X86/avx-logic.ll | 1 + test/CodeGen/X86/avx512-arith.ll | 71 ++++++++++++++++++++++++++++++ test/CodeGen/X86/vec_fabs.ll | 2 +- 6 files changed, 111 insertions(+), 21 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index fa6f5c8be88c..21bca74353c4 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1340,6 +1340,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FDIV, MVT::v16f32, Legal); setOperationAction(ISD::FSQRT, MVT::v16f32, Legal); setOperationAction(ISD::FNEG, MVT::v16f32, Custom); + setOperationAction(ISD::FABS, MVT::v16f32, Custom); setOperationAction(ISD::FADD, MVT::v8f64, Legal); setOperationAction(ISD::FSUB, MVT::v8f64, Legal); @@ -1347,6 +1348,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FDIV, MVT::v8f64, Legal); setOperationAction(ISD::FSQRT, MVT::v8f64, Legal); setOperationAction(ISD::FNEG, MVT::v8f64, Custom); + setOperationAction(ISD::FABS, MVT::v8f64, Custom); setOperationAction(ISD::FMA, MVT::v8f64, Legal); setOperationAction(ISD::FMA, MVT::v16f32, Legal); @@ -26339,6 +26341,31 @@ static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + if (VT.is512BitVector() && !Subtarget->hasDQI()) { + // VXORPS, VORPS, VANDPS, VANDNPS are supported only under DQ extention. + // These logic operations may be executed in the integer domain. + SDLoc dl(N); + MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits()); + MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements()); + + SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(0)); + SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(1)); + unsigned IntOpcode = 0; + switch (N->getOpcode()) { + default: llvm_unreachable("Unexpected FP logic op"); + case X86ISD::FOR: IntOpcode = ISD::OR; break; + case X86ISD::FXOR: IntOpcode = ISD::XOR; break; + case X86ISD::FAND: IntOpcode = ISD::AND; break; + case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break; + } + SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1); + return DAG.getNode(ISD::BITCAST, dl, VT, IntOp); + } + return SDValue(); +} /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes. static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { @@ -26354,19 +26381,7 @@ static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG, if (C->getValueAPF().isPosZero()) return N->getOperand(0); - EVT VT = N->getValueType(0); - if (VT.is512BitVector() && !Subtarget->hasDQI()) { - SDLoc dl(N); - MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits()); - MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements()); - - SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(0)); - SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(1)); - unsigned IntOpcode = (N->getOpcode() == X86ISD::FOR) ? 
ISD::OR : ISD::XOR; - SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1); - return DAG.getNode(ISD::BITCAST, dl, VT, IntOp); - } - return SDValue(); + return lowerX86FPLogicOp(N, DAG, Subtarget); } /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes. @@ -26391,7 +26406,8 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { } /// Do target-specific dag combines on X86ISD::FAND nodes. -static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { // FAND(0.0, x) -> 0.0 if (ConstantFPSDNode *C = dyn_cast(N->getOperand(0))) if (C->getValueAPF().isPosZero()) @@ -26402,11 +26418,12 @@ static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { if (C->getValueAPF().isPosZero()) return N->getOperand(1); - return SDValue(); + return lowerX86FPLogicOp(N, DAG, Subtarget); } /// Do target-specific dag combines on X86ISD::FANDN nodes -static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { // FANDN(0.0, x) -> x if (ConstantFPSDNode *C = dyn_cast(N->getOperand(0))) if (C->getValueAPF().isPosZero()) @@ -26417,7 +26434,7 @@ static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) { if (C->getValueAPF().isPosZero()) return N->getOperand(1); - return SDValue(); + return lowerX86FPLogicOp(N, DAG, Subtarget); } static SDValue PerformBTCombine(SDNode *N, @@ -27233,8 +27250,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::FOR: return PerformFORCombine(N, DAG, Subtarget); case X86ISD::FMIN: case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG); - case X86ISD::FAND: return PerformFANDCombine(N, DAG); - case X86ISD::FANDN: return PerformFANDNCombine(N, DAG); + case X86ISD::FAND: return PerformFANDCombine(N, DAG, Subtarget); + case X86ISD::FANDN: return PerformFANDNCombine(N, DAG, Subtarget); case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); case ISD::ANY_EXTEND: diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 4a4ceaca88f4..b412f8fb3ecb 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -770,6 +770,7 @@ def HasVLX : Predicate<"Subtarget->hasVLX()">, AssemblerPredicate<"FeatureVLX", "AVX-512 VL ISA">; def NoVLX : Predicate<"!Subtarget->hasVLX()">; def NoVLX_Or_NoBWI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasBWI()">; +def NoVLX_Or_NoDQI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasDQI()">; def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; def HasAES : Predicate<"Subtarget->hasAES()">; diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index a93240bd717c..a545335dd5dd 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -2906,7 +2906,7 @@ let isCodeGenOnly = 1 in { // Multiclass for vectors using the X86 logical operation aliases for FP. 
multiclass sse12_fp_packed_vector_logical_alias< bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { - let Predicates = [HasAVX, NoVLX] in { + let Predicates = [HasAVX, NoVLX_Or_NoDQI] in { defm V#NAME#PS : sse12_fp_packed, PS, VEX_4V; diff --git a/test/CodeGen/X86/avx-logic.ll b/test/CodeGen/X86/avx-logic.ll index a91fe7e0c523..e9e7d5aea273 100644 --- a/test/CodeGen/X86/avx-logic.ll +++ b/test/CodeGen/X86/avx-logic.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s define <4 x double> @andpd256(<4 x double> %y, <4 x double> %x) nounwind uwtable readnone ssp { ; CHECK-LABEL: andpd256: diff --git a/test/CodeGen/X86/avx512-arith.ll b/test/CodeGen/X86/avx512-arith.ll index d7da77a5eb54..9220e4f269cd 100644 --- a/test/CodeGen/X86/avx512-arith.ll +++ b/test/CodeGen/X86/avx512-arith.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck --check-prefix=CHECK --check-prefix=AVX512F %s ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vl | FileCheck --check-prefix=CHECK --check-prefix=AVX512VL %s ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512bw | FileCheck --check-prefix=CHECK --check-prefix=AVX512BW %s @@ -823,3 +824,73 @@ define <16 x float> @test_fxor(<16 x float> %a) { ret <16 x float>%res } +define <8 x float> @test_fxor_8f32(<8 x float> %a) { +; CHECK-LABEL: test_fxor_8f32: +; CHECK: ## BB#0: +; CHECK-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 +; CHECK-NEXT: retq + %res = fsub <8 x float> , %a + ret <8 x float>%res +} + +define <8 x double> @fabs_v8f64(<8 x double> %p) +; AVX512F-LABEL: fabs_v8f64: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fabs_v8f64: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: fabs_v8f64: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: fabs_v8f64: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: vandpd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: retq +; +; SKX-LABEL: fabs_v8f64: +; SKX: ## BB#0: +; SKX-NEXT: vandpd {{.*}}(%rip), %zmm0, %zmm0 +; SKX-NEXT: retq +{ + %t = call <8 x double> @llvm.fabs.v8f64(<8 x double> %p) + ret <8 x double> %t +} +declare <8 x double> @llvm.fabs.v8f64(<8 x double> %p) + +define <16 x float> @fabs_v16f32(<16 x float> %p) +; AVX512F-LABEL: fabs_v16f32: +; AVX512F: ## BB#0: +; AVX512F-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fabs_v16f32: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: fabs_v16f32: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: fabs_v16f32: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: retq +; +; SKX-LABEL: fabs_v16f32: +; SKX: ## BB#0: +; SKX-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm0 +; SKX-NEXT: retq +{ + %t = call <16 x float> @llvm.fabs.v16f32(<16 x float> %p) + ret <16 x float> %t +} +declare <16 x float> @llvm.fabs.v16f32(<16 x float> %p) diff --git a/test/CodeGen/X86/vec_fabs.ll b/test/CodeGen/X86/vec_fabs.ll index 960b5f27cf53..54f33b2bd224 100644 
--- a/test/CodeGen/X86/vec_fabs.ll +++ b/test/CodeGen/X86/vec_fabs.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s - +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s define <2 x double> @fabs_v2f64(<2 x double> %p) { From fda49fdbc8fadb4d3fdf839da37d8543085eb738 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Mon, 7 Dec 2015 15:05:44 +0000 Subject: [PATCH 172/364] [ThinLTO] Support cloning of temporary DILocation metadata This is needed to support linking of module-level metadata as a postpass after function importing, where we will be leaving temporary metadata on imported instructions until the postpass metadata import. Also added unittest. Split from D14838. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254914 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/DebugInfoMetadata.h | 6 ++++-- unittests/IR/MetadataTest.cpp | 8 ++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/include/llvm/IR/DebugInfoMetadata.h b/include/llvm/IR/DebugInfoMetadata.h index 68a497745ae7..0b3fe06f3576 100644 --- a/include/llvm/IR/DebugInfoMetadata.h +++ b/include/llvm/IR/DebugInfoMetadata.h @@ -1118,8 +1118,10 @@ class DILocation : public MDNode { } TempDILocation cloneImpl() const { - return getTemporary(getContext(), getLine(), getColumn(), getScope(), - getInlinedAt()); + // Get the raw scope/inlinedAt since it is possible to invoke this on + // a DILocation containing temporary metadata. + return getTemporary(getContext(), getLine(), getColumn(), getRawScope(), + getRawInlinedAt()); } // Disallow replacing operands. diff --git a/unittests/IR/MetadataTest.cpp b/unittests/IR/MetadataTest.cpp index da4271a30aea..8f346f53a2d2 100644 --- a/unittests/IR/MetadataTest.cpp +++ b/unittests/IR/MetadataTest.cpp @@ -813,6 +813,14 @@ TEST_F(DILocationTest, getTemporary) { EXPECT_FALSE(L->isResolved()); } +TEST_F(DILocationTest, cloneTemporary) { + MDNode *N = MDNode::get(Context, None); + auto L = DILocation::getTemporary(Context, 2, 7, N); + EXPECT_TRUE(L->isTemporary()); + auto L2 = L->clone(); + EXPECT_TRUE(L2->isTemporary()); +} + typedef MetadataTest GenericDINodeTest; TEST_F(GenericDINodeTest, get) { From c4724f60d7d2d5bbea55e60e5711f2ed5f3ec6ce Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Mon, 7 Dec 2015 15:44:34 +0000 Subject: [PATCH 173/364] Silence all C4592 warnings with MSVC 2015 Update 1. This warning produces false positives that Microsoft says will be fixed in Update 2. Until this produces reliable diagnostics, it is safe to disable the diagnostic -- the compiler is not doing anything different than it previously did aside from issuing the diagnostic. (Note, this silences at least one false positive in LLVM with FeatureBitset uses.) 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254915 91177308-0d34-0410-b5e6-96231b3b80d8 --- cmake/modules/HandleLLVMOptions.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake index 9e4c4c5078af..6c3109c4de76 100644 --- a/cmake/modules/HandleLLVMOptions.cmake +++ b/cmake/modules/HandleLLVMOptions.cmake @@ -317,6 +317,9 @@ if( MSVC ) -wd4204 # Suppress 'nonstandard extension used : non-constant aggregate initializer' -wd4577 # Suppress 'noexcept used with no exception handling mode specified; termination on exception is not guaranteed' -wd4091 # Suppress 'typedef: ignored on left of '' when no variable is declared' + # C4592 is disabled because of false positives in Visual Studio 2015 + # Update 1. Re-evaluate the usefulness of this diagnostic with Update 2. + -wd4592 # Suppress ''var': symbol will be dynamically initialized (implementation limitation) # Ideally, we'd like this warning to be enabled, but MSVC 2013 doesn't # support the 'aligned' attribute in the way that clang sources requires (for From 7218e37dabb5cc8b3b61b08cbaf0da49da65a479 Mon Sep 17 00:00:00 2001 From: Rafael Espindola Date: Mon, 7 Dec 2015 16:01:40 +0000 Subject: [PATCH 174/364] Simplify test. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254916 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/Linker/2003-05-31-LinkerRename.ll | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/test/Linker/2003-05-31-LinkerRename.ll b/test/Linker/2003-05-31-LinkerRename.ll index e10e239071ad..ee070b6dbdf0 100644 --- a/test/Linker/2003-05-31-LinkerRename.ll +++ b/test/Linker/2003-05-31-LinkerRename.ll @@ -1,6 +1,4 @@ -; RUN: llvm-as %S/Inputs/2003-05-31-LinkerRename.ll -o %t.1.bc -; RUN: llvm-as %s -o %t.2.bc -; RUN: llvm-link %t.1.bc %t.2.bc -S | FileCheck %s +; RUN: llvm-link %S/Inputs/2003-05-31-LinkerRename.ll %s -S | FileCheck %s ; CHECK: @bar = global i32 ()* @foo.2 From 0178d23ade9b549a91e3409dd15833ea8a451cb4 Mon Sep 17 00:00:00 2001 From: Rafael Espindola Date: Mon, 7 Dec 2015 16:31:41 +0000 Subject: [PATCH 175/364] Link declaration lazily. We already linked available_externally and linkonce lazily, this just adds declarations to the list. 
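A minimal sketch of the visible effect (mirroring the test updates below, not an exhaustive description of the change): a bare declaration in the source module is now only copied into the destination when something references it, which is why the affected tests grow an explicit user:

    declare void @foo()

    ; Without this caller the lazily linked @foo would not be pulled into the
    ; linked module, so the updated tests add a use like this one.
    define void @use_foo() {
      call void @foo()
      ret void
    }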
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254917 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Linker/LinkModules.cpp | 3 +++ test/Linker/2003-04-23-LinkOnceLost.ll | 4 ++++ test/Linker/2003-05-31-LinkerRename.ll | 4 ++-- test/Linker/ConstantGlobals.ll | 4 ++++ test/Linker/Inputs/opaque.ll | 8 ++++++++ test/Linker/Inputs/testlink.ll | 4 +++- test/Linker/Inputs/type-unique-dst-types2.ll | 4 ++++ test/Linker/Inputs/type-unique-dst-types3.ll | 4 ++++ test/Linker/opaque.ll | 4 ++++ test/Linker/testlink.ll | 9 ++++++++- test/Linker/type-unique-dst-types.ll | 4 ++++ test/Linker/type-unique-src-type.ll | 4 ++-- test/Linker/unnamed-addr1-a.ll | 10 ++++++++++ test/Linker/weakextern.ll | 4 ++++ 14 files changed, 64 insertions(+), 6 deletions(-) diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp index a1c3162bf796..8e0904a858bc 100644 --- a/lib/Linker/LinkModules.cpp +++ b/lib/Linker/LinkModules.cpp @@ -1753,6 +1753,9 @@ bool ModuleLinker::linkIfNeeded(GlobalValue &GV) { GV.hasAvailableExternallyLinkage())) return false; + if (GV.isDeclaration()) + return false; + if (const Comdat *SC = GV.getComdat()) { bool LinkFromSrc; Comdat::SelectionKind SK; diff --git a/test/Linker/2003-04-23-LinkOnceLost.ll b/test/Linker/2003-04-23-LinkOnceLost.ll index c699d1eb058e..e2b600c877c0 100644 --- a/test/Linker/2003-04-23-LinkOnceLost.ll +++ b/test/Linker/2003-04-23-LinkOnceLost.ll @@ -4,3 +4,7 @@ declare void @foo() +define void @use_foo() { + call void @foo() + ret void +} diff --git a/test/Linker/2003-05-31-LinkerRename.ll b/test/Linker/2003-05-31-LinkerRename.ll index ee070b6dbdf0..f511be1bf22f 100644 --- a/test/Linker/2003-05-31-LinkerRename.ll +++ b/test/Linker/2003-05-31-LinkerRename.ll @@ -6,13 +6,13 @@ ; CHECK-NEXT: ret i32 7 ; CHECK-NEXT: } -; CHECK: declare i32 @foo() - ; CHECK: define i32 @test() { ; CHECK-NEXT: %X = call i32 @foo() ; CHECK-NEXT: ret i32 %X ; CHECK-NEXT: } +; CHECK: declare i32 @foo() + declare i32 @foo() define i32 @test() { diff --git a/test/Linker/ConstantGlobals.ll b/test/Linker/ConstantGlobals.ll index 49f86a51bd7f..58c0d711a07e 100644 --- a/test/Linker/ConstantGlobals.ll +++ b/test/Linker/ConstantGlobals.ll @@ -6,3 +6,7 @@ ; CHECK-DAG: @Y = external global [1 x i32] @Y = external global [1 x i32] + +define [1 x i32]* @use-Y() { + ret [1 x i32] *@Y +} diff --git a/test/Linker/Inputs/opaque.ll b/test/Linker/Inputs/opaque.ll index f164abd586d1..a5f27cba418e 100644 --- a/test/Linker/Inputs/opaque.ll +++ b/test/Linker/Inputs/opaque.ll @@ -11,3 +11,11 @@ define void @f1() { getelementptr %A, %A* null, i32 0 ret void } + +define %A* @use_g2() { + ret %A* @g2 +} + +define %B* @use_g3() { + ret %B* @g3 +} diff --git a/test/Linker/Inputs/testlink.ll b/test/Linker/Inputs/testlink.ll index 263d9e77d1ab..22a66399be09 100644 --- a/test/Linker/Inputs/testlink.ll +++ b/test/Linker/Inputs/testlink.ll @@ -53,4 +53,6 @@ define internal void @testIntern() { ret void } -declare void @VecSizeCrash1(%VecSize) +define void @VecSizeCrash1(%VecSize) { + ret void +} diff --git a/test/Linker/Inputs/type-unique-dst-types2.ll b/test/Linker/Inputs/type-unique-dst-types2.ll index b565c6d73649..7770ea3cca07 100644 --- a/test/Linker/Inputs/type-unique-dst-types2.ll +++ b/test/Linker/Inputs/type-unique-dst-types2.ll @@ -1,3 +1,7 @@ %A.11 = type { %B } %B = type { i8 } @g1 = external global %A.11 + +define %A.11* @use_g1() { + ret %A.11* @g1 +} diff --git a/test/Linker/Inputs/type-unique-dst-types3.ll b/test/Linker/Inputs/type-unique-dst-types3.ll index c5794ad839a2..8a5ac2694791 
100644 --- a/test/Linker/Inputs/type-unique-dst-types3.ll +++ b/test/Linker/Inputs/type-unique-dst-types3.ll @@ -1,2 +1,6 @@ %A.11 = type opaque @g2 = external global %A.11 + +define %A.11* @use_g2() { + ret %A.11* @g2 +} diff --git a/test/Linker/opaque.ll b/test/Linker/opaque.ll index 4f3f398f8f1b..6fd1ae90d4f4 100644 --- a/test/Linker/opaque.ll +++ b/test/Linker/opaque.ll @@ -19,3 +19,7 @@ %C = type { %A } @g1 = external global %B + +define %B* @use_g1() { + ret %B* @g1 +} diff --git a/test/Linker/testlink.ll b/test/Linker/testlink.ll index 82a2229f57a0..6a316a3bf846 100644 --- a/test/Linker/testlink.ll +++ b/test/Linker/testlink.ll @@ -32,6 +32,11 @@ ; CHECK-DAG: @0 = external global i32 @0 = external global i32 + +define i32* @use0() { + ret i32* @0 +} + ; CHECK-DAG: @Inte = global i32 1 @Inte = global i32 1 @@ -101,4 +106,6 @@ define void @testIntern() { ret void } -declare void @VecSizeCrash(%VecSize) +define void @VecSizeCrash(%VecSize) { + ret void +} diff --git a/test/Linker/type-unique-dst-types.ll b/test/Linker/type-unique-dst-types.ll index 30aecbb970cb..1adad49de91d 100644 --- a/test/Linker/type-unique-dst-types.ll +++ b/test/Linker/type-unique-dst-types.ll @@ -17,3 +17,7 @@ %A = type { %B } %B = type { i8 } @g3 = external global %A + +define %A* @use_g3() { + ret %A* @g3 +} diff --git a/test/Linker/type-unique-src-type.ll b/test/Linker/type-unique-src-type.ll index 110ecc87e1b1..ab7322892e07 100644 --- a/test/Linker/type-unique-src-type.ll +++ b/test/Linker/type-unique-src-type.ll @@ -10,7 +10,7 @@ ; CHECK-NEXT: %B = type { %A } ; CHECK-NEXT: %A = type { i8 } -; CHECK: @g1 = external global %C.0 +; CHECK: @g1 = global %C.0 zeroinitializer ; CHECK: getelementptr %C.0, %C.0* null, i64 0, i32 0, i32 0 %A = type { i8 } @@ -21,4 +21,4 @@ define void @f1() { getelementptr %C, %C* null, i64 0, i32 0, i32 0 ret void } -@g1 = external global %C.0 +@g1 = global %C.0 zeroinitializer diff --git a/test/Linker/unnamed-addr1-a.ll b/test/Linker/unnamed-addr1-a.ll index 6ff0fd90b687..a96f089a99c0 100644 --- a/test/Linker/unnamed-addr1-a.ll +++ b/test/Linker/unnamed-addr1-a.ll @@ -15,6 +15,11 @@ define weak void @func-b() unnamed_addr { ret void } @global-c = common unnamed_addr global i32 0 ; CHECK-DAG: @global-c = common unnamed_addr global i32 0 @global-d = external global i32 + +define i32* @use-global-d() { + ret i32* @global-d +} + ; CHECK-DAG: @global-d = global i32 42 @global-e = external unnamed_addr global i32 ; CHECK-DAG: @global-e = unnamed_addr global i32 42 @@ -27,6 +32,11 @@ define weak void @func-b() unnamed_addr { ret void } ; CHECK-DAG: @alias-b = unnamed_addr alias i32, i32* @global-f declare void @func-c() +define void @use-func-c() { + call void @func-c() + ret void +} + ; CHECK-DAG: define weak void @func-c() { define weak void @func-d() { ret void } ; CHECK-DAG: define weak void @func-d() { diff --git a/test/Linker/weakextern.ll b/test/Linker/weakextern.ll index e1754e60547e..814550a907bb 100644 --- a/test/Linker/weakextern.ll +++ b/test/Linker/weakextern.ll @@ -6,3 +6,7 @@ @kallsyms_names = extern_weak global [0 x i8] @MyVar = extern_weak global i32 @Inte = extern_weak global i32 + +define weak [0 x i8]* @use_kallsyms_names() { + ret [0 x i8]* @kallsyms_names +} From 07b637e04d1e88688c2456da1bd02eef08682cf7 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Mon, 7 Dec 2015 17:35:56 +0000 Subject: [PATCH 176/364] [Orc] Removing traces of takeOwnershipOfBuffers left after r251560. Patch by Joshua Gerrard. Thanks Joshua! 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254919 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../Orc/ObjectTransformLayer.h | 8 ------ tools/llvm-rtdyld/llvm-rtdyld.cpp | 12 --------- .../Orc/ObjectTransformLayerTest.cpp | 27 ------------------- 3 files changed, 47 deletions(-) diff --git a/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h b/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h index 7af662085474..f96e83ed5a1a 100644 --- a/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h +++ b/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h @@ -87,14 +87,6 @@ class ObjectTransformLayer { BaseLayer.mapSectionAddress(H, LocalAddress, TargetAddr); } - // Ownership hack. - // FIXME: Remove this as soon as RuntimeDyldELF can apply relocations without - // referencing the original object. - template - void takeOwnershipOfBuffers(ObjSetHandleT H, OwningMBSet MBs) { - BaseLayer.takeOwnershipOfBuffers(H, std::move(MBs)); - } - /// @brief Access the transform functor directly. TransformFtor &getTransform() { return Transform; } diff --git a/tools/llvm-rtdyld/llvm-rtdyld.cpp b/tools/llvm-rtdyld/llvm-rtdyld.cpp index 59c9a0c990f3..6ee3a44b63bf 100644 --- a/tools/llvm-rtdyld/llvm-rtdyld.cpp +++ b/tools/llvm-rtdyld/llvm-rtdyld.cpp @@ -388,11 +388,6 @@ static int executeInput() { doPreallocation(MemMgr); RuntimeDyld Dyld(MemMgr, MemMgr); - // FIXME: Preserve buffers until resolveRelocations time to work around a bug - // in RuntimeDyldELF. - // This fixme should be fixed ASAP. This is a very brittle workaround. - std::vector> InputBuffers; - // If we don't have any input files, read from stdin. if (!InputFileList.size()) InputFileList.push_back("-"); @@ -409,7 +404,6 @@ static int executeInput() { return Error("unable to create object file: '" + EC.message() + "'"); ObjectFile &Obj = **MaybeObj; - InputBuffers.push_back(std::move(*InputBuffer)); // Load the object file Dyld.loadObject(Obj); @@ -656,11 +650,6 @@ static int linkAndVerify() { RuntimeDyldChecker Checker(Dyld, Disassembler.get(), InstPrinter.get(), llvm::dbgs()); - // FIXME: Preserve buffers until resolveRelocations time to work around a bug - // in RuntimeDyldELF. - // This fixme should be fixed ASAP. This is a very brittle workaround. - std::vector> InputBuffers; - // If we don't have any input files, read from stdin. 
if (!InputFileList.size()) InputFileList.push_back("-"); @@ -679,7 +668,6 @@ static int linkAndVerify() { return Error("unable to create object file: '" + EC.message() + "'"); ObjectFile &Obj = **MaybeObj; - InputBuffers.push_back(std::move(*InputBuffer)); // Load the object file Dyld.loadObject(Obj); diff --git a/unittests/ExecutionEngine/Orc/ObjectTransformLayerTest.cpp b/unittests/ExecutionEngine/Orc/ObjectTransformLayerTest.cpp index 41b2307cadd8..c88c94f17b1c 100644 --- a/unittests/ExecutionEngine/Orc/ObjectTransformLayerTest.cpp +++ b/unittests/ExecutionEngine/Orc/ObjectTransformLayerTest.cpp @@ -157,21 +157,6 @@ class MockBaseLayer { resetExpectations(); } - template - void takeOwnershipOfBuffers(ObjSetHandleT H, OwningMBSet MBs) { - EXPECT_EQ(MockObjSetHandle, H); - EXPECT_EQ(MockBufferSet, *MBs); - LastCalled = "takeOwnershipOfBuffers"; - } - void expectTakeOwnershipOfBuffers(ObjSetHandleT H, MockMemoryBufferSet *MBs) { - MockObjSetHandle = H; - MockBufferSet = *MBs; - } - void verifyTakeOwnershipOfBuffers() { - EXPECT_EQ("takeOwnershipOfBuffers", LastCalled); - resetExpectations(); - } - private: // Backing fields for remembering parameter/return values std::string LastCalled; @@ -275,18 +260,6 @@ TEST(ObjectTransformLayerTest, Main) { T1.mapSectionAddress(H, Buffer, MockAddress); M.verifyMapSectionAddress(); - // Test takeOwnershipOfBuffers, using unique pointer to buffer set - auto MockBufferSetPtr = llvm::make_unique(366); - M.expectTakeOwnershipOfBuffers(H, MockBufferSetPtr.get()); - T2.takeOwnershipOfBuffers(H, std::move(MockBufferSetPtr)); - M.verifyTakeOwnershipOfBuffers(); - - // Test takeOwnershipOfBuffers, using naked pointer to buffer set - MockMemoryBufferSet MockBufferSet = 266; - M.expectTakeOwnershipOfBuffers(H, &MockBufferSet); - T1.takeOwnershipOfBuffers(H, &MockBufferSet); - M.verifyTakeOwnershipOfBuffers(); - // Verify transform getter (non-const) MockObjectFile Mutatee = 277; MockObjectFile *Out = T2.getTransform()(&Mutatee); From bd58adf18d5a517be128887b57b67bd3ea976456 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Mon, 7 Dec 2015 17:39:48 +0000 Subject: [PATCH 177/364] Tighten checks so we can see existing codegen The 2-element vector case shows a surprising bug: we failed to eliminate ops on undefs, so there are 4 fmax calls even though there can only be 2 valid elements in the inputs. 
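For reference, the 2-element case is the llvm.maxnum.v2f32 test in this file
(copied below from the test itself); presumably the <2 x float> operands get
widened so the upper lanes are undef, yet the scalarized lowering still emits
an fmaxf libcall per lane, which is why the tightened checks expect four calls
where two would suffice:

  define <2 x float> @test_intrinsic_fmax_v2f32(<2 x float> %x, <2 x float> %y) {
    %z = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %x, <2 x float> %y) readnone
    ret <2 x float> %z
  }
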
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254920 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/fmaxnum.ll | 114 +++++++++++++++++++++++++++++++++--- 1 file changed, 106 insertions(+), 8 deletions(-) diff --git a/test/CodeGen/X86/fmaxnum.ll b/test/CodeGen/X86/fmaxnum.ll index 7aa087f92bdc..19041c9ff677 100644 --- a/test/CodeGen/X86/fmaxnum.ll +++ b/test/CodeGen/X86/fmaxnum.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=sse2 < %s | FileCheck %s -; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=avx < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=sse2 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=SSE +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=avx < %s | FileCheck %s --check-prefix=CHECK --check-prefix=AVX declare float @fmaxf(float, float) declare double @fmax(double, double) @@ -58,18 +58,116 @@ define x86_fp80 @test_intrinsic_fmaxl(x86_fp80 %x, x86_fp80 %y) { } ; CHECK-LABEL: @test_intrinsic_fmax_v2f32 -; CHECK: callq fmaxf -; CHECK: callq fmaxf +; SSE: movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSE-NEXT: callq fmaxf +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSE-NEXT: callq fmaxf +; SSE-NEXT: unpcklps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload +; SSE: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: callq fmaxf +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0] +; SSE-NEXT: callq fmaxf +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: unpcklps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload +; SSE: movaps %xmm1, %xmm0 +; SSE-NEXT: addq $72, %rsp +; SSE-NEXT: retq +; +; AVX: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; AVX-NEXT: callq fmaxf +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovshdup {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload +; AVX: vmovshdup {{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload +; AVX: callq fmaxf +; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vpermilpd $1, {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload +; AVX: vpermilpd $1, {{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload +; AVX: callq fmaxf +; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vpermilps $231, {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload +; AVX: vpermilps $231, {{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload +; AVX: callq fmaxf +; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = 
xmm1[0,1,2],xmm0[0] +; AVX-NEXT: addq $56, %rsp +; AVX-NEXT: retq define <2 x float> @test_intrinsic_fmax_v2f32(<2 x float> %x, <2 x float> %y) { %z = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %x, <2 x float> %y) readnone ret <2 x float> %z } ; CHECK-LABEL: @test_intrinsic_fmax_v4f32 -; CHECK: callq fmaxf -; CHECK: callq fmaxf -; CHECK: callq fmaxf -; CHECK: callq fmaxf +; SSE: movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSE-NEXT: callq fmaxf +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSE-NEXT: callq fmaxf +; SSE-NEXT: unpcklps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload +; SSE: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: callq fmaxf +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0] +; SSE-NEXT: callq fmaxf +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: unpcklps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload +; SSE: movaps %xmm1, %xmm0 +; SSE-NEXT: addq $72, %rsp +; SSE-NEXT: retq +; +; AVX: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; AVX-NEXT: callq fmaxf +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovshdup {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload +; AVX: vmovshdup {{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload +; AVX: callq fmaxf +; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vpermilpd $1, {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload +; AVX: vpermilpd $1, {{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload +; AVX: callq fmaxf +; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vpermilps $231, {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload +; AVX: vpermilps $231, {{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload +; AVX: callq fmaxf +; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX-NEXT: addq $56, %rsp +; AVX-NEXT: retq define <4 x float> @test_intrinsic_fmax_v4f32(<4 x float> %x, <4 x float> %y) { %z = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float> %y) readnone ret <4 x float> %z From f8a6223dd008c680193a723e685acbe46ea61836 Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Mon, 7 Dec 2015 18:46:41 +0000 Subject: [PATCH 178/364] [llvm-dwp] Restructure inputs for test case so they're all grouped together git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254922 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../llvm-dwp/Inputs/simple/{ => notypes}/a.dwo | Bin .../llvm-dwp/Inputs/simple/{ => notypes}/b.dwo | Bin 
.../Inputs/{type_units => simple/types}/a.dwo | Bin .../Inputs/{type_units => simple/types}/b.dwo | Bin test/tools/llvm-dwp/X86/simple.test | 4 ++-- 5 files changed, 2 insertions(+), 2 deletions(-) rename test/tools/llvm-dwp/Inputs/simple/{ => notypes}/a.dwo (100%) rename test/tools/llvm-dwp/Inputs/simple/{ => notypes}/b.dwo (100%) rename test/tools/llvm-dwp/Inputs/{type_units => simple/types}/a.dwo (100%) rename test/tools/llvm-dwp/Inputs/{type_units => simple/types}/b.dwo (100%) diff --git a/test/tools/llvm-dwp/Inputs/simple/a.dwo b/test/tools/llvm-dwp/Inputs/simple/notypes/a.dwo similarity index 100% rename from test/tools/llvm-dwp/Inputs/simple/a.dwo rename to test/tools/llvm-dwp/Inputs/simple/notypes/a.dwo diff --git a/test/tools/llvm-dwp/Inputs/simple/b.dwo b/test/tools/llvm-dwp/Inputs/simple/notypes/b.dwo similarity index 100% rename from test/tools/llvm-dwp/Inputs/simple/b.dwo rename to test/tools/llvm-dwp/Inputs/simple/notypes/b.dwo diff --git a/test/tools/llvm-dwp/Inputs/type_units/a.dwo b/test/tools/llvm-dwp/Inputs/simple/types/a.dwo similarity index 100% rename from test/tools/llvm-dwp/Inputs/type_units/a.dwo rename to test/tools/llvm-dwp/Inputs/simple/types/a.dwo diff --git a/test/tools/llvm-dwp/Inputs/type_units/b.dwo b/test/tools/llvm-dwp/Inputs/simple/types/b.dwo similarity index 100% rename from test/tools/llvm-dwp/Inputs/type_units/b.dwo rename to test/tools/llvm-dwp/Inputs/simple/types/b.dwo diff --git a/test/tools/llvm-dwp/X86/simple.test b/test/tools/llvm-dwp/X86/simple.test index 962e270a594e..d7365c814435 100644 --- a/test/tools/llvm-dwp/X86/simple.test +++ b/test/tools/llvm-dwp/X86/simple.test @@ -1,7 +1,7 @@ -RUN: llvm-dwp %p/../Inputs/simple/a.dwo %p/../Inputs/simple/b.dwo -o %t +RUN: llvm-dwp %p/../Inputs/simple/notypes/a.dwo %p/../Inputs/simple/notypes/b.dwo -o %t RUN: llvm-dwarfdump %t | FileCheck --check-prefix=CHECK --check-prefix=NOTYP %s RUN: llvm-objdump -h %t | FileCheck --check-prefix=NOTYPOBJ %s -RUN: llvm-dwp %p/../Inputs/type_units/a.dwo %p/../Inputs/type_units/b.dwo -o %t +RUN: llvm-dwp %p/../Inputs/simple/types/a.dwo %p/../Inputs/simple/types/b.dwo -o %t RUN: llvm-dwarfdump %t | FileCheck --check-prefix=CHECK --check-prefix=TYPES %s FIXME: For some reason, piping straight from llvm-dwp to llvm-dwarfdump doesn't behave well - looks like dwarfdump is reading/closes before dwp has finished. From 55aaa984cb3318df455b998f362646bf72414e10 Mon Sep 17 00:00:00 2001 From: Ron Lieberman Date: Mon, 7 Dec 2015 18:52:39 +0000 Subject: [PATCH 179/364] [Hexagon] Adding v60 test, vasr in particular. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254923 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/Hexagon/v60Vasr.ll | 247 ++++++++++++++++++++++++++++++++ 1 file changed, 247 insertions(+) create mode 100644 test/CodeGen/Hexagon/v60Vasr.ll diff --git a/test/CodeGen/Hexagon/v60Vasr.ll b/test/CodeGen/Hexagon/v60Vasr.ll new file mode 100644 index 000000000000..fb177f614f72 --- /dev/null +++ b/test/CodeGen/Hexagon/v60Vasr.ll @@ -0,0 +1,247 @@ +; RUN: llc -march=hexagon -O2 -mcpu=hexagonv60 < %s | FileCheck %s + +; CHECK: vasr(v{{[0-9]+}}.h,v{{[0-9]+}}.h,r{{[0-7]+}}):sat + +target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a:0-n16:32" +target triple = "hexagon-unknown--elf" + +%struct.buffer_t = type { i64, i8*, [4 x i32], [4 x i32], [4 x i32], i32, i8, i8, [6 x i8] } + +; Function Attrs: norecurse nounwind +define i32 @__test_vasr(%struct.buffer_t* noalias nocapture %f.buffer, %struct.buffer_t* noalias nocapture %g.buffer, %struct.buffer_t* noalias nocapture %res.buffer) #0 { +entry: + %buf_host = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %f.buffer, i32 0, i32 1 + %f.host = load i8*, i8** %buf_host, align 4 + %buf_dev = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %f.buffer, i32 0, i32 0 + %f.dev = load i64, i64* %buf_dev, align 8 + %0 = icmp eq i8* %f.host, null + %1 = icmp eq i64 %f.dev, 0 + %f.host_and_dev_are_null = and i1 %0, %1 + %buf_min = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %f.buffer, i32 0, i32 4, i32 0 + %f.min.0 = load i32, i32* %buf_min, align 4 + %buf_host10 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %g.buffer, i32 0, i32 1 + %g.host = load i8*, i8** %buf_host10, align 4 + %buf_dev11 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %g.buffer, i32 0, i32 0 + %g.dev = load i64, i64* %buf_dev11, align 8 + %2 = icmp eq i8* %g.host, null + %3 = icmp eq i64 %g.dev, 0 + %g.host_and_dev_are_null = and i1 %2, %3 + %buf_min22 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %g.buffer, i32 0, i32 4, i32 0 + %g.min.0 = load i32, i32* %buf_min22, align 4 + %buf_host27 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %res.buffer, i32 0, i32 1 + %res.host = load i8*, i8** %buf_host27, align 4 + %buf_dev28 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %res.buffer, i32 0, i32 0 + %res.dev = load i64, i64* %buf_dev28, align 8 + %4 = icmp eq i8* %res.host, null + %5 = icmp eq i64 %res.dev, 0 + %res.host_and_dev_are_null = and i1 %4, %5 + %buf_extent31 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %res.buffer, i32 0, i32 2, i32 0 + %res.extent.0 = load i32, i32* %buf_extent31, align 4 + %buf_min39 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %res.buffer, i32 0, i32 4, i32 0 + %res.min.0 = load i32, i32* %buf_min39, align 4 + %6 = add nsw i32 %res.extent.0, -1 + %7 = and i32 %6, -64 + %8 = add i32 %res.min.0, 63 + %9 = add i32 %8, %7 + %10 = add nsw i32 %res.min.0, %res.extent.0 + %11 = add nsw i32 %10, -1 + %12 = icmp slt i32 %9, %11 + %13 = select i1 %12, i32 %9, i32 %11 + %14 = add nsw i32 %10, -64 + %15 = icmp slt i32 %res.min.0, %14 + %16 = select i1 %15, i32 %res.min.0, i32 %14 + %f.extent.0.required.s = sub nsw i32 %13, %16 + br i1 %f.host_and_dev_are_null, label %true_bb, label %after_bb + +true_bb: ; preds = %entry + %buf_elem_size44 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %f.buffer, i32 0, i32 5 + store i32 1, i32* %buf_elem_size44, 
align 4 + store i32 %16, i32* %buf_min, align 4 + %17 = add nsw i32 %f.extent.0.required.s, 1 + %buf_extent46 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %f.buffer, i32 0, i32 2, i32 0 + store i32 %17, i32* %buf_extent46, align 4 + %buf_stride47 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %f.buffer, i32 0, i32 3, i32 0 + store i32 1, i32* %buf_stride47, align 4 + %buf_min48 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %f.buffer, i32 0, i32 4, i32 1 + store i32 0, i32* %buf_min48, align 4 + %buf_extent49 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %f.buffer, i32 0, i32 2, i32 1 + store i32 0, i32* %buf_extent49, align 4 + %buf_stride50 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %f.buffer, i32 0, i32 3, i32 1 + store i32 0, i32* %buf_stride50, align 4 + %buf_min51 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %f.buffer, i32 0, i32 4, i32 2 + store i32 0, i32* %buf_min51, align 4 + %buf_extent52 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %f.buffer, i32 0, i32 2, i32 2 + store i32 0, i32* %buf_extent52, align 4 + %buf_stride53 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %f.buffer, i32 0, i32 3, i32 2 + store i32 0, i32* %buf_stride53, align 4 + %buf_min54 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %f.buffer, i32 0, i32 4, i32 3 + store i32 0, i32* %buf_min54, align 4 + %buf_extent55 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %f.buffer, i32 0, i32 2, i32 3 + store i32 0, i32* %buf_extent55, align 4 + %buf_stride56 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %f.buffer, i32 0, i32 3, i32 3 + store i32 0, i32* %buf_stride56, align 4 + br label %after_bb + +after_bb: ; preds = %true_bb, %entry + br i1 %g.host_and_dev_are_null, label %true_bb57, label %after_bb59 + +true_bb57: ; preds = %after_bb + %buf_elem_size60 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %g.buffer, i32 0, i32 5 + store i32 1, i32* %buf_elem_size60, align 4 + store i32 %16, i32* %buf_min22, align 4 + %18 = add nsw i32 %f.extent.0.required.s, 1 + %buf_extent62 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %g.buffer, i32 0, i32 2, i32 0 + store i32 %18, i32* %buf_extent62, align 4 + %buf_stride63 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %g.buffer, i32 0, i32 3, i32 0 + store i32 1, i32* %buf_stride63, align 4 + %buf_min64 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %g.buffer, i32 0, i32 4, i32 1 + store i32 0, i32* %buf_min64, align 4 + %buf_extent65 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %g.buffer, i32 0, i32 2, i32 1 + store i32 0, i32* %buf_extent65, align 4 + %buf_stride66 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %g.buffer, i32 0, i32 3, i32 1 + store i32 0, i32* %buf_stride66, align 4 + %buf_min67 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %g.buffer, i32 0, i32 4, i32 2 + store i32 0, i32* %buf_min67, align 4 + %buf_extent68 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %g.buffer, i32 0, i32 2, i32 2 + store i32 0, i32* %buf_extent68, align 4 + %buf_stride69 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %g.buffer, i32 0, i32 3, i32 2 + store i32 0, i32* %buf_stride69, align 4 + %buf_min70 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %g.buffer, i32 0, i32 4, i32 3 + store i32 0, i32* %buf_min70, align 4 + %buf_extent71 = getelementptr inbounds %struct.buffer_t, 
%struct.buffer_t* %g.buffer, i32 0, i32 2, i32 3 + store i32 0, i32* %buf_extent71, align 4 + %buf_stride72 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %g.buffer, i32 0, i32 3, i32 3 + store i32 0, i32* %buf_stride72, align 4 + br label %after_bb59 + +after_bb59: ; preds = %true_bb57, %after_bb + br i1 %res.host_and_dev_are_null, label %after_bb75.thread, label %after_bb75 + +after_bb75.thread: ; preds = %after_bb59 + %buf_elem_size76 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %res.buffer, i32 0, i32 5 + store i32 1, i32* %buf_elem_size76, align 4 + store i32 %16, i32* %buf_min39, align 4 + %19 = add nsw i32 %f.extent.0.required.s, 1 + store i32 %19, i32* %buf_extent31, align 4 + %buf_stride79 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %res.buffer, i32 0, i32 3, i32 0 + store i32 1, i32* %buf_stride79, align 4 + %buf_min80 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %res.buffer, i32 0, i32 4, i32 1 + store i32 0, i32* %buf_min80, align 4 + %buf_extent81 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %res.buffer, i32 0, i32 2, i32 1 + store i32 0, i32* %buf_extent81, align 4 + %buf_stride82 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %res.buffer, i32 0, i32 3, i32 1 + store i32 0, i32* %buf_stride82, align 4 + %buf_min83 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %res.buffer, i32 0, i32 4, i32 2 + store i32 0, i32* %buf_min83, align 4 + %buf_extent84 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %res.buffer, i32 0, i32 2, i32 2 + store i32 0, i32* %buf_extent84, align 4 + %buf_stride85 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %res.buffer, i32 0, i32 3, i32 2 + store i32 0, i32* %buf_stride85, align 4 + %buf_min86 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %res.buffer, i32 0, i32 4, i32 3 + store i32 0, i32* %buf_min86, align 4 + %buf_extent87 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %res.buffer, i32 0, i32 2, i32 3 + store i32 0, i32* %buf_extent87, align 4 + %buf_stride88 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %res.buffer, i32 0, i32 3, i32 3 + store i32 0, i32* %buf_stride88, align 4 + br label %destructor_block + +after_bb75: ; preds = %after_bb59 + %20 = or i1 %f.host_and_dev_are_null, %g.host_and_dev_are_null + br i1 %20, label %destructor_block, label %"produce res" + +"produce res": ; preds = %after_bb75 + %21 = ashr i32 %res.extent.0, 6 + %22 = icmp sgt i32 %21, 0 + br i1 %22, label %"for res.s0.x.x", label %"end for res.s0.x.x", !prof !4 + +"for res.s0.x.x": ; preds = %"for res.s0.x.x", %"produce res" + %res.s0.x.x = phi i32 [ %41, %"for res.s0.x.x" ], [ 0, %"produce res" ] + %23 = shl nsw i32 %res.s0.x.x, 6 + %24 = add nsw i32 %23, %res.min.0 + %25 = sub nsw i32 %24, %f.min.0 + %26 = getelementptr inbounds i8, i8* %f.host, i32 %25 + %27 = bitcast i8* %26 to <16 x i32>* + %28 = load <16 x i32>, <16 x i32>* %27, align 1, !tbaa !5 + %29 = tail call <32 x i32> @llvm.hexagon.V6.vzb(<16 x i32> %28) + %30 = sub nsw i32 %24, %g.min.0 + %31 = getelementptr inbounds i8, i8* %g.host, i32 %30 + %32 = bitcast i8* %31 to <16 x i32>* + %33 = load <16 x i32>, <16 x i32>* %32, align 1, !tbaa !8 + %34 = tail call <32 x i32> @llvm.hexagon.V6.vzb(<16 x i32> %33) + %35 = tail call <32 x i32> @llvm.hexagon.V6.vaddh.dv(<32 x i32> %29, <32 x i32> %34) + %36 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %35) + %37 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %35) + %38 = tail 
call <16 x i32> @llvm.hexagon.V6.vasrhubsat(<16 x i32> %36, <16 x i32> %37, i32 4) + %39 = getelementptr inbounds i8, i8* %res.host, i32 %23 + %40 = bitcast i8* %39 to <16 x i32>* + store <16 x i32> %38, <16 x i32>* %40, align 1, !tbaa !10 + %41 = add nuw nsw i32 %res.s0.x.x, 1 + %42 = icmp eq i32 %41, %21 + br i1 %42, label %"end for res.s0.x.x", label %"for res.s0.x.x" + +"end for res.s0.x.x": ; preds = %"for res.s0.x.x", %"produce res" + %43 = add nsw i32 %res.extent.0, 63 + %44 = ashr i32 %43, 6 + %45 = icmp sgt i32 %44, %21 + br i1 %45, label %"for res.s0.x.x92.preheader", label %destructor_block, !prof !4 + +"for res.s0.x.x92.preheader": ; preds = %"end for res.s0.x.x" + %46 = sub i32 -64, %f.min.0 + %47 = add i32 %46, %10 + %48 = getelementptr inbounds i8, i8* %f.host, i32 %47 + %49 = bitcast i8* %48 to <16 x i32>* + %50 = load <16 x i32>, <16 x i32>* %49, align 1 + %51 = tail call <32 x i32> @llvm.hexagon.V6.vzb(<16 x i32> %50) + %52 = sub i32 -64, %g.min.0 + %53 = add i32 %52, %10 + %54 = getelementptr inbounds i8, i8* %g.host, i32 %53 + %55 = bitcast i8* %54 to <16 x i32>* + %56 = load <16 x i32>, <16 x i32>* %55, align 1 + %57 = tail call <32 x i32> @llvm.hexagon.V6.vzb(<16 x i32> %56) + %58 = tail call <32 x i32> @llvm.hexagon.V6.vaddh.dv(<32 x i32> %51, <32 x i32> %57) + %59 = tail call <16 x i32> @llvm.hexagon.V6.lo(<32 x i32> %58) + %60 = add nsw i32 %res.extent.0, -64 + %61 = getelementptr inbounds i8, i8* %res.host, i32 %60 + %62 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %58) + %63 = tail call <16 x i32> @llvm.hexagon.V6.vasrhubsat(<16 x i32> %62, <16 x i32> %59, i32 4) + %64 = bitcast i8* %61 to <16 x i32>* + store <16 x i32> %63, <16 x i32>* %64, align 1, !tbaa !10 + br label %destructor_block + +destructor_block: ; preds = %"for res.s0.x.x92.preheader", %"end for res.s0.x.x", %after_bb75, %after_bb75.thread + ret i32 0 +} + +; Function Attrs: nounwind readnone +declare <32 x i32> @llvm.hexagon.V6.vaddh.dv(<32 x i32>, <32 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <32 x i32> @llvm.hexagon.V6.vzb(<16 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.hi(<32 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.lo(<32 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <16 x i32> @llvm.hexagon.V6.vasrhubsat(<16 x i32>, <16 x i32>, i32) #1 + +attributes #0 = { norecurse nounwind } +attributes #1 = { nounwind readnone } + +!llvm.ident = !{!0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0} +!llvm.module.flags = !{!1, !2, !3} + +!0 = !{!"Clang $LLVM_VERSION_MAJOR.$LLVM_VERSION_MINOR (based on LLVM 3.8.0)"} +!1 = !{i32 2, !"halide_use_soft_float_abi", i32 0} +!2 = !{i32 2, !"halide_mcpu", !"hexagonv60"} +!3 = !{i32 2, !"halide_mattrs", !"+hvx"} +!4 = !{!"branch_weights", i32 1073741824, i32 0} +!5 = !{!6, !6, i64 0} +!6 = !{!"f", !7} +!7 = !{!"Halide buffer"} +!8 = !{!9, !9, i64 0} +!9 = !{!"g", !7} +!10 = !{!11, !11, i64 0} +!11 = !{!"res", !7} From fbee4fa427e8b0959a95654924714449c7e076ba Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Mon, 7 Dec 2015 19:13:40 +0000 Subject: [PATCH 180/364] remove redundant check: optForSize() includes a check for the minsize attribute; NFCI git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254925 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86OptimizeLEAs.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/Target/X86/X86OptimizeLEAs.cpp b/lib/Target/X86/X86OptimizeLEAs.cpp index 
9171786707d8..da83c8ad8248 100644 --- a/lib/Target/X86/X86OptimizeLEAs.cpp +++ b/lib/Target/X86/X86OptimizeLEAs.cpp @@ -294,11 +294,9 @@ bool OptimizeLEAPass::removeRedundantAddrCalc( bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; - bool OptSize = MF.getFunction()->optForSize(); - bool MinSize = MF.getFunction()->optForMinSize(); // Perform this optimization only if we care about code size. - if (!OptSize && !MinSize) + if (!MF.getFunction()->optForSize()) return false; MRI = &MF.getRegInfo(); From 5d0d98f6ec23fc83ec1b4f0816bcba9d393db1c4 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Mon, 7 Dec 2015 19:21:11 +0000 Subject: [PATCH 181/364] [ThinLTO] Support for specifying function index from pass manager Summary: Add a field on the PassManagerBuilder that clang or gold can use to pass down a pointer to the function index in memory to use for importing when the ThinLTO backend is triggered. Add support to supply this to the function import pass. Reviewers: joker.eph, dexonsmith Subscribers: davidxl, llvm-commits, joker.eph Differential Revision: http://reviews.llvm.org/D15024 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254926 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/LinkAllPasses.h | 1 + include/llvm/Transforms/IPO.h | 5 +++ .../llvm/Transforms/IPO/PassManagerBuilder.h | 5 +++ lib/Transforms/IPO/FunctionImport.cpp | 39 +++++++++++++------ lib/Transforms/IPO/PassManagerBuilder.cpp | 5 +++ 5 files changed, 44 insertions(+), 11 deletions(-) diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h index fbc112ba45b7..1b22d01a3a25 100644 --- a/include/llvm/LinkAllPasses.h +++ b/include/llvm/LinkAllPasses.h @@ -86,6 +86,7 @@ namespace { (void) llvm::createDomViewerPass(); (void) llvm::createGCOVProfilerPass(); (void) llvm::createInstrProfilingPass(); + (void) llvm::createFunctionImportPass(); (void) llvm::createFunctionInliningPass(); (void) llvm::createAlwaysInlinerPass(); (void) llvm::createGlobalDCEPass(); diff --git a/include/llvm/Transforms/IPO.h b/include/llvm/Transforms/IPO.h index 96ddc6eceed2..eabf0556babd 100644 --- a/include/llvm/Transforms/IPO.h +++ b/include/llvm/Transforms/IPO.h @@ -20,6 +20,7 @@ namespace llvm { +class FunctionInfoIndex; class ModulePass; class Pass; class Function; @@ -85,6 +86,10 @@ ModulePass *createEliminateAvailableExternallyPass(); ModulePass *createGVExtractionPass(std::vector& GVs, bool deleteFn = false); +//===----------------------------------------------------------------------===// +/// This pass performs iterative function importing from other modules. +ModulePass *createFunctionImportPass(FunctionInfoIndex *Index = nullptr); + //===----------------------------------------------------------------------===// /// createFunctionInliningPass - Return a new pass object that uses a heuristic /// to inline direct function calls to small functions. diff --git a/include/llvm/Transforms/IPO/PassManagerBuilder.h b/include/llvm/Transforms/IPO/PassManagerBuilder.h index ef01fa350531..70b785f9efa3 100644 --- a/include/llvm/Transforms/IPO/PassManagerBuilder.h +++ b/include/llvm/Transforms/IPO/PassManagerBuilder.h @@ -15,9 +15,11 @@ #ifndef LLVM_TRANSFORMS_IPO_PASSMANAGERBUILDER_H #define LLVM_TRANSFORMS_IPO_PASSMANAGERBUILDER_H +#include #include namespace llvm { +class FunctionInfoIndex; class Pass; class TargetLibraryInfoImpl; class TargetMachine; @@ -114,6 +116,9 @@ class PassManagerBuilder { /// added to the per-module passes. 
Pass *Inliner; + /// The function summary index to use for function importing. + FunctionInfoIndex *FunctionIndex; + bool DisableTailCalls; bool DisableUnitAtATime; bool DisableUnrollLoops; diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp index c2359a8a172e..67d77adb650a 100644 --- a/lib/Transforms/IPO/FunctionImport.cpp +++ b/lib/Transforms/IPO/FunctionImport.cpp @@ -256,23 +256,38 @@ getFunctionIndexForFile(StringRef Path, std::string &Error, /// Pass that performs cross-module function import provided a summary file. class FunctionImportPass : public ModulePass { + /// Optional function summary index to use for importing, otherwise + /// the summary-file option must be specified. + FunctionInfoIndex *Index; public: /// Pass identification, replacement for typeid static char ID; - explicit FunctionImportPass() : ModulePass(ID) {} + /// Specify pass name for debug output + const char *getPassName() const override { + return "Function Importing"; + } + + explicit FunctionImportPass(FunctionInfoIndex *Index = nullptr) + : ModulePass(ID), Index(Index) {} bool runOnModule(Module &M) override { - if (SummaryFile.empty()) { - report_fatal_error("error: -function-import requires -summary-file\n"); - } - std::string Error; - std::unique_ptr Index = - getFunctionIndexForFile(SummaryFile, Error, diagnosticHandler); - if (!Index) { - errs() << "Error loading file '" << SummaryFile << "': " << Error << "\n"; - return false; + if (SummaryFile.empty() && !Index) + report_fatal_error("error: -function-import requires -summary-file or " + "file from frontend\n"); + std::unique_ptr IndexPtr; + if (!SummaryFile.empty()) { + if (Index) + report_fatal_error("error: -summary-file and index from frontend\n"); + std::string Error; + IndexPtr = getFunctionIndexForFile(SummaryFile, Error, diagnosticHandler); + if (!IndexPtr) { + errs() << "Error loading file '" << SummaryFile << "': " << Error + << "\n"; + return false; + } + Index = IndexPtr.get(); } // Perform the import now. @@ -293,5 +308,7 @@ INITIALIZE_PASS_END(FunctionImportPass, "function-import", "Summary Based Function Import", false, false) namespace llvm { -Pass *createFunctionImportPass() { return new FunctionImportPass(); } +Pass *createFunctionImportPass(FunctionInfoIndex *Index = nullptr) { + return new FunctionImportPass(Index); +} } diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index ec6f21e8c64f..b8d1b7e78e35 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/Passes.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/FunctionInfo.h" #include "llvm/IR/Verifier.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/Support/CommandLine.h" @@ -108,6 +109,7 @@ PassManagerBuilder::PassManagerBuilder() { SizeLevel = 0; LibraryInfo = nullptr; Inliner = nullptr; + FunctionIndex = nullptr; DisableUnitAtATime = false; DisableUnrollLoops = false; BBVectorize = RunBBVectorization; @@ -476,6 +478,9 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { // Provide AliasAnalysis services for optimizations. addInitialAliasAnalysisPasses(PM); + if (FunctionIndex) + PM.add(createFunctionImportPass(FunctionIndex)); + // Propagate constants at call sites into the functions they call. 
This // opens opportunities for globalopt (and inlining) by substituting function // pointers passed as arguments to direct uses of functions. From 322ee9e42128124fc21f69d0d1126bd74593b380 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Mon, 7 Dec 2015 19:21:39 +0000 Subject: [PATCH 182/364] fix 'the the '; NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254928 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/ValueTracking.h | 2 +- lib/Analysis/LoopAccessAnalysis.cpp | 2 +- lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp | 2 +- lib/Target/Hexagon/HexagonInstrInfo.cpp | 2 +- tools/dsymutil/DwarfLinker.cpp | 2 +- tools/llc/llc.cpp | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/llvm/Analysis/ValueTracking.h b/include/llvm/Analysis/ValueTracking.h index eb2c000e07cd..8e0291068472 100644 --- a/include/llvm/Analysis/ValueTracking.h +++ b/include/llvm/Analysis/ValueTracking.h @@ -286,7 +286,7 @@ namespace llvm { /// Returns true if the result or effects of the given instructions \p I /// depend on or influence global memory. - /// Memory dependence arises for example if the the instruction reads from + /// Memory dependence arises for example if the instruction reads from /// memory or may produce effects or undefined behaviour. Memory dependent /// instructions generally cannot be reorderd with respect to other memory /// dependent instructions or moved into non-dominated basic blocks. diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp index 49b28078c976..b2670bf48dd8 100644 --- a/lib/Analysis/LoopAccessAnalysis.cpp +++ b/lib/Analysis/LoopAccessAnalysis.cpp @@ -289,7 +289,7 @@ void RuntimePointerChecking::groupChecks( // don't process them twice. SmallSet Seen; - // Go through all equivalence classes, get the the "pointer check groups" + // Go through all equivalence classes, get the "pointer check groups" // and add them to the overall solution. We use the order in which accesses // appear in 'Pointers' to enforce determinism. for (unsigned I = 0; I < Pointers.size(); ++I) { diff --git a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp index 64050824c10b..a8622a96527c 100644 --- a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp +++ b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp @@ -1592,7 +1592,7 @@ int HexagonAsmParser::processInstruction(MCInst &Inst, // not use the other opcode as it is a legacy artifact of TD files. int64_t Value; if (MO.getExpr()->evaluateAsAbsolute(Value)) { - // if the the operand can fit within a 7:2 field + // if the operand can fit within a 7:2 field if (Value < (1 << 8) && Value >= -(1 << 8)) { SMLoc myLoc = Operands[2]->getStartLoc(); // # is left in startLoc in the case of ## diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp index 3dc49337ecb5..2862468563c2 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -1475,7 +1475,7 @@ bool HexagonInstrInfo::isComplex(const MachineInstr *MI) const { } -// Return true if the the instruction is a compund branch instruction. +// Return true if the instruction is a compund branch instruction. 
bool HexagonInstrInfo::isCompoundBranchInstr(const MachineInstr *MI) const { return (getType(MI) == HexagonII::TypeCOMPOUND && MI->isBranch()); } diff --git a/tools/dsymutil/DwarfLinker.cpp b/tools/dsymutil/DwarfLinker.cpp index 7ac6f8ed5e3d..e8877a5da894 100644 --- a/tools/dsymutil/DwarfLinker.cpp +++ b/tools/dsymutil/DwarfLinker.cpp @@ -2826,7 +2826,7 @@ void DwarfLinker::patchRangesForUnit(const CompileUnit &Unit, uint64_t OrigLowPc = OrigUnitDie->getAttributeValueAsAddress( &OrigUnit, dwarf::DW_AT_low_pc, -1ULL); // Ranges addresses are based on the unit's low_pc. Compute the - // offset we need to apply to adapt to the the new unit's low_pc. + // offset we need to apply to adapt to the new unit's low_pc. int64_t UnitPcOffset = 0; if (OrigLowPc != -1ULL) UnitPcOffset = int64_t(OrigLowPc) - Unit.getLowPc(); diff --git a/tools/llc/llc.cpp b/tools/llc/llc.cpp index 531aba1f64bf..bffa39fd9e5c 100644 --- a/tools/llc/llc.cpp +++ b/tools/llc/llc.cpp @@ -100,7 +100,7 @@ static cl::opt AsmVerbose("asm-verbose", static cl::opt CompileTwice("compile-twice", cl::Hidden, cl::desc("Run everything twice, re-using the same pass " - "manager and verify the the result is the same."), + "manager and verify the result is the same."), cl::init(false)); static int compileModule(char **, LLVMContext &); From 3317c77cea4b033fa0dc447e6b008dc1e9278094 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Mon, 7 Dec 2015 19:31:34 +0000 Subject: [PATCH 183/364] don't repeat function names in comments; NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254930 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86.h | 53 +++++++++++++++++++------------------------- 1 file changed, 23 insertions(+), 30 deletions(-) diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index 6bdb07d1df04..fbec6626d99d 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -23,54 +23,47 @@ class FunctionPass; class ImmutablePass; class X86TargetMachine; -/// createX86ISelDag - This pass converts a legalized DAG into a -/// X86-specific DAG, ready for instruction scheduling. -/// +/// This pass converts a legalized DAG into a X86-specific DAG, ready for +/// instruction scheduling. FunctionPass *createX86ISelDag(X86TargetMachine &TM, CodeGenOpt::Level OptLevel); -/// createX86GlobalBaseRegPass - This pass initializes a global base -/// register for PIC on x86-32. +/// This pass initializes a global base register for PIC on x86-32. FunctionPass* createX86GlobalBaseRegPass(); -/// createCleanupLocalDynamicTLSPass() - This pass combines multiple accesses -/// to local-dynamic TLS variables so that the TLS base address for the module -/// is only fetched once per execution path through the function. +/// This pass combines multiple accesses to local-dynamic TLS variables so that +/// the TLS base address for the module is only fetched once per execution path +/// through the function. FunctionPass *createCleanupLocalDynamicTLSPass(); -/// createX86FloatingPointStackifierPass - This function returns a pass which -/// converts floating point register references and pseudo instructions into -/// floating point stack references and physical instructions. -/// +/// This function returns a pass which converts floating-point register +/// references and pseudo instructions into floating-point stack references and +/// physical instructions. 
FunctionPass *createX86FloatingPointStackifierPass(); -/// createX86IssueVZeroUpperPass - This pass inserts AVX vzeroupper instructions -/// before each call to avoid transition penalty between functions encoded with -/// AVX and SSE. +/// This pass inserts AVX vzeroupper instructions before each call to avoid +/// transition penalty between functions encoded with AVX and SSE. FunctionPass *createX86IssueVZeroUpperPass(); -/// createX86PadShortFunctions - Return a pass that pads short functions -/// with NOOPs. This will prevent a stall when returning on the Atom. +/// Return a pass that pads short functions with NOOPs. +/// This will prevent a stall when returning on the Atom. FunctionPass *createX86PadShortFunctions(); -/// createX86FixupLEAs - Return a a pass that selectively replaces -/// certain instructions (like add, sub, inc, dec, some shifts, -/// and some multiplies) by equivalent LEA instructions, in order -/// to eliminate execution delays in some Atom processors. + +/// Return a a pass that selectively replaces certain instructions (like add, +/// sub, inc, dec, some shifts, and some multiplies) by equivalent LEA +/// instructions, in order to eliminate execution delays in some processors. FunctionPass *createX86FixupLEAs(); -/// createX86OptimizeLEAs() - Return a pass that removes redundant -/// address recalculations. +/// Return a pass that removes redundant address recalculations. FunctionPass *createX86OptimizeLEAs(); -/// createX86CallFrameOptimization - Return a pass that optimizes -/// the code-size of x86 call sequences. This is done by replacing -/// esp-relative movs with pushes. +/// Return a pass that optimizes the code-size of x86 call sequences. This is +/// done by replacing esp-relative movs with pushes. FunctionPass *createX86CallFrameOptimization(); -/// createX86WinEHStatePass - Return an IR pass that inserts EH registration -/// stack objects and explicit EH state updates. This pass must run after EH -/// preparation, which does Windows-specific but architecture-neutral -/// preparation. +/// Return an IR pass that inserts EH registration stack objects and explicit +/// EH state updates. This pass must run after EH preparation, which does +/// Windows-specific but architecture-neutral preparation. FunctionPass *createX86WinEHStatePass(); /// Return a Machine IR pass that expands X86-specific pseudo From a16511be95e9579b1a60497ae5fe70ac41b552b1 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Mon, 7 Dec 2015 19:53:38 +0000 Subject: [PATCH 184/364] Fix function return type in declaration (bot errors from r254926). Try to fix bot build errors from r254926 by correcting the function return type. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254934 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Transforms/IPO.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/llvm/Transforms/IPO.h b/include/llvm/Transforms/IPO.h index eabf0556babd..04032d3b328a 100644 --- a/include/llvm/Transforms/IPO.h +++ b/include/llvm/Transforms/IPO.h @@ -88,7 +88,7 @@ ModulePass *createGVExtractionPass(std::vector& GVs, bool //===----------------------------------------------------------------------===// /// This pass performs iterative function importing from other modules. 
-ModulePass *createFunctionImportPass(FunctionInfoIndex *Index = nullptr); +Pass *createFunctionImportPass(FunctionInfoIndex *Index = nullptr); //===----------------------------------------------------------------------===// /// createFunctionInliningPass - Return a new pass object that uses a heuristic From 7d9752d1d3cd23f31a258804ecbb7e04b9813865 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 7 Dec 2015 20:36:00 +0000 Subject: [PATCH 185/364] Fix line endings git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254939 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/avx2-vbroadcast.ll | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll index 186f50873650..a18a587e4a66 100644 --- a/test/CodeGen/X86/avx2-vbroadcast.ll +++ b/test/CodeGen/X86/avx2-vbroadcast.ll @@ -300,8 +300,8 @@ entry: define <8 x float> @load_splat_8f32_4f32_33333333(<4 x float>* %ptr) nounwind uwtable readnone ssp { ; CHECK-LABEL: load_splat_8f32_4f32_33333333: ; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vmovaps (%rdi), %xmm0 -; CHECK-NEXT: vbroadcastss LCPI18_0(%rip), %ymm1 +; CHECK-NEXT: vmovaps (%rdi), %xmm0 +; CHECK-NEXT: vbroadcastss LCPI18_0(%rip), %ymm1 ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq entry: From 4e2c148ec86b66853f64326a7ee8e85afd479ae1 Mon Sep 17 00:00:00 2001 From: Kit Barton Date: Mon, 7 Dec 2015 20:50:29 +0000 Subject: [PATCH 186/364] [PPC64] Convert bool literals to i32 Convert i1 values to i32 values if they should be allocated in GPRs instead of CRs. Phabricator: http://reviews.llvm.org/D14064 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254942 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/PowerPC/CMakeLists.txt | 1 + lib/Target/PowerPC/PPC.h | 2 + lib/Target/PowerPC/PPCBoolRetToInt.cpp | 253 +++++++++++++++++++++++ lib/Target/PowerPC/PPCTargetMachine.cpp | 5 + test/CodeGen/PowerPC/BoolRetToIntTest.ll | 203 ++++++++++++++++++ 5 files changed, 464 insertions(+) create mode 100644 lib/Target/PowerPC/PPCBoolRetToInt.cpp create mode 100644 test/CodeGen/PowerPC/BoolRetToIntTest.ll diff --git a/lib/Target/PowerPC/CMakeLists.txt b/lib/Target/PowerPC/CMakeLists.txt index df6e08db8d35..c31ababafbe7 100644 --- a/lib/Target/PowerPC/CMakeLists.txt +++ b/lib/Target/PowerPC/CMakeLists.txt @@ -13,6 +13,7 @@ tablegen(LLVM PPCGenSubtargetInfo.inc -gen-subtarget) add_public_tablegen_target(PowerPCCommonTableGen) add_llvm_target(PowerPCCodeGen + PPCBoolRetToInt.cpp PPCAsmPrinter.cpp PPCBranchSelector.cpp PPCCTRLoops.cpp diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h index e157fd37c6e1..a259ed3fd327 100644 --- a/lib/Target/PowerPC/PPC.h +++ b/lib/Target/PowerPC/PPC.h @@ -45,10 +45,12 @@ namespace llvm { FunctionPass *createPPCBranchSelectionPass(); FunctionPass *createPPCISelDag(PPCTargetMachine &TM); FunctionPass *createPPCTLSDynamicCallPass(); + FunctionPass *createPPCBoolRetToIntPass(); void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, AsmPrinter &AP, bool isDarwin); void initializePPCVSXFMAMutatePass(PassRegistry&); + void initializePPCBoolRetToIntPass(PassRegistry&); extern char &PPCVSXFMAMutateID; namespace PPCII { diff --git a/lib/Target/PowerPC/PPCBoolRetToInt.cpp b/lib/Target/PowerPC/PPCBoolRetToInt.cpp new file mode 100644 index 000000000000..7920240bc2b9 --- /dev/null +++ b/lib/Target/PowerPC/PPCBoolRetToInt.cpp @@ -0,0 +1,253 @@ +//===- PPCBoolRetToInt.cpp - Convert bool literals to i32 if they are returned ==// +// 
+// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements converting i1 values to i32 if they could be more +// profitably allocated as GPRs rather than CRs. This pass will become totally +// unnecessary if Register Bank Allocation and Global Instruction Selection ever +// go upstream. +// +// Presently, the pass converts i1 Constants, and Arguments to i32 if the +// transitive closure of their uses includes only PHINodes, CallInsts, and +// ReturnInsts. The rational is that arguments are generally passed and returned +// in GPRs rather than CRs, so casting them to i32 at the LLVM IR level will +// actually save casts at the Machine Instruction level. +// +// It might be useful to expand this pass to add bit-wise operations to the list +// of safe transitive closure types. Also, we miss some opportunities when LLVM +// represents logical AND and OR operations with control flow rather than data +// flow. For example by lowering the expression: return (A && B && C) +// +// as: return A ? true : B && C. +// +// There's code in SimplifyCFG that code be used to turn control flow in data +// flow using SelectInsts. Selects are slow on some architectures (P7/P8), so +// this probably isn't good in general, but for the special case of i1, the +// Selects could be further lowered to bit operations that are fast everywhere. +// +//===----------------------------------------------------------------------===// + +#include "PPC.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Pass.h" + +using namespace llvm; + +namespace { + +#define DEBUG_TYPE "bool-ret-to-int" + +STATISTIC(NumBoolRetPromotion, + "Number of times a bool feeding a RetInst was promoted to an int"); +STATISTIC(NumBoolCallPromotion, + "Number of times a bool feeding a CallInst was promoted to an int"); +STATISTIC(NumBoolToIntPromotion, + "Total number of times a bool was promoted to an int"); + +class PPCBoolRetToInt : public FunctionPass { + + static SmallPtrSet findAllDefs(Value *V) { + SmallPtrSet Defs; + SmallVector WorkList; + WorkList.push_back(V); + Defs.insert(V); + while (!WorkList.empty()) { + Value *Curr = WorkList.back(); + WorkList.pop_back(); + if (User *CurrUser = dyn_cast(Curr)) + for (auto &Op : CurrUser->operands()) + if (Defs.insert(Op).second) + WorkList.push_back(Op); + } + return Defs; + } + + // Translate a i1 value to an equivalent i32 value: + static Value *translate(Value *V) { + Type *Int32Ty = Type::getInt32Ty(V->getContext()); + if (Constant *C = dyn_cast(V)) + return ConstantExpr::getZExt(C, Int32Ty); + if (PHINode *P = dyn_cast(V)) { + // Temporarily set the operands to 0. We'll fix this later in + // runOnUse. + Value *Zero = Constant::getNullValue(Int32Ty); + PHINode *Q = + PHINode::Create(Int32Ty, P->getNumIncomingValues(), P->getName(), P); + for (unsigned i = 0; i < P->getNumOperands(); ++i) + Q->addIncoming(Zero, P->getIncomingBlock(i)); + return Q; + } + + Argument *A = dyn_cast(V); + Instruction *I = dyn_cast(V); + assert((A || I) && "Unknown value type"); + + auto InstPt = + A ? 
&*A->getParent()->getEntryBlock().begin() : I->getNextNode(); + return new ZExtInst(V, Int32Ty, "", InstPt); + } + + typedef SmallPtrSet PHINodeSet; + + // A PHINode is Promotable if: + // 1. Its type is i1 AND + // 2. All of its uses are ReturnInt, CallInst, PHINode, or DbgInfoIntrinsic + // AND + // 3. All of its operands are Constant or Argument or + // CallInst or PHINode AND + // 4. All of its PHINode uses are Promotable AND + // 5. All of its PHINode operands are Promotable + static PHINodeSet getPromotablePHINodes(const Function &F) { + PHINodeSet Promotable; + // Condition 1 + for (auto &BB : F) + for (auto &I : BB) + if (const PHINode *P = dyn_cast(&I)) + if (P->getType()->isIntegerTy(1)) + Promotable.insert(P); + + SmallVector ToRemove; + for (const auto &P : Promotable) { + // Condition 2 and 3 + auto IsValidUser = [] (const Value *V) -> bool { + return isa(V) || isa(V) || isa(V) || + isa(V); + }; + auto IsValidOperand = [] (const Value *V) -> bool { + return isa(V) || isa(V) || isa(V) || + isa(V); + }; + const auto &Users = P->users(); + const auto &Operands = P->operands(); + if (!std::all_of(Users.begin(), Users.end(), IsValidUser) || + !std::all_of(Operands.begin(), Operands.end(), IsValidOperand)) + ToRemove.push_back(P); + } + + // Iterate to convergence + auto IsPromotable = [&Promotable] (const Value *V) -> bool { + const PHINode *Phi = dyn_cast(V); + return !Phi || Promotable.count(Phi); + }; + while (!ToRemove.empty()) { + for (auto &User : ToRemove) + Promotable.erase(User); + ToRemove.clear(); + + for (const auto &P : Promotable) { + // Condition 4 and 5 + const auto &Users = P->users(); + const auto &Operands = P->operands(); + if (!std::all_of(Users.begin(), Users.end(), IsPromotable) || + !std::all_of(Operands.begin(), Operands.end(), IsPromotable)) + ToRemove.push_back(P); + } + } + + return Promotable; + } + + typedef DenseMap B2IMap; + + public: + static char ID; + PPCBoolRetToInt() : FunctionPass(ID) { + initializePPCBoolRetToIntPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) { + PHINodeSet PromotablePHINodes = getPromotablePHINodes(F); + B2IMap Bool2IntMap; + bool Changed = false; + for (auto &BB : F) { + for (auto &I : BB) { + if (ReturnInst *R = dyn_cast(&I)) + if (F.getReturnType()->isIntegerTy(1)) + Changed |= + runOnUse(R->getOperandUse(0), PromotablePHINodes, Bool2IntMap); + + if (CallInst *CI = dyn_cast(&I)) + for (auto &U : CI->operands()) + if (U->getType()->isIntegerTy(1)) + Changed |= runOnUse(U, PromotablePHINodes, Bool2IntMap); + } + } + + return Changed; + } + + static bool runOnUse(Use &U, const PHINodeSet &PromotablePHINodes, + B2IMap &BoolToIntMap) { + auto Defs = findAllDefs(U); + + // If the values are all Constants or Arguments, don't bother + if (!std::any_of(Defs.begin(), Defs.end(), isa)) + return false; + + // Presently, we only know how to handle PHINode, Constant, and Arguments. + // Potentially, bitwise operations (AND, OR, XOR, NOT) and sign extension + // could also be handled in the future. + for (const auto &V : Defs) + if (!isa(V) && !isa(V) && !isa(V)) + return false; + + for (const auto &V : Defs) + if (const PHINode *P = dyn_cast(V)) + if (!PromotablePHINodes.count(P)) + return false; + + if (isa(U.getUser())) + ++NumBoolRetPromotion; + if (isa(U.getUser())) + ++NumBoolCallPromotion; + ++NumBoolToIntPromotion; + + for (const auto &V : Defs) + if (!BoolToIntMap.count(V)) + BoolToIntMap[V] = translate(V); + + // Replace the operands of the translated instructions. 
There were set to + // zero in the translate function. + for (auto &Pair : BoolToIntMap) { + User *First = dyn_cast(Pair.first); + User *Second = dyn_cast(Pair.second); + assert((!First || Second) && "translated from user to non-user!?"); + if (First) + for (unsigned i = 0; i < First->getNumOperands(); ++i) + Second->setOperand(i, BoolToIntMap[First->getOperand(i)]); + } + + Value *IntRetVal = BoolToIntMap[U]; + Type *Int1Ty = Type::getInt1Ty(U->getContext()); + Instruction *I = cast(U.getUser()); + Value *BackToBool = new TruncInst(IntRetVal, Int1Ty, "backToBool", I); + U.set(BackToBool); + + return true; + } + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreserved(); + FunctionPass::getAnalysisUsage(AU); + } +}; +} + +char PPCBoolRetToInt::ID = 0; +INITIALIZE_PASS(PPCBoolRetToInt, "bool-ret-to-int", + "Convert i1 constants to i32 if they are returned", + false, false) + +FunctionPass *llvm::createPPCBoolRetToIntPass() { return new PPCBoolRetToInt(); } diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index 24a9ef0ef077..946e0f10cddd 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -71,6 +71,9 @@ extern "C" void LLVMInitializePowerPCTarget() { RegisterTargetMachine A(ThePPC32Target); RegisterTargetMachine B(ThePPC64Target); RegisterTargetMachine C(ThePPC64LETarget); + + PassRegistry &PR = *PassRegistry::getPassRegistry(); + initializePPCBoolRetToIntPass(PR); } /// Return the datalayout string of a subtarget. @@ -286,6 +289,8 @@ TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) { } void PPCPassConfig::addIRPasses() { + if (TM->getOptLevel() != CodeGenOpt::None) + addPass(createPPCBoolRetToIntPass()); addPass(createAtomicExpandPass(&getPPCTargetMachine())); // For the BG/Q (or if explicitly requested), add explicit data prefetch diff --git a/test/CodeGen/PowerPC/BoolRetToIntTest.ll b/test/CodeGen/PowerPC/BoolRetToIntTest.ll new file mode 100644 index 000000000000..a7b79789b4ca --- /dev/null +++ b/test/CodeGen/PowerPC/BoolRetToIntTest.ll @@ -0,0 +1,203 @@ +; RUN: opt -bool-ret-to-int -S -o - < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-unknown-linux-gnu" + +; CHECK-LABEL: notBoolRet +define signext i32 @notBoolRet() { +entry: +; CHECK: ret i32 1 + ret i32 1 +} + +; CHECK-LABEL: find +define zeroext i1 @find(i8** readonly %begin, i8** readnone %end, i1 (i8*)* nocapture %hasProp) { +entry: + %cmp.4 = icmp eq i8** %begin, %end + br i1 %cmp.4, label %cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond: ; preds = %for.body + %cmp = icmp eq i8** %incdec.ptr, %end + br i1 %cmp, label %cleanup.loopexit, label %for.body + +for.body: ; preds = %for.body.preheader, %for.cond + %curr.05 = phi i8** [ %incdec.ptr, %for.cond ], [ %begin, %for.body.preheader ] + %0 = load i8*, i8** %curr.05, align 8 + %call = tail call zeroext i1 %hasProp(i8* %0) + %incdec.ptr = getelementptr inbounds i8*, i8** %curr.05, i64 1 + br i1 %call, label %cleanup.loopexit, label %for.cond + +cleanup.loopexit: ; preds = %for.body, %for.cond +; CHECK: [[PHI:%.+]] = phi i32 [ 1, %for.body ], [ 0, %for.cond ] + %cleanup.dest.slot.0.ph = phi i1 [ true, %for.body ], [ false, %for.cond ] + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %entry +; CHECK: = phi i32 [ 0, %entry ], [ [[PHI]], %cleanup.loopexit ] + %cleanup.dest.slot.0 = phi i1 [ false, %entry ], [ %cleanup.dest.slot.0.ph, 
%cleanup.loopexit ] +; CHECK: [[REG:%.+]] = trunc i32 {{%.+}} to i1 +; CHECK: ret i1 [[REG]] + ret i1 %cleanup.dest.slot.0 +} + +; CHECK-LABEL: retFalse +define zeroext i1 @retFalse() { +entry: +; CHECK: ret i1 false + ret i1 false +} + +; CHECK-LABEL: retCvtFalse +define zeroext i1 @retCvtFalse() { +entry: +; CHECK: ret i1 false + ret i1 trunc(i32 0 to i1) +} + +; CHECK-LABEL: find_cont +define void @find_cont(i8** readonly %begin, i8** readnone %end, i1 (i8*)* nocapture %hasProp, void (i1)* nocapture %cont) { +entry: + %cmp.4 = icmp eq i8** %begin, %end + br i1 %cmp.4, label %cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond: ; preds = %for.body + %cmp = icmp eq i8** %incdec.ptr, %end + br i1 %cmp, label %cleanup.loopexit, label %for.body + +for.body: ; preds = %for.body.preheader, %for.cond + %curr.05 = phi i8** [ %incdec.ptr, %for.cond ], [ %begin, %for.body.preheader ] + %0 = load i8*, i8** %curr.05, align 8 + %call = tail call zeroext i1 %hasProp(i8* %0) + %incdec.ptr = getelementptr inbounds i8*, i8** %curr.05, i64 1 + br i1 %call, label %cleanup.loopexit, label %for.cond + +cleanup.loopexit: ; preds = %for.body, %for.cond +; CHECK: [[PHI:%.+]] = phi i32 [ 1, %for.body ], [ 0, %for.cond ] + %cleanup.dest.slot.0.ph = phi i1 [ true, %for.body ], [ false, %for.cond ] + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %entry +; CHECK: = phi i32 [ 0, %entry ], [ [[PHI]], %cleanup.loopexit ] + %cleanup.dest.slot.0 = phi i1 [ false, %entry ], [ %cleanup.dest.slot.0.ph, %cleanup.loopexit ] +; CHECK: [[REG:%.+]] = trunc i32 {{%.+}} to i1 +; CHECK: call void %cont(i1 [[REG]] + tail call void %cont(i1 %cleanup.dest.slot.0) + ret void +} + +; CHECK-LABEL: find_cont_ret +define zeroext i1 @find_cont_ret(i8** readonly %begin, i8** readnone %end, i1 (i8*)* nocapture %hasProp, void (i1)* nocapture %cont) { +entry: + %cmp.4 = icmp eq i8** %begin, %end + br i1 %cmp.4, label %cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond: ; preds = %for.body + %cmp = icmp eq i8** %incdec.ptr, %end + br i1 %cmp, label %cleanup.loopexit, label %for.body + +for.body: ; preds = %for.body.preheader, %for.cond + %curr.05 = phi i8** [ %incdec.ptr, %for.cond ], [ %begin, %for.body.preheader ] + %0 = load i8*, i8** %curr.05, align 8 + %call = tail call zeroext i1 %hasProp(i8* %0) + %incdec.ptr = getelementptr inbounds i8*, i8** %curr.05, i64 1 + br i1 %call, label %cleanup.loopexit, label %for.cond + +cleanup.loopexit: ; preds = %for.body, %for.cond +; CHECK: [[PHI:%.+]] = phi i32 [ 1, %for.body ], [ 0, %for.cond ] + %cleanup.dest.slot.0.ph = phi i1 [ true, %for.body ], [ false, %for.cond ] + br label %cleanup + +cleanup: ; preds = %cleanup.loopexit, %entry +; CHECK: = phi i32 [ 0, %entry ], [ [[PHI]], %cleanup.loopexit ] + %cleanup.dest.slot.0 = phi i1 [ false, %entry ], [ %cleanup.dest.slot.0.ph, %cleanup.loopexit ] +; CHECK: [[REG:%.+]] = trunc i32 {{%.+}} to i1 +; CHECK: call void %cont(i1 [[REG]] + tail call void %cont(i1 %cleanup.dest.slot.0) +; CHECK: [[REG:%.+]] = trunc i32 {{%.+}} to i1 +; CHECK: ret i1 [[REG]] + ret i1 %cleanup.dest.slot.0 +} + +; CHECK-LABEL: arg_operand +define zeroext i1 @arg_operand(i1 %operand) { +entry: + br i1 %operand, label %foo, label %cleanup + +foo: + br label %cleanup + +cleanup: +; CHECK: [[REG:%.+]] = trunc i32 {{%.+}} to i1 +; CHECK: ret i1 [[REG]] + %result = phi i1 [ false, %foo ], [ %operand, %entry ] + ret i1 %result +} + +; CHECK-LABEL: bad_use +define 
zeroext i1 @bad_use(i1 %operand) { +entry: + br i1 %operand, label %foo, label %cleanup + +foo: + br label %cleanup + +cleanup: +; CHECK: [[REG:%.+]] = phi i1 +; CHECK: ret i1 [[REG]] + %result = phi i1 [ false, %foo], [ true, %entry ] + %0 = icmp eq i1 %result, %operand + ret i1 %result +} + +; CHECK-LABEL: bad_use_closure +define zeroext i1 @bad_use_closure(i1 %operand) { +entry: + br i1 %operand, label %foo, label %cleanup + +foo: + %bar = phi i1 [ false, %entry ] + %0 = icmp eq i1 %bar, %operand + br label %cleanup + +cleanup: +; CHECK: [[REG:%.+]] = phi i1 [ true +; CHECK: ret i1 [[REG]] + %result = phi i1 [ true, %entry ], [ %bar, %foo] + ret i1 %result +} + +; CHECK-LABEL: arg_test +define zeroext i1 @arg_test(i1 %operand) { +entry: + br i1 %operand, label %foo, label %cleanup + +foo: + %bar = phi i1 [ false, %entry ] + br label %cleanup + +; CHECK-LABEL: cleanup +cleanup: +; CHECK: [[REG:%.+]] = trunc i32 {{%.+}} to i1 +; CHECK: ret i1 [[REG]] + %result = phi i1 [ %bar, %foo], [ %operand, %entry ] + ret i1 %result +} + +declare zeroext i1 @return_i1() + +; CHECK-LABEL: call_test +define zeroext i1 @call_test() { +; CHECK: [[REG:%.+]] = call i1 + %result = call i1 @return_i1() +; CHECK: ret i1 [[REG]] + ret i1 %result +} \ No newline at end of file From aa464aada41ae2b8ef9f3ef4147c6ef09d2879e2 Mon Sep 17 00:00:00 2001 From: Easwaran Raman Date: Mon, 7 Dec 2015 21:21:20 +0000 Subject: [PATCH 187/364] Use updated threshold for indirect call bonus When considering foo->bar inlining, if there is an indirect call in foo which gets resolved to a direct call (say baz), then we try to inline baz into bar with a threshold T and subtract max(T - Cost(bar->baz), 0) from Cost(foo->bar). This patch uses max(Threshold(bar->baz) - Cost(bar->baz)) instead, where Thresheld(bar->baz) could be different from T due to bonuses or subtractions. Threshold(bar->baz) - Cost(bar->baz) better represents the desirability of inlining baz into bar. Differential Revision: http://reviews.llvm.org/D14309 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254945 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Analysis/InlineCost.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp index 6d7d74999061..cebc8731d4d3 100644 --- a/lib/Analysis/InlineCost.cpp +++ b/lib/Analysis/InlineCost.cpp @@ -834,8 +834,8 @@ bool CallAnalyzer::visitCallSite(CallSite CS) { CallAnalyzer CA(TTI, ACT, *F, InlineConstants::IndirectCallThreshold, CS); if (CA.analyzeCall(CS)) { // We were able to inline the indirect call! Subtract the cost from the - // bonus we want to apply, but don't go below zero. - Cost -= std::max(0, InlineConstants::IndirectCallThreshold - CA.getCost()); + // threshold to get the bonus we want to apply, but don't go below zero. + Cost -= std::max(0, CA.getThreshold() - CA.getCost()); } return Base::visitCallSite(CS); From 3f8a9448c501898a912830554762408460b9a61d Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Mon, 7 Dec 2015 21:27:15 +0000 Subject: [PATCH 188/364] [EarlyCSE] Simplify and invert ParseMemoryInst [NFCI] Restructure ParseMemoryInst - which was introduced to abstract over target specific load and stores instructions - to just query the underlying instructions. In theory, this could be slightly slower than caching the results, but in practice, it's very unlikely to be measurable. The simple query scheme makes it far easier to understand, and much easier to extend with new queries. 
Given I'm about to need to add new query types, doing the cleanup first seemed worthwhile. Do we still believe the target specific intrinsic handling is worthwhile in EarlyCSE? It adds quite a bit of complexity and makes the code harder to read. Being able to delete the abstraction entirely would be wonderful. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254950 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Scalar/EarlyCSE.cpp | 89 +++++++++++++++--------------- 1 file changed, 45 insertions(+), 44 deletions(-) diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index b055044ba6d0..4c28d4bc5f7d 100644 --- a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -388,57 +388,58 @@ class EarlyCSE { class ParseMemoryInst { public: ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI) - : Load(false), Store(false), IsSimple(true), MayReadFromMemory(false), - MayWriteToMemory(false), MatchingId(-1), Ptr(nullptr) { - MayReadFromMemory = Inst->mayReadFromMemory(); - MayWriteToMemory = Inst->mayWriteToMemory(); - if (IntrinsicInst *II = dyn_cast(Inst)) { - MemIntrinsicInfo Info; - if (!TTI.getTgtMemIntrinsic(II, Info)) - return; - if (Info.NumMemRefs == 1) { - Store = Info.WriteMem; - Load = Info.ReadMem; - MatchingId = Info.MatchingId; - MayReadFromMemory = Info.ReadMem; - MayWriteToMemory = Info.WriteMem; - IsSimple = Info.IsSimple; - Ptr = Info.PtrVal; - } - } else if (LoadInst *LI = dyn_cast(Inst)) { - Load = true; - IsSimple = LI->isSimple(); - Ptr = LI->getPointerOperand(); + : IsTargetMemInst(false), Inst(Inst) { + if (IntrinsicInst *II = dyn_cast(Inst)) + if (TTI.getTgtMemIntrinsic(II, Info) && Info.NumMemRefs == 1) + IsTargetMemInst = true; + } + bool isLoad() const { + if (IsTargetMemInst) return Info.ReadMem; + return isa(Inst); + } + bool isStore() const { + if (IsTargetMemInst) return Info.WriteMem; + return isa(Inst); + } + bool isSimple() const { + if (IsTargetMemInst) return Info.IsSimple; + if (LoadInst *LI = dyn_cast(Inst)) { + return LI->isSimple(); } else if (StoreInst *SI = dyn_cast(Inst)) { - Store = true; - IsSimple = SI->isSimple(); - Ptr = SI->getPointerOperand(); + return SI->isSimple(); } + return Inst->isAtomic(); } - bool isLoad() const { return Load; } - bool isStore() const { return Store; } - bool isSimple() const { return IsSimple; } bool isMatchingMemLoc(const ParseMemoryInst &Inst) const { - return Ptr == Inst.Ptr && MatchingId == Inst.MatchingId; + return (getPointerOperand() == Inst.getPointerOperand() && + getMatchingId() == Inst.getMatchingId()); } - bool isValid() const { return Ptr != nullptr; } - int getMatchingId() const { return MatchingId; } - Value *getPtr() const { return Ptr; } - bool mayReadFromMemory() const { return MayReadFromMemory; } - bool mayWriteToMemory() const { return MayWriteToMemory; } + bool isValid() const { return getPointerOperand() != nullptr; } - private: - bool Load; - bool Store; - bool IsSimple; - bool MayReadFromMemory; - bool MayWriteToMemory; // For regular (non-intrinsic) loads/stores, this is set to -1. For // intrinsic loads/stores, the id is retrieved from the corresponding // field in the MemIntrinsicInfo structure. That field contains // non-negative values only. 
- int MatchingId; - Value *Ptr; + int getMatchingId() const { + if (IsTargetMemInst) return Info.MatchingId; + return -1; + } + Value *getPointerOperand() const { + if (IsTargetMemInst) return Info.PtrVal; + if (LoadInst *LI = dyn_cast(Inst)) { + return LI->getPointerOperand(); + } else if (StoreInst *SI = dyn_cast(Inst)) { + return SI->getPointerOperand(); + } + return nullptr; + } + bool mayReadFromMemory() const { return Inst->mayReadFromMemory(); } + bool mayWriteToMemory() const { return Inst->mayWriteToMemory(); } + + private: + bool IsTargetMemInst; + MemIntrinsicInfo Info; + Instruction *Inst; }; bool processNode(DomTreeNode *Node); @@ -565,7 +566,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // If we have an available version of this load, and if it is the right // generation, replace this instruction. - LoadValue InVal = AvailableLoads.lookup(MemInst.getPtr()); + LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand()); if (InVal.Data != nullptr && InVal.Generation == CurrentGeneration && InVal.MatchingId == MemInst.getMatchingId()) { Value *Op = getOrCreateResult(InVal.Data, Inst->getType()); @@ -583,7 +584,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // Otherwise, remember that we have this instruction. AvailableLoads.insert( - MemInst.getPtr(), + MemInst.getPointerOperand(), LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId())); LastStore = nullptr; continue; @@ -659,7 +660,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // to non-volatile loads, so we don't have to check for volatility of // the store. AvailableLoads.insert( - MemInst.getPtr(), + MemInst.getPointerOperand(), LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId())); // Remember that this was the last normal store we saw for DSE. From 73036cee732953e35eef06d6eb3597ba084b8a4c Mon Sep 17 00:00:00 2001 From: Rafael Espindola Date: Mon, 7 Dec 2015 21:28:22 +0000 Subject: [PATCH 189/364] Simplify the error handling a bit. NFC. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254952 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Linker/LinkModules.cpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp index 8e0904a858bc..3e1416d8d506 100644 --- a/lib/Linker/LinkModules.cpp +++ b/lib/Linker/LinkModules.cpp @@ -1559,7 +1559,6 @@ bool ModuleLinker::linkModuleFlagsMetadata() { // Merge in the flags from the source module, and also collect its set of // requirements. - bool HasErr = false; for (unsigned I = 0, E = SrcModFlags->getNumOperands(); I != E; ++I) { MDNode *SrcOp = SrcModFlags->getOperand(I); ConstantInt *SrcBehavior = @@ -1597,8 +1596,8 @@ bool ModuleLinker::linkModuleFlagsMetadata() { // Diagnose inconsistent flags which both have override behavior. if (SrcBehaviorValue == Module::Override && SrcOp->getOperand(2) != DstOp->getOperand(2)) { - HasErr |= emitError("linking module flags '" + ID->getString() + - "': IDs have conflicting override values"); + emitError("linking module flags '" + ID->getString() + + "': IDs have conflicting override values"); } continue; } else if (SrcBehaviorValue == Module::Override) { @@ -1610,8 +1609,8 @@ bool ModuleLinker::linkModuleFlagsMetadata() { // Diagnose inconsistent merge behavior types. 
if (SrcBehaviorValue != DstBehaviorValue) { - HasErr |= emitError("linking module flags '" + ID->getString() + - "': IDs have conflicting behaviors"); + emitError("linking module flags '" + ID->getString() + + "': IDs have conflicting behaviors"); continue; } @@ -1630,8 +1629,8 @@ bool ModuleLinker::linkModuleFlagsMetadata() { case Module::Error: { // Emit an error if the values differ. if (SrcOp->getOperand(2) != DstOp->getOperand(2)) { - HasErr |= emitError("linking module flags '" + ID->getString() + - "': IDs have conflicting values"); + emitError("linking module flags '" + ID->getString() + + "': IDs have conflicting values"); } continue; } @@ -1676,13 +1675,13 @@ bool ModuleLinker::linkModuleFlagsMetadata() { MDNode *Op = Flags[Flag].first; if (!Op || Op->getOperand(2) != ReqValue) { - HasErr |= emitError("linking module flags '" + Flag->getString() + - "': does not have the required value"); + emitError("linking module flags '" + Flag->getString() + + "': does not have the required value"); continue; } } - return HasErr; + return HasError; } // This function returns true if the triples match. From 8d6e45b694ca4b18cdb3223f70145736fcd3cd54 Mon Sep 17 00:00:00 2001 From: Manman Ren Date: Mon, 7 Dec 2015 21:40:09 +0000 Subject: [PATCH 190/364] Update doc for C++ TLS calling convention. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254953 91177308-0d34-0410-b5e6-96231b3b80d8 --- docs/LangRef.rst | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/docs/LangRef.rst b/docs/LangRef.rst index 82b33557c128..ca0939e53575 100644 --- a/docs/LangRef.rst +++ b/docs/LangRef.rst @@ -407,12 +407,22 @@ added in the future: used by a future version of the ObjectiveC runtime and should be considered experimental at this time. "``cxx_fast_tlscc``" - The `CXX_FAST_TLS` calling convention for access functions + Clang generates an access function to access C++-style TLS. The access + function generally has an entry block, an exit block and an initialization + block that is run at the first time. The entry and exit blocks can access + a few TLS IR variables, each access will be lowered to a platform-specific + sequence. + This calling convention aims to minimize overhead in the caller by - preserving as many registers as possible. This calling convention behaves - identical to the `C` calling convention on how arguments and return values - are passed, but it uses a different set of caller/callee-saved registers. - Given that C-style TLS on Darwin has its own special CSRs, we can't use the - existing `PreserveMost`. + preserving as many registers as possible (all the registers that are + perserved on the fast path, composed of the entry and exit blocks). + + This calling convention behaves identical to the `C` calling convention on + how arguments and return values are passed, but it uses a different set of + caller/callee-saved registers. + + Given that each platform has its own lowering sequence, hence its own set + of preserved registers, we can't use the existing `PreserveMost`. - On X86-64 the callee preserves all general purpose registers, except for RDI and RAX. From 1440f53307347d3f3ee6da62b21e00cfd4cc968a Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Mon, 7 Dec 2015 21:41:29 +0000 Subject: [PATCH 191/364] Revert 254950 It's causing test failures on AArch64. Due to a bad build config on my part, I apparently wasn't running the tests I thought I was. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254954 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Scalar/EarlyCSE.cpp | 89 +++++++++++++++--------------- 1 file changed, 44 insertions(+), 45 deletions(-) diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index 4c28d4bc5f7d..b055044ba6d0 100644 --- a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -388,58 +388,57 @@ class EarlyCSE { class ParseMemoryInst { public: ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI) - : IsTargetMemInst(false), Inst(Inst) { - if (IntrinsicInst *II = dyn_cast(Inst)) - if (TTI.getTgtMemIntrinsic(II, Info) && Info.NumMemRefs == 1) - IsTargetMemInst = true; - } - bool isLoad() const { - if (IsTargetMemInst) return Info.ReadMem; - return isa(Inst); - } - bool isStore() const { - if (IsTargetMemInst) return Info.WriteMem; - return isa(Inst); - } - bool isSimple() const { - if (IsTargetMemInst) return Info.IsSimple; - if (LoadInst *LI = dyn_cast(Inst)) { - return LI->isSimple(); + : Load(false), Store(false), IsSimple(true), MayReadFromMemory(false), + MayWriteToMemory(false), MatchingId(-1), Ptr(nullptr) { + MayReadFromMemory = Inst->mayReadFromMemory(); + MayWriteToMemory = Inst->mayWriteToMemory(); + if (IntrinsicInst *II = dyn_cast(Inst)) { + MemIntrinsicInfo Info; + if (!TTI.getTgtMemIntrinsic(II, Info)) + return; + if (Info.NumMemRefs == 1) { + Store = Info.WriteMem; + Load = Info.ReadMem; + MatchingId = Info.MatchingId; + MayReadFromMemory = Info.ReadMem; + MayWriteToMemory = Info.WriteMem; + IsSimple = Info.IsSimple; + Ptr = Info.PtrVal; + } + } else if (LoadInst *LI = dyn_cast(Inst)) { + Load = true; + IsSimple = LI->isSimple(); + Ptr = LI->getPointerOperand(); } else if (StoreInst *SI = dyn_cast(Inst)) { - return SI->isSimple(); + Store = true; + IsSimple = SI->isSimple(); + Ptr = SI->getPointerOperand(); } - return Inst->isAtomic(); } + bool isLoad() const { return Load; } + bool isStore() const { return Store; } + bool isSimple() const { return IsSimple; } bool isMatchingMemLoc(const ParseMemoryInst &Inst) const { - return (getPointerOperand() == Inst.getPointerOperand() && - getMatchingId() == Inst.getMatchingId()); + return Ptr == Inst.Ptr && MatchingId == Inst.MatchingId; } - bool isValid() const { return getPointerOperand() != nullptr; } + bool isValid() const { return Ptr != nullptr; } + int getMatchingId() const { return MatchingId; } + Value *getPtr() const { return Ptr; } + bool mayReadFromMemory() const { return MayReadFromMemory; } + bool mayWriteToMemory() const { return MayWriteToMemory; } + private: + bool Load; + bool Store; + bool IsSimple; + bool MayReadFromMemory; + bool MayWriteToMemory; // For regular (non-intrinsic) loads/stores, this is set to -1. For // intrinsic loads/stores, the id is retrieved from the corresponding // field in the MemIntrinsicInfo structure. That field contains // non-negative values only. 
- int getMatchingId() const { - if (IsTargetMemInst) return Info.MatchingId; - return -1; - } - Value *getPointerOperand() const { - if (IsTargetMemInst) return Info.PtrVal; - if (LoadInst *LI = dyn_cast(Inst)) { - return LI->getPointerOperand(); - } else if (StoreInst *SI = dyn_cast(Inst)) { - return SI->getPointerOperand(); - } - return nullptr; - } - bool mayReadFromMemory() const { return Inst->mayReadFromMemory(); } - bool mayWriteToMemory() const { return Inst->mayWriteToMemory(); } - - private: - bool IsTargetMemInst; - MemIntrinsicInfo Info; - Instruction *Inst; + int MatchingId; + Value *Ptr; }; bool processNode(DomTreeNode *Node); @@ -566,7 +565,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // If we have an available version of this load, and if it is the right // generation, replace this instruction. - LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand()); + LoadValue InVal = AvailableLoads.lookup(MemInst.getPtr()); if (InVal.Data != nullptr && InVal.Generation == CurrentGeneration && InVal.MatchingId == MemInst.getMatchingId()) { Value *Op = getOrCreateResult(InVal.Data, Inst->getType()); @@ -584,7 +583,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // Otherwise, remember that we have this instruction. AvailableLoads.insert( - MemInst.getPointerOperand(), + MemInst.getPtr(), LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId())); LastStore = nullptr; continue; @@ -660,7 +659,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // to non-volatile loads, so we don't have to check for volatility of // the store. AvailableLoads.insert( - MemInst.getPointerOperand(), + MemInst.getPtr(), LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId())); // Remember that this was the last normal store we saw for DSE. From 3ff1b160f82fb7887a77f41d5c1f47d9e9126af9 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Mon, 7 Dec 2015 22:27:19 +0000 Subject: [PATCH 192/364] Remove useless hack that avoids calling LLVMLinkInInterpreter() This is supposed to force-link the Interpreter, by inserting a dead call to LLVMLinkInInterpreter(). Since it is actually an empty function, there is no reason for the call to be dead. From: Mehdi Amini git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254956 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/ExecutionEngine/Interpreter.h | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/include/llvm/ExecutionEngine/Interpreter.h b/include/llvm/ExecutionEngine/Interpreter.h index f49d0c487fe9..a14707840ad8 100644 --- a/include/llvm/ExecutionEngine/Interpreter.h +++ b/include/llvm/ExecutionEngine/Interpreter.h @@ -16,22 +16,12 @@ #define LLVM_EXECUTIONENGINE_INTERPRETER_H #include "llvm/ExecutionEngine/ExecutionEngine.h" -#include extern "C" void LLVMLinkInInterpreter(); namespace { struct ForceInterpreterLinking { - ForceInterpreterLinking() { - // We must reference the interpreter in such a way that compilers will not - // delete it all as dead code, even with whole program optimization, - // yet is effectively a NO-OP. As the compiler isn't smart enough - // to know that getenv() never returns -1, this will do the job. - if (std::getenv("bar") != (char*) -1) - return; - - LLVMLinkInInterpreter(); - } + ForceInterpreterLinking() { LLVMLinkInInterpreter(); } } ForceInterpreterLinking; } From 0890b95b60c353074d8d3a2e11906e66282e30da Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Mon, 7 Dec 2015 22:41:23 +0000 Subject: [PATCH 193/364] Reapply 254950 w/fix 254950 ended up being not NFC. 
The previous code was overriding the flags for whether an instruction read or wrote memory using the target specific flags returned via TTI. I'd missed this in my refactoring. Since I mistakenly built only x86 and didn't notice the number of unsupported tests, I didn't catch that before the original checkin. This raises an interesting issue though. Given we have function attributes (i.e. readonly, readnone, argmemonly) which describe the aliasing of intrinsics, why does TTI have this information overriding the instruction definition at all? I see no reason for this, but decided to preserve existing behavior for the moment. The root issue might be that we don't have a "writeonly" attribute. Original commit message: [EarlyCSE] Simplify and invert ParseMemoryInst [NFCI] Restructure ParseMemoryInst - which was introduced to abstract over target specific load and stores instructions - to just query the underlying instructions. In theory, this could be slightly slower than caching the results, but in practice, it's very unlikely to be measurable. The simple query scheme makes it far easier to understand, and much easier to extend with new queries. Given I'm about to need to add new query types, doing the cleanup first seemed worthwhile. Do we still believe the target specific intrinsic handling is worthwhile in EarlyCSE? It adds quite a bit of complexity and makes the code harder to read. Being able to delete the abstraction entirely would be wonderful. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254957 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/IR/LegacyPassManager.cpp | 6 ++ lib/Transforms/Scalar/EarlyCSE.cpp | 95 ++++++++++++++++-------------- 2 files changed, 57 insertions(+), 44 deletions(-) diff --git a/lib/IR/LegacyPassManager.cpp b/lib/IR/LegacyPassManager.cpp index f2e0c7d32c02..3fa11f0d51ae 100644 --- a/lib/IR/LegacyPassManager.cpp +++ b/lib/IR/LegacyPassManager.cpp @@ -589,6 +589,12 @@ AnalysisUsage *PMTopLevelManager::findAnalysisUsage(Pass *P) { if (auto *N = UniqueAnalysisUsages.FindNodeOrInsertPos(ID, IP)) Node = N; else { +#if 0 + dbgs() << AU.getRequiredSet().size() << " " + << AU.getRequiredTransitiveSet().size() << " " + << AU.getPreservedSet().size() << " " + << AU.getUsedSet().size() << "\n"; +#endif Node = new (AUFoldingSetNodeAllocator.Allocate()) AUFoldingSetNode(AU); UniqueAnalysisUsages.InsertNode(Node, IP); } diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index b055044ba6d0..7e3703de25e4 100644 --- a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -388,57 +388,64 @@ class EarlyCSE { class ParseMemoryInst { public: ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI) - : Load(false), Store(false), IsSimple(true), MayReadFromMemory(false), - MayWriteToMemory(false), MatchingId(-1), Ptr(nullptr) { - MayReadFromMemory = Inst->mayReadFromMemory(); - MayWriteToMemory = Inst->mayWriteToMemory(); - if (IntrinsicInst *II = dyn_cast(Inst)) { - MemIntrinsicInfo Info; - if (!TTI.getTgtMemIntrinsic(II, Info)) - return; - if (Info.NumMemRefs == 1) { - Store = Info.WriteMem; - Load = Info.ReadMem; - MatchingId = Info.MatchingId; - MayReadFromMemory = Info.ReadMem; - MayWriteToMemory = Info.WriteMem; - IsSimple = Info.IsSimple; - Ptr = Info.PtrVal; - } - } else if (LoadInst *LI = dyn_cast(Inst)) { - Load = true; - IsSimple = LI->isSimple(); - Ptr = LI->getPointerOperand(); + : IsTargetMemInst(false), Inst(Inst) { + if (IntrinsicInst *II = dyn_cast(Inst)) + if (TTI.getTgtMemIntrinsic(II, 
Info) && Info.NumMemRefs == 1) + IsTargetMemInst = true; + } + bool isLoad() const { + if (IsTargetMemInst) return Info.ReadMem; + return isa(Inst); + } + bool isStore() const { + if (IsTargetMemInst) return Info.WriteMem; + return isa(Inst); + } + bool isSimple() const { + if (IsTargetMemInst) return Info.IsSimple; + if (LoadInst *LI = dyn_cast(Inst)) { + return LI->isSimple(); } else if (StoreInst *SI = dyn_cast(Inst)) { - Store = true; - IsSimple = SI->isSimple(); - Ptr = SI->getPointerOperand(); + return SI->isSimple(); } + return Inst->isAtomic(); } - bool isLoad() const { return Load; } - bool isStore() const { return Store; } - bool isSimple() const { return IsSimple; } bool isMatchingMemLoc(const ParseMemoryInst &Inst) const { - return Ptr == Inst.Ptr && MatchingId == Inst.MatchingId; + return (getPointerOperand() == Inst.getPointerOperand() && + getMatchingId() == Inst.getMatchingId()); } - bool isValid() const { return Ptr != nullptr; } - int getMatchingId() const { return MatchingId; } - Value *getPtr() const { return Ptr; } - bool mayReadFromMemory() const { return MayReadFromMemory; } - bool mayWriteToMemory() const { return MayWriteToMemory; } + bool isValid() const { return getPointerOperand() != nullptr; } - private: - bool Load; - bool Store; - bool IsSimple; - bool MayReadFromMemory; - bool MayWriteToMemory; // For regular (non-intrinsic) loads/stores, this is set to -1. For // intrinsic loads/stores, the id is retrieved from the corresponding // field in the MemIntrinsicInfo structure. That field contains // non-negative values only. - int MatchingId; - Value *Ptr; + int getMatchingId() const { + if (IsTargetMemInst) return Info.MatchingId; + return -1; + } + Value *getPointerOperand() const { + if (IsTargetMemInst) return Info.PtrVal; + if (LoadInst *LI = dyn_cast(Inst)) { + return LI->getPointerOperand(); + } else if (StoreInst *SI = dyn_cast(Inst)) { + return SI->getPointerOperand(); + } + return nullptr; + } + bool mayReadFromMemory() const { + if (IsTargetMemInst) return Info.ReadMem; + return Inst->mayReadFromMemory(); + } + bool mayWriteToMemory() const { + if (IsTargetMemInst) return Info.WriteMem; + return Inst->mayWriteToMemory(); + } + + private: + bool IsTargetMemInst; + MemIntrinsicInfo Info; + Instruction *Inst; }; bool processNode(DomTreeNode *Node); @@ -565,7 +572,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // If we have an available version of this load, and if it is the right // generation, replace this instruction. - LoadValue InVal = AvailableLoads.lookup(MemInst.getPtr()); + LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand()); if (InVal.Data != nullptr && InVal.Generation == CurrentGeneration && InVal.MatchingId == MemInst.getMatchingId()) { Value *Op = getOrCreateResult(InVal.Data, Inst->getType()); @@ -583,7 +590,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // Otherwise, remember that we have this instruction. AvailableLoads.insert( - MemInst.getPtr(), + MemInst.getPointerOperand(), LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId())); LastStore = nullptr; continue; @@ -659,7 +666,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // to non-volatile loads, so we don't have to check for volatility of // the store. AvailableLoads.insert( - MemInst.getPtr(), + MemInst.getPointerOperand(), LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId())); // Remember that this was the last normal store we saw for DSE. 
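The trade-off described in the commit message above is between caching every answer up front in the constructor and recomputing it on demand. A minimal standalone sketch of the query-on-demand style, using toy stand-in types rather than LLVM's Instruction and TargetTransformInfo classes, shows why adding a new query is just one more small member function:

#include <iostream>

// Toy stand-ins for the real IR classes; illustration only, not LLVM code.
struct Inst { virtual ~Inst() = default; };
struct Load  : Inst { void *Ptr; explicit Load(void *P)  : Ptr(P) {} };
struct Store : Inst { void *Ptr; explicit Store(void *P) : Ptr(P) {} };

// Query-on-demand wrapper: nothing is cached in the constructor, so every
// predicate is a one-liner that asks the wrapped instruction directly.
class ParsedMemOp {
  const Inst *I;
public:
  explicit ParsedMemOp(const Inst *In) : I(In) {}
  bool isLoad()  const { return dynamic_cast<const Load  *>(I) != nullptr; }
  bool isStore() const { return dynamic_cast<const Store *>(I) != nullptr; }
  const void *pointerOperand() const {
    if (auto *L = dynamic_cast<const Load  *>(I)) return L->Ptr;
    if (auto *S = dynamic_cast<const Store *>(I)) return S->Ptr;
    return nullptr;
  }
  bool isValid() const { return pointerOperand() != nullptr; }
};

int main() {
  int X = 0;
  Load  L(&X);
  Store S(&X);
  std::cout << ParsedMemOp(&L).isLoad() << ' '        // 1
            << ParsedMemOp(&S).isStore() << ' '       // 1
            << (ParsedMemOp(&L).pointerOperand() ==
                ParsedMemOp(&S).pointerOperand())     // 1: same location
            << '\n';
}

The cost is a repeated type test on each query (a repeated isa/dyn_cast in the real pass), which is exactly the "slightly slower in theory, unlikely to be measurable in practice" point made in the commit message.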
From ba40ae0275ee81cf25c087abcf0dac035b7a4547 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Mon, 7 Dec 2015 22:43:56 +0000 Subject: [PATCH 194/364] Remove debug output that snuck into 254957 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254960 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/IR/LegacyPassManager.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/lib/IR/LegacyPassManager.cpp b/lib/IR/LegacyPassManager.cpp index 3fa11f0d51ae..f2e0c7d32c02 100644 --- a/lib/IR/LegacyPassManager.cpp +++ b/lib/IR/LegacyPassManager.cpp @@ -589,12 +589,6 @@ AnalysisUsage *PMTopLevelManager::findAnalysisUsage(Pass *P) { if (auto *N = UniqueAnalysisUsages.FindNodeOrInsertPos(ID, IP)) Node = N; else { -#if 0 - dbgs() << AU.getRequiredSet().size() << " " - << AU.getRequiredTransitiveSet().size() << " " - << AU.getPreservedSet().size() << " " - << AU.getUsedSet().size() << "\n"; -#endif Node = new (AUFoldingSetNodeAllocator.Allocate()) AUFoldingSetNode(AU); UniqueAnalysisUsages.InsertNode(Node, IP); } From 5b1522a8d1ff60a987137c8d1d585959e30d70d0 Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Mon, 7 Dec 2015 23:07:16 +0000 Subject: [PATCH 195/364] Let llvm-lto installed. A few tests in clang/test are using it. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254963 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/llvm-lto/Makefile | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/llvm-lto/Makefile b/tools/llvm-lto/Makefile index f1801b4b20cc..f8ca7e1cac5d 100644 --- a/tools/llvm-lto/Makefile +++ b/tools/llvm-lto/Makefile @@ -14,6 +14,4 @@ LINK_COMPONENTS := lto ipo scalaropts linker bitreader bitwriter mcdisassembler # This tool has no plugins, optimize startup time. TOOL_NO_EXPORTS := 1 -NO_INSTALL := 1 - include $(LEVEL)/Makefile.common From d5841c9f1f378e13de5bb4eccffaf1c213662e11 Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Mon, 7 Dec 2015 23:12:26 +0000 Subject: [PATCH 196/364] Support: Teach Asan about BumpPtrAllocator Based on patch by Pete Cooper. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254964 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Support/Allocator.h | 15 ++++++++++++++- include/llvm/Support/Compiler.h | 3 +++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/include/llvm/Support/Allocator.h b/include/llvm/Support/Allocator.h index f9b5cf22f97d..c608736fa956 100644 --- a/include/llvm/Support/Allocator.h +++ b/include/llvm/Support/Allocator.h @@ -222,6 +222,8 @@ class BumpPtrAllocatorImpl // Without this, MemorySanitizer messages for values originated from here // will point to the allocation of the entire slab. __msan_allocated_memory(AlignedPtr, Size); + // Similarly, tell ASan about this space. + __asan_unpoison_memory_region(AlignedPtr, Size); return AlignedPtr; } @@ -229,12 +231,16 @@ class BumpPtrAllocatorImpl size_t PaddedSize = Size + Alignment - 1; if (PaddedSize > SizeThreshold) { void *NewSlab = Allocator.Allocate(PaddedSize, 0); + // We own the new slab and don't want anyone reading anyting other than + // pieces returned from this method. So poison the whole slab. 
+ __asan_poison_memory_region(NewSlab, PaddedSize); CustomSizedSlabs.push_back(std::make_pair(NewSlab, PaddedSize)); uintptr_t AlignedAddr = alignAddr(NewSlab, Alignment); assert(AlignedAddr + Size <= (uintptr_t)NewSlab + PaddedSize); char *AlignedPtr = (char*)AlignedAddr; __msan_allocated_memory(AlignedPtr, Size); + __asan_unpoison_memory_region(AlignedPtr, Size); return AlignedPtr; } @@ -246,13 +252,16 @@ class BumpPtrAllocatorImpl char *AlignedPtr = (char*)AlignedAddr; CurPtr = AlignedPtr + Size; __msan_allocated_memory(AlignedPtr, Size); + __asan_unpoison_memory_region(AlignedPtr, Size); return AlignedPtr; } // Pull in base class overloads. using AllocatorBase::Allocate; - void Deallocate(const void * /*Ptr*/, size_t /*Size*/) {} + void Deallocate(const void *Ptr, size_t Size) { + __asan_poison_memory_region(Ptr, Size); + } // Pull in base class overloads. using AllocatorBase::Deallocate; @@ -310,6 +319,10 @@ class BumpPtrAllocatorImpl size_t AllocatedSlabSize = computeSlabSize(Slabs.size()); void *NewSlab = Allocator.Allocate(AllocatedSlabSize, 0); + // We own the new slab and don't want anyone reading anything other than + // pieces returned from this method. So poison the whole slab. + __asan_poison_memory_region(NewSlab, AllocatedSlabSize); + Slabs.push_back(NewSlab); CurPtr = (char *)(NewSlab); End = ((char *)NewSlab) + AllocatedSlabSize; diff --git a/include/llvm/Support/Compiler.h b/include/llvm/Support/Compiler.h index 99a02f7ada03..b3416bbfffb6 100644 --- a/include/llvm/Support/Compiler.h +++ b/include/llvm/Support/Compiler.h @@ -361,8 +361,11 @@ /// \brief Whether LLVM itself is built with AddressSanitizer instrumentation. #if __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__) # define LLVM_ADDRESS_SANITIZER_BUILD 1 +# include #else # define LLVM_ADDRESS_SANITIZER_BUILD 0 +# define __asan_poison_memory_region(p, size) +# define __asan_unpoison_memory_region(p, size) #endif /// \macro LLVM_THREAD_SANITIZER_BUILD From b69b6b56339cffdb675817ff4bab206254141bbe Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Mon, 7 Dec 2015 23:15:57 +0000 Subject: [PATCH 197/364] Stabilize llvm/test/Object/archive-update.test a bit. A manipulation (in this case, mkdir) can make slack between creating and touching %t.older/evenlen. I would make this rewrote with python if this were still unstable. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254965 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/Object/archive-update.test | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/Object/archive-update.test b/test/Object/archive-update.test index ef1a71f60f43..fd1ea4113c39 100644 --- a/test/Object/archive-update.test +++ b/test/Object/archive-update.test @@ -7,12 +7,13 @@ Create a file named evenlen that is newer than the evenlen on the source dir. RUN: mkdir -p %t.older RUN: echo older > %t.older/evenlen +RUN: mkdir -p %t.newer + Either the shell supports the 'touch' command with a flag to manually set the mtime or we sleep for over two seconds so that the mtime is definitely observable. RUN: touch -m -t 200001010000 %t.older/evenlen || sleep 2.1 -RUN: mkdir -p %t.newer RUN: echo newer > %t.newer/evenlen RUN: touch %t.newer/evenlen From e826a2ef86c0e89c60d7a4fa792dbbb0358e030e Mon Sep 17 00:00:00 2001 From: Rafael Espindola Date: Mon, 7 Dec 2015 23:32:39 +0000 Subject: [PATCH 198/364] Factor two calls to a common location. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254967 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Linker/LinkModules.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp index 3e1416d8d506..a2bc95602210 100644 --- a/lib/Linker/LinkModules.cpp +++ b/lib/Linker/LinkModules.cpp @@ -451,8 +451,7 @@ class ModuleLinker { /// Handles cloning of a global values from the source module into /// the destination module, including setting the attributes and visibility. - GlobalValue *copyGlobalValueProto(const GlobalValue *SGV, - const GlobalValue *DGV, bool ForDefinition); + GlobalValue *copyGlobalValueProto(const GlobalValue *SGV, bool ForDefinition); /// Check if we should promote the given local value to global scope. bool doPromoteLocalToGlobal(const GlobalValue *SGV); @@ -816,7 +815,6 @@ void ModuleLinker::setVisibility(GlobalValue *NewGV, const GlobalValue *SGV, } GlobalValue *ModuleLinker::copyGlobalValueProto(const GlobalValue *SGV, - const GlobalValue *DGV, bool ForDefinition) { GlobalValue *NewGV; if (auto *SGVar = dyn_cast(SGV)) { @@ -842,7 +840,6 @@ GlobalValue *ModuleLinker::copyGlobalValueProto(const GlobalValue *SGV, NewGV->setLinkage(GlobalValue::ExternalWeakLinkage); copyGVAttributes(NewGV, SGV); - setVisibility(NewGV, SGV, DGV); return NewGV; } @@ -1364,8 +1361,6 @@ Constant *ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) { GlobalValue *NewGV; if (!LinkFromSrc && DGV) { NewGV = DGV; - // When linking from source we setVisibility from copyGlobalValueProto. - setVisibility(NewGV, SGV, DGV); } else { // If we are done linking global value bodies (i.e. we are performing // metadata linking), don't link in the global value due to this @@ -1373,9 +1368,10 @@ Constant *ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) { if (DoneLinkingBodies) return nullptr; - NewGV = copyGlobalValueProto(SGV, DGV, LinkFromSrc); + NewGV = copyGlobalValueProto(SGV, LinkFromSrc); } + setVisibility(NewGV, SGV, DGV); NewGV->setUnnamedAddr(HasUnnamedAddr); if (auto *NewGO = dyn_cast(NewGV)) { From afd3f07154466b8707b767bfb97ce5672a3a1f17 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Mon, 7 Dec 2015 23:34:30 +0000 Subject: [PATCH 199/364] fix return values to match bool return type; NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254968 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 10fb334c4c60..97a9646c3232 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -3888,10 +3888,10 @@ static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize) { // Check whether Neg has the form (sub NegC, NegOp1) for some NegC and NegOp1. if (Neg.getOpcode() != ISD::SUB) - return 0; + return false; ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(0)); if (!NegC) - return 0; + return false; SDValue NegOp1 = Neg.getOperand(1); // On the RHS of [A], if Pos is Pos' & (EltSize - 1), just replace Pos with From a187d6f327f3882f1925c95b7e9ac553d62d4a9f Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Tue, 8 Dec 2015 00:10:56 +0000 Subject: [PATCH 200/364] [PassManager] Tuning Memory Usage of AnalysisUsage We were using unneccessarily large initial sizes for these SmallVectors. 
This was wasting around 50kb of memory for the O3 pipeline, even after the uniquing changes. We're still using around 20kb which is a bit much, but it's definitely better. This is about a 6% improvement in total O3 memory usage. Note: The raw data on structure size which were used to pick these thresholds can be found in the review thread. Differential Revision: http://reviews.llvm.org/D15244 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254974 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/PassAnalysisSupport.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/include/llvm/PassAnalysisSupport.h b/include/llvm/PassAnalysisSupport.h index f6265b62cbf5..492a4ef464f8 100644 --- a/include/llvm/PassAnalysisSupport.h +++ b/include/llvm/PassAnalysisSupport.h @@ -36,11 +36,17 @@ namespace llvm { /// class AnalysisUsage { public: - typedef SmallVector VectorType; + typedef SmallVectorImpl VectorType; private: /// Sets of analyses required and preserved by a pass - VectorType Required, RequiredTransitive, Preserved, Used; + // TODO: It's not clear that SmallVector is an appropriate data structure for + // this usecase. The sizes were picked to minimize wasted space, but are + // otherwise fairly meaningless. + SmallVector Required; + SmallVector RequiredTransitive; + SmallVector Preserved; + SmallVector Used; bool PreservesAll; public: From c11338cf10a01b70961536250358b0dc6ebd4438 Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Tue, 8 Dec 2015 00:13:12 +0000 Subject: [PATCH 201/364] Add Instruction::getFunction; NFC Will be used in a upcoming patch. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254975 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/Instruction.h | 7 +++++++ lib/IR/Instruction.cpp | 5 +++++ 2 files changed, 12 insertions(+) diff --git a/include/llvm/IR/Instruction.h b/include/llvm/IR/Instruction.h index c7ba8721fe06..77ba87c6b664 100644 --- a/include/llvm/IR/Instruction.h +++ b/include/llvm/IR/Instruction.h @@ -66,6 +66,13 @@ class Instruction : public User, const Module *getModule() const; Module *getModule(); + /// \brief Return the function this instruction belongs to. + /// + /// Note: it is undefined behavior to call this on an instruction not + /// currently inserted into a function. + const Function *getFunction() const; + Function *getFunction(); + /// removeFromParent - This method unlinks 'this' from the containing basic /// block, but does not delete it. /// diff --git a/lib/IR/Instruction.cpp b/lib/IR/Instruction.cpp index b5a30a4969b3..7bd50328b126 100644 --- a/lib/IR/Instruction.cpp +++ b/lib/IR/Instruction.cpp @@ -62,6 +62,11 @@ Module *Instruction::getModule() { return getParent()->getModule(); } +Function *Instruction::getFunction() { return getParent()->getParent(); } + +const Function *Instruction::getFunction() const { + return getParent()->getParent(); +} void Instruction::removeFromParent() { getParent()->getInstList().remove(getIterator()); From 51d40aea3b6e74dea4b3c6680f00921441a30522 Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Tue, 8 Dec 2015 00:13:17 +0000 Subject: [PATCH 202/364] [SCEVExpander] Have hoistIVInc preserve LCSSA Summary: (Note: the problematic invocation of hoistIVInc that caused PR24804 came from IndVarSimplify, not from SCEVExpander itself) Fixes PR24804. Test case by David Majnemer. 
Reviewers: hfinkel, majnemer, atrick, mzolotukhin Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D15058 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254976 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/LoopInfo.h | 73 +++++++++++++++++++++++ lib/Analysis/ScalarEvolutionExpander.cpp | 3 + test/Transforms/IndVarSimplify/pr24804.ll | 25 ++++++++ 3 files changed, 101 insertions(+) create mode 100644 test/Transforms/IndVarSimplify/pr24804.ll diff --git a/include/llvm/Analysis/LoopInfo.h b/include/llvm/Analysis/LoopInfo.h index 9196250233cd..616d6ad1761a 100644 --- a/include/llvm/Analysis/LoopInfo.h +++ b/include/llvm/Analysis/LoopInfo.h @@ -37,6 +37,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/Pass.h" #include @@ -681,6 +682,78 @@ class LoopInfo : public LoopInfoBase { // it as a replacement will not break LCSSA form. return ToLoop->contains(getLoopFor(From->getParent())); } + + /// \brief Checks if moving a specific instruction can break LCSSA in any + /// loop. + /// + /// Return true if moving \p Inst to before \p NewLoc will break LCSSA, + /// assuming that the function containing \p Inst and \p NewLoc is currently + /// in LCSSA form. + bool movementPreservesLCSSAForm(Instruction *Inst, Instruction *NewLoc) { + assert(Inst->getFunction() == NewLoc->getFunction() && + "Can't reason about IPO!"); + + auto *OldBB = Inst->getParent(); + auto *NewBB = NewLoc->getParent(); + + // Movement within the same loop does not break LCSSA (the equality check is + // to avoid doing a hashtable lookup in case of intra-block movement). + if (OldBB == NewBB) + return true; + + auto *OldLoop = getLoopFor(OldBB); + auto *NewLoop = getLoopFor(NewBB); + + if (OldLoop == NewLoop) + return true; + + // Check if Outer contains Inner; with the null loop counting as the + // "outermost" loop. + auto Contains = [](const Loop *Outer, const Loop *Inner) { + return !Outer || Outer->contains(Inner); + }; + + // To check that the movement of Inst to before NewLoc does not break LCSSA, + // we need to check two sets of uses for possible LCSSA violations at + // NewLoc: the users of NewInst, and the operands of NewInst. + + // If we know we're hoisting Inst out of an inner loop to an outer loop, + // then the uses *of* Inst don't need to be checked. + + if (!Contains(NewLoop, OldLoop)) { + for (Use &U : Inst->uses()) { + auto *UI = cast(U.getUser()); + auto *UBB = isa(UI) ? cast(UI)->getIncomingBlock(U) + : UI->getParent(); + if (UBB != NewBB && getLoopFor(UBB) != NewLoop) + return false; + } + } + + // If we know we're sinking Inst from an outer loop into an inner loop, then + // the *operands* of Inst don't need to be checked. + + if (!Contains(OldLoop, NewLoop)) { + // See below on why we can't handle phi nodes here. + if (isa(Inst)) + return false; + + for (Use &U : Inst->operands()) { + auto *DefI = dyn_cast(U.get()); + if (!DefI) + return false; + + // This would need adjustment if we allow Inst to be a phi node -- the + // new use block won't simply be NewBB. + + auto *DefBlock = DefI->getParent(); + if (DefBlock != NewBB && getLoopFor(DefBlock) != NewLoop) + return false; + } + } + + return true; + } }; // Allow clients to walk the list of nested loops... 
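The LoopInfo helper added above is meant to be consulted before any cross-loop code motion. A minimal sketch of the intended call pattern follows, with a hypothetical wrapper name; the real caller is the SCEVExpander::hoistIVInc change in the next file of this patch:

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Hypothetical hoisting guard, illustration only: move I in front of
// InsertPos only when LoopInfo can prove the move keeps the function in
// LCSSA form; otherwise leave I alone and let the caller pick another spot.
static bool tryMoveBefore(Instruction *I, Instruction *InsertPos,
                          LoopInfo &LI) {
  if (!LI.movementPreservesLCSSAForm(I, InsertPos))
    return false;
  I->moveBefore(InsertPos);
  return true;
}

As the assertion at the top of the helper requires, both instructions must already be inserted into the same function before the check is made.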
diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp index 8c5805e9d168..abfcfbafb32e 100644 --- a/lib/Analysis/ScalarEvolutionExpander.cpp +++ b/lib/Analysis/ScalarEvolutionExpander.cpp @@ -933,6 +933,9 @@ bool SCEVExpander::hoistIVInc(Instruction *IncV, Instruction *InsertPos) { !SE.DT.dominates(InsertPos->getParent(), IncV->getParent())) return false; + if (!SE.LI.movementPreservesLCSSAForm(IncV, InsertPos)) + return false; + // Check that the chain of IV operands leading back to Phi can be hoisted. SmallVector IVIncs; for(;;) { diff --git a/test/Transforms/IndVarSimplify/pr24804.ll b/test/Transforms/IndVarSimplify/pr24804.ll new file mode 100644 index 000000000000..6f89481853ad --- /dev/null +++ b/test/Transforms/IndVarSimplify/pr24804.ll @@ -0,0 +1,25 @@ +; RUN: opt -indvars -loop-idiom -loop-deletion -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Checking for a crash + +define void @f(i32* %a) { +; CHECK-LABEL: @f( +entry: + br label %for.cond + +for.cond: ; preds = %for.inc, %for.cond, %entry + %iv = phi i32 [ 0, %entry ], [ %add, %for.inc ], [ %iv, %for.cond ] + %add = add nsw i32 %iv, 1 + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom + br i1 undef, label %for.cond, label %for.inc + +for.inc: ; preds = %for.cond + br i1 undef, label %for.cond, label %for.end + +for.end: ; preds = %for.inc + ret void +} From ca40161d4a0f48fcb5c60d984ba1bc774dc0a45c Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Tue, 8 Dec 2015 00:13:21 +0000 Subject: [PATCH 203/364] [IndVars] Have getInsertPointForUses preserve LCSSA Summary: Also add a stricter post-condition for IndVarSimplify. Fixes PR25578. Test case by Michael Zolotukhin. Reviewers: hfinkel, atrick, mzolotukhin Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D15059 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254977 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/LoopInfo.h | 3 ++ lib/Analysis/LoopInfo.cpp | 9 +++++ lib/Transforms/Scalar/IndVarSimplify.cpp | 42 ++++++++++++++------- test/Transforms/IndVarSimplify/pr25578.ll | 45 +++++++++++++++++++++++ 4 files changed, 85 insertions(+), 14 deletions(-) create mode 100644 test/Transforms/IndVarSimplify/pr25578.ll diff --git a/include/llvm/Analysis/LoopInfo.h b/include/llvm/Analysis/LoopInfo.h index 616d6ad1761a..57695b46d640 100644 --- a/include/llvm/Analysis/LoopInfo.h +++ b/include/llvm/Analysis/LoopInfo.h @@ -402,6 +402,9 @@ class Loop : public LoopBase { /// isLCSSAForm - Return true if the Loop is in LCSSA form bool isLCSSAForm(DominatorTree &DT) const; + /// \brief Return true if this Loop and all inner subloops are in LCSSA form. + bool isRecursivelyLCSSAForm(DominatorTree &DT) const; + /// isLoopSimplifyForm - Return true if the Loop is in the form that /// the LoopSimplify form transforms loops to, which is sometimes called /// normal form. 
diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp index e679b7ad7b86..67a82b192e56 100644 --- a/lib/Analysis/LoopInfo.cpp +++ b/lib/Analysis/LoopInfo.cpp @@ -200,6 +200,15 @@ bool Loop::isLCSSAForm(DominatorTree &DT) const { return true; } +bool Loop::isRecursivelyLCSSAForm(DominatorTree &DT) const { + if (!isLCSSAForm(DT)) + return false; + + return std::all_of(begin(), end(), [&](const Loop *L) { + return L->isRecursivelyLCSSAForm(DT); + }); +} + /// isLoopSimplifyForm - Return true if the Loop is in the form that /// the LoopSimplify form transforms loops to, which is sometimes called /// normal form. diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index 4ea92df9924f..308c8f8f7c6d 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -50,6 +50,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" using namespace llvm; @@ -215,7 +216,7 @@ bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) { /// loop. For PHI nodes, there may be multiple uses, so compute the nearest /// common dominator for the incoming blocks. static Instruction *getInsertPointForUses(Instruction *User, Value *Def, - DominatorTree *DT) { + DominatorTree *DT, LoopInfo *LI) { PHINode *PHI = dyn_cast(User); if (!PHI) return User; @@ -234,10 +235,21 @@ static Instruction *getInsertPointForUses(Instruction *User, Value *Def, InsertPt = InsertBB->getTerminator(); } assert(InsertPt && "Missing phi operand"); - assert((!isa(Def) || - DT->dominates(cast(Def), InsertPt)) && - "def does not dominate all uses"); - return InsertPt; + + auto *DefI = dyn_cast(Def); + if (!DefI) + return InsertPt; + + assert(DT->dominates(DefI, InsertPt) && "def does not dominate all uses"); + + auto *L = LI->getLoopFor(DefI->getParent()); + assert(!L || L->contains(LI->getLoopFor(InsertPt->getParent()))); + + for (auto *DTN = (*DT)[InsertPt->getParent()]; DTN; DTN = DTN->getIDom()) + if (LI->getLoopFor(DTN->getBlock()) == L) + return DTN->getBlock()->getTerminator(); + + llvm_unreachable("DefI dominates InsertPt!"); } //===----------------------------------------------------------------------===// @@ -528,8 +540,8 @@ Value *IndVarSimplify::expandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, /// able to brute-force evaluate arbitrary instructions as long as they have /// constant operands at the beginning of the loop. void IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { - // Verify the input to the pass in already in LCSSA form. - assert(L->isLCSSAForm(*DT)); + // Check a pre-condition. + assert(L->isRecursivelyLCSSAForm(*DT) && "Indvars did not preserve LCSSA!"); SmallVector ExitBlocks; L->getUniqueExitBlocks(ExitBlocks); @@ -1167,10 +1179,11 @@ const SCEVAddRecExpr *WidenIV::getWideRecurrence(Instruction *NarrowUse) { /// This IV user cannot be widen. Replace this use of the original narrow IV /// with a truncation of the new wide IV to isolate and eliminate the narrow IV. 
-static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT) { +static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT, LoopInfo *LI) { DEBUG(dbgs() << "INDVARS: Truncate IV " << *DU.WideDef << " for user " << *DU.NarrowUse << "\n"); - IRBuilder<> Builder(getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT)); + IRBuilder<> Builder( + getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT, LI)); Value *Trunc = Builder.CreateTrunc(DU.WideDef, DU.NarrowDef->getType()); DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, Trunc); } @@ -1207,7 +1220,8 @@ bool WidenIV::widenLoopCompare(NarrowIVDefUse DU) { assert (CastWidth <= IVWidth && "Unexpected width while widening compare."); // Widen the compare instruction. - IRBuilder<> Builder(getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT)); + IRBuilder<> Builder( + getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT, LI)); DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, DU.WideDef); // Widen the other operand of the compare, if necessary. @@ -1229,7 +1243,7 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { // After SimplifyCFG most loop exit targets have a single predecessor. // Otherwise fall back to a truncate within the loop. if (UsePhi->getNumOperands() != 1) - truncateIVUse(DU, DT); + truncateIVUse(DU, DT, LI); else { PHINode *WidePhi = PHINode::Create(DU.WideDef->getType(), 1, UsePhi->getName() + ".wide", @@ -1297,7 +1311,7 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { // This user does not evaluate to a recurence after widening, so don't // follow it. Instead insert a Trunc to kill off the original use, // eventually isolating the original narrow IV so it can be removed. - truncateIVUse(DU, DT); + truncateIVUse(DU, DT, LI); return nullptr; } // Assume block terminators cannot evaluate to a recurrence. We can't to @@ -2165,9 +2179,9 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // Clean up dead instructions. Changed |= DeleteDeadPHIs(L->getHeader(), TLI); + // Check a post-condition. - assert(L->isLCSSAForm(*DT) && - "Indvars did not leave the loop in lcssa form!"); + assert(L->isRecursivelyLCSSAForm(*DT) && "Indvars did not preserve LCSSA!"); // Verify that LFTR, and any other change have not interfered with SCEV's // ability to compute trip count. 
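The heart of the getInsertPointForUses change is the climb up the dominator tree: instead of inserting directly before a PHI's incoming terminator, which may sit in a deeper loop than the definition, the walk stops at the first dominating block that is back in the definition's own loop. Roughly the same idiom, pulled out as a free-standing helper for illustration (hypothetical name; like the pass, it assumes DefI dominates the candidate block):

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Illustration only: walk immediate dominators from CandidateBB until we
// reach a block whose innermost loop is DefI's loop, and insert before that
// block's terminator so the new user does not end up in a deeper loop.
static Instruction *insertPointInLoopOf(Instruction *DefI,
                                        BasicBlock *CandidateBB,
                                        DominatorTree &DT, LoopInfo &LI) {
  Loop *L = LI.getLoopFor(DefI->getParent());
  for (DomTreeNode *DTN = DT[CandidateBB]; DTN; DTN = DTN->getIDom())
    if (LI.getLoopFor(DTN->getBlock()) == L)
      return DTN->getBlock()->getTerminator();
  // Not reached when DefI dominates CandidateBB; fall back to the original
  // insertion point otherwise.
  return CandidateBB->getTerminator();
}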
diff --git a/test/Transforms/IndVarSimplify/pr25578.ll b/test/Transforms/IndVarSimplify/pr25578.ll new file mode 100644 index 000000000000..bc648b517bbe --- /dev/null +++ b/test/Transforms/IndVarSimplify/pr25578.ll @@ -0,0 +1,45 @@ +; RUN: opt < %s -indvars -S | FileCheck %s +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +; CHECK-LABEL: @foo +define void @foo() { +entry: + br label %L1_header + +L1_header: + br label %L2_header + +; CHECK: L2_header: +; CHECK: %[[INDVAR:.*]] = phi i64 +; CHECK: %[[TRUNC:.*]] = trunc i64 %[[INDVAR]] to i32 +L2_header: + %i = phi i32 [ 0, %L1_header ], [ %i_next, %L2_latch ] + %i_prom = sext i32 %i to i64 + br label %L3_header + +L3_header: + br i1 undef, label %L3_latch, label %L2_exiting_1 + +L3_latch: + br i1 undef, label %L3_header, label %L2_exiting_2 + +L2_exiting_1: + br i1 undef, label %L2_latch, label %L1_latch + +L2_exiting_2: + br i1 undef, label %L2_latch, label %L1_latch + +L2_latch: + %i_next = add nsw i32 %i, 1 + br label %L2_header + +L1_latch: +; CHECK: L1_latch: +; CHECK: %i_lcssa = phi i32 [ %[[TRUNC]], %L2_exiting_1 ], [ %[[TRUNC]], %L2_exiting_2 ] + + %i_lcssa = phi i32 [ %i, %L2_exiting_1 ], [ %i, %L2_exiting_2 ] + br i1 undef, label %exit, label %L1_header + +exit: + ret void +} From 23ae77267189ac95510c0915de0adc1f056f9427 Mon Sep 17 00:00:00 2001 From: Manman Ren Date: Tue, 8 Dec 2015 00:14:38 +0000 Subject: [PATCH 204/364] [CXX TLS calling convention] Add support for AArch64. rdar://9001553 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254978 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../AArch64/AArch64CallingConvention.td | 9 +++ lib/Target/AArch64/AArch64RegisterInfo.cpp | 4 + test/CodeGen/AArch64/cxx-tlscc.ll | 77 +++++++++++++++++++ 3 files changed, 90 insertions(+) create mode 100644 test/CodeGen/AArch64/cxx-tlscc.ll diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td index 948b9ddb5df6..66d92100e637 100644 --- a/lib/Target/AArch64/AArch64CallingConvention.td +++ b/lib/Target/AArch64/AArch64CallingConvention.td @@ -279,6 +279,15 @@ def CSR_AArch64_TLS_Darwin FP, (sequence "Q%u", 0, 31))>; +// We can only handle a register pair with adjacent registers, the register pair +// should belong to the same class as well. Since the access function on the +// fast path calls a function that follows CSR_AArch64_TLS_Darwin, +// CSR_AArch64_CXX_TLS_Darwin should be a subset of CSR_AArch64_TLS_Darwin. +def CSR_AArch64_CXX_TLS_Darwin + : CalleeSavedRegs<(add CSR_AArch64_AAPCS, + (sub (sequence "X%u", 1, 28), X15, X16, X17, X18), + (sequence "D%u", 0, 31))>; + // The ELF stub used for TLS-descriptor access saves every feasible // register. Only X0 and LR are clobbered. 
def CSR_AArch64_TLS_ELF diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp index 1aef31baad20..763b2337de12 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -46,6 +46,8 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_AArch64_NoRegs_SaveList; if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg) return CSR_AArch64_AllRegs_SaveList; + if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS) + return CSR_AArch64_CXX_TLS_Darwin_SaveList; else return CSR_AArch64_AAPCS_SaveList; } @@ -58,6 +60,8 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, return CSR_AArch64_NoRegs_RegMask; if (CC == CallingConv::AnyReg) return CSR_AArch64_AllRegs_RegMask; + if (CC == CallingConv::CXX_FAST_TLS) + return CSR_AArch64_CXX_TLS_Darwin_RegMask; else return CSR_AArch64_AAPCS_RegMask; } diff --git a/test/CodeGen/AArch64/cxx-tlscc.ll b/test/CodeGen/AArch64/cxx-tlscc.ll new file mode 100644 index 000000000000..39f6c0fbec94 --- /dev/null +++ b/test/CodeGen/AArch64/cxx-tlscc.ll @@ -0,0 +1,77 @@ +; RUN: llc < %s -mtriple=aarch64-apple-ios | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-apple-ios -enable-shrink-wrap=true | FileCheck --check-prefix=CHECK %s +; Shrink wrapping currently does not kick in because we have a TLS CALL +; in the entry block and it will clobber the link register. + +%struct.S = type { i8 } + +@sg = internal thread_local global %struct.S zeroinitializer, align 1 +@__dso_handle = external global i8 +@__tls_guard = internal thread_local unnamed_addr global i1 false + +declare %struct.S* @_ZN1SC1Ev(%struct.S* returned) +declare %struct.S* @_ZN1SD1Ev(%struct.S* returned) +declare i32 @_tlv_atexit(void (i8*)*, i8*, i8*) + +define cxx_fast_tlscc nonnull %struct.S* @_ZTW2sg() { + %.b.i = load i1, i1* @__tls_guard, align 1 + br i1 %.b.i, label %__tls_init.exit, label %init.i + +init.i: + store i1 true, i1* @__tls_guard, align 1 + %call.i.i = tail call %struct.S* @_ZN1SC1Ev(%struct.S* nonnull @sg) + %1 = tail call i32 @_tlv_atexit(void (i8*)* nonnull bitcast (%struct.S* (%struct.S*)* @_ZN1SD1Ev to void (i8*)*), i8* nonnull getelementptr inbounds (%struct.S, %struct.S* @sg, i64 0, i32 0), i8* nonnull @__dso_handle) + br label %__tls_init.exit + +__tls_init.exit: + ret %struct.S* @sg +} + +; CHECK-LABEL: _ZTW2sg +; CHECK-DAG: stp d31, d30 +; CHECK-DAG: stp d29, d28 +; CHECK-DAG: stp d27, d26 +; CHECK-DAG: stp d25, d24 +; CHECK-DAG: stp d23, d22 +; CHECK-DAG: stp d21, d20 +; CHECK-DAG: stp d19, d18 +; CHECK-DAG: stp d17, d16 +; CHECK-DAG: stp d7, d6 +; CHECK-DAG: stp d5, d4 +; CHECK-DAG: stp d3, d2 +; CHECK-DAG: stp d1, d0 +; CHECK-DAG: stp x20, x19 +; CHECK-DAG: stp x14, x13 +; CHECK-DAG: stp x12, x11 +; CHECK-DAG: stp x10, x9 +; CHECK-DAG: stp x8, x7 +; CHECK-DAG: stp x6, x5 +; CHECK-DAG: stp x4, x3 +; CHECK-DAG: stp x2, x1 +; CHECK-DAG: stp x29, x30 +; CHECK: blr +; CHECK: tbnz w{{.*}}, #0, [[BB_end:.?LBB0_[0-9]+]] +; CHECK: blr +; CHECK: tlv_atexit +; CHECK: [[BB_end]]: +; CHECK: blr +; CHECK-DAG: ldp x2, x1 +; CHECK-DAG: ldp x4, x3 +; CHECK-DAG: ldp x6, x5 +; CHECK-DAG: ldp x8, x7 +; CHECK-DAG: ldp x10, x9 +; CHECK-DAG: ldp x12, x11 +; CHECK-DAG: ldp x14, x13 +; CHECK-DAG: ldp x20, x19 +; CHECK-DAG: ldp d1, d0 +; CHECK-DAG: ldp d3, d2 +; CHECK-DAG: ldp d5, d4 +; CHECK-DAG: ldp d7, d6 +; CHECK-DAG: ldp d17, d16 +; CHECK-DAG: ldp d19, d18 +; CHECK-DAG: ldp d21, d20 +; CHECK-DAG: ldp d23, d22 +; CHECK-DAG: ldp d25, 
d24 +; CHECK-DAG: ldp d27, d26 +; CHECK-DAG: ldp d29, d28 +; CHECK-DAG: ldp d31, d30 From 4609cb778ad34d55e05762b7159f3339c4054145 Mon Sep 17 00:00:00 2001 From: Rafael Espindola Date: Tue, 8 Dec 2015 02:29:45 +0000 Subject: [PATCH 205/364] Simplify test. NFC. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254987 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/Linker/link-flags.ll | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/test/Linker/link-flags.ll b/test/Linker/link-flags.ll index d03503aa4548..c901b699575a 100644 --- a/test/Linker/link-flags.ll +++ b/test/Linker/link-flags.ll @@ -1,9 +1,7 @@ -; RUN: llvm-as %S/Inputs/linkage.b.ll -o %t.b.bc -; RUN: llvm-as %S/Inputs/linkage.c.ll -o %t.c.bc -; RUN: llvm-link -S %t.b.bc %t.c.bc | FileCheck %s -check-prefix=B -check-prefix=C -check-prefix=CU -; RUN: llvm-link -S -only-needed %t.b.bc %t.c.bc | FileCheck %s -check-prefix=B -check-prefix=C -check-prefix=CN -; RUN: llvm-link -S -internalize %t.b.bc %t.c.bc | FileCheck %s -check-prefix=B -check-prefix=CI -; RUN: llvm-link -S -internalize -only-needed %t.b.bc %t.c.bc | FileCheck %s -check-prefix=B -check-prefix=CN +; RUN: llvm-link -S %S/Inputs/linkage.b.ll %S/Inputs/linkage.c.ll | FileCheck %s -check-prefix=B -check-prefix=C -check-prefix=CU +; RUN: llvm-link -S -only-needed %S/Inputs/linkage.b.ll %S/Inputs/linkage.c.ll | FileCheck %s -check-prefix=B -check-prefix=C -check-prefix=CN +; RUN: llvm-link -S -internalize %S/Inputs/linkage.b.ll %S/Inputs/linkage.c.ll | FileCheck %s -check-prefix=B -check-prefix=CI +; RUN: llvm-link -S -internalize -only-needed %S/Inputs/linkage.b.ll %S/Inputs/linkage.c.ll | FileCheck %s -check-prefix=B -check-prefix=CN C-LABEL: @X = global i32 5 CI-LABEL: @X = internal global i32 5 From cbf2c65b9e200ee1b7827171773faa1e4c399fb0 Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Tue, 8 Dec 2015 02:37:48 +0000 Subject: [PATCH 206/364] AsmPrinter: Use emitGlobalConstantFP to emit elements of constant data It's strange to duplicate the logic for emitting FP values into emitGlobalConstantDataSequential, and it's even stranger that we end up printing the verbose assembly comments differently between the two paths. Just call into emitGlobalConstantFP rather than crudely duplicating its logic. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254988 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 20 ++---- test/CodeGen/ARM/constants.ll | 6 +- test/CodeGen/Mips/sitofp-selectcc-opt.ll | 3 +- .../X86/copysign-constant-magnitude.ll | 24 +++---- test/CodeGen/X86/fadd-combines.ll | 64 +++++++++---------- test/CodeGen/X86/fmul-combines.ll | 40 ++++++------ test/CodeGen/X86/vec_uint_to_fp-fastmath.ll | 16 ++--- test/CodeGen/X86/vec_uint_to_fp.ll | 8 +-- .../X86/x86-setcc-int-to-fp-combine.ll | 16 ++--- 9 files changed, 92 insertions(+), 105 deletions(-) diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index b8604240b5d9..f1f3547750b4 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1873,6 +1873,8 @@ static void emitGlobalConstantImpl(const DataLayout &DL, const Constant *C, const Constant *BaseCV = nullptr, uint64_t Offset = 0); +static void emitGlobalConstantFP(const ConstantFP *CFP, AsmPrinter &AP); + /// isRepeatedByteSequence - Determine whether the given value is /// composed of a repeated sequence of identical bytes and return the /// byte value. If it is not a repeated sequence, return -1. 
@@ -1951,22 +1953,8 @@ static void emitGlobalConstantDataSequential(const DataLayout &DL, ElementByteSize); } } else { - // FP Constants are printed as integer constants to avoid losing precision. - for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I) { - APFloat Num = CDS->getElementAsAPFloat(I); - if (AP.isVerbose()) { - if (ElementByteSize == 4) - AP.OutStreamer->GetCommentOS() << "float " << Num.convertToFloat() - << '\n'; - else if (ElementByteSize == 8) - AP.OutStreamer->GetCommentOS() << "double " << Num.convertToDouble() - << '\n'; - else - llvm_unreachable("Unexpected float width"); - } - AP.OutStreamer->EmitIntValue(Num.bitcastToAPInt().getLimitedValue(), - ElementByteSize); - } + for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I) + emitGlobalConstantFP(cast(CDS->getElementAsConstant(I)), AP); } unsigned Size = DL.getTypeAllocSize(CDS->getType()); diff --git a/test/CodeGen/ARM/constants.ll b/test/CodeGen/ARM/constants.ll index 3baa103e3d5d..75a90bbf0caa 100644 --- a/test/CodeGen/ARM/constants.ll +++ b/test/CodeGen/ARM/constants.ll @@ -63,7 +63,7 @@ define i32 @f8() nounwind { float 3.000000e+00> }, align 16 ; CHECK: const1 ; CHECK: .zero 16 -; CHECK: float 1.0 -; CHECK: float 2.0 -; CHECK: float 3.0 +; CHECK: float 1 +; CHECK: float 2 +; CHECK: float 3 ; CHECK: .zero 4 diff --git a/test/CodeGen/Mips/sitofp-selectcc-opt.ll b/test/CodeGen/Mips/sitofp-selectcc-opt.ll index c60fceb1a04c..751fba46d72f 100644 --- a/test/CodeGen/Mips/sitofp-selectcc-opt.ll +++ b/test/CodeGen/Mips/sitofp-selectcc-opt.ll @@ -7,7 +7,7 @@ entry: ; check that this transformation doesn't happen: ; (sint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0,, cc) ; -; CHECK-NOT: # double -1.000000e+00 +; CHECK-NOT: # double -1 %tobool1 = icmp ne i32 %a, 0 %not.tobool = icmp ne i64 %b, 0 @@ -19,4 +19,3 @@ entry: store double %add, double* @foo12.d4, align 8 ret double %add } - diff --git a/test/CodeGen/X86/copysign-constant-magnitude.ll b/test/CodeGen/X86/copysign-constant-magnitude.ll index 537d6298ddf4..6c577a2cfcc7 100644 --- a/test/CodeGen/X86/copysign-constant-magnitude.ll +++ b/test/CodeGen/X86/copysign-constant-magnitude.ll @@ -5,13 +5,13 @@ target triple = "x86_64-apple-macosx10.10.0" define void @test_copysign_const_magnitude_d(double %X) { ; CHECK: [[SIGNMASK:L.+]]: -; CHECK-NEXT: .quad -9223372036854775808 ## double -0.000000e+00 -; CHECK-NEXT: .quad 0 ## double 0.000000e+00 +; CHECK-NEXT: .quad -9223372036854775808 ## double -0 +; CHECK-NEXT: .quad 0 ## double 0 ; CHECK: [[ZERO:L.+]]: ; CHECK-NEXT: .space 16 ; CHECK: [[ONE:L.+]]: -; CHECK-NEXT: .quad 4607182418800017408 ## double 1.000000e+00 -; CHECK-NEXT: .quad 0 ## double 0.000000e+00 +; CHECK-NEXT: .quad 4607182418800017408 ## double 1 +; CHECK-NEXT: .quad 0 ## double 0 ; CHECK-LABEL: test_copysign_const_magnitude_d: ; CHECK: id @@ -50,17 +50,17 @@ define void @test_copysign_const_magnitude_d(double %X) { define void @test_copysign_const_magnitude_f(float %X) { ; CHECK: [[SIGNMASK:L.+]]: -; CHECK-NEXT: .long 2147483648 ## float -0.000000e+00 -; CHECK-NEXT: .long 0 ## float 0.000000e+00 -; CHECK-NEXT: .long 0 ## float 0.000000e+00 -; CHECK-NEXT: .long 0 ## float 0.000000e+00 +; CHECK-NEXT: .long 2147483648 ## float -0 +; CHECK-NEXT: .long 0 ## float 0 +; CHECK-NEXT: .long 0 ## float 0 +; CHECK-NEXT: .long 0 ## float 0 ; CHECK: [[ZERO:L.+]]: ; CHECK-NEXT: .space 16 ; CHECK: [[ONE:L.+]]: -; CHECK-NEXT: .long 1065353216 ## float 1.000000e+00 -; CHECK-NEXT: .long 0 ## float 0.000000e+00 -; CHECK-NEXT: .long 0 ## float 0.000000e+00 
-; CHECK-NEXT: .long 0 ## float 0.000000e+00 +; CHECK-NEXT: .long 1065353216 ## float 1 +; CHECK-NEXT: .long 0 ## float 0 +; CHECK-NEXT: .long 0 ## float 0 +; CHECK-NEXT: .long 0 ## float 0 ; CHECK-LABEL: test_copysign_const_magnitude_f: ; CHECK: id diff --git a/test/CodeGen/X86/fadd-combines.ll b/test/CodeGen/X86/fadd-combines.ll index 6b389f4099c2..2df0e06dc252 100644 --- a/test/CodeGen/X86/fadd-combines.ll +++ b/test/CodeGen/X86/fadd-combines.ll @@ -28,10 +28,10 @@ define float @fadd_2const_f32(float %x) #0 { ret float %z } -; CHECK: float 5.000000e+00 -; CHECK: float 5.000000e+00 -; CHECK: float 5.000000e+00 -; CHECK: float 5.000000e+00 +; CHECK: float 5 +; CHECK: float 5 +; CHECK: float 5 +; CHECK: float 5 define <4 x float> @fadd_2const_4f32(<4 x float> %x) #0 { ; CHECK-LABEL: fadd_2const_4f32: ; CHECK: # BB#0: @@ -53,10 +53,10 @@ define float @fadd_x_fmul_x_c_f32(float %x) #0 { ret float %z } -; CHECK: float 2.000000e+00 -; CHECK: float 3.000000e+00 -; CHECK: float 4.000000e+00 -; CHECK: float 5.000000e+00 +; CHECK: float 2 +; CHECK: float 3 +; CHECK: float 4 +; CHECK: float 5 define <4 x float> @fadd_x_fmul_x_c_4f32(<4 x float> %x) #0 { ; CHECK-LABEL: fadd_x_fmul_x_c_4f32: ; CHECK: # BB#0: @@ -78,10 +78,10 @@ define float @fadd_fmul_x_c_x_f32(float %x) #0 { ret float %z } -; CHECK: float 2.000000e+00 -; CHECK: float 3.000000e+00 -; CHECK: float 4.000000e+00 -; CHECK: float 5.000000e+00 +; CHECK: float 2 +; CHECK: float 3 +; CHECK: float 4 +; CHECK: float 5 define <4 x float> @fadd_fmul_x_c_x_4f32(<4 x float> %x) #0 { ; CHECK-LABEL: fadd_fmul_x_c_x_4f32: ; CHECK: # BB#0: @@ -104,10 +104,10 @@ define float @fadd_fadd_x_x_fmul_x_c_f32(float %x) #0 { ret float %w } -; CHECK: float 3.000000e+00 -; CHECK: float 4.000000e+00 -; CHECK: float 5.000000e+00 -; CHECK: float 6.000000e+00 +; CHECK: float 3 +; CHECK: float 4 +; CHECK: float 5 +; CHECK: float 6 define <4 x float> @fadd_fadd_x_x_fmul_x_c_4f32(<4 x float> %x) #0 { ; CHECK-LABEL: fadd_fadd_x_x_fmul_x_c_4f32: ; CHECK: # BB#0: @@ -131,10 +131,10 @@ define float @fadd_fmul_x_c_fadd_x_x_f32(float %x) #0 { ret float %w } -; CHECK: float 3.000000e+00 -; CHECK: float 4.000000e+00 -; CHECK: float 5.000000e+00 -; CHECK: float 6.000000e+00 +; CHECK: float 3 +; CHECK: float 4 +; CHECK: float 5 +; CHECK: float 6 define <4 x float> @fadd_fmul_x_c_fadd_x_x_4f32(<4 x float> %x) #0 { ; CHECK-LABEL: fadd_fmul_x_c_fadd_x_x_4f32: ; CHECK: # BB#0: @@ -157,10 +157,10 @@ define float @fadd_x_fadd_x_x_f32(float %x) #0 { ret float %z } -; CHECK: float 3.000000e+00 -; CHECK: float 3.000000e+00 -; CHECK: float 3.000000e+00 -; CHECK: float 3.000000e+00 +; CHECK: float 3 +; CHECK: float 3 +; CHECK: float 3 +; CHECK: float 3 define <4 x float> @fadd_x_fadd_x_x_4f32(<4 x float> %x) #0 { ; CHECK-LABEL: fadd_x_fadd_x_x_4f32: ; CHECK: # BB#0: @@ -182,10 +182,10 @@ define float @fadd_fadd_x_x_x_f32(float %x) #0 { ret float %z } -; CHECK: float 3.000000e+00 -; CHECK: float 3.000000e+00 -; CHECK: float 3.000000e+00 -; CHECK: float 3.000000e+00 +; CHECK: float 3 +; CHECK: float 3 +; CHECK: float 3 +; CHECK: float 3 define <4 x float> @fadd_fadd_x_x_x_4f32(<4 x float> %x) #0 { ; CHECK-LABEL: fadd_fadd_x_x_x_4f32: ; CHECK: # BB#0: @@ -207,10 +207,10 @@ define float @fadd_fadd_x_x_fadd_x_x_f32(float %x) #0 { ret float %z } -; CHECK: float 4.000000e+00 -; CHECK: float 4.000000e+00 -; CHECK: float 4.000000e+00 -; CHECK: float 4.000000e+00 +; CHECK: float 4 +; CHECK: float 4 +; CHECK: float 4 +; CHECK: float 4 define <4 x float> @fadd_fadd_x_x_fadd_x_x_4f32(<4 x float> %x) #0 
{ ; CHECK-LABEL: fadd_fadd_x_x_fadd_x_x_4f32: ; CHECK: # BB#0: diff --git a/test/CodeGen/X86/fmul-combines.ll b/test/CodeGen/X86/fmul-combines.ll index 42e538646ffe..564ce42fdb75 100644 --- a/test/CodeGen/X86/fmul-combines.ll +++ b/test/CodeGen/X86/fmul-combines.ll @@ -56,10 +56,10 @@ define <4 x float> @fmul_c3_c4_v4f32(<4 x float> %x) #0 { } ; We should be able to pre-multiply the two constant vectors. -; CHECK: float 5.000000e+00 -; CHECK: float 1.200000e+01 -; CHECK: float 2.100000e+01 -; CHECK: float 3.200000e+01 +; CHECK: float 5 +; CHECK: float 12 +; CHECK: float 21 +; CHECK: float 32 ; CHECK-LABEL: fmul_v4f32_two_consts_no_splat: ; CHECK: mulps ; CHECK-NOT: mulps @@ -71,10 +71,10 @@ define <4 x float> @fmul_v4f32_two_consts_no_splat(<4 x float> %x) #0 { } ; Same as above, but reverse operands to make sure non-canonical form is also handled. -; CHECK: float 5.000000e+00 -; CHECK: float 1.200000e+01 -; CHECK: float 2.100000e+01 -; CHECK: float 3.200000e+01 +; CHECK: float 5 +; CHECK: float 12 +; CHECK: float 21 +; CHECK: float 32 ; CHECK-LABEL: fmul_v4f32_two_consts_no_splat_non_canonical: ; CHECK: mulps ; CHECK-NOT: mulps @@ -87,10 +87,10 @@ define <4 x float> @fmul_v4f32_two_consts_no_splat_non_canonical(<4 x float> %x) ; More than one use of a constant multiply should not inhibit the optimization. ; Instead of a chain of 2 dependent mults, this test will have 2 independent mults. -; CHECK: float 6.000000e+00 -; CHECK: float 1.400000e+01 -; CHECK: float 2.400000e+01 -; CHECK: float 3.600000e+01 +; CHECK: float 6 +; CHECK: float 14 +; CHECK: float 24 +; CHECK: float 36 ; CHECK-LABEL: fmul_v4f32_two_consts_no_splat_multiple_use: ; CHECK: mulps ; CHECK: ret @@ -110,10 +110,10 @@ define <4 x float> @PR22698_splats(<4 x float> %a) #0 { %mul3 = fmul fast <4 x float> %a, %mul2 ret <4 x float> %mul3 -; CHECK: float 2.400000e+01 -; CHECK: float 2.400000e+01 -; CHECK: float 2.400000e+01 -; CHECK: float 2.400000e+01 +; CHECK: float 24 +; CHECK: float 24 +; CHECK: float 24 +; CHECK: float 24 ; CHECK-LABEL: PR22698_splats: ; CHECK: mulps ; CHECK: ret @@ -126,10 +126,10 @@ define <4 x float> @PR22698_no_splats(<4 x float> %a) #0 { %mul3 = fmul fast <4 x float> %a, %mul2 ret <4 x float> %mul3 -; CHECK: float 4.500000e+01 -; CHECK: float 1.200000e+02 -; CHECK: float 2.310000e+02 -; CHECK: float 3.840000e+02 +; CHECK: float 45 +; CHECK: float 120 +; CHECK: float 231 +; CHECK: float 384 ; CHECK-LABEL: PR22698_no_splats: ; CHECK: mulps ; CHECK: ret diff --git a/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll b/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll index 0d67ac4bc25a..1f36d064f873 100644 --- a/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll +++ b/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll @@ -14,10 +14,10 @@ ; CST-NEXT: .long 65535 # 0xffff ; CST: [[FPMASKCSTADDR:.LCPI[0-9_]+]]: -; CST-NEXT: .long 1199570944 # float 6.553600e+04 -; CST-NEXT: .long 1199570944 # float 6.553600e+04 -; CST-NEXT: .long 1199570944 # float 6.553600e+04 -; CST-NEXT: .long 1199570944 # float 6.553600e+04 +; CST-NEXT: .long 1199570944 # float 65536 +; CST-NEXT: .long 1199570944 # float 65536 +; CST-NEXT: .long 1199570944 # float 65536 +; CST-NEXT: .long 1199570944 # float 65536 ; AVX2: [[FPMASKCSTADDR:.LCPI[0-9_]+]]: ; AVX2-NEXT: .long 1199570944 # float 65536 @@ -69,10 +69,10 @@ define <4 x float> @test_uitofp_v4i32_to_v4f32(<4 x i32> %arg) { ; AVX-NEXT: .long 65535 # 0xffff ; AVX: [[FPMASKCSTADDR_v8:.LCPI[0-9_]+]]: -; AVX-NEXT: .long 1199570944 # float 6.553600e+04 -; AVX-NEXT: .long 1199570944 # float 6.553600e+04 -; 
AVX-NEXT: .long 1199570944 # float 6.553600e+04 -; AVX-NEXT: .long 1199570944 # float 6.553600e+04 +; AVX-NEXT: .long 1199570944 # float 65536 +; AVX-NEXT: .long 1199570944 # float 65536 +; AVX-NEXT: .long 1199570944 # float 65536 +; AVX-NEXT: .long 1199570944 # float 65536 ; AVX2: [[FPMASKCSTADDR_v8:.LCPI[0-9_]+]]: ; AVX2-NEXT: .long 1199570944 # float 65536 diff --git a/test/CodeGen/X86/vec_uint_to_fp.ll b/test/CodeGen/X86/vec_uint_to_fp.ll index 46cfcd9a9a12..ce0c11b2fa2a 100644 --- a/test/CodeGen/X86/vec_uint_to_fp.ll +++ b/test/CodeGen/X86/vec_uint_to_fp.ll @@ -23,10 +23,10 @@ ; CST-NEXT: .long 1392508928 ## 0x53000000 ; CST: [[MAGICCSTADDR:LCPI0_[0-9]+]]: -; CST-NEXT: .long 3539992704 ## float -5.497642e+11 -; CST-NEXT: .long 3539992704 ## float -5.497642e+11 -; CST-NEXT: .long 3539992704 ## float -5.497642e+11 -; CST-NEXT: .long 3539992704 ## float -5.497642e+11 +; CST-NEXT: .long 3539992704 ## float -5.49764202E+11 +; CST-NEXT: .long 3539992704 ## float -5.49764202E+11 +; CST-NEXT: .long 3539992704 ## float -5.49764202E+11 +; CST-NEXT: .long 3539992704 ## float -5.49764202E+11 ; AVX2: [[LOWCSTADDR:LCPI0_[0-9]+]]: ; AVX2-NEXT: .long 1258291200 ## 0x4b000000 diff --git a/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll b/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll index 248a9202e997..99b27efe7f54 100644 --- a/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll +++ b/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll @@ -39,10 +39,10 @@ define void @foo1(<4 x float> %val, <4 x float> %test, <4 x double>* %p) nounwin ; Also test the general purpose constant folding of int->fp. define void @foo2(<4 x float>* noalias %result) nounwind { ; CHECK-LABEL: LCPI2_0: -; CHECK-NEXT: .long 1082130432 ## float 4.000000e+00 -; CHECK-NEXT: .long 1084227584 ## float 5.000000e+00 -; CHECK-NEXT: .long 1086324736 ## float 6.000000e+00 -; CHECK-NEXT: .long 1088421888 ## float 7.000000e+00 +; CHECK-NEXT: .long 1082130432 ## float 4 +; CHECK-NEXT: .long 1084227584 ## float 5 +; CHECK-NEXT: .long 1086324736 ## float 6 +; CHECK-NEXT: .long 1088421888 ## float 7 ; CHECK-LABEL: foo2: ; CHECK: movaps LCPI2_0(%rip), %xmm0 @@ -72,10 +72,10 @@ define <4 x float> @foo3(<4 x float> %val, <4 x float> %test) nounwind { ; Test the general purpose constant folding of uint->fp. define void @foo4(<4 x float>* noalias %result) nounwind { ; CHECK-LABEL: LCPI4_0: -; CHECK-NEXT: .long 1065353216 ## float 1.000000e+00 -; CHECK-NEXT: .long 1123942400 ## float 1.270000e+02 -; CHECK-NEXT: .long 1124073472 ## float 1.280000e+02 -; CHECK-NEXT: .long 1132396544 ## float 2.550000e+02 +; CHECK-NEXT: .long 1065353216 ## float 1 +; CHECK-NEXT: .long 1123942400 ## float 127 +; CHECK-NEXT: .long 1124073472 ## float 128 +; CHECK-NEXT: .long 1132396544 ## float 255 ; CHECK-LABEL: foo4: ; CHECK: movaps LCPI4_0(%rip), %xmm0 From fcb0893d8819bd6e1aecb360c49d5476b3b2ac24 Mon Sep 17 00:00:00 2001 From: Rafael Espindola Date: Tue, 8 Dec 2015 02:38:14 +0000 Subject: [PATCH 207/364] Add a test showing that we internalize lazily linked GVs. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254989 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/Linker/Inputs/internalize-lazy.ll | 8 ++++++++ test/Linker/internalize-lazy.ll | 4 ++++ 2 files changed, 12 insertions(+) create mode 100644 test/Linker/Inputs/internalize-lazy.ll create mode 100644 test/Linker/internalize-lazy.ll diff --git a/test/Linker/Inputs/internalize-lazy.ll b/test/Linker/Inputs/internalize-lazy.ll new file mode 100644 index 000000000000..43f9a7ab7455 --- /dev/null +++ b/test/Linker/Inputs/internalize-lazy.ll @@ -0,0 +1,8 @@ +define linkonce_odr void @g() { + ret void +} + +define void @f() { + call void @g() + ret void +} diff --git a/test/Linker/internalize-lazy.ll b/test/Linker/internalize-lazy.ll new file mode 100644 index 000000000000..480335927b51 --- /dev/null +++ b/test/Linker/internalize-lazy.ll @@ -0,0 +1,4 @@ +; RUN: llvm-link -S -internalize %s %p/Inputs/internalize-lazy.ll | FileCheck %s + +; CHECK: define internal void @f +; CHECK: define internal void @g From f2765767e6131c9b9b50acd53d58b54717fbd7c7 Mon Sep 17 00:00:00 2001 From: Davide Italiano Date: Tue, 8 Dec 2015 02:45:59 +0000 Subject: [PATCH 208/364] [llvm-objdump/MachO] Don't cut'n'paste the same code over and over. Use the appropriate helper instead. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254990 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/llvm-objdump/MachODump.cpp | 28 ++++++++-------------------- tools/llvm-objdump/llvm-objdump.cpp | 2 +- tools/llvm-objdump/llvm-objdump.h | 1 + 3 files changed, 10 insertions(+), 21 deletions(-) diff --git a/tools/llvm-objdump/MachODump.cpp b/tools/llvm-objdump/MachODump.cpp index b270057151e1..61567436a1cc 100644 --- a/tools/llvm-objdump/MachODump.cpp +++ b/tools/llvm-objdump/MachODump.cpp @@ -1495,11 +1495,8 @@ void llvm::ParseInputMachO(StringRef Filename) { printArchiveHeaders(A, !NonVerbose, ArchiveMemberOffsets); for (Archive::child_iterator I = A->child_begin(), E = A->child_end(); I != E; ++I) { - if (std::error_code EC = I->getError()) { - errs() << "llvm-objdump: '" << Filename << "': " << EC.message() - << ".\n"; - exit(1); - } + if (std::error_code EC = I->getError()) + report_error(Filename, EC); auto &C = I->get(); ErrorOr> ChildOrErr = C.getAsBinary(); if (ChildOrErr.getError()) @@ -1549,11 +1546,8 @@ void llvm::ParseInputMachO(StringRef Filename) { for (Archive::child_iterator AI = A->child_begin(), AE = A->child_end(); AI != AE; ++AI) { - if (std::error_code EC = AI->getError()) { - errs() << "llvm-objdump: '" << Filename - << "': " << EC.message() << ".\n"; - exit(1); - } + if (std::error_code EC = AI->getError()) + report_error(Filename, EC); auto &C = AI->get(); ErrorOr> ChildOrErr = C.getAsBinary(); if (ChildOrErr.getError()) @@ -1597,11 +1591,8 @@ void llvm::ParseInputMachO(StringRef Filename) { for (Archive::child_iterator AI = A->child_begin(), AE = A->child_end(); AI != AE; ++AI) { - if (std::error_code EC = AI->getError()) { - errs() << "llvm-objdump: '" << Filename << "': " << EC.message() - << ".\n"; - exit(1); - } + if (std::error_code EC = AI->getError()) + report_error(Filename, EC); auto &C = AI->get(); ErrorOr> ChildOrErr = C.getAsBinary(); if (ChildOrErr.getError()) @@ -1639,11 +1630,8 @@ void llvm::ParseInputMachO(StringRef Filename) { printArchiveHeaders(A.get(), !NonVerbose, ArchiveMemberOffsets); for (Archive::child_iterator AI = A->child_begin(), AE = A->child_end(); AI != AE; ++AI) { - if (std::error_code EC = AI->getError()) { - errs() << "llvm-objdump: '" << Filename << "': " << 
EC.message() - << ".\n"; - exit(1); - } + if (std::error_code EC = AI->getError()) + report_error(Filename, EC); auto &C = AI->get(); ErrorOr> ChildOrErr = C.getAsBinary(); if (ChildOrErr.getError()) diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp index 069425429d16..a2c43e11a78e 100644 --- a/tools/llvm-objdump/llvm-objdump.cpp +++ b/tools/llvm-objdump/llvm-objdump.cpp @@ -252,7 +252,7 @@ void llvm::error(std::error_code EC) { exit(1); } -static void report_error(StringRef File, std::error_code EC) { +void llvm::report_error(StringRef File, std::error_code EC) { assert(EC); errs() << ToolName << ": '" << File << "': " << EC.message() << ".\n"; exit(1); diff --git a/tools/llvm-objdump/llvm-objdump.h b/tools/llvm-objdump/llvm-objdump.h index 3c5e7b365825..f74ed010d1d1 100644 --- a/tools/llvm-objdump/llvm-objdump.h +++ b/tools/llvm-objdump/llvm-objdump.h @@ -78,6 +78,7 @@ void PrintRelocations(const object::ObjectFile *o); void PrintSectionHeaders(const object::ObjectFile *o); void PrintSectionContents(const object::ObjectFile *o); void PrintSymbolTable(const object::ObjectFile *o); +void report_error(StringRef File, std::error_code EC); } // end namespace llvm From e32f0e20e5af6fe16869bde63c121b5571133550 Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Tue, 8 Dec 2015 03:01:16 +0000 Subject: [PATCH 209/364] IR: Allow vectors of halfs to be ConstantDataVectors Currently, vectors of halfs end up as ConstantVectors, but there isn't a good reason they can't be ConstantDataVectors. This should save some memory. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254991 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/IR/Constants.cpp | 18 +++++++++++++++--- test/CodeGen/X86/float-asmprint.ll | 15 +++++++++++++++ unittests/IR/ConstantsTest.cpp | 28 ++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 3 deletions(-) diff --git a/lib/IR/Constants.cpp b/lib/IR/Constants.cpp index b4a07a1b6b4a..509783fff8bd 100644 --- a/lib/IR/Constants.cpp +++ b/lib/IR/Constants.cpp @@ -899,7 +899,9 @@ static Constant *getSequenceIfElementsMatch(Constant *C, else if (CI->getType()->isIntegerTy(64)) return getIntSequenceIfElementsMatch(V); } else if (ConstantFP *CFP = dyn_cast(C)) { - if (CFP->getType()->isFloatTy()) + if (CFP->getType()->isHalfTy()) + return getFPSequenceIfElementsMatch(V); + else if (CFP->getType()->isFloatTy()) return getFPSequenceIfElementsMatch(V); else if (CFP->getType()->isDoubleTy()) return getFPSequenceIfElementsMatch(V); @@ -2365,7 +2367,7 @@ StringRef ConstantDataSequential::getRawDataValues() const { /// ConstantDataArray only works with normal float and int types that are /// stored densely in memory, not with things like i42 or x86_f80. 
bool ConstantDataSequential::isElementTypeCompatible(Type *Ty) { - if (Ty->isFloatTy() || Ty->isDoubleTy()) return true; + if (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy()) return true; if (auto *IT = dyn_cast(Ty)) { switch (IT->getBitWidth()) { case 8: @@ -2637,6 +2639,11 @@ Constant *ConstantDataVector::getSplat(unsigned NumElts, Constant *V) { } if (ConstantFP *CFP = dyn_cast(V)) { + if (CFP->getType()->isHalfTy()) { + SmallVector Elts( + NumElts, CFP->getValueAPF().bitcastToAPInt().getLimitedValue()); + return getFP(V->getContext(), Elts); + } if (CFP->getType()->isFloatTy()) { SmallVector Elts( NumElts, CFP->getValueAPF().bitcastToAPInt().getLimitedValue()); @@ -2682,6 +2689,10 @@ APFloat ConstantDataSequential::getElementAsAPFloat(unsigned Elt) const { switch (getElementType()->getTypeID()) { default: llvm_unreachable("Accessor can only be used when element is float/double!"); + case Type::HalfTyID: { + auto EltVal = *reinterpret_cast(EltPtr); + return APFloat(APFloat::IEEEhalf, APInt(16, EltVal)); + } case Type::FloatTyID: { auto EltVal = *reinterpret_cast(EltPtr); return APFloat(APFloat::IEEEsingle, APInt(32, EltVal)); @@ -2716,7 +2727,8 @@ double ConstantDataSequential::getElementAsDouble(unsigned Elt) const { /// Note that this has to compute a new constant to return, so it isn't as /// efficient as getElementAsInteger/Float/Double. Constant *ConstantDataSequential::getElementAsConstant(unsigned Elt) const { - if (getElementType()->isFloatTy() || getElementType()->isDoubleTy()) + if (getElementType()->isHalfTy() || getElementType()->isFloatTy() || + getElementType()->isDoubleTy()) return ConstantFP::get(getContext(), getElementAsAPFloat(Elt)); return ConstantInt::get(getElementType(), getElementAsInteger(Elt)); diff --git a/test/CodeGen/X86/float-asmprint.ll b/test/CodeGen/X86/float-asmprint.ll index 5de9700fc064..0108430ee93e 100644 --- a/test/CodeGen/X86/float-asmprint.ll +++ b/test/CodeGen/X86/float-asmprint.ll @@ -9,6 +9,8 @@ @var64 = global double -0.0, align 8 @var32 = global float -0.0, align 4 @var16 = global half -0.0, align 2 +@var4f32 = global <4 x float> +@var4f16 = global <4 x half> ; CHECK: var128: ; CHECK-NEXT: .quad 0 # fp128 -0 @@ -39,3 +41,16 @@ ; CHECK-NEXT: .short 32768 # half -0 ; CHECK-NEXT: .size +; CHECK: var4f32: +; CHECK-NEXT: .long 2147483648 # float -0 +; CHECK-NEXT: .long 0 # float 0 +; CHECK-NEXT: .long 1065353216 # float 1 +; CHECK-NEXT: .long 1073741824 # float 2 +; CHECK-NEXT: .size + +; CHECK: var4f16: +; CHECK-NEXT: .short 32768 # half -0 +; CHECK-NEXT: .short 0 # half 0 +; CHECK-NEXT: .short 15360 # half 1 +; CHECK-NEXT: .short 16384 # half 2 +; CHECK-NEXT: .size diff --git a/unittests/IR/ConstantsTest.cpp b/unittests/IR/ConstantsTest.cpp index 7741b448fa8d..8c33453d293d 100644 --- a/unittests/IR/ConstantsTest.cpp +++ b/unittests/IR/ConstantsTest.cpp @@ -382,5 +382,33 @@ TEST(ConstantsTest, AliasCAPI) { ASSERT_EQ(unwrap(AliasRef)->getAliasee(), Aliasee); } +static std::string getNameOfType(Type *T) { + std::string S; + raw_string_ostream RSOS(S); + T->print(RSOS); + return S; +} + +TEST(ConstantsTest, BuildConstantDataVectors) { + LLVMContext Context; + std::unique_ptr M(new Module("MyModule", Context)); + + for (Type *T : {Type::getInt8Ty(Context), Type::getInt16Ty(Context), + Type::getInt32Ty(Context), Type::getInt64Ty(Context)}) { + Constant *Vals[] = {ConstantInt::get(T, 0), ConstantInt::get(T, 1)}; + Constant *CDV = ConstantVector::get(Vals); + ASSERT_TRUE(dyn_cast(CDV) != nullptr) + << " T = " << getNameOfType(T); + } + + for 
(Type *T : {Type::getHalfTy(Context), Type::getFloatTy(Context), + Type::getDoubleTy(Context)}) { + Constant *Vals[] = {ConstantFP::get(T, 0), ConstantFP::get(T, 1)}; + Constant *CDV = ConstantVector::get(Vals); + ASSERT_TRUE(dyn_cast(CDV) != nullptr) + << " T = " << getNameOfType(T); + } +} + } // end anonymous namespace } // end namespace llvm From c2d82ef29f45abe3b75069b1fbdf9d0d9f8d8335 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Tue, 8 Dec 2015 03:22:33 +0000 Subject: [PATCH 210/364] [WebAssembly] Remove the override of haveFastSqrt. The default implementation in BasicTTI already checks TLI and does the right thing. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254993 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp | 5 ----- lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h | 1 - 2 files changed, 6 deletions(-) diff --git a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp index ea7044d58834..356631711921 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -25,8 +25,3 @@ WebAssemblyTTIImpl::getPopcntSupport(unsigned TyWidth) const { assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); return TargetTransformInfo::PSK_FastHardware; } - -bool WebAssemblyTTIImpl::haveFastSqrt(Type *Ty) const { - assert(Ty->isFPOrFPVectorTy() && "Ty must be floating point"); - return true; -} diff --git a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h index 84f9f0e3e55e..26dc388cc922 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h +++ b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h @@ -55,7 +55,6 @@ class WebAssemblyTTIImpl final : public BasicTTIImplBase { // TODO: Implement more Scalar TTI for WebAssembly TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const; - bool haveFastSqrt(Type *Ty) const; /// @} From 579ccfd983f4deb9d2374176fbb0119d73f546e2 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Tue, 8 Dec 2015 03:25:35 +0000 Subject: [PATCH 211/364] [WebAssembly] Trim some unneeded #includes. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254994 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h | 1 - lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h | 1 - lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h | 2 -- 3 files changed, 4 deletions(-) diff --git a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h index 20569da0b110..c585d44f0150 100644 --- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h +++ b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h @@ -16,7 +16,6 @@ #define LLVM_LIB_TARGET_WEBASSEMBLY_INSTPRINTER_WEBASSEMBLYINSTPRINTER_H #include "llvm/MC/MCInstPrinter.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/CodeGen/MachineValueType.h" namespace llvm { diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index 6ef01ffade43..b13dd148adf3 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -16,7 +16,6 @@ #define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCTARGETDESC_H #include "llvm/Support/DataTypes.h" -#include namespace llvm { diff --git a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h index 62c5f33cfad7..af4dabb2c6c3 100644 --- a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h +++ b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h @@ -16,8 +16,6 @@ #ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMACHINEFUNCTIONINFO_H #define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMACHINEFUNCTIONINFO_H -#include "WebAssemblyRegisterInfo.h" -#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" namespace llvm { From 55a29f75fd81ea0a107fe3cfab9fe4000082cc9a Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Tue, 8 Dec 2015 03:30:42 +0000 Subject: [PATCH 212/364] [WebAssembly] Assert MRI.isSSA() in passes that depend on SSA form. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254995 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/WebAssembly/WebAssemblyRegStackify.cpp | 2 ++ lib/Target/WebAssembly/WebAssemblyStoreResults.cpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp index ac016a7b9b0a..9fbde70634ac 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -127,6 +127,8 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { WebAssemblyFunctionInfo &MFI = *MF.getInfo(); AliasAnalysis &AA = getAnalysis().getAAResults(); + assert(MRI.isSSA() && "RegStackify depends on SSA form"); + // Walk the instructions from the bottom up. Currently we don't look past // block boundaries, and the blocks aren't ordered so the block visitation // order isn't significant, but we may want to change this in the future. 
diff --git a/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp b/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp index b67453bee708..21122ba2b2ea 100644 --- a/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp +++ b/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp @@ -72,6 +72,8 @@ bool WebAssemblyStoreResults::runOnMachineFunction(MachineFunction &MF) { const MachineRegisterInfo &MRI = MF.getRegInfo(); MachineDominatorTree &MDT = getAnalysis(); + assert(MRI.isSSA() && "StoreResults depends on SSA form"); + for (auto &MBB : MF) { DEBUG(dbgs() << "Basic Block: " << MBB.getName() << '\n'); for (auto &MI : MBB) From 1acb0660c303fc422ca4f5bfe7e8147941409fd6 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Tue, 8 Dec 2015 03:33:51 +0000 Subject: [PATCH 213/364] [WebAssembly] Convert a file-level comment to doxygen style. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254996 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/WebAssembly/WebAssembly.td | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lib/Target/WebAssembly/WebAssembly.td b/lib/Target/WebAssembly/WebAssembly.td index 53dd9290348a..551ad9345154 100644 --- a/lib/Target/WebAssembly/WebAssembly.td +++ b/lib/Target/WebAssembly/WebAssembly.td @@ -6,10 +6,11 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This is a target description file for the WebAssembly architecture, which is -// also known as "wasm". -// +/// +/// \file +/// \brief This is a target description file for the WebAssembly architecture, +/// which is also known as "wasm". +/// //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// From 7710c66ebd1a584c72a803f6528e42cb892507b2 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Tue, 8 Dec 2015 03:36:00 +0000 Subject: [PATCH 214/364] [WebAssembly] Fix an emacs syntax highlighting comment. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254997 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/WebAssembly/WebAssemblyInstrFormats.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td index 62a108c0d400..8008dd32353a 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td @@ -1,4 +1,4 @@ -// WebAssemblyInstrFormats.td - WebAssembly Instruction Formats -*- tblgen -*-// +//=- WebAssemblyInstrFormats.td - WebAssembly Instr. Formats -*- tablegen -*-=// // // The LLVM Compiler Infrastructure // From ca9fa31c8cddf8adbe019dbc12871a7b5703d0a4 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Tue, 8 Dec 2015 03:42:50 +0000 Subject: [PATCH 215/364] [WebAssembly] Remove an unneeded static_cast. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254998 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/WebAssembly/WebAssemblyISelLowering.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 85fb753ed0e0..4883d83647db 100644 --- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -593,8 +593,7 @@ SDValue WebAssemblyTargetLowering::LowerVASTART(SDValue Op, // the current frame pointer. 
DAG.getMachineFunction().getFrameInfo()->setFrameAddressIsTaken(true); unsigned FP = - static_cast(Subtarget->getRegisterInfo()) - ->getFrameRegister(DAG.getMachineFunction()); + Subtarget->getRegisterInfo()->getFrameRegister(DAG.getMachineFunction()); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, FP, PtrVT); const Value *SV = cast(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), DL, FrameAddr, Op.getOperand(1), From 4474471834be491ad1c6de19da0fa82bf667e70a Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Tue, 8 Dec 2015 03:43:03 +0000 Subject: [PATCH 216/364] [WebAssembly] Fix a typo in a comment. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254999 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/WebAssembly/WebAssemblyRegStackify.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp index 9fbde70634ac..d890310ac501 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -15,7 +15,7 @@ /// are then marked as "stackified", meaning references to them are replaced by /// "push" and "pop" from the stack. /// -/// This is primarily a code size optimiation, since temporary values on the +/// This is primarily a code size optimization, since temporary values on the /// expression don't need to be named. /// //===----------------------------------------------------------------------===// From 43a68c82e51919cc6d38187f95d30fef456e6ad7 Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Tue, 8 Dec 2015 03:50:32 +0000 Subject: [PATCH 217/364] [OperandBundles] Remove unncessary constructor The StringRef constructor is unnecessary (since we're converting to std::string anyway), and having it requires an explicit call to StringRef's or std::string's constructor. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@255000 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/InstrTypes.h | 3 --- lib/Transforms/Utils/InlineFunction.cpp | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/include/llvm/IR/InstrTypes.h b/include/llvm/IR/InstrTypes.h index 157cb27cefbb..2a0927266656 100644 --- a/include/llvm/IR/InstrTypes.h +++ b/include/llvm/IR/InstrTypes.h @@ -1167,9 +1167,6 @@ template class OperandBundleDefT { std::vector Inputs; public: - explicit OperandBundleDefT(StringRef Tag, std::vector Inputs) - : Tag(Tag), Inputs(std::move(Inputs)) {} - explicit OperandBundleDefT(std::string Tag, std::vector Inputs) : Tag(std::move(Tag)), Inputs(std::move(Inputs)) {} diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp index aee84c07d593..52bde6797dbe 100644 --- a/lib/Transforms/Utils/InlineFunction.cpp +++ b/lib/Transforms/Utils/InlineFunction.cpp @@ -1191,7 +1191,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, MergedDeoptArgs.insert(MergedDeoptArgs.end(), ChildOB.Inputs.begin(), ChildOB.Inputs.end()); - OpDefs.emplace_back(StringRef("deopt"), std::move(MergedDeoptArgs)); + OpDefs.emplace_back("deopt", std::move(MergedDeoptArgs)); } Instruction *NewI = nullptr; From 354f2216a71dd030ec818aa637b53ee192dbfa1a Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Tue, 8 Dec 2015 04:32:51 +0000 Subject: [PATCH 218/364] [SCEV] Fix indentation; NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@255002 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Analysis/ScalarEvolution.cpp | 300 +++++++++++++++---------------- 1 file changed, 150 insertions(+), 150 deletions(-) diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 9a0570d47f02..59c845baa289 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -446,179 +446,179 @@ bool SCEVUnknown::isOffsetOf(Type *&CTy, Constant *&FieldNo) const { //===----------------------------------------------------------------------===// namespace { - /// SCEVComplexityCompare - Return true if the complexity of the LHS is less - /// than the complexity of the RHS. This comparator is used to canonicalize - /// expressions. - class SCEVComplexityCompare { - const LoopInfo *const LI; - public: - explicit SCEVComplexityCompare(const LoopInfo *li) : LI(li) {} - - // Return true or false if LHS is less than, or at least RHS, respectively. - bool operator()(const SCEV *LHS, const SCEV *RHS) const { - return compare(LHS, RHS) < 0; - } - - // Return negative, zero, or positive, if LHS is less than, equal to, or - // greater than RHS, respectively. A three-way result allows recursive - // comparisons to be more efficient. - int compare(const SCEV *LHS, const SCEV *RHS) const { - // Fast-path: SCEVs are uniqued so we can do a quick equality check. - if (LHS == RHS) - return 0; - - // Primarily, sort the SCEVs by their getSCEVType(). - unsigned LType = LHS->getSCEVType(), RType = RHS->getSCEVType(); - if (LType != RType) - return (int)LType - (int)RType; - - // Aside from the getSCEVType() ordering, the particular ordering - // isn't very important except that it's beneficial to be consistent, - // so that (a + b) and (b + a) don't end up as different expressions. - switch (static_cast(LType)) { - case scUnknown: { - const SCEVUnknown *LU = cast(LHS); - const SCEVUnknown *RU = cast(RHS); - - // Sort SCEVUnknown values with some loose heuristics. TODO: This is - // not as complete as it could be. 
- const Value *LV = LU->getValue(), *RV = RU->getValue(); - - // Order pointer values after integer values. This helps SCEVExpander - // form GEPs. - bool LIsPointer = LV->getType()->isPointerTy(), - RIsPointer = RV->getType()->isPointerTy(); - if (LIsPointer != RIsPointer) - return (int)LIsPointer - (int)RIsPointer; - - // Compare getValueID values. - unsigned LID = LV->getValueID(), - RID = RV->getValueID(); - if (LID != RID) - return (int)LID - (int)RID; - - // Sort arguments by their position. - if (const Argument *LA = dyn_cast(LV)) { - const Argument *RA = cast(RV); - unsigned LArgNo = LA->getArgNo(), RArgNo = RA->getArgNo(); - return (int)LArgNo - (int)RArgNo; - } - - // For instructions, compare their loop depth, and their operand - // count. This is pretty loose. - if (const Instruction *LInst = dyn_cast(LV)) { - const Instruction *RInst = cast(RV); - - // Compare loop depths. - const BasicBlock *LParent = LInst->getParent(), - *RParent = RInst->getParent(); - if (LParent != RParent) { - unsigned LDepth = LI->getLoopDepth(LParent), - RDepth = LI->getLoopDepth(RParent); - if (LDepth != RDepth) - return (int)LDepth - (int)RDepth; - } - - // Compare the number of operands. - unsigned LNumOps = LInst->getNumOperands(), - RNumOps = RInst->getNumOperands(); - return (int)LNumOps - (int)RNumOps; - } +/// SCEVComplexityCompare - Return true if the complexity of the LHS is less +/// than the complexity of the RHS. This comparator is used to canonicalize +/// expressions. +class SCEVComplexityCompare { + const LoopInfo *const LI; +public: + explicit SCEVComplexityCompare(const LoopInfo *li) : LI(li) {} - return 0; - } + // Return true or false if LHS is less than, or at least RHS, respectively. + bool operator()(const SCEV *LHS, const SCEV *RHS) const { + return compare(LHS, RHS) < 0; + } - case scConstant: { - const SCEVConstant *LC = cast(LHS); - const SCEVConstant *RC = cast(RHS); - - // Compare constant values. - const APInt &LA = LC->getValue()->getValue(); - const APInt &RA = RC->getValue()->getValue(); - unsigned LBitWidth = LA.getBitWidth(), RBitWidth = RA.getBitWidth(); - if (LBitWidth != RBitWidth) - return (int)LBitWidth - (int)RBitWidth; - return LA.ult(RA) ? -1 : 1; + // Return negative, zero, or positive, if LHS is less than, equal to, or + // greater than RHS, respectively. A three-way result allows recursive + // comparisons to be more efficient. + int compare(const SCEV *LHS, const SCEV *RHS) const { + // Fast-path: SCEVs are uniqued so we can do a quick equality check. + if (LHS == RHS) + return 0; + + // Primarily, sort the SCEVs by their getSCEVType(). + unsigned LType = LHS->getSCEVType(), RType = RHS->getSCEVType(); + if (LType != RType) + return (int)LType - (int)RType; + + // Aside from the getSCEVType() ordering, the particular ordering + // isn't very important except that it's beneficial to be consistent, + // so that (a + b) and (b + a) don't end up as different expressions. + switch (static_cast(LType)) { + case scUnknown: { + const SCEVUnknown *LU = cast(LHS); + const SCEVUnknown *RU = cast(RHS); + + // Sort SCEVUnknown values with some loose heuristics. TODO: This is + // not as complete as it could be. + const Value *LV = LU->getValue(), *RV = RU->getValue(); + + // Order pointer values after integer values. This helps SCEVExpander + // form GEPs. + bool LIsPointer = LV->getType()->isPointerTy(), + RIsPointer = RV->getType()->isPointerTy(); + if (LIsPointer != RIsPointer) + return (int)LIsPointer - (int)RIsPointer; + + // Compare getValueID values. 
+ unsigned LID = LV->getValueID(), + RID = RV->getValueID(); + if (LID != RID) + return (int)LID - (int)RID; + + // Sort arguments by their position. + if (const Argument *LA = dyn_cast(LV)) { + const Argument *RA = cast(RV); + unsigned LArgNo = LA->getArgNo(), RArgNo = RA->getArgNo(); + return (int)LArgNo - (int)RArgNo; } - case scAddRecExpr: { - const SCEVAddRecExpr *LA = cast(LHS); - const SCEVAddRecExpr *RA = cast(RHS); - - // Compare addrec loop depths. - const Loop *LLoop = LA->getLoop(), *RLoop = RA->getLoop(); - if (LLoop != RLoop) { - unsigned LDepth = LLoop->getLoopDepth(), - RDepth = RLoop->getLoopDepth(); + // For instructions, compare their loop depth, and their operand + // count. This is pretty loose. + if (const Instruction *LInst = dyn_cast(LV)) { + const Instruction *RInst = cast(RV); + + // Compare loop depths. + const BasicBlock *LParent = LInst->getParent(), + *RParent = RInst->getParent(); + if (LParent != RParent) { + unsigned LDepth = LI->getLoopDepth(LParent), + RDepth = LI->getLoopDepth(RParent); if (LDepth != RDepth) return (int)LDepth - (int)RDepth; } - // Addrec complexity grows with operand count. - unsigned LNumOps = LA->getNumOperands(), RNumOps = RA->getNumOperands(); - if (LNumOps != RNumOps) - return (int)LNumOps - (int)RNumOps; + // Compare the number of operands. + unsigned LNumOps = LInst->getNumOperands(), + RNumOps = RInst->getNumOperands(); + return (int)LNumOps - (int)RNumOps; + } - // Lexicographically compare. - for (unsigned i = 0; i != LNumOps; ++i) { - long X = compare(LA->getOperand(i), RA->getOperand(i)); - if (X != 0) - return X; - } + return 0; + } + + case scConstant: { + const SCEVConstant *LC = cast(LHS); + const SCEVConstant *RC = cast(RHS); + + // Compare constant values. + const APInt &LA = LC->getValue()->getValue(); + const APInt &RA = RC->getValue()->getValue(); + unsigned LBitWidth = LA.getBitWidth(), RBitWidth = RA.getBitWidth(); + if (LBitWidth != RBitWidth) + return (int)LBitWidth - (int)RBitWidth; + return LA.ult(RA) ? -1 : 1; + } - return 0; + case scAddRecExpr: { + const SCEVAddRecExpr *LA = cast(LHS); + const SCEVAddRecExpr *RA = cast(RHS); + + // Compare addrec loop depths. + const Loop *LLoop = LA->getLoop(), *RLoop = RA->getLoop(); + if (LLoop != RLoop) { + unsigned LDepth = LLoop->getLoopDepth(), + RDepth = RLoop->getLoopDepth(); + if (LDepth != RDepth) + return (int)LDepth - (int)RDepth; } - case scAddExpr: - case scMulExpr: - case scSMaxExpr: - case scUMaxExpr: { - const SCEVNAryExpr *LC = cast(LHS); - const SCEVNAryExpr *RC = cast(RHS); - - // Lexicographically compare n-ary expressions. - unsigned LNumOps = LC->getNumOperands(), RNumOps = RC->getNumOperands(); - if (LNumOps != RNumOps) - return (int)LNumOps - (int)RNumOps; - - for (unsigned i = 0; i != LNumOps; ++i) { - if (i >= RNumOps) - return 1; - long X = compare(LC->getOperand(i), RC->getOperand(i)); - if (X != 0) - return X; - } + // Addrec complexity grows with operand count. + unsigned LNumOps = LA->getNumOperands(), RNumOps = RA->getNumOperands(); + if (LNumOps != RNumOps) return (int)LNumOps - (int)RNumOps; + + // Lexicographically compare. 
+ for (unsigned i = 0; i != LNumOps; ++i) { + long X = compare(LA->getOperand(i), RA->getOperand(i)); + if (X != 0) + return X; } - case scUDivExpr: { - const SCEVUDivExpr *LC = cast(LHS); - const SCEVUDivExpr *RC = cast(RHS); + return 0; + } + + case scAddExpr: + case scMulExpr: + case scSMaxExpr: + case scUMaxExpr: { + const SCEVNAryExpr *LC = cast(LHS); + const SCEVNAryExpr *RC = cast(RHS); - // Lexicographically compare udiv expressions. - long X = compare(LC->getLHS(), RC->getLHS()); + // Lexicographically compare n-ary expressions. + unsigned LNumOps = LC->getNumOperands(), RNumOps = RC->getNumOperands(); + if (LNumOps != RNumOps) + return (int)LNumOps - (int)RNumOps; + + for (unsigned i = 0; i != LNumOps; ++i) { + if (i >= RNumOps) + return 1; + long X = compare(LC->getOperand(i), RC->getOperand(i)); if (X != 0) return X; - return compare(LC->getRHS(), RC->getRHS()); } + return (int)LNumOps - (int)RNumOps; + } - case scTruncate: - case scZeroExtend: - case scSignExtend: { - const SCEVCastExpr *LC = cast(LHS); - const SCEVCastExpr *RC = cast(RHS); + case scUDivExpr: { + const SCEVUDivExpr *LC = cast(LHS); + const SCEVUDivExpr *RC = cast(RHS); - // Compare cast expressions by operand. - return compare(LC->getOperand(), RC->getOperand()); - } + // Lexicographically compare udiv expressions. + long X = compare(LC->getLHS(), RC->getLHS()); + if (X != 0) + return X; + return compare(LC->getRHS(), RC->getRHS()); + } - case scCouldNotCompute: - llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!"); - } - llvm_unreachable("Unknown SCEV kind!"); + case scTruncate: + case scZeroExtend: + case scSignExtend: { + const SCEVCastExpr *LC = cast(LHS); + const SCEVCastExpr *RC = cast(RHS); + + // Compare cast expressions by operand. + return compare(LC->getOperand(), RC->getOperand()); } - }; -} + + case scCouldNotCompute: + llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!"); + } + llvm_unreachable("Unknown SCEV kind!"); + } +}; +} // end anonymous namespace /// GroupByComplexity - Given a list of SCEV objects, order them by their /// complexity, and group objects of the same complexity together by value. From 711641834a5e07534b823880a932747d0dca174c Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Tue, 8 Dec 2015 04:32:54 +0000 Subject: [PATCH 219/364] [SCEV] Move some struct declarations inside functions; NFC Reduces the scope over which the struct is visible, making its usages obvious. I did not move structs in cases where this wasn't a clear win (the struct is too large, or is grouped in some other interesting way). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@255003 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Analysis/ScalarEvolution.cpp | 117 ++++++++++++++----------------- 1 file changed, 54 insertions(+), 63 deletions(-) diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 59c845baa289..6c8f3ba9c6e0 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -666,24 +666,22 @@ static void GroupByComplexity(SmallVectorImpl &Ops, } } -namespace { -struct FindSCEVSize { - int Size; - FindSCEVSize() : Size(0) {} - - bool follow(const SCEV *S) { - ++Size; - // Keep looking at all operands of S. - return true; - } - bool isDone() const { - return false; - } -}; -} - // Returns the size of the SCEV S. static inline int sizeOfSCEV(const SCEV *S) { + struct FindSCEVSize { + int Size; + FindSCEVSize() : Size(0) {} + + bool follow(const SCEV *S) { + ++Size; + // Keep looking at all operands of S. 
+ return true; + } + bool isDone() const { + return false; + } + }; + FindSCEVSize F; SCEVTraversal ST(F); ST.visitAll(S); @@ -1929,14 +1927,6 @@ CollectAddOperandsWithScales(DenseMap &M, return Interesting; } -namespace { - struct APIntCompare { - bool operator()(const APInt &LHS, const APInt &RHS) const { - return LHS.ult(RHS); - } - }; -} - // We're trying to construct a SCEV of type `Type' with `Ops' as operands and // `OldFlags' as can't-wrap behavior. Infer a more aggressive set of // can't-overflow flags for the operation if possible. @@ -2149,6 +2139,12 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl &Ops, if (CollectAddOperandsWithScales(M, NewOps, AccumulatedConstant, Ops.data(), Ops.size(), APInt(BitWidth, 1), *this)) { + struct APIntCompare { + bool operator()(const APInt &LHS, const APInt &RHS) const { + return LHS.ult(RHS); + } + }; + // Some interesting folding opportunity is present, so its worthwhile to // re-generate the operands list. Group the operands by constant scale, // to avoid multiplying by the same constant scale multiple times. @@ -3289,7 +3285,8 @@ const SCEV *ScalarEvolution::getCouldNotCompute() { return CouldNotCompute.get(); } -namespace { + +bool ScalarEvolution::checkValidity(const SCEV *S) const { // Helper class working with SCEVTraversal to figure out if a SCEV contains // a SCEVUnknown with null value-pointer. FindInvalidSCEVUnknown::FindOne // is set iff if find such SCEVUnknown. @@ -3311,9 +3308,7 @@ namespace { } bool isDone() const { return FindOne; } }; -} -bool ScalarEvolution::checkValidity(const SCEV *S) const { FindInvalidSCEVUnknown F; SCEVTraversal ST(F); ST.visitAll(S); @@ -8721,30 +8716,28 @@ static bool findArrayDimensionsRec(ScalarEvolution &SE, return true; } -namespace { -struct FindParameter { - bool FoundParameter; - FindParameter() : FoundParameter(false) {} - - bool follow(const SCEV *S) { - if (isa(S)) { - FoundParameter = true; - // Stop recursion: we found a parameter. - return false; - } - // Keep looking. - return true; - } - bool isDone() const { - // Stop recursion if we have found a parameter. - return FoundParameter; - } -}; -} - // Returns true when S contains at least a SCEVUnknown parameter. static inline bool containsParameters(const SCEV *S) { + struct FindParameter { + bool FoundParameter; + FindParameter() : FoundParameter(false) {} + + bool follow(const SCEV *S) { + if (isa(S)) { + FoundParameter = true; + // Stop recursion: we found a parameter. + return false; + } + // Keep looking. + return true; + } + bool isDone() const { + // Stop recursion if we have found a parameter. + return FoundParameter; + } + }; + FindParameter F; SCEVTraversal ST(F); ST.visitAll(S); @@ -9406,24 +9399,22 @@ bool ScalarEvolution::properlyDominates(const SCEV *S, const BasicBlock *BB) { return getBlockDisposition(S, BB) == ProperlyDominatesBlock; } -namespace { -// Search for a SCEV expression node within an expression tree. -// Implements SCEVTraversal::Visitor. -struct SCEVSearch { - const SCEV *Node; - bool IsFound; +bool ScalarEvolution::hasOperand(const SCEV *S, const SCEV *Op) const { + // Search for a SCEV expression node within an expression tree. + // Implements SCEVTraversal::Visitor. 
+ struct SCEVSearch { + const SCEV *Node; + bool IsFound; - SCEVSearch(const SCEV *N): Node(N), IsFound(false) {} + SCEVSearch(const SCEV *N): Node(N), IsFound(false) {} - bool follow(const SCEV *S) { - IsFound |= (S == Node); - return !IsFound; - } - bool isDone() const { return IsFound; } -}; -} + bool follow(const SCEV *S) { + IsFound |= (S == Node); + return !IsFound; + } + bool isDone() const { return IsFound; } + }; -bool ScalarEvolution::hasOperand(const SCEV *S, const SCEV *Op) const { SCEVSearch Search(Op); visitAll(S, Search); return Search.IsFound; From 2450ea130a3776fe6285f19feea9ae330e2a7ba7 Mon Sep 17 00:00:00 2001 From: Michael Zuckerman Date: Tue, 8 Dec 2015 12:00:24 +0000 Subject: [PATCH 220/364] dding test for fnstsw continue of Wrong FNSTSW size operator url: http://reviews.llvm.org/D14953 Differential Revision: http://reviews.llvm.org/D15155 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@255007 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/MC/X86/intel-syntax.s | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/MC/X86/intel-syntax.s b/test/MC/X86/intel-syntax.s index 001a26d07019..c7ec77eadfee 100644 --- a/test/MC/X86/intel-syntax.s +++ b/test/MC/X86/intel-syntax.s @@ -489,10 +489,12 @@ test [ECX], AL // CHECK: fnstsw %ax // CHECK: fnstsw %ax // CHECK: fnstsw %ax +// CHECK: fnstsw (%eax) fnstsw fnstsw AX fnstsw EAX fnstsw AL +fnstsw WORD PTR [EAX] // CHECK: faddp %st(1) // CHECK: fmulp %st(1) From 59ad77e46ed43d552c2dee1ffd8e5bd522022919 Mon Sep 17 00:00:00 2001 From: Oliver Stannard Date: Tue, 8 Dec 2015 12:16:10 +0000 Subject: [PATCH 221/364] [AArch64] Add ARMv8.2-A FP16 vector instructions ARMv8.2-A adds 16-bit floating point versions of all existing SIMD floating-point instructions. This is an optional extension, so all of these instructions require the FeatureFullFP16 subtarget feature. Note that VFP without SIMD is not a valid combination for any version of ARMv8-A, but I have ensured that these instructions all depend on both FeatureNEON and FeatureFullFP16 for consistency. The ".2h" vector type specifier is now legal (for the scalar pairwise reduction instructions), so some unrelated tests have been modified as different error messages are emitted. This is not a problem as the invalid operands are still caught. 
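For illustration only (not part of the original commit): with this patch, forms such as the following are expected to assemble once both features are enabled, e.g. via llvm-mc -triple=aarch64 -mattr=+neon,+fullfp16 (the "+fullfp16" spelling is assumed here from the FeatureFullFP16 subtarget feature; the specific register choices are arbitrary):

  fadd    v0.4h, v1.4h, v2.4h   // FP16 three-same vector add
  fcmgt   v3.8h, v4.8h, v5.8h   // FP16 vector compare greater-than
  fmaxnmp h6, v7.2h             // scalar pairwise reduction using the now-legal ".2h" specifier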
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@255010 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AArch64/AArch64InstrFormats.td | 576 ++++++++++++------ lib/Target/AArch64/AArch64InstrInfo.td | 152 +++-- lib/Target/AArch64/AArch64RegisterInfo.td | 2 +- .../AArch64/AsmParser/AArch64AsmParser.cpp | 2 + test/MC/AArch64/arm64-advsimd.s | 153 ++++- test/MC/AArch64/armv8.1a-rdma.s | 18 - test/MC/AArch64/fullfp16-diagnostics.s | 42 ++ test/MC/AArch64/fullfp16-neon-neg.s | 382 ++++++++++++ test/MC/AArch64/neon-2velem.s | 18 +- test/MC/AArch64/neon-aba-abd.s | 4 +- test/MC/AArch64/neon-across.s | 18 +- test/MC/AArch64/neon-add-pairwise.s | 6 +- test/MC/AArch64/neon-add-sub-instructions.s | 10 +- test/MC/AArch64/neon-compare-instructions.s | 62 +- test/MC/AArch64/neon-diagnostics.s | 132 ++-- test/MC/AArch64/neon-facge-facgt.s | 18 +- test/MC/AArch64/neon-frsqrt-frecp.s | 10 +- test/MC/AArch64/neon-max-min-pairwise.s | 18 +- test/MC/AArch64/neon-max-min.s | 18 +- test/MC/AArch64/neon-mla-mls-instructions.s | 10 +- test/MC/AArch64/neon-scalar-abs.s | 4 +- test/MC/AArch64/neon-scalar-by-elem-mla.s | 6 +- test/MC/AArch64/neon-scalar-by-elem-mul.s | 6 +- test/MC/AArch64/neon-scalar-cvt.s | 34 +- test/MC/AArch64/neon-scalar-fp-compare.s | 32 +- test/MC/AArch64/neon-scalar-mul.s | 4 +- test/MC/AArch64/neon-scalar-recip.s | 12 +- test/MC/AArch64/neon-scalar-reduce-pairwise.s | 7 +- test/MC/AArch64/neon-simd-misc.s | 98 ++- test/MC/AArch64/neon-simd-shift.s | 18 +- .../AArch64/fullfp16-neon-neg.txt | 382 ++++++++++++ 31 files changed, 1917 insertions(+), 337 deletions(-) create mode 100644 test/MC/AArch64/fullfp16-diagnostics.s create mode 100644 test/MC/AArch64/fullfp16-neon-neg.s create mode 100644 test/MC/Disassembler/AArch64/fullfp16-neon-neg.txt diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td index 5eef82153e39..101b0f7e1d3a 100644 --- a/lib/Target/AArch64/AArch64InstrFormats.td +++ b/lib/Target/AArch64/AArch64InstrFormats.td @@ -4315,7 +4315,7 @@ let Predicates = [HasNEON] in { //---------------------------------------------------------------------------- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDThreeSameVector size, bits<5> opcode, +class BaseSIMDThreeSameVector size, bits<5> opcode, RegisterOperand regtype, string asm, string kind, list pattern> : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, @@ -4329,8 +4329,7 @@ class BaseSIMDThreeSameVector size, bits<5> opcode, let Inst{30} = Q; let Inst{29} = U; let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21} = 1; + let Inst{23-21} = size; let Inst{20-16} = Rm; let Inst{15-11} = opcode; let Inst{10} = 1; @@ -4339,7 +4338,7 @@ class BaseSIMDThreeSameVector size, bits<5> opcode, } let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDThreeSameVectorTied size, bits<5> opcode, +class BaseSIMDThreeSameVectorTied size, bits<5> opcode, RegisterOperand regtype, string asm, string kind, list pattern> : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), asm, @@ -4353,8 +4352,7 @@ class BaseSIMDThreeSameVectorTied size, bits<5> opcode, let Inst{30} = Q; let Inst{29} = U; let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21} = 1; + let Inst{23-21} = size; let Inst{20-16} = Rm; let Inst{15-11} = opcode; let Inst{10} = 1; @@ -4365,25 +4363,25 @@ class BaseSIMDThreeSameVectorTied size, bits<5> opcode, // All operand sizes distinguished in the encoding. 
multiclass SIMDThreeSameVector opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64, asm, ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128, asm, ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; - def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64, asm, ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; - def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128, asm, ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; - def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64, asm, ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; - def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128, asm, ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; - def v2i64 : BaseSIMDThreeSameVector<1, U, 0b11, opc, V128, + def v2i64 : BaseSIMDThreeSameVector<1, U, 0b111, opc, V128, asm, ".2d", [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>; } @@ -4391,49 +4389,49 @@ multiclass SIMDThreeSameVector opc, string asm, // As above, but D sized elements unsupported. multiclass SIMDThreeSameVectorBHS opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64, asm, ".8b", [(set V64:$Rd, (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128, asm, ".16b", [(set V128:$Rd, (v16i8 (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm))))]>; - def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64, asm, ".4h", [(set V64:$Rd, (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))]>; - def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128, asm, ".8h", [(set V128:$Rd, (v8i16 (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm))))]>; - def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64, asm, ".2s", [(set V64:$Rd, (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))]>; - def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128, asm, ".4s", [(set V128:$Rd, (v4i32 (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm))))]>; } multiclass SIMDThreeSameVectorBHSTied opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVectorTied<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDThreeSameVectorTied<0, U, 0b001, opc, V64, asm, ".8b", [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b001, opc, V128, asm, ".16b", [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; - def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b01, opc, V64, + def v4i16 : 
BaseSIMDThreeSameVectorTied<0, U, 0b011, opc, V64, asm, ".4h", [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; - def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b011, opc, V128, asm, ".8h", [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; - def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b101, opc, V64, asm, ".2s", [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; - def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b101, opc, V128, asm, ".4s", [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; @@ -4442,54 +4440,80 @@ multiclass SIMDThreeSameVectorBHSTied opc, string asm, // As above, but only B sized elements supported. multiclass SIMDThreeSameVectorB opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64, asm, ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128, asm, ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; } -// As above, but only S and D sized floating point elements supported. -multiclass SIMDThreeSameVectorFP opc, +// As above, but only floating point elements supported. +multiclass SIMDThreeSameVectorFP opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64, + asm, ".4h", + [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>; + def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128, + asm, ".8h", + [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64, asm, ".2s", [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; - def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128, asm, ".4s", [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; - def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128, asm, ".2d", [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; } -multiclass SIMDThreeSameVectorFPCmp opc, +multiclass SIMDThreeSameVectorFPCmp opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64, + asm, ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>; + def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128, + asm, ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64, asm, ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; - def v4f32 : BaseSIMDThreeSameVector<1, U, 
{S,0}, opc, V128, + def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128, asm, ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; - def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128, asm, ".2d", [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; } -multiclass SIMDThreeSameVectorFPTied opc, +multiclass SIMDThreeSameVectorFPTied opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDThreeSameVectorTied<0, U, {S,0b10}, {0b00,opc}, V64, + asm, ".4h", + [(set (v4f16 V64:$dst), + (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>; + def v8f16 : BaseSIMDThreeSameVectorTied<1, U, {S,0b10}, {0b00,opc}, V128, + asm, ".8h", + [(set (v8f16 V128:$dst), + (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0b01}, {0b11,opc}, V64, asm, ".2s", [(set (v2f32 V64:$dst), (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; - def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0b01}, {0b11,opc}, V128, asm, ".4s", [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; - def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,0b11}, {0b11,opc}, V128, asm, ".2d", [(set (v2f64 V128:$dst), (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; @@ -4498,16 +4522,16 @@ multiclass SIMDThreeSameVectorFPTied opc, // As above, but D and B sized elements unsupported. multiclass SIMDThreeSameVectorHS opc, string asm, SDPatternOperator OpNode> { - def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64, asm, ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; - def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128, asm, ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; - def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64, asm, ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; - def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128, asm, ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; } @@ -4515,10 +4539,10 @@ multiclass SIMDThreeSameVectorHS opc, string asm, // Logical three vector ops share opcode bits, and only use B sized elements. 
multiclass SIMDLogicalThreeVector size, string asm, SDPatternOperator OpNode = null_frag> { - def v8i8 : BaseSIMDThreeSameVector<0, U, size, 0b00011, V64, + def v8i8 : BaseSIMDThreeSameVector<0, U, {size,1}, 0b00011, V64, asm, ".8b", [(set (v8i8 V64:$Rd), (OpNode V64:$Rn, V64:$Rm))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, size, 0b00011, V128, + def v16i8 : BaseSIMDThreeSameVector<1, U, {size,1}, 0b00011, V128, asm, ".16b", [(set (v16i8 V128:$Rd), (OpNode V128:$Rn, V128:$Rm))]>; @@ -4539,11 +4563,11 @@ multiclass SIMDLogicalThreeVector size, string asm, multiclass SIMDLogicalThreeVectorTied size, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVectorTied<0, U, size, 0b00011, V64, + def v8i8 : BaseSIMDThreeSameVectorTied<0, U, {size,1}, 0b00011, V64, asm, ".8b", [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVectorTied<1, U, size, 0b00011, V128, + def v16i8 : BaseSIMDThreeSameVectorTied<1, U, {size,1}, 0b00011, V128, asm, ".16b", [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), @@ -4583,8 +4607,8 @@ multiclass SIMDLogicalThreeVectorTied size, let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDTwoSameVector size, bits<5> opcode, - RegisterOperand regtype, string asm, string dstkind, - string srckind, list pattern> + bits<2> size2, RegisterOperand regtype, string asm, + string dstkind, string srckind, list pattern> : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "{\t$Rd" # dstkind # ", $Rn" # srckind # "|" # dstkind # "\t$Rd, $Rn}", "", pattern>, @@ -4596,7 +4620,9 @@ class BaseSIMDTwoSameVector size, bits<5> opcode, let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-22} = size; - let Inst{21-17} = 0b10000; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; @@ -4605,8 +4631,9 @@ class BaseSIMDTwoSameVector size, bits<5> opcode, let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDTwoSameVectorTied size, bits<5> opcode, - RegisterOperand regtype, string asm, string dstkind, - string srckind, list pattern> + bits<2> size2, RegisterOperand regtype, + string asm, string dstkind, string srckind, + list pattern> : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn), asm, "{\t$Rd" # dstkind # ", $Rn" # srckind # "|" # dstkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>, @@ -4618,7 +4645,9 @@ class BaseSIMDTwoSameVectorTied size, bits<5> opcode, let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-22} = size; - let Inst{21-17} = 0b10000; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; @@ -4628,22 +4657,22 @@ class BaseSIMDTwoSameVectorTied size, bits<5> opcode, // Supports B, H, and S element sizes. 
multiclass SIMDTwoVectorBHS opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; - def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64, asm, ".4h", ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; - def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128, asm, ".8h", ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; } @@ -4686,49 +4715,49 @@ multiclass SIMDVectorLShiftLongBySizeBHS { // Supports all element sizes. multiclass SIMDLongTwoVector opc, string asm, SDPatternOperator OpNode> { - def v8i8_v4i16 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + def v8i8_v4i16 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64, asm, ".4h", ".8b", [(set (v4i16 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128, asm, ".8h", ".16b", [(set (v8i16 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; - def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64, asm, ".2s", ".4h", [(set (v2i32 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; - def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128, asm, ".4s", ".8h", [(set (v4i32 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, + def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64, asm, ".1d", ".2s", [(set (v1i64 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, + def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128, asm, ".2d", ".4s", [(set (v2i64 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; } multiclass SIMDLongTwoVectorTied opc, string asm, SDPatternOperator OpNode> { - def v8i8_v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64, + def v8i8_v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, 0b00, V64, asm, ".4h", ".8b", [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v8i8 V64:$Rn)))]>; - def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128, + def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, 0b00, V128, asm, ".8h", ".16b", [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v16i8 V128:$Rn)))]>; - def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64, + def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, 0b00, V64, asm, ".2s", ".4h", [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v4i16 V64:$Rn)))]>; - def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128, + def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 
0b01, opc, 0b00, V128, asm, ".4s", ".8h", [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v8i16 V128:$Rn)))]>; - def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64, + def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, 0b00, V64, asm, ".1d", ".2s", [(set (v1i64 V64:$dst), (OpNode (v1i64 V64:$Rd), (v2i32 V64:$Rn)))]>; - def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128, + def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, 0b00, V128, asm, ".2d", ".4s", [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v4i32 V128:$Rn)))]>; @@ -4737,50 +4766,50 @@ multiclass SIMDLongTwoVectorTied opc, string asm, // Supports all element sizes, except 1xD. multiclass SIMDTwoVectorBHSDTied opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>; - def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, 0b00, V64, asm, ".4h", ".4h", [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn)))]>; - def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, 0b00, V128, asm, ".8h", ".8h", [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn)))]>; - def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>; - def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, V128, + def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn)))]>; } multiclass SIMDTwoVectorBHSD opc, string asm, SDPatternOperator OpNode = null_frag> { - def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; - def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64, asm, ".4h", ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; - def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128, asm, ".8h", ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$Rd), 
(OpNode (v4i32 V128:$Rn)))]>; - def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, V128, + def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>; } @@ -4789,10 +4818,10 @@ multiclass SIMDTwoVectorBHSD opc, string asm, // Supports only B element sizes. multiclass SIMDTwoVectorB size, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVector<0, U, size, opc, V64, + def v8i8 : BaseSIMDTwoSameVector<0, U, size, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, V128, + def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; @@ -4801,16 +4830,16 @@ multiclass SIMDTwoVectorB size, bits<5> opc, string asm, // Supports only B and H element sizes. multiclass SIMDTwoVectorBH opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$Rd), (OpNode V64:$Rn))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$Rd), (OpNode V128:$Rn))]>; - def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64, asm, ".4h", ".4h", [(set (v4i16 V64:$Rd), (OpNode V64:$Rn))]>; - def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128, asm, ".8h", ".8h", [(set (v8i16 V128:$Rd), (OpNode V128:$Rn))]>; } @@ -4819,13 +4848,21 @@ multiclass SIMDTwoVectorBH opc, string asm, // as an extra opcode bit. multiclass SIMDTwoVectorFP opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64, + asm, ".4h", ".4h", + [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn)))]>; + def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128, + asm, ".8h", ".8h", + [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>; - def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>; - def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; } @@ -4833,10 +4870,10 @@ multiclass SIMDTwoVectorFP opc, string asm, // Supports only S element size. 
multiclass SIMDTwoVectorS opc, string asm, SDPatternOperator OpNode> { - def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; } @@ -4844,26 +4881,42 @@ multiclass SIMDTwoVectorS opc, string asm, multiclass SIMDTwoVectorFPToInt opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn)))]>; + def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>; - def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>; - def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; } multiclass SIMDTwoVectorIntToFP opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64, + asm, ".4h", ".4h", + [(set (v4f16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; + def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128, + asm, ".8h", ".8h", + [(set (v8f16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; - def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>; } @@ -4942,10 +4995,10 @@ multiclass SIMDMixedTwoVector opc, string asm, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; } -class BaseSIMDCmpTwoVector size, bits<5> opcode, - RegisterOperand regtype, - string asm, string kind, string zero, - ValueType dty, ValueType sty, SDNode OpNode> +class BaseSIMDCmpTwoVector size, bits<2> size2, + bits<5> opcode, RegisterOperand regtype, string asm, + string kind, string zero, ValueType dty, + ValueType sty, SDNode OpNode> : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "{\t$Rd" # kind # ", $Rn" # kind # ", #" # zero # "|" # kind # "\t$Rd, $Rn, #" # zero # "}", "", @@ -4958,7 +5011,9 @@ class BaseSIMDCmpTwoVector size, bits<5> opcode, let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-22} = size; - let Inst{21-17} = 0b10000; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = 
Rn; @@ -4968,49 +5023,69 @@ class BaseSIMDCmpTwoVector size, bits<5> opcode, // Comparisons support all element sizes, except 1xD. multiclass SIMDCmpTwoVector opc, string asm, SDNode OpNode> { - def v8i8rz : BaseSIMDCmpTwoVector<0, U, 0b00, opc, V64, + def v8i8rz : BaseSIMDCmpTwoVector<0, U, 0b00, 0b00, opc, V64, asm, ".8b", "0", v8i8, v8i8, OpNode>; - def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, opc, V128, + def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, 0b00, opc, V128, asm, ".16b", "0", v16i8, v16i8, OpNode>; - def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, opc, V64, + def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, 0b00, opc, V64, asm, ".4h", "0", v4i16, v4i16, OpNode>; - def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, opc, V128, + def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, 0b00, opc, V128, asm, ".8h", "0", v8i16, v8i16, OpNode>; - def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, opc, V64, + def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, 0b00, opc, V64, asm, ".2s", "0", v2i32, v2i32, OpNode>; - def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, opc, V128, + def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, 0b00, opc, V128, asm, ".4s", "0", v4i32, v4i32, OpNode>; - def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, opc, V128, + def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, 0b00, opc, V128, asm, ".2d", "0", v2i64, v2i64, OpNode>; } -// FP Comparisons support only S and D element sizes. +// FP Comparisons support only S and D element sizes (and H for v8.2a). multiclass SIMDFPCmpTwoVector opc, string asm, SDNode OpNode> { - def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16rz : BaseSIMDCmpTwoVector<0, U, {S,1}, 0b11, opc, V64, + asm, ".4h", "0.0", + v4i16, v4f16, OpNode>; + def v8i16rz : BaseSIMDCmpTwoVector<1, U, {S,1}, 0b11, opc, V128, + asm, ".8h", "0.0", + v8i16, v8f16, OpNode>; + } // Predicates = [HasNEON, HasFullFP16] + def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, 0b00, opc, V64, asm, ".2s", "0.0", v2i32, v2f32, OpNode>; - def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, opc, V128, + def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, 0b00, opc, V128, asm, ".4s", "0.0", v4i32, v4f32, OpNode>; - def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, opc, V128, + def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, 0b00, opc, V128, asm, ".2d", "0.0", v2i64, v2f64, OpNode>; + let Predicates = [HasNEON, HasFullFP16] in { + def : InstAlias(NAME # v4i16rz) V64:$Vd, V64:$Vn), 0>; + def : InstAlias(NAME # v8i16rz) V128:$Vd, V128:$Vn), 0>; + } def : InstAlias(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>; def : InstAlias(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>; def : InstAlias(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>; + let Predicates = [HasNEON, HasFullFP16] in { + def : InstAlias(NAME # v4i16rz) V64:$Vd, V64:$Vn), 0>; + def : InstAlias(NAME # v8i16rz) V128:$Vd, V128:$Vn), 0>; + } def : InstAlias(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>; def : InstAliasopc, string asm, //---------------------------------------------------------------------------- let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDThreeScalar size, bits<5> opcode, +class BaseSIMDThreeScalar size, bits<5> opcode, RegisterClass regtype, string asm, list pattern> : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, @@ -5573,8 +5648,7 @@ class BaseSIMDThreeScalar size, bits<5> opcode, let Inst{31-30} = 0b01; let Inst{29} = U; let Inst{28-24} = 0b11110; - let Inst{23-22} = size; - let Inst{21} = 1; + let Inst{23-21} = size; let Inst{20-16} = Rm; let Inst{15-11} = opcode; 
let Inst{10} = 1; @@ -5605,17 +5679,17 @@ class BaseSIMDThreeScalarTied size, bit R, bits<5> opcode, multiclass SIMDThreeScalarD opc, string asm, SDPatternOperator OpNode> { - def v1i64 : BaseSIMDThreeScalar; } multiclass SIMDThreeScalarBHSD opc, string asm, SDPatternOperator OpNode> { - def v1i64 : BaseSIMDThreeScalar; - def v1i32 : BaseSIMDThreeScalar; - def v1i16 : BaseSIMDThreeScalar; - def v1i8 : BaseSIMDThreeScalar; + def v1i32 : BaseSIMDThreeScalar; + def v1i16 : BaseSIMDThreeScalar; + def v1i8 : BaseSIMDThreeScalar; def : Pat<(i64 (OpNode (i64 FPR64:$Rn), (i64 FPR64:$Rm))), (!cast(NAME#"v1i64") FPR64:$Rn, FPR64:$Rm)>; @@ -5625,9 +5699,9 @@ multiclass SIMDThreeScalarBHSD opc, string asm, multiclass SIMDThreeScalarHS opc, string asm, SDPatternOperator OpNode> { - def v1i32 : BaseSIMDThreeScalar; - def v1i16 : BaseSIMDThreeScalar; + def v1i16 : BaseSIMDThreeScalar; } multiclass SIMDThreeScalarHSTied opc, string asm, @@ -5640,26 +5714,34 @@ multiclass SIMDThreeScalarHSTied opc, string asm, asm, []>; } -multiclass SIMDThreeScalarSD opc, string asm, +multiclass SIMDFPThreeScalar opc, string asm, SDPatternOperator OpNode = null_frag> { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def #NAME#64 : BaseSIMDThreeScalar; - def #NAME#32 : BaseSIMDThreeScalar; + let Predicates = [HasNEON, HasFullFP16] in { + def #NAME#16 : BaseSIMDThreeScalar; + } // Predicates = [HasNEON, HasFullFP16] } def : Pat<(v1f64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (!cast(NAME # "64") FPR64:$Rn, FPR64:$Rm)>; } -multiclass SIMDThreeScalarFPCmp opc, string asm, +multiclass SIMDThreeScalarFPCmp opc, string asm, SDPatternOperator OpNode = null_frag> { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def #NAME#64 : BaseSIMDThreeScalar; - def #NAME#32 : BaseSIMDThreeScalar; + let Predicates = [HasNEON, HasFullFP16] in { + def #NAME#16 : BaseSIMDThreeScalar; + } // Predicates = [HasNEON, HasFullFP16] } def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), @@ -5718,7 +5800,7 @@ multiclass SIMDThreeScalarMixedTiedHS opc, string asm, //---------------------------------------------------------------------------- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDTwoScalar size, bits<5> opcode, +class BaseSIMDTwoScalar size, bits<2> size2, bits<5> opcode, RegisterClass regtype, RegisterClass regtype2, string asm, list pat> : I<(outs regtype:$Rd), (ins regtype2:$Rn), asm, @@ -5730,7 +5812,9 @@ class BaseSIMDTwoScalar size, bits<5> opcode, let Inst{29} = U; let Inst{28-24} = 0b11110; let Inst{23-22} = size; - let Inst{21-17} = 0b10000; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; @@ -5759,7 +5843,7 @@ class BaseSIMDTwoScalarTied size, bits<5> opcode, let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDCmpTwoScalar size, bits<5> opcode, +class BaseSIMDCmpTwoScalar size, bits<2> size2, bits<5> opcode, RegisterClass regtype, string asm, string zero> : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn, #" # zero, "", []>, @@ -5770,7 +5854,9 @@ class BaseSIMDCmpTwoScalar size, bits<5> opcode, let Inst{29} = U; let Inst{28-24} = 0b11110; let Inst{23-22} = size; - let Inst{21-17} = 0b10000; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; @@ -5792,7 +5878,7 @@ class SIMDInexactCvtTwoScalar opcode, string asm> multiclass SIMDCmpTwoScalarD opc, string asm, 
SDPatternOperator OpNode> { - def v1i64rz : BaseSIMDCmpTwoScalar; + def v1i64rz : BaseSIMDCmpTwoScalar; def : Pat<(v1i64 (OpNode FPR64:$Rn)), (!cast(NAME # v1i64rz) FPR64:$Rn)>; @@ -5800,13 +5886,20 @@ multiclass SIMDCmpTwoScalarD opc, string asm, multiclass SIMDFPCmpTwoScalar opc, string asm, SDPatternOperator OpNode> { - def v1i64rz : BaseSIMDCmpTwoScalar; - def v1i32rz : BaseSIMDCmpTwoScalar; + def v1i64rz : BaseSIMDCmpTwoScalar; + def v1i32rz : BaseSIMDCmpTwoScalar; + let Predicates = [HasNEON, HasFullFP16] in { + def v1i16rz : BaseSIMDCmpTwoScalar; + } def : InstAlias(NAME # v1i64rz) FPR64:$Rd, FPR64:$Rn), 0>; def : InstAlias(NAME # v1i32rz) FPR32:$Rd, FPR32:$Rn), 0>; + let Predicates = [HasNEON, HasFullFP16] in { + def : InstAlias(NAME # v1i16rz) FPR16:$Rd, FPR16:$Rn), 0>; + } def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn))), (!cast(NAME # v1i64rz) FPR64:$Rn)>; @@ -5814,7 +5907,7 @@ multiclass SIMDFPCmpTwoScalar opc, string asm, multiclass SIMDTwoScalarD opc, string asm, SDPatternOperator OpNode = null_frag> { - def v1i64 : BaseSIMDTwoScalar; def : Pat<(i64 (OpNode (i64 FPR64:$Rn))), @@ -5822,27 +5915,34 @@ multiclass SIMDTwoScalarD opc, string asm, } multiclass SIMDFPTwoScalar opc, string asm> { - def v1i64 : BaseSIMDTwoScalar; - def v1i32 : BaseSIMDTwoScalar; + def v1i64 : BaseSIMDTwoScalar; + def v1i32 : BaseSIMDTwoScalar; + let Predicates = [HasNEON, HasFullFP16] in { + def v1f16 : BaseSIMDTwoScalar; + } } -multiclass SIMDTwoScalarCVTSD opc, string asm, +multiclass SIMDFPTwoScalarCVT opc, string asm, SDPatternOperator OpNode> { - def v1i64 : BaseSIMDTwoScalar; - def v1i32 : BaseSIMDTwoScalar; + let Predicates = [HasNEON, HasFullFP16] in { + def v1i16 : BaseSIMDTwoScalar; + } } multiclass SIMDTwoScalarBHSD opc, string asm, SDPatternOperator OpNode = null_frag> { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def v1i64 : BaseSIMDTwoScalar; - def v1i32 : BaseSIMDTwoScalar; - def v1i16 : BaseSIMDTwoScalar; - def v1i8 : BaseSIMDTwoScalar; + def v1i16 : BaseSIMDTwoScalar; + def v1i8 : BaseSIMDTwoScalar; } def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn))), @@ -5869,10 +5969,10 @@ multiclass SIMDTwoScalarBHSDTied opc, string asm, let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in multiclass SIMDTwoScalarMixedBHS opc, string asm, SDPatternOperator OpNode = null_frag> { - def v1i32 : BaseSIMDTwoScalar; - def v1i16 : BaseSIMDTwoScalar; - def v1i8 : BaseSIMDTwoScalar; + def v1i16 : BaseSIMDTwoScalar; + def v1i8 : BaseSIMDTwoScalar; } //---------------------------------------------------------------------------- @@ -5904,10 +6004,14 @@ multiclass SIMDPairwiseScalarD opc, string asm> { asm, ".2d">; } -multiclass SIMDFPPairwiseScalar opc, string asm> { - def v2i32p : BaseSIMDPairwiseScalar opc, string asm> { + let Predicates = [HasNEON, HasFullFP16] in { + def v2i16p : BaseSIMDPairwiseScalar<0, {S,0}, opc, FPR16Op, V64, + asm, ".2h">; + } + def v2i32p : BaseSIMDPairwiseScalar<1, {S,0}, opc, FPR32Op, V64, asm, ".2s">; - def v2i64p : BaseSIMDPairwiseScalar; } @@ -5963,8 +6067,16 @@ multiclass SIMDAcrossLanesHSD opcode, string asm> { asm, ".4s", []>; } -multiclass SIMDAcrossLanesS opcode, bit sz1, string asm, +multiclass SIMDFPAcrossLanes opcode, bit sz1, string asm, Intrinsic intOp> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16v : BaseSIMDAcrossLanes<0, 0, {sz1, 0}, opcode, FPR16, V64, + asm, ".4h", + [(set FPR16:$Rd, (intOp (v4f16 V64:$Rn)))]>; + def v8i16v : BaseSIMDAcrossLanes<1, 0, {sz1, 0}, opcode, FPR16, V128, + asm, ".8h", + [(set FPR16:$Rd, (intOp (v8f16 V128:$Rn)))]>; + 
} // Predicates = [HasNEON, HasFullFP16] def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128, asm, ".4s", [(set FPR32:$Rd, (intOp (v4f32 V128:$Rn)))]>; @@ -6451,7 +6563,7 @@ multiclass SIMDScalarCPY { // AdvSIMD modified immediate instructions //---------------------------------------------------------------------------- -class BaseSIMDModifiedImm pattern> : I, @@ -6463,16 +6575,17 @@ class BaseSIMDModifiedImm pattern> - : BaseSIMDModifiedImm pattern> - : BaseSIMDModifiedImm b15_b12, RegisterOperand vectype, string asm, string kind, list pattern> - : BaseSIMDModifiedImmVector { bits<2> shift; @@ -6520,7 +6633,7 @@ class BaseSIMDModifiedImmVectorShiftTied b15_b12, class BaseSIMDModifiedImmVectorShiftHalf b15_b12, RegisterOperand vectype, string asm, string kind, list pattern> - : BaseSIMDModifiedImmVector { bits<2> shift; @@ -6585,7 +6698,7 @@ multiclass SIMDModifiedImmVectorShiftTied hw_cmode, class SIMDModifiedImmMoveMSL cmode, RegisterOperand vectype, string asm, string kind, list pattern> - : BaseSIMDModifiedImmVector { bits<1> shift; @@ -6593,18 +6706,18 @@ class SIMDModifiedImmMoveMSL cmode, let Inst{12} = shift; } -class SIMDModifiedImmVectorNoShift cmode, +class SIMDModifiedImmVectorNoShift cmode, RegisterOperand vectype, Operand imm_type, string asm, string kind, list pattern> - : BaseSIMDModifiedImmVector { let Inst{15-12} = cmode; } class SIMDModifiedImmScalarNoShift cmode, string asm, list pattern> - : BaseSIMDModifiedImm { let Inst{15-12} = cmode; let DecoderMethod = "DecodeModImmInstruction"; @@ -6676,6 +6789,34 @@ class BaseSIMDIndexedTied size, bits<4> opc, multiclass SIMDFPIndexed opc, string asm, SDPatternOperator OpNode> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b00, opc, + V64, V64, + V128_lo, VectorIndexH, + asm, ".4h", ".4h", ".4h", ".h", + [(set (v4f16 V64:$Rd), + (OpNode (v4f16 V64:$Rn), + (v4f16 (AArch64duplane16 (v8f16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b00, opc, + V128, V128, + V128_lo, VectorIndexH, + asm, ".8h", ".8h", ".8h", ".h", + [(set (v8f16 V128:$Rd), + (OpNode (v8f16 V128:$Rn), + (v8f16 (AArch64duplane16 (v8f16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + } // Predicates = [HasNEON, HasFullFP16] + def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, V64, V64, V128, VectorIndexS, @@ -6712,6 +6853,21 @@ multiclass SIMDFPIndexed opc, string asm, let Inst{21} = 0; } + let Predicates = [HasNEON, HasFullFP16] in { + def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b00, opc, + FPR16Op, FPR16Op, V128, VectorIndexH, + asm, ".h", "", "", ".h", + [(set (f16 FPR16Op:$Rd), + (OpNode (f16 FPR16Op:$Rn), + (f16 (vector_extract (v8f16 V128:$Rm), + VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + } // Predicates = [HasNEON, HasFullFP16] + def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, FPR32Op, FPR32Op, V128, VectorIndexS, asm, ".s", "", "", ".s", @@ -6790,6 +6946,27 @@ multiclass SIMDFPIndexedTiedPatterns { } multiclass SIMDFPIndexedTied opc, string asm> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b00, opc, V64, V64, + V128_lo, VectorIndexH, + asm, ".4h", ".4h", ".4h", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let 
Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b00, opc, + V128, V128, + V128_lo, VectorIndexH, + asm, ".8h", ".8h", ".8h", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + } // Predicates = [HasNEON, HasFullFP16] + def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, V64, V64, V128, VectorIndexS, asm, ".2s", ".2s", ".2s", ".s", []> { @@ -6816,6 +6993,16 @@ multiclass SIMDFPIndexedTied opc, string asm> { let Inst{21} = 0; } + let Predicates = [HasNEON, HasFullFP16] in { + def v1i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b00, opc, + FPR16Op, FPR16Op, V128, VectorIndexH, + asm, ".h", "", "", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + } // Predicates = [HasNEON, HasFullFP16] def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc, FPR32Op, FPR32Op, V128, VectorIndexS, @@ -7353,7 +7540,13 @@ class BaseSIMDScalarShiftTied opc, bits<7> fixed_imm, } -multiclass SIMDScalarRShiftSD opc, string asm> { +multiclass SIMDFPScalarRShift opc, string asm> { + let Predicates = [HasNEON, HasFullFP16] in { + def h : BaseSIMDScalarShift { + let Inst{19-16} = imm{3-0}; + } + } // Predicates = [HasNEON, HasFullFP16] def s : BaseSIMDScalarShift { let Inst{20-16} = imm{4-0}; @@ -7533,6 +7726,23 @@ class BaseSIMDVectorShiftTied opc, bits<7> fixed_imm, multiclass SIMDVectorRShiftSD opc, string asm, Intrinsic OpNode> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftR16, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (i32 imm:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftR16, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (i32 imm:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + } // Predicates = [HasNEON, HasFullFP16] def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, V64, V64, vecshiftR32, asm, ".2s", ".2s", @@ -7558,8 +7768,26 @@ multiclass SIMDVectorRShiftSD opc, string asm, } } -multiclass SIMDVectorRShiftSDToFP opc, string asm, +multiclass SIMDVectorRShiftToFP opc, string asm, Intrinsic OpNode> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftR16, + asm, ".4h", ".4h", + [(set (v4f16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (i32 imm:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftR16, + asm, ".8h", ".8h", + [(set (v8f16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (i32 imm:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + } // Predicates = [HasNEON, HasFullFP16] + def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, V64, V64, vecshiftR32, asm, ".2s", ".2s", @@ -8840,9 +9068,8 @@ let Predicates = [HasNEON, HasV8_1a] in { class BaseSIMDThreeSameVectorTiedR0 size, bits<5> opcode, RegisterOperand regtype, string asm, string kind, list pattern> - : BaseSIMDThreeSameVectorTied { - let Inst{21}=0; } multiclass SIMDThreeSameVectorSQRDMLxHTiedHS opc, string asm, SDPatternOperator Accum> { @@ -9277,6 +9504,7 @@ def : TokenAlias<".8H", ".8h">; def : TokenAlias<".4S", ".4s">; def : TokenAlias<".2D", ".2d">; def : TokenAlias<".1Q", ".1q">; +def : TokenAlias<".2H", ".2h">; def : 
TokenAlias<".B", ".b">; def : TokenAlias<".H", ".h">; def : TokenAlias<".S", ".s">; diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index 881f55ebeef9..cfb0c1b578da 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -2857,29 +2857,29 @@ defm CMGT : SIMDThreeSameVector<0, 0b00110, "cmgt", AArch64cmgt>; defm CMHI : SIMDThreeSameVector<1, 0b00110, "cmhi", AArch64cmhi>; defm CMHS : SIMDThreeSameVector<1, 0b00111, "cmhs", AArch64cmhs>; defm CMTST : SIMDThreeSameVector<0, 0b10001, "cmtst", AArch64cmtst>; -defm FABD : SIMDThreeSameVectorFP<1,1,0b11010,"fabd", int_aarch64_neon_fabd>; -defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b11101,"facge",int_aarch64_neon_facge>; -defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b11101,"facgt",int_aarch64_neon_facgt>; -defm FADDP : SIMDThreeSameVectorFP<1,0,0b11010,"faddp",int_aarch64_neon_addp>; -defm FADD : SIMDThreeSameVectorFP<0,0,0b11010,"fadd", fadd>; -defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b11100, "fcmeq", AArch64fcmeq>; -defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b11100, "fcmge", AArch64fcmge>; -defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b11100, "fcmgt", AArch64fcmgt>; -defm FDIV : SIMDThreeSameVectorFP<1,0,0b11111,"fdiv", fdiv>; -defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b11000,"fmaxnmp", int_aarch64_neon_fmaxnmp>; -defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b11000,"fmaxnm", fmaxnum>; -defm FMAXP : SIMDThreeSameVectorFP<1,0,0b11110,"fmaxp", int_aarch64_neon_fmaxp>; -defm FMAX : SIMDThreeSameVectorFP<0,0,0b11110,"fmax", fmaxnan>; -defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b11000,"fminnmp", int_aarch64_neon_fminnmp>; -defm FMINNM : SIMDThreeSameVectorFP<0,1,0b11000,"fminnm", fminnum>; -defm FMINP : SIMDThreeSameVectorFP<1,1,0b11110,"fminp", int_aarch64_neon_fminp>; -defm FMIN : SIMDThreeSameVectorFP<0,1,0b11110,"fmin", fminnan>; +defm FABD : SIMDThreeSameVectorFP<1,1,0b010,"fabd", int_aarch64_neon_fabd>; +defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b101,"facge",int_aarch64_neon_facge>; +defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b101,"facgt",int_aarch64_neon_facgt>; +defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_addp>; +defm FADD : SIMDThreeSameVectorFP<0,0,0b010,"fadd", fadd>; +defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>; +defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>; +defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>; +defm FDIV : SIMDThreeSameVectorFP<1,0,0b111,"fdiv", fdiv>; +defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b000,"fmaxnmp", int_aarch64_neon_fmaxnmp>; +defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b000,"fmaxnm", fmaxnum>; +defm FMAXP : SIMDThreeSameVectorFP<1,0,0b110,"fmaxp", int_aarch64_neon_fmaxp>; +defm FMAX : SIMDThreeSameVectorFP<0,0,0b110,"fmax", fmaxnan>; +defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b000,"fminnmp", int_aarch64_neon_fminnmp>; +defm FMINNM : SIMDThreeSameVectorFP<0,1,0b000,"fminnm", fminnum>; +defm FMINP : SIMDThreeSameVectorFP<1,1,0b110,"fminp", int_aarch64_neon_fminp>; +defm FMIN : SIMDThreeSameVectorFP<0,1,0b110,"fmin", fminnan>; // NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the // instruction expects the addend first, while the fma intrinsic puts it last. 
-defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b11001, "fmla", +defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b001, "fmla", TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >; -defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b11001, "fmls", +defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b001, "fmls", TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; // The following def pats catch the case where the LHS of an FMA is negated. @@ -2893,11 +2893,11 @@ def : Pat<(v4f32 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)), def : Pat<(v2f64 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)), (FMLSv2f64 V128:$Rd, V128:$Rn, V128:$Rm)>; -defm FMULX : SIMDThreeSameVectorFP<0,0,0b11011,"fmulx", int_aarch64_neon_fmulx>; -defm FMUL : SIMDThreeSameVectorFP<1,0,0b11011,"fmul", fmul>; -defm FRECPS : SIMDThreeSameVectorFP<0,0,0b11111,"frecps", int_aarch64_neon_frecps>; -defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b11111,"frsqrts", int_aarch64_neon_frsqrts>; -defm FSUB : SIMDThreeSameVectorFP<0,1,0b11010,"fsub", fsub>; +defm FMULX : SIMDThreeSameVectorFP<0,0,0b011,"fmulx", int_aarch64_neon_fmulx>; +defm FMUL : SIMDThreeSameVectorFP<1,0,0b011,"fmul", fmul>; +defm FRECPS : SIMDThreeSameVectorFP<0,0,0b111,"frecps", int_aarch64_neon_frecps>; +defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b111,"frsqrts", int_aarch64_neon_frsqrts>; +defm FSUB : SIMDThreeSameVectorFP<0,1,0b010,"fsub", fsub>; defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >; defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", @@ -3081,6 +3081,14 @@ def : InstAlias<"{cmlt\t$dst.2d, $src1.2d, $src2.2d" # "|cmlt.2d\t$dst, $src1, $src2}", (CMGTv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; +let Predicates = [HasNEON, HasFullFP16] in { +def : InstAlias<"{fcmle\t$dst.4h, $src1.4h, $src2.4h" # + "|fcmle.4h\t$dst, $src1, $src2}", + (FCMGEv4f16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{fcmle\t$dst.8h, $src1.8h, $src2.8h" # + "|fcmle.8h\t$dst, $src1, $src2}", + (FCMGEv8f16 V128:$dst, V128:$src2, V128:$src1), 0>; +} def : InstAlias<"{fcmle\t$dst.2s, $src1.2s, $src2.2s" # "|fcmle.2s\t$dst, $src1, $src2}", (FCMGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; @@ -3091,6 +3099,14 @@ def : InstAlias<"{fcmle\t$dst.2d, $src1.2d, $src2.2d" # "|fcmle.2d\t$dst, $src1, $src2}", (FCMGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; +let Predicates = [HasNEON, HasFullFP16] in { +def : InstAlias<"{fcmlt\t$dst.4h, $src1.4h, $src2.4h" # + "|fcmlt.4h\t$dst, $src1, $src2}", + (FCMGTv4f16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{fcmlt\t$dst.8h, $src1.8h, $src2.8h" # + "|fcmlt.8h\t$dst, $src1, $src2}", + (FCMGTv8f16 V128:$dst, V128:$src2, V128:$src1), 0>; +} def : InstAlias<"{fcmlt\t$dst.2s, $src1.2s, $src2.2s" # "|fcmlt.2s\t$dst, $src1, $src2}", (FCMGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; @@ -3101,6 +3117,14 @@ def : InstAlias<"{fcmlt\t$dst.2d, $src1.2d, $src2.2d" # "|fcmlt.2d\t$dst, $src1, $src2}", (FCMGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; +let Predicates = [HasNEON, HasFullFP16] in { +def : InstAlias<"{facle\t$dst.4h, $src1.4h, $src2.4h" # + "|facle.4h\t$dst, $src1, $src2}", + (FACGEv4f16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{facle\t$dst.8h, $src1.8h, $src2.8h" # + "|facle.8h\t$dst, $src1, $src2}", + (FACGEv8f16 V128:$dst, V128:$src2, V128:$src1), 0>; +} def : InstAlias<"{facle\t$dst.2s, $src1.2s, $src2.2s" # "|facle.2s\t$dst, $src1, $src2}", (FACGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; @@ -3111,6 +3135,14 @@ def : InstAlias<"{facle\t$dst.2d, $src1.2d, $src2.2d" # 
"|facle.2d\t$dst, $src1, $src2}", (FACGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; +let Predicates = [HasNEON, HasFullFP16] in { +def : InstAlias<"{faclt\t$dst.4h, $src1.4h, $src2.4h" # + "|faclt.4h\t$dst, $src1, $src2}", + (FACGTv4f16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{faclt\t$dst.8h, $src1.8h, $src2.8h" # + "|faclt.8h\t$dst, $src1, $src2}", + (FACGTv8f16 V128:$dst, V128:$src2, V128:$src1), 0>; +} def : InstAlias<"{faclt\t$dst.2s, $src1.2s, $src2.2s" # "|faclt.2s\t$dst, $src1, $src2}", (FACGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; @@ -3132,19 +3164,19 @@ defm CMGT : SIMDThreeScalarD<0, 0b00110, "cmgt", AArch64cmgt>; defm CMHI : SIMDThreeScalarD<1, 0b00110, "cmhi", AArch64cmhi>; defm CMHS : SIMDThreeScalarD<1, 0b00111, "cmhs", AArch64cmhs>; defm CMTST : SIMDThreeScalarD<0, 0b10001, "cmtst", AArch64cmtst>; -defm FABD : SIMDThreeScalarSD<1, 1, 0b11010, "fabd", int_aarch64_sisd_fabd>; +defm FABD : SIMDFPThreeScalar<1, 1, 0b010, "fabd", int_aarch64_sisd_fabd>; def : Pat<(v1f64 (int_aarch64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FABD64 FPR64:$Rn, FPR64:$Rm)>; -defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b11101, "facge", +defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b101, "facge", int_aarch64_neon_facge>; -defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b11101, "facgt", +defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b101, "facgt", int_aarch64_neon_facgt>; -defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b11100, "fcmeq", AArch64fcmeq>; -defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b11100, "fcmge", AArch64fcmge>; -defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b11100, "fcmgt", AArch64fcmgt>; -defm FMULX : SIMDThreeScalarSD<0, 0, 0b11011, "fmulx", int_aarch64_neon_fmulx>; -defm FRECPS : SIMDThreeScalarSD<0, 0, 0b11111, "frecps", int_aarch64_neon_frecps>; -defm FRSQRTS : SIMDThreeScalarSD<0, 1, 0b11111, "frsqrts", int_aarch64_neon_frsqrts>; +defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>; +defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>; +defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>; +defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx>; +defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps>; +defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts>; defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>; defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>; defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>; @@ -3248,14 +3280,14 @@ defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx">; defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte">; defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg", UnOpFrag<(sub immAllZerosV, node:$LHS)> >; -defm SCVTF : SIMDTwoScalarCVTSD< 0, 0, 0b11101, "scvtf", AArch64sitof>; +defm SCVTF : SIMDFPTwoScalarCVT< 0, 0, 0b11101, "scvtf", AArch64sitof>; defm SQABS : SIMDTwoScalarBHSD< 0, 0b00111, "sqabs", int_aarch64_neon_sqabs>; defm SQNEG : SIMDTwoScalarBHSD< 1, 0b00111, "sqneg", int_aarch64_neon_sqneg>; defm SQXTN : SIMDTwoScalarMixedBHS< 0, 0b10100, "sqxtn", int_aarch64_neon_scalar_sqxtn>; defm SQXTUN : SIMDTwoScalarMixedBHS< 1, 0b10010, "sqxtun", int_aarch64_neon_scalar_sqxtun>; defm SUQADD : SIMDTwoScalarBHSDTied< 0, 0b00011, "suqadd", int_aarch64_neon_suqadd>; -defm UCVTF : SIMDTwoScalarCVTSD< 1, 0, 0b11101, "ucvtf", AArch64uitof>; +defm UCVTF : SIMDFPTwoScalarCVT< 1, 0, 0b11101, "ucvtf", AArch64uitof>; defm UQXTN : 
SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_aarch64_neon_scalar_uqxtn>; defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd", int_aarch64_neon_usqadd>; @@ -3620,11 +3652,11 @@ defm CPY : SIMDScalarCPY<"cpy">; //---------------------------------------------------------------------------- defm ADDP : SIMDPairwiseScalarD<0, 0b11011, "addp">; -defm FADDP : SIMDFPPairwiseScalar<1, 0, 0b01101, "faddp">; -defm FMAXNMP : SIMDFPPairwiseScalar<1, 0, 0b01100, "fmaxnmp">; -defm FMAXP : SIMDFPPairwiseScalar<1, 0, 0b01111, "fmaxp">; -defm FMINNMP : SIMDFPPairwiseScalar<1, 1, 0b01100, "fminnmp">; -defm FMINP : SIMDFPPairwiseScalar<1, 1, 0b01111, "fminp">; +defm FADDP : SIMDFPPairwiseScalar<0, 0b01101, "faddp">; +defm FMAXNMP : SIMDFPPairwiseScalar<0, 0b01100, "fmaxnmp">; +defm FMAXP : SIMDFPPairwiseScalar<0, 0b01111, "fmaxp">; +defm FMINNMP : SIMDFPPairwiseScalar<1, 0b01100, "fminnmp">; +defm FMINP : SIMDFPPairwiseScalar<1, 0b01111, "fminp">; def : Pat<(v2i64 (AArch64saddv V128:$Rn)), (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (ADDPv2i64p V128:$Rn), dsub)>; def : Pat<(v2i64 (AArch64uaddv V128:$Rn)), @@ -3976,10 +4008,10 @@ defm UMAXV : SIMDAcrossLanesBHS<1, 0b01010, "umaxv">; defm UMINV : SIMDAcrossLanesBHS<1, 0b11010, "uminv">; defm SADDLV : SIMDAcrossLanesHSD<0, 0b00011, "saddlv">; defm UADDLV : SIMDAcrossLanesHSD<1, 0b00011, "uaddlv">; -defm FMAXNMV : SIMDAcrossLanesS<0b01100, 0, "fmaxnmv", int_aarch64_neon_fmaxnmv>; -defm FMAXV : SIMDAcrossLanesS<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>; -defm FMINNMV : SIMDAcrossLanesS<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>; -defm FMINV : SIMDAcrossLanesS<0b01111, 1, "fminv", int_aarch64_neon_fminv>; +defm FMAXNMV : SIMDFPAcrossLanes<0b01100, 0, "fmaxnmv", int_aarch64_neon_fmaxnmv>; +defm FMAXV : SIMDFPAcrossLanes<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>; +defm FMINNMV : SIMDFPAcrossLanes<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>; +defm FMINV : SIMDFPAcrossLanes<0b01111, 1, "fminv", int_aarch64_neon_fminv>; // Patterns for across-vector intrinsics, that have a node equivalent, that // returns a vector (with only the low lane defined) instead of a scalar. 
@@ -4226,15 +4258,23 @@ def : InstAlias<"orr.2s $Vd, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; def : InstAlias<"orr.4s $Vd, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; // AdvSIMD FMOV -def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1111, V128, fpimm8, +def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1111, V128, fpimm8, "fmov", ".2d", [(set (v2f64 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>; -def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1111, V64, fpimm8, +def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0, 0b1111, V64, fpimm8, "fmov", ".2s", [(set (v2f32 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>; -def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1111, V128, fpimm8, +def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1111, V128, fpimm8, "fmov", ".4s", [(set (v4f32 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>; +let Predicates = [HasNEON, HasFullFP16] in { +def FMOVv4f16_ns : SIMDModifiedImmVectorNoShift<0, 0, 1, 0b1111, V64, fpimm8, + "fmov", ".4h", + [(set (v4f16 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>; +def FMOVv8f16_ns : SIMDModifiedImmVectorNoShift<1, 0, 1, 0b1111, V128, fpimm8, + "fmov", ".8h", + [(set (v8f16 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>; +} // Predicates = [HasNEON, HasFullFP16] // AdvSIMD MOVI @@ -4262,7 +4302,7 @@ def : Pat<(v8i8 immAllOnesV), (MOVID (i32 255))>; // The movi_edit node has the immediate value already encoded, so we use // a plain imm0_255 in the pattern let isReMaterializable = 1, isAsCheapAsAMove = 1 in -def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1110, V128, +def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1110, V128, simdimmtype10, "movi", ".2d", [(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>; @@ -4323,10 +4363,10 @@ def MOVIv4s_msl : SIMDModifiedImmMoveMSL<1, 0, {1,1,0,?}, V128, "movi", ".4s", (AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>; // Per byte: 8b & 16b -def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1110, V64, imm0_255, +def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0, 0b1110, V64, imm0_255, "movi", ".8b", [(set (v8i8 V64:$Rd), (AArch64movi imm0_255:$imm8))]>; -def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1110, V128, imm0_255, +def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1110, V128, imm0_255, "movi", ".16b", [(set (v16i8 V128:$Rd), (AArch64movi imm0_255:$imm8))]>; @@ -4526,10 +4566,10 @@ def : Pat<(int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn), //---------------------------------------------------------------------------- // AdvSIMD scalar shift instructions //---------------------------------------------------------------------------- -defm FCVTZS : SIMDScalarRShiftSD<0, 0b11111, "fcvtzs">; -defm FCVTZU : SIMDScalarRShiftSD<1, 0b11111, "fcvtzu">; -defm SCVTF : SIMDScalarRShiftSD<0, 0b11100, "scvtf">; -defm UCVTF : SIMDScalarRShiftSD<1, 0b11100, "ucvtf">; +defm FCVTZS : SIMDFPScalarRShift<0, 0b11111, "fcvtzs">; +defm FCVTZU : SIMDFPScalarRShift<1, 0b11111, "fcvtzu">; +defm SCVTF : SIMDFPScalarRShift<0, 0b11100, "scvtf">; +defm UCVTF : SIMDFPScalarRShift<1, 0b11100, "ucvtf">; // Codegen patterns for the above. We don't put these directly on the // instructions because TableGen's type inference can't handle the truth. // Having the same base pattern for fp <--> int totally freaks it out. 
@@ -4602,7 +4642,7 @@ defm USRA : SIMDScalarRShiftDTied< 1, 0b00010, "usra", //---------------------------------------------------------------------------- defm FCVTZS:SIMDVectorRShiftSD<0, 0b11111, "fcvtzs", int_aarch64_neon_vcvtfp2fxs>; defm FCVTZU:SIMDVectorRShiftSD<1, 0b11111, "fcvtzu", int_aarch64_neon_vcvtfp2fxu>; -defm SCVTF: SIMDVectorRShiftSDToFP<0, 0b11100, "scvtf", +defm SCVTF: SIMDVectorRShiftToFP<0, 0b11100, "scvtf", int_aarch64_neon_vcvtfxs2fp>; defm RSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn", int_aarch64_neon_rshrn>; @@ -4637,7 +4677,7 @@ defm SSHLL : SIMDVectorLShiftLongBHSD<0, 0b10100, "sshll", defm SSHR : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", AArch64vashr>; defm SSRA : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra", TriOpFrag<(add node:$LHS, (AArch64vashr node:$MHS, node:$RHS))>>; -defm UCVTF : SIMDVectorRShiftSDToFP<1, 0b11100, "ucvtf", +defm UCVTF : SIMDVectorRShiftToFP<1, 0b11100, "ucvtf", int_aarch64_neon_vcvtfxu2fp>; defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn", int_aarch64_neon_uqrshrn>; diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td index b2efca023372..a8c8b176efa9 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/lib/Target/AArch64/AArch64RegisterInfo.td @@ -407,7 +407,7 @@ def FPR128 : RegisterClass<"AArch64", // The lower 16 vector registers. Some instructions can only take registers // in this range. def FPR128_lo : RegisterClass<"AArch64", - [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16], 128, (trunc FPR128, 16)>; // Pairs, triples, and quads of 64-bit vector registers. diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index f0ad855ed5e6..394c8e78581f 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -1921,6 +1921,8 @@ static bool isValidVectorKind(StringRef Name) { .Case(".h", true) .Case(".s", true) .Case(".d", true) + // Needed for fp16 scalar pairwise reductions + .Case(".2h", true) .Default(false); } diff --git a/test/MC/AArch64/arm64-advsimd.s b/test/MC/AArch64/arm64-advsimd.s index c627de708d31..294f09082916 100644 --- a/test/MC/AArch64/arm64-advsimd.s +++ b/test/MC/AArch64/arm64-advsimd.s @@ -1,4 +1,4 @@ -; RUN: llvm-mc -triple arm64-apple-darwin -mattr=crypto -output-asm-variant=1 -show-encoding < %s | FileCheck %s +; RUN: llvm-mc -triple arm64-apple-darwin -mattr=crypto,fullfp16 -output-asm-variant=1 -show-encoding < %s | FileCheck %s foo: @@ -440,6 +440,106 @@ foo: ; CHECK: urshl.8b v0, v0, v0 ; encoding: [0x00,0x54,0x20,0x2e] ; CHECK: ushl.8b v0, v0, v0 ; encoding: [0x00,0x44,0x20,0x2e] + fabd.4h v0, v0, v0 + facge.4h v0, v0, v0 + facgt.4h v0, v0, v0 + faddp.4h v0, v0, v0 + fadd.4h v0, v0, v0 + fcmeq.4h v0, v0, v0 + fcmge.4h v0, v0, v0 + fcmgt.4h v0, v0, v0 + fdiv.4h v0, v0, v0 + fmaxnmp.4h v0, v0, v0 + fmaxnm.4h v0, v0, v0 + fmaxp.4h v0, v0, v0 + fmax.4h v0, v0, v0 + fminnmp.4h v0, v0, v0 + fminnm.4h v0, v0, v0 + fminp.4h v0, v0, v0 + fmin.4h v0, v0, v0 + fmla.4h v0, v0, v0 + fmls.4h v0, v0, v0 + fmulx.4h v0, v0, v0 + fmul.4h v0, v0, v0 + frecps.4h v0, v0, v0 + frsqrts.4h v0, v0, v0 + fsub.4h v0, v0, v0 + +; CHECK: fabd.4h v0, v0, v0 ; encoding: [0x00,0x14,0xc0,0x2e] +; CHECK: facge.4h v0, v0, v0 ; encoding: [0x00,0x2c,0x40,0x2e] +; CHECK: facgt.4h v0, v0, v0 ; encoding: [0x00,0x2c,0xc0,0x2e] +; CHECK: faddp.4h v0, v0, v0 ; encoding: [0x00,0x14,0x40,0x2e] +; CHECK: fadd.4h 
v0, v0, v0 ; encoding: [0x00,0x14,0x40,0x0e] +; CHECK: fcmeq.4h v0, v0, v0 ; encoding: [0x00,0x24,0x40,0x0e] +; CHECK: fcmge.4h v0, v0, v0 ; encoding: [0x00,0x24,0x40,0x2e] +; CHECK: fcmgt.4h v0, v0, v0 ; encoding: [0x00,0x24,0xc0,0x2e] +; CHECK: fdiv.4h v0, v0, v0 ; encoding: [0x00,0x3c,0x40,0x2e] +; CHECK: fmaxnmp.4h v0, v0, v0 ; encoding: [0x00,0x04,0x40,0x2e] +; CHECK: fmaxnm.4h v0, v0, v0 ; encoding: [0x00,0x04,0x40,0x0e] +; CHECK: fmaxp.4h v0, v0, v0 ; encoding: [0x00,0x34,0x40,0x2e] +; CHECK: fmax.4h v0, v0, v0 ; encoding: [0x00,0x34,0x40,0x0e] +; CHECK: fminnmp.4h v0, v0, v0 ; encoding: [0x00,0x04,0xc0,0x2e] +; CHECK: fminnm.4h v0, v0, v0 ; encoding: [0x00,0x04,0xc0,0x0e] +; CHECK: fminp.4h v0, v0, v0 ; encoding: [0x00,0x34,0xc0,0x2e] +; CHECK: fmin.4h v0, v0, v0 ; encoding: [0x00,0x34,0xc0,0x0e] +; CHECK: fmla.4h v0, v0, v0 ; encoding: [0x00,0x0c,0x40,0x0e] +; CHECK: fmls.4h v0, v0, v0 ; encoding: [0x00,0x0c,0xc0,0x0e] +; CHECK: fmulx.4h v0, v0, v0 ; encoding: [0x00,0x1c,0x40,0x0e] +; CHECK: fmul.4h v0, v0, v0 ; encoding: [0x00,0x1c,0x40,0x2e] +; CHECK: frecps.4h v0, v0, v0 ; encoding: [0x00,0x3c,0x40,0x0e] +; CHECK: frsqrts.4h v0, v0, v0 ; encoding: [0x00,0x3c,0xc0,0x0e] +; CHECK: fsub.4h v0, v0, v0 ; encoding: [0x00,0x14,0xc0,0x0e] + + fabd.8h v0, v0, v0 + facge.8h v0, v0, v0 + facgt.8h v0, v0, v0 + faddp.8h v0, v0, v0 + fadd.8h v0, v0, v0 + fcmeq.8h v0, v0, v0 + fcmge.8h v0, v0, v0 + fcmgt.8h v0, v0, v0 + fdiv.8h v0, v0, v0 + fmaxnmp.8h v0, v0, v0 + fmaxnm.8h v0, v0, v0 + fmaxp.8h v0, v0, v0 + fmax.8h v0, v0, v0 + fminnmp.8h v0, v0, v0 + fminnm.8h v0, v0, v0 + fminp.8h v0, v0, v0 + fmin.8h v0, v0, v0 + fmla.8h v0, v0, v0 + fmls.8h v0, v0, v0 + fmulx.8h v0, v0, v0 + fmul.8h v0, v0, v0 + frecps.8h v0, v0, v0 + frsqrts.8h v0, v0, v0 + fsub.8h v0, v0, v0 + +; CHECK: fabd.8h v0, v0, v0 ; encoding: [0x00,0x14,0xc0,0x6e] +; CHECK: facge.8h v0, v0, v0 ; encoding: [0x00,0x2c,0x40,0x6e] +; CHECK: facgt.8h v0, v0, v0 ; encoding: [0x00,0x2c,0xc0,0x6e] +; CHECK: faddp.8h v0, v0, v0 ; encoding: [0x00,0x14,0x40,0x6e] +; CHECK: fadd.8h v0, v0, v0 ; encoding: [0x00,0x14,0x40,0x4e] +; CHECK: fcmeq.8h v0, v0, v0 ; encoding: [0x00,0x24,0x40,0x4e] +; CHECK: fcmge.8h v0, v0, v0 ; encoding: [0x00,0x24,0x40,0x6e] +; CHECK: fcmgt.8h v0, v0, v0 ; encoding: [0x00,0x24,0xc0,0x6e] +; CHECK: fdiv.8h v0, v0, v0 ; encoding: [0x00,0x3c,0x40,0x6e] +; CHECK: fmaxnmp.8h v0, v0, v0 ; encoding: [0x00,0x04,0x40,0x6e] +; CHECK: fmaxnm.8h v0, v0, v0 ; encoding: [0x00,0x04,0x40,0x4e] +; CHECK: fmaxp.8h v0, v0, v0 ; encoding: [0x00,0x34,0x40,0x6e] +; CHECK: fmax.8h v0, v0, v0 ; encoding: [0x00,0x34,0x40,0x4e] +; CHECK: fminnmp.8h v0, v0, v0 ; encoding: [0x00,0x04,0xc0,0x6e] +; CHECK: fminnm.8h v0, v0, v0 ; encoding: [0x00,0x04,0xc0,0x4e] +; CHECK: fminp.8h v0, v0, v0 ; encoding: [0x00,0x34,0xc0,0x6e] +; CHECK: fmin.8h v0, v0, v0 ; encoding: [0x00,0x34,0xc0,0x4e] +; CHECK: fmla.8h v0, v0, v0 ; encoding: [0x00,0x0c,0x40,0x4e] +; CHECK: fmls.8h v0, v0, v0 ; encoding: [0x00,0x0c,0xc0,0x4e] +; CHECK: fmulx.8h v0, v0, v0 ; encoding: [0x00,0x1c,0x40,0x4e] +; CHECK: fmul.8h v0, v0, v0 ; encoding: [0x00,0x1c,0x40,0x6e] +; CHECK: frecps.8h v0, v0, v0 ; encoding: [0x00,0x3c,0x40,0x4e] +; CHECK: frsqrts.8h v0, v0, v0 ; encoding: [0x00,0x3c,0xc0,0x4e] +; CHECK: fsub.8h v0, v0, v0 ; encoding: [0x00,0x14,0xc0,0x4e] + bif.8b v0, v0, v0 bit.8b v0, v0, v0 bsl.8b v0, v0, v0 @@ -568,6 +668,57 @@ foo: ; CHECK: shll2.4s v1, v2, #16 ; encoding: [0x41,0x38,0x61,0x6e] ; CHECK: shll2.2d v1, v2, #32 ; encoding: [0x41,0x38,0xa1,0x6e] + fabs.4h v0, 
v0 + fneg.4h v0, v0 + frecpe.4h v0, v0 + frinta.4h v0, v0 + frintx.4h v0, v0 + frinti.4h v0, v0 + frintm.4h v0, v0 + frintn.4h v0, v0 + frintp.4h v0, v0 + frintz.4h v0, v0 + frsqrte.4h v0, v0 + fsqrt.4h v0, v0 + +; CHECK: fabs.4h v0, v0 ; encoding: [0x00,0xf8,0xf8,0x0e] +; CHECK: fneg.4h v0, v0 ; encoding: [0x00,0xf8,0xf8,0x2e] +; CHECK: frecpe.4h v0, v0 ; encoding: [0x00,0xd8,0xf9,0x0e] +; CHECK: frinta.4h v0, v0 ; encoding: [0x00,0x88,0x79,0x2e] +; CHECK: frintx.4h v0, v0 ; encoding: [0x00,0x98,0x79,0x2e] +; CHECK: frinti.4h v0, v0 ; encoding: [0x00,0x98,0xf9,0x2e] +; CHECK: frintm.4h v0, v0 ; encoding: [0x00,0x98,0x79,0x0e] +; CHECK: frintn.4h v0, v0 ; encoding: [0x00,0x88,0x79,0x0e] +; CHECK: frintp.4h v0, v0 ; encoding: [0x00,0x88,0xf9,0x0e] +; CHECK: frintz.4h v0, v0 ; encoding: [0x00,0x98,0xf9,0x0e] +; CHECK: frsqrte.4h v0, v0 ; encoding: [0x00,0xd8,0xf9,0x2e] +; CHECK: fsqrt.4h v0, v0 ; encoding: [0x00,0xf8,0xf9,0x2e] + + fabs.8h v0, v0 + fneg.8h v0, v0 + frecpe.8h v0, v0 + frinta.8h v0, v0 + frintx.8h v0, v0 + frinti.8h v0, v0 + frintm.8h v0, v0 + frintn.8h v0, v0 + frintp.8h v0, v0 + frintz.8h v0, v0 + frsqrte.8h v0, v0 + fsqrt.8h v0, v0 + +; CHECK: fabs.8h v0, v0 ; encoding: [0x00,0xf8,0xf8,0x4e] +; CHECK: fneg.8h v0, v0 ; encoding: [0x00,0xf8,0xf8,0x6e] +; CHECK: frecpe.8h v0, v0 ; encoding: [0x00,0xd8,0xf9,0x4e] +; CHECK: frinta.8h v0, v0 ; encoding: [0x00,0x88,0x79,0x6e] +; CHECK: frintx.8h v0, v0 ; encoding: [0x00,0x98,0x79,0x6e] +; CHECK: frinti.8h v0, v0 ; encoding: [0x00,0x98,0xf9,0x6e] +; CHECK: frintm.8h v0, v0 ; encoding: [0x00,0x98,0x79,0x4e] +; CHECK: frintn.8h v0, v0 ; encoding: [0x00,0x88,0x79,0x4e] +; CHECK: frintp.8h v0, v0 ; encoding: [0x00,0x88,0xf9,0x4e] +; CHECK: frintz.8h v0, v0 ; encoding: [0x00,0x98,0xf9,0x4e] +; CHECK: frsqrte.8h v0, v0 ; encoding: [0x00,0xd8,0xf9,0x6e] +; CHECK: fsqrt.8h v0, v0 ; encoding: [0x00,0xf8,0xf9,0x6e] cmeq.8b v0, v0, #0 cmeq.16b v0, v0, #0 diff --git a/test/MC/AArch64/armv8.1a-rdma.s b/test/MC/AArch64/armv8.1a-rdma.s index 1de2a0fb15dd..36158428d6c4 100644 --- a/test/MC/AArch64/armv8.1a-rdma.s +++ b/test/MC/AArch64/armv8.1a-rdma.s @@ -26,27 +26,9 @@ sqrdmlsh v0.8s, v1.8s, v2.8s sqrdmlah v0.2s, v1.4h, v2.8h sqrdmlsh v0.4s, v1.8h, v2.2s -// CHECK-ERROR: error: invalid vector kind qualifier -// CHECK-ERROR: sqrdmlah v0.2h, v1.2h, v2.2h -// CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid vector kind qualifier -// CHECK-ERROR: sqrdmlah v0.2h, v1.2h, v2.2h -// CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid vector kind qualifier -// CHECK-ERROR: sqrdmlah v0.2h, v1.2h, v2.2h -// CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: sqrdmlah v0.2h, v1.2h, v2.2h // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid vector kind qualifier -// CHECK-ERROR: sqrdmlsh v0.2h, v1.2h, v2.2h -// CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid vector kind qualifier -// CHECK-ERROR: sqrdmlsh v0.2h, v1.2h, v2.2h -// CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid vector kind qualifier -// CHECK-ERROR: sqrdmlsh v0.2h, v1.2h, v2.2h -// CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: sqrdmlsh v0.2h, v1.2h, v2.2h // CHECK-ERROR: ^ diff --git a/test/MC/AArch64/fullfp16-diagnostics.s b/test/MC/AArch64/fullfp16-diagnostics.s new file mode 100644 index 000000000000..190b6e25a4b1 --- /dev/null +++ b/test/MC/AArch64/fullfp16-diagnostics.s @@ -0,0 +1,42 @@ +// RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon < %s 2> %t +// RUN: FileCheck < %t %s + + fmla v0.4h, v1.4h, v16.h[3] + fmla 
v2.8h, v3.8h, v17.h[6] + +// CHECK: error: invalid operand for instruction +// CHECK-NEXT: fmla v0.4h, v1.4h, v16.h[3] +// CHECK-NEXT: ^ +// CHECK: error: invalid operand for instruction +// CHECK-NEXT: fmla v2.8h, v3.8h, v17.h[6] +// CHECK-NEXT: ^ + + fmls v0.4h, v1.4h, v16.h[3] + fmls v2.8h, v3.8h, v17.h[6] + +// CHECK: error: invalid operand for instruction +// CHECK-NEXT: fmls v0.4h, v1.4h, v16.h[3] +// CHECK-NEXT: ^ +// CHECK: error: invalid operand for instruction +// CHECK-NEXT: fmls v2.8h, v3.8h, v17.h[6] +// CHECK-NEXT: ^ + + fmul v0.4h, v1.4h, v16.h[3] + fmul v2.8h, v3.8h, v17.h[6] + +// CHECK: error: invalid operand for instruction +// CHECK-NEXT: fmul v0.4h, v1.4h, v16.h[3] +// CHECK-NEXT: ^ +// CHECK: error: invalid operand for instruction +// CHECK-NEXT: fmul v2.8h, v3.8h, v17.h[6] +// CHECK-NEXT: ^ + + fmulx v0.4h, v1.4h, v16.h[3] + fmulx v2.8h, v3.8h, v17.h[6] + +// CHECK: error: invalid operand for instruction +// CHECK-NEXT: fmulx v0.4h, v1.4h, v16.h[3] +// CHECK-NEXT: ^ +// CHECK: error: invalid operand for instruction +// CHECK-NEXT: fmulx v2.8h, v3.8h, v17.h[6] +// CHECK-NEXT: ^ diff --git a/test/MC/AArch64/fullfp16-neon-neg.s b/test/MC/AArch64/fullfp16-neon-neg.s new file mode 100644 index 000000000000..0913ecb7e9ab --- /dev/null +++ b/test/MC/AArch64/fullfp16-neon-neg.s @@ -0,0 +1,382 @@ +// RUN: not llvm-mc -triple=aarch64 -mattr=+neon,-fullfp16 -show-encoding < %s 2>&1 | FileCheck %s +// RUN: not llvm-mc -triple=aarch64 -mattr=-neon,+fullfp16 -show-encoding < %s 2>&1 | FileCheck %s + + +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fabs.4h v0, v0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fneg.4h v0, v0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frecpe.4h v0, v0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frinta.4h v0, v0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frintx.4h v0, v0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frinti.4h v0, v0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frintm.4h v0, v0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frintn.4h v0, v0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frintp.4h v0, v0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frintz.4h v0, v0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frsqrte.4h v0, v0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fsqrt.4h v0, v0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fabs.8h v0, v0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fneg.8h v0, v0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frecpe.8h v0, v0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frinta.8h v0, v0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frintx.8h v0, v0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frinti.8h v0, v0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frintm.8h v0, v0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frintn.8h v0, v0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frintp.8h v0, v0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frintz.8h v0, v0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frsqrte.8h v0, v0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fsqrt.8h v0, v0 +// CHECK: 
:[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fmla v0.4h, v1.4h, v2.h[2] +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fmla v3.8h, v8.8h, v2.h[1] +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fmls v0.4h, v1.4h, v2.h[2] +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fmls v3.8h, v8.8h, v2.h[1] +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fmul v0.4h, v1.4h, v2.h[2] +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fmul v0.8h, v1.8h, v2.h[2] +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fmulx v0.4h, v1.4h, v2.h[2] +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fmulx v0.8h, v1.8h, v2.h[2] +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fabd v0.4h, v1.4h, v2.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fmaxnmv h0, v1.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fminnmv h0, v1.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fmaxv h0, v1.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fminv h0, v1.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + faddp v0.4h, v1.4h, v2.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + faddp v0.8h, v1.8h, v2.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fadd v0.4h, v1.4h, v2.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fadd v0.8h, v1.8h, v2.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fsub v0.4h, v1.4h, v2.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fsub v0.8h, v1.8h, v2.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmeq v0.4h, v31.4h, v16.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmeq v4.8h, v7.8h, v15.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmge v3.4h, v8.4h, v12.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmge v31.8h, v29.8h, v28.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmle v3.4h, v12.4h, v8.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmle v31.8h, v28.8h, v29.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmgt v0.4h, v31.4h, v16.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmgt v4.8h, v7.8h, v15.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmlt v0.4h, v16.4h, v31.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmlt v4.8h, v15.8h, v7.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmeq v0.4h, v31.4h, #0.0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmeq v4.8h, v7.8h, #0.0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmeq v0.4h, v31.4h, #0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmeq v4.8h, v7.8h, #0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmge v3.4h, v8.4h, #0.0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmge v31.8h, v29.8h, #0.0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmge v3.4h, v8.4h, #0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmge v31.8h, v29.8h, #0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmgt v0.4h, v31.4h, #0.0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction 
requires: + fcmgt v4.8h, v7.8h, #0.0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmgt v0.4h, v31.4h, #0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmgt v4.8h, v7.8h, #0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmle v3.4h, v20.4h, #0.0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmle v1.8h, v8.8h, #0.0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmle v3.4h, v20.4h, #0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmle v1.8h, v8.8h, #0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmlt v16.4h, v2.4h, #0.0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmlt v15.8h, v4.8h, #0.0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmlt v16.4h, v2.4h, #0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmlt v15.8h, v4.8h, #0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + facge v0.4h, v31.4h, v16.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + facge v4.8h, v7.8h, v15.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + facle v0.4h, v16.4h, v31.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + facle v4.8h, v15.8h, v7.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + facgt v3.4h, v8.4h, v12.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + facgt v31.8h, v29.8h, v28.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + faclt v3.4h, v12.4h, v8.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + faclt v31.8h, v28.8h, v29.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frsqrts v0.4h, v31.4h, v16.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frsqrts v4.8h, v7.8h, v15.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frecps v3.4h, v8.4h, v12.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frecps v31.8h, v29.8h, v28.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fmaxp v0.4h, v1.4h, v2.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fmaxp v31.8h, v15.8h, v16.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fminp v10.4h, v15.4h, v22.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fminp v3.8h, v5.8h, v6.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fmaxnmp v0.4h, v1.4h, v2.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fmaxnmp v31.8h, v15.8h, v16.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fminnmp v10.4h, v15.4h, v22.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fminnmp v3.8h, v5.8h, v6.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fmax v0.4h, v1.4h, v2.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fmax v0.8h, v1.8h, v2.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fmin v10.4h, v15.4h, v22.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fmin v10.8h, v15.8h, v22.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fmaxnm v0.4h, v1.4h, v2.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fmaxnm v0.8h, v1.8h, v2.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fminnm v10.4h, v15.4h, v22.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: 
error: instruction requires: + fminnm v10.8h, v15.8h, v22.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fmla v0.4h, v1.4h, v2.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fmla v0.8h, v1.8h, v2.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fmls v0.4h, v1.4h, v2.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fmls v0.8h, v1.8h, v2.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fabd h29, h24, h20 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fmla h0, h1, v1.h[5] +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fmls h2, h3, v4.h[5] +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fmul h0, h1, v1.h[5] +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fmulx h6, h2, v8.h[5] +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcvtzs h21, h12, #1 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcvtzu h21, h12, #1 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcvtas h12, h13 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcvtau h12, h13 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcvtms h22, h13 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcvtmu h12, h13 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcvtns h22, h13 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcvtnu h12, h13 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcvtps h22, h13 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcvtpu h12, h13 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcvtzs h12, h13 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcvtzu h12, h13 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmeq h10, h11, h12 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmeq h10, h11, #0.0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmeq h10, h11, #0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmge h10, h11, h12 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmge h10, h11, #0.0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmge h10, h11, #0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmgt h10, h11, h12 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmgt h10, h11, #0.0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmgt h10, h11, #0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmle h10, h11, #0.0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmle h10, h11, #0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmlt h10, h11, #0.0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcmlt h10, h11, #0 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + facge h10, h11, h12 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + facgt h10, h11, h12 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fmulx h20, h22, h15 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frecps h21, h16, h13 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frsqrts h21, h5, h12 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frecpe h19, h14 +// CHECK: 
:[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frecpx h18, h10 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frsqrte h22, h13 +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + faddp h18, v3.2h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fabs v4.4h, v0.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fabs v6.8h, v8.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fneg v4.4h, v0.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fneg v6.8h, v8.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frintn v4.4h, v0.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frintn v6.8h, v8.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frinta v4.4h, v0.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frinta v6.8h, v8.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frintp v4.4h, v0.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frintp v6.8h, v8.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frintm v4.4h, v0.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frintm v6.8h, v8.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frintx v4.4h, v0.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frintx v6.8h, v8.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frintz v4.4h, v0.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frintz v6.8h, v8.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frinti v4.4h, v0.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frinti v6.8h, v8.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcvtns v4.4h, v0.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcvtns v6.8h, v8.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcvtnu v4.4h, v0.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcvtnu v6.8h, v8.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcvtps v4.4h, v0.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcvtps v6.8h, v8.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcvtpu v4.4h, v0.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcvtpu v6.8h, v8.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcvtms v4.4h, v0.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcvtms v6.8h, v8.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcvtmu v4.4h, v0.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcvtmu v6.8h, v8.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcvtzs v4.4h, v0.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcvtzs v6.8h, v8.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcvtzu v4.4h, v0.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcvtzu v6.8h, v8.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcvtas v4.4h, v0.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcvtas v6.8h, v8.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcvtau v4.4h, v0.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fcvtau v6.8h, v8.8h +// CHECK: 
:[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frecpe v4.4h, v0.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frecpe v6.8h, v8.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frsqrte v4.4h, v0.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + frsqrte v6.8h, v8.8h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fsqrt v4.4h, v0.4h +// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: + fsqrt v6.8h, v8.8h + +// CHECK-NOT: :[[@LINE+1]]:{{[0-9]+}}: error: instruction requires: diff --git a/test/MC/AArch64/neon-2velem.s b/test/MC/AArch64/neon-2velem.s index 04841d0164f2..ed55ad0b1363 100644 --- a/test/MC/AArch64/neon-2velem.s +++ b/test/MC/AArch64/neon-2velem.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple=arm64 -mattr=+neon -show-encoding < %s | FileCheck %s +// RUN: llvm-mc -triple=arm64 -mattr=+neon,+fullfp16 -show-encoding < %s | FileCheck %s // Check that the assembler can handle the documented syntax for AArch64 @@ -46,6 +46,8 @@ // CHECK: mls v0.8h, v1.8h, v2.h[7] // encoding: [0x20,0x48,0x72,0x6f] // CHECK: mls v0.8h, v1.8h, v14.h[6] // encoding: [0x20,0x48,0x6e,0x6f] + fmla v0.4h, v1.4h, v2.h[2] + fmla v3.8h, v8.8h, v2.h[1] fmla v0.2s, v1.2s, v2.s[2] fmla v0.2s, v1.2s, v22.s[2] fmla v3.4s, v8.4s, v2.s[1] @@ -53,6 +55,8 @@ fmla v0.2d, v1.2d, v2.d[1] fmla v0.2d, v1.2d, v22.d[1] +// CHECK: fmla v0.4h, v1.4h, v2.h[2] // encoding: [0x20,0x10,0x22,0x0f] +// CHECK: fmla v3.8h, v8.8h, v2.h[1] // encoding: [0x03,0x11,0x12,0x4f] // CHECK: fmla v0.2s, v1.2s, v2.s[2] // encoding: [0x20,0x18,0x82,0x0f] // CHECK: fmla v0.2s, v1.2s, v22.s[2] // encoding: [0x20,0x18,0x96,0x0f] // CHECK: fmla v3.4s, v8.4s, v2.s[1] // encoding: [0x03,0x11,0xa2,0x4f] @@ -60,6 +64,8 @@ // CHECK: fmla v0.2d, v1.2d, v2.d[1] // encoding: [0x20,0x18,0xc2,0x4f] // CHECK: fmla v0.2d, v1.2d, v22.d[1] // encoding: [0x20,0x18,0xd6,0x4f] + fmls v0.4h, v1.4h, v2.h[2] + fmls v3.8h, v8.8h, v2.h[1] fmls v0.2s, v1.2s, v2.s[2] fmls v0.2s, v1.2s, v22.s[2] fmls v3.4s, v8.4s, v2.s[1] @@ -67,6 +73,8 @@ fmls v0.2d, v1.2d, v2.d[1] fmls v0.2d, v1.2d, v22.d[1] +// CHECK: fmls v0.4h, v1.4h, v2.h[2] // encoding: [0x20,0x50,0x22,0x0f] +// CHECK: fmls v3.8h, v8.8h, v2.h[1] // encoding: [0x03,0x51,0x12,0x4f] // CHECK: fmls v0.2s, v1.2s, v2.s[2] // encoding: [0x20,0x58,0x82,0x0f] // CHECK: fmls v0.2s, v1.2s, v22.s[2] // encoding: [0x20,0x58,0x96,0x0f] // CHECK: fmls v3.4s, v8.4s, v2.s[1] // encoding: [0x03,0x51,0xa2,0x4f] @@ -172,6 +180,8 @@ // CHECK: mul v0.4s, v1.4s, v2.s[2] // encoding: [0x20,0x88,0x82,0x4f] // CHECK: mul v0.4s, v1.4s, v22.s[2] // encoding: [0x20,0x88,0x96,0x4f] + fmul v0.4h, v1.4h, v2.h[2] + fmul v0.8h, v1.8h, v2.h[2] fmul v0.2s, v1.2s, v2.s[2] fmul v0.2s, v1.2s, v22.s[2] fmul v0.4s, v1.4s, v2.s[2] @@ -179,6 +189,8 @@ fmul v0.2d, v1.2d, v2.d[1] fmul v0.2d, v1.2d, v22.d[1] +// CHECK: fmul v0.4h, v1.4h, v2.h[2] // encoding: [0x20,0x90,0x22,0x0f] +// CHECK: fmul v0.8h, v1.8h, v2.h[2] // encoding: [0x20,0x90,0x22,0x4f] // CHECK: fmul v0.2s, v1.2s, v2.s[2] // encoding: [0x20,0x98,0x82,0x0f] // CHECK: fmul v0.2s, v1.2s, v22.s[2] // encoding: [0x20,0x98,0x96,0x0f] // CHECK: fmul v0.4s, v1.4s, v2.s[2] // encoding: [0x20,0x98,0x82,0x4f] @@ -186,6 +198,8 @@ // CHECK: fmul v0.2d, v1.2d, v2.d[1] // encoding: [0x20,0x98,0xc2,0x4f] // CHECK: fmul v0.2d, v1.2d, v22.d[1] // encoding: [0x20,0x98,0xd6,0x4f] + fmulx v0.4h, v1.4h, v2.h[2] + fmulx v0.8h, v1.8h, v2.h[2] fmulx v0.2s, v1.2s, v2.s[2] fmulx v0.2s, v1.2s, v22.s[2] fmulx v0.4s, v1.4s, v2.s[2] 
@@ -193,6 +207,8 @@ fmulx v0.2d, v1.2d, v2.d[1] fmulx v0.2d, v1.2d, v22.d[1] +// CHECK: fmulx v0.4h, v1.4h, v2.h[2] // encoding: [0x20,0x90,0x22,0x2f] +// CHECK: fmulx v0.8h, v1.8h, v2.h[2] // encoding: [0x20,0x90,0x22,0x6f] // CHECK: fmulx v0.2s, v1.2s, v2.s[2] // encoding: [0x20,0x98,0x82,0x2f] // CHECK: fmulx v0.2s, v1.2s, v22.s[2] // encoding: [0x20,0x98,0x96,0x2f] // CHECK: fmulx v0.4s, v1.4s, v2.s[2] // encoding: [0x20,0x98,0x82,0x6f] diff --git a/test/MC/AArch64/neon-aba-abd.s b/test/MC/AArch64/neon-aba-abd.s index 178eb26f64c2..b3a90bb14895 100644 --- a/test/MC/AArch64/neon-aba-abd.s +++ b/test/MC/AArch64/neon-aba-abd.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s +// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon,+fullfp16 -show-encoding < %s | FileCheck %s // Check that the assembler can handle the documented syntax for AArch64 @@ -68,10 +68,12 @@ //---------------------------------------------------------------------- // Vector Absolute Difference (Floating Point) //---------------------------------------------------------------------- + fabd v0.4h, v1.4h, v2.4h fabd v0.2s, v1.2s, v2.2s fabd v31.4s, v15.4s, v16.4s fabd v7.2d, v8.2d, v25.2d +// CHECK: fabd v0.4h, v1.4h, v2.4h // encoding: [0x20,0x14,0xc2,0x2e] // CHECK: fabd v0.2s, v1.2s, v2.2s // encoding: [0x20,0xd4,0xa2,0x2e] // CHECK: fabd v31.4s, v15.4s, v16.4s // encoding: [0xff,0xd5,0xb0,0x6e] // CHECK: fabd v7.2d, v8.2d, v25.2d // encoding: [0x07,0xd5,0xf9,0x6e] diff --git a/test/MC/AArch64/neon-across.s b/test/MC/AArch64/neon-across.s index 60b766d8c881..74edc519a475 100644 --- a/test/MC/AArch64/neon-across.s +++ b/test/MC/AArch64/neon-across.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple=arm64 -mattr=+neon -show-encoding < %s | FileCheck %s +// RUN: llvm-mc -triple=arm64 -mattr=+neon,+fullfp16 -show-encoding < %s | FileCheck %s // Check that the assembler can handle the documented syntax for AArch64 @@ -90,11 +90,27 @@ // CHECK: addv h0, v1.8h // encoding: [0x20,0xb8,0x71,0x4e] // CHECK: addv s0, v1.4s // encoding: [0x20,0xb8,0xb1,0x4e] + fmaxnmv h0, v1.4h + fminnmv h0, v1.4h + fmaxv h0, v1.4h + fminv h0, v1.4h + fmaxnmv h0, v1.8h + fminnmv h0, v1.8h + fmaxv h0, v1.8h + fminv h0, v1.8h fmaxnmv s0, v1.4s fminnmv s0, v1.4s fmaxv s0, v1.4s fminv s0, v1.4s +// CHECK: fmaxnmv h0, v1.4h // encoding: [0x20,0xc8,0x30,0x0e] +// CHECK: fminnmv h0, v1.4h // encoding: [0x20,0xc8,0xb0,0x0e] +// CHECK: fmaxv h0, v1.4h // encoding: [0x20,0xf8,0x30,0x0e] +// CHECK: fminv h0, v1.4h // encoding: [0x20,0xf8,0xb0,0x0e] +// CHECK: fmaxnmv h0, v1.8h // encoding: [0x20,0xc8,0x30,0x4e] +// CHECK: fminnmv h0, v1.8h // encoding: [0x20,0xc8,0xb0,0x4e] +// CHECK: fmaxv h0, v1.8h // encoding: [0x20,0xf8,0x30,0x4e] +// CHECK: fminv h0, v1.8h // encoding: [0x20,0xf8,0xb0,0x4e] // CHECK: fmaxnmv s0, v1.4s // encoding: [0x20,0xc8,0x30,0x6e] // CHECK: fminnmv s0, v1.4s // encoding: [0x20,0xc8,0xb0,0x6e] // CHECK: fmaxv s0, v1.4s // encoding: [0x20,0xf8,0x30,0x6e] diff --git a/test/MC/AArch64/neon-add-pairwise.s b/test/MC/AArch64/neon-add-pairwise.s index df9938b07e52..3d77c6e2790f 100644 --- a/test/MC/AArch64/neon-add-pairwise.s +++ b/test/MC/AArch64/neon-add-pairwise.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s +// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon,+fullfp16 -show-encoding < %s | FileCheck %s // Check that the assembler can handle the documented syntax for AArch64 @@ -25,10 +25,14 @@ 
//------------------------------------------------------------------------------ // Vector Add Pairwise (Floating Point //------------------------------------------------------------------------------ + faddp v0.4h, v1.4h, v2.4h + faddp v0.8h, v1.8h, v2.8h faddp v0.2s, v1.2s, v2.2s faddp v0.4s, v1.4s, v2.4s faddp v0.2d, v1.2d, v2.2d +// CHECK: faddp v0.4h, v1.4h, v2.4h // encoding: [0x20,0x14,0x42,0x2e] +// CHECK: faddp v0.8h, v1.8h, v2.8h // encoding: [0x20,0x14,0x42,0x6e] // CHECK: faddp v0.2s, v1.2s, v2.2s // encoding: [0x20,0xd4,0x22,0x2e] // CHECK: faddp v0.4s, v1.4s, v2.4s // encoding: [0x20,0xd4,0x22,0x6e] // CHECK: faddp v0.2d, v1.2d, v2.2d // encoding: [0x20,0xd4,0x62,0x6e] diff --git a/test/MC/AArch64/neon-add-sub-instructions.s b/test/MC/AArch64/neon-add-sub-instructions.s index 68f169b3dd90..0d8416537022 100644 --- a/test/MC/AArch64/neon-add-sub-instructions.s +++ b/test/MC/AArch64/neon-add-sub-instructions.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s +// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon,+fullfp16 -show-encoding < %s | FileCheck %s // Check that the assembler can handle the documented syntax for AArch64 @@ -44,10 +44,14 @@ //------------------------------------------------------------------------------ // Vector Floating-Point Add //------------------------------------------------------------------------------ + fadd v0.4h, v1.4h, v2.4h + fadd v0.8h, v1.8h, v2.8h fadd v0.2s, v1.2s, v2.2s fadd v0.4s, v1.4s, v2.4s fadd v0.2d, v1.2d, v2.2d +// CHECK: fadd v0.4h, v1.4h, v2.4h // encoding: [0x20,0x14,0x42,0x0e] +// CHECK: fadd v0.8h, v1.8h, v2.8h // encoding: [0x20,0x14,0x42,0x4e] // CHECK: fadd v0.2s, v1.2s, v2.2s // encoding: [0x20,0xd4,0x22,0x0e] // CHECK: fadd v0.4s, v1.4s, v2.4s // encoding: [0x20,0xd4,0x22,0x4e] // CHECK: fadd v0.2d, v1.2d, v2.2d // encoding: [0x20,0xd4,0x62,0x4e] @@ -56,10 +60,14 @@ //------------------------------------------------------------------------------ // Vector Floating-Point Sub //------------------------------------------------------------------------------ + fsub v0.4h, v1.4h, v2.4h + fsub v0.8h, v1.8h, v2.8h fsub v0.2s, v1.2s, v2.2s fsub v0.4s, v1.4s, v2.4s fsub v0.2d, v1.2d, v2.2d +// CHECK: fsub v0.4h, v1.4h, v2.4h // encoding: [0x20,0x14,0xc2,0x0e] +// CHECK: fsub v0.8h, v1.8h, v2.8h // encoding: [0x20,0x14,0xc2,0x4e] // CHECK: fsub v0.2s, v1.2s, v2.2s // encoding: [0x20,0xd4,0xa2,0x0e] // CHECK: fsub v0.4s, v1.4s, v2.4s // encoding: [0x20,0xd4,0xa2,0x4e] // CHECK: fsub v0.2d, v1.2d, v2.2d // encoding: [0x20,0xd4,0xe2,0x4e] diff --git a/test/MC/AArch64/neon-compare-instructions.s b/test/MC/AArch64/neon-compare-instructions.s index 19cfaf1f4d36..ffa88e50e0ce 100644 --- a/test/MC/AArch64/neon-compare-instructions.s +++ b/test/MC/AArch64/neon-compare-instructions.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s +// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon,+fullfp16 -show-encoding < %s | FileCheck %s // Check that the assembler can handle the documented syntax for AArch64 @@ -194,10 +194,14 @@ // Vector Compare Mask Equal (Floating Point) //---------------------------------------------------------------------- + fcmeq v0.4h, v31.4h, v16.4h + fcmeq v4.8h, v7.8h, v15.8h fcmeq v0.2s, v31.2s, v16.2s fcmeq v4.4s, v7.4s, v15.4s fcmeq v29.2d, v2.2d, v5.2d +// CHECK: fcmeq v0.4h, v31.4h, v16.4h // encoding: [0xe0,0x27,0x50,0x0e] +// CHECK: fcmeq v4.8h, v7.8h, v15.8h // encoding: 
[0xe4,0x24,0x4f,0x4e] // CHECK: fcmeq v0.2s, v31.2s, v16.2s // encoding: [0xe0,0xe7,0x30,0x0e] // CHECK: fcmeq v4.4s, v7.4s, v15.4s // encoding: [0xe4,0xe4,0x2f,0x4e] // CHECK: fcmeq v29.2d, v2.2d, v5.2d // encoding: [0x5d,0xe4,0x65,0x4e] @@ -208,6 +212,10 @@ // FCMLE is alias for FCMGE with operands reversed. //---------------------------------------------------------------------- + fcmge v3.4h, v8.4h, v12.4h + fcmge v31.8h, v29.8h, v28.8h + fcmle v3.4h, v12.4h, v8.4h + fcmle v31.8h, v28.8h, v29.8h fcmge v31.4s, v29.4s, v28.4s fcmge v3.2s, v8.2s, v12.2s fcmge v17.2d, v15.2d, v13.2d @@ -215,6 +223,10 @@ fcmle v3.2s, v12.2s, v8.2s fcmle v17.2d, v13.2d, v15.2d +// CHECK: fcmge v3.4h, v8.4h, v12.4h // encoding: [0x03,0x25,0x4c,0x2e] +// CHECK: fcmge v31.8h, v29.8h, v28.8h // encoding: [0xbf,0x27,0x5c,0x6e] +// CHECK: fcmge v3.4h, v8.4h, v12.4h // encoding: [0x03,0x25,0x4c,0x2e] +// CHECK: fcmge v31.8h, v29.8h, v28.8h // encoding: [0xbf,0x27,0x5c,0x6e] // CHECK: fcmge v31.4s, v29.4s, v28.4s // encoding: [0xbf,0xe7,0x3c,0x6e] // CHECK: fcmge v3.2s, v8.2s, v12.2s // encoding: [0x03,0xe5,0x2c,0x2e] // CHECK: fcmge v17.2d, v15.2d, v13.2d // encoding: [0xf1,0xe5,0x6d,0x6e] @@ -228,6 +240,10 @@ // FCMLT is alias for FCMGT with operands reversed. //---------------------------------------------------------------------- + fcmgt v0.4h, v31.4h, v16.4h + fcmgt v4.8h, v7.8h, v15.8h + fcmlt v0.4h, v16.4h, v31.4h + fcmlt v4.8h, v15.8h, v7.8h fcmgt v0.2s, v31.2s, v16.2s fcmgt v4.4s, v7.4s, v15.4s fcmgt v29.2d, v2.2d, v5.2d @@ -235,6 +251,10 @@ fcmlt v4.4s, v15.4s, v7.4s fcmlt v29.2d, v5.2d, v2.2d +// CHECK: fcmgt v0.4h, v31.4h, v16.4h // encoding: [0xe0,0x27,0xd0,0x2e] +// CHECK: fcmgt v4.8h, v7.8h, v15.8h // encoding: [0xe4,0x24,0xcf,0x6e] +// CHECK: fcmgt v0.4h, v31.4h, v16.4h // encoding: [0xe0,0x27,0xd0,0x2e] +// CHECK: fcmgt v4.8h, v7.8h, v15.8h // encoding: [0xe4,0x24,0xcf,0x6e] // CHECK: fcmgt v0.2s, v31.2s, v16.2s // encoding: [0xe0,0xe7,0xb0,0x2e] // CHECK: fcmgt v4.4s, v7.4s, v15.4s // encoding: [0xe4,0xe4,0xaf,0x6e] // CHECK: fcmgt v29.2d, v2.2d, v5.2d // encoding: [0x5d,0xe4,0xe5,0x6e] @@ -343,16 +363,24 @@ //---------------------------------------------------------------------- // Vector Compare Mask Equal to Zero (Floating Point) //---------------------------------------------------------------------- + fcmeq v0.4h, v31.4h, #0.0 + fcmeq v4.8h, v7.8h, #0.0 fcmeq v0.2s, v31.2s, #0.0 fcmeq v4.4s, v7.4s, #0.0 fcmeq v29.2d, v2.2d, #0.0 + fcmeq v0.4h, v31.4h, #0 + fcmeq v4.8h, v7.8h, #0 fcmeq v0.2s, v31.2s, #0 fcmeq v4.4s, v7.4s, #0 fcmeq v29.2d, v2.2d, #0 +// CHECK: fcmeq v0.4h, v31.4h, #0.0 // encoding: [0xe0,0xdb,0xf8,0x0e] +// CHECK: fcmeq v4.8h, v7.8h, #0.0 // encoding: [0xe4,0xd8,0xf8,0x4e] // CHECK: fcmeq v0.2s, v31.2s, #0.0 // encoding: [0xe0,0xdb,0xa0,0x0e] // CHECK: fcmeq v4.4s, v7.4s, #0.0 // encoding: [0xe4,0xd8,0xa0,0x4e] // CHECK: fcmeq v29.2d, v2.2d, #0.0 // encoding: [0x5d,0xd8,0xe0,0x4e] +// CHECK: fcmeq v0.4h, v31.4h, #0.0 // encoding: [0xe0,0xdb,0xf8,0x0e] +// CHECK: fcmeq v4.8h, v7.8h, #0.0 // encoding: [0xe4,0xd8,0xf8,0x4e] // CHECK: fcmeq v0.2s, v31.2s, #0.0 // encoding: [0xe0,0xdb,0xa0,0x0e] // CHECK: fcmeq v4.4s, v7.4s, #0.0 // encoding: [0xe4,0xd8,0xa0,0x4e] // CHECK: fcmeq v29.2d, v2.2d, #0.0 // encoding: [0x5d,0xd8,0xe0,0x4e] @@ -360,16 +388,24 @@ //---------------------------------------------------------------------- // Vector Compare Mask Greater Than or Equal to Zero (Floating Point) //---------------------------------------------------------------------- + fcmge v3.4h, 
v8.4h, #0.0 + fcmge v31.8h, v29.8h, #0.0 fcmge v31.4s, v29.4s, #0.0 fcmge v3.2s, v8.2s, #0.0 fcmge v17.2d, v15.2d, #0.0 + fcmge v3.4h, v8.4h, #0 + fcmge v31.8h, v29.8h, #0 fcmge v31.4s, v29.4s, #0 fcmge v3.2s, v8.2s, #0 fcmge v17.2d, v15.2d, #0 +// CHECK: fcmge v3.4h, v8.4h, #0.0 // encoding: [0x03,0xc9,0xf8,0x2e] +// CHECK: fcmge v31.8h, v29.8h, #0.0 // encoding: [0xbf,0xcb,0xf8,0x6e] // CHECK: fcmge v31.4s, v29.4s, #0.0 // encoding: [0xbf,0xcb,0xa0,0x6e] // CHECK: fcmge v3.2s, v8.2s, #0.0 // encoding: [0x03,0xc9,0xa0,0x2e] // CHECK: fcmge v17.2d, v15.2d, #0.0 // encoding: [0xf1,0xc9,0xe0,0x6e] +// CHECK: fcmge v3.4h, v8.4h, #0.0 // encoding: [0x03,0xc9,0xf8,0x2e] +// CHECK: fcmge v31.8h, v29.8h, #0.0 // encoding: [0xbf,0xcb,0xf8,0x6e] // CHECK: fcmge v31.4s, v29.4s, #0.0 // encoding: [0xbf,0xcb,0xa0,0x6e] // CHECK: fcmge v3.2s, v8.2s, #0.0 // encoding: [0x03,0xc9,0xa0,0x2e] // CHECK: fcmge v17.2d, v15.2d, #0.0 // encoding: [0xf1,0xc9,0xe0,0x6e] @@ -377,16 +413,24 @@ //---------------------------------------------------------------------- // Vector Compare Mask Greater Than Zero (Floating Point) //---------------------------------------------------------------------- + fcmgt v0.4h, v31.4h, #0.0 + fcmgt v4.8h, v7.8h, #0.0 fcmgt v0.2s, v31.2s, #0.0 fcmgt v4.4s, v7.4s, #0.0 fcmgt v29.2d, v2.2d, #0.0 + fcmgt v0.4h, v31.4h, #0 + fcmgt v4.8h, v7.8h, #0 fcmgt v0.2s, v31.2s, #0 fcmgt v4.4s, v7.4s, #0 fcmgt v29.2d, v2.2d, #0 +// CHECK: fcmgt v0.4h, v31.4h, #0.0 // encoding: [0xe0,0xcb,0xf8,0x0e] +// CHECK: fcmgt v4.8h, v7.8h, #0.0 // encoding: [0xe4,0xc8,0xf8,0x4e] // CHECK: fcmgt v0.2s, v31.2s, #0.0 // encoding: [0xe0,0xcb,0xa0,0x0e] // CHECK: fcmgt v4.4s, v7.4s, #0.0 // encoding: [0xe4,0xc8,0xa0,0x4e] // CHECK: fcmgt v29.2d, v2.2d, #0.0 // encoding: [0x5d,0xc8,0xe0,0x4e] +// CHECK: fcmgt v0.4h, v31.4h, #0.0 // encoding: [0xe0,0xcb,0xf8,0x0e] +// CHECK: fcmgt v4.8h, v7.8h, #0.0 // encoding: [0xe4,0xc8,0xf8,0x4e] // CHECK: fcmgt v0.2s, v31.2s, #0.0 // encoding: [0xe0,0xcb,0xa0,0x0e] // CHECK: fcmgt v4.4s, v7.4s, #0.0 // encoding: [0xe4,0xc8,0xa0,0x4e] // CHECK: fcmgt v29.2d, v2.2d, #0.0 // encoding: [0x5d,0xc8,0xe0,0x4e] @@ -394,16 +438,24 @@ //---------------------------------------------------------------------- // Vector Compare Mask Less Than or Equal To Zero (Floating Point) //---------------------------------------------------------------------- + fcmle v3.4h, v20.4h, #0.0 + fcmle v1.8h, v8.8h, #0.0 fcmle v1.4s, v8.4s, #0.0 fcmle v3.2s, v20.2s, #0.0 fcmle v7.2d, v13.2d, #0.0 + fcmle v3.4h, v20.4h, #0 + fcmle v1.8h, v8.8h, #0 fcmle v1.4s, v8.4s, #0 fcmle v3.2s, v20.2s, #0 fcmle v7.2d, v13.2d, #0 +// CHECK: fcmle v3.4h, v20.4h, #0.0 // encoding: [0x83,0xda,0xf8,0x2e] +// CHECK: fcmle v1.8h, v8.8h, #0.0 // encoding: [0x01,0xd9,0xf8,0x6e] // CHECK: fcmle v1.4s, v8.4s, #0.0 // encoding: [0x01,0xd9,0xa0,0x6e] // CHECK: fcmle v3.2s, v20.2s, #0.0 // encoding: [0x83,0xda,0xa0,0x2e] // CHECK: fcmle v7.2d, v13.2d, #0.0 // encoding: [0xa7,0xd9,0xe0,0x6e] +// CHECK: fcmle v3.4h, v20.4h, #0.0 // encoding: [0x83,0xda,0xf8,0x2e] +// CHECK: fcmle v1.8h, v8.8h, #0.0 // encoding: [0x01,0xd9,0xf8,0x6e] // CHECK: fcmle v1.4s, v8.4s, #0.0 // encoding: [0x01,0xd9,0xa0,0x6e] // CHECK: fcmle v3.2s, v20.2s, #0.0 // encoding: [0x83,0xda,0xa0,0x2e] // CHECK: fcmle v7.2d, v13.2d, #0.0 // encoding: [0xa7,0xd9,0xe0,0x6e] @@ -411,16 +463,24 @@ //---------------------------------------------------------------------- // Vector Compare Mask Less Than Zero (Floating Point) 
//---------------------------------------------------------------------- + fcmlt v16.4h, v2.4h, #0.0 + fcmlt v15.8h, v4.8h, #0.0 fcmlt v16.2s, v2.2s, #0.0 fcmlt v15.4s, v4.4s, #0.0 fcmlt v5.2d, v29.2d, #0.0 + fcmlt v16.4h, v2.4h, #0 + fcmlt v15.8h, v4.8h, #0 fcmlt v16.2s, v2.2s, #0 fcmlt v15.4s, v4.4s, #0 fcmlt v5.2d, v29.2d, #0 +// CHECK: fcmlt v16.4h, v2.4h, #0.0 // encoding: [0x50,0xe8,0xf8,0x0e] +// CHECK: fcmlt v15.8h, v4.8h, #0.0 // encoding: [0x8f,0xe8,0xf8,0x4e] // CHECK: fcmlt v16.2s, v2.2s, #0.0 // encoding: [0x50,0xe8,0xa0,0x0e] // CHECK: fcmlt v15.4s, v4.4s, #0.0 // encoding: [0x8f,0xe8,0xa0,0x4e] // CHECK: fcmlt v5.2d, v29.2d, #0.0 // encoding: [0xa5,0xeb,0xe0,0x4e] +// CHECK: fcmlt v16.4h, v2.4h, #0.0 // encoding: [0x50,0xe8,0xf8,0x0e] +// CHECK: fcmlt v15.8h, v4.8h, #0.0 // encoding: [0x8f,0xe8,0xf8,0x4e] // CHECK: fcmlt v16.2s, v2.2s, #0.0 // encoding: [0x50,0xe8,0xa0,0x0e] // CHECK: fcmlt v15.4s, v4.4s, #0.0 // encoding: [0x8f,0xe8,0xa0,0x4e] // CHECK: fcmlt v5.2d, v29.2d, #0.0 // encoding: [0xa5,0xeb,0xe0,0x4e] diff --git a/test/MC/AArch64/neon-diagnostics.s b/test/MC/AArch64/neon-diagnostics.s index 973acb8249ad..6ded6e40bfb9 100644 --- a/test/MC/AArch64/neon-diagnostics.s +++ b/test/MC/AArch64/neon-diagnostics.s @@ -341,7 +341,7 @@ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: fabd v0.2s, v1.4s, v2.2d // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fabd v0.4h, v1.4h, v2.4h // CHECK-ERROR: ^ //---------------------------------------------------------------------- @@ -385,7 +385,7 @@ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: frecps v0.4s, v1.2d, v2.4s // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: frecps v0.8h, v1.8h, v2.8h // CHECK-ERROR: ^ @@ -400,7 +400,7 @@ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: frsqrts v0.2d, v1.2d, v2.2s // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: frsqrts v0.4h, v1.4h, v2.4h // CHECK-ERROR: ^ @@ -417,7 +417,7 @@ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: facge v0.2d, v1.2s, v2.2d // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: facge v0.4h, v1.4h, v2.4h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction @@ -435,7 +435,7 @@ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: facgt v0.2d, v1.2d, v2.4s // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: facgt v0.8h, v1.8h, v2.8h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction @@ -1092,7 +1092,7 @@ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: fmin v0.4s, v1.4s, v2.2d // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fmin v0.8h, v1.8h, v2.8h // CHECK-ERROR: ^ @@ -1177,7 +1177,7 @@ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: fminp v0.4s, v1.4s, v2.2d // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fminp v0.8h, 
v1.8h, v2.8h // CHECK-ERROR: ^ @@ -1283,7 +1283,7 @@ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: fmulx v21.2s, v5.2s, v13.2d // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fmulx v1.4h, v25.4h, v3.4h // CHECK-ERROR: ^ @@ -3023,10 +3023,10 @@ fmla v0.2d, v1.2d, v2.d[2] fmla v0.2d, v1.2d, v22.d[2] -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fmla v0.4h, v1.4h, v2.h[2] // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fmla v0.8h, v1.8h, v2.h[2] // CHECK-ERROR: ^ // CHECK-ERROR: vector lane must be an integer in range @@ -3057,10 +3057,10 @@ fmls v0.2d, v1.2d, v2.d[2] fmls v0.2d, v1.2d, v22.d[2] -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fmls v0.4h, v1.4h, v2.h[2] // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fmls v0.8h, v1.8h, v2.h[2] // CHECK-ERROR: ^ // CHECK-ERROR: vector lane must be an integer in range @@ -3428,7 +3428,7 @@ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: mul v0.2d, v1.2d, v2.d[1] // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fmul v0.4h, v1.4h, v2.h[4] // CHECK-ERROR: ^ // CHECK-ERROR: vector lane must be an integer in range @@ -3458,7 +3458,7 @@ fmulx v0.2d, v1.2d, v2.d[2] fmulx v0.2d, v1.2d, v22.d[2] -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fmulx v0.4h, v1.4h, v2.h[4] // CHECK-ERROR: ^ // CHECK-ERROR: vector lane must be an integer in range @@ -3837,16 +3837,16 @@ fmaxv h0, v1.8h fminv h0, v1.8h -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fmaxnmv h0, v1.8h // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fminnmv h0, v1.8h // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fmaxv h0, v1.8h // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fminv h0, v1.8h // CHECK-ERROR: ^ @@ -5594,13 +5594,13 @@ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: fabs v0.16b, v31.16b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fabs v2.8h, v4.8h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: fabs v1.8b, v9.8b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fabs v13.4h, v21.4h // CHECK-ERROR: ^ @@ -5616,13 +5616,13 @@ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: fneg v0.16b, v31.16b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fneg v2.8h, v4.8h // CHECK-ERROR: ^ // 
CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: fneg v1.8b, v9.8b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fneg v13.4h, v21.4h // CHECK-ERROR: ^ @@ -5978,205 +5978,205 @@ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: frintn v0.16b, v31.16b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: frintn v2.8h, v4.8h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: frintn v1.8b, v9.8b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: frintn v13.4h, v21.4h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: frinta v0.16b, v31.16b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: frinta v2.8h, v4.8h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: frinta v1.8b, v9.8b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: frinta v13.4h, v21.4h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: frintp v0.16b, v31.16b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: frintp v2.8h, v4.8h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: frintp v1.8b, v9.8b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: frintp v13.4h, v21.4h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: frintm v0.16b, v31.16b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: frintm v2.8h, v4.8h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: frintm v1.8b, v9.8b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: frintm v13.4h, v21.4h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: frintx v0.16b, v31.16b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: frintx v2.8h, v4.8h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: frintx v1.8b, v9.8b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: frintx v13.4h, v21.4h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: frintz v0.16b, v31.16b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: frintz v2.8h, v4.8h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: frintz v1.8b, v9.8b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: 
fullfp16 // CHECK-ERROR: frintz v13.4h, v21.4h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: frinti v0.16b, v31.16b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: frinti v2.8h, v4.8h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: frinti v1.8b, v9.8b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: frinti v13.4h, v21.4h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: fcvtns v0.16b, v31.16b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fcvtns v2.8h, v4.8h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: fcvtns v1.8b, v9.8b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fcvtns v13.4h, v21.4h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: fcvtnu v0.16b, v31.16b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fcvtnu v2.8h, v4.8h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: fcvtnu v1.8b, v9.8b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fcvtnu v13.4h, v21.4h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: fcvtps v0.16b, v31.16b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fcvtps v2.8h, v4.8h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: fcvtps v1.8b, v9.8b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fcvtps v13.4h, v21.4h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: fcvtpu v0.16b, v31.16b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fcvtpu v2.8h, v4.8h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: fcvtpu v1.8b, v9.8b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fcvtpu v13.4h, v21.4h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: fcvtms v0.16b, v31.16b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fcvtms v2.8h, v4.8h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: fcvtms v1.8b, v9.8b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fcvtms v13.4h, v21.4h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: fcvtmu v0.16b, v31.16b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// 
CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fcvtmu v2.8h, v4.8h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: fcvtmu v1.8b, v9.8b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fcvtmu v13.4h, v21.4h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: fcvtzs v0.16b, v31.16b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fcvtzs v2.8h, v4.8h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: fcvtzs v1.8b, v9.8b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fcvtzs v13.4h, v21.4h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: fcvtzu v0.16b, v31.16b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fcvtzu v2.8h, v4.8h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: fcvtzu v1.8b, v9.8b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fcvtzu v13.4h, v21.4h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: fcvtas v0.16b, v31.16b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fcvtas v2.8h, v4.8h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: fcvtas v1.8b, v9.8b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fcvtas v13.4h, v21.4h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: fcvtau v0.16b, v31.16b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fcvtau v2.8h, v4.8h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: fcvtau v1.8b, v9.8b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fcvtau v13.4h, v21.4h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction @@ -6212,61 +6212,61 @@ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: scvtf v0.16b, v31.16b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: scvtf v2.8h, v4.8h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: scvtf v1.8b, v9.8b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: scvtf v13.4h, v21.4h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: ucvtf v0.16b, v31.16b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: ucvtf v2.8h, v4.8h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // 
CHECK-ERROR: ucvtf v1.8b, v9.8b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: ucvtf v13.4h, v21.4h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: frecpe v0.16b, v31.16b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: frecpe v2.8h, v4.8h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: frecpe v1.8b, v9.8b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: frecpe v13.4h, v21.4h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: frsqrte v0.16b, v31.16b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: frsqrte v2.8h, v4.8h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: frsqrte v1.8b, v9.8b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: frsqrte v13.4h, v21.4h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: fsqrt v0.16b, v31.16b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fsqrt v2.8h, v4.8h // CHECK-ERROR: ^ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: fsqrt v1.8b, v9.8b // CHECK-ERROR: ^ -// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: error: instruction requires: fullfp16 // CHECK-ERROR: fsqrt v13.4h, v21.4h // CHECK-ERROR: ^ diff --git a/test/MC/AArch64/neon-facge-facgt.s b/test/MC/AArch64/neon-facge-facgt.s index 212eda2f2092..9c10caa0f7c2 100644 --- a/test/MC/AArch64/neon-facge-facgt.s +++ b/test/MC/AArch64/neon-facge-facgt.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s +// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon,+fullfp16 -show-encoding < %s | FileCheck %s // Check that the assembler can handle the documented syntax for AArch64 @@ -6,16 +6,24 @@ // Vector Absolute Compare Mask Less Than Or Equal (Floating Point) // FACLE is alias for FACGE with operands reversed //---------------------------------------------------------------------- + facge v0.4h, v31.4h, v16.4h + facge v4.8h, v7.8h, v15.8h facge v0.2s, v31.2s, v16.2s facge v4.4s, v7.4s, v15.4s facge v29.2d, v2.2d, v5.2d + facle v0.4h, v16.4h, v31.4h + facle v4.8h, v15.8h, v7.8h facle v0.2s, v16.2s, v31.2s facle v4.4s, v15.4s, v7.4s facle v29.2d, v5.2d, v2.2d +// CHECK: facge v0.4h, v31.4h, v16.4h // encoding: [0xe0,0x2f,0x50,0x2e] +// CHECK: facge v4.8h, v7.8h, v15.8h // encoding: [0xe4,0x2c,0x4f,0x6e] // CHECK: facge v0.2s, v31.2s, v16.2s // encoding: [0xe0,0xef,0x30,0x2e] // CHECK: facge v4.4s, v7.4s, v15.4s // encoding: [0xe4,0xec,0x2f,0x6e] // CHECK: facge v29.2d, v2.2d, v5.2d // encoding: [0x5d,0xec,0x65,0x6e] +// CHECK: facge v0.4h, v31.4h, v16.4h // encoding: [0xe0,0x2f,0x50,0x2e] +// CHECK: facge v4.8h, v7.8h, v15.8h // encoding: [0xe4,0x2c,0x4f,0x6e] // CHECK: facge v0.2s, v31.2s, v16.2s // encoding: [0xe0,0xef,0x30,0x2e] // CHECK: facge v4.4s, v7.4s, v15.4s // encoding: [0xe4,0xec,0x2f,0x6e] // CHECK: facge v29.2d, v2.2d, 
v5.2d // encoding: [0x5d,0xec,0x65,0x6e] @@ -24,16 +32,24 @@ // Vector Absolute Compare Mask Less Than (Floating Point) // FACLT is alias for FACGT with operands reversed //---------------------------------------------------------------------- + facgt v3.4h, v8.4h, v12.4h + facgt v31.8h, v29.8h, v28.8h facgt v31.4s, v29.4s, v28.4s facgt v3.2s, v8.2s, v12.2s facgt v17.2d, v15.2d, v13.2d + faclt v3.4h, v12.4h, v8.4h + faclt v31.8h, v28.8h, v29.8h faclt v31.4s, v28.4s, v29.4s faclt v3.2s, v12.2s, v8.2s faclt v17.2d, v13.2d, v15.2d +// CHECK: facgt v3.4h, v8.4h, v12.4h // encoding: [0x03,0x2d,0xcc,0x2e] +// CHECK: facgt v31.8h, v29.8h, v28.8h // encoding: [0xbf,0x2f,0xdc,0x6e] // CHECK: facgt v31.4s, v29.4s, v28.4s // encoding: [0xbf,0xef,0xbc,0x6e] // CHECK: facgt v3.2s, v8.2s, v12.2s // encoding: [0x03,0xed,0xac,0x2e] // CHECK: facgt v17.2d, v15.2d, v13.2d // encoding: [0xf1,0xed,0xed,0x6e] +// CHECK: facgt v3.4h, v8.4h, v12.4h // encoding: [0x03,0x2d,0xcc,0x2e] +// CHECK: facgt v31.8h, v29.8h, v28.8h // encoding: [0xbf,0x2f,0xdc,0x6e] // CHECK: facgt v31.4s, v29.4s, v28.4s // encoding: [0xbf,0xef,0xbc,0x6e] // CHECK: facgt v3.2s, v8.2s, v12.2s // encoding: [0x03,0xed,0xac,0x2e] // CHECK: facgt v17.2d, v15.2d, v13.2d // encoding: [0xf1,0xed,0xed,0x6e] diff --git a/test/MC/AArch64/neon-frsqrt-frecp.s b/test/MC/AArch64/neon-frsqrt-frecp.s index 79fe5da5e76f..67a1340ecc32 100644 --- a/test/MC/AArch64/neon-frsqrt-frecp.s +++ b/test/MC/AArch64/neon-frsqrt-frecp.s @@ -1,14 +1,18 @@ -// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s +// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon,+fullfp16 -show-encoding < %s | FileCheck %s // Check that the assembler can handle the documented syntax for AArch64 //---------------------------------------------------------------------- // Vector Reciprocal Square Root Step (Floating Point) //---------------------------------------------------------------------- + frsqrts v0.4h, v31.4h, v16.4h + frsqrts v4.8h, v7.8h, v15.8h frsqrts v0.2s, v31.2s, v16.2s frsqrts v4.4s, v7.4s, v15.4s frsqrts v29.2d, v2.2d, v5.2d +// CHECK: frsqrts v0.4h, v31.4h, v16.4h // encoding: [0xe0,0x3f,0xd0,0x0e] +// CHECK: frsqrts v4.8h, v7.8h, v15.8h // encoding: [0xe4,0x3c,0xcf,0x4e] // CHECK: frsqrts v0.2s, v31.2s, v16.2s // encoding: [0xe0,0xff,0xb0,0x0e] // CHECK: frsqrts v4.4s, v7.4s, v15.4s // encoding: [0xe4,0xfc,0xaf,0x4e] // CHECK: frsqrts v29.2d, v2.2d, v5.2d // encoding: [0x5d,0xfc,0xe5,0x4e] @@ -16,10 +20,14 @@ //---------------------------------------------------------------------- // Vector Reciprocal Step (Floating Point) //---------------------------------------------------------------------- + frecps v3.4h, v8.4h, v12.4h + frecps v31.8h, v29.8h, v28.8h frecps v31.4s, v29.4s, v28.4s frecps v3.2s, v8.2s, v12.2s frecps v17.2d, v15.2d, v13.2d +// CHECK: frecps v3.4h, v8.4h, v12.4h // encoding: [0x03,0x3d,0x4c,0x0e] +// CHECK: frecps v31.8h, v29.8h, v28.8h // encoding: [0xbf,0x3f,0x5c,0x4e] // CHECK: frecps v31.4s, v29.4s, v28.4s // encoding: [0xbf,0xff,0x3c,0x4e] // CHECK: frecps v3.2s, v8.2s, v12.2s // encoding: [0x03,0xfd,0x2c,0x0e] // CHECK: frecps v17.2d, v15.2d, v13.2d // encoding: [0xf1,0xfd,0x6d,0x4e] diff --git a/test/MC/AArch64/neon-max-min-pairwise.s b/test/MC/AArch64/neon-max-min-pairwise.s index 8d2dadb1997f..27cf4c8d830a 100644 --- a/test/MC/AArch64/neon-max-min-pairwise.s +++ b/test/MC/AArch64/neon-max-min-pairwise.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < 
%s | FileCheck %s +// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon,+fullfp16 -show-encoding < %s | FileCheck %s // Check that the assembler can handle the documented syntax for AArch64 @@ -67,10 +67,14 @@ //---------------------------------------------------------------------- // Vector Maximum Pairwise (Floating Point) //---------------------------------------------------------------------- + fmaxp v0.4h, v1.4h, v2.4h + fmaxp v31.8h, v15.8h, v16.8h fmaxp v0.2s, v1.2s, v2.2s fmaxp v31.4s, v15.4s, v16.4s fmaxp v7.2d, v8.2d, v25.2d +// CHECK: fmaxp v0.4h, v1.4h, v2.4h // encoding: [0x20,0x34,0x42,0x2e] +// CHECK: fmaxp v31.8h, v15.8h, v16.8h // encoding: [0xff,0x35,0x50,0x6e] // CHECK: fmaxp v0.2s, v1.2s, v2.2s // encoding: [0x20,0xf4,0x22,0x2e] // CHECK: fmaxp v31.4s, v15.4s, v16.4s // encoding: [0xff,0xf5,0x30,0x6e] // CHECK: fmaxp v7.2d, v8.2d, v25.2d // encoding: [0x07,0xf5,0x79,0x6e] @@ -78,10 +82,14 @@ //---------------------------------------------------------------------- // Vector Minimum Pairwise (Floating Point) //---------------------------------------------------------------------- + fminp v10.4h, v15.4h, v22.4h + fminp v3.8h, v5.8h, v6.8h fminp v10.2s, v15.2s, v22.2s fminp v3.4s, v5.4s, v6.4s fminp v17.2d, v13.2d, v2.2d +// CHECK: fminp v10.4h, v15.4h, v22.4h // encoding: [0xea,0x35,0xd6,0x2e] +// CHECK: fminp v3.8h, v5.8h, v6.8h // encoding: [0xa3,0x34,0xc6,0x6e] // CHECK: fminp v10.2s, v15.2s, v22.2s // encoding: [0xea,0xf5,0xb6,0x2e] // CHECK: fminp v3.4s, v5.4s, v6.4s // encoding: [0xa3,0xf4,0xa6,0x6e] // CHECK: fminp v17.2d, v13.2d, v2.2d // encoding: [0xb1,0xf5,0xe2,0x6e] @@ -89,10 +97,14 @@ //---------------------------------------------------------------------- // Vector maxNum Pairwise (Floating Point) //---------------------------------------------------------------------- + fmaxnmp v0.4h, v1.4h, v2.4h + fmaxnmp v31.8h, v15.8h, v16.8h fmaxnmp v0.2s, v1.2s, v2.2s fmaxnmp v31.4s, v15.4s, v16.4s fmaxnmp v7.2d, v8.2d, v25.2d +// CHECK: fmaxnmp v0.4h, v1.4h, v2.4h // encoding: [0x20,0x04,0x42,0x2e] +// CHECK: fmaxnmp v31.8h, v15.8h, v16.8h // encoding: [0xff,0x05,0x50,0x6e] // CHECK: fmaxnmp v0.2s, v1.2s, v2.2s // encoding: [0x20,0xc4,0x22,0x2e] // CHECK: fmaxnmp v31.4s, v15.4s, v16.4s // encoding: [0xff,0xc5,0x30,0x6e] // CHECK: fmaxnmp v7.2d, v8.2d, v25.2d // encoding: [0x07,0xc5,0x79,0x6e] @@ -100,10 +112,14 @@ //---------------------------------------------------------------------- // Vector minNum Pairwise (Floating Point) //---------------------------------------------------------------------- + fminnmp v10.4h, v15.4h, v22.4h + fminnmp v3.8h, v5.8h, v6.8h fminnmp v10.2s, v15.2s, v22.2s fminnmp v3.4s, v5.4s, v6.4s fminnmp v17.2d, v13.2d, v2.2d +// CHECK: fminnmp v10.4h, v15.4h, v22.4h // encoding: [0xea,0x05,0xd6,0x2e] +// CHECK: fminnmp v3.8h, v5.8h, v6.8h // encoding: [0xa3,0x04,0xc6,0x6e] // CHECK: fminnmp v10.2s, v15.2s, v22.2s // encoding: [0xea,0xc5,0xb6,0x2e] // CHECK: fminnmp v3.4s, v5.4s, v6.4s // encoding: [0xa3,0xc4,0xa6,0x6e] // CHECK: fminnmp v17.2d, v13.2d, v2.2d // encoding: [0xb1,0xc5,0xe2,0x6e] diff --git a/test/MC/AArch64/neon-max-min.s b/test/MC/AArch64/neon-max-min.s index 6d1efde5077f..c4bd74d98882 100644 --- a/test/MC/AArch64/neon-max-min.s +++ b/test/MC/AArch64/neon-max-min.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s +// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon,+fullfp16 -show-encoding < %s | FileCheck %s // Check that the assembler can handle the 
documented syntax for AArch64 @@ -67,10 +67,14 @@ //---------------------------------------------------------------------- // Vector Maximum (Floating Point) //---------------------------------------------------------------------- + fmax v0.4h, v1.4h, v2.4h + fmax v0.8h, v1.8h, v2.8h fmax v0.2s, v1.2s, v2.2s fmax v31.4s, v15.4s, v16.4s fmax v7.2d, v8.2d, v25.2d +// CHECK: fmax v0.4h, v1.4h, v2.4h // encoding: [0x20,0x34,0x42,0x0e] +// CHECK: fmax v0.8h, v1.8h, v2.8h // encoding: [0x20,0x34,0x42,0x4e] // CHECK: fmax v0.2s, v1.2s, v2.2s // encoding: [0x20,0xf4,0x22,0x0e] // CHECK: fmax v31.4s, v15.4s, v16.4s // encoding: [0xff,0xf5,0x30,0x4e] // CHECK: fmax v7.2d, v8.2d, v25.2d // encoding: [0x07,0xf5,0x79,0x4e] @@ -78,10 +82,14 @@ //---------------------------------------------------------------------- // Vector Minimum (Floating Point) //---------------------------------------------------------------------- + fmin v10.4h, v15.4h, v22.4h + fmin v10.8h, v15.8h, v22.8h fmin v10.2s, v15.2s, v22.2s fmin v3.4s, v5.4s, v6.4s fmin v17.2d, v13.2d, v2.2d +// CHECK: fmin v10.4h, v15.4h, v22.4h // encoding: [0xea,0x35,0xd6,0x0e] +// CHECK: fmin v10.8h, v15.8h, v22.8h // encoding: [0xea,0x35,0xd6,0x4e] // CHECK: fmin v10.2s, v15.2s, v22.2s // encoding: [0xea,0xf5,0xb6,0x0e] // CHECK: fmin v3.4s, v5.4s, v6.4s // encoding: [0xa3,0xf4,0xa6,0x4e] // CHECK: fmin v17.2d, v13.2d, v2.2d // encoding: [0xb1,0xf5,0xe2,0x4e] @@ -89,10 +97,14 @@ //---------------------------------------------------------------------- // Vector maxNum (Floating Point) //---------------------------------------------------------------------- + fmaxnm v0.4h, v1.4h, v2.4h + fmaxnm v0.8h, v1.8h, v2.8h fmaxnm v0.2s, v1.2s, v2.2s fmaxnm v31.4s, v15.4s, v16.4s fmaxnm v7.2d, v8.2d, v25.2d +// CHECK: fmaxnm v0.4h, v1.4h, v2.4h // encoding: [0x20,0x04,0x42,0x0e] +// CHECK: fmaxnm v0.8h, v1.8h, v2.8h // encoding: [0x20,0x04,0x42,0x4e] // CHECK: fmaxnm v0.2s, v1.2s, v2.2s // encoding: [0x20,0xc4,0x22,0x0e] // CHECK: fmaxnm v31.4s, v15.4s, v16.4s // encoding: [0xff,0xc5,0x30,0x4e] // CHECK: fmaxnm v7.2d, v8.2d, v25.2d // encoding: [0x07,0xc5,0x79,0x4e] @@ -100,10 +112,14 @@ //---------------------------------------------------------------------- // Vector minNum (Floating Point) //---------------------------------------------------------------------- + fminnm v10.4h, v15.4h, v22.4h + fminnm v10.8h, v15.8h, v22.8h fminnm v10.2s, v15.2s, v22.2s fminnm v3.4s, v5.4s, v6.4s fminnm v17.2d, v13.2d, v2.2d +// CHECK: fminnm v10.4h, v15.4h, v22.4h // encoding: [0xea,0x05,0xd6,0x0e] +// CHECK: fminnm v10.8h, v15.8h, v22.8h // encoding: [0xea,0x05,0xd6,0x4e] // CHECK: fminnm v10.2s, v15.2s, v22.2s // encoding: [0xea,0xc5,0xb6,0x0e] // CHECK: fminnm v3.4s, v5.4s, v6.4s // encoding: [0xa3,0xc4,0xa6,0x4e] // CHECK: fminnm v17.2d, v13.2d, v2.2d // encoding: [0xb1,0xc5,0xe2,0x4e] diff --git a/test/MC/AArch64/neon-mla-mls-instructions.s b/test/MC/AArch64/neon-mla-mls-instructions.s index 3072e6f1200d..a510fc8c7b91 100644 --- a/test/MC/AArch64/neon-mla-mls-instructions.s +++ b/test/MC/AArch64/neon-mla-mls-instructions.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s +// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon,+fullfp16 -show-encoding < %s | FileCheck %s // Check that the assembler can handle the documented syntax for AArch64 @@ -40,10 +40,14 @@ //---------------------------------------------------------------------- // Vector Floating-Point Multiply-accumulate 
//---------------------------------------------------------------------- + fmla v0.4h, v1.4h, v2.4h + fmla v0.8h, v1.8h, v2.8h fmla v0.2s, v1.2s, v2.2s fmla v0.4s, v1.4s, v2.4s fmla v0.2d, v1.2d, v2.2d +// CHECK: fmla v0.4h, v1.4h, v2.4h // encoding: [0x20,0x0c,0x42,0x0e] +// CHECK: fmla v0.8h, v1.8h, v2.8h // encoding: [0x20,0x0c,0x42,0x4e] // CHECK: fmla v0.2s, v1.2s, v2.2s // encoding: [0x20,0xcc,0x22,0x0e] // CHECK: fmla v0.4s, v1.4s, v2.4s // encoding: [0x20,0xcc,0x22,0x4e] // CHECK: fmla v0.2d, v1.2d, v2.2d // encoding: [0x20,0xcc,0x62,0x4e] @@ -51,10 +55,14 @@ //---------------------------------------------------------------------- // Vector Floating-Point Multiply-subtract //---------------------------------------------------------------------- + fmls v0.4h, v1.4h, v2.4h + fmls v0.8h, v1.8h, v2.8h fmls v0.2s, v1.2s, v2.2s fmls v0.4s, v1.4s, v2.4s fmls v0.2d, v1.2d, v2.2d +// CHECK: fmls v0.4h, v1.4h, v2.4h // encoding: [0x20,0x0c,0xc2,0x0e] +// CHECK: fmls v0.8h, v1.8h, v2.8h // encoding: [0x20,0x0c,0xc2,0x4e] // CHECK: fmls v0.2s, v1.2s, v2.2s // encoding: [0x20,0xcc,0xa2,0x0e] // CHECK: fmls v0.4s, v1.4s, v2.4s // encoding: [0x20,0xcc,0xa2,0x4e] // CHECK: fmls v0.2d, v1.2d, v2.2d // encoding: [0x20,0xcc,0xe2,0x4e] diff --git a/test/MC/AArch64/neon-scalar-abs.s b/test/MC/AArch64/neon-scalar-abs.s index d08756c0c10c..71130617848f 100644 --- a/test/MC/AArch64/neon-scalar-abs.s +++ b/test/MC/AArch64/neon-scalar-abs.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s +// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon,+fullfp16 -show-encoding < %s | FileCheck %s // Check that the assembler can handle the documented syntax for AArch64 @@ -14,9 +14,11 @@ // Scalar Floating-point Absolute Difference //---------------------------------------------------------------------- + fabd h29, h24, h20 fabd s29, s24, s20 fabd d29, d24, d20 +// CHECK: fabd h29, h24, h20 // encoding: [0x1d,0x17,0xd4,0x7e] // CHECK: fabd s29, s24, s20 // encoding: [0x1d,0xd7,0xb4,0x7e] // CHECK: fabd d29, d24, d20 // encoding: [0x1d,0xd7,0xf4,0x7e] diff --git a/test/MC/AArch64/neon-scalar-by-elem-mla.s b/test/MC/AArch64/neon-scalar-by-elem-mla.s index fec9d12d8b8d..394fda673e20 100644 --- a/test/MC/AArch64/neon-scalar-by-elem-mla.s +++ b/test/MC/AArch64/neon-scalar-by-elem-mla.s @@ -1,8 +1,9 @@ -// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s +// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon,+fullfp16 -show-encoding < %s | FileCheck %s //------------------------------------------------------------------------------ // Floating Point fused multiply-add (scalar, by element) //------------------------------------------------------------------------------ + fmla h0, h1, v1.h[5] fmla s0, s1, v1.s[0] fmla s30, s11, v1.s[1] fmla s4, s5, v7.s[2] @@ -10,6 +11,7 @@ fmla d0, d1, v1.d[0] fmla d30, d11, v1.d[1] +// CHECK: fmla h0, h1, v1.h[5] // encoding: [0x20,0x18,0x11,0x5f] // CHECK: fmla s0, s1, v1.s[0] // encoding: [0x20,0x10,0x81,0x5f] // CHECK: fmla s30, s11, v1.s[1] // encoding: [0x7e,0x11,0xa1,0x5f] // CHECK: fmla s4, s5, v7.s[2] // encoding: [0xa4,0x18,0x87,0x5f] @@ -21,6 +23,7 @@ // Floating Point fused multiply-subtract (scalar, by element) //------------------------------------------------------------------------------ + fmls h2, h3, v4.h[5] fmls s2, s3, v4.s[0] fmls s29, s10, v28.s[1] fmls s5, s12, v23.s[2] @@ -28,6 +31,7 @@ fmls d0, d1, v1.d[0] fmls d30, d11, v1.d[1] +// CHECK: fmls h2, 
h3, v4.h[5] // encoding: [0x62,0x58,0x14,0x5f] // CHECK: fmls s2, s3, v4.s[0] // encoding: [0x62,0x50,0x84,0x5f] // CHECK: fmls s29, s10, v28.s[1] // encoding: [0x5d,0x51,0xbc,0x5f] // CHECK: fmls s5, s12, v23.s[2] // encoding: [0x85,0x59,0x97,0x5f] diff --git a/test/MC/AArch64/neon-scalar-by-elem-mul.s b/test/MC/AArch64/neon-scalar-by-elem-mul.s index 8b8a3f57a9ca..0d832742a389 100644 --- a/test/MC/AArch64/neon-scalar-by-elem-mul.s +++ b/test/MC/AArch64/neon-scalar-by-elem-mul.s @@ -1,8 +1,9 @@ -// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s +// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon,+fullfp16 -show-encoding < %s | FileCheck %s //------------------------------------------------------------------------------ // Floating Point multiply (scalar, by element) //------------------------------------------------------------------------------ + fmul h0, h1, v1.h[5] fmul s0, s1, v1.s[0] fmul s30, s11, v1.s[1] fmul s4, s5, v7.s[2] @@ -10,6 +11,7 @@ fmul d0, d1, v1.d[0] fmul d30, d11, v1.d[1] +// CHECK: fmul h0, h1, v1.h[5] // encoding: [0x20,0x98,0x11,0x5f] // CHECK: fmul s0, s1, v1.s[0] // encoding: [0x20,0x90,0x81,0x5f] // CHECK: fmul s30, s11, v1.s[1] // encoding: [0x7e,0x91,0xa1,0x5f] // CHECK: fmul s4, s5, v7.s[2] // encoding: [0xa4,0x98,0x87,0x5f] @@ -21,6 +23,7 @@ //------------------------------------------------------------------------------ // Floating Point multiply extended (scalar, by element) //------------------------------------------------------------------------------ + fmulx h6, h2, v8.h[5] fmulx s6, s2, v8.s[0] fmulx s7, s3, v13.s[1] fmulx s9, s7, v9.s[2] @@ -28,6 +31,7 @@ fmulx d15, d9, v7.d[0] fmulx d13, d12, v11.d[1] +// CHECK: fmulx h6, h2, v8.h[5] // encoding: [0x46,0x98,0x18,0x7f] // CHECK: fmulx s6, s2, v8.s[0] // encoding: [0x46,0x90,0x88,0x7f] // CHECK: fmulx s7, s3, v13.s[1] // encoding: [0x67,0x90,0xad,0x7f] // CHECK: fmulx s9, s7, v9.s[2] // encoding: [0xe9,0x98,0x89,0x7f] diff --git a/test/MC/AArch64/neon-scalar-cvt.s b/test/MC/AArch64/neon-scalar-cvt.s index 97416daf0801..3cbf6bae6758 100644 --- a/test/MC/AArch64/neon-scalar-cvt.s +++ b/test/MC/AArch64/neon-scalar-cvt.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s +// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon,+fullfp16 -show-encoding < %s | FileCheck %s // Check that the assembler can handle the documented syntax for AArch64 @@ -6,9 +6,11 @@ // Scalar Signed Integer Convert To Floating-point //---------------------------------------------------------------------- + scvtf h23, h14 scvtf s22, s13 scvtf d21, d12 +// CHECK: scvtf h23, h14 // encoding: [0xd7,0xd9,0x79,0x5e] // CHECK: scvtf s22, s13 // encoding: [0xb6,0xd9,0x21,0x5e] // CHECK: scvtf d21, d12 // encoding: [0x95,0xd9,0x61,0x5e] @@ -16,9 +18,11 @@ // Scalar Unsigned Integer Convert To Floating-point //---------------------------------------------------------------------- + ucvtf h20, h12 ucvtf s22, s13 ucvtf d21, d14 +// CHECK: ucvtf h20, h12 // encoding: [0x94,0xd9,0x79,0x7e] // CHECK: ucvtf s22, s13 // encoding: [0xb6,0xd9,0x21,0x7e] // CHECK: ucvtf d21, d14 // encoding: [0xd5,0xd9,0x61,0x7e] @@ -26,9 +30,11 @@ // Scalar Signed Fixed-point Convert To Floating-Point (Immediate) //---------------------------------------------------------------------- + scvtf h22, h13, #16 scvtf s22, s13, #32 scvtf d21, d12, #64 +// CHECK: scvtf h22, h13, #16 // encoding: [0xb6,0xe5,0x10,0x5f] // CHECK: scvtf s22, s13, #32 // 
encoding: [0xb6,0xe5,0x20,0x5f] // CHECK: scvtf d21, d12, #64 // encoding: [0x95,0xe5,0x40,0x5f] @@ -36,9 +42,11 @@ // Scalar Unsigned Fixed-point Convert To Floating-Point (Immediate) //---------------------------------------------------------------------- + ucvtf h22, h13, #16 ucvtf s22, s13, #32 ucvtf d21, d14, #64 +// CHECK: ucvtf h22, h13, #16 // encoding: [0xb6,0xe5,0x10,0x7f] // CHECK: ucvtf s22, s13, #32 // encoding: [0xb6,0xe5,0x20,0x7f] // CHECK: ucvtf d21, d14, #64 // encoding: [0xd5,0xe5,0x40,0x7f] @@ -46,9 +54,11 @@ // Scalar Floating-point Convert To Signed Fixed-point (Immediate) //---------------------------------------------------------------------- + fcvtzs h21, h12, #1 fcvtzs s21, s12, #1 fcvtzs d21, d12, #1 +// CHECK: fcvtzs h21, h12, #1 // encoding: [0x95,0xfd,0x1f,0x5f] // CHECK: fcvtzs s21, s12, #1 // encoding: [0x95,0xfd,0x3f,0x5f] // CHECK: fcvtzs d21, d12, #1 // encoding: [0x95,0xfd,0x7f,0x5f] @@ -56,9 +66,11 @@ // Scalar Floating-point Convert To Unsigned Fixed-point (Immediate) //---------------------------------------------------------------------- + fcvtzu h21, h12, #1 fcvtzu s21, s12, #1 fcvtzu d21, d12, #1 +// CHECK: fcvtzu h21, h12, #1 // encoding: [0x95,0xfd,0x1f,0x7f] // CHECK: fcvtzu s21, s12, #1 // encoding: [0x95,0xfd,0x3f,0x7f] // CHECK: fcvtzu d21, d12, #1 // encoding: [0x95,0xfd,0x7f,0x7f] @@ -76,9 +88,11 @@ // With Ties To Away //---------------------------------------------------------------------- + fcvtas h12, h13 fcvtas s12, s13 fcvtas d21, d14 +// CHECK: fcvtas h12, h13 // encoding: [0xac,0xc9,0x79,0x5e] // CHECK: fcvtas s12, s13 // encoding: [0xac,0xc9,0x21,0x5e] // CHECK: fcvtas d21, d14 // encoding: [0xd5,0xc9,0x61,0x5e] @@ -87,9 +101,11 @@ // Nearest With Ties To Away //---------------------------------------------------------------------- + fcvtau h12, h13 fcvtau s12, s13 fcvtau d21, d14 +// CHECK: fcvtau h12, h13 // encoding: [0xac,0xc9,0x79,0x7e] // CHECK: fcvtau s12, s13 // encoding: [0xac,0xc9,0x21,0x7e] // CHECK: fcvtau d21, d14 // encoding: [0xd5,0xc9,0x61,0x7e] @@ -98,9 +114,11 @@ // Minus Infinity //---------------------------------------------------------------------- + fcvtms h22, h13 fcvtms s22, s13 fcvtms d21, d14 +// CHECK: fcvtms h22, h13 // encoding: [0xb6,0xb9,0x79,0x5e] // CHECK: fcvtms s22, s13 // encoding: [0xb6,0xb9,0x21,0x5e] // CHECK: fcvtms d21, d14 // encoding: [0xd5,0xb9,0x61,0x5e] @@ -109,9 +127,11 @@ // Minus Infinity //---------------------------------------------------------------------- + fcvtmu h12, h13 fcvtmu s12, s13 fcvtmu d21, d14 +// CHECK: fcvtmu h12, h13 // encoding: [0xac,0xb9,0x79,0x7e] // CHECK: fcvtmu s12, s13 // encoding: [0xac,0xb9,0x21,0x7e] // CHECK: fcvtmu d21, d14 // encoding: [0xd5,0xb9,0x61,0x7e] @@ -120,9 +140,11 @@ // With Ties To Even //---------------------------------------------------------------------- + fcvtns h22, h13 fcvtns s22, s13 fcvtns d21, d14 +// CHECK: fcvtns h22, h13 // encoding: [0xb6,0xa9,0x79,0x5e] // CHECK: fcvtns s22, s13 // encoding: [0xb6,0xa9,0x21,0x5e] // CHECK: fcvtns d21, d14 // encoding: [0xd5,0xa9,0x61,0x5e] @@ -131,9 +153,11 @@ // Nearest With Ties To Even //---------------------------------------------------------------------- + fcvtnu h12, h13 fcvtnu s12, s13 fcvtnu d21, d14 +// CHECK: fcvtnu h12, h13 // encoding: [0xac,0xa9,0x79,0x7e] // CHECK: fcvtnu s12, s13 // encoding: [0xac,0xa9,0x21,0x7e] // CHECK: fcvtnu d21, d14 // encoding: [0xd5,0xa9,0x61,0x7e] @@ -142,9 +166,11 @@ // Positive Infinity 
//---------------------------------------------------------------------- + fcvtps h22, h13 fcvtps s22, s13 fcvtps d21, d14 +// CHECK: fcvtps h22, h13 // encoding: [0xb6,0xa9,0xf9,0x5e] // CHECK: fcvtps s22, s13 // encoding: [0xb6,0xa9,0xa1,0x5e] // CHECK: fcvtps d21, d14 // encoding: [0xd5,0xa9,0xe1,0x5e] @@ -153,9 +179,11 @@ // Positive Infinity //---------------------------------------------------------------------- + fcvtpu h12, h13 fcvtpu s12, s13 fcvtpu d21, d14 +// CHECK: fcvtpu h12, h13 // encoding: [0xac,0xa9,0xf9,0x7e] // CHECK: fcvtpu s12, s13 // encoding: [0xac,0xa9,0xa1,0x7e] // CHECK: fcvtpu d21, d14 // encoding: [0xd5,0xa9,0xe1,0x7e] @@ -163,9 +191,11 @@ // Scalar Floating-point Convert To Signed Integer, Rounding Toward Zero //---------------------------------------------------------------------- + fcvtzs h12, h13 fcvtzs s12, s13 fcvtzs d21, d14 +// CHECK: fcvtzs h12, h13 // encoding: [0xac,0xb9,0xf9,0x5e] // CHECK: fcvtzs s12, s13 // encoding: [0xac,0xb9,0xa1,0x5e] // CHECK: fcvtzs d21, d14 // encoding: [0xd5,0xb9,0xe1,0x5e] @@ -174,8 +204,10 @@ // Zero //---------------------------------------------------------------------- + fcvtzu h12, h13 fcvtzu s12, s13 fcvtzu d21, d14 +// CHECK: fcvtzu h12, h13 // encoding: [0xac,0xb9,0xf9,0x7e] // CHECK: fcvtzu s12, s13 // encoding: [0xac,0xb9,0xa1,0x7e] // CHECK: fcvtzu d21, d14 // encoding: [0xd5,0xb9,0xe1,0x7e] diff --git a/test/MC/AArch64/neon-scalar-fp-compare.s b/test/MC/AArch64/neon-scalar-fp-compare.s index b798b3410670..0b91d945a719 100644 --- a/test/MC/AArch64/neon-scalar-fp-compare.s +++ b/test/MC/AArch64/neon-scalar-fp-compare.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s +// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon,+fullfp16 -show-encoding < %s | FileCheck %s // Check that the assembler can handle the documented syntax for AArch64 @@ -6,9 +6,11 @@ // Scalar Floating-point Compare Mask Equal //---------------------------------------------------------------------- + fcmeq h10, h11, h12 fcmeq s10, s11, s12 fcmeq d20, d21, d22 +// CHECK: fcmeq h10, h11, h12 // encoding: [0x6a,0x25,0x4c,0x5e] // CHECK: fcmeq s10, s11, s12 // encoding: [0x6a,0xe5,0x2c,0x5e] // CHECK: fcmeq d20, d21, d22 // encoding: [0xb4,0xe6,0x76,0x5e] @@ -16,13 +18,17 @@ // Scalar Floating-point Compare Mask Equal To Zero //---------------------------------------------------------------------- + fcmeq h10, h11, #0.0 fcmeq s10, s11, #0.0 fcmeq d20, d21, #0.0 + fcmeq h10, h11, #0 fcmeq s10, s11, #0 fcmeq d20, d21, #0x0 +// CHECK: fcmeq h10, h11, #0.0 // encoding: [0x6a,0xd9,0xf8,0x5e] // CHECK: fcmeq s10, s11, #0.0 // encoding: [0x6a,0xd9,0xa0,0x5e] // CHECK: fcmeq d20, d21, #0.0 // encoding: [0xb4,0xda,0xe0,0x5e] +// CHECK: fcmeq h10, h11, #0.0 // encoding: [0x6a,0xd9,0xf8,0x5e] // CHECK: fcmeq s10, s11, #0.0 // encoding: [0x6a,0xd9,0xa0,0x5e] // CHECK: fcmeq d20, d21, #0.0 // encoding: [0xb4,0xda,0xe0,0x5e] @@ -30,9 +36,11 @@ // Scalar Floating-point Compare Mask Greater Than Or Equal //---------------------------------------------------------------------- + fcmge h10, h11, h12 fcmge s10, s11, s12 fcmge d20, d21, d22 +// CHECK: fcmge h10, h11, h12 // encoding: [0x6a,0x25,0x4c,0x7e] // CHECK: fcmge s10, s11, s12 // encoding: [0x6a,0xe5,0x2c,0x7e] // CHECK: fcmge d20, d21, d22 // encoding: [0xb4,0xe6,0x76,0x7e] @@ -40,13 +48,17 @@ // Scalar Floating-point Compare Mask Greater Than Or Equal To Zero //---------------------------------------------------------------------- + fcmge 
h10, h11, #0.0 fcmge s10, s11, #0.0 fcmge d20, d21, #0.0 + fcmge h10, h11, #0 fcmge s10, s11, #0 fcmge d20, d21, #0x0 +// CHECK: fcmge h10, h11, #0.0 // encoding: [0x6a,0xc9,0xf8,0x7e] // CHECK: fcmge s10, s11, #0.0 // encoding: [0x6a,0xc9,0xa0,0x7e] // CHECK: fcmge d20, d21, #0.0 // encoding: [0xb4,0xca,0xe0,0x7e] +// CHECK: fcmge h10, h11, #0.0 // encoding: [0x6a,0xc9,0xf8,0x7e] // CHECK: fcmge s10, s11, #0.0 // encoding: [0x6a,0xc9,0xa0,0x7e] // CHECK: fcmge d20, d21, #0.0 // encoding: [0xb4,0xca,0xe0,0x7e] @@ -54,9 +66,11 @@ // Scalar Floating-point Compare Mask Greather Than //---------------------------------------------------------------------- + fcmgt h10, h11, h12 fcmgt s10, s11, s12 fcmgt d20, d21, d22 +// CHECK: fcmgt h10, h11, h12 // encoding: [0x6a,0x25,0xcc,0x7e] // CHECK: fcmgt s10, s11, s12 // encoding: [0x6a,0xe5,0xac,0x7e] // CHECK: fcmgt d20, d21, d22 // encoding: [0xb4,0xe6,0xf6,0x7e] @@ -64,13 +78,17 @@ // Scalar Floating-point Compare Mask Greather Than Zero //---------------------------------------------------------------------- + fcmgt h10, h11, #0.0 fcmgt s10, s11, #0.0 fcmgt d20, d21, #0.0 + fcmgt h10, h11, #0 fcmgt s10, s11, #0 fcmgt d20, d21, #0x0 +// CHECK: fcmgt h10, h11, #0.0 // encoding: [0x6a,0xc9,0xf8,0x5e] // CHECK: fcmgt s10, s11, #0.0 // encoding: [0x6a,0xc9,0xa0,0x5e] // CHECK: fcmgt d20, d21, #0.0 // encoding: [0xb4,0xca,0xe0,0x5e] +// CHECK: fcmgt h10, h11, #0.0 // encoding: [0x6a,0xc9,0xf8,0x5e] // CHECK: fcmgt s10, s11, #0.0 // encoding: [0x6a,0xc9,0xa0,0x5e] // CHECK: fcmgt d20, d21, #0.0 // encoding: [0xb4,0xca,0xe0,0x5e] @@ -78,13 +96,17 @@ // Scalar Floating-point Compare Mask Less Than Or Equal To Zero //---------------------------------------------------------------------- + fcmle h10, h11, #0.0 fcmle s10, s11, #0.0 fcmle d20, d21, #0.0 + fcmle h10, h11, #0 fcmle s10, s11, #0 fcmle d20, d21, #0x0 +// CHECK: fcmle h10, h11, #0.0 // encoding: [0x6a,0xd9,0xf8,0x7e] // CHECK: fcmle s10, s11, #0.0 // encoding: [0x6a,0xd9,0xa0,0x7e] // CHECK: fcmle d20, d21, #0.0 // encoding: [0xb4,0xda,0xe0,0x7e] +// CHECK: fcmle h10, h11, #0.0 // encoding: [0x6a,0xd9,0xf8,0x7e] // CHECK: fcmle s10, s11, #0.0 // encoding: [0x6a,0xd9,0xa0,0x7e] // CHECK: fcmle d20, d21, #0.0 // encoding: [0xb4,0xda,0xe0,0x7e] @@ -92,13 +114,17 @@ // Scalar Floating-point Compare Mask Less Than //---------------------------------------------------------------------- + fcmlt h10, h11, #0.0 fcmlt s10, s11, #0.0 fcmlt d20, d21, #0.0 + fcmlt h10, h11, #0 fcmlt s10, s11, #0 fcmlt d20, d21, #0x0 +// CHECK: fcmlt h10, h11, #0.0 // encoding: [0x6a,0xe9,0xf8,0x5e] // CHECK: fcmlt s10, s11, #0.0 // encoding: [0x6a,0xe9,0xa0,0x5e] // CHECK: fcmlt d20, d21, #0.0 // encoding: [0xb4,0xea,0xe0,0x5e] +// CHECK: fcmlt h10, h11, #0.0 // encoding: [0x6a,0xe9,0xf8,0x5e] // CHECK: fcmlt s10, s11, #0.0 // encoding: [0x6a,0xe9,0xa0,0x5e] // CHECK: fcmlt d20, d21, #0.0 // encoding: [0xb4,0xea,0xe0,0x5e] @@ -106,9 +132,11 @@ // Scalar Floating-point Absolute Compare Mask Greater Than Or Equal //---------------------------------------------------------------------- + facge h10, h11, h12 facge s10, s11, s12 facge d20, d21, d22 +// CHECK: facge h10, h11, h12 // encoding: [0x6a,0x2d,0x4c,0x7e] // CHECK: facge s10, s11, s12 // encoding: [0x6a,0xed,0x2c,0x7e] // CHECK: facge d20, d21, d22 // encoding: [0xb4,0xee,0x76,0x7e] @@ -116,8 +144,10 @@ // Scalar Floating-point Absolute Compare Mask Greater Than //---------------------------------------------------------------------- + facgt h10, h11, h12 facgt s10, s11, 
s12 facgt d20, d21, d22 +// CHECK: facgt h10, h11, h12 // encoding: [0x6a,0x2d,0xcc,0x7e] // CHECK: facgt s10, s11, s12 // encoding: [0x6a,0xed,0xac,0x7e] // CHECK: facgt d20, d21, d22 // encoding: [0xb4,0xee,0xf6,0x7e] diff --git a/test/MC/AArch64/neon-scalar-mul.s b/test/MC/AArch64/neon-scalar-mul.s index e33bdad91a94..323fad206c4d 100644 --- a/test/MC/AArch64/neon-scalar-mul.s +++ b/test/MC/AArch64/neon-scalar-mul.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s +// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon,+fullfp16 -show-encoding < %s | FileCheck %s // Check that the assembler can handle the documented syntax for AArch64 @@ -26,9 +26,11 @@ // Floating-point Multiply Extended //---------------------------------------------------------------------- + fmulx h20, h22, h15 fmulx s20, s22, s15 fmulx d23, d11, d1 +// CHECK: fmulx h20, h22, h15 // encoding: [0xd4,0x1e,0x4f,0x5e] // CHECK: fmulx s20, s22, s15 // encoding: [0xd4,0xde,0x2f,0x5e] // CHECK: fmulx d23, d11, d1 // encoding: [0x77,0xdd,0x61,0x5e] diff --git a/test/MC/AArch64/neon-scalar-recip.s b/test/MC/AArch64/neon-scalar-recip.s index 7a886f3b4a73..923c3549d6f0 100644 --- a/test/MC/AArch64/neon-scalar-recip.s +++ b/test/MC/AArch64/neon-scalar-recip.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s +// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon,+fullfp16 -show-encoding < %s | FileCheck %s // Check that the assembler can handle the documented syntax for AArch64 @@ -6,9 +6,11 @@ // Floating-point Reciprocal Step //---------------------------------------------------------------------- + frecps h21, h16, h13 frecps s21, s16, s13 frecps d22, d30, d21 +// CHECK: frecps h21, h16, h13 // encoding: [0x15,0x3e,0x4d,0x5e] // CHECK: frecps s21, s16, s13 // encoding: [0x15,0xfe,0x2d,0x5e] // CHECK: frecps d22, d30, d21 // encoding: [0xd6,0xff,0x75,0x5e] @@ -16,9 +18,11 @@ // Floating-point Reciprocal Square Root Step //---------------------------------------------------------------------- + frsqrts h21, h5, h12 frsqrts s21, s5, s12 frsqrts d8, d22, d18 +// CHECK: frsqrts h21, h5, h12 // encoding: [0xb5,0x3c,0xcc,0x5e] // CHECK: frsqrts s21, s5, s12 // encoding: [0xb5,0xfc,0xac,0x5e] // CHECK: frsqrts d8, d22, d18 // encoding: [0xc8,0xfe,0xf2,0x5e] @@ -26,9 +30,11 @@ // Scalar Floating-point Reciprocal Estimate //---------------------------------------------------------------------- + frecpe h19, h14 frecpe s19, s14 frecpe d13, d13 +// CHECK: frecpe h19, h14 // encoding: [0xd3,0xd9,0xf9,0x5e] // CHECK: frecpe s19, s14 // encoding: [0xd3,0xd9,0xa1,0x5e] // CHECK: frecpe d13, d13 // encoding: [0xad,0xd9,0xe1,0x5e] @@ -36,9 +42,11 @@ // Scalar Floating-point Reciprocal Exponent //---------------------------------------------------------------------- + frecpx h18, h10 frecpx s18, s10 frecpx d16, d19 +// CHECK: frecpx h18, h10 // encoding: [0x52,0xf9,0xf9,0x5e] // CHECK: frecpx s18, s10 // encoding: [0x52,0xf9,0xa1,0x5e] // CHECK: frecpx d16, d19 // encoding: [0x70,0xfa,0xe1,0x5e] @@ -46,8 +54,10 @@ // Scalar Floating-point Reciprocal Square Root Estimate //---------------------------------------------------------------------- + frsqrte h22, h13 frsqrte s22, s13 frsqrte d21, d12 +// CHECK: frsqrte h22, h13 // encoding: [0xb6,0xd9,0xf9,0x7e] // CHECK: frsqrte s22, s13 // encoding: [0xb6,0xd9,0xa1,0x7e] // CHECK: frsqrte d21, d12 // encoding: [0x95,0xd9,0xe1,0x7e] diff --git 
a/test/MC/AArch64/neon-scalar-reduce-pairwise.s b/test/MC/AArch64/neon-scalar-reduce-pairwise.s index 403a940ec2f2..dae61d0f0f32 100644 --- a/test/MC/AArch64/neon-scalar-reduce-pairwise.s +++ b/test/MC/AArch64/neon-scalar-reduce-pairwise.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s +// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon,+fullfp16 -show-encoding < %s | FileCheck %s //---------------------------------------------------------------------- // Scalar Reduce Add Pairwise (Integer) @@ -10,7 +10,12 @@ //---------------------------------------------------------------------- // Scalar Reduce Add Pairwise (Floating Point) //---------------------------------------------------------------------- + faddp h18, v3.2h + faddp h18, v3.2H + faddp s19, v2.2s faddp d20, v1.2d +// CHECK: faddp h18, v3.2h // encoding: [0x72,0xd8,0x30,0x5e] +// CHECK: faddp s19, v2.2s // encoding: [0x53,0xd8,0x30,0x7e] // CHECK: faddp d20, v1.2d // encoding: [0x34,0xd8,0x70,0x7e] diff --git a/test/MC/AArch64/neon-simd-misc.s b/test/MC/AArch64/neon-simd-misc.s index 6d1aafdd7725..32dd48629cd8 100644 --- a/test/MC/AArch64/neon-simd-misc.s +++ b/test/MC/AArch64/neon-simd-misc.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple=arm64 -mattr=+neon -show-encoding < %s | FileCheck %s +// RUN: llvm-mc -triple=arm64 -mattr=+neon,+fullfp16 -show-encoding < %s | FileCheck %s // Check that the assembler can handle the documented syntax for AArch64 @@ -298,10 +298,14 @@ // Floating-point absolute //------------------------------------------------------------------------------ + fabs v4.4h, v0.4h + fabs v6.8h, v8.8h fabs v6.4s, v8.4s fabs v6.2d, v8.2d fabs v4.2s, v0.2s +// CHECK: fabs v4.4h, v0.4h // encoding: [0x04,0xf8,0xf8,0x0e] +// CHECK: fabs v6.8h, v8.8h // encoding: [0x06,0xf9,0xf8,0x4e] // CHECK: fabs v6.4s, v8.4s // encoding: [0x06,0xf9,0xa0,0x4e] // CHECK: fabs v6.2d, v8.2d // encoding: [0x06,0xf9,0xe0,0x4e] // CHECK: fabs v4.2s, v0.2s // encoding: [0x04,0xf8,0xa0,0x0e] @@ -310,10 +314,14 @@ // Floating-point negate //------------------------------------------------------------------------------ + fneg v4.4h, v0.4h + fneg v6.8h, v8.8h fneg v6.4s, v8.4s fneg v6.2d, v8.2d fneg v4.2s, v0.2s +// CHECK: fneg v4.4h, v0.4h // encoding: [0x04,0xf8,0xf8,0x2e] +// CHECK: fneg v6.8h, v8.8h // encoding: [0x06,0xf9,0xf8,0x6e] // CHECK: fneg v6.4s, v8.4s // encoding: [0x06,0xf9,0xa0,0x6e] // CHECK: fneg v6.2d, v8.2d // encoding: [0x06,0xf9,0xe0,0x6e] // CHECK: fneg v4.2s, v0.2s // encoding: [0x04,0xf8,0xa0,0x2e] @@ -450,58 +458,86 @@ // Floating-point round to integral //------------------------------------------------------------------------------ + frintn v4.4h, v0.4h + frintn v6.8h, v8.8h frintn v6.4s, v8.4s frintn v6.2d, v8.2d frintn v4.2s, v0.2s +// CHECK: frintn v4.4h, v0.4h // encoding: [0x04,0x88,0x79,0x0e] +// CHECK: frintn v6.8h, v8.8h // encoding: [0x06,0x89,0x79,0x4e] // CHECK: frintn v6.4s, v8.4s // encoding: [0x06,0x89,0x21,0x4e] // CHECK: frintn v6.2d, v8.2d // encoding: [0x06,0x89,0x61,0x4e] // CHECK: frintn v4.2s, v0.2s // encoding: [0x04,0x88,0x21,0x0e] + frinta v4.4h, v0.4h + frinta v6.8h, v8.8h frinta v6.4s, v8.4s frinta v6.2d, v8.2d frinta v4.2s, v0.2s +// CHECK: frinta v4.4h, v0.4h // encoding: [0x04,0x88,0x79,0x2e] +// CHECK: frinta v6.8h, v8.8h // encoding: [0x06,0x89,0x79,0x6e] // CHECK: frinta v6.4s, v8.4s // encoding: [0x06,0x89,0x21,0x6e] // CHECK: frinta v6.2d, v8.2d // encoding: [0x06,0x89,0x61,0x6e] // CHECK: frinta v4.2s, v0.2s // 
encoding: [0x04,0x88,0x21,0x2e] + frintp v4.4h, v0.4h + frintp v6.8h, v8.8h frintp v6.4s, v8.4s frintp v6.2d, v8.2d frintp v4.2s, v0.2s +// CHECK: frintp v4.4h, v0.4h // encoding: [0x04,0x88,0xf9,0x0e] +// CHECK: frintp v6.8h, v8.8h // encoding: [0x06,0x89,0xf9,0x4e] // CHECK: frintp v6.4s, v8.4s // encoding: [0x06,0x89,0xa1,0x4e] // CHECK: frintp v6.2d, v8.2d // encoding: [0x06,0x89,0xe1,0x4e] // CHECK: frintp v4.2s, v0.2s // encoding: [0x04,0x88,0xa1,0x0e] + frintm v4.4h, v0.4h + frintm v6.8h, v8.8h frintm v6.4s, v8.4s frintm v6.2d, v8.2d frintm v4.2s, v0.2s +// CHECK: frintm v4.4h, v0.4h // encoding: [0x04,0x98,0x79,0x0e] +// CHECK: frintm v6.8h, v8.8h // encoding: [0x06,0x99,0x79,0x4e] // CHECK: frintm v6.4s, v8.4s // encoding: [0x06,0x99,0x21,0x4e] // CHECK: frintm v6.2d, v8.2d // encoding: [0x06,0x99,0x61,0x4e] // CHECK: frintm v4.2s, v0.2s // encoding: [0x04,0x98,0x21,0x0e] + frintx v4.4h, v0.4h + frintx v6.8h, v8.8h frintx v6.4s, v8.4s frintx v6.2d, v8.2d frintx v4.2s, v0.2s +// CHECK: frintx v4.4h, v0.4h // encoding: [0x04,0x98,0x79,0x2e] +// CHECK: frintx v6.8h, v8.8h // encoding: [0x06,0x99,0x79,0x6e] // CHECK: frintx v6.4s, v8.4s // encoding: [0x06,0x99,0x21,0x6e] // CHECK: frintx v6.2d, v8.2d // encoding: [0x06,0x99,0x61,0x6e] // CHECK: frintx v4.2s, v0.2s // encoding: [0x04,0x98,0x21,0x2e] + frintz v4.4h, v0.4h + frintz v6.8h, v8.8h frintz v6.4s, v8.4s frintz v6.2d, v8.2d frintz v4.2s, v0.2s +// CHECK: frintz v4.4h, v0.4h // encoding: [0x04,0x98,0xf9,0x0e] +// CHECK: frintz v6.8h, v8.8h // encoding: [0x06,0x99,0xf9,0x4e] // CHECK: frintz v6.4s, v8.4s // encoding: [0x06,0x99,0xa1,0x4e] // CHECK: frintz v6.2d, v8.2d // encoding: [0x06,0x99,0xe1,0x4e] // CHECK: frintz v4.2s, v0.2s // encoding: [0x04,0x98,0xa1,0x0e] + frinti v4.4h, v0.4h + frinti v6.8h, v8.8h frinti v6.4s, v8.4s frinti v6.2d, v8.2d frinti v4.2s, v0.2s +// CHECK: frinti v4.4h, v0.4h // encoding: [0x04,0x98,0xf9,0x2e] +// CHECK: frinti v6.8h, v8.8h // encoding: [0x06,0x99,0xf9,0x6e] // CHECK: frinti v6.4s, v8.4s // encoding: [0x06,0x99,0xa1,0x6e] // CHECK: frinti v6.2d, v8.2d // encoding: [0x06,0x99,0xe1,0x6e] // CHECK: frinti v4.2s, v0.2s // encoding: [0x04,0x98,0xa1,0x2e] @@ -510,83 +546,123 @@ // Floating-point convert to integer //------------------------------------------------------------------------------ + fcvtns v4.4h, v0.4h + fcvtns v6.8h, v8.8h fcvtns v6.4s, v8.4s fcvtns v6.2d, v8.2d fcvtns v4.2s, v0.2s +// CHECK: fcvtns v4.4h, v0.4h // encoding: [0x04,0xa8,0x79,0x0e] +// CHECK: fcvtns v6.8h, v8.8h // encoding: [0x06,0xa9,0x79,0x4e] // CHECK: fcvtns v6.4s, v8.4s // encoding: [0x06,0xa9,0x21,0x4e] // CHECK: fcvtns v6.2d, v8.2d // encoding: [0x06,0xa9,0x61,0x4e] // CHECK: fcvtns v4.2s, v0.2s // encoding: [0x04,0xa8,0x21,0x0e] + fcvtnu v4.4h, v0.4h + fcvtnu v6.8h, v8.8h fcvtnu v6.4s, v8.4s fcvtnu v6.2d, v8.2d fcvtnu v4.2s, v0.2s +// CHECK: fcvtnu v4.4h, v0.4h // encoding: [0x04,0xa8,0x79,0x2e] +// CHECK: fcvtnu v6.8h, v8.8h // encoding: [0x06,0xa9,0x79,0x6e] // CHECK: fcvtnu v6.4s, v8.4s // encoding: [0x06,0xa9,0x21,0x6e] // CHECK: fcvtnu v6.2d, v8.2d // encoding: [0x06,0xa9,0x61,0x6e] // CHECK: fcvtnu v4.2s, v0.2s // encoding: [0x04,0xa8,0x21,0x2e] + fcvtps v4.4h, v0.4h + fcvtps v6.8h, v8.8h fcvtps v6.4s, v8.4s fcvtps v6.2d, v8.2d fcvtps v4.2s, v0.2s +// CHECK: fcvtps v4.4h, v0.4h // encoding: [0x04,0xa8,0xf9,0x0e] +// CHECK: fcvtps v6.8h, v8.8h // encoding: [0x06,0xa9,0xf9,0x4e] // CHECK: fcvtps v6.4s, v8.4s // encoding: [0x06,0xa9,0xa1,0x4e] // CHECK: fcvtps v6.2d, v8.2d // encoding: 
[0x06,0xa9,0xe1,0x4e] // CHECK: fcvtps v4.2s, v0.2s // encoding: [0x04,0xa8,0xa1,0x0e] + fcvtpu v4.4h, v0.4h + fcvtpu v6.8h, v8.8h fcvtpu v6.4s, v8.4s fcvtpu v6.2d, v8.2d fcvtpu v4.2s, v0.2s +// CHECK: fcvtpu v4.4h, v0.4h // encoding: [0x04,0xa8,0xf9,0x2e] +// CHECK: fcvtpu v6.8h, v8.8h // encoding: [0x06,0xa9,0xf9,0x6e] // CHECK: fcvtpu v6.4s, v8.4s // encoding: [0x06,0xa9,0xa1,0x6e] // CHECK: fcvtpu v6.2d, v8.2d // encoding: [0x06,0xa9,0xe1,0x6e] // CHECK: fcvtpu v4.2s, v0.2s // encoding: [0x04,0xa8,0xa1,0x2e] + fcvtms v4.4h, v0.4h + fcvtms v6.8h, v8.8h fcvtms v6.4s, v8.4s fcvtms v6.2d, v8.2d fcvtms v4.2s, v0.2s +// CHECK: fcvtms v4.4h, v0.4h // encoding: [0x04,0xb8,0x79,0x0e] +// CHECK: fcvtms v6.8h, v8.8h // encoding: [0x06,0xb9,0x79,0x4e] // CHECK: fcvtms v6.4s, v8.4s // encoding: [0x06,0xb9,0x21,0x4e] // CHECK: fcvtms v6.2d, v8.2d // encoding: [0x06,0xb9,0x61,0x4e] // CHECK: fcvtms v4.2s, v0.2s // encoding: [0x04,0xb8,0x21,0x0e] + fcvtmu v4.4h, v0.4h + fcvtmu v6.8h, v8.8h fcvtmu v6.4s, v8.4s fcvtmu v6.2d, v8.2d fcvtmu v4.2s, v0.2s +// CHECK: fcvtmu v4.4h, v0.4h // encoding: [0x04,0xb8,0x79,0x2e] +// CHECK: fcvtmu v6.8h, v8.8h // encoding: [0x06,0xb9,0x79,0x6e] // CHECK: fcvtmu v6.4s, v8.4s // encoding: [0x06,0xb9,0x21,0x6e] // CHECK: fcvtmu v6.2d, v8.2d // encoding: [0x06,0xb9,0x61,0x6e] // CHECK: fcvtmu v4.2s, v0.2s // encoding: [0x04,0xb8,0x21,0x2e] + fcvtzs v4.4h, v0.4h + fcvtzs v6.8h, v8.8h fcvtzs v6.4s, v8.4s fcvtzs v6.2d, v8.2d fcvtzs v4.2s, v0.2s +// CHECK: fcvtzs v4.4h, v0.4h // encoding: [0x04,0xb8,0xf9,0x0e] +// CHECK: fcvtzs v6.8h, v8.8h // encoding: [0x06,0xb9,0xf9,0x4e] // CHECK: fcvtzs v6.4s, v8.4s // encoding: [0x06,0xb9,0xa1,0x4e] // CHECK: fcvtzs v6.2d, v8.2d // encoding: [0x06,0xb9,0xe1,0x4e] // CHECK: fcvtzs v4.2s, v0.2s // encoding: [0x04,0xb8,0xa1,0x0e] + fcvtzu v4.4h, v0.4h + fcvtzu v6.8h, v8.8h fcvtzu v6.4s, v8.4s fcvtzu v6.2d, v8.2d fcvtzu v4.2s, v0.2s +// CHECK: fcvtzu v4.4h, v0.4h // encoding: [0x04,0xb8,0xf9,0x2e] +// CHECK: fcvtzu v6.8h, v8.8h // encoding: [0x06,0xb9,0xf9,0x6e] // CHECK: fcvtzu v6.4s, v8.4s // encoding: [0x06,0xb9,0xa1,0x6e] // CHECK: fcvtzu v6.2d, v8.2d // encoding: [0x06,0xb9,0xe1,0x6e] // CHECK: fcvtzu v4.2s, v0.2s // encoding: [0x04,0xb8,0xa1,0x2e] + fcvtas v4.4h, v0.4h + fcvtas v6.8h, v8.8h fcvtas v6.4s, v8.4s fcvtas v6.2d, v8.2d fcvtas v4.2s, v0.2s +// CHECK: fcvtas v4.4h, v0.4h // encoding: [0x04,0xc8,0x79,0x0e] +// CHECK: fcvtas v6.8h, v8.8h // encoding: [0x06,0xc9,0x79,0x4e] // CHECK: fcvtas v6.4s, v8.4s // encoding: [0x06,0xc9,0x21,0x4e] // CHECK: fcvtas v6.2d, v8.2d // encoding: [0x06,0xc9,0x61,0x4e] // CHECK: fcvtas v4.2s, v0.2s // encoding: [0x04,0xc8,0x21,0x0e] + fcvtau v4.4h, v0.4h + fcvtau v6.8h, v8.8h fcvtau v6.4s, v8.4s fcvtau v6.2d, v8.2d fcvtau v4.2s, v0.2s +// CHECK: fcvtau v4.4h, v0.4h // encoding: [0x04,0xc8,0x79,0x2e] +// CHECK: fcvtau v6.8h, v8.8h // encoding: [0x06,0xc9,0x79,0x6e] // CHECK: fcvtau v6.4s, v8.4s // encoding: [0x06,0xc9,0x21,0x6e] // CHECK: fcvtau v6.2d, v8.2d // encoding: [0x06,0xc9,0x61,0x6e] // CHECK: fcvtau v4.2s, v0.2s // encoding: [0x04,0xc8,0x21,0x2e] @@ -603,42 +679,62 @@ // CHECK: ursqrte v6.4s, v8.4s // encoding: [0x06,0xc9,0xa1,0x6e] // CHECK: ursqrte v4.2s, v0.2s // encoding: [0x04,0xc8,0xa1,0x2e] + scvtf v4.4h, v0.4h + scvtf v6.8h, v8.8h scvtf v6.4s, v8.4s scvtf v6.2d, v8.2d scvtf v4.2s, v0.2s +// CHECK: scvtf v4.4h, v0.4h // encoding: [0x04,0xd8,0x79,0x0e] +// CHECK: scvtf v6.8h, v8.8h // encoding: [0x06,0xd9,0x79,0x4e] // CHECK: scvtf v6.4s, v8.4s // encoding: [0x06,0xd9,0x21,0x4e] // 
CHECK: scvtf v6.2d, v8.2d // encoding: [0x06,0xd9,0x61,0x4e] // CHECK: scvtf v4.2s, v0.2s // encoding: [0x04,0xd8,0x21,0x0e] + ucvtf v4.4h, v0.4h + ucvtf v6.8h, v8.8h ucvtf v6.4s, v8.4s ucvtf v6.2d, v8.2d ucvtf v4.2s, v0.2s +// CHECK: ucvtf v4.4h, v0.4h // encoding: [0x04,0xd8,0x79,0x2e] +// CHECK: ucvtf v6.8h, v8.8h // encoding: [0x06,0xd9,0x79,0x6e] // CHECK: ucvtf v6.4s, v8.4s // encoding: [0x06,0xd9,0x21,0x6e] // CHECK: ucvtf v6.2d, v8.2d // encoding: [0x06,0xd9,0x61,0x6e] // CHECK: ucvtf v4.2s, v0.2s // encoding: [0x04,0xd8,0x21,0x2e] + frecpe v4.4h, v0.4h + frecpe v6.8h, v8.8h frecpe v6.4s, v8.4s frecpe v6.2d, v8.2d frecpe v4.2s, v0.2s +// CHECK: frecpe v4.4h, v0.4h // encoding: [0x04,0xd8,0xf9,0x0e] +// CHECK: frecpe v6.8h, v8.8h // encoding: [0x06,0xd9,0xf9,0x4e] // CHECK: frecpe v6.4s, v8.4s // encoding: [0x06,0xd9,0xa1,0x4e] // CHECK: frecpe v6.2d, v8.2d // encoding: [0x06,0xd9,0xe1,0x4e] // CHECK: frecpe v4.2s, v0.2s // encoding: [0x04,0xd8,0xa1,0x0e] + frsqrte v4.4h, v0.4h + frsqrte v6.8h, v8.8h frsqrte v6.4s, v8.4s frsqrte v6.2d, v8.2d frsqrte v4.2s, v0.2s +// CHECK: frsqrte v4.4h, v0.4h // encoding: [0x04,0xd8,0xf9,0x2e] +// CHECK: frsqrte v6.8h, v8.8h // encoding: [0x06,0xd9,0xf9,0x6e] // CHECK: frsqrte v6.4s, v8.4s // encoding: [0x06,0xd9,0xa1,0x6e] // CHECK: frsqrte v6.2d, v8.2d // encoding: [0x06,0xd9,0xe1,0x6e] // CHECK: frsqrte v4.2s, v0.2s // encoding: [0x04,0xd8,0xa1,0x2e] + fsqrt v4.4h, v0.4h + fsqrt v6.8h, v8.8h fsqrt v6.4s, v8.4s fsqrt v6.2d, v8.2d fsqrt v4.2s, v0.2s +// CHECK: fsqrt v4.4h, v0.4h // encoding: [0x04,0xf8,0xf9,0x2e] +// CHECK: fsqrt v6.8h, v8.8h // encoding: [0x06,0xf9,0xf9,0x6e] // CHECK: fsqrt v6.4s, v8.4s // encoding: [0x06,0xf9,0xa1,0x6e] // CHECK: fsqrt v6.2d, v8.2d // encoding: [0x06,0xf9,0xe1,0x6e] // CHECK: fsqrt v4.2s, v0.2s // encoding: [0x04,0xf8,0xa1,0x2e] diff --git a/test/MC/AArch64/neon-simd-shift.s b/test/MC/AArch64/neon-simd-shift.s index a16432324efc..4638c535a6a7 100644 --- a/test/MC/AArch64/neon-simd-shift.s +++ b/test/MC/AArch64/neon-simd-shift.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple=aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s +// RUN: llvm-mc -triple=aarch64-none-linux-gnu -mattr=+neon,+fullfp16 -show-encoding < %s | FileCheck %s // Check that the assembler can handle the documented syntax for AArch64 @@ -400,16 +400,24 @@ //------------------------------------------------------------------------------ // Fixed-point convert to floating-point //------------------------------------------------------------------------------ + scvtf v0.4h, v1.4h, #3 + scvtf v0.8h, v1.8h, #3 scvtf v0.2s, v1.2s, #3 scvtf v0.4s, v1.4s, #3 scvtf v0.2d, v1.2d, #3 + ucvtf v0.4h, v1.4h, #3 + ucvtf v0.8h, v1.8h, #3 ucvtf v0.2s, v1.2s, #3 ucvtf v0.4s, v1.4s, #3 ucvtf v0.2d, v1.2d, #3 +// CHECK: scvtf v0.4h, v1.4h, #3 // encoding: [0x20,0xe4,0x1d,0x0f] +// CHECK: scvtf v0.8h, v1.8h, #3 // encoding: [0x20,0xe4,0x1d,0x4f] // CHECK: scvtf v0.2s, v1.2s, #3 // encoding: [0x20,0xe4,0x3d,0x0f] // CHECK: scvtf v0.4s, v1.4s, #3 // encoding: [0x20,0xe4,0x3d,0x4f] // CHECK: scvtf v0.2d, v1.2d, #3 // encoding: [0x20,0xe4,0x7d,0x4f] +// CHECK: ucvtf v0.4h, v1.4h, #3 // encoding: [0x20,0xe4,0x1d,0x2f] +// CHECK: ucvtf v0.8h, v1.8h, #3 // encoding: [0x20,0xe4,0x1d,0x6f] // CHECK: ucvtf v0.2s, v1.2s, #3 // encoding: [0x20,0xe4,0x3d,0x2f] // CHECK: ucvtf v0.4s, v1.4s, #3 // encoding: [0x20,0xe4,0x3d,0x6f] // CHECK: ucvtf v0.2d, v1.2d, #3 // encoding: [0x20,0xe4,0x7d,0x6f] @@ -417,17 +425,25 @@ 
//------------------------------------------------------------------------------ // Floating-point convert to fixed-point //------------------------------------------------------------------------------ + fcvtzs v0.4h, v1.4h, #3 + fcvtzs v0.8h, v1.8h, #3 fcvtzs v0.2s, v1.2s, #3 fcvtzs v0.4s, v1.4s, #3 fcvtzs v0.2d, v1.2d, #3 + fcvtzu v0.4h, v1.4h, #3 + fcvtzu v0.8h, v1.8h, #3 fcvtzu v0.2s, v1.2s, #3 fcvtzu v0.4s, v1.4s, #3 fcvtzu v0.2d, v1.2d, #3 +// CHECK: fcvtzs v0.4h, v1.4h, #3 // encoding: [0x20,0xfc,0x1d,0x0f] +// CHECK: fcvtzs v0.8h, v1.8h, #3 // encoding: [0x20,0xfc,0x1d,0x4f] // CHECK: fcvtzs v0.2s, v1.2s, #3 // encoding: [0x20,0xfc,0x3d,0x0f] // CHECK: fcvtzs v0.4s, v1.4s, #3 // encoding: [0x20,0xfc,0x3d,0x4f] // CHECK: fcvtzs v0.2d, v1.2d, #3 // encoding: [0x20,0xfc,0x7d,0x4f] +// CHECK: fcvtzu v0.4h, v1.4h, #3 // encoding: [0x20,0xfc,0x1d,0x2f] +// CHECK: fcvtzu v0.8h, v1.8h, #3 // encoding: [0x20,0xfc,0x1d,0x6f] // CHECK: fcvtzu v0.2s, v1.2s, #3 // encoding: [0x20,0xfc,0x3d,0x2f] // CHECK: fcvtzu v0.4s, v1.4s, #3 // encoding: [0x20,0xfc,0x3d,0x6f] // CHECK: fcvtzu v0.2d, v1.2d, #3 // encoding: [0x20,0xfc,0x7d,0x6f] diff --git a/test/MC/Disassembler/AArch64/fullfp16-neon-neg.txt b/test/MC/Disassembler/AArch64/fullfp16-neon-neg.txt new file mode 100644 index 000000000000..8b7e1c878002 --- /dev/null +++ b/test/MC/Disassembler/AArch64/fullfp16-neon-neg.txt @@ -0,0 +1,382 @@ +# RUN: not llvm-mc -disassemble -triple=aarch64 -mattr=+neon,-fullfp16 < %s 2>&1 | FileCheck %s +# RUN: not llvm-mc -disassemble -triple=aarch64 -mattr=-neon,-fullfp16 < %s 2>&1 | FileCheck %s +# RUN: not llvm-mc -disassemble -triple=aarch64 -mattr=-neon,+fullfp16 < %s 2>&1 | FileCheck %s + +[0x00,0xf8,0xf8,0x0e] +# CHECK: warning: invalid instruction encoding +[0x00,0xf8,0xf8,0x2e] +# CHECK: warning: invalid instruction encoding +[0x00,0xd8,0xf9,0x0e] +# CHECK: warning: invalid instruction encoding +[0x00,0x88,0x79,0x2e] +# CHECK: warning: invalid instruction encoding +[0x00,0x98,0x79,0x2e] +# CHECK: warning: invalid instruction encoding +[0x00,0x98,0xf9,0x2e] +# CHECK: warning: invalid instruction encoding +[0x00,0x98,0x79,0x0e] +# CHECK: warning: invalid instruction encoding +[0x00,0x88,0x79,0x0e] +# CHECK: warning: invalid instruction encoding +[0x00,0x88,0xf9,0x0e] +# CHECK: warning: invalid instruction encoding +[0x00,0x98,0xf9,0x0e] +# CHECK: warning: invalid instruction encoding +[0x00,0xd8,0xf9,0x2e] +# CHECK: warning: invalid instruction encoding +[0x00,0xf8,0xf9,0x2e] +# CHECK: warning: invalid instruction encoding +[0x00,0xf8,0xf8,0x4e] +# CHECK: warning: invalid instruction encoding +[0x00,0xf8,0xf8,0x6e] +# CHECK: warning: invalid instruction encoding +[0x00,0xd8,0xf9,0x4e] +# CHECK: warning: invalid instruction encoding +[0x00,0x88,0x79,0x6e] +# CHECK: warning: invalid instruction encoding +[0x00,0x98,0x79,0x6e] +# CHECK: warning: invalid instruction encoding +[0x00,0x98,0xf9,0x6e] +# CHECK: warning: invalid instruction encoding +[0x00,0x98,0x79,0x4e] +# CHECK: warning: invalid instruction encoding +[0x00,0x88,0x79,0x4e] +# CHECK: warning: invalid instruction encoding +[0x00,0x88,0xf9,0x4e] +# CHECK: warning: invalid instruction encoding +[0x00,0x98,0xf9,0x4e] +# CHECK: warning: invalid instruction encoding +[0x00,0xd8,0xf9,0x6e] +# CHECK: warning: invalid instruction encoding +[0x00,0xf8,0xf9,0x6e] +# CHECK: warning: invalid instruction encoding +[0x20,0x10,0x22,0x0f] +# CHECK: warning: invalid instruction encoding +[0x03,0x11,0x12,0x4f] +# CHECK: warning: invalid instruction encoding 
+[0x20,0x50,0x22,0x0f] +# CHECK: warning: invalid instruction encoding +[0x03,0x51,0x12,0x4f] +# CHECK: warning: invalid instruction encoding +[0x20,0x90,0x22,0x0f] +# CHECK: warning: invalid instruction encoding +[0x20,0x90,0x22,0x4f] +# CHECK: warning: invalid instruction encoding +[0x20,0x90,0x22,0x2f] +# CHECK: warning: invalid instruction encoding +[0x20,0x90,0x22,0x6f] +# CHECK: warning: invalid instruction encoding +[0x20,0x14,0xc2,0x2e] +# CHECK: warning: invalid instruction encoding +[0x20,0xc8,0x30,0x4e] +# CHECK: warning: invalid instruction encoding +[0x20,0xc8,0xb0,0x4e] +# CHECK: warning: invalid instruction encoding +[0x20,0xf8,0x30,0x4e] +# CHECK: warning: invalid instruction encoding +[0x20,0xf8,0xb0,0x4e] +# CHECK: warning: invalid instruction encoding +[0x20,0x14,0x42,0x2e] +# CHECK: warning: invalid instruction encoding +[0x20,0x14,0x42,0x6e] +# CHECK: warning: invalid instruction encoding +[0x20,0x14,0x42,0x0e] +# CHECK: warning: invalid instruction encoding +[0x20,0x14,0x42,0x4e] +# CHECK: warning: invalid instruction encoding +[0x20,0x14,0xc2,0x0e] +# CHECK: warning: invalid instruction encoding +[0x20,0x14,0xc2,0x4e] +# CHECK: warning: invalid instruction encoding +[0xe0,0x27,0x50,0x0e] +# CHECK: warning: invalid instruction encoding +[0xe4,0x24,0x4f,0x4e] +# CHECK: warning: invalid instruction encoding +[0x03,0x25,0x4c,0x2e] +# CHECK: warning: invalid instruction encoding +[0xbf,0x27,0x5c,0x6e] +# CHECK: warning: invalid instruction encoding +[0x03,0x25,0x4c,0x2e] +# CHECK: warning: invalid instruction encoding +[0xbf,0x27,0x5c,0x6e] +# CHECK: warning: invalid instruction encoding +[0xe0,0x27,0xd0,0x2e] +# CHECK: warning: invalid instruction encoding +[0xe4,0x24,0xcf,0x6e] +# CHECK: warning: invalid instruction encoding +[0xe0,0x27,0xd0,0x2e] +# CHECK: warning: invalid instruction encoding +[0xe4,0x24,0xcf,0x6e] +# CHECK: warning: invalid instruction encoding +[0xe0,0xdb,0xf8,0x0e] +# CHECK: warning: invalid instruction encoding +[0xe4,0xd8,0xf8,0x4e] +# CHECK: warning: invalid instruction encoding +[0xe0,0xdb,0xf8,0x0e] +# CHECK: warning: invalid instruction encoding +[0xe4,0xd8,0xf8,0x4e] +# CHECK: warning: invalid instruction encoding +[0x03,0xc9,0xf8,0x2e] +# CHECK: warning: invalid instruction encoding +[0xbf,0xcb,0xf8,0x6e] +# CHECK: warning: invalid instruction encoding +[0x03,0xc9,0xf8,0x2e] +# CHECK: warning: invalid instruction encoding +[0xbf,0xcb,0xf8,0x6e] +# CHECK: warning: invalid instruction encoding +[0xe0,0xcb,0xf8,0x0e] +# CHECK: warning: invalid instruction encoding +[0xe4,0xc8,0xf8,0x4e] +# CHECK: warning: invalid instruction encoding +[0xe0,0xcb,0xf8,0x0e] +# CHECK: warning: invalid instruction encoding +[0xe4,0xc8,0xf8,0x4e] +# CHECK: warning: invalid instruction encoding +[0x83,0xda,0xf8,0x2e] +# CHECK: warning: invalid instruction encoding +[0x01,0xd9,0xf8,0x6e] +# CHECK: warning: invalid instruction encoding +[0x83,0xda,0xf8,0x2e] +# CHECK: warning: invalid instruction encoding +[0x01,0xd9,0xf8,0x6e] +# CHECK: warning: invalid instruction encoding +[0x50,0xe8,0xf8,0x0e] +# CHECK: warning: invalid instruction encoding +[0x8f,0xe8,0xf8,0x4e] +# CHECK: warning: invalid instruction encoding +[0x50,0xe8,0xf8,0x0e] +# CHECK: warning: invalid instruction encoding +[0x8f,0xe8,0xf8,0x4e] +# CHECK: warning: invalid instruction encoding +[0xe0,0x2f,0x50,0x2e] +# CHECK: warning: invalid instruction encoding +[0xe4,0x2c,0x4f,0x6e] +# CHECK: warning: invalid instruction encoding +[0xe0,0x2f,0x50,0x2e] +# CHECK: warning: invalid instruction encoding 
+[0xe4,0x2c,0x4f,0x6e] +# CHECK: warning: invalid instruction encoding +[0x03,0x2d,0xcc,0x2e] +# CHECK: warning: invalid instruction encoding +[0xbf,0x2f,0xdc,0x6e] +# CHECK: warning: invalid instruction encoding +[0x03,0x2d,0xcc,0x2e] +# CHECK: warning: invalid instruction encoding +[0xbf,0x2f,0xdc,0x6e] +# CHECK: warning: invalid instruction encoding +[0xe0,0x3f,0xd0,0x0e] +# CHECK: warning: invalid instruction encoding +[0xe4,0x3c,0xcf,0x4e] +# CHECK: warning: invalid instruction encoding +[0x03,0x3d,0x4c,0x0e] +# CHECK: warning: invalid instruction encoding +[0xbf,0x3f,0x5c,0x4e] +# CHECK: warning: invalid instruction encoding +[0x20,0x34,0x42,0x2e] +# CHECK: warning: invalid instruction encoding +[0xff,0x35,0x50,0x6e] +# CHECK: warning: invalid instruction encoding +[0xea,0x35,0xd6,0x2e] +# CHECK: warning: invalid instruction encoding +[0xa3,0x34,0xc6,0x6e] +# CHECK: warning: invalid instruction encoding +[0x20,0x04,0x42,0x2e] +# CHECK: warning: invalid instruction encoding +[0xff,0x05,0x50,0x6e] +# CHECK: warning: invalid instruction encoding +[0xea,0x05,0xd6,0x2e] +# CHECK: warning: invalid instruction encoding +[0xa3,0x04,0xc6,0x6e] +# CHECK: warning: invalid instruction encoding +[0x20,0x34,0x42,0x0e] +# CHECK: warning: invalid instruction encoding +[0x20,0x34,0x42,0x4e] +# CHECK: warning: invalid instruction encoding +[0xea,0x35,0xd6,0x0e] +# CHECK: warning: invalid instruction encoding +[0xea,0x35,0xd6,0x4e] +# CHECK: warning: invalid instruction encoding +[0x20,0x04,0x42,0x0e] +# CHECK: warning: invalid instruction encoding +[0x20,0x04,0x42,0x4e] +# CHECK: warning: invalid instruction encoding +[0xea,0x05,0xd6,0x0e] +# CHECK: warning: invalid instruction encoding +[0xea,0x05,0xd6,0x4e] +# CHECK: warning: invalid instruction encoding +[0x20,0x0c,0x42,0x0e] +# CHECK: warning: invalid instruction encoding +[0x20,0x0c,0x42,0x4e] +# CHECK: warning: invalid instruction encoding +[0x20,0x0c,0xc2,0x0e] +# CHECK: warning: invalid instruction encoding +[0x20,0x0c,0xc2,0x4e] +# CHECK: warning: invalid instruction encoding +[0x1d,0x17,0xd4,0x7e] +# CHECK: warning: invalid instruction encoding +[0x20,0x18,0x11,0x5f] +# CHECK: warning: invalid instruction encoding +[0x62,0x58,0x14,0x5f] +# CHECK: warning: invalid instruction encoding +[0x20,0x98,0x11,0x5f] +# CHECK: warning: invalid instruction encoding +[0x46,0x98,0x18,0x7f] +# CHECK: warning: invalid instruction encoding +[0x95,0xfd,0x1f,0x5f] +# CHECK: warning: invalid instruction encoding +[0x95,0xfd,0x1f,0x7f] +# CHECK: warning: invalid instruction encoding +[0xac,0xc9,0x79,0x5e] +# CHECK: warning: invalid instruction encoding +[0xac,0xc9,0x79,0x7e] +# CHECK: warning: invalid instruction encoding +[0xb6,0xb9,0x79,0x5e] +# CHECK: warning: invalid instruction encoding +[0xac,0xb9,0x79,0x7e] +# CHECK: warning: invalid instruction encoding +[0xb6,0xa9,0x79,0x5e] +# CHECK: warning: invalid instruction encoding +[0xac,0xa9,0x79,0x7e] +# CHECK: warning: invalid instruction encoding +[0xb6,0xa9,0xf9,0x5e] +# CHECK: warning: invalid instruction encoding +[0xac,0xa9,0xf9,0x7e] +# CHECK: warning: invalid instruction encoding +[0xac,0xb9,0xf9,0x5e] +# CHECK: warning: invalid instruction encoding +[0xac,0xb9,0xf9,0x7e] +# CHECK: warning: invalid instruction encoding +[0x6a,0x25,0x4c,0x5e] +# CHECK: warning: invalid instruction encoding +[0x6a,0xd9,0xf8,0x5e] +# CHECK: warning: invalid instruction encoding +[0x6a,0xd9,0xf8,0x5e] +# CHECK: warning: invalid instruction encoding +[0x6a,0x25,0x4c,0x7e] +# CHECK: warning: invalid instruction encoding 
+[0x6a,0xc9,0xf8,0x7e] +# CHECK: warning: invalid instruction encoding +[0x6a,0xc9,0xf8,0x7e] +# CHECK: warning: invalid instruction encoding +[0x6a,0x25,0xcc,0x7e] +# CHECK: warning: invalid instruction encoding +[0x6a,0xc9,0xf8,0x5e] +# CHECK: warning: invalid instruction encoding +[0x6a,0xc9,0xf8,0x5e] +# CHECK: warning: invalid instruction encoding +[0x6a,0xd9,0xf8,0x7e] +# CHECK: warning: invalid instruction encoding +[0x6a,0xd9,0xf8,0x7e] +# CHECK: warning: invalid instruction encoding +[0x6a,0xe9,0xf8,0x5e] +# CHECK: warning: invalid instruction encoding +[0x6a,0xe9,0xf8,0x5e] +# CHECK: warning: invalid instruction encoding +[0x6a,0x2d,0x4c,0x7e] +# CHECK: warning: invalid instruction encoding +[0x6a,0x2d,0xcc,0x7e] +# CHECK: warning: invalid instruction encoding +[0xd4,0x1e,0x4f,0x5e] +# CHECK: warning: invalid instruction encoding +[0x15,0x3e,0x4d,0x5e] +# CHECK: warning: invalid instruction encoding +[0xb5,0x3c,0xcc,0x5e] +# CHECK: warning: invalid instruction encoding +[0xd3,0xd9,0xf9,0x5e] +# CHECK: warning: invalid instruction encoding +[0x52,0xf9,0xf9,0x5e] +# CHECK: warning: invalid instruction encoding +[0xb6,0xd9,0xf9,0x7e] +# CHECK: warning: invalid instruction encoding +[0x72,0xd8,0x30,0x5e] +# CHECK: warning: invalid instruction encoding +[0x04,0xf8,0xf8,0x0e] +# CHECK: warning: invalid instruction encoding +[0x06,0xf9,0xf8,0x4e] +# CHECK: warning: invalid instruction encoding +[0x04,0xf8,0xf8,0x2e] +# CHECK: warning: invalid instruction encoding +[0x06,0xf9,0xf8,0x6e] +# CHECK: warning: invalid instruction encoding +[0x04,0x88,0x79,0x0e] +# CHECK: warning: invalid instruction encoding +[0x06,0x89,0x79,0x4e] +# CHECK: warning: invalid instruction encoding +[0x04,0x88,0x79,0x2e] +# CHECK: warning: invalid instruction encoding +[0x06,0x89,0x79,0x6e] +# CHECK: warning: invalid instruction encoding +[0x04,0x88,0xf9,0x0e] +# CHECK: warning: invalid instruction encoding +[0x06,0x89,0xf9,0x4e] +# CHECK: warning: invalid instruction encoding +[0x04,0x98,0x79,0x0e] +# CHECK: warning: invalid instruction encoding +[0x06,0x99,0x79,0x4e] +# CHECK: warning: invalid instruction encoding +[0x04,0x98,0x79,0x2e] +# CHECK: warning: invalid instruction encoding +[0x06,0x99,0x79,0x6e] +# CHECK: warning: invalid instruction encoding +[0x04,0x98,0xf9,0x0e] +# CHECK: warning: invalid instruction encoding +[0x06,0x99,0xf9,0x4e] +# CHECK: warning: invalid instruction encoding +[0x04,0x98,0xf9,0x2e] +# CHECK: warning: invalid instruction encoding +[0x06,0x99,0xf9,0x6e] +# CHECK: warning: invalid instruction encoding +[0x04,0xa8,0x79,0x0e] +# CHECK: warning: invalid instruction encoding +[0x06,0xa9,0x79,0x4e] +# CHECK: warning: invalid instruction encoding +[0x04,0xa8,0x79,0x2e] +# CHECK: warning: invalid instruction encoding +[0x06,0xa9,0x79,0x6e] +# CHECK: warning: invalid instruction encoding +[0x04,0xa8,0xf9,0x0e] +# CHECK: warning: invalid instruction encoding +[0x06,0xa9,0xf9,0x4e] +# CHECK: warning: invalid instruction encoding +[0x04,0xa8,0xf9,0x2e] +# CHECK: warning: invalid instruction encoding +[0x06,0xa9,0xf9,0x6e] +# CHECK: warning: invalid instruction encoding +[0x04,0xb8,0x79,0x0e] +# CHECK: warning: invalid instruction encoding +[0x06,0xb9,0x79,0x4e] +# CHECK: warning: invalid instruction encoding +[0x04,0xb8,0x79,0x2e] +# CHECK: warning: invalid instruction encoding +[0x06,0xb9,0x79,0x6e] +# CHECK: warning: invalid instruction encoding +[0x04,0xb8,0xf9,0x0e] +# CHECK: warning: invalid instruction encoding +[0x06,0xb9,0xf9,0x4e] +# CHECK: warning: invalid instruction encoding 
+[0x04,0xb8,0xf9,0x2e] +# CHECK: warning: invalid instruction encoding +[0x06,0xb9,0xf9,0x6e] +# CHECK: warning: invalid instruction encoding +[0x04,0xc8,0x79,0x0e] +# CHECK: warning: invalid instruction encoding +[0x06,0xc9,0x79,0x4e] +# CHECK: warning: invalid instruction encoding +[0x04,0xc8,0x79,0x2e] +# CHECK: warning: invalid instruction encoding +[0x06,0xc9,0x79,0x6e] +# CHECK: warning: invalid instruction encoding +[0x04,0xd8,0xf9,0x0e] +# CHECK: warning: invalid instruction encoding +[0x06,0xd9,0xf9,0x4e] +# CHECK: warning: invalid instruction encoding +[0x04,0xd8,0xf9,0x2e] +# CHECK: warning: invalid instruction encoding +[0x06,0xd9,0xf9,0x6e] +# CHECK: warning: invalid instruction encoding +[0x04,0xf8,0xf9,0x2e] +# CHECK: warning: invalid instruction encoding +[0x06,0xf9,0xf9,0x6e] +# CHECK: warning: invalid instruction encoding + +# CHECK-NOT: warning: invalid instruction encoding From c14af0e1c8aff647498cf5e21362fec267b983e7 Mon Sep 17 00:00:00 2001 From: Asaf Badouh Date: Tue, 8 Dec 2015 12:34:34 +0000 Subject: [PATCH 222/364] [x86][avx512] more changes in intrinsics to be align with gcc format Differential Revision: http://reviews.llvm.org/D15329 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@255011 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/IntrinsicsX86.td | 36 ++++++++++++++++---------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index c287a3a1928e..370b527c6f7c 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -5248,27 +5248,27 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_rndscale_ss : GCCBuiltin<"__builtin_ia32_rndscaless_mask">, + def int_x86_avx512_mask_rndscale_ss : GCCBuiltin<"__builtin_ia32_rndscaless_round">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_rndscale_sd : GCCBuiltin<"__builtin_ia32_rndscalesd_mask">, + def int_x86_avx512_mask_rndscale_sd : GCCBuiltin<"__builtin_ia32_rndscalesd_round">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_range_ss : GCCBuiltin<"__builtin_ia32_rangess_mask">, + def int_x86_avx512_mask_range_ss : GCCBuiltin<"__builtin_ia32_rangess128_round">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_range_sd : GCCBuiltin<"__builtin_ia32_rangesd_mask">, + def int_x86_avx512_mask_range_sd : GCCBuiltin<"__builtin_ia32_rangesd128_round">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_reduce_ss : GCCBuiltin<"__builtin_ia32_reducess_mask">, + def int_x86_avx512_mask_reduce_ss : GCCBuiltin<"__builtin_ia32_reducess">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_reduce_sd : GCCBuiltin<"__builtin_ia32_reducesd_mask">, + def int_x86_avx512_mask_reduce_sd : GCCBuiltin<"__builtin_ia32_reducesd">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], 
[IntrNoMem]>; @@ -5297,10 +5297,10 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_sqrt_ss : GCCBuiltin<"__builtin_ia32_sqrtrndss_mask">, + def int_x86_avx512_mask_sqrt_ss : GCCBuiltin<"__builtin_ia32_sqrtss_round">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_sqrt_sd : GCCBuiltin<"__builtin_ia32_sqrtrndsd_mask">, + def int_x86_avx512_mask_sqrt_sd : GCCBuiltin<"__builtin_ia32_sqrtsd_round">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; @@ -5385,21 +5385,21 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". [IntrNoMem]>; def int_x86_avx512_mask_getmant_ss : - GCCBuiltin<"__builtin_ia32_getmantss_mask">, + GCCBuiltin<"__builtin_ia32_getmantss_round">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_mask_getmant_sd : - GCCBuiltin<"__builtin_ia32_getmantsd_mask">, + GCCBuiltin<"__builtin_ia32_getmantsd_round">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_rsqrt14_ss : GCCBuiltin<"__builtin_ia32_rsqrt14ss_mask">, + def int_x86_avx512_rsqrt14_ss : GCCBuiltin<"__builtin_ia32_rsqrt14ss">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_rsqrt14_sd : GCCBuiltin<"__builtin_ia32_rsqrt14sd_mask">, + def int_x86_avx512_rsqrt14_sd : GCCBuiltin<"__builtin_ia32_rsqrt14sd">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>; @@ -5409,10 +5409,10 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx512_rsqrt14_ps_512 : GCCBuiltin<"__builtin_ia32_rsqrt14ps512_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_avx512_rcp14_ss : GCCBuiltin<"__builtin_ia32_rcp14ss_mask">, + def int_x86_avx512_rcp14_ss : GCCBuiltin<"__builtin_ia32_rcp14ss">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_rcp14_sd : GCCBuiltin<"__builtin_ia32_rcp14sd_mask">, + def int_x86_avx512_rcp14_sd : GCCBuiltin<"__builtin_ia32_rcp14sd">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>; @@ -5436,11 +5436,11 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_rcp28_ss : GCCBuiltin<"__builtin_ia32_rcp28ss_mask">, + def int_x86_avx512_rcp28_ss : GCCBuiltin<"__builtin_ia32_rcp28ss_round">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_rcp28_sd : GCCBuiltin<"__builtin_ia32_rcp28sd_mask">, + def int_x86_avx512_rcp28_sd : GCCBuiltin<"__builtin_ia32_rcp28sd_round">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; @@ -5452,11 +5452,11 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_rsqrt28_ss : GCCBuiltin<"__builtin_ia32_rsqrt28ss_mask">, + def int_x86_avx512_rsqrt28_ss : GCCBuiltin<"__builtin_ia32_rsqrt28ss_round">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_rsqrt28_sd : GCCBuiltin<"__builtin_ia32_rsqrt28sd_mask">, + def int_x86_avx512_rsqrt28_sd : GCCBuiltin<"__builtin_ia32_rsqrt28sd_round">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; From 5a34e2eef2c83684f1220e3a38180b204a043f13 Mon Sep 17 00:00:00 2001 From: Daniel Sanders Date: Tue, 8 Dec 2015 13:49:19 +0000 Subject: [PATCH 223/364] [mips][ias] Range check uimm6 operands and fix a bug this revealed. Summary: We don't check the size operand on ext/dext*/ins/dins* yet because the permitted range depends on the pos argument and we can't check that using this mechanism. The bug was that dextu/dinsu accepted 0..31 in the pos operand instead of 32..63. Reviewers: vkalintiris Subscribers: llvm-commits, dsanders Differential Revision: http://reviews.llvm.org/D15190 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@255015 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/Mips/AsmParser/MipsAsmParser.cpp | 3 + lib/Target/Mips/MicroMips64r6InstrInfo.td | 4 +- lib/Target/Mips/Mips64InstrInfo.td | 4 +- lib/Target/Mips/MipsInstrInfo.td | 25 ++-- lib/Target/Mips/MipsMSAInstrInfo.td | 126 ++++++++------------ test/MC/Mips/micromips64r6/invalid.s | 14 +++ test/MC/Mips/micromips64r6/valid.s | 2 +- test/MC/Mips/mips32r2/invalid.s | 14 +++ test/MC/Mips/mips64r2/invalid.s | 42 +++++++ test/MC/Mips/msa/invalid-64.s | 28 +++++ test/MC/Mips/msa/invalid.s | 24 ++++ 11 files changed, 193 insertions(+), 93 deletions(-) diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 44f665a86c66..ac32a25a8b3c 100644 --- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -3647,6 +3647,9 @@ bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, case Match_UImm5_Lsl2: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected both 7-bit unsigned immediate and multiple of 4"); + case Match_UImm6_0: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 6-bit unsigned immediate"); } llvm_unreachable("Implement any new match types added!"); diff --git a/lib/Target/Mips/MicroMips64r6InstrInfo.td b/lib/Target/Mips/MicroMips64r6InstrInfo.td index dfc54d75b361..f9e6a3c2f6ca 100644 --- a/lib/Target/Mips/MicroMips64r6InstrInfo.td +++ b/lib/Target/Mips/MicroMips64r6InstrInfo.td @@ -66,9 +66,9 @@ class EXTBITS_DESC_BASE; -class DEXTM_MMR6_DESC : EXTBITS_DESC_BASE<"dextm", GPR64Opnd, uimm6, +class DEXTM_MMR6_DESC : EXTBITS_DESC_BASE<"dextm", GPR64Opnd, uimm5, MipsExt>; -class DEXTU_MMR6_DESC : EXTBITS_DESC_BASE<"dextu", GPR64Opnd, uimm6, +class DEXTU_MMR6_DESC : EXTBITS_DESC_BASE<"dextu", GPR64Opnd, uimm5_plus32, MipsExt>; class DALIGN_DESC_BASE, RDHWR_FM; let AdditionalPredicates = [NotInMicroMips] in { def DEXT : ExtBase<"dext", GPR64Opnd, uimm6, MipsExt>, EXT_FM<3>; def DEXTM : ExtBase<"dextm", GPR64Opnd, uimm5>, EXT_FM<1>; - def DEXTU : ExtBase<"dextu", GPR64Opnd, uimm6>, EXT_FM<2>; + def DEXTU : ExtBase<"dextu", GPR64Opnd, uimm5_plus32>, EXT_FM<2>; } def DINS : InsBase<"dins", GPR64Opnd, uimm6, MipsIns>, EXT_FM<7>; -def DINSU : 
InsBase<"dinsu", GPR64Opnd, uimm6>, EXT_FM<6>; +def DINSU : InsBase<"dinsu", GPR64Opnd, uimm5_plus32>, EXT_FM<6>; def DINSM : InsBase<"dinsm", GPR64Opnd, uimm5>, EXT_FM<5>; let isCodeGenOnly = 1, rs = 0, shamt = 0 in { diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index f3be7fc46187..dae61c3c782d 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -394,8 +394,13 @@ class ConstantUImmAsmOperandClass Supers = [], let DiagnosticType = "UImm" # Bits # "_" # Offset; } +def ConstantUImm6AsmOperandClass + : ConstantUImmAsmOperandClass<6, []>; +def ConstantUImm5Plus32AsmOperandClass + : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass], 32>; def ConstantUImm5Plus32NormalizeAsmOperandClass - : ConstantUImmAsmOperandClass<5, [], 32> { + : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass], 32> { + let Name = "ConstantUImm5_32_Norm"; // We must also subtract 32 when we render the operand. let RenderMethod = "addConstantUImmOperands<5, 32, -32>"; } @@ -403,19 +408,20 @@ def ConstantUImm5Lsl2AsmOperandClass : AsmOperandClass { let Name = "UImm5Lsl2"; let RenderMethod = "addImmOperands"; let PredicateMethod = "isScaledUImm<5, 2>"; - let SuperClasses = []; + let SuperClasses = [ConstantUImm6AsmOperandClass]; let DiagnosticType = "UImm5_Lsl2"; } def ConstantUImm5ReportUImm6AsmOperandClass - : ConstantUImmAsmOperandClass<5, []> { + : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass]> { let Name = "ConstantUImm5_0_Report_UImm6"; let DiagnosticType = "UImm5_0_Report_UImm6"; } def ConstantUImm5AsmOperandClass - : ConstantUImmAsmOperandClass<5, []>; + : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass]>; def ConstantUImm4AsmOperandClass : ConstantUImmAsmOperandClass< 4, [ConstantUImm5AsmOperandClass, + ConstantUImm5Plus32AsmOperandClass, ConstantUImm5Plus32NormalizeAsmOperandClass]>; def ConstantUImm3AsmOperandClass : ConstantUImmAsmOperandClass<3, [ConstantUImm4AsmOperandClass]>; @@ -506,7 +512,7 @@ def uimmz : Operand { } // Unsigned Operands -foreach I = {1, 2, 3, 4, 5} in +foreach I = {1, 2, 3, 4, 5, 6} in def uimm # I : Operand { let PrintMethod = "printUnsignedImm"; let ParserMatchClass = @@ -520,6 +526,11 @@ def uimm2_plus1 : Operand { let ParserMatchClass = ConstantUImm2Plus1AsmOperandClass; } +def uimm5_plus32 : Operand { + let PrintMethod = "printUnsignedImm"; + let ParserMatchClass = ConstantUImm5Plus32AsmOperandClass; +} + def uimm5_plus32_normalize : Operand { let PrintMethod = "printUnsignedImm"; let ParserMatchClass = ConstantUImm5Plus32NormalizeAsmOperandClass; @@ -550,10 +561,6 @@ def uimm5_64_report_uimm6 : Operand { let ParserMatchClass = ConstantUImm5ReportUImm6AsmOperandClass; } -def uimm6 : Operand { - let PrintMethod = "printUnsignedImm"; -} - def uimm16 : Operand { let PrintMethod = "printUnsignedImm"; } diff --git a/lib/Target/Mips/MipsMSAInstrInfo.td b/lib/Target/Mips/MipsMSAInstrInfo.td index b74e967e6817..68e6ca1086bb 100644 --- a/lib/Target/Mips/MipsMSAInstrInfo.td +++ b/lib/Target/Mips/MipsMSAInstrInfo.td @@ -1177,47 +1177,14 @@ class MSA_BIT_D_DESC_BASE { - dag OutOperandList = (outs ROWD:$wd); - dag InOperandList = (ins ROWS:$ws, uimm3:$m); - string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m"); - list Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt3:$m))]; - InstrItinClass Itinerary = itin; -} - -// This class is deprecated and will be removed soon. 
-class MSA_BIT_H_X_DESC_BASE { - dag OutOperandList = (outs ROWD:$wd); - dag InOperandList = (ins ROWS:$ws, uimm4:$m); - string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m"); - list Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt4:$m))]; - InstrItinClass Itinerary = itin; -} - -// This class is deprecated and will be removed soon. -class MSA_BIT_W_X_DESC_BASE { - dag OutOperandList = (outs ROWD:$wd); - dag InOperandList = (ins ROWS:$ws, uimm5:$m); - string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m"); - list Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt5:$m))]; - InstrItinClass Itinerary = itin; -} - -// This class is deprecated and will be removed soon. -class MSA_BIT_D_X_DESC_BASE { +class MSA_BIT_X_DESC_BASE { dag OutOperandList = (outs ROWD:$wd); - dag InOperandList = (ins ROWS:$ws, uimm6:$m); + dag InOperandList = (ins ROWS:$ws, ImmOp:$m); string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m"); - list Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt6:$m))]; + list Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, Imm:$m))]; InstrItinClass Itinerary = itin; } @@ -1502,13 +1469,14 @@ class MSA_INSERT_VIDX_PSEUDO_BASE { dag OutOperandList = (outs ROWD:$wd); - dag InOperandList = (ins ROWD:$wd_in, uimm6:$n, ROWS:$ws, uimmz:$n2); + dag InOperandList = (ins ROWD:$wd_in, ImmOp:$n, ROWS:$ws, uimmz:$n2); string AsmString = !strconcat(instr_asm, "\t$wd[$n], $ws[$n2]"); list Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in, - immZExt6:$n, + Imm:$n, ROWS:$ws, immz:$n2))]; InstrItinClass Itinerary = itin; @@ -2327,13 +2295,13 @@ class INSERT_FW_VIDX64_PSEUDO_DESC : class INSERT_FD_VIDX64_PSEUDO_DESC : MSA_INSERT_VIDX_PSEUDO_BASE; -class INSVE_B_DESC : MSA_INSVE_DESC_BASE<"insve.b", insve_v16i8, +class INSVE_B_DESC : MSA_INSVE_DESC_BASE<"insve.b", insve_v16i8, uimm4, immZExt4, MSA128BOpnd>; -class INSVE_H_DESC : MSA_INSVE_DESC_BASE<"insve.h", insve_v8i16, +class INSVE_H_DESC : MSA_INSVE_DESC_BASE<"insve.h", insve_v8i16, uimm3, immZExt3, MSA128HOpnd>; -class INSVE_W_DESC : MSA_INSVE_DESC_BASE<"insve.w", insve_v4i32, +class INSVE_W_DESC : MSA_INSVE_DESC_BASE<"insve.w", insve_v4i32, uimm2, immZExt2, MSA128WOpnd>; -class INSVE_D_DESC : MSA_INSVE_DESC_BASE<"insve.d", insve_v2i64, +class INSVE_D_DESC : MSA_INSVE_DESC_BASE<"insve.d", insve_v2i64, uimm1, immZExt1, MSA128DOpnd>; class LD_DESC_BASE; class PCNT_W_DESC : MSA_2R_DESC_BASE<"pcnt.w", ctpop, MSA128WOpnd>; class PCNT_D_DESC : MSA_2R_DESC_BASE<"pcnt.d", ctpop, MSA128DOpnd>; -class SAT_S_B_DESC : MSA_BIT_B_X_DESC_BASE<"sat_s.b", int_mips_sat_s_b, - MSA128BOpnd>; -class SAT_S_H_DESC : MSA_BIT_H_X_DESC_BASE<"sat_s.h", int_mips_sat_s_h, - MSA128HOpnd>; -class SAT_S_W_DESC : MSA_BIT_W_X_DESC_BASE<"sat_s.w", int_mips_sat_s_w, - MSA128WOpnd>; -class SAT_S_D_DESC : MSA_BIT_D_X_DESC_BASE<"sat_s.d", int_mips_sat_s_d, - MSA128DOpnd>; - -class SAT_U_B_DESC : MSA_BIT_B_X_DESC_BASE<"sat_u.b", int_mips_sat_u_b, - MSA128BOpnd>; -class SAT_U_H_DESC : MSA_BIT_H_X_DESC_BASE<"sat_u.h", int_mips_sat_u_h, - MSA128HOpnd>; -class SAT_U_W_DESC : MSA_BIT_W_X_DESC_BASE<"sat_u.w", int_mips_sat_u_w, - MSA128WOpnd>; -class SAT_U_D_DESC : MSA_BIT_D_X_DESC_BASE<"sat_u.d", int_mips_sat_u_d, - MSA128DOpnd>; +class SAT_S_B_DESC : MSA_BIT_X_DESC_BASE<"sat_s.b", int_mips_sat_s_b, uimm3, + immZExt3, MSA128BOpnd>; +class SAT_S_H_DESC : MSA_BIT_X_DESC_BASE<"sat_s.h", int_mips_sat_s_h, uimm4, + immZExt4, MSA128HOpnd>; +class SAT_S_W_DESC : MSA_BIT_X_DESC_BASE<"sat_s.w", int_mips_sat_s_w, uimm5, + immZExt5, MSA128WOpnd>; +class SAT_S_D_DESC : 
MSA_BIT_X_DESC_BASE<"sat_s.d", int_mips_sat_s_d, uimm6, + immZExt6, MSA128DOpnd>; + +class SAT_U_B_DESC : MSA_BIT_X_DESC_BASE<"sat_u.b", int_mips_sat_u_b, uimm3, + immZExt3, MSA128BOpnd>; +class SAT_U_H_DESC : MSA_BIT_X_DESC_BASE<"sat_u.h", int_mips_sat_u_h, uimm4, + immZExt4, MSA128HOpnd>; +class SAT_U_W_DESC : MSA_BIT_X_DESC_BASE<"sat_u.w", int_mips_sat_u_w, uimm5, + immZExt5, MSA128WOpnd>; +class SAT_U_D_DESC : MSA_BIT_X_DESC_BASE<"sat_u.d", int_mips_sat_u_d, uimm6, + immZExt6, MSA128DOpnd>; class SHF_B_DESC : MSA_I8_SHF_DESC_BASE<"shf.b", MSA128BOpnd>; class SHF_H_DESC : MSA_I8_SHF_DESC_BASE<"shf.h", MSA128HOpnd>; @@ -2633,14 +2601,14 @@ class SRAR_H_DESC : MSA_3R_DESC_BASE<"srar.h", int_mips_srar_h, MSA128HOpnd>; class SRAR_W_DESC : MSA_3R_DESC_BASE<"srar.w", int_mips_srar_w, MSA128WOpnd>; class SRAR_D_DESC : MSA_3R_DESC_BASE<"srar.d", int_mips_srar_d, MSA128DOpnd>; -class SRARI_B_DESC : MSA_BIT_B_X_DESC_BASE<"srari.b", int_mips_srari_b, - MSA128BOpnd>; -class SRARI_H_DESC : MSA_BIT_H_X_DESC_BASE<"srari.h", int_mips_srari_h, - MSA128HOpnd>; -class SRARI_W_DESC : MSA_BIT_W_X_DESC_BASE<"srari.w", int_mips_srari_w, - MSA128WOpnd>; -class SRARI_D_DESC : MSA_BIT_D_X_DESC_BASE<"srari.d", int_mips_srari_d, - MSA128DOpnd>; +class SRARI_B_DESC : MSA_BIT_X_DESC_BASE<"srari.b", int_mips_srari_b, uimm3, + immZExt3, MSA128BOpnd>; +class SRARI_H_DESC : MSA_BIT_X_DESC_BASE<"srari.h", int_mips_srari_h, uimm4, + immZExt4, MSA128HOpnd>; +class SRARI_W_DESC : MSA_BIT_X_DESC_BASE<"srari.w", int_mips_srari_w, uimm5, + immZExt5, MSA128WOpnd>; +class SRARI_D_DESC : MSA_BIT_X_DESC_BASE<"srari.d", int_mips_srari_d, uimm6, + immZExt6, MSA128DOpnd>; class SRL_B_DESC : MSA_3R_DESC_BASE<"srl.b", srl, MSA128BOpnd>; class SRL_H_DESC : MSA_3R_DESC_BASE<"srl.h", srl, MSA128HOpnd>; @@ -2661,14 +2629,14 @@ class SRLR_H_DESC : MSA_3R_DESC_BASE<"srlr.h", int_mips_srlr_h, MSA128HOpnd>; class SRLR_W_DESC : MSA_3R_DESC_BASE<"srlr.w", int_mips_srlr_w, MSA128WOpnd>; class SRLR_D_DESC : MSA_3R_DESC_BASE<"srlr.d", int_mips_srlr_d, MSA128DOpnd>; -class SRLRI_B_DESC : MSA_BIT_B_X_DESC_BASE<"srlri.b", int_mips_srlri_b, - MSA128BOpnd>; -class SRLRI_H_DESC : MSA_BIT_H_X_DESC_BASE<"srlri.h", int_mips_srlri_h, - MSA128HOpnd>; -class SRLRI_W_DESC : MSA_BIT_W_X_DESC_BASE<"srlri.w", int_mips_srlri_w, - MSA128WOpnd>; -class SRLRI_D_DESC : MSA_BIT_D_X_DESC_BASE<"srlri.d", int_mips_srlri_d, - MSA128DOpnd>; +class SRLRI_B_DESC : MSA_BIT_X_DESC_BASE<"srlri.b", int_mips_srlri_b, uimm3, + immZExt3, MSA128BOpnd>; +class SRLRI_H_DESC : MSA_BIT_X_DESC_BASE<"srlri.h", int_mips_srlri_h, uimm4, + immZExt4, MSA128HOpnd>; +class SRLRI_W_DESC : MSA_BIT_X_DESC_BASE<"srlri.w", int_mips_srlri_w, uimm5, + immZExt5, MSA128WOpnd>; +class SRLRI_D_DESC : MSA_BIT_X_DESC_BASE<"srlri.d", int_mips_srlri_d, uimm6, + immZExt6, MSA128DOpnd>; class ST_DESC_BASE Date: Tue, 8 Dec 2015 14:42:10 +0000 Subject: [PATCH 224/364] [mips][ias] Range check uimm8 operands Summary: Reviewers: vkalintiris Subscribers: llvm-commits, dsanders Differential Revision: http://reviews.llvm.org/D15226 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@255018 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/Mips/AsmParser/MipsAsmParser.cpp | 3 +++ lib/Target/Mips/MipsInstrInfo.td | 6 ++++-- lib/Target/Mips/MipsMSAInstrInfo.td | 4 ---- test/MC/Mips/msa/invalid-64.s | 3 +++ test/MC/Mips/msa/invalid.s | 6 ++++++ 5 files changed, 16 insertions(+), 6 deletions(-) diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 
ac32a25a8b3c..d04e8d4e4fa5 100644 --- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -3650,6 +3650,9 @@ bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, case Match_UImm6_0: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected 6-bit unsigned immediate"); + case Match_UImm8_0: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 8-bit unsigned immediate"); } llvm_unreachable("Implement any new match types added!"); diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index dae61c3c782d..5dd01b538f15 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -394,8 +394,10 @@ class ConstantUImmAsmOperandClass Supers = [], let DiagnosticType = "UImm" # Bits # "_" # Offset; } +def ConstantUImm8AsmOperandClass + : ConstantUImmAsmOperandClass<8, []>; def ConstantUImm6AsmOperandClass - : ConstantUImmAsmOperandClass<6, []>; + : ConstantUImmAsmOperandClass<6, [ConstantUImm8AsmOperandClass]>; def ConstantUImm5Plus32AsmOperandClass : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass], 32>; def ConstantUImm5Plus32NormalizeAsmOperandClass @@ -512,7 +514,7 @@ def uimmz : Operand { } // Unsigned Operands -foreach I = {1, 2, 3, 4, 5, 6} in +foreach I = {1, 2, 3, 4, 5, 6, 8} in def uimm # I : Operand { let PrintMethod = "printUnsignedImm"; let ParserMatchClass = diff --git a/lib/Target/Mips/MipsMSAInstrInfo.td b/lib/Target/Mips/MipsMSAInstrInfo.td index 68e6ca1086bb..eacfcec78bc7 100644 --- a/lib/Target/Mips/MipsMSAInstrInfo.td +++ b/lib/Target/Mips/MipsMSAInstrInfo.td @@ -78,10 +78,6 @@ def uimm6_ptr : Operand { let PrintMethod = "printUnsignedImm8"; } -def uimm8 : Operand { - let PrintMethod = "printUnsignedImm8"; -} - def simm5 : Operand; def vsplat_uimm1 : Operand { diff --git a/test/MC/Mips/msa/invalid-64.s b/test/MC/Mips/msa/invalid-64.s index 90f19568b584..a15ee270bccf 100644 --- a/test/MC/Mips/msa/invalid-64.s +++ b/test/MC/Mips/msa/invalid-64.s @@ -37,6 +37,9 @@ sat_u.w $w31, $w31, 32 # CHECK: :[[@LINE]]:25: error: expected 5-bit unsigned immediate sat_u.d $w31, $w31, -1 # CHECK: :[[@LINE]]:25: error: expected 6-bit unsigned immediate sat_u.d $w31, $w31, 64 # CHECK: :[[@LINE]]:25: error: expected 6-bit unsigned immediate + shf.b $w19, $w30, -1 # CHECK: :[[@LINE]]:23: error: expected 8-bit unsigned immediate + shf.h $w17, $w8, -1 # CHECK: :[[@LINE]]:22: error: expected 8-bit unsigned immediate + shf.w $w14, $w3, -1 # CHECK: :[[@LINE]]:22: error: expected 8-bit unsigned immediate sldi.b $w0, $w29[-1] # CHECK: :[[@LINE]]:22: error: expected 4-bit unsigned immediate sldi.b $w0, $w29[16] # CHECK: :[[@LINE]]:22: error: expected 4-bit unsigned immediate sldi.d $w4, $w12[-1] # CHECK: :[[@LINE]]:22: error: expected 1-bit unsigned immediate diff --git a/test/MC/Mips/msa/invalid.s b/test/MC/Mips/msa/invalid.s index 22cec375e7e3..724d9c193e0a 100644 --- a/test/MC/Mips/msa/invalid.s +++ b/test/MC/Mips/msa/invalid.s @@ -35,6 +35,12 @@ sat_u.w $w31, $w31, 32 # CHECK: :[[@LINE]]:25: error: expected 5-bit unsigned immediate sat_u.d $w31, $w31, -1 # CHECK: :[[@LINE]]:25: error: expected 6-bit unsigned immediate sat_u.d $w31, $w31, 64 # CHECK: :[[@LINE]]:25: error: expected 6-bit unsigned immediate + shf.b $w19, $w30, -1 # CHECK: :[[@LINE]]:23: error: expected 8-bit unsigned immediate + shf.b $w19, $w30, 256 # CHECK: :[[@LINE]]:23: error: expected 8-bit unsigned immediate + shf.h $w17, $w8, -1 # CHECK: :[[@LINE]]:22: error: expected 8-bit 
unsigned immediate + shf.h $w17, $w8, 256 # CHECK: :[[@LINE]]:22: error: expected 8-bit unsigned immediate + shf.w $w14, $w3, -1 # CHECK: :[[@LINE]]:22: error: expected 8-bit unsigned immediate + shf.w $w14, $w3, 256 # CHECK: :[[@LINE]]:22: error: expected 8-bit unsigned immediate sldi.b $w0, $w29[-1] # CHECK: :[[@LINE]]:22: error: expected 4-bit unsigned immediate sldi.b $w0, $w29[16] # CHECK: :[[@LINE]]:22: error: expected 4-bit unsigned immediate sldi.d $w4, $w12[-1] # CHECK: :[[@LINE]]:22: error: expected 1-bit unsigned immediate From 5c953e3267ebfc4b0430aea2b296348fa6320d38 Mon Sep 17 00:00:00 2001 From: Rafael Espindola Date: Tue, 8 Dec 2015 14:54:49 +0000 Subject: [PATCH 225/364] Move all private members together. NFC. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@255021 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Linker/LinkModules.cpp | 45 +++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp index a2bc95602210..a9fcee7c98ce 100644 --- a/lib/Linker/LinkModules.cpp +++ b/lib/Linker/LinkModules.cpp @@ -420,29 +420,6 @@ class ModuleLinker { bool HasError = false; -public: - ModuleLinker(Module &DstM, Linker::IdentifiedStructTypeSet &Set, Module &SrcM, - DiagnosticHandlerFunction DiagnosticHandler, unsigned Flags, - const FunctionInfoIndex *Index = nullptr, - DenseSet *FunctionsToImport = nullptr) - : DstM(DstM), SrcM(SrcM), TypeMap(Set), ValMaterializer(this), - DiagnosticHandler(DiagnosticHandler), Flags(Flags), ImportIndex(Index), - ImportFunction(FunctionsToImport) { - assert((ImportIndex || !ImportFunction) && - "Expect a FunctionInfoIndex when importing"); - // If we have a FunctionInfoIndex but no function to import, - // then this is the primary module being compiled in a ThinLTO - // backend compilation, and we need to see if it has functions that - // may be exported to another backend compilation. - if (ImportIndex && !ImportFunction) - HasExportedFunctions = ImportIndex->hasExportedFunctions(SrcM); - } - - bool run(); - Value *materializeDeclFor(Value *V); - void materializeInitFor(GlobalValue *New, GlobalValue *Old); - -private: bool shouldOverrideFromSrc() { return Flags & Linker::OverrideFromSrc; } bool shouldLinkOnlyNeeded() { return Flags & Linker::LinkOnlyNeeded; } bool shouldInternalizeLinkedSymbols() { @@ -556,6 +533,28 @@ class ModuleLinker { const GlobalValue *DGV = nullptr); void linkNamedMDNodes(); + +public: + ModuleLinker(Module &DstM, Linker::IdentifiedStructTypeSet &Set, Module &SrcM, + DiagnosticHandlerFunction DiagnosticHandler, unsigned Flags, + const FunctionInfoIndex *Index = nullptr, + DenseSet *FunctionsToImport = nullptr) + : DstM(DstM), SrcM(SrcM), TypeMap(Set), ValMaterializer(this), + DiagnosticHandler(DiagnosticHandler), Flags(Flags), ImportIndex(Index), + ImportFunction(FunctionsToImport) { + assert((ImportIndex || !ImportFunction) && + "Expect a FunctionInfoIndex when importing"); + // If we have a FunctionInfoIndex but no function to import, + // then this is the primary module being compiled in a ThinLTO + // backend compilation, and we need to see if it has functions that + // may be exported to another backend compilation. 
+ if (ImportIndex && !ImportFunction) + HasExportedFunctions = ImportIndex->hasExportedFunctions(SrcM); + } + + bool run(); + Value *materializeDeclFor(Value *V); + void materializeInitFor(GlobalValue *New, GlobalValue *Old); }; } From b691e2ed749bb6b463573006ce05982fab86c873 Mon Sep 17 00:00:00 2001 From: Ron Lieberman Date: Tue, 8 Dec 2015 16:28:32 +0000 Subject: [PATCH 226/364] [Hexagon] Add NewValueJump support for C4_cmpneq, C4_cmplte, C4_cmplteu git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@255027 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/Hexagon/HexagonNewValueJump.cpp | 18 +++++ test/CodeGen/Hexagon/NVJumpCmp.ll | 89 ++++++++++++++++++++++ 2 files changed, 107 insertions(+) create mode 100644 test/CodeGen/Hexagon/NVJumpCmp.ll diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp index 6415c6cc7906..20c4ab112b5f 100644 --- a/lib/Target/Hexagon/HexagonNewValueJump.cpp +++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp @@ -342,6 +342,24 @@ static unsigned getNewValueJumpOpcode(MachineInstr *MI, int reg, return taken ? Hexagon::J4_cmpgtui_t_jumpnv_t : Hexagon::J4_cmpgtui_t_jumpnv_nt; + case Hexagon::C4_cmpneq: + return taken ? Hexagon::J4_cmpeq_f_jumpnv_t + : Hexagon::J4_cmpeq_f_jumpnv_nt; + + case Hexagon::C4_cmplte: + if (secondRegNewified) + return taken ? Hexagon::J4_cmplt_f_jumpnv_t + : Hexagon::J4_cmplt_f_jumpnv_nt; + return taken ? Hexagon::J4_cmpgt_f_jumpnv_t + : Hexagon::J4_cmpgt_f_jumpnv_nt; + + case Hexagon::C4_cmplteu: + if (secondRegNewified) + return taken ? Hexagon::J4_cmpltu_f_jumpnv_t + : Hexagon::J4_cmpltu_f_jumpnv_nt; + return taken ? Hexagon::J4_cmpgtu_f_jumpnv_t + : Hexagon::J4_cmpgtu_f_jumpnv_nt; + default: llvm_unreachable("Could not find matching New Value Jump instruction."); } diff --git a/test/CodeGen/Hexagon/NVJumpCmp.ll b/test/CodeGen/Hexagon/NVJumpCmp.ll new file mode 100644 index 000000000000..6b160d962ebb --- /dev/null +++ b/test/CodeGen/Hexagon/NVJumpCmp.ll @@ -0,0 +1,89 @@ +; RUN: llc -march=hexagon -O2 -mcpu=hexagonv60 < %s | FileCheck %s + +; Look for an instruction, we really just do not want to see an abort. 
+; CHECK: trace_event +; REQUIRES: asserts + +target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a:0-n16:32" +target triple = "hexagon-unknown--elf" + +; Function Attrs: nounwind +define void @_ZN6Halide7Runtime8Internal13default_traceEPvPK18halide_trace_event() #0 { +entry: + br i1 undef, label %if.then, label %if.else + +if.then: ; preds = %entry + br label %while.cond + +while.cond: ; preds = %while.cond, %if.then + br i1 undef, label %while.cond, label %while.end + +while.end: ; preds = %while.cond + %add = add i32 undef, 48 + br i1 undef, label %if.end, label %if.then17 + +if.then17: ; preds = %while.end + unreachable + +if.end: ; preds = %while.end + %arrayidx21 = getelementptr inbounds [4096 x i8], [4096 x i8]* undef, i32 0, i32 8 + store i8 undef, i8* %arrayidx21, align 4, !tbaa !1 + br i1 undef, label %for.body42.preheader6, label %min.iters.checked + +for.body42.preheader6: ; preds = %vector.body.preheader, %min.iters.checked, %if.end + unreachable + +min.iters.checked: ; preds = %if.end + br i1 undef, label %for.body42.preheader6, label %vector.body.preheader + +vector.body.preheader: ; preds = %min.iters.checked + br i1 undef, label %for.cond48.preheader, label %for.body42.preheader6 + +for.cond48.preheader: ; preds = %vector.body.preheader + br i1 undef, label %while.cond.i, label %for.body61.lr.ph + +for.body61.lr.ph: ; preds = %for.cond48.preheader + br i1 undef, label %for.body61, label %min.iters.checked595 + +min.iters.checked595: ; preds = %for.body61.lr.ph + br i1 undef, label %for.body61, label %vector.memcheck608 + +vector.memcheck608: ; preds = %min.iters.checked595 + %scevgep600 = getelementptr [4096 x i8], [4096 x i8]* undef, i32 0, i32 %add + %bound0604 = icmp ule i8* %scevgep600, undef + %memcheck.conflict607 = and i1 undef, %bound0604 + br i1 %memcheck.conflict607, label %for.body61, label %vector.body590 + +vector.body590: ; preds = %vector.body590, %vector.memcheck608 + br i1 undef, label %middle.block591, label %vector.body590, !llvm.loop !4 + +middle.block591: ; preds = %vector.body590 + %cmp.n613 = icmp eq i32 undef, 0 + br i1 %cmp.n613, label %while.cond.i, label %for.body61 + +while.cond.i: ; preds = %for.body61, %while.cond.i, %middle.block591, %for.cond48.preheader + br i1 undef, label %_ZN6Halide7Runtime8Internal14ScopedSpinLockC2EPVi.exit, label %while.cond.i + +_ZN6Halide7Runtime8Internal14ScopedSpinLockC2EPVi.exit: ; preds = %while.cond.i + unreachable + +for.body61: ; preds = %for.body61, %middle.block591, %vector.memcheck608, %min.iters.checked595, %for.body61.lr.ph + %cmp59 = icmp ult i32 undef, undef + br i1 %cmp59, label %for.body61, label %while.cond.i, !llvm.loop !7 + +if.else: ; preds = %entry + unreachable +} + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"halide_mattrs", !"+hvx"} +!1 = !{!2, !2, i64 0} +!2 = !{!"omnipotent char", !3, i64 0} +!3 = !{!"Simple C/C++ TBAA"} +!4 = distinct !{!4, !5, !6} +!5 = !{!"llvm.loop.vectorize.width", i32 1} +!6 = !{!"llvm.loop.interleave.count", i32 1} +!7 = distinct !{!7, !5, !6} From 72f75faa283d31acd018a40226470b852e39df0b Mon Sep 17 00:00:00 2001 From: Mike Aizatsky Date: Tue, 8 Dec 2015 17:44:51 +0000 Subject: [PATCH 227/364] adding 
readability-identifier-naming to llvm clang-tidy configuration. Differential Revision: http://reviews.llvm.org/D15196 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@255028 91177308-0d34-0410-b5e6-96231b3b80d8 --- .clang-tidy | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/.clang-tidy b/.clang-tidy index 3186da43d43d..97fbe23333bd 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -1 +1,13 @@ -Checks: '-*,clang-diagnostic-*,llvm-*,misc-*' +Checks: '-*,clang-diagnostic-*,llvm-*,misc-*,readability-identifier-naming' +CheckOptions: + - key: readability-identifier-naming.ClassCase + value: CamelCase + - key: readability-identifier-naming.EnumCase + value: CamelCase + - key: readability-identifier-naming.FunctionCase + value: lowerCase + - key: readability-identifier-naming.UnionCase + value: CamelCase + - key: readability-identifier-naming.VariableCase + value: CamelCase + From a0beb06ac92b08c8e62c04b3f2d5279785e12e96 Mon Sep 17 00:00:00 2001 From: Renato Golin Date: Tue, 8 Dec 2015 18:10:58 +0000 Subject: [PATCH 228/364] [ARM] Allowing SP/PC for AND/BIC mod_imm_not AND/BIC instructions do accept SP/PC, so the register class should be more generic (rGPR -> GPR) to cope with that case. Adding more tests. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@255034 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/ARMInstrInfo.td | 8 ++++---- test/MC/ARM/basic-arm-instructions.s | 11 +++++++++++ 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 4c7107aee6a2..2aa9475e6f47 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -5655,16 +5655,16 @@ def : ARMInstAlias<"mvn${s}${p} $Rd, $imm", (MOVi rGPR:$Rd, mod_imm_not:$imm, pred:$p, cc_out:$s)>; // Same for AND <--> BIC def : ARMInstAlias<"bic${s}${p} $Rd, $Rn, $imm", - (ANDri rGPR:$Rd, rGPR:$Rn, mod_imm_not:$imm, + (ANDri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm, pred:$p, cc_out:$s)>; def : ARMInstAlias<"bic${s}${p} $Rdn, $imm", - (ANDri rGPR:$Rdn, rGPR:$Rdn, mod_imm_not:$imm, + (ANDri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm, pred:$p, cc_out:$s)>; def : ARMInstAlias<"and${s}${p} $Rd, $Rn, $imm", - (BICri rGPR:$Rd, rGPR:$Rn, mod_imm_not:$imm, + (BICri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm, pred:$p, cc_out:$s)>; def : ARMInstAlias<"and${s}${p} $Rdn, $imm", - (BICri rGPR:$Rdn, rGPR:$Rdn, mod_imm_not:$imm, + (BICri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm, pred:$p, cc_out:$s)>; // Likewise, "add Rd, mod_imm_neg" -> sub diff --git a/test/MC/ARM/basic-arm-instructions.s b/test/MC/ARM/basic-arm-instructions.s index a1f13b76dda3..99a3cfa7b29e 100644 --- a/test/MC/ARM/basic-arm-instructions.s +++ b/test/MC/ARM/basic-arm-instructions.s @@ -349,6 +349,8 @@ Lforward: and r6, r7, r8, ror r2 and r10, r1, r6, rrx and r2, r3, #0x7fffffff + and sp, sp, #0x7fffffff + and pc, pc, #0x7fffffff @ destination register is optional and r1, #0xf @@ -397,6 +399,8 @@ Lforward: @ CHECK: and r6, r7, r8, ror r2 @ encoding: [0x78,0x62,0x07,0xe0] @ CHECK: and r10, r1, r6, rrx @ encoding: [0x66,0xa0,0x01,0xe0] @ CHECK: bic r2, r3, #-2147483648 @ encoding: [0x02,0x21,0xc3,0xe3] +@ CHECK: bic sp, sp, #-2147483648 @ encoding: [0x02,0xd1,0xcd,0xe3] +@ CHECK: bic pc, pc, #-2147483648 @ encoding: [0x02,0xf1,0xcf,0xe3] @ CHECK: and r1, r1, #15 @ encoding: [0x0f,0x10,0x01,0xe2] @ CHECK: and r1, r1, #15 @ encoding: [0x0f,0x10,0x01,0xe2] @@ -502,6 +506,10 @@ Lforward: bic r6, r7, r8, asr r2 bic r6, r7, r8, ror r2 bic r10, r1, r6, rrx + bic r2, r3, 
#0x7fffffff + bic sp, sp, #0x7fffffff + bic pc, pc, #0x7fffffff + @ destination register is optional bic r1, #0xf @@ -548,6 +556,9 @@ Lforward: @ CHECK: bic r6, r7, r8, asr r2 @ encoding: [0x58,0x62,0xc7,0xe1] @ CHECK: bic r6, r7, r8, ror r2 @ encoding: [0x78,0x62,0xc7,0xe1] @ CHECK: bic r10, r1, r6, rrx @ encoding: [0x66,0xa0,0xc1,0xe1] +@ CHECK: and r2, r3, #-2147483648 @ encoding: [0x02,0x21,0x03,0xe2] +@ CHECK: and sp, sp, #-2147483648 @ encoding: [0x02,0xd1,0x0d,0xe2] +@ CHECK: and pc, pc, #-2147483648 @ encoding: [0x02,0xf1,0x0f,0xe2] @ CHECK: bic r1, r1, #15 @ encoding: [0x0f,0x10,0xc1,0xe3] From 81bf65619f565510552f650b118e0098e047f1a4 Mon Sep 17 00:00:00 2001 From: Tim Northover Date: Tue, 8 Dec 2015 18:31:35 +0000 Subject: [PATCH 229/364] X86: produce more friendly errors during MachO relocation handling git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@255036 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../X86/MCTargetDesc/X86MachObjectWriter.cpp | 92 ++++++++++++------- test/MC/X86/macho-reloc-errors-x86.s | 15 +++ test/MC/X86/macho-reloc-errors-x86_64.s | 19 ++++ 3 files changed, 94 insertions(+), 32 deletions(-) create mode 100644 test/MC/X86/macho-reloc-errors-x86.s create mode 100644 test/MC/X86/macho-reloc-errors-x86_64.s diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index 614e43864088..191ebeac7265 100644 --- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -149,14 +149,19 @@ void X86MachObjectWriter::RecordX86_64Relocation( // Neither symbol can be modified. if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None || - Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) - report_fatal_error("unsupported relocation of modified symbol", false); + Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) { + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported relocation of modified symbol"); + return; + } // We don't support PCrel relocations of differences. Darwin 'as' doesn't // implement most of these correctly. - if (IsPCRel) - report_fatal_error("unsupported pc-relative relocation of difference", - false); + if (IsPCRel) { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported pc-relative relocation of difference"); + return; + } // The support for the situation where one or both of the symbols would // require a local relocation is handled just like if the symbols were @@ -168,8 +173,11 @@ void X86MachObjectWriter::RecordX86_64Relocation( // Darwin 'as' doesn't emit correct relocations for this (it ends up with a // single SIGNED relocation); reject it for now. Except the case where both // symbols don't have a base, equal but both NULL. - if (A_Base == B_Base && A_Base) - report_fatal_error("unsupported relocation with identical base", false); + if (A_Base == B_Base && A_Base) { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported relocation with identical base"); + return; + } // A subtraction expression where either symbol is undefined is a // non-relocatable expression. 
@@ -245,12 +253,16 @@ void X86MachObjectWriter::RecordX86_64Relocation( FixedValue = Res; return; } else { - report_fatal_error("unsupported relocation of variable '" + - Symbol->getName() + "'", false); + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported relocation of variable '" + + Symbol->getName() + "'"); + return; } } else { - report_fatal_error("unsupported relocation of undefined symbol '" + - Symbol->getName() + "'", false); + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported relocation of undefined symbol '" + + Symbol->getName() + "'"); + return; } MCSymbolRefExpr::VariantKind Modifier = Target.getSymA()->getKind(); @@ -267,8 +279,9 @@ void X86MachObjectWriter::RecordX86_64Relocation( } else if (Modifier == MCSymbolRefExpr::VK_TLVP) { Type = MachO::X86_64_RELOC_TLV; } else if (Modifier != MCSymbolRefExpr::VK_None) { - report_fatal_error("unsupported symbol modifier in relocation", - false); + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported symbol modifier in relocation"); + return; } else { Type = MachO::X86_64_RELOC_SIGNED; @@ -293,9 +306,12 @@ void X86MachObjectWriter::RecordX86_64Relocation( } } } else { - if (Modifier != MCSymbolRefExpr::VK_None) - report_fatal_error("unsupported symbol modifier in branch " - "relocation", false); + if (Modifier != MCSymbolRefExpr::VK_None) { + Asm.getContext().reportError( + Fixup.getLoc(), + "unsupported symbol modifier in branch relocation"); + return; + } Type = MachO::X86_64_RELOC_BRANCH; } @@ -310,16 +326,22 @@ void X86MachObjectWriter::RecordX86_64Relocation( Type = MachO::X86_64_RELOC_GOT; IsPCRel = 1; } else if (Modifier == MCSymbolRefExpr::VK_TLVP) { - report_fatal_error("TLVP symbol modifier should have been rip-rel", - false); - } else if (Modifier != MCSymbolRefExpr::VK_None) - report_fatal_error("unsupported symbol modifier in relocation", false); - else { + Asm.getContext().reportError( + Fixup.getLoc(), "TLVP symbol modifier should have been rip-rel"); + return; + } else if (Modifier != MCSymbolRefExpr::VK_None) { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported symbol modifier in relocation"); + return; + } else { Type = MachO::X86_64_RELOC_UNSIGNED; unsigned Kind = Fixup.getKind(); - if (Kind == X86::reloc_signed_4byte) - report_fatal_error("32-bit absolute addressing is not supported in " - "64-bit mode", false); + if (Kind == X86::reloc_signed_4byte) { + Asm.getContext().reportError( + Fixup.getLoc(), + "32-bit absolute addressing is not supported in 64-bit mode"); + return; + } } } } @@ -351,10 +373,13 @@ bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer, // See . 
const MCSymbol *A = &Target.getSymA()->getSymbol(); - if (!A->getFragment()) - report_fatal_error("symbol '" + A->getName() + - "' can not be undefined in a subtraction expression", - false); + if (!A->getFragment()) { + Asm.getContext().reportError( + Fixup.getLoc(), + "symbol '" + A->getName() + + "' can not be undefined in a subtraction expression"); + return false; + } uint32_t Value = Writer->getSymbolAddress(*A, Layout); uint64_t SecAddr = Writer->getSectionAddress(A->getFragment()->getParent()); @@ -364,10 +389,13 @@ bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer, if (const MCSymbolRefExpr *B = Target.getSymB()) { const MCSymbol *SB = &B->getSymbol(); - if (!SB->getFragment()) - report_fatal_error("symbol '" + B->getSymbol().getName() + - "' can not be undefined in a subtraction expression", - false); + if (!SB->getFragment()) { + Asm.getContext().reportError( + Fixup.getLoc(), + "symbol '" + B->getSymbol().getName() + + "' can not be undefined in a subtraction expression"); + return false; + } // Select the appropriate difference relocation type. // diff --git a/test/MC/X86/macho-reloc-errors-x86.s b/test/MC/X86/macho-reloc-errors-x86.s new file mode 100644 index 000000000000..4af202220073 --- /dev/null +++ b/test/MC/X86/macho-reloc-errors-x86.s @@ -0,0 +1,15 @@ +// RUN: not llvm-mc -triple=i686-apple-darwin -filetype=obj -o /dev/null %s 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR + .space 0x1000000 + mov %eax, thing-thing2 + mov %eax, defined-thing2 + mov %eax, later-defined + + .section __DATA,__tim +defined: + + .section __DATA,__tim2 +later: + +// CHECK-ERROR: 3:9: error: symbol 'thing' can not be undefined in a subtraction expression +// CHECK-ERROR: 4:9: error: symbol 'thing2' can not be undefined in a subtraction expression +// CHECK-ERROR: 5:9: error: Section too large, can't encode r_address (0x100000b) into 24 bits of scattered relocation entry. 
diff --git a/test/MC/X86/macho-reloc-errors-x86_64.s b/test/MC/X86/macho-reloc-errors-x86_64.s new file mode 100644 index 000000000000..05f77c495b24 --- /dev/null +++ b/test/MC/X86/macho-reloc-errors-x86_64.s @@ -0,0 +1,19 @@ +// RUN: not llvm-mc -triple=x86_64-apple-darwin -filetype=obj -o /dev/null %s 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR + + mov %rax, thing + mov %rax, thing@GOT-thing2@GOT + mov %rax, (thing-thing2)(%rip) + mov %rax, thing-thing + mov %rax, thing-thing2 + mov %rax, thing@PLT + jmp thing@PLT + mov %rax, thing@TLVP + +// CHECK-ERROR: 3:9: error: 32-bit absolute addressing is not supported in 64-bit mode +// CHECK-ERROR: 4:9: error: unsupported relocation of modified symbol +// CHECK-ERROR: 5:9: error: unsupported pc-relative relocation of difference +// CHECK-ERROR: 6:9: error: unsupported relocation with identical base +// CHECK-ERROR: 7:9: error: unsupported relocation with subtraction expression, symbol 'thing' can not be undefined in a subtraction expression +// CHECK-ERROR: 8:9: error: unsupported symbol modifier in relocation +// CHECK-ERROR: 9:9: error: unsupported symbol modifier in branch relocation +// CHECK-ERROR: 10:9: error: TLVP symbol modifier should have been rip-rel From 0cb0e797f0e7b13bed123e14b538abbcbb4709b4 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Tue, 8 Dec 2015 19:01:29 +0000 Subject: [PATCH 230/364] Add Available Externally linkage type to isWeakForLinker() Per LangRef: "Globals with available_externally linkage are allowed to be discarded at will, and are otherwise the same as linkonce_odr", since linkonce_odr is in this list it makes sense to have available_externally there as well. Reviewers: rafael Differential Revision: http://reviews.llvm.org/D15323 From: Mehdi Amini git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@255043 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/GlobalValue.h | 3 ++- .../available_externally_global_ctors.ll | 22 +++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 test/Transforms/GlobalOpt/available_externally_global_ctors.ll diff --git a/include/llvm/IR/GlobalValue.h b/include/llvm/IR/GlobalValue.h index 2f9172648a26..3461b9ee8b82 100644 --- a/include/llvm/IR/GlobalValue.h +++ b/include/llvm/IR/GlobalValue.h @@ -256,7 +256,8 @@ class GlobalValue : public Constant { static bool isWeakForLinker(LinkageTypes Linkage) { return Linkage == WeakAnyLinkage || Linkage == WeakODRLinkage || Linkage == LinkOnceAnyLinkage || Linkage == LinkOnceODRLinkage || - Linkage == CommonLinkage || Linkage == ExternalWeakLinkage; + Linkage == CommonLinkage || Linkage == ExternalWeakLinkage || + Linkage == AvailableExternallyLinkage; } bool hasExternalLinkage() const { return isExternalLinkage(Linkage); } diff --git a/test/Transforms/GlobalOpt/available_externally_global_ctors.ll b/test/Transforms/GlobalOpt/available_externally_global_ctors.ll new file mode 100644 index 000000000000..39dc054ac227 --- /dev/null +++ b/test/Transforms/GlobalOpt/available_externally_global_ctors.ll @@ -0,0 +1,22 @@ +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.11.0" + +; RUN: opt -S -globalopt < %s | FileCheck %s + +; Verify that the initialization of the available_externally global is not eliminated +; CHECK: @llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @foo_static_init, i8* null }] + +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } 
{ i32 65535, void ()* @foo_static_init, i8* null }] +@foo_external = available_externally global void ()* null, align 8 + +define internal void @foo_static_init() { +entry: + store void ()* @foo_impl, void ()** @foo_external, align 8 + ret void +} + +define internal void @foo_impl() { +entry: + ret void +} + From 0a5a3d4acb906d07bdd290b459afcd838f5af932 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Tue, 8 Dec 2015 19:02:55 +0000 Subject: [PATCH 231/364] Cleanup test: remove useless alignment From: Mehdi Amini git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@255044 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../Transforms/GlobalOpt/available_externally_global_ctors.ll | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/Transforms/GlobalOpt/available_externally_global_ctors.ll b/test/Transforms/GlobalOpt/available_externally_global_ctors.ll index 39dc054ac227..7092a5ae2226 100644 --- a/test/Transforms/GlobalOpt/available_externally_global_ctors.ll +++ b/test/Transforms/GlobalOpt/available_externally_global_ctors.ll @@ -7,11 +7,11 @@ target triple = "x86_64-apple-macosx10.11.0" ; CHECK: @llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @foo_static_init, i8* null }] @llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @foo_static_init, i8* null }] -@foo_external = available_externally global void ()* null, align 8 +@foo_external = available_externally global void ()* null define internal void @foo_static_init() { entry: - store void ()* @foo_impl, void ()** @foo_external, align 8 + store void ()* @foo_impl, void ()** @foo_external ret void } From 917e9a38ca5d5fc5cd567fd209864e62d5e641ad Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Tue, 8 Dec 2015 19:13:31 +0000 Subject: [PATCH 232/364] Revert "Add Available Externally linkage type to isWeakForLinker()" This reverts r255043, as post-review concerns were raised about its correctness. 
From: Mehdi Amini git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@255045 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/GlobalValue.h | 3 +-- .../available_externally_global_ctors.ll | 22 ------------------- 2 files changed, 1 insertion(+), 24 deletions(-) delete mode 100644 test/Transforms/GlobalOpt/available_externally_global_ctors.ll diff --git a/include/llvm/IR/GlobalValue.h b/include/llvm/IR/GlobalValue.h index 3461b9ee8b82..2f9172648a26 100644 --- a/include/llvm/IR/GlobalValue.h +++ b/include/llvm/IR/GlobalValue.h @@ -256,8 +256,7 @@ class GlobalValue : public Constant { static bool isWeakForLinker(LinkageTypes Linkage) { return Linkage == WeakAnyLinkage || Linkage == WeakODRLinkage || Linkage == LinkOnceAnyLinkage || Linkage == LinkOnceODRLinkage || - Linkage == CommonLinkage || Linkage == ExternalWeakLinkage || - Linkage == AvailableExternallyLinkage; + Linkage == CommonLinkage || Linkage == ExternalWeakLinkage; } bool hasExternalLinkage() const { return isExternalLinkage(Linkage); } diff --git a/test/Transforms/GlobalOpt/available_externally_global_ctors.ll b/test/Transforms/GlobalOpt/available_externally_global_ctors.ll deleted file mode 100644 index 7092a5ae2226..000000000000 --- a/test/Transforms/GlobalOpt/available_externally_global_ctors.ll +++ /dev/null @@ -1,22 +0,0 @@ -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.11.0" - -; RUN: opt -S -globalopt < %s | FileCheck %s - -; Verify that the initialization of the available_externally global is not eliminated -; CHECK: @llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @foo_static_init, i8* null }] - -@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @foo_static_init, i8* null }] -@foo_external = available_externally global void ()* null - -define internal void @foo_static_init() { -entry: - store void ()* @foo_impl, void ()** @foo_external - ret void -} - -define internal void @foo_impl() { -entry: - ret void -} - From d10549743ae65fcc4420a5b07606f97f5cac4bae Mon Sep 17 00:00:00 2001 From: Artyom Skrobov Date: Tue, 8 Dec 2015 19:59:01 +0000 Subject: [PATCH 233/364] Fix ARMv4T (Thumb1) epilogue generation Summary: Before ARMv5T, Thumb1 code could not pop PC, as described at D14357 and D14986; so we need the special fixup in the epilogue. Reviewers: jroelofs, qcolombet Subscribers: aemerson, llvm-commits, rengolin Differential Revision: http://reviews.llvm.org/D15126 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@255047 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/Thumb1FrameLowering.cpp | 41 ++++++-- test/CodeGen/ARM/debug-frame.ll | 4 +- test/CodeGen/Thumb/large-stack.ll | 20 ++-- test/CodeGen/Thumb/pop-special-fixup.ll | 60 ------------ test/CodeGen/Thumb/thumb-shrink-wrapping.ll | 102 +++++++++++++++++--- 5 files changed, 132 insertions(+), 95 deletions(-) delete mode 100644 test/CodeGen/Thumb/pop-special-fixup.ll diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp index fd96af6cb6e0..8771c68e5931 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -406,9 +406,6 @@ bool Thumb1FrameLowering::needPopSpecialFixUp(const MachineFunction &MF) const { if (AFI->getArgRegsSaveSize()) return true; - // FIXME: this doesn't make sense, and the following patch will remove it. 
- if (!STI.hasV4TOps()) return false; - // LR cannot be encoded with Thumb1, i.e., it requires a special fix-up. for (const CalleeSavedInfo &CSI : MF.getFrameInfo()->getCalleeSavedInfo()) if (CSI.getReg() == ARM::LR) @@ -532,10 +529,32 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, .addReg(PopReg, RegState::Kill)); } + bool AddBx = false; if (MBBI == MBB.end()) { MachineInstr& Pop = MBB.back(); assert(Pop.getOpcode() == ARM::tPOP); Pop.RemoveOperand(Pop.findRegisterDefOperandIdx(ARM::LR)); + } else if (MBBI->getOpcode() == ARM::tPOP_RET) { + // We couldn't use the direct restoration above, so + // perform the opposite conversion: tPOP_RET to tPOP. + MachineInstrBuilder MIB = + AddDefaultPred( + BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP))); + unsigned Popped = 0; + for (auto MO: MBBI->operands()) + if (MO.isReg() && (MO.isImplicit() || MO.isDef()) && + MO.getReg() != ARM::PC) { + MIB.addOperand(MO); + if (!MO.isImplicit()) + Popped++; + } + // Is there anything left to pop? + if (!Popped) + MBB.erase(MIB.getInstr()); + // Erase the old instruction. + MBB.erase(MBBI); + MBBI = MBB.end(); + AddBx = true; } assert(PopReg && "Do not know how to get LR"); @@ -554,14 +573,20 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, return true; } - AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr)) - .addReg(ARM::LR, RegState::Define) - .addReg(PopReg, RegState::Kill)); - + if (AddBx && !TemporaryReg) { + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX)) + .addReg(PopReg, RegState::Kill)); + } else { + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr)) + .addReg(ARM::LR, RegState::Define) + .addReg(PopReg, RegState::Kill)); + } if (TemporaryReg) { AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr)) .addReg(PopReg, RegState::Define) .addReg(TemporaryReg, RegState::Kill)); + if (AddBx) + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX_RET))); } return true; @@ -628,7 +653,7 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB, if (isVarArg) continue; // ARMv4T requires BX, see emitEpilogue - if (STI.hasV4TOps() && !STI.hasV5TOps()) + if (!STI.hasV5TOps()) continue; Reg = ARM::PC; (*MIB).setDesc(TII.get(ARM::tPOP_RET)); diff --git a/test/CodeGen/ARM/debug-frame.ll b/test/CodeGen/ARM/debug-frame.ll index 33d389698995..4bd401b68496 100644 --- a/test/CodeGen/ARM/debug-frame.ll +++ b/test/CodeGen/ARM/debug-frame.ll @@ -30,11 +30,11 @@ ; RUN: -filetype=asm -o - %s \ ; RUN: | FileCheck %s --check-prefix=CHECK-V7-FP-ELIM -; RUN: llc -mtriple thumb-unknown-linux-gnueabi \ +; RUN: llc -mtriple thumbv5-unknown-linux-gnueabi \ ; RUN: -disable-fp-elim -filetype=asm -o - %s \ ; RUN: | FileCheck %s --check-prefix=CHECK-THUMB-FP -; RUN: llc -mtriple thumb-unknown-linux-gnueabi \ +; RUN: llc -mtriple thumbv5-unknown-linux-gnueabi \ ; RUN: -filetype=asm -o - %s \ ; RUN: | FileCheck %s --check-prefix=CHECK-THUMB-FP-ELIM diff --git a/test/CodeGen/Thumb/large-stack.ll b/test/CodeGen/Thumb/large-stack.ll index 0d534589ae0a..c5d1044e9d69 100644 --- a/test/CodeGen/Thumb/large-stack.ll +++ b/test/CodeGen/Thumb/large-stack.ll @@ -32,10 +32,10 @@ define void @test100() { ; Smallest stack for which we use a constant pool define void @test2() { ; CHECK-LABEL: test2: -; CHECK: ldr r0, -; CHECK: add sp, r0 -; EABI: ldr r0, -; EABI: add sp, r0 +; CHECK: ldr [[TEMP:r[0-7]]], +; CHECK: add sp, [[TEMP]] +; EABI: ldr [[TEMP:r[0-7]]], +; EABI: add sp, [[TEMP]] ; IOS: subs r4, r7, #4 ; IOS: mov sp, r4 %tmp = alloca [ 1528 x i8 ] , align 4 @@ 
-44,12 +44,12 @@ define void @test2() { define i32 @test3() { ; CHECK-LABEL: test3: -; CHECK: ldr r1, -; CHECK: add sp, r1 -; CHECK: ldr r1, -; CHECK: add r1, sp -; EABI: ldr r1, -; EABI: add sp, r1 +; CHECK: ldr [[TEMP:r[0-7]]], +; CHECK: add sp, [[TEMP]] +; CHECK: ldr [[TEMP]], +; CHECK: add [[TEMP]], sp +; EABI: ldr [[TEMP:r[0-7]]], +; EABI: add sp, [[TEMP]] ; IOS: subs r4, r7, #4 ; IOS: mov sp, r4 %retval = alloca i32, align 4 diff --git a/test/CodeGen/Thumb/pop-special-fixup.ll b/test/CodeGen/Thumb/pop-special-fixup.ll deleted file mode 100644 index 9ba589d6cec3..000000000000 --- a/test/CodeGen/Thumb/pop-special-fixup.ll +++ /dev/null @@ -1,60 +0,0 @@ -; RUN: llc %s -enable-shrink-wrap=true -o - | FileCheck %s - -target triple = "thumbv6m-none-none-eabi" - -@retval = global i32 0, align 4 - -define i32 @test(i32 %i, i32 %argc, i8** nocapture readonly %argv) { - %1 = icmp sgt i32 %argc, %i - br i1 %1, label %2, label %19 - - %3 = getelementptr inbounds i8*, i8** %argv, i32 %i - %4 = load i8*, i8** %3, align 4 - %5 = load i8, i8* %4, align 1 - %6 = icmp eq i8 %5, 45 - %7 = getelementptr inbounds i8, i8* %4, i32 1 - %. = select i1 %6, i8* %7, i8* %4 - %.1 = select i1 %6, i32 -1, i32 1 - %8 = load i8, i8* %., align 1 - %.off2 = add i8 %8, -48 - %9 = icmp ult i8 %.off2, 10 - %.pre = load i32, i32* @retval, align 4 - br i1 %9, label %.lr.ph.preheader, label %.critedge - -.lr.ph.preheader: ; preds = %2 - br label %.lr.ph - -.lr.ph: ; preds = %.lr.ph.preheader, %.lr.ph - %10 = phi i32 [ %14, %.lr.ph ], [ %.pre, %.lr.ph.preheader ] - %11 = phi i8 [ %15, %.lr.ph ], [ %8, %.lr.ph.preheader ] - %valstring.03 = phi i8* [ %13, %.lr.ph ], [ %., %.lr.ph.preheader ] - %12 = zext i8 %11 to i32 - %13 = getelementptr inbounds i8, i8* %valstring.03, i32 1 - %14 = add nsw i32 %10, %12 - store i32 %14, i32* @retval, align 4 - %15 = load i8, i8* %13, align 1 - %.off = add i8 %15, -48 - %16 = icmp ult i8 %.off, 10 - br i1 %16, label %.lr.ph, label %.critedge.loopexit - -.critedge.loopexit: ; preds = %.lr.ph - %.lcssa = phi i32 [ %14, %.lr.ph ] - br label %.critedge - -.critedge: ; preds = %.critedge.loopexit, %2 - %17 = phi i32 [ %.pre, %2 ], [ %.lcssa, %.critedge.loopexit ] - %18 = mul nsw i32 %17, %.1 - store i32 %18, i32* @retval, align 4 - br label %19 - -;