diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp
index 281d5655ef9b..e660e210804a 100644
--- a/Source/Core/Common/Arm64Emitter.cpp
+++ b/Source/Core/Common/Arm64Emitter.cpp
@@ -19,21 +19,49 @@ namespace Arm64Gen
 {
 namespace
 {
+
 const int kWRegSizeInBits = 32;
 const int kXRegSizeInBits = 64;
 
 // The below few functions are taken from V8.
-int CountLeadingZeros(uint64_t value, int width)
-{
-  // TODO(jbramley): Optimize this for ARM64 hosts.
-  int count = 0;
-  uint64_t bit_test = 1ULL << (width - 1);
-  while ((count < width) && ((bit_test & value) == 0))
-  {
-    count++;
-    bit_test >>= 1;
-  }
-  return count;
+// CountPopulation32(value) returns the number of bits set in |value|.
+inline unsigned CountPopulation32(uint32_t value)
+{
+  value = ((value >> 1) & 0x55555555) + (value & 0x55555555);
+  value = ((value >> 2) & 0x33333333) + (value & 0x33333333);
+  value = ((value >> 4) & 0x0f0f0f0f) + (value & 0x0f0f0f0f);
+  value = ((value >> 8) & 0x00ff00ff) + (value & 0x00ff00ff);
+  value = ((value >> 16) & 0x0000ffff) + (value & 0x0000ffff);
+  return static_cast<unsigned>(value);
+}
+// CountPopulation64(value) returns the number of bits set in |value|.
+inline unsigned CountPopulation64(uint64_t value)
+{
+  return CountPopulation32(static_cast<uint32_t>(value)) +
+         CountPopulation32(static_cast<uint32_t>(value >> 32));
+}
+// CountLeadingZeros32(value) returns the number of zero bits preceding the most
+// significant 1 bit in |value| if |value| is non-zero, otherwise it returns 32.
+inline unsigned CountLeadingZeros32(uint32_t value)
+{
+  value = value | (value >> 1);
+  value = value | (value >> 2);
+  value = value | (value >> 4);
+  value = value | (value >> 8);
+  value = value | (value >> 16);
+  return CountPopulation32(~value);
+}
+// CountLeadingZeros64(value) returns the number of zero bits preceding the most
+// significant 1 bit in |value| if |value| is non-zero, otherwise it returns 64.
+inline unsigned CountLeadingZeros64(uint64_t value)
+{
+  value = value | (value >> 1);
+  value = value | (value >> 2);
+  value = value | (value >> 4);
+  value = value | (value >> 8);
+  value = value | (value >> 16);
+  value = value | (value >> 32);
+  return CountPopulation64(~value);
 }
 
 uint64_t LargestPowerOf2Divisor(uint64_t value)
@@ -41,24 +69,6 @@ uint64_t LargestPowerOf2Divisor(uint64_t value)
   return value & -(int64_t)value;
 }
 
-// For ADD/SUB
-bool IsImmArithmetic(uint64_t input, u32* val, bool* shift)
-{
-  if (input < 4096)
-  {
-    *val = input;
-    *shift = false;
-    return true;
-  }
-  else if ((input & 0xFFF000) == input)
-  {
-    *val = input >> 12;
-    *shift = true;
-    return true;
-  }
-  return false;
-}
-
 // For AND/TST/ORR/EOR etc
 bool IsImmLogical(uint64_t value, unsigned int width, unsigned int* n, unsigned int* imm_s,
                   unsigned int* imm_r)
@@ -151,8 +161,8 @@ bool IsImmLogical(uint64_t value, unsigned int width, unsigned int* n, unsigned
     // Compute the repeat distance d, and set up a bitmask covering the basic
     // unit of repetition (i.e. a word with the bottom d bits set). Also, in all
     // of these cases the N bit of the output will be zero.
-    clz_a = CountLeadingZeros(a, kXRegSizeInBits);
-    int clz_c = CountLeadingZeros(c, kXRegSizeInBits);
+    clz_a = CountLeadingZeros64(a);
+    int clz_c = CountLeadingZeros64(c);
     d = clz_a - clz_c;
     mask = ((UINT64_C(1) << d) - 1);
     out_n = 0;
@@ -178,7 +188,7 @@ bool IsImmLogical(uint64_t value, unsigned int width, unsigned int* n, unsigned
     // of set bits in our word, meaning that we have the trivial case of
     // d == 64 and only one 'repetition'. Set up all the same variables as in
     // the general case above, and set the N bit in the output.
-    clz_a = CountLeadingZeros(a, kXRegSizeInBits);
+    clz_a = CountLeadingZeros64(a);
     d = 64;
     mask = ~UINT64_C(0);
     out_n = 1;
@@ -210,7 +220,7 @@ bool IsImmLogical(uint64_t value, unsigned int width, unsigned int* n, unsigned
       0x5555555555555555UL,
   }};
 
-  int multiplier_idx = CountLeadingZeros(d, kXRegSizeInBits) - 57;
+  int multiplier_idx = CountLeadingZeros64(d) - 57;
   // Ensure that the index to the multipliers array is within bounds.
   DEBUG_ASSERT((multiplier_idx >= 0) &&
                (static_cast<size_t>(multiplier_idx) < multipliers.size()));
@@ -229,7 +239,7 @@ bool IsImmLogical(uint64_t value, unsigned int width, unsigned int* n, unsigned
   // Count the set bits in our basic stretch. The special case of clz(0) == -1
   // makes the answer come out right for stretches that reach the very top of
   // the word (e.g. numbers like 0xffffc00000000000).
-  int clz_b = (b == 0) ? -1 : CountLeadingZeros(b, kXRegSizeInBits);
+  int clz_b = (b == 0) ? -1 : CountLeadingZeros64(b);
   int s = clz_a - clz_b;
 
   // Decide how many bits to rotate right by, to put the low bit of that basic
@@ -2002,14 +2012,9 @@ void ARM64XEmitter::ADRP(ARM64Reg Rd, s32 imm)
   EncodeAddressInst(1, Rd, imm >> 12);
 }
-
-static int Count(bool part[4]) {
-  int cnt = 0;
-  for (int i = 0; i < 4; i++) {
-    if (part[i])
-      cnt++;
-  }
-  return cnt;
+static int UploadPartCount(bool part[4])
+{
+  return part[0] + part[1] + part[2] + part[3];
 }
 
 // Wrapper around MOVZ+MOVK (and later MOVN)
 void ARM64XEmitter::MOVI2R(ARM64Reg Rd, u64 imm, bool optimize)
@@ -2061,7 +2066,7 @@ void ARM64XEmitter::MOVI2R(ARM64Reg Rd, u64 imm, bool optimize)
     u64 aligned_pc = (u64)GetCodePtr() & ~0xFFF;
     s64 aligned_offset = (s64)imm - (s64)aligned_pc;
     // The offset for ADR/ADRP is an s32, so make sure it can be represented as one
-    if (Count(upload_part) > 1 && std::abs(aligned_offset) < 0x7FFFFFFFLL)
+    if (UploadPartCount(upload_part) > 1 && std::abs(aligned_offset) < 0x7FFFFFFFLL)
     {
       // The immediate we are loading is within 4GB of our aligned range
       // Most likely an address that we can load in one or two instructions
@@ -2104,28 +2109,6 @@ void ARM64XEmitter::MOVI2R(ARM64Reg Rd, u64 imm, bool optimize)
   }
 }
 
-bool ARM64XEmitter::MOVI2R2(ARM64Reg Rd, u64 imm1, u64 imm2)
-{
-  // TODO: Also optimize for performance, not just for code size.
-  u8* start_pointer = GetWritableCodePtr();
-
-  MOVI2R(Rd, imm1);
-  int size1 = GetCodePtr() - start_pointer;
-
-  SetCodePtrUnsafe(start_pointer);
-
-  MOVI2R(Rd, imm2);
-  int size2 = GetCodePtr() - start_pointer;
-
-  SetCodePtrUnsafe(start_pointer);
-
-  bool element = size1 > size2;
-
-  MOVI2R(Rd, element ? imm2 : imm1);
-
-  return element;
-}
-
 void ARM64XEmitter::ABI_PushRegisters(BitSet32 registers)
 {
   int num_regs = registers.Count();
@@ -2236,7 +2219,7 @@ void ARM64FloatEmitter::EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type,
   else
   {
     ASSERT_MSG(DYNA_REC, !(imm < -256 || imm > 255),
-               "%s immediate offset must be within range of -256 to 256!", __func__);
+               "%s immediate offset must be within range of -256 to 255!", __func__);
     encoded_imm = (imm & 0x1FF) << 2;
     if (type == INDEX_POST)
       encoded_imm |= 1;
@@ -4103,6 +4086,10 @@ void ARM64XEmitter::ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch)
   {
     AND(Rd, Rn, imm_r, imm_s, n != 0);
   }
+  else if (imm == 0)
+  {
+    MOVI2R(Rd, 0);
+  }
   else
   {
     ASSERT_MSG(DYNA_REC, scratch != INVALID_REG,
@@ -4120,6 +4107,13 @@ void ARM64XEmitter::ORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch)
   {
     ORR(Rd, Rn, imm_r, imm_s, n != 0);
   }
+  else if (imm == 0)
+  {
+    if (Rd != Rn)
+    {
+      MOV(Rd, Rn);
+    }
+  }
   else
   {
     ASSERT_MSG(DYNA_REC, scratch != INVALID_REG,
@@ -4137,6 +4131,13 @@ void ARM64XEmitter::EORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch)
   {
     EOR(Rd, Rn, imm_r, imm_s, n != 0);
   }
+  else if (imm == 0)
+  {
+    if (Rd != Rn)
+    {
+      MOV(Rd, Rn);
+    }
+  }
   else
   {
     ASSERT_MSG(DYNA_REC, scratch != INVALID_REG,
@@ -4154,6 +4155,10 @@ void ARM64XEmitter::ANDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch)
   {
     ANDS(Rd, Rn, imm_r, imm_s, n != 0);
   }
+  else if (imm == 0)
+  {
+    ANDS(Rd, Rn, Is64Bit(Rn) ? ZR : WZR, ArithOption(Rd, ST_LSL, 0));
+  }
   else
   {
     ASSERT_MSG(DYNA_REC, scratch != INVALID_REG,
@@ -4187,10 +4192,6 @@ void ARM64XEmitter::AddImmediate(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool shift,
 void ARM64XEmitter::ADDI2R_internal(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool negative, bool flags,
                                     ARM64Reg scratch)
 {
-  bool has_scratch = scratch != INVALID_REG;
-  u64 imm_neg = Is64Bit(Rd) ? -imm : -imm & 0xFFFFFFFFuLL;
-  bool neg_neg = negative ? false : true;
-
   // Fast paths, aarch64 immediate instructions
   // Try them all first
   if (imm <= 0xFFF)
@@ -4198,17 +4199,20 @@ void ARM64XEmitter::ADDI2R_internal(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool nega
     AddImmediate(Rd, Rn, imm, false, negative, flags);
     return;
   }
-  if (imm <= 0xFFFFFF && (imm & 0xFFF) == 0)
+  if ((imm & 0xFFF000) == imm)
   {
     AddImmediate(Rd, Rn, imm >> 12, true, negative, flags);
     return;
   }
+
+  u64 imm_neg = Is64Bit(Rd) ? -imm : -imm & 0xFFFFFFFFuLL;
+  bool neg_neg = negative ? false : true;
   if (imm_neg <= 0xFFF)
  {
     AddImmediate(Rd, Rn, imm_neg, false, neg_neg, flags);
     return;
   }
-  if (imm_neg <= 0xFFFFFF && (imm_neg & 0xFFF) == 0)
+  if ((imm_neg & 0xFFF000) == imm_neg)
  {
     AddImmediate(Rd, Rn, imm_neg >> 12, true, neg_neg, flags);
     return;
@@ -4217,6 +4221,7 @@ void ARM64XEmitter::ADDI2R_internal(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool nega
   // ADD+ADD is slower than MOVK+ADD, but works in place.
   // But it supports a few more bits, so use it to avoid MOVK+MOVK+ADD.
   // As this splits the addition into two parts, it must not be done when setting flags.
+  bool has_scratch = scratch != INVALID_REG;
   if (!flags && (imm >= 0x10000u || !has_scratch) && imm < 0x1000000u)
   {
     AddImmediate(Rd, Rn, imm & 0xFFF, false, negative, false);
@@ -4234,7 +4239,7 @@ void ARM64XEmitter::ADDI2R_internal(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool nega
              "ADDI2R - failed to construct arithmetic immediate value from %08x, need scratch",
              (u32)imm);
 
-  negative ^= MOVI2R2(scratch, imm, imm_neg);
+  MOVI2R(scratch, imm);
   switch ((negative << 1) | flags)
   {
   case 0:
@@ -4254,6 +4259,12 @@ void ARM64XEmitter::ADDI2R_internal(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool nega
 
 void ARM64XEmitter::ADDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch)
 {
+  if (imm == 0)
+  {
+    // Prefer MOV (ORR) instead of ADD for moves.
+    MOV(Rd, Rn);
+    return;
+  }
   ADDI2R_internal(Rd, Rn, imm, false, false, scratch);
 }
 
@@ -4264,6 +4275,12 @@ void ARM64XEmitter::ADDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch)
 
 void ARM64XEmitter::SUBI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch)
 {
+  if (imm == 0)
+  {
+    // Prefer MOV (ORR) instead of SUB for moves.
+    MOV(Rd, Rn);
+    return;
+  }
   ADDI2R_internal(Rd, Rn, imm, true, false, scratch);
 }
 
@@ -4277,73 +4294,6 @@ void ARM64XEmitter::CMPI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch)
   ADDI2R_internal(Is64Bit(Rn) ? ZR : WZR, Rn, imm, true, true, scratch);
 }
 
-bool ARM64XEmitter::TryADDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm)
-{
-  u32 val;
-  bool shift;
-  if (IsImmArithmetic(imm, &val, &shift))
-    ADD(Rd, Rn, val, shift);
-  else
-    return false;
-
-  return true;
-}
-
-bool ARM64XEmitter::TrySUBI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm)
-{
-  u32 val;
-  bool shift;
-  if (IsImmArithmetic(imm, &val, &shift))
-    SUB(Rd, Rn, val, shift);
-  else
-    return false;
-
-  return true;
-}
-
-bool ARM64XEmitter::TryCMPI2R(ARM64Reg Rn, u32 imm)
-{
-  u32 val;
-  bool shift;
-  if (IsImmArithmetic(imm, &val, &shift))
-    CMP(Rn, val, shift);
-  else
-    return false;
-
-  return true;
-}
-
-bool ARM64XEmitter::TryANDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm)
-{
-  u32 n, imm_r, imm_s;
-  if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r))
-    AND(Rd, Rn, imm_r, imm_s, n != 0);
-  else
-    return false;
-
-  return true;
-}
-bool ARM64XEmitter::TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm)
-{
-  u32 n, imm_r, imm_s;
-  if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r))
-    ORR(Rd, Rn, imm_r, imm_s, n != 0);
-  else
-    return false;
-
-  return true;
-}
-bool ARM64XEmitter::TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm)
-{
-  u32 n, imm_r, imm_s;
-  if (IsImmLogical(imm, 32, &n, &imm_s, &imm_r))
-    EOR(Rd, Rn, imm_r, imm_s, n != 0);
-  else
-    return false;
-
-  return true;
-}
-
 void ARM64FloatEmitter::MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch, bool negate)
 {
   ASSERT_MSG(DYNA_REC, !IsDouble(Rd), "MOVI2F does not yet support double precision");
diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h
index 060de45e3c5a..46f14a36231a 100644
--- a/Source/Core/Common/Arm64Emitter.h
+++ b/Source/Core/Common/Arm64Emitter.h
@@ -833,7 +833,6 @@ class ARM64XEmitter
 
   // Wrapper around MOVZ+MOVK
   void MOVI2R(ARM64Reg Rd, u64 imm, bool optimize = true);
-  bool MOVI2R2(ARM64Reg Rd, u64 imm1, u64 imm2);
   template <class P>
   void MOVP2R(ARM64Reg Rd, P* ptr)
   {
@@ -860,14 +859,6 @@ class ARM64XEmitter
   void SUBI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
   void SUBSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
 
-  bool TryADDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
-  bool TrySUBI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
-  bool TryCMPI2R(ARM64Reg Rn, u32 imm);
-
-  bool TryANDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
-  bool TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
-  bool TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
-
   // ABI related
   void ABI_PushRegisters(BitSet32 registers);
   void ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask = BitSet32(0));
diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
index d59ba87fbee1..ddf2bec6ad73 100644
--- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
@@ -753,7 +753,8 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std:
   u32 numFollows = 0;
   u32 num_inst = 0;
 
-  const u32 branchFollowThreshold = SConfig::GetInstance().bJITFollowBranch ? 2 : 0;
+  const u32 branchFollowThreshold =
+      (HasOption(OPTION_BRANCH_FOLLOW) && SConfig::GetInstance().bJITFollowBranch) ? 2 : 0;
 
   for (std::size_t i = 0; i < block_size; ++i)
   {
@@ -781,7 +782,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std:
       // If it is small, the performance will suffer.
       // If it is big, the size of the generated code will be big and
       // cache clearing will happen many times.
-      if (HasOption(OPTION_BRANCH_FOLLOW) && numFollows < branchFollowThreshold)
+      if (numFollows < branchFollowThreshold)
       {
         bool follow = false;
         switch (inst.OPCD)
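Editor's note on the CountLeadingZeros rewrite above: the V8-derived replacement is branchless. The shift-or cascade smears the most significant set bit into every lower position, so the complement has exactly one set bit per leading zero, and the pairwise popcount reduction then counts them. The standalone sketch below (not part of the patch; ReferenceClz re-creates the removed loop-based implementation purely as an oracle) checks the two against each other:

#include <cassert>
#include <cstdint>

static unsigned CountPopulation32(uint32_t value)
{
  // Pairwise-sum reduction: 1-bit fields, then 2-, 4-, 8-, and 16-bit fields.
  value = ((value >> 1) & 0x55555555) + (value & 0x55555555);
  value = ((value >> 2) & 0x33333333) + (value & 0x33333333);
  value = ((value >> 4) & 0x0f0f0f0f) + (value & 0x0f0f0f0f);
  value = ((value >> 8) & 0x00ff00ff) + (value & 0x00ff00ff);
  value = ((value >> 16) & 0x0000ffff) + (value & 0x0000ffff);
  return static_cast<unsigned>(value);
}

static unsigned CountLeadingZeros64(uint64_t value)
{
  // Smear the most significant 1 bit downward, then count the zeros above it.
  value |= value >> 1;
  value |= value >> 2;
  value |= value >> 4;
  value |= value >> 8;
  value |= value >> 16;
  value |= value >> 32;
  return CountPopulation32(static_cast<uint32_t>(~value)) +
         CountPopulation32(static_cast<uint32_t>(~value >> 32));
}

// The removed loop-based implementation, kept here only as a reference oracle.
static int ReferenceClz(uint64_t value, int width)
{
  int count = 0;
  uint64_t bit_test = 1ULL << (width - 1);
  while ((count < width) && ((bit_test & value) == 0))
  {
    count++;
    bit_test >>= 1;
  }
  return count;
}

int main()
{
  const uint64_t samples[] = {0, 1, 0x80000000ULL, 0xffffc00000000000ULL,
                              0x0000000100000001ULL, ~0ULL};
  for (uint64_t v : samples)
    assert(static_cast<int>(CountLeadingZeros64(v)) == ReferenceClz(v, 64));
  return 0;
}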
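A second note, on the reordered fast paths in ADDI2R_internal: the new test (imm & 0xFFF000) == imm is equivalent to the old imm <= 0xFFFFFF && (imm & 0xFFF) == 0 — both accept exactly the immediates that occupy only bits 12..23 and therefore fit an ADD/SUB with LSL #12 — and the negated immediate is now computed only after the two positive checks fail. A sketch of the decision ladder (illustrative only; AddImmKind and ClassifyAddImm are names invented here, not emitter API):

#include <cstdint>
#include <cstdio>

enum class AddImmKind
{
  Add12,          // ADD Rd, Rn, #imm           (imm in [0, 0xFFF])
  Add12Shifted,   // ADD Rd, Rn, #imm, LSL #12  (only bits 12..23 set)
  Sub12,          // SUB via the negated immediate
  Sub12Shifted,
  NeedsMoreWork,  // falls through to ADD+ADD or MOVI2R into a scratch register
};

static AddImmKind ClassifyAddImm(uint64_t imm, bool is_64bit)
{
  if (imm <= 0xFFF)
    return AddImmKind::Add12;
  if ((imm & 0xFFF000) == imm)  // same as: imm <= 0xFFFFFF && (imm & 0xFFF) == 0
    return AddImmKind::Add12Shifted;

  // Only now is the negation worth computing, mirroring the patched function.
  const uint64_t imm_neg = is_64bit ? 0 - imm : (0 - imm) & 0xFFFFFFFFULL;
  if (imm_neg <= 0xFFF)
    return AddImmKind::Sub12;
  if ((imm_neg & 0xFFF000) == imm_neg)
    return AddImmKind::Sub12Shifted;
  return AddImmKind::NeedsMoreWork;
}

int main()
{
  // 0x123000 uses only bits 12..23, so it encodes as a shifted ADD;
  // ~0ULL is -1, whose negation 1 encodes as a plain SUB.
  std::printf("%d\n", static_cast<int>(ClassifyAddImm(0x123000, true)));  // 1
  std::printf("%d\n", static_cast<int>(ClassifyAddImm(~0ULL, true)));     // 2
  return 0;
}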
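Finally, the new imm == 0 special cases in ANDI2R/ORRI2R/EORI2R/ADDI2R/SUBI2R follow from simple identities — x & 0 == 0, x | 0 == x, x ^ 0 == x, x + 0 == x - 0 == x — so AND collapses to loading zero and the others collapse to a register move (skipped entirely for the logical ops when Rd == Rn). ANDSI2R cannot take the same shortcut because it must still update NZCV, hence the ANDS against the zero register. A trivial self-check of those rewrite rules (a sketch, not emitter code):

#include <cassert>
#include <cstdint>

int main()
{
  for (uint64_t rn : {0ULL, 1ULL, 0xDEADBEEFULL, ~0ULL})
  {
    assert((rn & 0) == 0);   // ANDI2R(Rd, Rn, 0)  -> MOVI2R(Rd, 0)
    assert((rn | 0) == rn);  // ORRI2R(Rd, Rn, 0)  -> MOV(Rd, Rn) when Rd != Rn
    assert((rn ^ 0) == rn);  // EORI2R(Rd, Rn, 0)  -> MOV(Rd, Rn) when Rd != Rn
    assert(rn + 0 == rn);    // ADDI2R(Rd, Rn, 0)  -> MOV(Rd, Rn)
    assert(rn - 0 == rn);    // SUBI2R(Rd, Rn, 0)  -> MOV(Rd, Rn)
  }
  return 0;
}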