Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

close #11330 sets uses optimized countSetBits #17334

Merged
merged 18 commits into from
Mar 21, 2021
157 changes: 9 additions & 148 deletions lib/pure/bitops.nim
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,13 @@

import macros
import std/private/since
from std/private/vmutils import forwardImpl, toUnsigned


# sets.nim cannot import bitops, but bitops can use include
# system/sets to eliminate code duplication. sets.nim defines
# countBits32 and countBits64.
include system/sets

func bitnot*[T: SomeInteger](x: T): T {.magic: "BitnotI".}
## Computes the `bitwise complement` of the integer `x`.
Expand Down Expand Up @@ -58,34 +65,12 @@ macro bitxor*[T: SomeInteger](x, y: T; z: varargs[T]): T =
for extra in z:
result = newCall(fn, result, extra)

const useBuiltins = not defined(noIntrinsicsBitOpts)
const noUndefined = defined(noUndefinedBitOpts)
const useGCC_builtins = (defined(gcc) or defined(llvm_gcc) or
defined(clang)) and useBuiltins
const useICC_builtins = defined(icc) and useBuiltins
const useVCC_builtins = defined(vcc) and useBuiltins

const arch64 = sizeof(int) == 8
const useBuiltinsRotate = (defined(amd64) or defined(i386)) and
(defined(gcc) or defined(clang) or defined(vcc) or
(defined(icl) and not defined(cpp))) and useBuiltins

template toUnsigned(x: int8): uint8 = cast[uint8](x)
template toUnsigned(x: int16): uint16 = cast[uint16](x)
template toUnsigned(x: int32): uint32 = cast[uint32](x)
template toUnsigned(x: int64): uint64 = cast[uint64](x)
template toUnsigned(x: int): uint = cast[uint](x)

template forwardImpl(impl, arg) {.dirty.} =
when sizeof(x) <= 4:
when x is SomeSignedInt:
impl(cast[uint32](x.int32))
else:
impl(x.uint32)
else:
when x is SomeSignedInt:
impl(cast[uint64](x.int64))
else:
impl(x.uint64)

type BitsRange*[T] = range[0..sizeof(T)*8-1]
## A range with all bit positions for type `T`.
Expand Down Expand Up @@ -436,134 +421,10 @@ func fastlog2Nim(x: uint64): int {.inline.} =
v = v or v shr 32
result = lookup[(v * 0x03F6EAF2CD271461'u64) shr 58].int

# sets.nim cannot import bitops, but bitops can use include
# system/sets to eliminate code duplication. sets.nim defines
# countBits32 and countBits64.
include system/sets

template countSetBitsNim(n: uint32): int = countBits32(n)
template countSetBitsNim(n: uint64): int = countBits64(n)

template parityImpl[T](value: T): int =
# formula id from: https://graphics.stanford.edu/%7Eseander/bithacks.html#ParityParallel
var v = value
when sizeof(T) == 8:
v = v xor (v shr 32)
when sizeof(T) >= 4:
v = v xor (v shr 16)
when sizeof(T) >= 2:
v = v xor (v shr 8)
v = v xor (v shr 4)
v = v and 0xf
((0x6996'u shr v) and 1).int


when useGCC_builtins:
# Returns the number of set 1-bits in value.
proc builtin_popcount(x: cuint): cint {.importc: "__builtin_popcount", cdecl.}
proc builtin_popcountll(x: culonglong): cint {.
importc: "__builtin_popcountll", cdecl.}

# Returns the bit parity in value
proc builtin_parity(x: cuint): cint {.importc: "__builtin_parity", cdecl.}
proc builtin_parityll(x: culonglong): cint {.importc: "__builtin_parityll", cdecl.}

# Returns one plus the index of the least significant 1-bit of x, or if x is zero, returns zero.
proc builtin_ffs(x: cint): cint {.importc: "__builtin_ffs", cdecl.}
proc builtin_ffsll(x: clonglong): cint {.importc: "__builtin_ffsll", cdecl.}

# Returns the number of leading 0-bits in x, starting at the most significant bit position. If x is 0, the result is undefined.
proc builtin_clz(x: cuint): cint {.importc: "__builtin_clz", cdecl.}
proc builtin_clzll(x: culonglong): cint {.importc: "__builtin_clzll", cdecl.}

# Returns the number of trailing 0-bits in x, starting at the least significant bit position. If x is 0, the result is undefined.
proc builtin_ctz(x: cuint): cint {.importc: "__builtin_ctz", cdecl.}
proc builtin_ctzll(x: culonglong): cint {.importc: "__builtin_ctzll", cdecl.}

elif useVCC_builtins:
# Counts the number of one bits (population count) in a 16-, 32-, or 64-byte unsigned integer.
func builtin_popcnt16(a2: uint16): uint16 {.
importc: "__popcnt16", header: "<intrin.h>".}
func builtin_popcnt32(a2: uint32): uint32 {.
importc: "__popcnt", header: "<intrin.h>".}
func builtin_popcnt64(a2: uint64): uint64 {.
importc: "__popcnt64", header: "<intrin.h>".}

# Search the mask data from most significant bit (MSB) to least significant bit (LSB) for a set bit (1).
func bitScanReverse(index: ptr culong, mask: culong): cuchar {.
importc: "_BitScanReverse", header: "<intrin.h>".}
func bitScanReverse64(index: ptr culong, mask: uint64): cuchar {.
importc: "_BitScanReverse64", header: "<intrin.h>".}

# Search the mask data from least significant bit (LSB) to the most significant bit (MSB) for a set bit (1).
func bitScanForward(index: ptr culong, mask: culong): cuchar {.
importc: "_BitScanForward", header: "<intrin.h>".}
func bitScanForward64(index: ptr culong, mask: uint64): cuchar {.
importc: "_BitScanForward64", header: "<intrin.h>".}

template vcc_scan_impl(fnc: untyped; v: untyped): int =
var index: culong
discard fnc(index.addr, v)
index.int

elif useICC_builtins:

# Intel compiler intrinsics: http://fulla.fnal.gov/intel/compiler_c/main_cls/intref_cls/common/intref_allia_misc.htm
# see also: https://software.intel.com/en-us/node/523362
# Count the number of bits set to 1 in an integer a, and return that count in dst.
func builtin_popcnt32(a: cint): cint {.
importc: "_popcnt", header: "<immintrin.h>".}
func builtin_popcnt64(a: uint64): cint {.
importc: "_popcnt64", header: "<immintrin.h>".}

# Returns the number of trailing 0-bits in x, starting at the least significant bit position. If x is 0, the result is undefined.
func bitScanForward(p: ptr uint32, b: uint32): cuchar {.
importc: "_BitScanForward", header: "<immintrin.h>".}
func bitScanForward64(p: ptr uint32, b: uint64): cuchar {.
importc: "_BitScanForward64", header: "<immintrin.h>".}

# Returns the number of leading 0-bits in x, starting at the most significant bit position. If x is 0, the result is undefined.
func bitScanReverse(p: ptr uint32, b: uint32): cuchar {.
importc: "_BitScanReverse", header: "<immintrin.h>".}
func bitScanReverse64(p: ptr uint32, b: uint64): cuchar {.
importc: "_BitScanReverse64", header: "<immintrin.h>".}

template icc_scan_impl(fnc: untyped; v: untyped): int =
var index: uint32
discard fnc(index.addr, v)
index.int


func countSetBits*(x: SomeInteger): int {.inline.} =
## Counts the set bits in an integer (also called `Hamming weight`:idx:).
runnableExamples:
doAssert countSetBits(0b0000_0011'u8) == 2
doAssert countSetBits(0b1010_1010'u8) == 4

# TODO: figure out if ICC support _popcnt32/_popcnt64 on platform without POPCNT.
# like GCC and MSVC
when x is SomeSignedInt:
let x = x.toUnsigned
when nimvm:
result = forwardImpl(countSetBitsNim, x)
else:
when useGCC_builtins:
when sizeof(x) <= 4: result = builtin_popcount(x.cuint).int
else: result = builtin_popcountll(x.culonglong).int
elif useVCC_builtins:
when sizeof(x) <= 2: result = builtin_popcnt16(x.uint16).int
elif sizeof(x) <= 4: result = builtin_popcnt32(x.uint32).int
elif arch64: result = builtin_popcnt64(x.uint64).int
else: result = builtin_popcnt32((x.uint64 and 0xFFFFFFFF'u64).uint32).int +
builtin_popcnt32((x.uint64 shr 32'u64).uint32).int
elif useICC_builtins:
when sizeof(x) <= 4: result = builtin_popcnt32(x.cint).int
elif arch64: result = builtin_popcnt64(x.uint64).int
else: result = builtin_popcnt32((x.uint64 and 0xFFFFFFFF'u64).cint).int +
builtin_popcnt32((x.uint64 shr 32'u64).cint).int
else:
when sizeof(x) <= 4: result = countSetBitsNim(x.uint32)
else: result = countSetBitsNim(x.uint64)
result = countSetBitsImpl(x)

func popcount*(x: SomeInteger): int {.inline.} =
## Alias for `countSetBits <#countSetBits,SomeInteger>`_ (Hamming weight).
Expand Down
17 changes: 17 additions & 0 deletions lib/std/private/vmutils.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
template forwardImpl*(impl, arg) {.dirty.} =
when sizeof(x) <= 4:
when x is SomeSignedInt:
impl(cast[uint32](x.int32))
else:
impl(x.uint32)
else:
when x is SomeSignedInt:
impl(cast[uint64](x.int64))
else:
impl(x.uint64)

template toUnsigned*(x: int8): uint8 = cast[uint8](x)
template toUnsigned*(x: int16): uint16 = cast[uint16](x)
template toUnsigned*(x: int32): uint32 = cast[uint32](x)
template toUnsigned*(x: int64): uint64 = cast[uint64](x)
template toUnsigned*(x: int): uint = cast[uint](x)
142 changes: 138 additions & 4 deletions lib/system/sets.nim
Original file line number Diff line number Diff line change
Expand Up @@ -8,26 +8,160 @@
#

# set handling
from std/private/vmutils import forwardImpl, toUnsigned


const useBuiltins = not defined(noIntrinsicsBitOpts)
const noUndefined {.used.} = defined(noUndefinedBitOpts)
const useGCC_builtins = (defined(gcc) or defined(llvm_gcc) or
defined(clang)) and useBuiltins
const useICC_builtins = defined(icc) and useBuiltins
const useVCC_builtins = defined(vcc) and useBuiltins

type
NimSet = array[0..4*2048-1, uint8]

# bitops can't be imported here, therefore the code duplication.
ringabout marked this conversation as resolved.
Show resolved Hide resolved

proc countBits32(n: uint32): int {.compilerproc.} =
template countBitsImpl(n: uint32): int =
# generic formula is from: https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
var v = uint32(n)
v = v - ((v shr 1'u32) and 0x55555555'u32)
v = (v and 0x33333333'u32) + ((v shr 2'u32) and 0x33333333'u32)
result = (((v + (v shr 4'u32) and 0xF0F0F0F'u32) * 0x1010101'u32) shr 24'u32).int
(((v + (v shr 4'u32) and 0xF0F0F0F'u32) * 0x1010101'u32) shr 24'u32).int

proc countBits64(n: uint64): int {.compilerproc, inline.} =
template countBitsImpl(n: uint64): int =
# generic formula is from: https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
var v = uint64(n)
v = v - ((v shr 1'u64) and 0x5555555555555555'u64)
v = (v and 0x3333333333333333'u64) + ((v shr 2'u64) and 0x3333333333333333'u64)
v = (v + (v shr 4'u64) and 0x0F0F0F0F0F0F0F0F'u64)
result = ((v * 0x0101010101010101'u64) shr 56'u64).int
((v * 0x0101010101010101'u64) shr 56'u64).int

template parityImpl[T](value: T): int =
# formula id from: https://graphics.stanford.edu/%7Eseander/bithacks.html#ParityParallel
var v = value
when sizeof(T) == 8:
v = v xor (v shr 32)
when sizeof(T) >= 4:
v = v xor (v shr 16)
when sizeof(T) >= 2:
v = v xor (v shr 8)
v = v xor (v shr 4)
v = v and 0xf
((0x6996'u shr v) and 1).int

when useGCC_builtins:
# Returns the number of set 1-bits in value.
proc builtin_popcount(x: cuint): cint {.importc: "__builtin_popcount", cdecl.}
proc builtin_popcountll(x: culonglong): cint {.
importc: "__builtin_popcountll", cdecl.}

# Returns the bit parity in value
proc builtin_parity(x: cuint): cint {.importc: "__builtin_parity", cdecl.}
proc builtin_parityll(x: culonglong): cint {.importc: "__builtin_parityll", cdecl.}

# Returns one plus the index of the least significant 1-bit of x, or if x is zero, returns zero.
proc builtin_ffs(x: cint): cint {.importc: "__builtin_ffs", cdecl.}
proc builtin_ffsll(x: clonglong): cint {.importc: "__builtin_ffsll", cdecl.}

# Returns the number of leading 0-bits in x, starting at the most significant bit position. If x is 0, the result is undefined.
proc builtin_clz(x: cuint): cint {.importc: "__builtin_clz", cdecl.}
proc builtin_clzll(x: culonglong): cint {.importc: "__builtin_clzll", cdecl.}

# Returns the number of trailing 0-bits in x, starting at the least significant bit position. If x is 0, the result is undefined.
proc builtin_ctz(x: cuint): cint {.importc: "__builtin_ctz", cdecl.}
proc builtin_ctzll(x: culonglong): cint {.importc: "__builtin_ctzll", cdecl.}

elif useVCC_builtins:
# Counts the number of one bits (population count) in a 16-, 32-, or 64-byte unsigned integer.
func builtin_popcnt16(a2: uint16): uint16 {.
importc: "__popcnt16", header: "<intrin.h>".}
func builtin_popcnt32(a2: uint32): uint32 {.
importc: "__popcnt", header: "<intrin.h>".}
func builtin_popcnt64(a2: uint64): uint64 {.
importc: "__popcnt64", header: "<intrin.h>".}

# Search the mask data from most significant bit (MSB) to least significant bit (LSB) for a set bit (1).
func bitScanReverse(index: ptr culong, mask: culong): cuchar {.
importc: "_BitScanReverse", header: "<intrin.h>".}
func bitScanReverse64(index: ptr culong, mask: uint64): cuchar {.
importc: "_BitScanReverse64", header: "<intrin.h>".}

# Search the mask data from least significant bit (LSB) to the most significant bit (MSB) for a set bit (1).
func bitScanForward(index: ptr culong, mask: culong): cuchar {.
importc: "_BitScanForward", header: "<intrin.h>".}
func bitScanForward64(index: ptr culong, mask: uint64): cuchar {.
importc: "_BitScanForward64", header: "<intrin.h>".}

template vcc_scan_impl(fnc: untyped; v: untyped): int =
var index: culong
discard fnc(index.addr, v)
index.int

elif useICC_builtins:

# Intel compiler intrinsics: http://fulla.fnal.gov/intel/compiler_c/main_cls/intref_cls/common/intref_allia_misc.htm
# see also: https://software.intel.com/en-us/node/523362
# Count the number of bits set to 1 in an integer a, and return that count in dst.
func builtin_popcnt32(a: cint): cint {.
importc: "_popcnt", header: "<immintrin.h>".}
func builtin_popcnt64(a: uint64): cint {.
importc: "_popcnt64", header: "<immintrin.h>".}

# Returns the number of trailing 0-bits in x, starting at the least significant bit position. If x is 0, the result is undefined.
func bitScanForward(p: ptr uint32, b: uint32): cuchar {.
importc: "_BitScanForward", header: "<immintrin.h>".}
func bitScanForward64(p: ptr uint32, b: uint64): cuchar {.
importc: "_BitScanForward64", header: "<immintrin.h>".}

# Returns the number of leading 0-bits in x, starting at the most significant bit position. If x is 0, the result is undefined.
func bitScanReverse(p: ptr uint32, b: uint32): cuchar {.
importc: "_BitScanReverse", header: "<immintrin.h>".}
func bitScanReverse64(p: ptr uint32, b: uint64): cuchar {.
importc: "_BitScanReverse64", header: "<immintrin.h>".}

template icc_scan_impl(fnc: untyped; v: untyped): int =
var index: uint32
discard fnc(index.addr, v)
index.int

func countSetBitsImpl(x: SomeInteger): int {.inline.} =
## Counts the set bits in an integer (also called `Hamming weight`:idx:).
runnableExamples:
doAssert countSetBits(0b0000_0011'u8) == 2
doAssert countSetBits(0b1010_1010'u8) == 4

# TODO: figure out if ICC support _popcnt32/_popcnt64 on platform without POPCNT.
# like GCC and MSVC
when x is SomeSignedInt:
let x = x.toUnsigned
when nimvm:
result = forwardImpl(countBitsImpl, x)
else:
when useGCC_builtins:
when sizeof(x) <= 4: result = builtin_popcount(x.cuint).int
else: result = builtin_popcountll(x.culonglong).int
elif useVCC_builtins:
when sizeof(x) <= 2: result = builtin_popcnt16(x.uint16).int
elif sizeof(x) <= 4: result = builtin_popcnt32(x.uint32).int
elif arch64: result = builtin_popcnt64(x.uint64).int
else: result = builtin_popcnt32((x.uint64 and 0xFFFFFFFF'u64).uint32).int +
builtin_popcnt32((x.uint64 shr 32'u64).uint32).int
elif useICC_builtins:
when sizeof(x) <= 4: result = builtin_popcnt32(x.cint).int
elif arch64: result = builtin_popcnt64(x.uint64).int
else: result = builtin_popcnt32((x.uint64 and 0xFFFFFFFF'u64).cint).int +
builtin_popcnt32((x.uint64 shr 32'u64).cint).int
else:
when sizeof(x) <= 4: result = countBitsImpl(x.uint32)
else: result = countBitsImpl(x.uint64)

proc countBits32(n: uint32): int {.compilerproc.} =
result = countSetBitsImpl(n)

proc countBits64(n: uint64): int {.compilerproc.} =
result = countSetBitsImpl(n)

proc cardSet(s: NimSet, len: int): int {.compilerproc, inline.} =
var i = 0
Expand Down