bitops.nim 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2017 Nim Authors
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## This module implements a series of low level methods for bit manipulation.
  10. ## By default, this module use compiler intrinsics to improve performance
  11. ## on supported compilers: ``GCC``, ``LLVM_GCC``, ``CLANG``, ``VCC``, ``ICC``.
  12. ##
  13. ## The module will fallback to pure nim procs incase the backend is not supported.
  14. ## You can also use the flag `noIntrinsicsBitOpts` to disable compiler intrinsics.
  15. ##
  16. ## This module is also compatible with other backends: ``Javascript``, ``Nimscript``
  17. ## as well as the ``compiletime VM``.
  18. ##
  19. ## As a result of using optimized function/intrinsics some functions can return
  20. ## undefined results if the input is invalid. You can use the flag `noUndefinedBitOpts`
  21. ## to force predictable behaviour for all input, causing a small performance hit.
  22. ##
  23. ## At this time only `fastLog2`, `firstSetBit, `countLeadingZeroBits`, `countTrailingZeroBits`
  24. ## may return undefined and/or platform dependant value if given invalid input.
  25. const useBuiltins = not defined(noIntrinsicsBitOpts)
  26. const noUndefined = defined(noUndefinedBitOpts)
  27. const useGCC_builtins = (defined(gcc) or defined(llvm_gcc) or defined(clang)) and useBuiltins
  28. const useICC_builtins = defined(icc) and useBuiltins
  29. const useVCC_builtins = defined(vcc) and useBuiltins
  30. const arch64 = sizeof(int) == 8
  31. # #### Pure Nim version ####
  32. proc firstSetBit_nim(x: uint32): int {.inline, nosideeffect.} =
  33. ## Returns the 1-based index of the least significant set bit of x, or if x is zero, returns zero.
  34. # https://graphics.stanford.edu/%7Eseander/bithacks.html#ZerosOnRightMultLookup
  35. const lookup: array[32, uint8] = [0'u8, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15,
  36. 25, 17, 4, 8, 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9]
  37. var v = x.uint32
  38. var k = not v + 1 # get two's complement # cast[uint32](-cast[int32](v))
  39. result = 1 + lookup[uint32((v and k) * 0x077CB531'u32) shr 27].int
  40. proc firstSetBit_nim(x: uint64): int {.inline, nosideeffect.} =
  41. ## Returns the 1-based index of the least significant set bit of x, or if x is zero, returns zero.
  42. # https://graphics.stanford.edu/%7Eseander/bithacks.html#ZerosOnRightMultLookup
  43. var v = uint64(x)
  44. var k = uint32(v and 0xFFFFFFFF'u32)
  45. if k == 0:
  46. k = uint32(v shr 32'u32) and 0xFFFFFFFF'u32
  47. result = 32
  48. result += firstSetBit_nim(k)
  49. proc fastlog2_nim(x: uint32): int {.inline, nosideeffect.} =
  50. ## Quickly find the log base 2 of a 32-bit or less integer.
  51. # https://graphics.stanford.edu/%7Eseander/bithacks.html#IntegerLogDeBruijn
  52. # https://stackoverflow.com/questions/11376288/fast-computing-of-log2-for-64-bit-integers
  53. const lookup: array[32, uint8] = [0'u8, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18,
  54. 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31]
  55. var v = x.uint32
  56. v = v or v shr 1 # first round down to one less than a power of 2
  57. v = v or v shr 2
  58. v = v or v shr 4
  59. v = v or v shr 8
  60. v = v or v shr 16
  61. result = lookup[uint32(v * 0x07C4ACDD'u32) shr 27].int
  62. proc fastlog2_nim(x: uint64): int {.inline, nosideeffect.} =
  63. ## Quickly find the log base 2 of a 64-bit integer.
  64. # https://graphics.stanford.edu/%7Eseander/bithacks.html#IntegerLogDeBruijn
  65. # https://stackoverflow.com/questions/11376288/fast-computing-of-log2-for-64-bit-integers
  66. const lookup: array[64, uint8] = [0'u8, 58, 1, 59, 47, 53, 2, 60, 39, 48, 27, 54,
  67. 33, 42, 3, 61, 51, 37, 40, 49, 18, 28, 20, 55, 30, 34, 11, 43, 14, 22, 4, 62,
  68. 57, 46, 52, 38, 26, 32, 41, 50, 36, 17, 19, 29, 10, 13, 21, 56, 45, 25, 31,
  69. 35, 16, 9, 12, 44, 24, 15, 8, 23, 7, 6, 5, 63]
  70. var v = x.uint64
  71. v = v or v shr 1 # first round down to one less than a power of 2
  72. v = v or v shr 2
  73. v = v or v shr 4
  74. v = v or v shr 8
  75. v = v or v shr 16
  76. v = v or v shr 32
  77. result = lookup[(v * 0x03F6EAF2CD271461'u64) shr 58].int
  78. proc countSetBits_nim(n: uint32): int {.inline, noSideEffect.} =
  79. ## Counts the set bits in integer. (also called Hamming weight.)
  80. # generic formula is from: https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
  81. var v = uint32(n)
  82. v = v - ((v shr 1) and 0x55555555)
  83. v = (v and 0x33333333) + ((v shr 2) and 0x33333333)
  84. result = (((v + (v shr 4) and 0xF0F0F0F) * 0x1010101) shr 24).int
  85. proc countSetBits_nim(n: uint64): int {.inline, noSideEffect.} =
  86. ## Counts the set bits in integer. (also called Hamming weight.)
  87. # generic formula is from: https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
  88. var v = uint64(n)
  89. v = v - ((v shr 1'u64) and 0x5555555555555555'u64)
  90. v = (v and 0x3333333333333333'u64) + ((v shr 2'u64) and 0x3333333333333333'u64)
  91. v = (v + (v shr 4'u64) and 0x0F0F0F0F0F0F0F0F'u64)
  92. result = ((v * 0x0101010101010101'u64) shr 56'u64).int
  93. template parity_impl[T](value: T): int =
  94. # formula id from: https://graphics.stanford.edu/%7Eseander/bithacks.html#ParityParallel
  95. var v = value
  96. when sizeof(T) == 8:
  97. v = v xor (v shr 32)
  98. when sizeof(T) >= 4:
  99. v = v xor (v shr 16)
  100. when sizeof(T) >= 2:
  101. v = v xor (v shr 8)
  102. v = v xor (v shr 4)
  103. v = v and 0xf
  104. ((0x6996'u shr v) and 1).int
  105. when useGCC_builtins:
  106. # Returns the number of set 1-bits in value.
  107. proc builtin_popcount(x: cuint): cint {.importc: "__builtin_popcount", cdecl.}
  108. proc builtin_popcountll(x: culonglong): cint {.importc: "__builtin_popcountll", cdecl.}
  109. # Returns the bit parity in value
  110. proc builtin_parity(x: cuint): cint {.importc: "__builtin_parity", cdecl.}
  111. proc builtin_parityll(x: culonglong): cint {.importc: "__builtin_parityll", cdecl.}
  112. # Returns one plus the index of the least significant 1-bit of x, or if x is zero, returns zero.
  113. proc builtin_ffs(x: cint): cint {.importc: "__builtin_ffs", cdecl.}
  114. proc builtin_ffsll(x: clonglong): cint {.importc: "__builtin_ffsll", cdecl.}
  115. # Returns the number of leading 0-bits in x, starting at the most significant bit position. If x is 0, the result is undefined.
  116. proc builtin_clz(x: cuint): cint {.importc: "__builtin_clz", cdecl.}
  117. proc builtin_clzll(x: culonglong): cint {.importc: "__builtin_clzll", cdecl.}
  118. # Returns the number of trailing 0-bits in x, starting at the least significant bit position. If x is 0, the result is undefined.
  119. proc builtin_ctz(x: cuint): cint {.importc: "__builtin_ctz", cdecl.}
  120. proc builtin_ctzll(x: culonglong): cint {.importc: "__builtin_ctzll", cdecl.}
  121. elif useVCC_builtins:
  122. # Counts the number of one bits (population count) in a 16-, 32-, or 64-byte unsigned integer.
  123. proc builtin_popcnt16(a2: uint16): uint16 {.importc: "__popcnt16" header: "<intrin.h>", nosideeffect.}
  124. proc builtin_popcnt32(a2: uint32): uint32 {.importc: "__popcnt" header: "<intrin.h>", nosideeffect.}
  125. proc builtin_popcnt64(a2: uint64): uint64 {.importc: "__popcnt64" header: "<intrin.h>", nosideeffect.}
  126. # Search the mask data from most significant bit (MSB) to least significant bit (LSB) for a set bit (1).
  127. proc bitScanReverse(index: ptr culong, mask: culong): cuchar {.importc: "_BitScanReverse", header: "<intrin.h>", nosideeffect.}
  128. proc bitScanReverse64(index: ptr culong, mask: uint64): cuchar {.importc: "_BitScanReverse64", header: "<intrin.h>", nosideeffect.}
  129. # Search the mask data from least significant bit (LSB) to the most significant bit (MSB) for a set bit (1).
  130. proc bitScanForward(index: ptr culong, mask: culong): cuchar {.importc: "_BitScanForward", header: "<intrin.h>", nosideeffect.}
  131. proc bitScanForward64(index: ptr culong, mask: uint64): cuchar {.importc: "_BitScanForward64", header: "<intrin.h>", nosideeffect.}
  132. template vcc_scan_impl(fnc: untyped; v: untyped): int =
  133. var index: culong
  134. discard fnc(index.addr, v)
  135. index.int
  136. elif useICC_builtins:
  137. # Intel compiler intrinsics: http://fulla.fnal.gov/intel/compiler_c/main_cls/intref_cls/common/intref_allia_misc.htm
  138. # see also: https://software.intel.com/en-us/node/523362
  139. # Count the number of bits set to 1 in an integer a, and return that count in dst.
  140. proc builtin_popcnt32(a: cint): cint {.importc: "_popcnt" header: "<immintrin.h>", nosideeffect.}
  141. proc builtin_popcnt64(a: uint64): cint {.importc: "_popcnt64" header: "<immintrin.h>", nosideeffect.}
  142. # Returns the number of trailing 0-bits in x, starting at the least significant bit position. If x is 0, the result is undefined.
  143. proc bitScanForward(p: ptr uint32, b: uint32): cuchar {.importc: "_BitScanForward", header: "<immintrin.h>", nosideeffect.}
  144. proc bitScanForward64(p: ptr uint32, b: uint64): cuchar {.importc: "_BitScanForward64", header: "<immintrin.h>", nosideeffect.}
  145. # Returns the number of leading 0-bits in x, starting at the most significant bit position. If x is 0, the result is undefined.
  146. proc bitScanReverse(p: ptr uint32, b: uint32): cuchar {.importc: "_BitScanReverse", header: "<immintrin.h>", nosideeffect.}
  147. proc bitScanReverse64(p: ptr uint32, b: uint64): cuchar {.importc: "_BitScanReverse64", header: "<immintrin.h>", nosideeffect.}
  148. template icc_scan_impl(fnc: untyped; v: untyped): int =
  149. var index: uint32
  150. discard fnc(index.addr, v)
  151. index.int
  152. proc countSetBits*(x: SomeInteger): int {.inline, nosideeffect.} =
  153. ## Counts the set bits in integer. (also called `Hamming weight`:idx:.)
  154. # TODO: figure out if ICC support _popcnt32/_popcnt64 on platform without POPCNT.
  155. # like GCC and MSVC
  156. when nimvm:
  157. when sizeof(x) <= 4: result = countSetBits_nim(x.uint32)
  158. else: result = countSetBits_nim(x.uint64)
  159. else:
  160. when useGCC_builtins:
  161. when sizeof(x) <= 4: result = builtin_popcount(x.cuint).int
  162. else: result = builtin_popcountll(x.culonglong).int
  163. elif useVCC_builtins:
  164. when sizeof(x) <= 2: result = builtin_popcnt16(x.uint16).int
  165. elif sizeof(x) <= 4: result = builtin_popcnt32(x.uint32).int
  166. elif arch64: result = builtin_popcnt64(x.uint64).int
  167. else: result = builtin_popcnt32((x.uint64 and 0xFFFFFFFF'u64).uint32 ).int +
  168. builtin_popcnt32((x.uint64 shr 32'u64).uint32 ).int
  169. elif useICC_builtins:
  170. when sizeof(x) <= 4: result = builtin_popcnt32(x.cint).int
  171. elif arch64: result = builtin_popcnt64(x.uint64).int
  172. else: result = builtin_popcnt32((x.uint64 and 0xFFFFFFFF'u64).cint ).int +
  173. builtin_popcnt32((x.uint64 shr 32'u64).cint ).int
  174. else:
  175. when sizeof(x) <= 4: result = countSetBits_nim(x.uint32)
  176. else: result = countSetBits_nim(x.uint64)
  177. proc popcount*(x: SomeInteger): int {.inline, nosideeffect.} =
  178. ## Alias for for countSetBits (Hamming weight.)
  179. result = countSetBits(x)
  180. proc parityBits*(x: SomeInteger): int {.inline, nosideeffect.} =
  181. ## Calculate the bit parity in integer. If number of 1-bit
  182. ## is odd parity is 1, otherwise 0.
  183. # Can be used a base if creating ASM version.
  184. # https://stackoverflow.com/questions/21617970/how-to-check-if-value-has-even-parity-of-bits-or-odd
  185. when nimvm:
  186. when sizeof(x) <= 4: result = parity_impl(x.uint32)
  187. else: result = parity_impl(x.uint64)
  188. else:
  189. when useGCC_builtins:
  190. when sizeof(x) <= 4: result = builtin_parity(x.uint32).int
  191. else: result = builtin_parityll(x.uint64).int
  192. else:
  193. when sizeof(x) <= 4: result = parity_impl(x.uint32)
  194. else: result = parity_impl(x.uint64)
  195. proc firstSetBit*(x: SomeInteger): int {.inline, nosideeffect.} =
  196. ## Returns the 1-based index of the least significant set bit of x.
  197. ## If `x` is zero, when ``noUndefinedBitOpts`` is set, result is 0,
  198. ## otherwise result is undefined.
  199. # GCC builtin 'builtin_ffs' already handle zero input.
  200. when nimvm:
  201. when noUndefined:
  202. if x == 0:
  203. return 0
  204. when sizeof(x) <= 4: result = firstSetBit_nim(x.uint32)
  205. else: result = firstSetBit_nim(x.uint64)
  206. else:
  207. when noUndefined and not useGCC_builtins:
  208. if x == 0:
  209. return 0
  210. when useGCC_builtins:
  211. when sizeof(x) <= 4: result = builtin_ffs(cast[cint](x.cuint)).int
  212. else: result = builtin_ffsll(cast[clonglong](x.culonglong)).int
  213. elif useVCC_builtins:
  214. when sizeof(x) <= 4:
  215. result = 1 + vcc_scan_impl(bitScanForward, x.culong)
  216. elif arch64:
  217. result = 1 + vcc_scan_impl(bitScanForward64, x.uint64)
  218. else:
  219. result = firstSetBit_nim(x.uint64)
  220. elif useICC_builtins:
  221. when sizeof(x) <= 4:
  222. result = 1 + icc_scan_impl(bitScanForward, x.uint32)
  223. elif arch64:
  224. result = 1 + icc_scan_impl(bitScanForward64, x.uint64)
  225. else:
  226. result = firstSetBit_nim(x.uint64)
  227. else:
  228. when sizeof(x) <= 4: result = firstSetBit_nim(x.uint32)
  229. else: result = firstSetBit_nim(x.uint64)
  230. proc fastLog2*(x: SomeInteger): int {.inline, nosideeffect.} =
  231. ## Quickly find the log base 2 of an integer.
  232. ## If `x` is zero, when ``noUndefinedBitOpts`` is set, result is -1,
  233. ## otherwise result is undefined.
  234. when noUndefined:
  235. if x == 0:
  236. return -1
  237. when nimvm:
  238. when sizeof(x) <= 4: result = fastlog2_nim(x.uint32)
  239. else: result = fastlog2_nim(x.uint64)
  240. else:
  241. when useGCC_builtins:
  242. when sizeof(x) <= 4: result = 31 - builtin_clz(x.uint32).int
  243. else: result = 63 - builtin_clzll(x.uint64).int
  244. elif useVCC_builtins:
  245. when sizeof(x) <= 4:
  246. result = vcc_scan_impl(bitScanReverse, x.culong)
  247. elif arch64:
  248. result = vcc_scan_impl(bitScanReverse64, x.uint64)
  249. else:
  250. result = fastlog2_nim(x.uint64)
  251. elif useICC_builtins:
  252. when sizeof(x) <= 4:
  253. result = icc_scan_impl(bitScanReverse, x.uint32)
  254. elif arch64:
  255. result = icc_scan_impl(bitScanReverse64, x.uint64)
  256. else:
  257. result = fastlog2_nim(x.uint64)
  258. else:
  259. when sizeof(x) <= 4: result = fastlog2_nim(x.uint32)
  260. else: result = fastlog2_nim(x.uint64)
  261. proc countLeadingZeroBits*(x: SomeInteger): int {.inline, nosideeffect.} =
  262. ## Returns the number of leading zero bits in integer.
  263. ## If `x` is zero, when ``noUndefinedBitOpts`` is set, result is 0,
  264. ## otherwise result is undefined.
  265. when noUndefined:
  266. if x == 0:
  267. return 0
  268. when nimvm:
  269. when sizeof(x) <= 4: result = sizeof(x)*8 - 1 - fastlog2_nim(x.uint32)
  270. else: result = sizeof(x)*8 - 1 - fastlog2_nim(x.uint64)
  271. else:
  272. when useGCC_builtins:
  273. when sizeof(x) <= 4: result = builtin_clz(x.uint32).int - (32 - sizeof(x)*8)
  274. else: result = builtin_clzll(x.uint64).int
  275. else:
  276. when sizeof(x) <= 4: result = sizeof(x)*8 - 1 - fastlog2_nim(x.uint32)
  277. else: result = sizeof(x)*8 - 1 - fastlog2_nim(x.uint64)
  278. proc countTrailingZeroBits*(x: SomeInteger): int {.inline, nosideeffect.} =
  279. ## Returns the number of trailing zeros in integer.
  280. ## If `x` is zero, when ``noUndefinedBitOpts`` is set, result is 0,
  281. ## otherwise result is undefined.
  282. when noUndefined:
  283. if x == 0:
  284. return 0
  285. when nimvm:
  286. result = firstSetBit(x) - 1
  287. else:
  288. when useGCC_builtins:
  289. when sizeof(x) <= 4: result = builtin_ctz(x.uint32).int
  290. else: result = builtin_ctzll(x.uint64).int
  291. else:
  292. result = firstSetBit(x) - 1
  293. proc rotateLeftBits*(value: uint8;
  294. amount: range[0..8]): uint8 {.inline, noSideEffect.} =
  295. ## Left-rotate bits in a 8-bits value.
  296. # using this form instead of the one below should handle any value
  297. # out of range as well as negative values.
  298. # result = (value shl amount) or (value shr (8 - amount))
  299. # taken from: https://en.wikipedia.org/wiki/Circular_shift#Implementing_circular_shifts
  300. let amount = amount and 7
  301. result = (value shl amount) or (value shr ( (-amount) and 7))
  302. proc rotateLeftBits*(value: uint16;
  303. amount: range[0..16]): uint16 {.inline, noSideEffect.} =
  304. ## Left-rotate bits in a 16-bits value.
  305. let amount = amount and 15
  306. result = (value shl amount) or (value shr ( (-amount) and 15))
  307. proc rotateLeftBits*(value: uint32;
  308. amount: range[0..32]): uint32 {.inline, noSideEffect.} =
  309. ## Left-rotate bits in a 32-bits value.
  310. let amount = amount and 31
  311. result = (value shl amount) or (value shr ( (-amount) and 31))
  312. proc rotateLeftBits*(value: uint64;
  313. amount: range[0..64]): uint64 {.inline, noSideEffect.} =
  314. ## Left-rotate bits in a 64-bits value.
  315. let amount = amount and 63
  316. result = (value shl amount) or (value shr ( (-amount) and 63))
  317. proc rotateRightBits*(value: uint8;
  318. amount: range[0..8]): uint8 {.inline, noSideEffect.} =
  319. ## Right-rotate bits in a 8-bits value.
  320. let amount = amount and 7
  321. result = (value shr amount) or (value shl ( (-amount) and 7))
  322. proc rotateRightBits*(value: uint16;
  323. amount: range[0..16]): uint16 {.inline, noSideEffect.} =
  324. ## Right-rotate bits in a 16-bits value.
  325. let amount = amount and 15
  326. result = (value shr amount) or (value shl ( (-amount) and 15))
  327. proc rotateRightBits*(value: uint32;
  328. amount: range[0..32]): uint32 {.inline, noSideEffect.} =
  329. ## Right-rotate bits in a 32-bits value.
  330. let amount = amount and 31
  331. result = (value shr amount) or (value shl ( (-amount) and 31))
  332. proc rotateRightBits*(value: uint64;
  333. amount: range[0..64]): uint64 {.inline, noSideEffect.} =
  334. ## Right-rotate bits in a 64-bits value.
  335. let amount = amount and 63
  336. result = (value shr amount) or (value shl ( (-amount) and 63))