/**************************************************************************
 *
 * Copyright 2008 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

#ifndef BITSCAN_H
#define BITSCAN_H

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#if defined(_MSC_VER)
#include <intrin.h>
#endif

#if defined(__POPCNT__)
#include <popcntintrin.h>
#endif

#include "c99_compat.h"

#ifdef __cplusplus
extern "C" {
#endif

/**
* Find first bit set in word. Least significant bit is 1.
 * Return 0 if no bits set.
 */
#ifdef HAVE___BUILTIN_FFS
#define ffs __builtin_ffs
#elif defined(_MSC_VER) && (_M_IX86 || _M_ARM || _M_AMD64 || _M_IA64)
static inline
int ffs(int i)
{
   unsigned long index;
   if (_BitScanForward(&index, i))
      return index + 1;
   else
      return 0;
}
#endif

#ifdef HAVE___BUILTIN_FFSLL
#define ffsll __builtin_ffsll
#elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM64 || _M_IA64)
static inline int
ffsll(long long int i)
{
   unsigned long index;
   if (_BitScanForward64(&index, i))
      return index + 1;
   else
      return 0;
}
#else
extern int
ffsll(long long int val);
#endif

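/* Illustrative example (an addition, not from the upstream header; the
 * bitscan_example_* helpers are hypothetical names): ffs() reports a
 * 1-based index, so the lowest set bit of 0x18 (bits 3 and 4) yields 4.
 */
static inline void
bitscan_example_ffs(void)
{
   assert(ffs(0x18) == 4);            /* lowest set bit is bit 3 -> 1-based 4 */
   assert(ffs(0) == 0);               /* no bits set -> 0 */
   assert(ffsll(1ull << 40) == 41);   /* the 64-bit variant reaches high bits */
}
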
/* Destructively loop over all of the bits in a mask as in:
 *
 * while (mymask) {
 *   int i = u_bit_scan(&mymask);
 *   ... process element i
 * }
 *
 */
static inline int
u_bit_scan(unsigned *mask)
{
   const int i = ffs(*mask) - 1;
   *mask ^= (1u << i);
   return i;
}

#define u_foreach_bit(b, dword) \
   for (uint32_t __dword = (dword), b; \
        ((b) = ffs(__dword) - 1, __dword); \
        __dword &= ~(1 << (b)))
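
/* Illustrative example (an addition, not from the upstream header):
 * u_foreach_bit() binds each set-bit index in turn, so iterating 0xa
 * (bits 1 and 3) runs the body with b = 1 and then b = 3.
 */
static inline unsigned
bitscan_example_foreach_bit(void)
{
   unsigned sum = 0;
   u_foreach_bit(b, 0xa)
      sum += b;   /* accumulates 1 + 3 = 4 */
   return sum;
}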

static inline int
u_bit_scan64(uint64_t *mask)
{
   const int i = ffsll(*mask) - 1;
   *mask ^= (((uint64_t)1) << i);
   return i;
}

#define u_foreach_bit64(b, dword) \
   for (uint64_t __dword = (dword), b; \
        ((b) = ffsll(__dword) - 1, __dword); \
        __dword &= ~(1ull << (b)))

/* Determine if an unsigned value is a power of two.
 *
 * \note
 * Zero is treated as a power of two.
 */
static inline bool
util_is_power_of_two_or_zero(unsigned v)
{
   return (v & (v - 1)) == 0;
}

/* Determine if an uint64_t value is a power of two.
 *
 * \note
 * Zero is treated as a power of two.
 */
static inline bool
util_is_power_of_two_or_zero64(uint64_t v)
{
   return (v & (v - 1)) == 0;
}

/* Determine if an unsigned value is a power of two.
 *
 * \note
 * Zero is \b not treated as a power of two.
 */
static inline bool
util_is_power_of_two_nonzero(unsigned v)
{
   /* __POPCNT__ is different from HAVE___BUILTIN_POPCOUNT.  The latter
    * indicates the existence of the __builtin_popcount function.  The former
    * indicates that _mm_popcnt_u32 exists and is a native instruction.
    *
    * The other alternative is to use SSE 4.2 compile-time flags.  This has
    * two drawbacks.  First, there is currently no build infrastructure for
    * SSE 4.2 (only 4.1), so that would have to be added.  Second, some AMD
    * CPUs support POPCNT but not SSE 4.2 (e.g., Barcelona).
    */
#ifdef __POPCNT__
   return _mm_popcnt_u32(v) == 1;
#else
   return v != 0 && (v & (v - 1)) == 0;
#endif
}

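/* Illustrative example (an addition, not from the upstream header): the
 * three predicates above differ only in how they classify zero.
 */
static inline void
bitscan_example_power_of_two(void)
{
   assert(util_is_power_of_two_or_zero(0));      /* zero counts here... */
   assert(!util_is_power_of_two_nonzero(0));     /* ...but not here */
   assert(util_is_power_of_two_nonzero(64));
   assert(!util_is_power_of_two_or_zero64(24));  /* 24 = 0b11000 */
}
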
/* For looping over a bitmask when you want to loop over consecutive bits
 * manually, for example:
 *
 * while (mask) {
 *    int start, count, i;
 *
 *    u_bit_scan_consecutive_range(&mask, &start, &count);
 *
 *    for (i = 0; i < count; i++)
 *       ... process element (start+i)
 * }
 */
static inline void
u_bit_scan_consecutive_range(unsigned *mask, int *start, int *count)
{
   if (*mask == 0xffffffff) {
      *start = 0;
      *count = 32;
      *mask = 0;
      return;
   }
   *start = ffs(*mask) - 1;
   *count = ffs(~(*mask >> *start)) - 1;
   *mask &= ~(((1u << *count) - 1) << *start);
}

static inline void
u_bit_scan_consecutive_range64(uint64_t *mask, int *start, int *count)
{
   if (*mask == ~0ull) {
      *start = 0;
      *count = 64;
      *mask = 0;
      return;
   }
   *start = ffsll(*mask) - 1;
   *count = ffsll(~(*mask >> *start)) - 1;
   *mask &= ~(((((uint64_t)1) << *count) - 1) << *start);
}

/**
 * Find last bit set in a word.  The least significant bit is 1.
 * Return 0 if no bits are set.
 * Essentially ffs() in the reverse direction.
 */
static inline unsigned
util_last_bit(unsigned u)
{
#if defined(HAVE___BUILTIN_CLZ)
   return u == 0 ? 0 : 32 - __builtin_clz(u);
#elif defined(_MSC_VER) && (_M_IX86 || _M_ARM || _M_AMD64 || _M_IA64)
   unsigned long index;
   if (_BitScanReverse(&index, u))
      return index + 1;
   else
      return 0;
#else
   unsigned r = 0;
   while (u) {
      r++;
      u >>= 1;
   }
   return r;
#endif
}

/**
 * Find last bit set in a word.  The least significant bit is 1.
 * Return 0 if no bits are set.
 * Essentially ffsll() in the reverse direction.
 */
static inline unsigned
util_last_bit64(uint64_t u)
{
#if defined(HAVE___BUILTIN_CLZLL)
   return u == 0 ? 0 : 64 - __builtin_clzll(u);
#elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM64 || _M_IA64)
   unsigned long index;
   if (_BitScanReverse64(&index, u))
      return index + 1;
   else
      return 0;
#else
   unsigned r = 0;
   while (u) {
      r++;
      u >>= 1;
   }
   return r;
#endif
}

/**
 * Find last bit in a word that does not match the sign bit. The least
 * significant bit is 1.
 * Return 0 if no bits are set.
 */
static inline unsigned
util_last_bit_signed(int i)
{
   if (i >= 0)
      return util_last_bit(i);
   else
      return util_last_bit(~(unsigned)i);
}

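/* Illustrative example (an addition, not from the upstream header):
 * util_last_bit() is the 1-based index of the highest set bit, i.e. the
 * minimum number of bits needed to represent the value.
 */
static inline void
bitscan_example_last_bit(void)
{
   assert(util_last_bit(0) == 0);
   assert(util_last_bit(0x13) == 5);          /* 0b10011 needs 5 bits */
   assert(util_last_bit64(1ull << 40) == 41);
   assert(util_last_bit_signed(-1) == 0);     /* every bit matches the sign bit */
}
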
/* Returns a bitfield in which the first count bits starting at start are
 * set.
 */
static inline unsigned
u_bit_consecutive(unsigned start, unsigned count)
{
   assert(start + count <= 32);
   if (count == 32)
      return ~0;
   return ((1u << count) - 1) << start;
}

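/* Illustrative example (an addition, not from the upstream header):
 * u_bit_consecutive(4, 3) sets bits 4..6.
 */
static inline void
bitscan_example_bit_consecutive(void)
{
   assert(u_bit_consecutive(4, 3) == 0x70);    /* 0b1110000 */
   assert(u_bit_consecutive(0, 32) == 0xffffffff);
}
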
static inline uint64_t
u_bit_consecutive64(unsigned start, unsigned count)
{
   assert(start + count <= 64);
   if (count == 64)
      return ~(uint64_t)0;
   return (((uint64_t)1 << count) - 1) << start;
}

/**
 * Return number of bits set in n.
 */
static inline unsigned
util_bitcount(unsigned n)
{
#if defined(HAVE___BUILTIN_POPCOUNT)
   return __builtin_popcount(n);
#else
   /* K&R classic bitcount.
    *
    * For each iteration, clear the LSB from the bitfield.
    * Requires only one iteration per set bit, instead of
    * one iteration per bit less than highest set bit.
    */
   unsigned bits;
   for (bits = 0; n; bits++) {
      n &= n - 1;
   }
   return bits;
#endif
}

/**
 * Return the number of bits set in n using the native popcnt instruction.
 * The caller is responsible for ensuring that popcnt is supported by the CPU.
 *
 * gcc doesn't use it if -mpopcnt or -march= that has popcnt is missing.
 */
static inline unsigned
util_popcnt_inline_asm(unsigned n)
{
#if defined(USE_X86_64_ASM) || defined(USE_X86_ASM)
   uint32_t out;
   __asm volatile("popcnt %1, %0" : "=r"(out) : "r"(n));
   return out;
#else
   /* We should never get here by accident, but I'm sure it'll happen. */
   return util_bitcount(n);
#endif
}

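/* Illustrative sketch (an addition, not from the upstream header): callers
 * are expected to pick the implementation once from a CPU-capability check
 * they perform themselves; the has_popcnt flag here is hypothetical.
 */
static inline unsigned
bitscan_example_bitcount_dispatch(unsigned n, bool has_popcnt)
{
   return has_popcnt ? util_popcnt_inline_asm(n) : util_bitcount(n);
}
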
static inline unsigned
util_bitcount64(uint64_t n)
{
#ifdef HAVE___BUILTIN_POPCOUNTLL
   return __builtin_popcountll(n);
#else
   return util_bitcount(n) + util_bitcount(n >> 32);
#endif
}

/**
 * Widens the given bit mask by a multiplier, meaning that it will
 * replicate each bit by that amount.
 *
 * For example:
 * 0b101 widened by 2 will become: 0b110011
 *
 * This is typically used in shader I/O to transform a 64-bit
 * writemask to a 32-bit writemask.
 */
static inline uint32_t
util_widen_mask(uint32_t mask, unsigned multiplier)
{
   uint32_t new_mask = 0;
   u_foreach_bit(i, mask)
      new_mask |= ((1u << multiplier) - 1u) << (i * multiplier);
   return new_mask;
}

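/* Illustrative example (an addition, not from the upstream header):
 * widening 0b101 by 2 doubles each bit, giving 0b110011 as described above.
 */
static inline void
bitscan_example_widen_mask(void)
{
   assert(util_widen_mask(0x5, 2) == 0x33);   /* 0b101 -> 0b110011 */
}
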
#ifdef __cplusplus
}

/* util_bitcount has large measurable overhead (~2%), so it's recommended to
 * use the POPCNT instruction via inline assembly if the CPU supports it.
 */
enum util_popcnt {
   POPCNT_NO,
   POPCNT_YES,
};

/* Convenient function to select popcnt through a C++ template argument.
 * This should be used as part of larger functions that are optimized
 * as a whole.
 */
template<util_popcnt POPCNT> inline unsigned
util_bitcount_fast(unsigned n)
{
   if (POPCNT == POPCNT_YES)
      return util_popcnt_inline_asm(n);
   else
      return util_bitcount(n);
}

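/* Illustrative usage (an addition, not from the upstream header): propagate
 * the template argument through a larger routine so the popcnt decision is
 * made once, outside the hot loop.
 */
template<util_popcnt POPCNT> inline unsigned
bitscan_example_bitcount_array(const unsigned *a, unsigned n)
{
   unsigned total = 0;
   for (unsigned i = 0; i < n; i++)
      total += util_bitcount_fast<POPCNT>(a[i]);
   return total;
}
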
#endif /* __cplusplus */
#endif /* BITSCAN_H */