17 return n != 0 && (
n & (
n - 1)) == 0;
59 for (
size_t i = 0;
i <
n; ++
i) {
80template <
typename TagType,
typename WordType>
82 static_assert(
sizeof(WordType) == 4 ||
sizeof(WordType) == 8,
"WordType must be 32 or 64 bits");
84 if constexpr (
sizeof(WordType) == 8) {
85 if constexpr (
sizeof(TagType) == 1) {
86 return (
v - 0x0101010101010101ULL) &
~v & 0x8080808080808080ULL;
87 }
else if constexpr (
sizeof(TagType) == 2) {
88 return (
v - 0x0001000100010001ULL) &
~v & 0x8000800080008000ULL;
89 }
else if constexpr (
sizeof(TagType) == 4) {
90 return (
v - 0x0000000100000001ULL) &
~v & 0x8000000080000000ULL;
95 if constexpr (
sizeof(TagType) == 1) {
96 return (
v - 0x01010101U) &
~v & 0x80808080U;
97 }
else if constexpr (
sizeof(TagType) == 2) {
98 return (
v - 0x00010001U) &
~v & 0x80008000U;
99 }
else if constexpr (
sizeof(TagType) == 4) {
100 return (
v - 0x00000001U) &
~v & 0x80000000U;
115template <
typename TagType,
typename WordType>
128template <
typename TagType,
typename WordType>
130 static_assert(
sizeof(WordType) == 4 ||
sizeof(WordType) == 8,
"WordType must be 32 or 64 bits");
132 if constexpr (
sizeof(WordType) == 8) {
133 if constexpr (
sizeof(TagType) == 1) {
134 return static_cast<uint64_t>(
tag) * 0x0101010101010101ULL;
135 }
else if constexpr (
sizeof(TagType) == 2) {
136 return static_cast<uint64_t>(
tag) * 0x0001000100010001ULL;
137 }
else if constexpr (
sizeof(TagType) == 4) {
138 return static_cast<uint64_t>(
tag) * 0x0000000100000001ULL;
143 if constexpr (
sizeof(TagType) == 1) {
145 }
else if constexpr (
sizeof(TagType) == 2) {
147 }
else if constexpr (
sizeof(TagType) == 4) {
155#if __CUDA_ARCH__ >= 1000
173 static_assert(
sizeof(T) == 4 ||
sizeof(T) == 8,
"T must be uint32_t or uint64_t");
175 if constexpr (
sizeof(T) == 8) {
176 asm volatile(
"ld.global.nc.v4.u64 {%0, %1, %2, %3}, [%4];"
177 :
"=l"(
out[0]),
"=l"(
out[1]),
"=l"(
out[2]),
"=l"(
out[3])
180 asm volatile(
"ld.global.nc.v8.u32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];"
198#define SDIV(x, y) (((x) + (y) - 1) / (y))
204#define CUDA_CALL(err) \
206 cudaError_t err_ = (err); \
207 if (err_ == cudaSuccess) [[likely]] { \
210 printf("%s:%d %s\n", __FILE__, __LINE__, cudaGetErrorString(err_)); \
223template <
typename Kernel>
constexpr size_t maxOccupancyGridSize(int32_t blockSize, Kernel kernel, size_t dynamicSMemSize)
Calculates the maximum occupancy grid size for a kernel.
constexpr size_t nextPowerOfTwo(size_t n)
Calculates the next power of two greater than or equal to n.
__host__ __device__ __forceinline__ constexpr WordType getZeroMask(WordType v)
Returns a bitmask indicating which slots in a packed word are zero.
__host__ __device__ __forceinline__ constexpr bool hasZero(WordType v)
Checks if a packed word contains a zero slot.
__host__ __device__ __forceinline__ uint32_t globalThreadId()
Calculates the global thread ID in a 1D grid.
__host__ __device__ __forceinline__ constexpr WordType replicateTag(TagType tag)
Replicates a tag value across all slots in a word.
constexpr bool powerOfTwo(size_t n)
Checks if a number is a power of two.
size_t countOnes(T *data, size_t n)
Counts the number of non-zero elements in an array.