GPU-Accelerated Cuckoo Filter
Loading...
Searching...
No Matches
Functions
cuckoogpu::detail Namespace Reference

Functions

template<typename Config >
__global__ void insertKernel (const typename Config::KeyType *keys, bool *output, size_t n, Filter< Config > *filter, uint32_t *evictionAttempts)
 Kernel for inserting keys into the filter.
 
template<typename Config >
__global__ void insertKernelSorted (const typename Filter< Config >::PackedTagType *packedTags, bool *output, size_t n, Filter< Config > *filter, uint32_t *evictionAttempts)
 Kernel for inserting pre-sorted keys into the filter.
 
template<typename Config >
__global__ void computePackedTagsKernel (const typename Config::KeyType *keys, typename Filter< Config >::PackedTagType *packedTags, size_t n, size_t numBuckets)
 Kernel for computing packed tags for sorting.
 
template<typename Config >
__global__ void containsKernel (const typename Config::KeyType *keys, bool *output, size_t n, Filter< Config > *filter)
 Kernel for checking existence of keys.
 
template<typename Config >
__global__ void deleteKernel (const typename Config::KeyType *keys, bool *output, size_t n, Filter< Config > *filter)
 Kernel for deleting keys.
 
constexpr bool powerOfTwo (size_t n)
 Checks if a number is a power of two.
 
__host__ __device__ __forceinline__ uint32_t globalThreadId ()
 Calculates the global thread ID in a 1D grid.
 
constexpr size_t nextPowerOfTwo (size_t n)
 Calculates the next power of two greater than or equal to n.
 
template<typename T >
size_t countOnes (T *data, size_t n)
 Counts the number of non-zero elements in an array.
 
template<typename TagType , typename WordType >
__host__ __device__ __forceinline__ constexpr WordType getZeroMask (WordType v)
 Returns a bitmask indicating which slots in a packed word are zero.
 
template<typename TagType , typename WordType >
__host__ __device__ __forceinline__ constexpr bool hasZero (WordType v)
 Checks if a packed word contains a zero slot.
 
template<typename TagType , typename WordType >
__host__ __device__ __forceinline__ constexpr WordType replicateTag (TagType tag)
 Replicates a tag value across all slots in a word.
 
template<typename Kernel >
constexpr size_t maxOccupancyGridSize (int32_t blockSize, Kernel kernel, size_t dynamicSMemSize)
 Calculates the maximum occupancy grid size for a kernel.
 

Function Documentation

◆ computePackedTagsKernel()

template<typename Config >
__global__ void cuckoogpu::detail::computePackedTagsKernel ( const typename Config::KeyType keys,
typename Filter< Config >::PackedTagType *  packedTags,
size_t  n,
size_t  numBuckets 
)

Kernel for computing packed tags for sorting.

Definition at line 1498 of file CuckooFilter.cuh.

1503 {
1504 size_t idx = globalThreadId();
1505
1506 if (idx >= n) {
1507 return;
1508 }
1509
1510 using FilterType = Filter<Config>;
1511 using PackedTagType = typename FilterType::PackedTagType;
1512 constexpr size_t bitsPerTag = Config::bitsPerTag;
1513
1514 typename Config::KeyType key = keys[idx];
1515 auto [i1, i2, fp1, fp2] = FilterType::getCandidateBucketsAndFPs(key, numBuckets);
1516
1517 packedTags[idx] =
1518 (static_cast<PackedTagType>(i1) << bitsPerTag) | static_cast<PackedTagType>(fp1);
1519}
A CUDA-accelerated Cuckoo Filter implementation.
__host__ __device__ __forceinline__ uint32_t globalThreadId()
Calculates the global thread ID in a 1D grid.
Definition helpers.cuh:24
Here is the call graph for this function:

◆ containsKernel()

template<typename Config >
__global__ void cuckoogpu::detail::containsKernel ( const typename Config::KeyType keys,
bool output,
size_t  n,
Filter< Config > *  filter 
)

Kernel for checking existence of keys.

Definition at line 1460 of file CuckooFilter.cuh.

1465 {
1466 auto idx = globalThreadId();
1467
1468 if (idx < n) {
1469 output[idx] = filter->contains(keys[idx]);
1470 }
1471}
__device__ bool contains(const T &key) const
Checks if a key exists in the filter.
Here is the call graph for this function:

◆ countOnes()

template<typename T >
size_t cuckoogpu::detail::countOnes ( T *  data,
size_t  n 
)

Counts the number of non-zero elements in an array.

Template Parameters
TType of elements.
Parameters
dataPointer to the array.
nNumber of elements.
Returns
size_t Number of non-zero elements.

Definition at line 57 of file helpers.cuh.

57 {
58 size_t count = 0;
59 for (size_t i = 0; i < n; ++i) {
60 if (data[i]) {
61 count++;
62 }
63 }
64 return count;
65}
Here is the call graph for this function:

◆ deleteKernel()

template<typename Config >
__global__ void cuckoogpu::detail::deleteKernel ( const typename Config::KeyType keys,
bool output,
size_t  n,
Filter< Config > *  filter 
)

Kernel for deleting keys.

Definition at line 1475 of file CuckooFilter.cuh.

1475 {
1476 using BlockReduce = cub::BlockReduce<int32_t, Config::blockSize>;
1477 __shared__ typename BlockReduce::TempStorage tempStorage;
1478
1479 auto idx = globalThreadId();
1480
1481 int32_t success = 0;
1482 if (idx < n) {
1483 success = filter->remove(keys[idx]);
1484
1485 if (output != nullptr) {
1486 output[idx] = success;
1487 }
1488 }
1489
1490 int32_t blockSum = BlockReduce(tempStorage).Sum(success);
1491
1492 if (threadIdx.x == 0 && blockSum > 0) {
1493 filter->d_numOccupied->fetch_sub(blockSum, cuda::memory_order_relaxed);
1494 }
1495}
cuda::std::atomic< size_t > * d_numOccupied
Pointer to the device memory for the occupancy counter.
__device__ bool remove(const T &key)
Removes a key from the filter.
Here is the call graph for this function:

◆ getZeroMask()

template<typename TagType , typename WordType >
__host__ __device__ __forceinline__ constexpr WordType cuckoogpu::detail::getZeroMask ( WordType  v)
constexpr

Returns a bitmask indicating which slots in a packed word are zero.

Uses SWAR (SIMD Within A Register) to check multiple items in parallel. See https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord

The high bit of each slot that is zero will be set in the result.

Template Parameters
TagTypeThe type of the individual items (uint8_t, uint16_t, or uint32_t)
WordTypeThe packed word type (uint32_t or uint64_t)
Parameters
vThe packed integer
Returns
A bitmask with the high bit of each zero slot set

Definition at line 81 of file helpers.cuh.

81 {
82 static_assert(sizeof(WordType) == 4 || sizeof(WordType) == 8, "WordType must be 32 or 64 bits");
83
84 if constexpr (sizeof(WordType) == 8) {
85 if constexpr (sizeof(TagType) == 1) {
86 return (v - 0x0101010101010101ULL) & ~v & 0x8080808080808080ULL;
87 } else if constexpr (sizeof(TagType) == 2) {
88 return (v - 0x0001000100010001ULL) & ~v & 0x8000800080008000ULL;
89 } else if constexpr (sizeof(TagType) == 4) {
90 return (v - 0x0000000100000001ULL) & ~v & 0x8000000080000000ULL;
91 } else {
92 return 0;
93 }
94 } else {
95 if constexpr (sizeof(TagType) == 1) {
96 return (v - 0x01010101U) & ~v & 0x80808080U;
97 } else if constexpr (sizeof(TagType) == 2) {
98 return (v - 0x00010001U) & ~v & 0x80008000U;
99 } else if constexpr (sizeof(TagType) == 4) {
100 return (v - 0x00000001U) & ~v & 0x80000000U;
101 } else {
102 return 0;
103 }
104 }
105}
Here is the call graph for this function:

◆ globalThreadId()

__host__ __device__ __forceinline__ uint32_t cuckoogpu::detail::globalThreadId ( )

Calculates the global thread ID in a 1D grid.

Returns
uint32_t Global thread ID.

Definition at line 24 of file helpers.cuh.

24 {
25 return blockIdx.x * blockDim.x + threadIdx.x;
26}
Here is the call graph for this function:
Here is the caller graph for this function:

◆ hasZero()

template<typename TagType , typename WordType >
__host__ __device__ __forceinline__ constexpr bool cuckoogpu::detail::hasZero ( WordType  v)
constexpr

Checks if a packed word contains a zero slot.

Template Parameters
TagTypeThe type of the individual items (uint8_t, uint16_t, or uint32_t)
WordTypeThe packed word type (uint32_t or uint64_t)
Parameters
vThe packed integer
Returns
true if any of the items in v are zero

Definition at line 116 of file helpers.cuh.

116 {
117 return getZeroMask<TagType, WordType>(v) != 0;
118}
Here is the call graph for this function:
Here is the caller graph for this function:

◆ insertKernel()

template<typename Config >
__global__ void cuckoogpu::detail::insertKernel ( const typename Config::KeyType keys,
bool output,
size_t  n,
Filter< Config > *  filter,
uint32_t evictionAttempts 
)

Kernel for inserting keys into the filter.

Definition at line 1422 of file CuckooFilter.cuh.

1428 {
1429 using BlockReduce = cub::BlockReduce<int32_t, Config::blockSize>;
1430 __shared__ typename BlockReduce::TempStorage tempStorage;
1431
1432 auto idx = globalThreadId();
1433
1434 int32_t success = 0;
1435
1436 if (idx < n) {
1437 uint32_t threadEvictions = 0;
1438 success = filter->insert(keys[idx], &threadEvictions);
1439
1440 if (output != nullptr) {
1441 output[idx] = success;
1442 }
1443
1444 if (evictionAttempts != nullptr) {
1445 evictionAttempts[idx] = threadEvictions;
1446 }
1447 }
1448
1449 int32_t blockSuccessSum = BlockReduce(tempStorage).Sum(success);
1450 __syncthreads();
1451
1452 if (threadIdx.x == 0) {
1453 if (blockSuccessSum > 0) {
1454 filter->d_numOccupied->fetch_add(blockSuccessSum, cuda::memory_order_relaxed);
1455 }
1456 }
1457}
__device__ bool insert(const T &key, uint32_t *evictionAttempts=nullptr)
Inserts a single key into the filter.
Here is the call graph for this function:

◆ insertKernelSorted()

template<typename Config >
__global__ void cuckoogpu::detail::insertKernelSorted ( const typename Filter< Config >::PackedTagType *  packedTags,
bool output,
size_t  n,
Filter< Config > *  filter,
uint32_t evictionAttempts 
)

Kernel for inserting pre-sorted keys into the filter.

Definition at line 1522 of file CuckooFilter.cuh.

1528 {
1529 using BlockReduce = cub::BlockReduce<int, Config::blockSize>;
1530 __shared__ typename BlockReduce::TempStorage tempStorage;
1531
1532 size_t idx = globalThreadId();
1533
1534 using FilterType = Filter<Config>;
1535 using TagType = typename FilterType::TagType;
1536 using PackedTagType = typename FilterType::PackedTagType;
1537
1538 constexpr size_t bitsPerTag = Config::bitsPerTag;
1539 constexpr TagType fpMask = (1ULL << bitsPerTag) - 1;
1540
1541 int32_t success = 0;
1542 uint32_t threadEvictions = 0;
1543 if (idx < n) {
1544 PackedTagType packedTag = packedTags[idx];
1545 size_t primaryBucket = packedTag >> bitsPerTag;
1546 auto fp = static_cast<TagType>(packedTag & fpMask);
1547
1548 if (filter->tryInsertAtBucket(primaryBucket, fp)) {
1549 success = 1;
1550 } else {
1551 auto [i2, fp2] =
1552 FilterType::getAlternateBucketWithNewFp(primaryBucket, fp, filter->numBuckets);
1553
1554 if (filter->tryInsertAtBucket(i2, fp2)) {
1555 success = 1;
1556 } else {
1557 TagType evictFp;
1558 auto startBucket = (fp & 1) == 0 ? primaryBucket : i2;
1559
1560 if constexpr (Config::AltBucketPolicy::usesChoiceBit) {
1561 evictFp = (fp & 1) == 0 ? fp : fp2;
1562 } else {
1563 evictFp = fp;
1564 }
1565
1566 if constexpr (Config::evictionPolicy == EvictionPolicy::BFS) {
1567 success = filter->insertWithEvictionBFS(evictFp, startBucket, &threadEvictions);
1568 } else if constexpr (Config::evictionPolicy == EvictionPolicy::DFS) {
1569 success = filter->insertWithEvictionDFS(evictFp, startBucket, &threadEvictions);
1570 } else {
1571 static_assert(
1572 Config::evictionPolicy == EvictionPolicy::DFS ||
1573 Config::evictionPolicy == EvictionPolicy::BFS,
1574 "Unhandled eviction policy"
1575 );
1576 }
1577 }
1578 }
1579
1580 if (output != nullptr) {
1581 output[idx] = success;
1582 }
1583
1584 if (evictionAttempts != nullptr) {
1585 evictionAttempts[idx] = threadEvictions;
1586 }
1587 }
1588
1589 int32_t blockSum = BlockReduce(tempStorage).Sum(success);
1590
1591 if (threadIdx.x == 0 && blockSum > 0) {
1592 filter->d_numOccupied->fetch_add(blockSum, cuda::memory_order_relaxed);
1593 }
1594}
size_t numBuckets
Number of buckets in the filter.
__device__ bool tryInsertAtBucket(size_t bucketIdx, TagType tag)
Attempts to insert a tag into a specific bucket.
__device__ bool insertWithEvictionDFS(TagType fp, size_t startBucket, uint32_t *evictionAttempts=nullptr)
Inserts a fingerprint into the filter by evicting existing fingerprints.
__device__ bool insertWithEvictionBFS(TagType fp, size_t startBucket, uint32_t *evictionAttempts=nullptr)
Inserts a fingerprint using repeated shallow breadth-first attempts.
Here is the call graph for this function:

◆ maxOccupancyGridSize()

template<typename Kernel >
constexpr size_t cuckoogpu::detail::maxOccupancyGridSize ( int32_t  blockSize,
Kernel  kernel,
size_t  dynamicSMemSize 
)
constexpr

Calculates the maximum occupancy grid size for a kernel.

Template Parameters
KernelType of the kernel function.
Parameters
blockSizeBlock size (threads per block).
kernelThe kernel function.
dynamicSMemSizeDynamic shared memory size per block.
Returns
size_t The calculated grid size (number of blocks).

Definition at line 224 of file helpers.cuh.

224 {
225 int device = 0;
226 cudaGetDevice(&device);
227
228 int numSM = -1;
229 cudaDeviceGetAttribute(&numSM, cudaDevAttrMultiProcessorCount, device);
230
231 int maxActiveBlocksPerSM{};
232 cudaOccupancyMaxActiveBlocksPerMultiprocessor(
233 &maxActiveBlocksPerSM, kernel, blockSize, dynamicSMemSize
234 );
235
236 return maxActiveBlocksPerSM * numSM;
237}
Here is the call graph for this function:

◆ nextPowerOfTwo()

constexpr size_t cuckoogpu::detail::nextPowerOfTwo ( size_t  n)
constexpr

Calculates the next power of two greater than or equal to n.

Parameters
nInput number.
Returns
size_t Next power of two.

Definition at line 33 of file helpers.cuh.

33 {
34 if (powerOfTwo(n))
35 return n;
36
37 n--;
38 n |= n >> 1;
39 n |= n >> 2;
40 n |= n >> 4;
41 n |= n >> 8;
42 n |= n >> 16;
43 n |= n >> 32;
44 n++;
45
46 return n;
47}
constexpr bool powerOfTwo(size_t n)
Checks if a number is a power of two.
Definition helpers.cuh:16
Here is the call graph for this function:
Here is the caller graph for this function:

◆ powerOfTwo()

constexpr bool cuckoogpu::detail::powerOfTwo ( size_t  n)
constexpr

Checks if a number is a power of two.

Parameters
nNumber to check.
Returns
true if n is a power of two, false otherwise.

Definition at line 16 of file helpers.cuh.

16 {
17 return n != 0 && (n & (n - 1)) == 0;
18}
Here is the call graph for this function:
Here is the caller graph for this function:

◆ replicateTag()

template<typename TagType , typename WordType >
__host__ __device__ __forceinline__ constexpr WordType cuckoogpu::detail::replicateTag ( TagType  tag)
constexpr

Replicates a tag value across all slots in a word.

Template Parameters
TagTypeThe type of the tag (uint8_t, uint16_t, or uint32_t)
WordTypeThe target word type (uint32_t or uint64_t)
Parameters
tagThe tag value to replicate
Returns
A word with the tag replicated in every slot

Definition at line 129 of file helpers.cuh.

129 {
130 static_assert(sizeof(WordType) == 4 || sizeof(WordType) == 8, "WordType must be 32 or 64 bits");
131
132 if constexpr (sizeof(WordType) == 8) {
133 if constexpr (sizeof(TagType) == 1) {
134 return static_cast<uint64_t>(tag) * 0x0101010101010101ULL;
135 } else if constexpr (sizeof(TagType) == 2) {
136 return static_cast<uint64_t>(tag) * 0x0001000100010001ULL;
137 } else if constexpr (sizeof(TagType) == 4) {
138 return static_cast<uint64_t>(tag) * 0x0000000100000001ULL;
139 } else {
140 return tag;
141 }
142 } else {
143 if constexpr (sizeof(TagType) == 1) {
144 return static_cast<uint32_t>(tag) * 0x01010101U;
145 } else if constexpr (sizeof(TagType) == 2) {
146 return static_cast<uint32_t>(tag) * 0x00010001U;
147 } else if constexpr (sizeof(TagType) == 4) {
148 return static_cast<uint32_t>(tag);
149 } else {
150 return tag;
151 }
152 }
153}
Here is the call graph for this function: