GPU-Accelerated Cuckoo Filter
Loading...
Searching...
No Matches
Functions
cuckoogpu::detail Namespace Reference

Functions

template<typename Config >
__global__ void insertKernel (const typename Config::KeyType *keys, bool *output, size_t n, Filter< Config > *filter, uint32_t *evictionAttempts)
 Kernel for inserting keys into the filter.
 
template<typename Config >
__global__ void insertKernelSorted (const typename Filter< Config >::PackedTagType *packedTags, bool *output, size_t n, Filter< Config > *filter, uint32_t *evictionAttempts)
 Kernel for inserting pre-sorted keys into the filter.
 
template<typename Config >
__global__ void computePackedTagsKernel (const typename Config::KeyType *keys, typename Filter< Config >::PackedTagType *packedTags, size_t n, size_t numBuckets)
 Kernel for computing packed tags for sorting.
 
template<typename Config >
__global__ void containsKernel (const typename Config::KeyType *keys, bool *output, size_t n, Filter< Config > *filter)
 Kernel for checking existence of keys.
 
template<typename Config >
__global__ void deleteKernel (const typename Config::KeyType *keys, bool *output, size_t n, Filter< Config > *filter)
 Kernel for deleting keys.
 
constexpr bool powerOfTwo (size_t n)
 Checks if a number is a power of two.
 
__host__ __device__ __forceinline__ uint32_t globalThreadId ()
 Calculates the global thread ID in a 1D grid.
 
constexpr size_t nextPowerOfTwo (size_t n)
 Calculates the next power of two greater than or equal to n.
 
template<typename T >
size_t countOnes (T *data, size_t n)
 Counts the number of non-zero elements in an array.
 
template<typename TagType , typename WordType >
__host__ __device__ __forceinline__ constexpr WordType getZeroMask (WordType v)
 Returns a bitmask indicating which slots in a packed word are zero.
 
template<typename TagType , typename WordType >
__host__ __device__ __forceinline__ constexpr bool hasZero (WordType v)
 Checks if a packed word contains a zero slot.
 
template<typename TagType , typename WordType >
__host__ __device__ __forceinline__ constexpr WordType replicateTag (TagType tag)
 Replicates a tag value across all slots in a word.
 
template<typename Kernel >
constexpr size_t maxOccupancyGridSize (int32_t blockSize, Kernel kernel, size_t dynamicSMemSize)
 Calculates the maximum occupancy grid size for a kernel.
 

Function Documentation

◆ computePackedTagsKernel()

template<typename Config >
__global__ void cuckoogpu::detail::computePackedTagsKernel ( const typename Config::KeyType keys,
typename Filter< Config >::PackedTagType *  packedTags,
size_t  n,
size_t  numBuckets 
)

Kernel for computing packed tags for sorting.

Definition at line 1500 of file CuckooFilter.cuh.

1505 {
1506 size_t idx = globalThreadId();
1507
1508 if (idx >= n) {
1509 return;
1510 }
1511
1512 using FilterType = Filter<Config>;
1513 using PackedTagType = typename FilterType::PackedTagType;
1514 constexpr size_t bitsPerTag = Config::bitsPerTag;
1515
1516 typename Config::KeyType key = keys[idx];
1517 auto [i1, i2, fp1, fp2] = FilterType::getCandidateBucketsAndFPs(key, numBuckets);
1518
1519 packedTags[idx] =
1520 (static_cast<PackedTagType>(i1) << bitsPerTag) | static_cast<PackedTagType>(fp1);
1521}
A CUDA-accelerated Cuckoo Filter implementation.
__host__ __device__ __forceinline__ uint32_t globalThreadId()
Calculates the global thread ID in a 1D grid.
Definition helpers.cuh:24
Here is the call graph for this function:

◆ containsKernel()

template<typename Config >
__global__ void cuckoogpu::detail::containsKernel ( const typename Config::KeyType keys,
bool output,
size_t  n,
Filter< Config > *  filter 
)

Kernel for checking existence of keys.

Definition at line 1462 of file CuckooFilter.cuh.

1467 {
1468 auto idx = globalThreadId();
1469
1470 if (idx < n) {
1471 output[idx] = filter->contains(keys[idx]);
1472 }
1473}
__device__ bool contains(const T &key) const
Checks if a key exists in the filter.
Here is the call graph for this function:

◆ countOnes()

template<typename T >
size_t cuckoogpu::detail::countOnes ( T *  data,
size_t  n 
)

Counts the number of non-zero elements in an array.

Template Parameters
TType of elements.
Parameters
dataPointer to the array.
nNumber of elements.
Returns
size_t Number of non-zero elements.

Definition at line 57 of file helpers.cuh.

57 {
58 size_t count = 0;
59 for (size_t i = 0; i < n; ++i) {
60 if (data[i]) {
61 count++;
62 }
63 }
64 return count;
65}
Here is the call graph for this function:

◆ deleteKernel()

template<typename Config >
__global__ void cuckoogpu::detail::deleteKernel ( const typename Config::KeyType keys,
bool output,
size_t  n,
Filter< Config > *  filter 
)

Kernel for deleting keys.

Definition at line 1477 of file CuckooFilter.cuh.

1477 {
1478 using BlockReduce = cub::BlockReduce<int32_t, Config::blockSize>;
1479 __shared__ typename BlockReduce::TempStorage tempStorage;
1480
1481 auto idx = globalThreadId();
1482
1483 int32_t success = 0;
1484 if (idx < n) {
1485 success = filter->remove(keys[idx]);
1486
1487 if (output != nullptr) {
1488 output[idx] = success;
1489 }
1490 }
1491
1492 int32_t blockSum = BlockReduce(tempStorage).Sum(success);
1493
1494 if (threadIdx.x == 0 && blockSum > 0) {
1495 filter->d_numOccupied->fetch_sub(blockSum, cuda::memory_order_relaxed);
1496 }
1497}
cuda::std::atomic< size_t > * d_numOccupied
Pointer to the device memory for the occupancy counter.
__device__ bool remove(const T &key)
Removes a key from the filter.
Here is the call graph for this function:

◆ getZeroMask()

template<typename TagType , typename WordType >
__host__ __device__ __forceinline__ constexpr WordType cuckoogpu::detail::getZeroMask ( WordType  v)
constexpr

Returns a bitmask indicating which slots in a packed word are zero.

Uses SWAR (SIMD Within A Register) to check multiple items in parallel. See https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord

The high bit of each slot that is zero will be set in the result.

Template Parameters
TagTypeThe type of the individual items (uint8_t, uint16_t, or uint32_t)
WordTypeThe packed word type (uint32_t or uint64_t)
Parameters
vThe packed integer
Returns
A bitmask with the high bit of each zero slot set

Definition at line 81 of file helpers.cuh.

81 {
82 static_assert(sizeof(WordType) == 4 || sizeof(WordType) == 8, "WordType must be 32 or 64 bits");
83
84 if constexpr (sizeof(WordType) == 8) {
85 if constexpr (sizeof(TagType) == 1) {
86 return (v - 0x0101010101010101ULL) & ~v & 0x8080808080808080ULL;
87 } else if constexpr (sizeof(TagType) == 2) {
88 return (v - 0x0001000100010001ULL) & ~v & 0x8000800080008000ULL;
89 } else if constexpr (sizeof(TagType) == 4) {
90 return (v - 0x0000000100000001ULL) & ~v & 0x8000000080000000ULL;
91 } else {
92 return 0;
93 }
94 } else {
95 if constexpr (sizeof(TagType) == 1) {
96 return (v - 0x01010101U) & ~v & 0x80808080U;
97 } else if constexpr (sizeof(TagType) == 2) {
98 return (v - 0x00010001U) & ~v & 0x80008000U;
99 } else if constexpr (sizeof(TagType) == 4) {
100 return (v - 0x00000001U) & ~v & 0x80000000U;
101 } else {
102 return 0;
103 }
104 }
105}
Here is the call graph for this function:

◆ globalThreadId()

__host__ __device__ __forceinline__ uint32_t cuckoogpu::detail::globalThreadId ( )

Calculates the global thread ID in a 1D grid.

Returns
uint32_t Global thread ID.

Definition at line 24 of file helpers.cuh.

24 {
25 return blockIdx.x * blockDim.x + threadIdx.x;
26}
Here is the call graph for this function:
Here is the caller graph for this function:

◆ hasZero()

template<typename TagType , typename WordType >
__host__ __device__ __forceinline__ constexpr bool cuckoogpu::detail::hasZero ( WordType  v)
constexpr

Checks if a packed word contains a zero slot.

Template Parameters
TagTypeThe type of the individual items (uint8_t, uint16_t, or uint32_t)
WordTypeThe packed word type (uint32_t or uint64_t)
Parameters
vThe packed integer
Returns
true if any of the items in v are zero

Definition at line 116 of file helpers.cuh.

116 {
117 return getZeroMask<TagType, WordType>(v) != 0;
118}
Here is the call graph for this function:
Here is the caller graph for this function:

◆ insertKernel()

template<typename Config >
__global__ void cuckoogpu::detail::insertKernel ( const typename Config::KeyType keys,
bool output,
size_t  n,
Filter< Config > *  filter,
uint32_t evictionAttempts 
)

Kernel for inserting keys into the filter.

Definition at line 1424 of file CuckooFilter.cuh.

1430 {
1431 using BlockReduce = cub::BlockReduce<int32_t, Config::blockSize>;
1432 __shared__ typename BlockReduce::TempStorage tempStorage;
1433
1434 auto idx = globalThreadId();
1435
1436 int32_t success = 0;
1437
1438 if (idx < n) {
1439 uint32_t threadEvictions = 0;
1440 success = filter->insert(keys[idx], &threadEvictions);
1441
1442 if (output != nullptr) {
1443 output[idx] = success;
1444 }
1445
1446 if (evictionAttempts != nullptr) {
1447 evictionAttempts[idx] = threadEvictions;
1448 }
1449 }
1450
1451 int32_t blockSuccessSum = BlockReduce(tempStorage).Sum(success);
1452 __syncthreads();
1453
1454 if (threadIdx.x == 0) {
1455 if (blockSuccessSum > 0) {
1456 filter->d_numOccupied->fetch_add(blockSuccessSum, cuda::memory_order_relaxed);
1457 }
1458 }
1459}
__device__ bool insert(const T &key, uint32_t *evictionAttempts=nullptr)
Inserts a single key into the filter.
Here is the call graph for this function:

◆ insertKernelSorted()

template<typename Config >
__global__ void cuckoogpu::detail::insertKernelSorted ( const typename Filter< Config >::PackedTagType *  packedTags,
bool output,
size_t  n,
Filter< Config > *  filter,
uint32_t evictionAttempts 
)

Kernel for inserting pre-sorted keys into the filter.

Definition at line 1524 of file CuckooFilter.cuh.

1530 {
1531 using BlockReduce = cub::BlockReduce<int, Config::blockSize>;
1532 __shared__ typename BlockReduce::TempStorage tempStorage;
1533
1534 size_t idx = globalThreadId();
1535
1536 using FilterType = Filter<Config>;
1537 using TagType = typename FilterType::TagType;
1538 using PackedTagType = typename FilterType::PackedTagType;
1539
1540 constexpr size_t bitsPerTag = Config::bitsPerTag;
1541 constexpr TagType fpMask = (1ULL << bitsPerTag) - 1;
1542
1543 int32_t success = 0;
1544 uint32_t threadEvictions = 0;
1545 if (idx < n) {
1546 PackedTagType packedTag = packedTags[idx];
1547 size_t primaryBucket = packedTag >> bitsPerTag;
1548 auto fp = static_cast<TagType>(packedTag & fpMask);
1549
1550 if (filter->tryInsertAtBucket(primaryBucket, fp)) {
1551 success = 1;
1552 } else {
1553 auto [i2, fp2] =
1554 FilterType::getAlternateBucketWithNewFp(primaryBucket, fp, filter->numBuckets);
1555
1556 if (filter->tryInsertAtBucket(i2, fp2)) {
1557 success = 1;
1558 } else {
1559 TagType evictFp;
1560 auto startBucket = (fp & 1) == 0 ? primaryBucket : i2;
1561
1562 if constexpr (Config::AltBucketPolicy::usesChoiceBit) {
1563 evictFp = (fp & 1) == 0 ? fp : fp2;
1564 } else {
1565 evictFp = fp;
1566 }
1567
1568 if constexpr (Config::evictionPolicy == EvictionPolicy::BFS) {
1569 success = filter->insertWithEvictionBFS(evictFp, startBucket, &threadEvictions);
1570 } else if constexpr (Config::evictionPolicy == EvictionPolicy::DFS) {
1571 success = filter->insertWithEvictionDFS(evictFp, startBucket, &threadEvictions);
1572 } else {
1573 static_assert(
1574 Config::evictionPolicy == EvictionPolicy::DFS ||
1575 Config::evictionPolicy == EvictionPolicy::BFS,
1576 "Unhandled eviction policy"
1577 );
1578 }
1579 }
1580 }
1581
1582 if (output != nullptr) {
1583 output[idx] = success;
1584 }
1585
1586 if (evictionAttempts != nullptr) {
1587 evictionAttempts[idx] = threadEvictions;
1588 }
1589 }
1590
1591 int32_t blockSum = BlockReduce(tempStorage).Sum(success);
1592
1593 if (threadIdx.x == 0 && blockSum > 0) {
1594 filter->d_numOccupied->fetch_add(blockSum, cuda::memory_order_relaxed);
1595 }
1596}
size_t numBuckets
Number of buckets in the filter.
__device__ bool tryInsertAtBucket(size_t bucketIdx, TagType tag)
Attempts to insert a tag into a specific bucket.
__device__ bool insertWithEvictionDFS(TagType fp, size_t startBucket, uint32_t *evictionAttempts=nullptr)
Inserts a fingerprint into the filter by evicting existing fingerprints.
__device__ bool insertWithEvictionBFS(TagType fp, size_t startBucket, uint32_t *evictionAttempts=nullptr)
Inserts a fingerprint using repeated shallow breadth-first attempts.
Here is the call graph for this function:

◆ maxOccupancyGridSize()

template<typename Kernel >
constexpr size_t cuckoogpu::detail::maxOccupancyGridSize ( int32_t  blockSize,
Kernel  kernel,
size_t  dynamicSMemSize 
)
constexpr

Calculates the maximum occupancy grid size for a kernel.

Template Parameters
KernelType of the kernel function.
Parameters
blockSizeBlock size (threads per block).
kernelThe kernel function.
dynamicSMemSizeDynamic shared memory size per block.
Returns
size_t The calculated grid size (number of blocks).

Definition at line 224 of file helpers.cuh.

224 {
225 int device = 0;
226 cudaGetDevice(&device);
227
228 int numSM = -1;
229 cudaDeviceGetAttribute(&numSM, cudaDevAttrMultiProcessorCount, device);
230
231 int maxActiveBlocksPerSM{};
232 cudaOccupancyMaxActiveBlocksPerMultiprocessor(
233 &maxActiveBlocksPerSM, kernel, blockSize, dynamicSMemSize
234 );
235
236 return maxActiveBlocksPerSM * numSM;
237}
Here is the call graph for this function:

◆ nextPowerOfTwo()

constexpr size_t cuckoogpu::detail::nextPowerOfTwo ( size_t  n)
constexpr

Calculates the next power of two greater than or equal to n.

Parameters
nInput number.
Returns
size_t Next power of two.

Definition at line 33 of file helpers.cuh.

33 {
34 if (powerOfTwo(n))
35 return n;
36
37 n--;
38 n |= n >> 1;
39 n |= n >> 2;
40 n |= n >> 4;
41 n |= n >> 8;
42 n |= n >> 16;
43 n |= n >> 32;
44 n++;
45
46 return n;
47}
constexpr bool powerOfTwo(size_t n)
Checks if a number is a power of two.
Definition helpers.cuh:16
Here is the call graph for this function:
Here is the caller graph for this function:

◆ powerOfTwo()

constexpr bool cuckoogpu::detail::powerOfTwo ( size_t  n)
constexpr

Checks if a number is a power of two.

Parameters
nNumber to check.
Returns
true if n is a power of two, false otherwise.

Definition at line 16 of file helpers.cuh.

16 {
17 return n != 0 && (n & (n - 1)) == 0;
18}
Here is the call graph for this function:
Here is the caller graph for this function:

◆ replicateTag()

template<typename TagType , typename WordType >
__host__ __device__ __forceinline__ constexpr WordType cuckoogpu::detail::replicateTag ( TagType  tag)
constexpr

Replicates a tag value across all slots in a word.

Template Parameters
TagTypeThe type of the tag (uint8_t, uint16_t, or uint32_t)
WordTypeThe target word type (uint32_t or uint64_t)
Parameters
tagThe tag value to replicate
Returns
A word with the tag replicated in every slot

Definition at line 129 of file helpers.cuh.

129 {
130 static_assert(sizeof(WordType) == 4 || sizeof(WordType) == 8, "WordType must be 32 or 64 bits");
131
132 if constexpr (sizeof(WordType) == 8) {
133 if constexpr (sizeof(TagType) == 1) {
134 return static_cast<uint64_t>(tag) * 0x0101010101010101ULL;
135 } else if constexpr (sizeof(TagType) == 2) {
136 return static_cast<uint64_t>(tag) * 0x0001000100010001ULL;
137 } else if constexpr (sizeof(TagType) == 4) {
138 return static_cast<uint64_t>(tag) * 0x0000000100000001ULL;
139 } else {
140 return tag;
141 }
142 } else {
143 if constexpr (sizeof(TagType) == 1) {
144 return static_cast<uint32_t>(tag) * 0x01010101U;
145 } else if constexpr (sizeof(TagType) == 2) {
146 return static_cast<uint32_t>(tag) * 0x00010001U;
147 } else if constexpr (sizeof(TagType) == 4) {
148 return static_cast<uint32_t>(tag);
149 } else {
150 return tag;
151 }
152 }
153}
Here is the call graph for this function: