3#include <thrust/device_vector.h>
8#include <cuda/std/atomic>
9#include <cuda/std/cstddef>
10#include <cuda/std/cstdint>
45 size_t maxEvictions_ = 500,
46 size_t blockSize_ = 256,
47 size_t bucketSize_ = 16,
48 template <
typename,
typename,
size_t,
size_t>
class AltBucketPolicy_ = XorAltBucketPolicy,
50 typename WordType_ = uint64_t>
59 using TagType =
typename std::conditional<
62 typename std::conditional<bitsPerTag <= 16, uint16_t, uint32_t>::type>::type;
66 std::is_same_v<WordType, uint32_t> || std::is_same_v<WordType, uint64_t>,
67 "WordType must be uint32_t or uint64_t"
69 static_assert(
sizeof(
TagType) <=
sizeof(
WordType),
"TagType must fit within WordType");
71 using AltBucketPolicy = AltBucketPolicy_<KeyType, TagType, bitsPerTag, bucketSize_>;
74template <
typename Config>
82template <
typename Config>
88 uint32_t* evictionAttempts
94template <
typename Config>
100 uint32_t* evictionAttempts
106template <
typename Config>
117template <
typename Config>
128template <
typename Config>
143template <
typename Config>
158 "The tag must be 8, 16 or 32 bits"
163 using PackedTagType =
typename std::conditional<bitsPerTag <= 8, uint32_t, uint64_t>::type;
180 static_assert(
fpBits <
totalBits,
"fpBits must leave at least some bits for bucketIdx");
234 static_assert(
tagsPerWord >= 1,
"TagType must fit within WordType");
243 __host__ __device__ __forceinline__
TagType
248 __host__ __device__ __forceinline__
WordType
252 return cleared | (
static_cast<WordType>(newTag) << shift);
259 __device__ __forceinline__
static bool
262 for (
size_t j = 0; j < N; ++j) {
275 static_assert(N == 2 || N == 4,
"128-bit loads support 2 or 4 words");
276 if constexpr (
sizeof(
WordType) == 4) {
277 auto vec = __ldg(
reinterpret_cast<const uint4*
>(&
packedTags[startIdx]));
283 auto vec = __ldg(
reinterpret_cast<const ulonglong2*
>(&
packedTags[startIdx]));
303 const uint32_t startSlot = tag & (
bucketSize - 1);
304 const size_t startWordIdx = startSlot /
tagsPerWord;
306#if __CUDA_ARCH__ >= 1000 && !defined(CUCKOO_FILTER_DISABLE_256BIT_LOADS)
308 constexpr size_t wordsPerLoad256 = (
sizeof(
WordType) == 4) ? 8 : 4;
309 if constexpr (
wordCount >= wordsPerLoad256) {
310 constexpr size_t alignMask = wordsPerLoad256 - 1;
311 const size_t startAlignedIdx = startWordIdx & ~alignMask;
314 for (
size_t i = 0; i <
wordCount / wordsPerLoad256; i++) {
315 const size_t idx = (startAlignedIdx + i * wordsPerLoad256) & (
wordCount - 1);
327 constexpr size_t wordsPerLoad128 = (
sizeof(
WordType) == 4) ? 4 : 2;
328 if constexpr (
wordCount >= wordsPerLoad128) {
329 constexpr size_t alignMask = wordsPerLoad128 - 1;
330 const size_t startAlignedIdx = startWordIdx & ~alignMask;
333 for (
size_t i = 0; i <
wordCount / wordsPerLoad128; i++) {
334 const size_t idx = (startAlignedIdx + i * wordsPerLoad128) & (
wordCount - 1);
349 cuda::std::atomic<size_t>*
352#ifdef CUCKOO_FILTER_COUNT_EVICTIONS
353 cuda::std::atomic<size_t>*
359 template <
typename H>
360 static __host__ __device__ uint64_t
hash64(
const H& key) {
361 return AltBucketPolicy::hash64(key);
364 static __host__ __device__ cuda::std::tuple<size_t, size_t, TagType, TagType>
366 return AltBucketPolicy::getCandidateBucketsAndFPs(key,
numBuckets);
372 static __host__ __device__
size_t
374 return AltBucketPolicy::getAlternateBucket(bucket, fp,
numBuckets);
381 static __host__ __device__ cuda::std::tuple<size_t, TagType>
383 if constexpr (AltBucketPolicy::usesChoiceBit) {
384 return AltBucketPolicy::getAlternateBucketWithNewFp(bucket, fp,
numBuckets);
386 return {AltBucketPolicy::getAlternateBucket(bucket, fp,
numBuckets), fp};
395 return AltBucketPolicy::calculateNumBuckets(
capacity);
411#ifdef CUCKOO_FILTER_COUNT_EVICTIONS
430#ifdef CUCKOO_FILTER_COUNT_EVICTIONS
450 bool* d_output =
nullptr,
451 cudaStream_t stream = {}
455 <<<numBlocks,
blockSize, 0, stream>>>(d_keys, d_output, n,
this,
nullptr);
457 CUDA_CALL(cudaStreamSynchronize(stream));
462#ifdef CUCKOO_FILTER_COUNT_EVICTIONS
477 uint32_t* d_evictionAttempts,
478 bool* d_output =
nullptr,
479 cudaStream_t stream = {}
483 <<<numBlocks,
blockSize, 0, stream>>>(d_keys, d_output, n,
this, d_evictionAttempts);
485 CUDA_CALL(cudaStreamSynchronize(stream));
505 bool* d_output =
nullptr,
506 cudaStream_t stream = {}
517 void* d_tempStorage =
nullptr;
518 size_t tempStorageBytes = 0;
520 cub::DeviceRadixSort::SortKeys(
531 CUDA_CALL(cudaMallocAsync(&d_tempStorage, tempStorageBytes, stream));
533 cub::DeviceRadixSort::SortKeys(
544 CUDA_CALL(cudaFreeAsync(d_tempStorage, stream));
547 <<<numBlocks,
blockSize, 0, stream>>>(d_packedTags, d_output, n,
this,
nullptr);
549 CUDA_CALL(cudaFreeAsync(d_packedTags, stream));
550 CUDA_CALL(cudaStreamSynchronize(stream));
555#ifdef CUCKOO_FILTER_COUNT_EVICTIONS
570 uint32_t* d_evictionAttempts,
571 bool* d_output =
nullptr,
572 cudaStream_t stream = {}
583 void* d_tempStorage =
nullptr;
584 size_t tempStorageBytes = 0;
586 cub::DeviceRadixSort::SortKeys(
597 CUDA_CALL(cudaMallocAsync(&d_tempStorage, tempStorageBytes, stream));
599 cub::DeviceRadixSort::SortKeys(
610 CUDA_CALL(cudaFreeAsync(d_tempStorage, stream));
613 d_packedTags, d_output, n,
this, d_evictionAttempts
616 CUDA_CALL(cudaFreeAsync(d_packedTags, stream));
617 CUDA_CALL(cudaStreamSynchronize(stream));
631 void containsMany(
const T* d_keys,
const size_t n,
bool* d_output, cudaStream_t stream = {}) {
634 <<<numBlocks,
blockSize, 0, stream>>>(d_keys, d_output, n,
this);
636 CUDA_CALL(cudaStreamSynchronize(stream));
653 bool* d_output =
nullptr,
654 cudaStream_t stream = {}
658 <<<numBlocks,
blockSize, 0, stream>>>(d_keys, d_output, n,
this);
660 CUDA_CALL(cudaStreamSynchronize(stream));
673 const thrust::device_vector<T>& d_keys,
674 thrust::device_vector<bool>& d_output,
675 cudaStream_t stream = {}
677 if (d_output.size() != d_keys.size()) {
678 d_output.resize(d_keys.size());
681 thrust::raw_pointer_cast(d_keys.data()),
683 thrust::raw_pointer_cast(d_output.data()),
696 const thrust::device_vector<T>& d_keys,
697 thrust::device_vector<uint8_t>& d_output,
698 cudaStream_t stream = {}
700 if (d_output.size() != d_keys.size()) {
701 d_output.resize(d_keys.size());
704 thrust::raw_pointer_cast(d_keys.data()),
706 reinterpret_cast<bool*
>(thrust::raw_pointer_cast(d_output.data())),
717 size_t insertMany(
const thrust::device_vector<T>& d_keys, cudaStream_t stream = {}) {
718 return insertMany(thrust::raw_pointer_cast(d_keys.data()), d_keys.size(),
nullptr, stream);
721#ifdef CUCKOO_FILTER_COUNT_EVICTIONS
731 const thrust::device_vector<T>& d_keys,
732 thrust::device_vector<uint32_t>& d_evictionAttempts,
733 thrust::device_vector<uint8_t>* d_output =
nullptr,
734 cudaStream_t stream = {}
736 if (d_evictionAttempts.size() != d_keys.size()) {
737 d_evictionAttempts.resize(d_keys.size());
740 bool* d_outputPtr =
nullptr;
741 if (d_output !=
nullptr) {
742 if (d_output->size() != d_keys.size()) {
743 d_output->resize(d_keys.size());
745 d_outputPtr =
reinterpret_cast<bool*
>(thrust::raw_pointer_cast(d_output->data()));
749 thrust::raw_pointer_cast(d_keys.data()),
751 thrust::raw_pointer_cast(d_evictionAttempts.data()),
766 const thrust::device_vector<T>& d_keys,
767 thrust::device_vector<bool>& d_output,
768 cudaStream_t stream = {}
770 if (d_output.size() != d_keys.size()) {
771 d_output.resize(d_keys.size());
774 thrust::raw_pointer_cast(d_keys.data()),
776 thrust::raw_pointer_cast(d_output.data()),
789 const thrust::device_vector<T>& d_keys,
790 thrust::device_vector<uint8_t>& d_output,
791 cudaStream_t stream = {}
793 if (d_output.size() != d_keys.size()) {
794 d_output.resize(d_keys.size());
797 thrust::raw_pointer_cast(d_keys.data()),
799 reinterpret_cast<bool*
>(thrust::raw_pointer_cast(d_output.data())),
811 size_t insertManySorted(
const thrust::device_vector<T>& d_keys, cudaStream_t stream = {}) {
813 thrust::raw_pointer_cast(d_keys.data()), d_keys.size(),
nullptr, stream
817#ifdef CUCKOO_FILTER_COUNT_EVICTIONS
828 const thrust::device_vector<T>& d_keys,
829 thrust::device_vector<uint32_t>& d_evictionAttempts,
830 thrust::device_vector<uint8_t>* d_output =
nullptr,
831 cudaStream_t stream = {}
833 if (d_evictionAttempts.size() != d_keys.size()) {
834 d_evictionAttempts.resize(d_keys.size());
837 bool* d_outputPtr =
nullptr;
838 if (d_output !=
nullptr) {
839 if (d_output->size() != d_keys.size()) {
840 d_output->resize(d_keys.size());
842 d_outputPtr =
reinterpret_cast<bool*
>(thrust::raw_pointer_cast(d_output->data()));
846 thrust::raw_pointer_cast(d_keys.data()),
848 thrust::raw_pointer_cast(d_evictionAttempts.data()),
862 const thrust::device_vector<T>& d_keys,
863 thrust::device_vector<bool>& d_output,
864 cudaStream_t stream = {}
866 if (d_output.size() != d_keys.size()) {
867 d_output.resize(d_keys.size());
870 thrust::raw_pointer_cast(d_keys.data()),
872 thrust::raw_pointer_cast(d_output.data()),
884 const thrust::device_vector<T>& d_keys,
885 thrust::device_vector<uint8_t>& d_output,
886 cudaStream_t stream = {}
888 if (d_output.size() != d_keys.size()) {
889 d_output.resize(d_keys.size());
892 thrust::raw_pointer_cast(d_keys.data()),
894 reinterpret_cast<bool*
>(thrust::raw_pointer_cast(d_output.data())),
907 const thrust::device_vector<T>& d_keys,
908 thrust::device_vector<bool>& d_output,
909 cudaStream_t stream = {}
911 if (d_output.size() != d_keys.size()) {
912 d_output.resize(d_keys.size());
915 thrust::raw_pointer_cast(d_keys.data()),
917 thrust::raw_pointer_cast(d_output.data()),
930 const thrust::device_vector<T>& d_keys,
931 thrust::device_vector<uint8_t>& d_output,
932 cudaStream_t stream = {}
934 if (d_output.size() != d_keys.size()) {
935 d_output.resize(d_keys.size());
938 thrust::raw_pointer_cast(d_keys.data()),
940 reinterpret_cast<bool*
>(thrust::raw_pointer_cast(d_output.data())),
951 size_t deleteMany(
const thrust::device_vector<T>& d_keys, cudaStream_t stream = {}) {
952 return deleteMany(thrust::raw_pointer_cast(d_keys.data()), d_keys.size(),
nullptr, stream);
961#ifdef CUCKOO_FILTER_COUNT_EVICTIONS
989#ifdef CUCKOO_FILTER_COUNT_EVICTIONS
1049 size_t occupiedCount = 0;
1051 for (
size_t bucketIdx = 0; bucketIdx <
numBuckets; ++bucketIdx) {
1052 const Bucket& bucket = h_buckets[bucketIdx];
1055 uint64_t packed =
reinterpret_cast<const uint64_t&
>(bucket.
packedTags[atomicIdx]);
1058 auto tag = bucket.
extractTag(packed, tagIdx);
1067 return occupiedCount;
1091 const uint32_t startSlot = tag & (
bucketSize - 1);
1100 auto expected = bucket.
packedTags[currIdx].load(cuda::memory_order_relaxed);
1106 if (matchMask == 0) {
1113 if constexpr (
sizeof(WordType) == 4) {
1114 bitPos = __ffs(
static_cast<int>(matchMask)) - 1;
1116 bitPos = __ffsll(
static_cast<long long>(matchMask)) - 1;
1122 if (bucket.
packedTags[currIdx].compare_exchange_weak(
1123 expected, desired, cuda::memory_order_relaxed, cuda::memory_order_relaxed
1145 const uint32_t startIdx = tag & (
bucketSize - 1);
1152 auto expected = bucket.
packedTags[currWord].load(cuda::memory_order_relaxed);
1157 if (zeroMask == 0) {
1164 if constexpr (
sizeof(WordType) == 4) {
1165 bitPos = __ffs(
static_cast<int>(zeroMask)) - 1;
1167 bitPos = __ffsll(
static_cast<long long>(zeroMask)) - 1;
1171 auto desired = bucket.
replaceTag(expected, j, tag);
1173 if (bucket.
packedTags[currWord].compare_exchange_strong(
1174 expected, desired, cuda::memory_order_relaxed, cuda::memory_order_relaxed
1198 size_t currentBucket = startBucket;
1200 for (
size_t evictions = 0; evictions <
maxEvictions; ++evictions) {
1201 auto evictSlot = (currentFp + (evictions + 1) * 0x9E3779B1UL) & (
bucketSize - 1);
1207 auto expected = bucket.
packedTags[evictWord].load(cuda::memory_order_relaxed);
1212 evictedFp = bucket.extractTag(expected, evictTagIdx);
1213 desired = bucket.replaceTag(expected, evictTagIdx, currentFp);
1214 }
while (!bucket.packedTags[evictWord].compare_exchange_strong(
1215 expected, desired, cuda::memory_order_relaxed, cuda::memory_order_relaxed
1218#ifdef CUCKOO_FILTER_COUNT_EVICTIONS
1220 if (evictionAttempts !=
nullptr) {
1221 (*evictionAttempts)++;
1225 currentFp = evictedFp;
1226 auto [altBucket, newFp] =
1228 currentBucket = altBucket;
1254 constexpr size_t numCandidates = std::max(1UL,
bucketSize / 2);
1257 size_t currentBucket = startBucket;
1259 size_t evictions = 0;
1262 size_t restartWord = 0;
1263 size_t restartTagIdx = 0;
1265 for (
size_t i = 0; i < numCandidates; ++i) {
1266 size_t evictSlot = (currentFp + i * 0x9E3779B1UL + (evictions + 1) * 0x85EBCA77) &
1270 restartWord = evictWord;
1271 restartTagIdx = evictTagIdx;
1273 auto packed = bucket.
packedTags[evictWord].load(cuda::memory_order_relaxed);
1276 if (candidateFp ==
EMPTY) {
1283 auto [altBucket, altFp] =
1288 auto expected = bucket.
packedTags[evictWord].load(cuda::memory_order_relaxed);
1291 if (bucket.
extractTag(expected, evictTagIdx) == candidateFp) {
1292 auto desired = bucket.
replaceTag(expected, evictTagIdx, currentFp);
1294 if (bucket.
packedTags[evictWord].compare_exchange_strong(
1297 cuda::memory_order_relaxed,
1298 cuda::memory_order_relaxed
1300#ifdef CUCKOO_FILTER_COUNT_EVICTIONS
1302 if (evictionAttempts !=
nullptr) {
1303 (*evictionAttempts)++;
1316 auto expected = bucket.
packedTags[restartWord].load(cuda::memory_order_relaxed);
1321 evictedFp = bucket.
extractTag(expected, restartTagIdx);
1322 desired = bucket.
replaceTag(expected, restartTagIdx, currentFp);
1323 }
while (!bucket.
packedTags[restartWord].compare_exchange_strong(
1324 expected, desired, cuda::memory_order_relaxed, cuda::memory_order_relaxed
1327 if (evictedFp ==
EMPTY) {
1331#ifdef CUCKOO_FILTER_COUNT_EVICTIONS
1333 if (evictionAttempts !=
nullptr) {
1334 (*evictionAttempts)++;
1339 auto [altBucket, altFp] =
1341 currentBucket = altBucket;
1358 __device__
bool insert(
const T& key, uint32_t* evictionAttempts =
nullptr) {
1368 auto startBucket = (fp1 & 1) == 0 ? i1 : i2;
1371 if constexpr (AltBucketPolicy::usesChoiceBit) {
1372 evictFp = (fp1 & 1) == 0 ? fp1 : fp2;
1385 "Unhandled eviction policy"
1421template <
typename Config>
1429 using BlockReduce = cub::BlockReduce<int32_t, Config::blockSize>;
1459template <
typename Config>
1473template <
typename Config>
1476 using BlockReduce = cub::BlockReduce<int32_t, Config::blockSize>;
1497template <
typename Config>
1511 using PackedTagType =
typename FilterType::PackedTagType;
1515 auto [
i1,
i2,
fp1,
fp2] = FilterType::getCandidateBucketsAndFPs(
key, numBuckets);
1518 (
static_cast<PackedTagType
>(
i1) << bitsPerTag) |
static_cast<PackedTagType
>(
fp1);
1521template <
typename Config>
1529 using BlockReduce = cub::BlockReduce<int, Config::blockSize>;
1535 using TagType =
typename FilterType::TagType;
1536 using PackedTagType =
typename FilterType::PackedTagType;
1539 constexpr TagType fpMask = (1ULL << bitsPerTag) - 1;
1546 auto fp =
static_cast<TagType
>(
packedTag & fpMask);
1560 if constexpr (Config::AltBucketPolicy::usesChoiceBit) {
1574 "Unhandled eviction policy"
A CUDA-accelerated Cuckoo Filter implementation.
cuda::std::atomic< size_t > * d_numOccupied
Pointer to the device memory for the occupancy counter.
static constexpr TagType EMPTY
void containsMany(const thrust::device_vector< T > &d_keys, thrust::device_vector< uint8_t > &d_output, cudaStream_t stream={})
Checks for existence of keys in a Thrust device vector (uint8_t output).
static size_t calculateNumBuckets(size_t capacity)
The number of buckets is enforced to be a power of two in order to allow for efficient modulo on the ...
size_t getNumBuckets() const
Returns the number of buckets in the filter.
static __host__ __device__ uint64_t hash64(const H &key)
Filter(size_t capacity)
Constructs a new Cuckoo Filter.
static constexpr size_t maxEvictions
size_t occupiedSlots()
Returns the total number of occupied slots.
void clear()
Clears the filter, removing all items.
size_t insertMany(const thrust::device_vector< T > &d_keys, thrust::device_vector< uint8_t > &d_output, cudaStream_t stream={})
Inserts keys from a Thrust device vector (uint8_t output).
static constexpr size_t bucketSize
size_t sizeInBytes() const
Returns the size of the filter in bytes.
size_t insertMany(const thrust::device_vector< T > &d_keys, thrust::device_vector< bool > &d_output, cudaStream_t stream={})
Inserts keys from a Thrust device vector.
size_t insertManyWithEvictionCounts(const T *d_keys, const size_t n, uint32_t *d_evictionAttempts, bool *d_output=nullptr, cudaStream_t stream={})
Inserts a batch of keys and records per-attempt eviction counts.
size_t numBuckets
Number of buckets in the filter.
size_t insertManySorted(const thrust::device_vector< T > &d_keys, cudaStream_t stream={})
Inserts keys from a Thrust device vector, sorting them first, without outputting results.
static constexpr size_t fpMask
__device__ bool tryInsertAtBucket(size_t bucketIdx, TagType tag)
Attempts to insert a tag into a specific bucket.
void resetEvictionCount()
Resets the eviction counter to zero.
size_t deleteMany(const T *d_keys, const size_t n, bool *d_output=nullptr, cudaStream_t stream={})
Tries to remove a set of keys from the filter.
size_t insertManySorted(const thrust::device_vector< T > &d_keys, thrust::device_vector< uint8_t > &d_output, cudaStream_t stream={})
Inserts keys from a Thrust device vector, sorting them first (uint8_t output).
float loadFactor()
Calculates the current load factor of the filter.
typename std::conditional< bitsPerTag<=8, uint32_t, uint64_t >::type PackedTagType
size_t insertManySortedWithEvictionCounts(const thrust::device_vector< T > &d_keys, thrust::device_vector< uint32_t > &d_evictionAttempts, thrust::device_vector< uint8_t > *d_output=nullptr, cudaStream_t stream={})
Inserts keys from a Thrust device vector, sorting first, and records per-attempt eviction counts.
static __host__ __device__ size_t getAlternateBucket(size_t bucket, TagType fp, size_t numBuckets)
Computes the alternate bucket for a fingerprint.
size_t insertMany(const thrust::device_vector< T > &d_keys, cudaStream_t stream={})
Inserts keys from a Thrust device vector without outputting results.
cuda::std::atomic< size_t > * d_numEvictions
Pointer to the device memory for the eviction counter.
Filter(const Filter &)=delete
__device__ bool insert(const T &key, uint32_t *evictionAttempts=nullptr)
Inserts a single key into the filter.
Bucket * d_buckets
Pointer to the device memory for the buckets.
size_t countOccupiedSlots()
Counts occupied slots by iterating over all buckets on the host.
typename Config::AltBucketPolicy AltBucketPolicy
size_t insertManySortedWithEvictionCounts(const T *d_keys, const size_t n, uint32_t *d_evictionAttempts, bool *d_output=nullptr, cudaStream_t stream={})
Inserts a pre-sorted batch of keys and records per-attempt eviction counts.
static __host__ __device__ cuda::std::tuple< size_t, TagType > getAlternateBucketWithNewFp(size_t bucket, TagType fp, size_t numBuckets)
Computes alternate bucket AND updated fingerprint for choice bit policies.
void containsMany(const thrust::device_vector< T > &d_keys, thrust::device_vector< bool > &d_output, cudaStream_t stream={})
Checks for existence of keys in a Thrust device vector.
typename Config::TagType TagType
static __host__ __device__ cuda::std::tuple< size_t, size_t, TagType, TagType > getCandidateBucketsAndFPs(const T &key, size_t numBuckets)
__device__ bool tryRemoveAtBucket(size_t bucketIdx, TagType tag)
Attempt to remove a single instance of a fingerprint from a bucket.
__device__ bool remove(const T &key)
Removes a key from the filter.
size_t deleteMany(const thrust::device_vector< T > &d_keys, cudaStream_t stream={})
Deletes keys in a Thrust device vector without outputting results.
size_t deleteMany(const thrust::device_vector< T > &d_keys, thrust::device_vector< bool > &d_output, cudaStream_t stream={})
Deletes keys in a Thrust device vector.
static constexpr size_t tagEntryBytes
void containsMany(const T *d_keys, const size_t n, bool *d_output, cudaStream_t stream={})
Checks for the existence of a batch of keys.
__device__ bool insertWithEvictionDFS(TagType fp, size_t startBucket, uint32_t *evictionAttempts=nullptr)
Inserts a fingerprint into the filter by evicting existing fingerprints.
static constexpr size_t bitsPerTag
__device__ bool insertWithEvictionBFS(TagType fp, size_t startBucket, uint32_t *evictionAttempts=nullptr)
Inserts a fingerprint using repeated shallow breadth-first attempts.
static constexpr size_t blockSize
size_t deleteMany(const thrust::device_vector< T > &d_keys, thrust::device_vector< uint8_t > &d_output, cudaStream_t stream={})
Deletes keys in a Thrust device vector (uint8_t output).
~Filter()
Destroys the Cuckoo Filter.
size_t insertManySorted(const T *d_keys, const size_t n, bool *d_output=nullptr, cudaStream_t stream={})
This pre-sorts the input keys based on the primary bucket index to allow for coalesced memory access ...
__device__ bool contains(const T &key) const
Checks if a key exists in the filter.
size_t insertManySorted(const thrust::device_vector< T > &d_keys, thrust::device_vector< bool > &d_output, cudaStream_t stream={})
Inserts keys from a Thrust device vector, sorting them first.
size_t insertMany(const T *d_keys, const size_t n, bool *d_output=nullptr, cudaStream_t stream={})
Inserts a batch of keys into the filter.
Filter & operator=(const Filter &)=delete
size_t h_numOccupied
Number of occupied buckets in the filter.
size_t capacity()
Returns the total capacity of the filter.
size_t insertManyWithEvictionCounts(const thrust::device_vector< T > &d_keys, thrust::device_vector< uint32_t > &d_evictionAttempts, thrust::device_vector< uint8_t > *d_output=nullptr, cudaStream_t stream={})
Inserts keys from a Thrust device vector and records per-attempt eviction counts.
size_t evictionCount()
Returns the total number of evictions performed.
typename Config::KeyType T
#define SDIV(x, y)
Integer division with rounding up (ceiling).
#define CUDA_CALL(err)
Macro for checking CUDA errors.
__global__ void deleteKernel(const typename Config::KeyType *keys, bool *output, size_t n, Filter< Config > *filter)
Kernel for deleting keys.
__host__ __device__ __forceinline__ constexpr bool hasZero(WordType v)
Checks if a packed word contains a zero slot.
__host__ __device__ __forceinline__ uint32_t globalThreadId()
Calculates the global thread ID in a 1D grid.
constexpr bool powerOfTwo(size_t n)
Checks if a number is a power of two.
__global__ void computePackedTagsKernel(const typename Config::KeyType *keys, typename Filter< Config >::PackedTagType *packedTags, size_t n, size_t numBuckets)
Kernel for computing packed tags for sorting.
__global__ void containsKernel(const typename Config::KeyType *keys, bool *output, size_t n, Filter< Config > *filter)
Kernel for checking existence of keys.
__global__ void insertKernelSorted(const typename Filter< Config >::PackedTagType *packedTags, bool *output, size_t n, Filter< Config > *filter, uint32_t *evictionAttempts)
Kernel for inserting pre-sorted keys into the filter.
__global__ void insertKernel(const typename Config::KeyType *keys, bool *output, size_t n, Filter< Config > *filter, uint32_t *evictionAttempts)
Kernel for inserting keys into the filter.
EvictionPolicy
Eviction policy for the Cuckoo Filter.
@ BFS
Breadth-first search (default)
@ DFS
Pure depth-first search.
Configuration structure for the Cuckoo Filter.
static constexpr size_t bucketSize
static constexpr size_t blockSize
static constexpr size_t maxEvictions
static constexpr size_t bitsPerTag
static constexpr EvictionPolicy evictionPolicy
typename std::conditional< bitsPerTag<=8, uint8_t, typename std::conditional< bitsPerTag<=16, uint16_t, uint32_t >::type >::type TagType
AltBucketPolicy_< KeyType, TagType, bitsPerTag, bucketSize_ > AltBucketPolicy
Bucket structure that holds the fingerprint and tags for a given bucket.
__host__ __device__ __forceinline__ TagType extractTag(WordType packed, size_t tagIdx) const
typename Config::WordType WordType
__host__ __device__ __forceinline__ WordType replaceTag(WordType packed, size_t tagIdx, TagType newTag) const
static constexpr size_t tagsPerWord
__device__ __forceinline__ void load128Bit(size_t startIdx, WordType(&out)[N]) const
Loads words using 128-bit vectorized loads into a fixed-size array.
static constexpr size_t wordCount
__device__ static __forceinline__ bool checkWords(const WordType(&loaded)[N], WordType replicatedTag)
Checks an array of loaded words for a matching tag using SWAR.
cuda::std::atomic< WordType > packedTags[wordCount]
__device__ bool contains(TagType tag) const
Checks if a tag is present in the bucket using vectorized loads.
This is used by the sorted insert kernel to store the fingerprint and primary bucket index in a compa...
__host__ __device__ void setFingerprint(TagType fp)
static constexpr PackedTagType fpMask
__host__ __device__ uint64_t getBucketIndex() const
static constexpr size_t fpBits
__host__ __device__ void setBucketIdx(size_t bucketIdx)
static constexpr PackedTagType bucketIdxMask
__host__ __device__ PackedTag()
static constexpr size_t bucketIdxBits
static constexpr size_t totalBits
__host__ __device__ TagType getFingerprint() const
__host__ __device__ PackedTag(TagType fp, uint64_t bucketIdx)