GPU-Accelerated Cuckoo Filter
Loading...
Searching...
No Matches
CuckooFilter.cuh
Go to the documentation of this file.
1#pragma once
2
3#include <thrust/device_vector.h>
4#include <cmath>
5#include <cstdint>
6#include <ctime>
7#include <cub/cub.cuh>
8#include <cuda/std/atomic>
9#include <cuda/std/cstddef>
10#include <cuda/std/cstdint>
11#include <iostream>
12#include <vector>
13#include "bucket_policies.cuh"
14#include "hashutil.cuh"
15#include "helpers.cuh"
16
17namespace cuckoogpu {
18
22enum class EvictionPolicy {
23 BFS,
24 DFS
25};
26
42template <
43 typename T,
44 size_t bitsPerTag_,
45 size_t maxEvictions_ = 500,
46 size_t blockSize_ = 256,
47 size_t bucketSize_ = 16,
48 template <typename, typename, size_t, size_t> class AltBucketPolicy_ = XorAltBucketPolicy,
49 EvictionPolicy evictionPolicy_ = EvictionPolicy::BFS,
50 typename WordType_ = uint64_t>
51struct Config {
52 using KeyType = T;
53 static constexpr size_t bitsPerTag = bitsPerTag_;
54 static constexpr size_t maxEvictions = maxEvictions_;
55 static constexpr size_t blockSize = blockSize_;
56 static constexpr size_t bucketSize = bucketSize_;
57 static constexpr EvictionPolicy evictionPolicy = evictionPolicy_;
58
59 using TagType = typename std::conditional<
60 bitsPerTag <= 8,
61 uint8_t,
62 typename std::conditional<bitsPerTag <= 16, uint16_t, uint32_t>::type>::type;
63
64 using WordType = WordType_;
65 static_assert(
66 std::is_same_v<WordType, uint32_t> || std::is_same_v<WordType, uint64_t>,
67 "WordType must be uint32_t or uint64_t"
68 );
69 static_assert(sizeof(TagType) <= sizeof(WordType), "TagType must fit within WordType");
70
71 using AltBucketPolicy = AltBucketPolicy_<KeyType, TagType, bitsPerTag, bucketSize_>;
72};
73
74template <typename Config>
75class Filter;
76
77namespace detail {
78
82template <typename Config>
83__global__ void insertKernel(
84 const typename Config::KeyType* keys,
85 bool* output,
86 size_t n,
87 Filter<Config>* filter,
88 uint32_t* evictionAttempts
89);
90
94template <typename Config>
95__global__ void insertKernelSorted(
96 const typename Filter<Config>::PackedTagType* packedTags,
97 bool* output,
98 size_t n,
99 Filter<Config>* filter,
100 uint32_t* evictionAttempts
101);
102
106template <typename Config>
107__global__ void computePackedTagsKernel(
108 const typename Config::KeyType* keys,
109 typename Filter<Config>::PackedTagType* packedTags,
110 size_t n,
111 size_t numBuckets
112);
113
117template <typename Config>
118__global__ void containsKernel(
119 const typename Config::KeyType* keys,
120 bool* output,
121 size_t n,
122 Filter<Config>* filter
123);
124
128template <typename Config>
129__global__ void
130deleteKernel(const typename Config::KeyType* keys, bool* output, size_t n, Filter<Config>* filter);
131
132} // namespace detail
133
143template <typename Config>
144struct Filter {
145 using T = typename Config::KeyType;
146 static constexpr size_t bitsPerTag = Config::bitsPerTag;
147
148 using TagType = typename Config::TagType;
150
151 static constexpr size_t tagEntryBytes = sizeof(TagType);
152 static constexpr size_t bucketSize = Config::bucketSize;
153
154 static constexpr size_t maxEvictions = Config::maxEvictions;
155 static constexpr size_t blockSize = Config::blockSize;
156 static_assert(
157 bitsPerTag == 8 || bitsPerTag == 16 || bitsPerTag == 32,
158 "The tag must be 8, 16 or 32 bits"
159 );
160
161 static_assert(detail::powerOfTwo(bucketSize), "Bucket size must be a power of 2");
162
163 using PackedTagType = typename std::conditional<bitsPerTag <= 8, uint32_t, uint64_t>::type;
164
171 struct PackedTag {
173
174 // Lower bits = fingerprint
175 // Upper bits = bucket index
176 static constexpr size_t fpBits = bitsPerTag;
177 static constexpr size_t totalBits = sizeof(PackedTagType) * 8;
178 static constexpr size_t bucketIdxBits = totalBits - fpBits;
179
180 static_assert(fpBits < totalBits, "fpBits must leave at least some bits for bucketIdx");
181
182 static constexpr PackedTagType fpMask = PackedTagType((1ULL << fpBits) - 1ULL);
183
184 static constexpr PackedTagType bucketIdxMask =
185 PackedTagType(((1ULL << bucketIdxBits) - 1ULL) << fpBits);
186
187 __host__ __device__ PackedTag() : value(0) {
188 }
189
190 __host__ __device__ PackedTag(TagType fp, uint64_t bucketIdx) : value(0) {
191 setFingerprint(fp);
192 setBucketIdx(bucketIdx);
193 }
194
195 __host__ __device__ TagType getFingerprint() const {
196 return static_cast<TagType>(value & fpMask);
197 }
198
199 __host__ __device__ uint64_t getBucketIndex() const {
200 return uint64_t((value & bucketIdxMask) >> fpBits);
201 }
202
203 __host__ __device__ void setFingerprint(TagType fp) {
204 value = (value & ~fpMask) | (static_cast<PackedTagType>(fp) & fpMask);
205 }
206
207 __host__ __device__ void setBucketIdx(size_t bucketIdx) {
208 PackedTagType v = static_cast<PackedTagType>(bucketIdx) << fpBits;
209 value = (value & ~bucketIdxMask) | v;
210 }
211 };
212
213 static constexpr TagType EMPTY = 0;
214 static constexpr size_t fpMask = (1ULL << bitsPerTag) - 1;
215
228 struct Bucket {
229 static_assert(detail::powerOfTwo(bitsPerTag), "bitsPerTag must be a power of 2");
230
231 using WordType = typename Config::WordType;
232
233 static constexpr size_t tagsPerWord = sizeof(WordType) / sizeof(TagType);
234 static_assert(tagsPerWord >= 1, "TagType must fit within WordType");
235 static_assert(bucketSize % tagsPerWord == 0, "bucketSize must be divisible by tagsPerWord");
236 static_assert(detail::powerOfTwo(tagsPerWord), "tagsPerWord must be a power of 2");
237
238 static constexpr size_t wordCount = bucketSize / tagsPerWord;
239 static_assert(detail::powerOfTwo(wordCount), "wordCount must be a power of 2");
240
241 cuda::std::atomic<WordType> packedTags[wordCount];
242
243 __host__ __device__ __forceinline__ TagType
244 extractTag(WordType packed, size_t tagIdx) const {
245 return static_cast<TagType>((packed >> (tagIdx * bitsPerTag)) & fpMask);
246 }
247
248 __host__ __device__ __forceinline__ WordType
249 replaceTag(WordType packed, size_t tagIdx, TagType newTag) const {
250 size_t shift = tagIdx * bitsPerTag;
251 WordType cleared = packed & ~(static_cast<WordType>(fpMask) << shift);
252 return cleared | (static_cast<WordType>(newTag) << shift);
253 }
254
258 template <size_t N>
259 __device__ __forceinline__ static bool
260 checkWords(const WordType (&loaded)[N], WordType replicatedTag) {
261 _Pragma("unroll")
262 for (size_t j = 0; j < N; ++j) {
263 if (detail::hasZero<TagType, WordType>(loaded[j] ^ replicatedTag)) {
264 return true;
265 }
266 }
267 return false;
268 }
269
273 template <size_t N>
274 __device__ __forceinline__ void load128Bit(size_t startIdx, WordType (&out)[N]) const {
275 static_assert(N == 2 || N == 4, "128-bit loads support 2 or 4 words");
276 if constexpr (sizeof(WordType) == 4) {
277 auto vec = __ldg(reinterpret_cast<const uint4*>(&packedTags[startIdx]));
278 out[0] = vec.x;
279 out[1] = vec.y;
280 out[2] = vec.z;
281 out[3] = vec.w;
282 } else {
283 auto vec = __ldg(reinterpret_cast<const ulonglong2*>(&packedTags[startIdx]));
284 out[0] = vec.x;
285 out[1] = vec.y;
286 }
287 }
288
294 __device__ bool contains(TagType tag) const {
295 const WordType replicatedTag = detail::replicateTag<TagType, WordType>(tag);
296
297 // Scalar path
298 if constexpr (wordCount == 1) {
299 const auto packed = reinterpret_cast<const WordType&>(packedTags[0]);
300 return detail::hasZero<TagType, WordType>(packed ^ replicatedTag);
301 }
302
303 const uint32_t startSlot = tag & (bucketSize - 1);
304 const size_t startWordIdx = startSlot / tagsPerWord;
305
306#if __CUDA_ARCH__ >= 1000 && !defined(CUCKOO_FILTER_DISABLE_256BIT_LOADS)
307 // 256-bit load path
308 constexpr size_t wordsPerLoad256 = (sizeof(WordType) == 4) ? 8 : 4;
309 if constexpr (wordCount >= wordsPerLoad256) {
310 constexpr size_t alignMask = wordsPerLoad256 - 1;
311 const size_t startAlignedIdx = startWordIdx & ~alignMask;
312
313 WordType loaded[wordsPerLoad256];
314 for (size_t i = 0; i < wordCount / wordsPerLoad256; i++) {
315 const size_t idx = (startAlignedIdx + i * wordsPerLoad256) & (wordCount - 1);
317 reinterpret_cast<const WordType*>(&packedTags[idx]), loaded
318 );
319 if (checkWords(loaded, replicatedTag)) {
320 return true;
321 }
322 }
323 return false;
324 }
325#endif
326 // 128-bit load path
327 constexpr size_t wordsPerLoad128 = (sizeof(WordType) == 4) ? 4 : 2;
328 if constexpr (wordCount >= wordsPerLoad128) {
329 constexpr size_t alignMask = wordsPerLoad128 - 1;
330 const size_t startAlignedIdx = startWordIdx & ~alignMask;
331
332 WordType loaded[wordsPerLoad128];
333 for (size_t i = 0; i < wordCount / wordsPerLoad128; i++) {
334 const size_t idx = (startAlignedIdx + i * wordsPerLoad128) & (wordCount - 1);
335 load128Bit(idx, loaded);
336 if (checkWords(loaded, replicatedTag)) {
337 return true;
338 }
339 }
340 return false;
341 }
342
343 return false;
344 }
345 };
346
347 size_t numBuckets;
349 cuda::std::atomic<size_t>*
351
352#ifdef CUCKOO_FILTER_COUNT_EVICTIONS
353 cuda::std::atomic<size_t>*
355#endif
356
357 size_t h_numOccupied = 0;
358
359 template <typename H>
360 static __host__ __device__ uint64_t hash64(const H& key) {
361 return AltBucketPolicy::hash64(key);
362 }
363
364 static __host__ __device__ cuda::std::tuple<size_t, size_t, TagType, TagType>
366 return AltBucketPolicy::getCandidateBucketsAndFPs(key, numBuckets);
367 }
368
372 static __host__ __device__ size_t
373 getAlternateBucket(size_t bucket, TagType fp, size_t numBuckets) {
374 return AltBucketPolicy::getAlternateBucket(bucket, fp, numBuckets);
375 }
376
381 static __host__ __device__ cuda::std::tuple<size_t, TagType>
382 getAlternateBucketWithNewFp(size_t bucket, TagType fp, size_t numBuckets) {
383 if constexpr (AltBucketPolicy::usesChoiceBit) {
384 return AltBucketPolicy::getAlternateBucketWithNewFp(bucket, fp, numBuckets);
385 } else {
386 return {AltBucketPolicy::getAlternateBucket(bucket, fp, numBuckets), fp};
387 }
388 }
389
394 static size_t calculateNumBuckets(size_t capacity) {
395 return AltBucketPolicy::calculateNumBuckets(capacity);
396 }
397
398 Filter(const Filter&) = delete;
399 Filter& operator=(const Filter&) = delete;
400
409 CUDA_CALL(cudaMalloc(&d_buckets, numBuckets * sizeof(Bucket)));
410 CUDA_CALL(cudaMalloc(&d_numOccupied, sizeof(cuda::std::atomic<size_t>)));
411#ifdef CUCKOO_FILTER_COUNT_EVICTIONS
412 CUDA_CALL(cudaMalloc(&d_numEvictions, sizeof(cuda::std::atomic<size_t>)));
413#endif
414
415 clear();
416 }
417
424 if (d_buckets) {
425 CUDA_CALL(cudaFree(d_buckets));
426 }
427 if (d_numOccupied) {
428 CUDA_CALL(cudaFree(d_numOccupied));
429 }
430#ifdef CUCKOO_FILTER_COUNT_EVICTIONS
431 if (d_numEvictions) {
432 CUDA_CALL(cudaFree(d_numEvictions));
433 }
434#endif
435 }
436
448 const T* d_keys,
449 const size_t n,
450 bool* d_output = nullptr,
451 cudaStream_t stream = {}
452 ) {
453 size_t numBlocks = SDIV(n, blockSize);
455 <<<numBlocks, blockSize, 0, stream>>>(d_keys, d_output, n, this, nullptr);
456
457 CUDA_CALL(cudaStreamSynchronize(stream));
458
459 return occupiedSlots();
460 }
461
462#ifdef CUCKOO_FILTER_COUNT_EVICTIONS
475 const T* d_keys,
476 const size_t n,
477 uint32_t* d_evictionAttempts,
478 bool* d_output = nullptr,
479 cudaStream_t stream = {}
480 ) {
481 size_t numBlocks = SDIV(n, blockSize);
483 <<<numBlocks, blockSize, 0, stream>>>(d_keys, d_output, n, this, d_evictionAttempts);
484
485 CUDA_CALL(cudaStreamSynchronize(stream));
486
487 return occupiedSlots();
488 }
489#endif
490
503 const T* d_keys,
504 const size_t n,
505 bool* d_output = nullptr,
506 cudaStream_t stream = {}
507 ) {
508 PackedTagType* d_packedTags;
509
510 CUDA_CALL(cudaMallocAsync(&d_packedTags, n * sizeof(PackedTagType), stream));
511
512 size_t numBlocks = SDIV(n, blockSize);
513
515 <<<numBlocks, blockSize, 0, stream>>>(d_keys, d_packedTags, n, numBuckets);
516
517 void* d_tempStorage = nullptr;
518 size_t tempStorageBytes = 0;
519
520 cub::DeviceRadixSort::SortKeys(
521 d_tempStorage,
522 tempStorageBytes,
523 d_packedTags,
524 d_packedTags,
525 n,
526 0,
527 sizeof(PackedTagType) * 8,
528 stream
529 );
530
531 CUDA_CALL(cudaMallocAsync(&d_tempStorage, tempStorageBytes, stream));
532
533 cub::DeviceRadixSort::SortKeys(
534 d_tempStorage,
535 tempStorageBytes,
536 d_packedTags,
537 d_packedTags,
538 n,
539 0,
540 sizeof(PackedTagType) * 8,
541 stream
542 );
543
544 CUDA_CALL(cudaFreeAsync(d_tempStorage, stream));
545
547 <<<numBlocks, blockSize, 0, stream>>>(d_packedTags, d_output, n, this, nullptr);
548
549 CUDA_CALL(cudaFreeAsync(d_packedTags, stream));
550 CUDA_CALL(cudaStreamSynchronize(stream));
551
552 return occupiedSlots();
553 }
554
555#ifdef CUCKOO_FILTER_COUNT_EVICTIONS
568 const T* d_keys,
569 const size_t n,
570 uint32_t* d_evictionAttempts,
571 bool* d_output = nullptr,
572 cudaStream_t stream = {}
573 ) {
574 PackedTagType* d_packedTags;
575
576 CUDA_CALL(cudaMallocAsync(&d_packedTags, n * sizeof(PackedTagType), stream));
577
578 size_t numBlocks = SDIV(n, blockSize);
579
581 <<<numBlocks, blockSize, 0, stream>>>(d_keys, d_packedTags, n, numBuckets);
582
583 void* d_tempStorage = nullptr;
584 size_t tempStorageBytes = 0;
585
586 cub::DeviceRadixSort::SortKeys(
587 d_tempStorage,
588 tempStorageBytes,
589 d_packedTags,
590 d_packedTags,
591 n,
592 0,
593 sizeof(PackedTagType) * 8,
594 stream
595 );
596
597 CUDA_CALL(cudaMallocAsync(&d_tempStorage, tempStorageBytes, stream));
598
599 cub::DeviceRadixSort::SortKeys(
600 d_tempStorage,
601 tempStorageBytes,
602 d_packedTags,
603 d_packedTags,
604 n,
605 0,
606 sizeof(PackedTagType) * 8,
607 stream
608 );
609
610 CUDA_CALL(cudaFreeAsync(d_tempStorage, stream));
611
612 detail::insertKernelSorted<Config><<<numBlocks, blockSize, 0, stream>>>(
613 d_packedTags, d_output, n, this, d_evictionAttempts
614 );
615
616 CUDA_CALL(cudaFreeAsync(d_packedTags, stream));
617 CUDA_CALL(cudaStreamSynchronize(stream));
618
619 return occupiedSlots();
620 }
621#endif
622
631 void containsMany(const T* d_keys, const size_t n, bool* d_output, cudaStream_t stream = {}) {
632 size_t numBlocks = SDIV(n, blockSize);
634 <<<numBlocks, blockSize, 0, stream>>>(d_keys, d_output, n, this);
635
636 CUDA_CALL(cudaStreamSynchronize(stream));
637 }
638
651 const T* d_keys,
652 const size_t n,
653 bool* d_output = nullptr,
654 cudaStream_t stream = {}
655 ) {
656 size_t numBlocks = SDIV(n, blockSize);
658 <<<numBlocks, blockSize, 0, stream>>>(d_keys, d_output, n, this);
659
660 CUDA_CALL(cudaStreamSynchronize(stream));
661
662 return occupiedSlots();
663 }
664
673 const thrust::device_vector<T>& d_keys,
674 thrust::device_vector<bool>& d_output,
675 cudaStream_t stream = {}
676 ) {
677 if (d_output.size() != d_keys.size()) {
678 d_output.resize(d_keys.size());
679 }
680 return insertMany(
681 thrust::raw_pointer_cast(d_keys.data()),
682 d_keys.size(),
683 thrust::raw_pointer_cast(d_output.data()),
684 stream
685 );
686 }
687
696 const thrust::device_vector<T>& d_keys,
697 thrust::device_vector<uint8_t>& d_output,
698 cudaStream_t stream = {}
699 ) {
700 if (d_output.size() != d_keys.size()) {
701 d_output.resize(d_keys.size());
702 }
703 return insertMany(
704 thrust::raw_pointer_cast(d_keys.data()),
705 d_keys.size(),
706 reinterpret_cast<bool*>(thrust::raw_pointer_cast(d_output.data())),
707 stream
708 );
709 }
710
717 size_t insertMany(const thrust::device_vector<T>& d_keys, cudaStream_t stream = {}) {
718 return insertMany(thrust::raw_pointer_cast(d_keys.data()), d_keys.size(), nullptr, stream);
719 }
720
721#ifdef CUCKOO_FILTER_COUNT_EVICTIONS
731 const thrust::device_vector<T>& d_keys,
732 thrust::device_vector<uint32_t>& d_evictionAttempts,
733 thrust::device_vector<uint8_t>* d_output = nullptr,
734 cudaStream_t stream = {}
735 ) {
736 if (d_evictionAttempts.size() != d_keys.size()) {
737 d_evictionAttempts.resize(d_keys.size());
738 }
739
740 bool* d_outputPtr = nullptr;
741 if (d_output != nullptr) {
742 if (d_output->size() != d_keys.size()) {
743 d_output->resize(d_keys.size());
744 }
745 d_outputPtr = reinterpret_cast<bool*>(thrust::raw_pointer_cast(d_output->data()));
746 }
747
749 thrust::raw_pointer_cast(d_keys.data()),
750 d_keys.size(),
751 thrust::raw_pointer_cast(d_evictionAttempts.data()),
752 d_outputPtr,
753 stream
754 );
755 }
756#endif
757
766 const thrust::device_vector<T>& d_keys,
767 thrust::device_vector<bool>& d_output,
768 cudaStream_t stream = {}
769 ) {
770 if (d_output.size() != d_keys.size()) {
771 d_output.resize(d_keys.size());
772 }
773 return insertManySorted(
774 thrust::raw_pointer_cast(d_keys.data()),
775 d_keys.size(),
776 thrust::raw_pointer_cast(d_output.data()),
777 stream
778 );
779 }
780
789 const thrust::device_vector<T>& d_keys,
790 thrust::device_vector<uint8_t>& d_output,
791 cudaStream_t stream = {}
792 ) {
793 if (d_output.size() != d_keys.size()) {
794 d_output.resize(d_keys.size());
795 }
796 return insertManySorted(
797 thrust::raw_pointer_cast(d_keys.data()),
798 d_keys.size(),
799 reinterpret_cast<bool*>(thrust::raw_pointer_cast(d_output.data())),
800 stream
801 );
802 }
803
811 size_t insertManySorted(const thrust::device_vector<T>& d_keys, cudaStream_t stream = {}) {
812 return insertManySorted(
813 thrust::raw_pointer_cast(d_keys.data()), d_keys.size(), nullptr, stream
814 );
815 }
816
817#ifdef CUCKOO_FILTER_COUNT_EVICTIONS
828 const thrust::device_vector<T>& d_keys,
829 thrust::device_vector<uint32_t>& d_evictionAttempts,
830 thrust::device_vector<uint8_t>* d_output = nullptr,
831 cudaStream_t stream = {}
832 ) {
833 if (d_evictionAttempts.size() != d_keys.size()) {
834 d_evictionAttempts.resize(d_keys.size());
835 }
836
837 bool* d_outputPtr = nullptr;
838 if (d_output != nullptr) {
839 if (d_output->size() != d_keys.size()) {
840 d_output->resize(d_keys.size());
841 }
842 d_outputPtr = reinterpret_cast<bool*>(thrust::raw_pointer_cast(d_output->data()));
843 }
844
846 thrust::raw_pointer_cast(d_keys.data()),
847 d_keys.size(),
848 thrust::raw_pointer_cast(d_evictionAttempts.data()),
849 d_outputPtr,
850 stream
851 );
852 }
853#endif
854
862 const thrust::device_vector<T>& d_keys,
863 thrust::device_vector<bool>& d_output,
864 cudaStream_t stream = {}
865 ) {
866 if (d_output.size() != d_keys.size()) {
867 d_output.resize(d_keys.size());
868 }
870 thrust::raw_pointer_cast(d_keys.data()),
871 d_keys.size(),
872 thrust::raw_pointer_cast(d_output.data()),
873 stream
874 );
875 }
876
884 const thrust::device_vector<T>& d_keys,
885 thrust::device_vector<uint8_t>& d_output,
886 cudaStream_t stream = {}
887 ) {
888 if (d_output.size() != d_keys.size()) {
889 d_output.resize(d_keys.size());
890 }
892 thrust::raw_pointer_cast(d_keys.data()),
893 d_keys.size(),
894 reinterpret_cast<bool*>(thrust::raw_pointer_cast(d_output.data())),
895 stream
896 );
897 }
898
907 const thrust::device_vector<T>& d_keys,
908 thrust::device_vector<bool>& d_output,
909 cudaStream_t stream = {}
910 ) {
911 if (d_output.size() != d_keys.size()) {
912 d_output.resize(d_keys.size());
913 }
914 return deleteMany(
915 thrust::raw_pointer_cast(d_keys.data()),
916 d_keys.size(),
917 thrust::raw_pointer_cast(d_output.data()),
918 stream
919 );
920 }
921
930 const thrust::device_vector<T>& d_keys,
931 thrust::device_vector<uint8_t>& d_output,
932 cudaStream_t stream = {}
933 ) {
934 if (d_output.size() != d_keys.size()) {
935 d_output.resize(d_keys.size());
936 }
937 return deleteMany(
938 thrust::raw_pointer_cast(d_keys.data()),
939 d_keys.size(),
940 reinterpret_cast<bool*>(thrust::raw_pointer_cast(d_output.data())),
941 stream
942 );
943 }
944
951 size_t deleteMany(const thrust::device_vector<T>& d_keys, cudaStream_t stream = {}) {
952 return deleteMany(thrust::raw_pointer_cast(d_keys.data()), d_keys.size(), nullptr, stream);
953 }
954
958 void clear() {
959 CUDA_CALL(cudaMemset(d_buckets, 0, numBuckets * sizeof(Bucket)));
960 CUDA_CALL(cudaMemset(d_numOccupied, 0, sizeof(cuda::std::atomic<size_t>)));
961#ifdef CUCKOO_FILTER_COUNT_EVICTIONS
962 CUDA_CALL(cudaMemset(d_numEvictions, 0, sizeof(cuda::std::atomic<size_t>)));
963#endif
964 h_numOccupied = 0;
965 }
966
971 [[nodiscard]] float loadFactor() {
972 return static_cast<float>(occupiedSlots()) / (numBuckets * bucketSize);
973 }
974
982 size_t occupiedSlots() {
983 CUDA_CALL(
984 cudaMemcpy(&h_numOccupied, d_numOccupied, sizeof(size_t), cudaMemcpyDeviceToHost)
985 );
986 return h_numOccupied;
987 }
988
989#ifdef CUCKOO_FILTER_COUNT_EVICTIONS
997 size_t evictionCount() {
998 size_t count;
999 CUDA_CALL(cudaMemcpy(&count, d_numEvictions, sizeof(size_t), cudaMemcpyDeviceToHost));
1000 return count;
1001 }
1002
1007 CUDA_CALL(cudaMemset(d_numEvictions, 0, sizeof(cuda::std::atomic<size_t>)));
1008 }
1009#endif
1010
1015 size_t capacity() {
1016 return numBuckets * bucketSize;
1017 }
1018
1023 [[nodiscard]] size_t getNumBuckets() const {
1024 return numBuckets;
1025 }
1026
1031 [[nodiscard]] size_t sizeInBytes() const {
1032 return numBuckets * sizeof(Bucket);
1033 }
1034
1043 std::vector<Bucket> h_buckets(numBuckets);
1044
1045 CUDA_CALL(cudaMemcpy(
1046 h_buckets.data(), d_buckets, numBuckets * sizeof(Bucket), cudaMemcpyDeviceToHost
1047 ));
1048
1049 size_t occupiedCount = 0;
1050
1051 for (size_t bucketIdx = 0; bucketIdx < numBuckets; ++bucketIdx) {
1052 const Bucket& bucket = h_buckets[bucketIdx];
1053
1054 for (size_t atomicIdx = 0; atomicIdx < Bucket::wordCount; ++atomicIdx) {
1055 uint64_t packed = reinterpret_cast<const uint64_t&>(bucket.packedTags[atomicIdx]);
1056
1057 for (size_t tagIdx = 0; tagIdx < Bucket::tagsPerWord; ++tagIdx) {
1058 auto tag = bucket.extractTag(packed, tagIdx);
1059
1060 if (tag != EMPTY) {
1061 occupiedCount++;
1062 }
1063 }
1064 }
1065 }
1066
1067 return occupiedCount;
1068 }
1069
1088 __device__ bool tryRemoveAtBucket(size_t bucketIdx, TagType tag) {
1089 Bucket& bucket = d_buckets[bucketIdx];
1090
1091 const uint32_t startSlot = tag & (bucketSize - 1);
1092 const size_t startWord = startSlot / Bucket::tagsPerWord;
1093
1094 using WordType = typename Bucket::WordType;
1095
1096 for (size_t i = 0; i < Bucket::wordCount; ++i) {
1097 const size_t currIdx = (startWord + i) & (Bucket::wordCount - 1);
1098
1099 while (true) {
1100 auto expected = bucket.packedTags[currIdx].load(cuda::memory_order_relaxed);
1101
1102 WordType matchMask = detail::getZeroMask<TagType, WordType>(
1104 );
1105
1106 if (matchMask == 0) {
1107 // No matching tags in this word
1108 break;
1109 }
1110
1111 // Find position of first matching tag
1112 int bitPos;
1113 if constexpr (sizeof(WordType) == 4) {
1114 bitPos = __ffs(static_cast<int>(matchMask)) - 1;
1115 } else {
1116 bitPos = __ffsll(static_cast<long long>(matchMask)) - 1;
1117 }
1118 size_t tagIdx = bitPos / bitsPerTag;
1119
1120 auto desired = bucket.replaceTag(expected, tagIdx, EMPTY);
1121
1122 if (bucket.packedTags[currIdx].compare_exchange_weak(
1123 expected, desired, cuda::memory_order_relaxed, cuda::memory_order_relaxed
1124 )) {
1125 return true;
1126 }
1127 // CAS failed, retry with updated expected value
1128 }
1129 }
1130
1131 return false;
1132 }
1133
1143 __device__ bool tryInsertAtBucket(size_t bucketIdx, TagType tag) {
1144 Bucket& bucket = d_buckets[bucketIdx];
1145 const uint32_t startIdx = tag & (bucketSize - 1);
1146 const size_t startWord = startIdx / Bucket::tagsPerWord;
1147
1148 using WordType = typename Bucket::WordType;
1149
1150 for (size_t i = 0; i < Bucket::wordCount; ++i) {
1151 const size_t currWord = (startWord + i) & (Bucket::wordCount - 1);
1152 auto expected = bucket.packedTags[currWord].load(cuda::memory_order_relaxed);
1153
1154 while (true) {
1155 WordType zeroMask = detail::getZeroMask<TagType, WordType>(expected);
1156
1157 if (zeroMask == 0) {
1158 // No empty slots in this word, move to next
1159 break;
1160 }
1161
1162 // Find position of first empty slot (returns 1-indexed bit position)
1163 int bitPos;
1164 if constexpr (sizeof(WordType) == 4) {
1165 bitPos = __ffs(static_cast<int>(zeroMask)) - 1;
1166 } else {
1167 bitPos = __ffsll(static_cast<long long>(zeroMask)) - 1;
1168 }
1169 size_t j = bitPos / bitsPerTag;
1170
1171 auto desired = bucket.replaceTag(expected, j, tag);
1172
1173 if (bucket.packedTags[currWord].compare_exchange_strong(
1174 expected, desired, cuda::memory_order_relaxed, cuda::memory_order_relaxed
1175 )) {
1176 return true;
1177 }
1178 }
1179 }
1180 return false;
1181 }
1182
1195 __device__ bool
1196 insertWithEvictionDFS(TagType fp, size_t startBucket, uint32_t* evictionAttempts = nullptr) {
1197 TagType currentFp = fp;
1198 size_t currentBucket = startBucket;
1199
1200 for (size_t evictions = 0; evictions < maxEvictions; ++evictions) {
1201 auto evictSlot = (currentFp + (evictions + 1) * 0x9E3779B1UL) & (bucketSize - 1);
1202
1203 size_t evictWord = evictSlot / Bucket::tagsPerWord;
1204 size_t evictTagIdx = evictSlot & (Bucket::tagsPerWord - 1);
1205
1206 Bucket& bucket = d_buckets[currentBucket];
1207 auto expected = bucket.packedTags[evictWord].load(cuda::memory_order_relaxed);
1208 typename Bucket::WordType desired;
1209 TagType evictedFp;
1210
1211 do {
1212 evictedFp = bucket.extractTag(expected, evictTagIdx);
1213 desired = bucket.replaceTag(expected, evictTagIdx, currentFp);
1214 } while (!bucket.packedTags[evictWord].compare_exchange_strong(
1215 expected, desired, cuda::memory_order_relaxed, cuda::memory_order_relaxed
1216 ));
1217
1218#ifdef CUCKOO_FILTER_COUNT_EVICTIONS
1219 d_numEvictions->fetch_add(1, cuda::memory_order_relaxed);
1220 if (evictionAttempts != nullptr) {
1221 (*evictionAttempts)++;
1222 }
1223#endif
1224
1225 currentFp = evictedFp;
1226 auto [altBucket, newFp] =
1227 getAlternateBucketWithNewFp(currentBucket, evictedFp, numBuckets);
1228 currentBucket = altBucket;
1229 currentFp = newFp;
1230
1231 if (tryInsertAtBucket(currentBucket, currentFp)) {
1232 return true;
1233 }
1234 }
1235 return false;
1236 }
1237
1252 __device__ bool
1253 insertWithEvictionBFS(TagType fp, size_t startBucket, uint32_t* evictionAttempts = nullptr) {
1254 constexpr size_t numCandidates = std::max(1UL, bucketSize / 2);
1255
1256 TagType currentFp = fp;
1257 size_t currentBucket = startBucket;
1258
1259 size_t evictions = 0;
1260 while (evictions < maxEvictions) {
1261 Bucket& bucket = d_buckets[currentBucket];
1262 size_t restartWord = 0;
1263 size_t restartTagIdx = 0;
1264
1265 for (size_t i = 0; i < numCandidates; ++i) {
1266 size_t evictSlot = (currentFp + i * 0x9E3779B1UL + (evictions + 1) * 0x85EBCA77) &
1267 (bucketSize - 1);
1268 size_t evictWord = evictSlot / Bucket::tagsPerWord;
1269 size_t evictTagIdx = evictSlot & (Bucket::tagsPerWord - 1);
1270 restartWord = evictWord;
1271 restartTagIdx = evictTagIdx;
1272
1273 auto packed = bucket.packedTags[evictWord].load(cuda::memory_order_relaxed);
1274 TagType candidateFp = bucket.extractTag(packed, evictTagIdx);
1275
1276 if (candidateFp == EMPTY) {
1277 if (tryInsertAtBucket(currentBucket, currentFp)) {
1278 return true;
1279 }
1280 continue;
1281 }
1282
1283 auto [altBucket, altFp] =
1284 getAlternateBucketWithNewFp(currentBucket, candidateFp, numBuckets);
1285 if (tryInsertAtBucket(altBucket, altFp)) {
1286 // Successfully inserted the evicted tag at its alternate location
1287 // Now atomically swap in our tag at the original location
1288 auto expected = bucket.packedTags[evictWord].load(cuda::memory_order_relaxed);
1289
1290 // Verify the tag is still there and try to replace it
1291 if (bucket.extractTag(expected, evictTagIdx) == candidateFp) {
1292 auto desired = bucket.replaceTag(expected, evictTagIdx, currentFp);
1293
1294 if (bucket.packedTags[evictWord].compare_exchange_strong(
1295 expected,
1296 desired,
1297 cuda::memory_order_relaxed,
1298 cuda::memory_order_relaxed
1299 )) {
1300#ifdef CUCKOO_FILTER_COUNT_EVICTIONS
1301 d_numEvictions->fetch_add(1, cuda::memory_order_relaxed);
1302 if (evictionAttempts != nullptr) {
1303 (*evictionAttempts)++;
1304 }
1305#endif
1306 return true;
1307 }
1308 }
1309
1310 // Failed to swap, clean up the tag we inserted to avoid duplicates
1311 tryRemoveAtBucket(altBucket, candidateFp);
1312 }
1313 }
1314
1315 // Evict the last scanned candidate and continue from its alternate location.
1316 auto expected = bucket.packedTags[restartWord].load(cuda::memory_order_relaxed);
1317 typename Bucket::WordType desired;
1318 TagType evictedFp;
1319
1320 do {
1321 evictedFp = bucket.extractTag(expected, restartTagIdx);
1322 desired = bucket.replaceTag(expected, restartTagIdx, currentFp);
1323 } while (!bucket.packedTags[restartWord].compare_exchange_strong(
1324 expected, desired, cuda::memory_order_relaxed, cuda::memory_order_relaxed
1325 ));
1326
1327 if (evictedFp == EMPTY) {
1328 return true;
1329 }
1330
1331#ifdef CUCKOO_FILTER_COUNT_EVICTIONS
1332 d_numEvictions->fetch_add(1, cuda::memory_order_relaxed);
1333 if (evictionAttempts != nullptr) {
1334 (*evictionAttempts)++;
1335 }
1336#endif
1337
1338 evictions++;
1339 auto [altBucket, altFp] =
1340 getAlternateBucketWithNewFp(currentBucket, evictedFp, numBuckets);
1341 currentBucket = altBucket;
1342 currentFp = altFp;
1343 }
1344
1345 return false;
1346 }
1347
1358 __device__ bool insert(const T& key, uint32_t* evictionAttempts = nullptr) {
1359 auto [i1, i2, fp1, fp2] = getCandidateBucketsAndFPs(key, numBuckets);
1360
1361 // For all policies: fp1 is for bucket i1, fp2 is for bucket i2
1362 // For non-choice-bit policies, fp1 == fp2
1363 if (tryInsertAtBucket(i1, fp1) || tryInsertAtBucket(i2, fp2)) {
1364 return true;
1365 }
1366
1367 // For eviction, use correct fingerprint for the starting bucket
1368 auto startBucket = (fp1 & 1) == 0 ? i1 : i2;
1369 TagType evictFp;
1370
1371 if constexpr (AltBucketPolicy::usesChoiceBit) {
1372 evictFp = (fp1 & 1) == 0 ? fp1 : fp2;
1373 } else {
1374 evictFp = fp1;
1375 }
1376
1378 return insertWithEvictionBFS(evictFp, startBucket, evictionAttempts);
1379 } else if constexpr (Config::evictionPolicy == EvictionPolicy::DFS) {
1380 return insertWithEvictionDFS(evictFp, startBucket, evictionAttempts);
1381 } else {
1382 static_assert(
1385 "Unhandled eviction policy"
1386 );
1387 }
1388 }
1389
1396 __device__ bool contains(const T& key) const {
1397 auto [i1, i2, fp1, fp2] = getCandidateBucketsAndFPs(key, numBuckets);
1398
1399 // fp1 is for bucket i1, fp2 is for bucket i2
1400 // For non-choice-bit policies, fp1 == fp2
1401 return d_buckets[i1].contains(fp1) || d_buckets[i2].contains(fp2);
1402 }
1403
1410 __device__ bool remove(const T& key) {
1411 auto [i1, i2, fp1, fp2] = getCandidateBucketsAndFPs(key, numBuckets);
1412
1413 // fp1 is for bucket i1, fp2 is for bucket i2
1414 // For non-choice-bit policies, fp1 == fp2
1415 return tryRemoveAtBucket(i1, fp1) || tryRemoveAtBucket(i2, fp2);
1416 }
1417};
1418
1419namespace detail {
1420
1421template <typename Config>
1423 const typename Config::KeyType* keys,
1424 bool* output,
1425 size_t n,
1426 Filter<Config>* filter,
1428) {
1429 using BlockReduce = cub::BlockReduce<int32_t, Config::blockSize>;
1430 __shared__ typename BlockReduce::TempStorage tempStorage;
1431
1432 auto idx = globalThreadId();
1433
1434 int32_t success = 0;
1435
1436 if (idx < n) {
1438 success = filter->insert(keys[idx], &threadEvictions);
1439
1440 if (output != nullptr) {
1441 output[idx] = success;
1442 }
1443
1444 if (evictionAttempts != nullptr) {
1446 }
1447 }
1448
1450 __syncthreads();
1451
1452 if (threadIdx.x == 0) {
1453 if (blockSuccessSum > 0) {
1454 filter->d_numOccupied->fetch_add(blockSuccessSum, cuda::memory_order_relaxed);
1455 }
1456 }
1457}
1458
1459template <typename Config>
1461 const typename Config::KeyType* keys,
1462 bool* output,
1463 size_t n,
1464 Filter<Config>* filter
1465) {
1466 auto idx = globalThreadId();
1467
1468 if (idx < n) {
1469 output[idx] = filter->contains(keys[idx]);
1470 }
1471}
1472
1473template <typename Config>
1474__global__ void
1475deleteKernel(const typename Config::KeyType* keys, bool* output, size_t n, Filter<Config>* filter) {
1476 using BlockReduce = cub::BlockReduce<int32_t, Config::blockSize>;
1477 __shared__ typename BlockReduce::TempStorage tempStorage;
1478
1479 auto idx = globalThreadId();
1480
1481 int32_t success = 0;
1482 if (idx < n) {
1483 success = filter->remove(keys[idx]);
1484
1485 if (output != nullptr) {
1486 output[idx] = success;
1487 }
1488 }
1489
1491
1492 if (threadIdx.x == 0 && blockSum > 0) {
1493 filter->d_numOccupied->fetch_sub(blockSum, cuda::memory_order_relaxed);
1494 }
1495}
1496
1497template <typename Config>
1499 const typename Config::KeyType* keys,
1500 typename Filter<Config>::PackedTagType* packedTags,
1501 size_t n,
1502 size_t numBuckets
1503) {
1504 size_t idx = globalThreadId();
1505
1506 if (idx >= n) {
1507 return;
1508 }
1509
1510 using FilterType = Filter<Config>;
1511 using PackedTagType = typename FilterType::PackedTagType;
1512 constexpr size_t bitsPerTag = Config::bitsPerTag;
1513
1514 typename Config::KeyType key = keys[idx];
1515 auto [i1, i2, fp1, fp2] = FilterType::getCandidateBucketsAndFPs(key, numBuckets);
1516
1517 packedTags[idx] =
1518 (static_cast<PackedTagType>(i1) << bitsPerTag) | static_cast<PackedTagType>(fp1);
1519}
1520
1521template <typename Config>
1523 const typename Filter<Config>::PackedTagType* packedTags,
1524 bool* output,
1525 size_t n,
1526 Filter<Config>* filter,
1528) {
1529 using BlockReduce = cub::BlockReduce<int, Config::blockSize>;
1530 __shared__ typename BlockReduce::TempStorage tempStorage;
1531
1532 size_t idx = globalThreadId();
1533
1534 using FilterType = Filter<Config>;
1535 using TagType = typename FilterType::TagType;
1536 using PackedTagType = typename FilterType::PackedTagType;
1537
1538 constexpr size_t bitsPerTag = Config::bitsPerTag;
1539 constexpr TagType fpMask = (1ULL << bitsPerTag) - 1;
1540
1541 int32_t success = 0;
1543 if (idx < n) {
1544 PackedTagType packedTag = packedTags[idx];
1545 size_t primaryBucket = packedTag >> bitsPerTag;
1546 auto fp = static_cast<TagType>(packedTag & fpMask);
1547
1548 if (filter->tryInsertAtBucket(primaryBucket, fp)) {
1549 success = 1;
1550 } else {
1551 auto [i2, fp2] =
1552 FilterType::getAlternateBucketWithNewFp(primaryBucket, fp, filter->numBuckets);
1553
1554 if (filter->tryInsertAtBucket(i2, fp2)) {
1555 success = 1;
1556 } else {
1557 TagType evictFp;
1558 auto startBucket = (fp & 1) == 0 ? primaryBucket : i2;
1559
1560 if constexpr (Config::AltBucketPolicy::usesChoiceBit) {
1561 evictFp = (fp & 1) == 0 ? fp : fp2;
1562 } else {
1563 evictFp = fp;
1564 }
1565
1568 } else if constexpr (Config::evictionPolicy == EvictionPolicy::DFS) {
1570 } else {
1571 static_assert(
1574 "Unhandled eviction policy"
1575 );
1576 }
1577 }
1578 }
1579
1580 if (output != nullptr) {
1581 output[idx] = success;
1582 }
1583
1584 if (evictionAttempts != nullptr) {
1586 }
1587 }
1588
1590
1591 if (threadIdx.x == 0 && blockSum > 0) {
1592 filter->d_numOccupied->fetch_add(blockSum, cuda::memory_order_relaxed);
1593 }
1594}
1595
1596} // namespace detail
1597
1598} // namespace cuckoogpu
A CUDA-accelerated Cuckoo Filter implementation.
cuda::std::atomic< size_t > * d_numOccupied
Pointer to the device memory for the occupancy counter.
static constexpr TagType EMPTY
void containsMany(const thrust::device_vector< T > &d_keys, thrust::device_vector< uint8_t > &d_output, cudaStream_t stream={})
Checks for existence of keys in a Thrust device vector (uint8_t output).
static size_t calculateNumBuckets(size_t capacity)
The number of buckets is enforced to be a power of two in order to allow for efficient modulo on the ...
size_t getNumBuckets() const
Returns the number of buckets in the filter.
static __host__ __device__ uint64_t hash64(const H &key)
Filter(size_t capacity)
Constructs a new Cuckoo Filter.
static constexpr size_t maxEvictions
size_t occupiedSlots()
Returns the total number of occupied slots.
void clear()
Clears the filter, removing all items.
size_t insertMany(const thrust::device_vector< T > &d_keys, thrust::device_vector< uint8_t > &d_output, cudaStream_t stream={})
Inserts keys from a Thrust device vector (uint8_t output).
static constexpr size_t bucketSize
size_t sizeInBytes() const
Returns the size of the filter in bytes.
size_t insertMany(const thrust::device_vector< T > &d_keys, thrust::device_vector< bool > &d_output, cudaStream_t stream={})
Inserts keys from a Thrust device vector.
size_t insertManyWithEvictionCounts(const T *d_keys, const size_t n, uint32_t *d_evictionAttempts, bool *d_output=nullptr, cudaStream_t stream={})
Inserts a batch of keys and records per-attempt eviction counts.
size_t numBuckets
Number of buckets in the filter.
size_t insertManySorted(const thrust::device_vector< T > &d_keys, cudaStream_t stream={})
Inserts keys from a Thrust device vector, sorting them first, without outputting results.
static constexpr size_t fpMask
__device__ bool tryInsertAtBucket(size_t bucketIdx, TagType tag)
Attempts to insert a tag into a specific bucket.
void resetEvictionCount()
Resets the eviction counter to zero.
size_t deleteMany(const T *d_keys, const size_t n, bool *d_output=nullptr, cudaStream_t stream={})
Tries to remove a set of keys from the filter.
size_t insertManySorted(const thrust::device_vector< T > &d_keys, thrust::device_vector< uint8_t > &d_output, cudaStream_t stream={})
Inserts keys from a Thrust device vector, sorting them first (uint8_t output).
float loadFactor()
Calculates the current load factor of the filter.
typename std::conditional< bitsPerTag<=8, uint32_t, uint64_t >::type PackedTagType
size_t insertManySortedWithEvictionCounts(const thrust::device_vector< T > &d_keys, thrust::device_vector< uint32_t > &d_evictionAttempts, thrust::device_vector< uint8_t > *d_output=nullptr, cudaStream_t stream={})
Inserts keys from a Thrust device vector, sorting first, and records per-attempt eviction counts.
static __host__ __device__ size_t getAlternateBucket(size_t bucket, TagType fp, size_t numBuckets)
Computes the alternate bucket for a fingerprint.
size_t insertMany(const thrust::device_vector< T > &d_keys, cudaStream_t stream={})
Inserts keys from a Thrust device vector without outputting results.
cuda::std::atomic< size_t > * d_numEvictions
Pointer to the device memory for the eviction counter.
Filter(const Filter &)=delete
__device__ bool insert(const T &key, uint32_t *evictionAttempts=nullptr)
Inserts a single key into the filter.
Bucket * d_buckets
Pointer to the device memory for the buckets.
size_t countOccupiedSlots()
Counts occupied slots by iterating over all buckets on the host.
typename Config::AltBucketPolicy AltBucketPolicy
size_t insertManySortedWithEvictionCounts(const T *d_keys, const size_t n, uint32_t *d_evictionAttempts, bool *d_output=nullptr, cudaStream_t stream={})
Inserts a pre-sorted batch of keys and records per-attempt eviction counts.
static __host__ __device__ cuda::std::tuple< size_t, TagType > getAlternateBucketWithNewFp(size_t bucket, TagType fp, size_t numBuckets)
Computes alternate bucket AND updated fingerprint for choice bit policies.
void containsMany(const thrust::device_vector< T > &d_keys, thrust::device_vector< bool > &d_output, cudaStream_t stream={})
Checks for existence of keys in a Thrust device vector.
typename Config::TagType TagType
static __host__ __device__ cuda::std::tuple< size_t, size_t, TagType, TagType > getCandidateBucketsAndFPs(const T &key, size_t numBuckets)
__device__ bool tryRemoveAtBucket(size_t bucketIdx, TagType tag)
Attempt to remove a single instance of a fingerprint from a bucket.
__device__ bool remove(const T &key)
Removes a key from the filter.
size_t deleteMany(const thrust::device_vector< T > &d_keys, cudaStream_t stream={})
Deletes keys in a Thrust device vector without outputting results.
size_t deleteMany(const thrust::device_vector< T > &d_keys, thrust::device_vector< bool > &d_output, cudaStream_t stream={})
Deletes keys in a Thrust device vector.
static constexpr size_t tagEntryBytes
void containsMany(const T *d_keys, const size_t n, bool *d_output, cudaStream_t stream={})
Checks for the existence of a batch of keys.
__device__ bool insertWithEvictionDFS(TagType fp, size_t startBucket, uint32_t *evictionAttempts=nullptr)
Inserts a fingerprint into the filter by evicting existing fingerprints.
static constexpr size_t bitsPerTag
__device__ bool insertWithEvictionBFS(TagType fp, size_t startBucket, uint32_t *evictionAttempts=nullptr)
Inserts a fingerprint using repeated shallow breadth-first attempts.
static constexpr size_t blockSize
size_t deleteMany(const thrust::device_vector< T > &d_keys, thrust::device_vector< uint8_t > &d_output, cudaStream_t stream={})
Deletes keys in a Thrust device vector (uint8_t output).
~Filter()
Destroys the Cuckoo Filter.
size_t insertManySorted(const T *d_keys, const size_t n, bool *d_output=nullptr, cudaStream_t stream={})
This pre-sorts the input keys based on the primary bucket index to allow for coalesced memory access ...
__device__ bool contains(const T &key) const
Checks if a key exists in the filter.
size_t insertManySorted(const thrust::device_vector< T > &d_keys, thrust::device_vector< bool > &d_output, cudaStream_t stream={})
Inserts keys from a Thrust device vector, sorting them first.
size_t insertMany(const T *d_keys, const size_t n, bool *d_output=nullptr, cudaStream_t stream={})
Inserts a batch of keys into the filter.
Filter & operator=(const Filter &)=delete
size_t h_numOccupied
Number of occupied buckets in the filter.
size_t capacity()
Returns the total capacity of the filter.
size_t insertManyWithEvictionCounts(const thrust::device_vector< T > &d_keys, thrust::device_vector< uint32_t > &d_evictionAttempts, thrust::device_vector< uint8_t > *d_output=nullptr, cudaStream_t stream={})
Inserts keys from a Thrust device vector and records per-attempt eviction counts.
size_t evictionCount()
Returns the total number of evictions performed.
typename Config::KeyType T
#define SDIV(x, y)
Integer division with rounding up (ceiling).
Definition helpers.cuh:198
#define CUDA_CALL(err)
Macro for checking CUDA errors.
Definition helpers.cuh:204
__global__ void deleteKernel(const typename Config::KeyType *keys, bool *output, size_t n, Filter< Config > *filter)
Kernel for deleting keys.
__host__ __device__ __forceinline__ constexpr bool hasZero(WordType v)
Checks if a packed word contains a zero slot.
Definition helpers.cuh:116
__host__ __device__ __forceinline__ uint32_t globalThreadId()
Calculates the global thread ID in a 1D grid.
Definition helpers.cuh:24
constexpr bool powerOfTwo(size_t n)
Checks if a number is a power of two.
Definition helpers.cuh:16
__global__ void computePackedTagsKernel(const typename Config::KeyType *keys, typename Filter< Config >::PackedTagType *packedTags, size_t n, size_t numBuckets)
Kernel for computing packed tags for sorting.
__global__ void containsKernel(const typename Config::KeyType *keys, bool *output, size_t n, Filter< Config > *filter)
Kernel for checking existence of keys.
__global__ void insertKernelSorted(const typename Filter< Config >::PackedTagType *packedTags, bool *output, size_t n, Filter< Config > *filter, uint32_t *evictionAttempts)
Kernel for inserting pre-sorted keys into the filter.
__global__ void insertKernel(const typename Config::KeyType *keys, bool *output, size_t n, Filter< Config > *filter, uint32_t *evictionAttempts)
Kernel for inserting keys into the filter.
EvictionPolicy
Eviction policy for the Cuckoo Filter.
@ BFS
Breadth-first search (default)
@ DFS
Pure depth-first search.
Configuration structure for the Cuckoo Filter.
static constexpr size_t bucketSize
static constexpr size_t blockSize
static constexpr size_t maxEvictions
static constexpr size_t bitsPerTag
static constexpr EvictionPolicy evictionPolicy
typename std::conditional< bitsPerTag<=8, uint8_t, typename std::conditional< bitsPerTag<=16, uint16_t, uint32_t >::type >::type TagType
AltBucketPolicy_< KeyType, TagType, bitsPerTag, bucketSize_ > AltBucketPolicy
Bucket structure that holds the fingerprint and tags for a given bucket.
__host__ __device__ __forceinline__ TagType extractTag(WordType packed, size_t tagIdx) const
typename Config::WordType WordType
__host__ __device__ __forceinline__ WordType replaceTag(WordType packed, size_t tagIdx, TagType newTag) const
static constexpr size_t tagsPerWord
__device__ __forceinline__ void load128Bit(size_t startIdx, WordType(&out)[N]) const
Loads words using 128-bit vectorized loads into a fixed-size array.
static constexpr size_t wordCount
__device__ static __forceinline__ bool checkWords(const WordType(&loaded)[N], WordType replicatedTag)
Checks an array of loaded words for a matching tag using SWAR.
cuda::std::atomic< WordType > packedTags[wordCount]
__device__ bool contains(TagType tag) const
Checks if a tag is present in the bucket using vectorized loads.
This is used by the sorted insert kernel to store the fingerprint and primary bucket index in a compa...
__host__ __device__ void setFingerprint(TagType fp)
static constexpr PackedTagType fpMask
__host__ __device__ uint64_t getBucketIndex() const
static constexpr size_t fpBits
__host__ __device__ void setBucketIdx(size_t bucketIdx)
static constexpr PackedTagType bucketIdxMask
__host__ __device__ PackedTag()
static constexpr size_t bucketIdxBits
static constexpr size_t totalBits
__host__ __device__ TagType getFingerprint() const
__host__ __device__ PackedTag(TagType fp, uint64_t bucketIdx)