Functions
template<typename Config >
__global__ void	insertKernel (const typename Config::KeyType *keys, bool *output, size_t n, Filter< Config > *filter, uint32_t *evictionAttempts)
	Kernel for inserting keys into the filter.

template<typename Config >
__global__ void	insertKernelSorted (const typename Filter< Config >::PackedTagType *packedTags, bool *output, size_t n, Filter< Config > *filter, uint32_t *evictionAttempts)
	Kernel for inserting pre-sorted keys into the filter.

template<typename Config >
__global__ void	computePackedTagsKernel (const typename Config::KeyType *keys, typename Filter< Config >::PackedTagType *packedTags, size_t n, size_t numBuckets)
	Kernel for computing packed tags for sorting.

template<typename Config >
__global__ void	containsKernel (const typename Config::KeyType *keys, bool *output, size_t n, Filter< Config > *filter)
	Kernel for checking existence of keys.

template<typename Config >
__global__ void	deleteKernel (const typename Config::KeyType *keys, bool *output, size_t n, Filter< Config > *filter)
	Kernel for deleting keys.

constexpr bool	powerOfTwo (size_t n)
	Checks if a number is a power of two.

__host__ __device__ __forceinline__ uint32_t	globalThreadId ()
	Calculates the global thread ID in a 1D grid.

constexpr size_t	nextPowerOfTwo (size_t n)
	Calculates the next power of two greater than or equal to n.

template<typename T >
size_t	countOnes (T *data, size_t n)
	Counts the number of non-zero elements in an array.

template<typename TagType , typename WordType >
__host__ __device__ __forceinline__ constexpr WordType	getZeroMask (WordType v)
	Returns a bitmask indicating which slots in a packed word are zero.

template<typename TagType , typename WordType >
__host__ __device__ __forceinline__ constexpr bool	hasZero (WordType v)
	Checks if a packed word contains a zero slot.

template<typename TagType , typename WordType >
__host__ __device__ __forceinline__ constexpr WordType	replicateTag (TagType tag)
	Replicates a tag value across all slots in a word.

template<typename Kernel >
constexpr size_t	maxOccupancyGridSize (int32_t blockSize, Kernel kernel, size_t dynamicSMemSize)
	Calculates the maximum occupancy grid size for a kernel.

Function Documentation

◆ computePackedTagsKernel()

template<typename Config >

__global__ void cuckoogpu::detail::computePackedTagsKernel	(	const typename Config::KeyType *	keys,
		typename Filter< Config >::PackedTagType *	packedTags,
		size_t	n,
		size_t	numBuckets
	)

Kernel for computing packed tags for sorting.

Definition at line 1498 of file CuckooFilter.cuh.

  {
    // One thread per key. The flat 1D index is built in 64 bits: blockIdx.x,
    // blockDim.x and threadIdx.x are 32-bit unsigned, so a 32-bit product would
    // wrap for launches with more than 2^32 threads while n is a size_t.
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    // Surplus threads past the end of the input have nothing to do.
    if (idx >= n) {
        return;
    }

    using FilterType = Filter<Config>;
    using PackedTagType = typename FilterType::PackedTagType;
    constexpr size_t bitsPerTag = Config::bitsPerTag;

    typename Config::KeyType key = keys[idx];

    // Only the first candidate bucket (i1) and its fingerprint (fp1) are packed;
    // i2 and fp2 are returned by the helper but not used here.
    auto [i1, i2, fp1, fp2] = FilterType::getCandidateBucketsAndFPs(key, numBuckets);

    // Packed layout: bucket index in the high bits, fingerprint in the low
    // bitsPerTag bits.
    packedTags[idx] =
        (static_cast<PackedTagType>(i1) << bitsPerTag) | static_cast<PackedTagType>(fp1);
}

Here is the call graph for this function:

◆ containsKernel()

template<typename Config >

__global__ void cuckoogpu::detail::containsKernel	(	const typename Config::KeyType *	keys,
		bool *	output,
		size_t	n,
		Filter< Config > *	filter
	)

Kernel for checking existence of keys.

Definition at line 1460 of file CuckooFilter.cuh.

  {
    // One thread per key. The flat 1D index is built in 64 bits: blockIdx.x,
    // blockDim.x and threadIdx.x are 32-bit unsigned, so a 32-bit product would
    // wrap for launches with more than 2^32 threads while n is a size_t.
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    // Threads past the end of the input write nothing.
    if (idx < n) {
        output[idx] = filter->contains(keys[idx]);
    }
}

Here is the call graph for this function:

◆ countOnes()

template<typename T >

size_t cuckoogpu::detail::countOnes	(	T *	data,
		size_t	n
	)

Counts the number of non-zero elements in an array.

Template Parameters

T	Type of elements.

Parameters

data	Pointer to the array.
n	Number of elements.

Returns: size_t Number of non-zero elements.

Definition at line 57 of file helpers.cuh.

                                    {
    // Walk [data, data + n) and tally every element that converts to true.
    size_t nonZero = 0;
    const T *const last = data + n;
    for (const T *cursor = data; cursor != last; ++cursor) {
        nonZero += *cursor ? 1 : 0;
    }
    return nonZero;
}

Here is the call graph for this function:

◆ deleteKernel()

template<typename Config >

__global__ void cuckoogpu::detail::deleteKernel	(	const typename Config::KeyType *	keys,
		bool *	output,
		size_t	n,
		Filter< Config > *	filter
	)

Kernel for deleting keys.

Definition at line 1475 of file CuckooFilter.cuh.

                                                                                                 {
    using BlockReduce = cub::BlockReduce<int32_t, Config::blockSize>;
    __shared__ typename BlockReduce::TempStorage tempStorage;
 
    auto idx = globalThreadId();
 
    int32_t success = 0;
    if (idx < n) {
        success = filter->remove(keys[idx]);
 
        if (output != nullptr) {
            output[idx] = success;
        }
    }
 
    int32_t blockSum = BlockReduce(tempStorage).Sum(success);
 
    if (threadIdx.x == 0 && blockSum > 0) {
        filter->d_numOccupied->fetch_sub(blockSum, cuda::memory_order_relaxed);
    }
}

Here is the call graph for this function:

◆ getZeroMask()

template<typename TagType , typename WordType >

__host__ __device__ __forceinline__ constexpr WordType cuckoogpu::detail::getZeroMask ( WordType v )

constexpr

Returns a bitmask indicating which slots in a packed word are zero.

Uses SWAR (SIMD Within A Register) to check multiple items in parallel. See https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord

The high bit of each slot that is zero will be set in the result.

Template Parameters

TagType	The type of the individual items (uint8_t, uint16_t, or uint32_t)
WordType	The packed word type (uint32_t or uint64_t)

Parameters

v	The packed integer

Returns: A bitmask with the high bit of each zero slot set

Definition at line 81 of file helpers.cuh.

                                                                               {
    static_assert(sizeof(WordType) == 4 || sizeof(WordType) == 8, "WordType must be 32 or 64 bits");

    // Exact SWAR zero-slot test. `low` has every bit set except the top bit of
    // each slot. For each slot:
    //   (v & low) + low  -> top bit set iff any of the low bits were set
    //                       (cannot carry into the neighbouring slot)
    //   | v              -> top bit set iff the slot is non-zero
    //   | low, then ~    -> top bit set iff the slot is zero, all other bits 0
    //
    // The shorter `(v - ones) & ~v & highs` form is only reliable for the lowest
    // zero slot: the borrow out of a zero slot also flags a slot holding 0x01
    // directly above it, which would break the "each zero slot" contract.
    if constexpr (sizeof(WordType) == 8) {
        if constexpr (sizeof(TagType) == 1) {
            constexpr auto low = 0x7F7F7F7F7F7F7F7FULL;
            return ~(((v & low) + low) | v | low);
        } else if constexpr (sizeof(TagType) == 2) {
            constexpr auto low = 0x7FFF7FFF7FFF7FFFULL;
            return ~(((v & low) + low) | v | low);
        } else if constexpr (sizeof(TagType) == 4) {
            constexpr auto low = 0x7FFFFFFF7FFFFFFFULL;
            return ~(((v & low) + low) | v | low);
        } else {
            // Unsupported tag width: report no zero slots.
            return 0;
        }
    } else {
        if constexpr (sizeof(TagType) == 1) {
            constexpr auto low = 0x7F7F7F7FU;
            return ~(((v & low) + low) | v | low);
        } else if constexpr (sizeof(TagType) == 2) {
            constexpr auto low = 0x7FFF7FFFU;
            return ~(((v & low) + low) | v | low);
        } else if constexpr (sizeof(TagType) == 4) {
            constexpr auto low = 0x7FFFFFFFU;
            return ~(((v & low) + low) | v | low);
        } else {
            // Unsupported tag width: report no zero slots.
            return 0;
        }
    }
}

Here is the call graph for this function:

◆ globalThreadId()

__host__ __device__ __forceinline__ uint32_t cuckoogpu::detail::globalThreadId ( )

Calculates the global thread ID in a 1D grid.

Returns: uint32_t Global thread ID.

Definition at line 24 of file helpers.cuh.

                                                              {
    // Offset of this block's first thread, then this thread's position in the
    // block. All operands are 32-bit unsigned, so the result wraps modulo 2^32.
    const uint32_t blockStart = blockIdx.x * blockDim.x;
    return blockStart + threadIdx.x;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ hasZero()

template<typename TagType , typename WordType >

__host__ __device__ __forceinline__ constexpr bool cuckoogpu::detail::hasZero ( WordType v )

constexpr

Checks if a packed word contains a zero slot.

Template Parameters

TagType	The type of the individual items (uint8_t, uint16_t, or uint32_t)
WordType	The packed word type (uint32_t or uint64_t)

Parameters

v	The packed integer

Returns: true if any of the items in v are zero

Definition at line 116 of file helpers.cuh.

                                                                       {
    // Any set bit in the zero mask means at least one slot of v is zero.
    const WordType zeroSlots = getZeroMask<TagType, WordType>(v);
    return zeroSlots != 0;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ insertKernel()

template<typename Config >

__global__ void cuckoogpu::detail::insertKernel	(	const typename Config::KeyType *	keys,
		bool *	output,
		size_t	n,
		Filter< Config > *	filter,
		uint32_t *	evictionAttempts
	)

Kernel for inserting keys into the filter.

Definition at line 1422 of file CuckooFilter.cuh.

  {
    // Block-wide reduction used to issue a single atomic per block.
    // Requires the kernel to be launched with blockDim.x == Config::blockSize.
    using BlockReduce = cub::BlockReduce<int32_t, Config::blockSize>;
    __shared__ typename BlockReduce::TempStorage tempStorage;

    // One thread per key. The flat 1D index is built in 64 bits: blockIdx.x,
    // blockDim.x and threadIdx.x are 32-bit unsigned, so a 32-bit product would
    // wrap for launches with more than 2^32 threads while n is a size_t.
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    // Threads past the end of the input contribute 0 to the reduction but must
    // still reach it, so there is no early return.
    int32_t success = 0;

    if (idx < n) {
        uint32_t threadEvictions = 0;
        success = filter->insert(keys[idx], &threadEvictions);

        // Both per-key result buffers are optional.
        if (output != nullptr) {
            output[idx] = success;
        }

        if (evictionAttempts != nullptr) {
            evictionAttempts[idx] = threadEvictions;
        }
    }

    // The reduced sum is only valid in thread 0 of the block. No barrier is
    // needed afterwards because tempStorage is not reused.
    const int32_t blockSuccessSum = BlockReduce(tempStorage).Sum(success);

    // One relaxed atomic per block raises the occupancy counter by the number of
    // successful inserts in this block.
    if (threadIdx.x == 0 && blockSuccessSum > 0) {
        filter->d_numOccupied->fetch_add(blockSuccessSum, cuda::memory_order_relaxed);
    }
}

Here is the call graph for this function:

◆ insertKernelSorted()

template<typename Config >

__global__ void cuckoogpu::detail::insertKernelSorted	(	const typename Filter< Config >::PackedTagType *	packedTags,
		bool *	output,
		size_t	n,
		Filter< Config > *	filter,
		uint32_t *	evictionAttempts
	)

Kernel for inserting pre-sorted keys into the filter.

Definition at line 1522 of file CuckooFilter.cuh.

  {
    // Block-wide reduction used to issue a single atomic per block.
    // Requires the kernel to be launched with blockDim.x == Config::blockSize.
    using BlockReduce = cub::BlockReduce<int32_t, Config::blockSize>;
    __shared__ typename BlockReduce::TempStorage tempStorage;

    // One thread per packed tag. The flat 1D index is built in 64 bits:
    // blockIdx.x, blockDim.x and threadIdx.x are 32-bit unsigned, so a 32-bit
    // product would wrap for launches with more than 2^32 threads.
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    using FilterType = Filter<Config>;
    using TagType = typename FilterType::TagType;
    using PackedTagType = typename FilterType::PackedTagType;

    constexpr size_t bitsPerTag = Config::bitsPerTag;
    // Selects the low bitsPerTag bits, i.e. the fingerprint part of a packed tag.
    constexpr TagType fpMask = (1ULL << bitsPerTag) - 1;

    // Threads past the end of the input contribute 0 to the reduction but must
    // still reach it, so there is no early return.
    int32_t success = 0;
    uint32_t threadEvictions = 0;
    if (idx < n) {
        // Unpack: bucket index in the high bits, fingerprint in the low bits.
        PackedTagType packedTag = packedTags[idx];
        size_t primaryBucket = packedTag >> bitsPerTag;
        auto fp = static_cast<TagType>(packedTag & fpMask);

        // 1) Try the primary bucket.
        if (filter->tryInsertAtBucket(primaryBucket, fp)) {
            success = 1;
        } else {
            // 2) Try the alternate bucket with its matching fingerprint.
            auto [i2, fp2] =
                FilterType::getAlternateBucketWithNewFp(primaryBucket, fp, filter->numBuckets);

            if (filter->tryInsertAtBucket(i2, fp2)) {
                success = 1;
            } else {
                // 3) Both buckets are full: fall back to eviction. The low bit of
                //    the fingerprint picks which of the two buckets to start from.
                TagType evictFp;
                auto startBucket = (fp & 1) == 0 ? primaryBucket : i2;

                if constexpr (Config::AltBucketPolicy::usesChoiceBit) {
                    // Use the fingerprint that belongs to the chosen start bucket.
                    evictFp = (fp & 1) == 0 ? fp : fp2;
                } else {
                    evictFp = fp;
                }

                if constexpr (Config::evictionPolicy == EvictionPolicy::BFS) {
                    success = filter->insertWithEvictionBFS(evictFp, startBucket, &threadEvictions);
                } else if constexpr (Config::evictionPolicy == EvictionPolicy::DFS) {
                    success = filter->insertWithEvictionDFS(evictFp, startBucket, &threadEvictions);
                } else {
                    static_assert(
                        Config::evictionPolicy == EvictionPolicy::DFS ||
                            Config::evictionPolicy == EvictionPolicy::BFS,
                        "Unhandled eviction policy"
                    );
                }
            }
        }

        // Both per-key result buffers are optional.
        if (output != nullptr) {
            output[idx] = success;
        }

        if (evictionAttempts != nullptr) {
            evictionAttempts[idx] = threadEvictions;
        }
    }

    // The reduced sum is only valid in thread 0 of the block.
    const int32_t blockSum = BlockReduce(tempStorage).Sum(success);

    // One relaxed atomic per block raises the occupancy counter by the number of
    // successful inserts in this block.
    if (threadIdx.x == 0 && blockSum > 0) {
        filter->d_numOccupied->fetch_add(blockSum, cuda::memory_order_relaxed);
    }
}

Here is the call graph for this function:

◆ maxOccupancyGridSize()

template<typename Kernel >

constexpr size_t cuckoogpu::detail::maxOccupancyGridSize	(	int32_t	blockSize,
		Kernel	kernel,
		size_t	dynamicSMemSize
	)

constexpr

Calculates the maximum occupancy grid size for a kernel.

Template Parameters

Kernel	Type of the kernel function.

Parameters

blockSize	Block size (threads per block).
kernel	The kernel function.
dynamicSMemSize	Dynamic shared memory size per block.

Returns: size_t The calculated grid size (number of blocks).

Definition at line 224 of file helpers.cuh.

                                                                                                {
    // Every runtime call is checked. On any failure the function returns 0
    // rather than a grid size built from unset or sentinel values (a negative
    // int product would otherwise convert to a huge size_t).
    int device = 0;
    if (cudaGetDevice(&device) != cudaSuccess) {
        return 0;
    }

    // Number of streaming multiprocessors on the current device.
    int numSM = 0;
    if (cudaDeviceGetAttribute(&numSM, cudaDevAttrMultiProcessorCount, device) != cudaSuccess) {
        return 0;
    }

    // Resident blocks per SM for this kernel at the given block size and
    // dynamic shared-memory footprint.
    int maxActiveBlocksPerSM = 0;
    if (cudaOccupancyMaxActiveBlocksPerMultiprocessor(
            &maxActiveBlocksPerSM, kernel, blockSize, dynamicSMemSize
        ) != cudaSuccess) {
        return 0;
    }

    if (numSM <= 0 || maxActiveBlocksPerSM <= 0) {
        return 0;
    }

    // Multiply in size_t so the product cannot overflow int.
    return static_cast<size_t>(maxActiveBlocksPerSM) * static_cast<size_t>(numSM);
}

Here is the call graph for this function:

◆ nextPowerOfTwo()

constexpr size_t cuckoogpu::detail::nextPowerOfTwo ( size_t n )

constexpr

Calculates the next power of two greater than or equal to n.

Parameters

n	Input number.

Returns: size_t Next power of two.

Definition at line 33 of file helpers.cuh.

                                          {
    // 0 and 1 both round up to 1, the smallest power of two. Without this
    // guard, n == 0 would underflow in the decrement below and return 0.
    if (n <= 1)
        return 1;

    if (powerOfTwo(n))
        return n;

    // Smear the highest set bit of (n - 1) into every lower position, then add
    // one. The shift doubles until it covers the full width of size_t, so no
    // particular width is assumed.
    // NOTE: if n exceeds the largest power of two representable in size_t, the
    // final increment wraps and the result is 0.
    n--;
    for (size_t shift = 1; shift < sizeof(size_t) * 8; shift <<= 1) {
        n |= n >> shift;
    }
    n++;

    return n;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ powerOfTwo()

constexpr bool cuckoogpu::detail::powerOfTwo ( size_t n )

constexpr

Checks if a number is a power of two.

Parameters

n	Number to check.

Returns: true if n is a power of two, false otherwise.

Definition at line 16 of file helpers.cuh.

                                    {
    // Zero has no set bit and is therefore not a power of two.
    if (n == 0) {
        return false;
    }
    // Clearing the lowest set bit leaves nothing iff exactly one bit was set.
    return (n & (n - 1)) == 0;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ replicateTag()

template<typename TagType , typename WordType >

__host__ __device__ __forceinline__ constexpr WordType cuckoogpu::detail::replicateTag ( TagType tag )

constexpr

Replicates a tag value across all slots in a word.

Template Parameters

TagType	The type of the tag (uint8_t, uint16_t, or uint32_t)
WordType	The target word type (uint32_t or uint64_t)

Parameters

tag	The tag value to replicate

Returns: A word with the tag replicated in every slot

Definition at line 129 of file helpers.cuh.

                                                                                 {
    static_assert(sizeof(WordType) == 4 || sizeof(WordType) == 8, "WordType must be 32 or 64 bits");

    constexpr auto tagBytes = sizeof(TagType);
    constexpr bool wideWord = sizeof(WordType) == 8;

    if constexpr (tagBytes != 1 && tagBytes != 2 && tagBytes != 4) {
        // Unsupported tag width: hand the tag back unchanged.
        return tag;
    } else if constexpr (wideWord) {
        // Multiplying by a constant with a 1 at the base of every slot copies the
        // tag into each slot of the 64-bit word.
        const auto lane = static_cast<uint64_t>(tag);
        if constexpr (tagBytes == 1) {
            return lane * 0x0101010101010101ULL;
        } else if constexpr (tagBytes == 2) {
            return lane * 0x0001000100010001ULL;
        } else {
            return lane * 0x0000000100000001ULL;
        }
    } else {
        // Same trick for a 32-bit word; a 4-byte tag already fills the word.
        const auto lane = static_cast<uint32_t>(tag);
        if constexpr (tagBytes == 1) {
            return lane * 0x01010101U;
        } else if constexpr (tagBytes == 2) {
            return lane * 0x00010001U;
        } else {
            return lane;
        }
    }
}

Here is the call graph for this function:

Functions

Function Documentation

◆ computePackedTagsKernel()

◆ containsKernel()

◆ countOnes()

◆ deleteKernel()

◆ getZeroMask()

◆ globalThreadId()

◆ hasZero()

◆ insertKernel()

◆ insertKernelSorted()

◆ maxOccupancyGridSize()

◆ nextPowerOfTwo()

◆ powerOfTwo()

◆ replicateTag()